Files
awoooi/.github/workflows/runner-healthcheck.yml
2026-05-02 15:20:01 +08:00

157 lines
5.6 KiB
YAML

# =============================================================================
# AWOOOI - Self-hosted Runner 健康檢查 Workflow
# =============================================================================
# 設計原則:
# - 每 10 分鐘檢查一次 Runner 狀態
# - 只使用 self-hosted runner (禁止 ubuntu-latest)
#
# 🔴 HARD RULE: 禁止 ubuntu-latest (GitHub Billing 限制)
# =============================================================================
name: Runner Health Check

on:
  schedule:
    # Run once every 10 minutes.
    - cron: "*/10 * * * *"
  workflow_dispatch:
    inputs:
      # Manual-run toggle for sending a Telegram notification.
      # NOTE(review): no job visible in this file reads this input - confirm
      # where (or whether) it is consumed.
      notify_telegram:
        description: "發送 Telegram 通知"
        type: boolean
        required: false
        default: true

# Workflow-wide environment, exported to every step of every job.
env:
  # Base URL that receives the runner-health webhook.
  OPENCLAW_URL: "http://192.168.0.188:8088"
  # Address reported as the runner's host in the health payload.
  RUNNER_HOST: "192.168.0.110"
jobs:
  # ===========================================
  # Internal health check - runs on the self-hosted runner
  # ===========================================
  health-check:
    name: "Runner Health Check"
    runs-on: [self-hosted, harbor, k8s]
    timeout-minutes: 5
    # Publish the aggregated verdict as a job output. The probe steps only
    # record their status and never fail the job, so dependent jobs need this
    # value to find out that something is wrong.
    outputs:
      overall_status: ${{ steps.report.outputs.overall_status }}
    steps:
      # =================================================================
      # 2026-03-29 Claude Code: fix _diag/pages file conflicts
      # Root cause: GitHub Actions runner diagnostic files are not cleaned
      # up automatically
      # =================================================================
      - name: "Clean Runner Diagnostics"
        run: |
          # Walk two directory levels up from RUNNER_TEMP; _diag is expected there.
          RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
          # Best effort: a cleanup failure must not abort the health check.
          rm -rf "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
          mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
          echo "✅ Cleaned _diag/pages"

      # Records docker_status=healthy|unhealthy depending on `docker info`.
      - name: "Check Docker Service"
        id: docker
        run: |
          echo "=== Docker 版本 ==="
          docker version --format '{{.Server.Version}}' || echo "DOCKER_FAILED"
          if docker info > /dev/null 2>&1; then
            echo "docker_status=healthy" >> "$GITHUB_OUTPUT"
          else
            echo "docker_status=unhealthy" >> "$GITHUB_OUTPUT"
          fi

      # Records disk_usage (integer percent of /) and
      # disk_status=healthy|warning|critical (thresholds: >85 and >95).
      - name: "Check Disk Space"
        id: disk
        run: |
          echo "=== 磁碟使用量 ==="
          df -h /
          # -P forces the POSIX single-line format, so a long device name
          # cannot wrap the row and shift the Use% column.
          USAGE=$(df -P / | awk 'NR==2 {sub(/%/, "", $5); print $5}')
          case "$USAGE" in
            ''|*[!0-9]*)
              # An unreadable value must not be reported as healthy; disk_usage
              # is left unset so the report falls back to its default.
              echo "::warning::無法解析磁碟使用量"
              echo "disk_status=warning" >> "$GITHUB_OUTPUT"
              exit 0
              ;;
          esac
          echo "disk_usage=$USAGE" >> "$GITHUB_OUTPUT"
          if [ "$USAGE" -gt 95 ]; then
            echo "disk_status=critical" >> "$GITHUB_OUTPUT"
          elif [ "$USAGE" -gt 85 ]; then
            echo "disk_status=warning" >> "$GITHUB_OUTPUT"
          else
            echo "disk_status=healthy" >> "$GITHUB_OUTPUT"
          fi

      # Records harbor_status=healthy when the registry endpoint answers with
      # HTTP 200 or 401, unhealthy otherwise.
      - name: "Check Harbor Registry"
        id: harbor
        run: |
          # The fallback is assigned outside the substitution: curl already
          # prints "000" via -w when the connection fails, so an inner
          # `|| echo "000"` would yield "000000".
          HTTP_CODE=$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 \
            "http://192.168.0.110:5000/v2/" 2>/dev/null) || HTTP_CODE="000"
          if [ "$HTTP_CODE" = "200" ] || [ "$HTTP_CODE" = "401" ]; then
            echo "harbor_status=healthy" >> "$GITHUB_OUTPUT"
            echo "Harbor Registry: OK (HTTP $HTTP_CODE)"
          else
            echo "harbor_status=unhealthy" >> "$GITHUB_OUTPUT"
            echo "::error::Harbor Registry 無法連接"
          fi

      # Records k8s_status=healthy when `kubectl get nodes` succeeds over SSH.
      - name: "Check K8s Connectivity"
        id: k8s
        run: |
          # BatchMode makes ssh fail immediately instead of waiting on a
          # password or passphrase prompt that nobody can answer in CI.
          # NOTE(review): StrictHostKeyChecking=no disables host key
          # verification - confirm this is acceptable on this network.
          if timeout 10 ssh -o BatchMode=yes -o StrictHostKeyChecking=no \
            -o ConnectTimeout=5 \
            wooo@192.168.0.125 "kubectl get nodes" 2>/dev/null; then
            echo "k8s_status=healthy" >> "$GITHUB_OUTPUT"
          else
            echo "k8s_status=unhealthy" >> "$GITHUB_OUTPUT"
            echo "::warning::K8s Master 連接失敗"
          fi

      # Aggregates the probe results, exports the verdict and posts it to
      # OpenClaw. Runs even when an earlier step failed; a probe that did not
      # run leaves its status empty, which counts as not healthy for
      # docker, harbor and k8s.
      - name: "Report to OpenClaw"
        id: report
        if: always()
        env:
          DOCKER_STATUS: ${{ steps.docker.outputs.docker_status }}
          DISK_STATUS: ${{ steps.disk.outputs.disk_status }}
          DISK_USAGE: ${{ steps.disk.outputs.disk_usage }}
          HARBOR_STATUS: ${{ steps.harbor.outputs.harbor_status }}
          K8S_STATUS: ${{ steps.k8s.outputs.k8s_status }}
        run: |
          OVERALL="healthy"
          # A critical disk level escalates the verdict just like a Docker or
          # Harbor outage.
          if [ "$DOCKER_STATUS" != "healthy" ] \
            || [ "$HARBOR_STATUS" != "healthy" ] \
            || [ "$DISK_STATUS" = "critical" ]; then
            OVERALL="critical"
          elif [ "$DISK_STATUS" = "warning" ] || [ "$K8S_STATUS" != "healthy" ]; then
            OVERALL="warning"
          fi
          # Written before the webhook call so the job output exists even if
          # the notification cannot be delivered.
          echo "overall_status=$OVERALL" >> "$GITHUB_OUTPUT"
          # Values come from environment variables rather than workflow
          # expressions pasted into the script text. --max-time keeps an
          # unresponsive endpoint from consuming the whole job timeout.
          curl -sf --max-time 10 -X POST \
            "${OPENCLAW_URL}/api/v1/webhook/runner-health" \
            -H "Content-Type: application/json" \
            -d "{
              \"runner\": \"${RUNNER_NAME}\",
              \"host\": \"${RUNNER_HOST}\",
              \"overall_status\": \"${OVERALL}\",
              \"checks\": {
                \"docker\": \"${DOCKER_STATUS}\",
                \"disk\": {\"status\": \"${DISK_STATUS}\", \"usage\": ${DISK_USAGE:-0}},
                \"harbor\": \"${HARBOR_STATUS}\",
                \"k8s\": \"${K8S_STATUS}\"
              },
              \"timestamp\": \"$(date -Iseconds)\"
            }" || echo "::warning::OpenClaw 通知失敗"

  # ===========================================
  # Automatic repair (when a problem was found)
  # ===========================================
  auto-repair:
    name: "Auto Repair (if needed)"
    needs: [health-check]
    # Run when the health check job itself failed or when it reported a
    # critical verdict. A bare `failure()` is not enough because the probes
    # record their status without failing the job.
    if: >-
      ${{ !cancelled() &&
      (needs.health-check.result == 'failure' ||
      needs.health-check.outputs.overall_status == 'critical') }}
    runs-on: [self-hosted, harbor, k8s]
    # Bound the cleanup so a hung Docker daemon cannot hold the runner.
    timeout-minutes: 15
    steps:
      - name: "Clean Runner Diagnostics"
        run: |
          # Walk two directory levels up from RUNNER_TEMP; _diag is expected there.
          RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
          rm -rf "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
          mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true

      - name: "Attempt Docker Cleanup"
        run: |
          echo "=== 嘗試清理 Docker ==="
          # NOTE(review): --volumes also prunes unused volumes - confirm no
          # persistent data on this host depends on them.
          docker system prune -f --volumes || true
          # Remove unused images older than 7 days (168h).
          docker image prune -a -f --filter "until=168h" || true
          df -h /