# =============================================================================
# AWOOOI - Self-hosted Runner Health Check Workflow
# =============================================================================
# Design principles:
#   - Check the runner status every 10 minutes
#   - Use self-hosted runners only (ubuntu-latest is forbidden)
#
# 🔴 HARD RULE: ubuntu-latest is forbidden (GitHub Billing restriction)
#
# Flow:
#   1. `health-check` probes Docker, disk usage, the Harbor registry and the
#      K8s master, stores each verdict in a step output, and posts a JSON
#      summary to the OpenClaw webhook.
#   2. `auto-repair` prunes Docker data when the health check failed or
#      reported disk pressure.
# =============================================================================
name: Runner Health Check

on:
  schedule:
    # Run every 10 minutes
    - cron: '*/10 * * * *'
  workflow_dispatch:
    inputs:
      # NOTE(review): no step visible in this workflow reads this input.
      notify_telegram:
        description: '發送 Telegram 通知'
        required: false
        default: true
        type: boolean

# Workflow-level variables; the runner exports them into every `run` shell.
env:
  OPENCLAW_URL: 'http://192.168.0.188:8088'
  RUNNER_HOST: '192.168.0.110'

jobs:
  # ===========================================
  # Internal health check - runs on the self-hosted runner
  # ===========================================
  health-check:
    name: "Runner Health Check"
    runs-on: [self-hosted, harbor, k8s]
    timeout-minutes: 5
    # The check steps record their verdict in step outputs and exit 0, so this
    # job succeeds even when a check is unhealthy. The disk verdict is exposed
    # here so that `auto-repair` can react to it.
    outputs:
      disk_status: ${{ steps.disk.outputs.disk_status }}
    steps:
      # =================================================================
      # 2026-03-29 Claude Code: fix _diag/pages file conflicts
      # Root cause: GitHub Actions Runner diagnostic files are not cleaned
      # up automatically
      # =================================================================
      - name: "Clean Runner Diagnostics"
        run: |
          # RUNNER_TEMP's grandparent directory is taken as the runner root.
          RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
          rm -rf "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
          mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
          echo "✅ Cleaned _diag/pages"

      - name: "Check Docker Service"
        id: docker
        run: |
          echo "=== Docker 版本 ==="
          docker version --format '{{.Server.Version}}' || echo "DOCKER_FAILED"
          # `docker info` only succeeds when the daemon is reachable.
          if docker info > /dev/null 2>&1; then
            echo "docker_status=healthy" >> "$GITHUB_OUTPUT"
          else
            echo "docker_status=unhealthy" >> "$GITHUB_OUTPUT"
          fi

      - name: "Check Disk Space"
        id: disk
        run: |
          echo "=== 磁碟使用量 ==="
          df -h /
          # Use% column of the root filesystem, without the percent sign.
          USAGE=$(df / | tail -1 | awk '{print $5}' | tr -d '%')
          echo "disk_usage=$USAGE" >> "$GITHUB_OUTPUT"
          # Thresholds: above 95% is critical, above 85% is a warning.
          if [ "$USAGE" -gt 95 ]; then
            echo "disk_status=critical" >> "$GITHUB_OUTPUT"
          elif [ "$USAGE" -gt 85 ]; then
            echo "disk_status=warning" >> "$GITHUB_OUTPUT"
          else
            echo "disk_status=healthy" >> "$GITHUB_OUTPUT"
          fi

      - name: "Check Harbor Registry"
        id: harbor
        run: |
          HTTP_CODE=$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 \
            "http://192.168.0.110:5000/v2/" 2>/dev/null || echo "000")
          # 401 is accepted as well: the registry answered, it only asks for
          # credentials.
          if [ "$HTTP_CODE" = "200" ] || [ "$HTTP_CODE" = "401" ]; then
            echo "harbor_status=healthy" >> "$GITHUB_OUTPUT"
            echo "Harbor Registry: OK (HTTP $HTTP_CODE)"
          else
            echo "harbor_status=unhealthy" >> "$GITHUB_OUTPUT"
            echo "::error::Harbor Registry 無法連接"
          fi

      - name: "Check K8s Connectivity"
        id: k8s
        run: |
          # NOTE(review): StrictHostKeyChecking=no accepts any host key for
          # the K8s master; consider pinning the key in known_hosts.
          if timeout 10 ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 \
            wooo@192.168.0.125 "kubectl get nodes" 2>/dev/null; then
            echo "k8s_status=healthy" >> "$GITHUB_OUTPUT"
          else
            echo "k8s_status=unhealthy" >> "$GITHUB_OUTPUT"
            echo "::warning::K8s Master 連接失敗"
          fi

      - name: "Report to OpenClaw"
        # Always report, even if an earlier step failed.
        if: always()
        env:
          DOCKER_STATUS: ${{ steps.docker.outputs.docker_status }}
          DISK_STATUS: ${{ steps.disk.outputs.disk_status }}
          DISK_USAGE: ${{ steps.disk.outputs.disk_usage }}
          HARBOR_STATUS: ${{ steps.harbor.outputs.harbor_status }}
          K8S_STATUS: ${{ steps.k8s.outputs.k8s_status }}
        run: |
          # An unhealthy Docker or Harbor check, or a disk above the critical
          # threshold, is reported as critical. Any other non-healthy disk or
          # K8s result (including a missing one) is reported as a warning.
          OVERALL="healthy"
          if [ "$DOCKER_STATUS" != "healthy" ] \
            || [ "$HARBOR_STATUS" != "healthy" ] \
            || [ "$DISK_STATUS" = "critical" ]; then
            OVERALL="critical"
          elif [ "$DISK_STATUS" != "healthy" ] || [ "$K8S_STATUS" != "healthy" ]; then
            OVERALL="warning"
          fi

          # OPENCLAW_URL and RUNNER_HOST come from the workflow-level `env`;
          # RUNNER_NAME is a default variable set by the runner. Reading them
          # from the environment keeps expression text out of the script.
          # DISK_USAGE falls back to 0 so the payload stays valid JSON when
          # the disk step produced no output.
          curl -sf -X POST "${OPENCLAW_URL}/api/v1/webhook/runner-health" \
            -H "Content-Type: application/json" \
            -d "{
              \"runner\": \"${RUNNER_NAME}\",
              \"host\": \"${RUNNER_HOST}\",
              \"overall_status\": \"${OVERALL}\",
              \"checks\": {
                \"docker\": \"${DOCKER_STATUS}\",
                \"disk\": {\"status\": \"${DISK_STATUS}\", \"usage\": ${DISK_USAGE:-0}},
                \"harbor\": \"${HARBOR_STATUS}\",
                \"k8s\": \"${K8S_STATUS}\"
              },
              \"timestamp\": \"$(date -Iseconds)\"
            }" || echo "::warning::OpenClaw 通知失敗"

  # ===========================================
  # Auto repair (if there is a problem)
  # ===========================================
  auto-repair:
    name: "Auto Repair (if needed)"
    needs: [health-check]
    # Runs when the health check job failed, or when it reported disk
    # pressure. The second condition is required because the check steps
    # never exit non-zero, so a plain `failure()` would not fire on an
    # unhealthy result.
    if: >-
      ${{ !cancelled() && (
      needs.health-check.result == 'failure' ||
      needs.health-check.outputs.disk_status == 'warning' ||
      needs.health-check.outputs.disk_status == 'critical') }}
    runs-on: [self-hosted, harbor, k8s]
    steps:
      - name: "Clean Runner Diagnostics"
        run: |
          RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
          rm -rf "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
          mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true

      - name: "Attempt Docker Cleanup"
        run: |
          echo "=== 嘗試清理 Docker ==="
          # Best effort: each prune may fail without failing the step.
          # NOTE(review): --volumes also deletes unused volumes.
          docker system prune -f --volumes || true
          # Remove unused images older than 168h (7 days).
          docker image prune -a -f --filter "until=168h" || true
          df -h /