From 7383e14ff4b41c522ed9552b9e2e965e102950eb Mon Sep 17 00:00:00 2001
From: OG T
Date: Tue, 24 Mar 2026 09:36:10 +0800
Subject: [PATCH] feat(ci): Add Runner Health Check workflow from AIOPS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

移植 WOOO-AIOPS 驗證過的設計:
- External Sentinel (ubuntu-latest) 監控 self-hosted runner
- Telegram 連通性檢查
- Docker/Disk/Harbor/K8s 健康檢查
- 自動修復 (Docker cleanup)
- 每 10 分鐘執行一次

Co-Authored-By: Claude Opus 4.5
---
 .github/workflows/runner-healthcheck.yml | 355 +++++++++++++++++++++++
 1 file changed, 355 insertions(+)
 create mode 100644 .github/workflows/runner-healthcheck.yml

diff --git a/.github/workflows/runner-healthcheck.yml b/.github/workflows/runner-healthcheck.yml
new file mode 100644
index 00000000..dad9204c
--- /dev/null
+++ b/.github/workflows/runner-healthcheck.yml
@@ -0,0 +1,355 @@
+# =============================================================================
+# AWOOOI - Self-hosted Runner Health Check Workflow
+# =============================================================================
+# Ported from WOOO AIOps (OPS.157)
+# Design principles:
+#   - "External sentinel": a GitHub-hosted runner monitors the self-hosted runner
+#   - Even if host 110 is down, the external-sentinel job still runs and alerts
+#   - Runner status is checked every 10 minutes
+# =============================================================================
+
+name: Runner Health Check
+
+on:
+  schedule:
+    # Run every 10 minutes
+    - cron: '*/10 * * * *'
+  workflow_dispatch:
+    inputs:
+      notify_telegram:
+        description: '發送 Telegram 通知'
+        required: false
+        default: true
+        type: boolean
+
+# Least privilege: no job here needs write access through GITHUB_TOKEN.
+permissions:
+  contents: read
+
+env:
+  OPENCLAW_URL: 'http://192.168.0.188:8088'
+  RUNNER_HOST: '192.168.0.110'
+
+jobs:
+  # ===========================================
+  # External sentinel - runs on a GitHub-hosted runner
+  # Can still alert when host 110 is down (core design)
+  # ===========================================
+  external-sentinel:
+    name: "External Sentinel (GitHub-hosted)"
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+
+    steps:
+      - name: "Check Self-Hosted Runners via API"
+        id: api_check
+        env:
+          # NOTE(review): listing runners requires repository admin access;
+          # confirm this token has it, otherwise every run takes the 403 path.
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          REPO: ${{ github.repository }}
+        run: |
+          echo "🔍 外部哨兵檢查 Self-Hosted Runners 狀態..."
+          echo "時間: $(date '+%Y-%m-%d %H:%M:%S %Z')"
+          echo ""
+
+          # Fetch the runner list. stderr goes to a file so an error message
+          # can never be mistaken for (or appended to) the JSON payload.
+          set +e
+          RUNNERS=$(gh api "repos/${REPO}/actions/runners" --jq '.runners // []' 2>api_error.log)
+          API_EXIT_CODE=$?
+          set -e
+
+          # An API failure means the state is unknown; it is not proof of health.
+          if [ "$API_EXIT_CODE" -ne 0 ]; then
+            cat api_error.log
+            if grep -q "HTTP 403" api_error.log; then
+              echo "::warning::⚠️ GitHub API 權限不足 (403 Forbidden)"
+              echo "api_error=permission_denied" >> "$GITHUB_OUTPUT"
+            else
+              echo "::warning::⚠️ GitHub API 查詢失敗 (exit ${API_EXIT_CODE})"
+              echo "api_error=request_failed" >> "$GITHUB_OUTPUT"
+            fi
+            echo "all_healthy=unknown" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          # An empty list means nothing is registered: report it, but do not
+          # claim that the runners are healthy.
+          if [ -z "$RUNNERS" ] || [ "$RUNNERS" = "[]" ]; then
+            echo "::warning::⚠️ 未找到任何 Self-Hosted Runner"
+            echo "offline_count=0" >> "$GITHUB_OUTPUT"
+            echo "total_runners=0" >> "$GITHUB_OUTPUT"
+            echo "all_healthy=unknown" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          # Summarise runner states
+          TOTAL=$(echo "$RUNNERS" | jq 'length')
+          OFFLINE_RUNNERS=$(echo "$RUNNERS" | jq -r '[.[] | select(.status != "online") | .name] | join(",")')
+          OFFLINE_COUNT=$(echo "$RUNNERS" | jq '[.[] | select(.status != "online")] | length')
+          ONLINE_COUNT=$(echo "$RUNNERS" | jq '[.[] | select(.status == "online")] | length')
+
+          echo "📊 Runner 統計:"
+          echo " - 總數: $TOTAL"
+          echo " - 線上: $ONLINE_COUNT"
+          echo " - 離線: $OFFLINE_COUNT"
+          echo ""
+
+          # List every runner
+          echo "📋 Runner 詳情:"
+          echo "$RUNNERS" | jq -r '.[] | " - \(.name): \(.status) (busy: \(.busy))"'
+
+          # Export step outputs
+          {
+            echo "offline_count=$OFFLINE_COUNT"
+            echo "offline_runners=${OFFLINE_RUNNERS:-none}"
+            echo "total_runners=$TOTAL"
+            echo "online_count=$ONLINE_COUNT"
+          } >> "$GITHUB_OUTPUT"
+
+          if [ "$OFFLINE_COUNT" -eq 0 ]; then
+            echo "all_healthy=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "all_healthy=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: "Send Critical Alert if Runners Offline"
+        if: steps.api_check.outputs.all_healthy == 'false'
+        env:
+          TELEGRAM_TOKEN: ${{ secrets.OPENCLAW_TG_BOT_TOKEN }}
+          TELEGRAM_CHAT_ID: ${{ secrets.OPENCLAW_TG_CHAT_ID }}
+          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+          # Step outputs are passed through env instead of being spliced into
+          # the script, so a runner name cannot be interpreted as shell code.
+          OFFLINE_COUNT: ${{ steps.api_check.outputs.offline_count }}
+          OFFLINE_RUNNERS: ${{ steps.api_check.outputs.offline_runners }}
+          TOTAL_RUNNERS: ${{ steps.api_check.outputs.total_runners }}
+          # Scheduled runs always notify; manual runs honour the input.
+          NOTIFY_TELEGRAM: ${{ github.event_name != 'workflow_dispatch' || inputs.notify_telegram }}
+        run: |
+          echo "🚨 發現離線 Runner,發送緊急告警..."
+
+          # Every line stays indented inside the block scalar (a column-0 line
+          # would end the scalar and break the YAML).
+          MESSAGE=$(printf '%s\n' \
+            "🚨 [AWOOOI] Runner 離線告警" \
+            "" \
+            "📍 主機: ${RUNNER_HOST}" \
+            "🔴 受影響: ${OFFLINE_RUNNERS}" \
+            "📊 離線: ${OFFLINE_COUNT}/${TOTAL_RUNNERS}" \
+            "" \
+            "⚠️ 影響: CI/CD 部署已停擺" \
+            "" \
+            "🔧 修復:" \
+            "ssh wooo@${RUNNER_HOST} 'cd actions-runner-awoooi && ./run.sh'" \
+            "" \
+            "🔗 ${RUN_URL}")
+
+          # jq escapes the newlines and quotes; a hand-built JSON string with
+          # raw newlines is not valid JSON.
+          PAYLOAD=$(jq -n --arg chat_id "$TELEGRAM_CHAT_ID" --arg text "$MESSAGE" \
+            '{chat_id: $chat_id, text: $text, disable_web_page_preview: true}')
+
+          if [ "$NOTIFY_TELEGRAM" != "true" ]; then
+            echo "ℹ️ 已略過 Telegram 通知 (notify_telegram=false)"
+          elif [ -z "$TELEGRAM_TOKEN" ] || [ -z "$TELEGRAM_CHAT_ID" ]; then
+            echo "::warning::⚠️ 未設定 Telegram 密鑰,無法發送告警"
+          elif curl -sf --max-time 15 -X POST \
+            "https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \
+            -H "Content-Type: application/json" \
+            -d "$PAYLOAD" > /dev/null; then
+            echo "✅ Telegram 告警發送成功"
+          else
+            echo "⚠️ Telegram 告警發送失敗"
+          fi
+
+          exit 1
+
+      - name: "Report All Healthy"
+        if: steps.api_check.outputs.all_healthy == 'true'
+        env:
+          ONLINE_COUNT: ${{ steps.api_check.outputs.online_count }}
+          TOTAL_RUNNERS: ${{ steps.api_check.outputs.total_runners }}
+        run: |
+          echo "✅ 外部哨兵確認: 所有 Runner 運行正常"
+          echo " - 線上: ${ONLINE_COUNT} / ${TOTAL_RUNNERS}"
+
+  # ===========================================
+  # Telegram connectivity monitoring
+  # Monitors, from a GitHub-hosted runner, the Telegram connectivity of 188
+  # ===========================================
+  telegram-connectivity:
+    name: "Telegram Connectivity Check"
+    runs-on: ubuntu-latest
+    timeout-minutes: 3
+
+    steps:
+      - name: "Check Telegram API Connectivity"
+        id: telegram_api
+        env:
+          TELEGRAM_TOKEN: ${{ secrets.OPENCLAW_TG_BOT_TOKEN }}
+        run: |
+          echo "🔍 檢查 Telegram API 連通性..."
+
+          RESPONSE=$(curl -sf --max-time 10 \
+            "https://api.telegram.org/bot${TELEGRAM_TOKEN}/getMe" 2>&1 || echo "FAILED")
+
+          if echo "$RESPONSE" | grep -q '"ok":true'; then
+            BOT_NAME=$(echo "$RESPONSE" | jq -r '.result.username // "unknown"')
+            echo "✅ Telegram Bot 有效: @$BOT_NAME"
+            echo "telegram_api=healthy" >> "$GITHUB_OUTPUT"
+          else
+            echo "::warning::❌ Telegram API 無法連接"
+            echo "telegram_api=unhealthy" >> "$GITHUB_OUTPUT"
+          fi
+
+      # NOTE(review): OPENCLAW_URL is a private LAN address; a GitHub-hosted
+      # runner presumably cannot reach it without a tunnel. Consider moving
+      # this step to the self-hosted job - confirm before relying on it.
+      - name: "Test OpenClaw Telegram Relay"
+        id: openclaw_relay
+        run: |
+          echo "🔍 測試 OpenClaw Telegram 轉發能力..."
+
+          RESPONSE=$(curl -sf --max-time 15 \
+            "${OPENCLAW_URL}/api/v1/health/telegram" 2>&1 || echo "FAILED")
+
+          if echo "$RESPONSE" | grep -q '"telegram"'; then
+            TELE_STATUS=$(echo "$RESPONSE" | jq -r '.telegram // "unknown"')
+            echo "openclaw_telegram=$TELE_STATUS" >> "$GITHUB_OUTPUT"
+            echo "OpenClaw Telegram 狀態: $TELE_STATUS"
+          else
+            echo "openclaw_telegram=unreachable" >> "$GITHUB_OUTPUT"
+            echo "❌ 無法連接 OpenClaw"
+          fi
+
+  # ===========================================
+  # Internal health check - runs on the self-hosted runner
+  # ===========================================
+  health-check:
+    name: "Runner Health Check"
+    runs-on: [self-hosted, harbor, k8s]
+    timeout-minutes: 5
+    # While the runner is offline this job stays queued; keep only the newest
+    # one so a backlog of stale checks does not run once the runner returns.
+    concurrency:
+      group: runner-healthcheck-self-hosted
+      cancel-in-progress: true
+    outputs:
+      disk_status: ${{ steps.disk.outputs.disk_status }}
+
+    steps:
+      - name: "Check Docker Service"
+        id: docker
+        run: |
+          echo "=== Docker 版本 ==="
+          docker version --format '{{.Server.Version}}' || echo "DOCKER_FAILED"
+
+          if docker info > /dev/null 2>&1; then
+            echo "docker_status=healthy" >> "$GITHUB_OUTPUT"
+          else
+            echo "docker_status=unhealthy" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: "Check Disk Space"
+        id: disk
+        run: |
+          echo "=== 磁碟使用量 ==="
+          df -h /
+
+          # -P forces one record per line, so the 5th field is always Use%.
+          USAGE=$(df -P / | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
+          echo "disk_usage=$USAGE" >> "$GITHUB_OUTPUT"
+
+          if [ "$USAGE" -gt 95 ]; then
+            echo "disk_status=critical" >> "$GITHUB_OUTPUT"
+          elif [ "$USAGE" -gt 85 ]; then
+            echo "disk_status=warning" >> "$GITHUB_OUTPUT"
+          else
+            echo "disk_status=healthy" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: "Check Harbor Registry"
+        id: harbor
+        run: |
+          # curl already prints 000 through -w when the request fails.
+          HTTP_CODE=$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 \
+            "http://${RUNNER_HOST}:5000/v2/" 2>/dev/null || true)
+
+          if [ "$HTTP_CODE" = "200" ] || [ "$HTTP_CODE" = "401" ]; then
+            echo "harbor_status=healthy" >> "$GITHUB_OUTPUT"
+            echo "Harbor Registry: OK (HTTP $HTTP_CODE)"
+          else
+            echo "harbor_status=unhealthy" >> "$GITHUB_OUTPUT"
+            echo "::error::Harbor Registry 無法連接"
+          fi
+
+      - name: "Check K8s Connectivity"
+        id: k8s
+        run: |
+          # BatchMode makes ssh fail instead of waiting for a password prompt.
+          # NOTE(review): StrictHostKeyChecking=no skips host key verification.
+          if timeout 10 ssh -o BatchMode=yes -o StrictHostKeyChecking=no -o ConnectTimeout=5 \
+            wooo@192.168.0.120 "kubectl get nodes" 2>/dev/null; then
+            echo "k8s_status=healthy" >> "$GITHUB_OUTPUT"
+          else
+            echo "k8s_status=unhealthy" >> "$GITHUB_OUTPUT"
+            echo "::warning::K8s Master 連接失敗"
+          fi
+
+      - name: "Report to OpenClaw"
+        if: always()
+        env:
+          DOCKER_STATUS: ${{ steps.docker.outputs.docker_status }}
+          DISK_STATUS: ${{ steps.disk.outputs.disk_status }}
+          DISK_USAGE: ${{ steps.disk.outputs.disk_usage }}
+          HARBOR_STATUS: ${{ steps.harbor.outputs.harbor_status }}
+          K8S_STATUS: ${{ steps.k8s.outputs.k8s_status }}
+          RUNNER_NAME: ${{ runner.name }}
+        run: |
+          # A critically full disk is reported as critical, not healthy.
+          OVERALL="healthy"
+          if [ "$DOCKER_STATUS" != "healthy" ] || [ "$HARBOR_STATUS" != "healthy" ] \
+            || [ "$DISK_STATUS" = "critical" ]; then
+            OVERALL="critical"
+          elif [ "$DISK_STATUS" = "warning" ] || [ "$K8S_STATUS" != "healthy" ]; then
+            OVERALL="warning"
+          fi
+
+          curl -sf --max-time 15 -X POST "${OPENCLAW_URL}/api/v1/webhook/runner-health" \
+            -H "Content-Type: application/json" \
+            -d "{
+              \"runner\": \"${RUNNER_NAME}\",
+              \"host\": \"${RUNNER_HOST}\",
+              \"overall_status\": \"${OVERALL}\",
+              \"checks\": {
+                \"docker\": \"${DOCKER_STATUS}\",
+                \"disk\": {\"status\": \"${DISK_STATUS}\", \"usage\": ${DISK_USAGE:-0}},
+                \"harbor\": \"${HARBOR_STATUS}\",
+                \"k8s\": \"${K8S_STATUS}\"
+              },
+              \"timestamp\": \"$(date -Iseconds)\"
+            }" || echo "::warning::OpenClaw 通知失敗"
+
+  # ===========================================
+  # Auto repair (when something is wrong)
+  # ===========================================
+  auto-repair:
+    name: "Auto Repair (if needed)"
+    needs: [health-check]
+    # The health checks record problems as outputs rather than failing, so
+    # failure() alone would never fire for a critically full disk.
+    if: failure() || needs.health-check.outputs.disk_status == 'critical'
+    runs-on: [self-hosted, harbor, k8s]
+    timeout-minutes: 10
+
+    steps:
+      - name: "Attempt Docker Cleanup"
+        run: |
+          echo "=== 嘗試清理 Docker ==="
+          # Volumes are left alone: pruning them can delete persistent data
+          # that belongs to containers which happen to be stopped.
+          docker system prune -f || true
+          docker image prune -a -f --filter "until=168h" || true
+          df -h /