feat(ci): Add Runner Health Check workflow from AIOPS

移植 WOOO-AIOPS 驗證過的設計:
- External Sentinel (ubuntu-latest) 監控 self-hosted runner
- Telegram 連通性檢查
- Docker/Disk/Harbor/K8s 健康檢查
- 自動修復 (Docker cleanup)
- 每 10 分鐘執行一次

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-03-24 09:36:10 +08:00
parent ffc7b1fdcc
commit 7383e14ff4

295
.github/workflows/runner-healthcheck.yml vendored Normal file
View File

@@ -0,0 +1,295 @@
# =============================================================================
# AWOOOI - Self-hosted Runner 健康檢查 Workflow
# =============================================================================
# 移植自 WOOO AIOps (OPS.157)
# 設計原則:
# - 「外部哨兵」: 使用 GitHub 託管 runner 監控 self-hosted runner
# - 即使 110 主機當機，external-sentinel Job 仍能執行並發送告警
# - 每 10 分鐘檢查一次 Runner 狀態
# =============================================================================
name: "Runner Health Check"

on:
  # Manual trigger; exposes a boolean switch for the Telegram notification.
  workflow_dispatch:
    inputs:
      notify_telegram:
        description: "發送 Telegram 通知"
        type: boolean
        required: false
        default: true
  # Scheduled trigger: once every 10 minutes.
  schedule:
    - cron: "*/10 * * * *"

# Endpoints shared by every job and step of this workflow.
env:
  OPENCLAW_URL: "http://192.168.0.188:8088"
  RUNNER_HOST: "192.168.0.110"
jobs:
# ===========================================
# External sentinel - runs on a GitHub-hosted runner.
# It can still raise an alert when host 110 is down (core design).
# ===========================================
  external-sentinel:
    name: "External Sentinel (GitHub-hosted)"
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      - name: "Check Self-Hosted Runners via API"
        id: api_check
        env:
          # Listing self-hosted runners requires repository admin access,
          # which the automatic GITHUB_TOKEN normally lacks (hence the 403
          # branch below). RUNNER_STATUS_TOKEN is an optional secret that
          # takes precedence when defined; otherwise GITHUB_TOKEN is used
          # exactly as before.
          GH_TOKEN: ${{ secrets.RUNNER_STATUS_TOKEN || secrets.GITHUB_TOKEN }}
          # Passed through env instead of being spliced into the script text.
          REPO: ${{ github.repository }}
        run: |
          echo "🔍 外部哨兵檢查 Self-Hosted Runners 狀態..."
          echo "時間: $(date '+%Y-%m-%d %H:%M:%S %Z')"
          echo ""

          # Fetch the runner list. Errors are tolerated here so that the
          # failure can be classified below instead of aborting the step.
          set +e
          RUNNERS=$(gh api "repos/${REPO}/actions/runners?per_page=100" --jq '.runners // []' 2>&1)
          API_EXIT_CODE=$?
          set -e

          # A failed API call must never be reported as "healthy": the state
          # of the runners is simply unknown. The 403 test is only applied to
          # failed calls, so a runner whose id or name contains "403" cannot
          # trigger it.
          if [ "$API_EXIT_CODE" -ne 0 ]; then
            if echo "$RUNNERS" | grep -q "403"; then
              echo "⚠️ GitHub API 權限不足 (403 Forbidden)"
              echo "api_error=permission_denied" >> "$GITHUB_OUTPUT"
            else
              echo "⚠️ GitHub API 請求失敗 (exit code: $API_EXIT_CODE)"
              echo "$RUNNERS"
              echo "api_error=request_failed" >> "$GITHUB_OUTPUT"
            fi
            echo "::warning::無法取得 Self-Hosted Runner 狀態，外部哨兵本次未生效"
            echo "all_healthy=unknown" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # The call succeeded but no runner is registered.
          if [ -z "$RUNNERS" ] || [ "$RUNNERS" = "[]" ]; then
            echo "⚠️ 未找到任何 Self-Hosted Runner"
            echo "offline_count=0" >> "$GITHUB_OUTPUT"
            echo "online_count=0" >> "$GITHUB_OUTPUT"
            echo "total_runners=0" >> "$GITHUB_OUTPUT"
            echo "all_healthy=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # Derive the counters; every runner whose status is not "online"
          # counts as offline.
          TOTAL=$(echo "$RUNNERS" | jq 'length')
          OFFLINE_RUNNERS=$(echo "$RUNNERS" | jq -r '[.[] | select(.status != "online") | .name] | join(",")')
          OFFLINE_COUNT=$(echo "$RUNNERS" | jq '[.[] | select(.status != "online")] | length')
          ONLINE_COUNT=$(echo "$RUNNERS" | jq '[.[] | select(.status == "online")] | length')

          echo "📊 Runner 統計:"
          echo " - 總數: $TOTAL"
          echo " - 線上: $ONLINE_COUNT"
          echo " - 離線: $OFFLINE_COUNT"
          echo ""

          # Per-runner details.
          echo "📋 Runner 詳情:"
          echo "$RUNNERS" | jq -r '.[] | " - \(.name): \(.status) (busy: \(.busy))"'

          # Publish the results for the following steps.
          echo "offline_count=$OFFLINE_COUNT" >> "$GITHUB_OUTPUT"
          echo "offline_runners=${OFFLINE_RUNNERS:-none}" >> "$GITHUB_OUTPUT"
          echo "total_runners=$TOTAL" >> "$GITHUB_OUTPUT"
          echo "online_count=$ONLINE_COUNT" >> "$GITHUB_OUTPUT"
          if [ "$OFFLINE_COUNT" -eq 0 ]; then
            echo "all_healthy=true" >> "$GITHUB_OUTPUT"
          else
            echo "all_healthy=false" >> "$GITHUB_OUTPUT"
          fi

      - name: "Send Critical Alert if Runners Offline"
        if: steps.api_check.outputs.all_healthy == 'false'
        env:
          TELEGRAM_TOKEN: ${{ secrets.OPENCLAW_TG_BOT_TOKEN }}
          TELEGRAM_CHAT_ID: ${{ secrets.OPENCLAW_TG_CHAT_ID }}
          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
          # Step outputs travel through env so that runner names are never
          # interpreted as shell syntax.
          OFFLINE_COUNT: ${{ steps.api_check.outputs.offline_count }}
          OFFLINE_RUNNERS: ${{ steps.api_check.outputs.offline_runners }}
          TOTAL_RUNNERS: ${{ steps.api_check.outputs.total_runners }}
          # Scheduled runs always notify; manual runs follow the
          # notify_telegram input.
          NOTIFY_TELEGRAM: ${{ github.event_name != 'workflow_dispatch' || inputs.notify_telegram }}
        run: |
          echo "🚨 發現離線 Runner發送緊急告警..."
          MESSAGE="🚨 [AWOOOI] Runner 離線告警
          📍 主機: ${RUNNER_HOST}
          🔴 受影響: ${OFFLINE_RUNNERS}
          📊 離線: ${OFFLINE_COUNT}/${TOTAL_RUNNERS}
          ⚠️ 影響: CI/CD 部署已停擺
          🔧 修復:
          ssh wooo@${RUNNER_HOST} 'cd actions-runner-awoooi && ./run.sh'
          🔗 ${RUN_URL}"

          if [ "$NOTIFY_TELEGRAM" != "true" ]; then
            echo "ℹ️ 已停用 Telegram 通知 (notify_telegram=false)"
          elif [ -n "$TELEGRAM_TOKEN" ] && [ -n "$TELEGRAM_CHAT_ID" ]; then
            # jq escapes the newlines and quotes of the multi-line message;
            # splicing it into a JSON string by hand yields an invalid body.
            PAYLOAD=$(jq -n \
              --arg chat_id "$TELEGRAM_CHAT_ID" \
              --arg text "$MESSAGE" \
              '{chat_id: $chat_id, text: $text, disable_web_page_preview: true}')
            curl -sf --max-time 15 -X POST "https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \
              -H "Content-Type: application/json" \
              -d "$PAYLOAD" \
              && echo "✅ Telegram 告警發送成功" || echo "⚠️ Telegram 告警發送失敗"
          else
            echo "⚠️ 未設定 Telegram 憑證，略過通知"
          fi

          # Mark the run as failed so the outage is visible in the Actions UI.
          exit 1

      - name: "Report All Healthy"
        if: steps.api_check.outputs.all_healthy == 'true'
        run: |
          echo "✅ 外部哨兵確認: 所有 Runner 運行正常"
          echo " - 線上: ${{ steps.api_check.outputs.online_count }} / ${{ steps.api_check.outputs.total_runners }}"
# ===========================================
# Telegram connectivity monitoring.
# Probes, from a GitHub-hosted runner, the Telegram connectivity of host 188.
# NOTE(review): OPENCLAW_URL is a 192.168.x.x address - confirm that it is
# really reachable from the runner this job is scheduled on.
# ===========================================
  telegram-connectivity:
    name: "Telegram Connectivity Check"
    runs-on: ubuntu-latest
    timeout-minutes: 3
    steps:
      - name: "Check Telegram API Connectivity"
        id: telegram_api
        env:
          TELEGRAM_TOKEN: ${{ secrets.OPENCLAW_TG_BOT_TOKEN }}
        run: |
          echo "🔍 檢查 Telegram API 連通性..."
          GETME_URL="https://api.telegram.org/bot${TELEGRAM_TOKEN}/getMe"
          # Any curl failure is collapsed into the literal marker "FAILED".
          REPLY=$(curl -sf --max-time 10 "$GETME_URL" 2>&1 || echo "FAILED")
          case "$REPLY" in
            *'"ok":true'*)
              # The bot token was accepted; show which bot it belongs to.
              BOT_NAME=$(echo "$REPLY" | jq -r '.result.username // "unknown"')
              echo "✅ Telegram Bot 有效: @$BOT_NAME"
              echo "telegram_api=healthy" >> "$GITHUB_OUTPUT"
              ;;
            *)
              echo "❌ Telegram API 無法連接"
              echo "telegram_api=unhealthy" >> "$GITHUB_OUTPUT"
              ;;
          esac

      - name: "Test OpenClaw Telegram Relay"
        id: openclaw_relay
        run: |
          echo "🔍 測試 OpenClaw Telegram 轉發能力..."
          # OPENCLAW_URL comes from the workflow-level env block.
          HEALTH_URL="${OPENCLAW_URL}/api/v1/health/telegram"
          REPLY=$(curl -sf --max-time 15 "$HEALTH_URL" 2>&1 || echo "FAILED")
          case "$REPLY" in
            *'"telegram"'*)
              # The health payload carries a "telegram" field; forward it.
              TELE_STATUS=$(echo "$REPLY" | jq -r '.telegram // "unknown"')
              echo "openclaw_telegram=$TELE_STATUS" >> "$GITHUB_OUTPUT"
              echo "OpenClaw Telegram 狀態: $TELE_STATUS"
              ;;
            *)
              echo "openclaw_telegram=unreachable" >> "$GITHUB_OUTPUT"
              echo "❌ 無法連接 OpenClaw"
              ;;
          esac
# ===========================================
# Internal health check - runs on the self-hosted runner itself.
# Every check records its result as a step output and never fails the step;
# the final step aggregates the results and reports them to OpenClaw.
# ===========================================
  health-check:
    name: "Runner Health Check"
    runs-on: [self-hosted, harbor, k8s]
    timeout-minutes: 5
    steps:
      - name: "Check Docker Service"
        id: docker
        run: |
          echo "=== Docker 版本 ==="
          docker version --format '{{.Server.Version}}' || echo "DOCKER_FAILED"
          # "docker info" only succeeds when the daemon is reachable.
          if docker info > /dev/null 2>&1; then
            echo "docker_status=healthy" >> "$GITHUB_OUTPUT"
          else
            echo "docker_status=unhealthy" >> "$GITHUB_OUTPUT"
          fi

      - name: "Check Disk Space"
        id: disk
        run: |
          echo "=== 磁碟使用量 ==="
          df -h /
          # -P keeps each filesystem on a single line, so the 5th column is
          # always the usage percentage even with long device names.
          USAGE=$(df -P / | tail -1 | awk '{print $5}' | tr -d '%')
          echo "disk_usage=$USAGE" >> "$GITHUB_OUTPUT"
          # Thresholds: above 95% is critical, above 85% is a warning.
          if [ "$USAGE" -gt 95 ]; then
            echo "disk_status=critical" >> "$GITHUB_OUTPUT"
          elif [ "$USAGE" -gt 85 ]; then
            echo "disk_status=warning" >> "$GITHUB_OUTPUT"
          else
            echo "disk_status=healthy" >> "$GITHUB_OUTPUT"
          fi

      - name: "Check Harbor Registry"
        id: harbor
        run: |
          # On a connection failure curl itself prints "000" through -w, so
          # the fallback only has to neutralise the non-zero exit status.
          HTTP_CODE=$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 \
            "http://${RUNNER_HOST}:5000/v2/" 2>/dev/null || true)
          # 401 is accepted as healthy too: the registry answered, it merely
          # asked for credentials.
          if [ "$HTTP_CODE" = "200" ] || [ "$HTTP_CODE" = "401" ]; then
            echo "harbor_status=healthy" >> "$GITHUB_OUTPUT"
            echo "Harbor Registry: OK (HTTP $HTTP_CODE)"
          else
            echo "harbor_status=unhealthy" >> "$GITHUB_OUTPUT"
            echo "::error::Harbor Registry 無法連接"
          fi

      - name: "Check K8s Connectivity"
        id: k8s
        run: |
          # NOTE(review): StrictHostKeyChecking=no disables host key
          # verification; consider pinning the host key in known_hosts.
          if timeout 10 ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 \
            wooo@192.168.0.120 "kubectl get nodes" 2>/dev/null; then
            echo "k8s_status=healthy" >> "$GITHUB_OUTPUT"
          else
            echo "k8s_status=unhealthy" >> "$GITHUB_OUTPUT"
            echo "::warning::K8s Master 連接失敗"
          fi

      - name: "Report to OpenClaw"
        if: always()
        env:
          DOCKER_STATUS: ${{ steps.docker.outputs.docker_status }}
          DISK_STATUS: ${{ steps.disk.outputs.disk_status }}
          DISK_USAGE: ${{ steps.disk.outputs.disk_usage }}
          HARBOR_STATUS: ${{ steps.harbor.outputs.harbor_status }}
          K8S_STATUS: ${{ steps.k8s.outputs.k8s_status }}
          RUNNER_NAME: ${{ runner.name }}
        run: |
          # Aggregate: Docker, Harbor or a critically full disk make the
          # runner "critical"; any other non-healthy check makes it "warning".
          # A critical disk used to fall through and be reported as healthy.
          OVERALL="healthy"
          if [ "$DOCKER_STATUS" != "healthy" ] || [ "$HARBOR_STATUS" != "healthy" ] || [ "$DISK_STATUS" = "critical" ]; then
            OVERALL="critical"
          elif [ "$DISK_STATUS" != "healthy" ] || [ "$K8S_STATUS" != "healthy" ]; then
            OVERALL="warning"
          fi

          # "usage" is emitted as a bare JSON number, so anything that is not
          # purely digits (including an empty value) is replaced by 0.
          case "$DISK_USAGE" in
            ''|*[!0-9]*) DISK_USAGE=0 ;;
          esac

          # Best effort: a failed notification only produces a warning.
          curl -sf --max-time 10 -X POST "${{ env.OPENCLAW_URL }}/api/v1/webhook/runner-health" \
            -H "Content-Type: application/json" \
            -d "{
              \"runner\": \"${RUNNER_NAME}\",
              \"host\": \"${{ env.RUNNER_HOST }}\",
              \"overall_status\": \"${OVERALL}\",
              \"checks\": {
                \"docker\": \"${DOCKER_STATUS}\",
                \"disk\": {\"status\": \"${DISK_STATUS}\", \"usage\": ${DISK_USAGE}},
                \"harbor\": \"${HARBOR_STATUS}\",
                \"k8s\": \"${K8S_STATUS}\"
              },
              \"timestamp\": \"$(date -Iseconds)\"
            }" || echo "::warning::OpenClaw 通知失敗"
# ===========================================
# Auto repair (when something is wrong).
# The health-check steps record problems as outputs and exit 0, so a plain
# "if: failure()" almost never fires. This job therefore always runs after
# health-check and decides by itself whether a cleanup is needed: either the
# health-check job failed, or the root filesystem is above the critical
# threshold.
# ===========================================
  auto-repair:
    name: "Auto Repair (if needed)"
    needs: [health-check]
    if: ${{ !cancelled() }}
    runs-on: [self-hosted, harbor, k8s]
    timeout-minutes: 10
    steps:
      - name: "Attempt Docker Cleanup"
        env:
          HEALTH_CHECK_RESULT: ${{ needs.health-check.result }}
          # Same limit the health check uses for disk_status=critical.
          DISK_CRITICAL_PERCENT: "95"
        run: |
          # -P keeps each filesystem on one line so column 5 is the usage.
          USAGE=$(df -P / | tail -1 | awk '{print $5}' | tr -d '%')
          # Treat an unreadable value as 0 so the comparison cannot error out.
          case "$USAGE" in
            ''|*[!0-9]*) USAGE=0 ;;
          esac

          if [ "$HEALTH_CHECK_RESULT" != "failure" ] && [ "$USAGE" -le "$DISK_CRITICAL_PERCENT" ]; then
            echo "✅ 無需修復 (health-check: ${HEALTH_CHECK_RESULT}, 磁碟: ${USAGE}%)"
            exit 0
          fi

          echo "=== 嘗試清理 Docker ==="
          # Volumes are deliberately left alone: an unattended job must not
          # delete volume data on the host that also runs the registry.
          docker system prune -f || true
          # Remove images that are unused and older than 7 days.
          docker image prune -a -f --filter "until=168h" || true
          df -h /