feat(ci): Add Runner Health Check workflow from AIOPS
移植 WOOO-AIOPS 驗證過的設計: - External Sentinel (ubuntu-latest) 監控 self-hosted runner - Telegram 連通性檢查 - Docker/Disk/Harbor/K8s 健康檢查 - 自動修復 (Docker cleanup) - 每 10 分鐘執行一次 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
295
.github/workflows/runner-healthcheck.yml
vendored
Normal file
295
.github/workflows/runner-healthcheck.yml
vendored
Normal file
@@ -0,0 +1,295 @@
|
||||
# =============================================================================
|
||||
# AWOOOI - Self-hosted Runner 健康檢查 Workflow
|
||||
# =============================================================================
|
||||
# 移植自 WOOO AIOps (OPS.157)
|
||||
# 設計原則:
|
||||
# - 「外部哨兵」: 使用 GitHub 託管 runner 監控 self-hosted runner
|
||||
# - 即使 110 主機當機,external-sentinel Job 仍能執行並發送告警
|
||||
# - 每 10 分鐘檢查一次 Runner 狀態
|
||||
# =============================================================================
|
||||
|
||||
name: "Runner Health Check"

on:
  # Manual trigger from the Actions tab.
  workflow_dispatch:
    inputs:
      # Boolean toggle offered on manual runs; defaults to enabled.
      notify_telegram:
        description: "發送 Telegram 通知"
        type: boolean
        required: false
        default: true
  schedule:
    # Run every 10 minutes.
    - cron: "*/10 * * * *"

# Workflow-level environment, visible to every job and step.
env:
  # Address of the self-hosted runner host.
  RUNNER_HOST: "192.168.0.110"
  # Base URL of the OpenClaw service.
  # NOTE(review): this is a private (RFC 1918) address; jobs running on
  # GitHub-hosted runners presumably cannot reach it - confirm.
  OPENCLAW_URL: "http://192.168.0.188:8088"
|
||||
|
||||
jobs:
|
||||
# ===========================================
|
||||
# 外部哨兵 - 在 GitHub 託管 runner 執行
|
||||
# 即使 110 當機也能發送告警 (核心設計)
|
||||
# ===========================================
|
||||
external-sentinel:
  name: "External Sentinel (GitHub-hosted)"
  # Runs on a GitHub-hosted runner so that it still executes (and can alert)
  # when the self-hosted runner host is down.
  runs-on: ubuntu-latest
  timeout-minutes: 5

  steps:
    # Lists the repository's self-hosted runners and publishes step outputs:
    #   all_healthy     true | false | unknown
    #   api_error       permission_denied | request_failed (only on API failure)
    #   total_runners, online_count, offline_count, offline_runners
    - name: "Check Self-Hosted Runners via API"
      id: api_check
      env:
        # NOTE(review): listing repository runners needs admin-level access,
        # which the default GITHUB_TOKEN presumably cannot be granted. If this
        # step always lands in the 403 branch, supply a PAT / App token here.
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        # Passed through the environment instead of being spliced into the
        # script text.
        REPO: ${{ github.repository }}
      run: |
        echo "🔍 外部哨兵檢查 Self-Hosted Runners 狀態..."
        echo "時間: $(date '+%Y-%m-%d %H:%M:%S %Z')"
        echo ""

        # Fetch the runner list. errexit is suspended so a failed call can be
        # classified below instead of aborting the step.
        set +e
        RUNNERS=$(gh api "repos/${REPO}/actions/runners" --jq '.runners // []' 2>&1)
        API_EXIT_CODE=$?
        set -e

        # Classify failures. The captured text is searched for "403" only when
        # the call actually failed: a successful payload may legitimately
        # contain "403" (for example inside a runner id or name).
        if [ "$API_EXIT_CODE" -ne 0 ]; then
          if echo "$RUNNERS" | grep -q "403"; then
            echo "⚠️ GitHub API 權限不足 (403 Forbidden)"
            echo "api_error=permission_denied" >> "$GITHUB_OUTPUT"
          else
            # Any other API failure is "unknown", never "healthy".
            echo "::warning::GitHub API 呼叫失敗 (exit code: $API_EXIT_CODE)"
            echo "$RUNNERS"
            echo "api_error=request_failed" >> "$GITHUB_OUTPUT"
          fi
          echo "all_healthy=unknown" >> "$GITHUB_OUTPUT"
          exit 0
        fi

        # The call succeeded but no runner is registered.
        if [ -z "$RUNNERS" ] || [ "$RUNNERS" = "[]" ]; then
          echo "⚠️ 未找到任何 Self-Hosted Runner"
          {
            echo "offline_count=0"
            echo "online_count=0"
            echo "offline_runners=none"
            echo "total_runners=0"
            echo "all_healthy=true"
          } >> "$GITHUB_OUTPUT"
          exit 0
        fi

        # Derive counts and the comma-separated list of non-online runners.
        TOTAL=$(echo "$RUNNERS" | jq 'length')
        OFFLINE_RUNNERS=$(echo "$RUNNERS" | jq -r '[.[] | select(.status != "online") | .name] | join(",")')
        OFFLINE_COUNT=$(echo "$RUNNERS" | jq '[.[] | select(.status != "online")] | length')
        ONLINE_COUNT=$(echo "$RUNNERS" | jq '[.[] | select(.status == "online")] | length')

        echo "📊 Runner 統計:"
        echo "   - 總數: $TOTAL"
        echo "   - 線上: $ONLINE_COUNT"
        echo "   - 離線: $OFFLINE_COUNT"
        echo ""

        # Per-runner detail lines.
        echo "📋 Runner 詳情:"
        echo "$RUNNERS" | jq -r '.[] | "   - \(.name): \(.status) (busy: \(.busy))"'

        # Publish the results for the following steps.
        {
          echo "offline_count=$OFFLINE_COUNT"
          echo "offline_runners=${OFFLINE_RUNNERS:-none}"
          echo "total_runners=$TOTAL"
          echo "online_count=$ONLINE_COUNT"
        } >> "$GITHUB_OUTPUT"

        if [ "$OFFLINE_COUNT" -eq 0 ]; then
          echo "all_healthy=true" >> "$GITHUB_OUTPUT"
        else
          echo "all_healthy=false" >> "$GITHUB_OUTPUT"
        fi

    # Sends a Telegram alert (when enabled and configured) and then fails the
    # job so the run is marked red.
    - name: "Send Critical Alert if Runners Offline"
      if: steps.api_check.outputs.all_healthy == 'false'
      env:
        TELEGRAM_TOKEN: ${{ secrets.OPENCLAW_TG_BOT_TOKEN }}
        TELEGRAM_CHAT_ID: ${{ secrets.OPENCLAW_TG_CHAT_ID }}
        RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
        # Step outputs travel through the environment rather than being
        # interpolated into the script, so runner names cannot alter it.
        OFFLINE_COUNT: ${{ steps.api_check.outputs.offline_count }}
        OFFLINE_RUNNERS: ${{ steps.api_check.outputs.offline_runners }}
        TOTAL_RUNNERS: ${{ steps.api_check.outputs.total_runners }}
        # "true" on scheduled runs; on manual runs it follows the
        # notify_telegram input.
        NOTIFY_TELEGRAM: ${{ github.event_name != 'workflow_dispatch' || inputs.notify_telegram }}
      run: |
        echo "🚨 發現離線 Runner,發送緊急告警..."

        MESSAGE="🚨 [AWOOOI] Runner 離線告警

        📍 主機: 192.168.0.110
        🔴 受影響: ${OFFLINE_RUNNERS}
        📊 離線: ${OFFLINE_COUNT}/${TOTAL_RUNNERS}

        ⚠️ 影響: CI/CD 部署已停擺

        🔧 修復:
        ssh wooo@192.168.0.110 'cd actions-runner-awoooi && ./run.sh'

        🔗 ${RUN_URL}"

        if [ "$NOTIFY_TELEGRAM" != "true" ]; then
          echo "ℹ️ 已停用 Telegram 通知 (notify_telegram=false)"
        elif [ -z "$TELEGRAM_TOKEN" ] || [ -z "$TELEGRAM_CHAT_ID" ]; then
          echo "::warning::未設定 Telegram 憑證,略過告警發送"
        else
          # Build the body with jq so the newlines and quotes inside MESSAGE
          # are escaped into valid JSON.
          PAYLOAD=$(jq -n \
            --arg chat_id "$TELEGRAM_CHAT_ID" \
            --arg text "$MESSAGE" \
            '{chat_id: $chat_id, text: $text, disable_web_page_preview: true}')

          if curl -sf --max-time 15 -X POST "https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \
            -H "Content-Type: application/json" \
            -d "$PAYLOAD"; then
            echo "✅ Telegram 告警發送成功"
          else
            echo "⚠️ Telegram 告警發送失敗"
          fi
        fi

        # Offline runners always fail the job, whether or not the alert went out.
        exit 1

    - name: "Report All Healthy"
      if: steps.api_check.outputs.all_healthy == 'true'
      env:
        ONLINE_COUNT: ${{ steps.api_check.outputs.online_count }}
        TOTAL_RUNNERS: ${{ steps.api_check.outputs.total_runners }}
      run: |
        echo "✅ 外部哨兵確認: 所有 Runner 運行正常"
        echo "   - 線上: ${ONLINE_COUNT} / ${TOTAL_RUNNERS}"
|
||||
|
||||
# ===========================================
|
||||
# Telegram 連通性監控
|
||||
# 從 GitHub-hosted runner 監控 188 的 Telegram 連通性
|
||||
# ===========================================
|
||||
telegram-connectivity:
  name: "Telegram Connectivity Check"
  runs-on: ubuntu-latest
  timeout-minutes: 3

  steps:
    # Calls the Bot API getMe method and records the result in the
    # telegram_api step output (healthy | unhealthy).
    - id: telegram_api
      name: "Check Telegram API Connectivity"
      env:
        TELEGRAM_TOKEN: ${{ secrets.OPENCLAW_TG_BOT_TOKEN }}
      run: |
        echo "🔍 檢查 Telegram API 連通性..."

        GETME_URL="https://api.telegram.org/bot${TELEGRAM_TOKEN}/getMe"
        # On any curl failure the literal FAILED is appended to the capture,
        # so the match below falls through to the unhealthy branch.
        REPLY=$(curl -sf --max-time 10 \
          "$GETME_URL" 2>&1 || echo "FAILED")

        case "$REPLY" in
          *'"ok":true'*)
            BOT=$(jq -r '.result.username // "unknown"' <<< "$REPLY")
            echo "✅ Telegram Bot 有效: @$BOT"
            echo "telegram_api=healthy" >> $GITHUB_OUTPUT
            ;;
          *)
            echo "❌ Telegram API 無法連接"
            echo "telegram_api=unhealthy" >> $GITHUB_OUTPUT
            ;;
        esac

    # Queries OpenClaw's Telegram health endpoint and records the result in
    # the openclaw_telegram step output.
    # NOTE(review): OPENCLAW_URL is a private address while this job runs on
    # ubuntu-latest; presumably this always reports "unreachable" - confirm.
    - id: openclaw_relay
      name: "Test OpenClaw Telegram Relay"
      run: |
        echo "🔍 測試 OpenClaw Telegram 轉發能力..."

        HEALTH_URL="${OPENCLAW_URL}/api/v1/health/telegram"
        REPLY=$(curl -sf --max-time 15 \
          "$HEALTH_URL" 2>&1 || echo "FAILED")

        case "$REPLY" in
          *'"telegram"'*)
            STATE=$(jq -r '.telegram // "unknown"' <<< "$REPLY")
            echo "openclaw_telegram=$STATE" >> $GITHUB_OUTPUT
            echo "OpenClaw Telegram 狀態: $STATE"
            ;;
          *)
            echo "openclaw_telegram=unreachable" >> $GITHUB_OUTPUT
            echo "❌ 無法連接 OpenClaw"
            ;;
        esac
|
||||
|
||||
# ===========================================
|
||||
# 內部健康檢查 - 在 Self-hosted Runner 執行
|
||||
# ===========================================
|
||||
health-check:
  name: "Runner Health Check"
  # Runs on the self-hosted runner itself to inspect local services.
  runs-on: [self-hosted, harbor, k8s]
  timeout-minutes: 5

  steps:
    # Output: docker_status (healthy | unhealthy).
    - name: "Check Docker Service"
      id: docker
      run: |
        echo "=== Docker 版本 ==="
        docker version --format '{{.Server.Version}}' || echo "DOCKER_FAILED"

        if docker info > /dev/null 2>&1; then
          echo "docker_status=healthy" >> "$GITHUB_OUTPUT"
        else
          echo "docker_status=unhealthy" >> "$GITHUB_OUTPUT"
        fi

    # Outputs: disk_usage (percent used on /) and
    # disk_status (healthy | warning above 85 | critical above 95).
    - name: "Check Disk Space"
      id: disk
      run: |
        echo "=== 磁碟使用量 ==="
        df -h /

        # -P keeps each filesystem on a single line, so column 5 is always
        # the use percentage.
        USAGE=$(df -P / | tail -1 | awk '{print $5}' | tr -d '%')
        echo "disk_usage=$USAGE" >> "$GITHUB_OUTPUT"

        if [ "$USAGE" -gt 95 ]; then
          echo "disk_status=critical" >> "$GITHUB_OUTPUT"
        elif [ "$USAGE" -gt 85 ]; then
          echo "disk_status=warning" >> "$GITHUB_OUTPUT"
        else
          echo "disk_status=healthy" >> "$GITHUB_OUTPUT"
        fi

    # Output: harbor_status (healthy | unhealthy). HTTP 200 and 401 both
    # count as healthy.
    - name: "Check Harbor Registry"
      id: harbor
      run: |
        # curl already prints 000 through -w when the connection fails, so
        # the failure path must not print a second code.
        HTTP_CODE=$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 \
          "http://192.168.0.110:5000/v2/" 2>/dev/null || true)
        HTTP_CODE="${HTTP_CODE:-000}"

        if [ "$HTTP_CODE" = "200" ] || [ "$HTTP_CODE" = "401" ]; then
          echo "harbor_status=healthy" >> "$GITHUB_OUTPUT"
          echo "Harbor Registry: OK (HTTP $HTTP_CODE)"
        else
          echo "harbor_status=unhealthy" >> "$GITHUB_OUTPUT"
          echo "::error::Harbor Registry 無法連接"
        fi

    # Output: k8s_status (healthy | unhealthy), based on running
    # `kubectl get nodes` over SSH.
    - name: "Check K8s Connectivity"
      id: k8s
      run: |
        # NOTE(review): StrictHostKeyChecking=no disables host-key
        # verification; consider pinning the host key instead.
        if timeout 10 ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 \
          wooo@192.168.0.120 "kubectl get nodes" 2>/dev/null; then
          echo "k8s_status=healthy" >> "$GITHUB_OUTPUT"
        else
          echo "k8s_status=unhealthy" >> "$GITHUB_OUTPUT"
          echo "::warning::K8s Master 連接失敗"
        fi

    # Aggregates the individual results, posts them to the OpenClaw webhook
    # and fails the job when the overall status is critical.
    - name: "Report to OpenClaw"
      if: always()
      env:
        DOCKER_STATUS: ${{ steps.docker.outputs.docker_status }}
        DISK_STATUS: ${{ steps.disk.outputs.disk_status }}
        DISK_USAGE: ${{ steps.disk.outputs.disk_usage }}
        HARBOR_STATUS: ${{ steps.harbor.outputs.harbor_status }}
        K8S_STATUS: ${{ steps.k8s.outputs.k8s_status }}
        # Passed through the environment instead of being spliced into the
        # script text.
        RUNNER_NAME: ${{ runner.name }}
      run: |
        # A critical disk counts as critical overall; otherwise a disk above
        # 95% would fall through both branches and be reported as healthy.
        OVERALL="healthy"
        if [ "$DOCKER_STATUS" != "healthy" ] || [ "$HARBOR_STATUS" != "healthy" ] || [ "$DISK_STATUS" = "critical" ]; then
          OVERALL="critical"
        elif [ "$DISK_STATUS" = "warning" ] || [ "$K8S_STATUS" != "healthy" ]; then
          OVERALL="warning"
        fi

        # Best effort: a failed webhook call only raises a warning.
        curl -sf --max-time 15 -X POST "${{ env.OPENCLAW_URL }}/api/v1/webhook/runner-health" \
          -H "Content-Type: application/json" \
          -d "{
            \"runner\": \"${RUNNER_NAME}\",
            \"host\": \"${{ env.RUNNER_HOST }}\",
            \"overall_status\": \"${OVERALL}\",
            \"checks\": {
              \"docker\": \"${DOCKER_STATUS}\",
              \"disk\": {\"status\": \"${DISK_STATUS}\", \"usage\": ${DISK_USAGE:-0}},
              \"harbor\": \"${HARBOR_STATUS}\",
              \"k8s\": \"${K8S_STATUS}\"
            },
            \"timestamp\": \"$(date -Iseconds)\"
          }" || echo "::warning::OpenClaw 通知失敗"

        # The check steps only record statuses and never fail by themselves.
        # Exiting non-zero here marks the job failed, which is what jobs
        # gated on `needs: [health-check]` + `if: failure()` react to.
        if [ "$OVERALL" = "critical" ]; then
          echo "::error::Runner 健康狀態: critical"
          exit 1
        fi
|
||||
|
||||
# ===========================================
|
||||
# 自動修復 (如果有問題)
|
||||
# ===========================================
|
||||
auto-repair:
  name: "Auto Repair (if needed)"
  needs: [health-check]
  # Runs only when the health-check job ended in failure.
  if: failure()
  runs-on: [self-hosted, harbor, k8s]
  # Bound the cleanup so a hung Docker daemon cannot hold the runner for the
  # default job timeout (360 minutes).
  timeout-minutes: 10

  steps:
    # Best-effort cleanup: each prune may fail without failing the step.
    - name: "Attempt Docker Cleanup"
      run: |
        echo "=== 嘗試清理 Docker ==="
        df -h /

        # Volumes are deliberately NOT pruned. This job runs unattended and
        # precisely when services may be down, so volumes of stopped
        # containers could otherwise be deleted together with their data.
        docker system prune -f || true
        # Remove unused images older than 7 days (168h).
        docker image prune -a -f --filter "until=168h" || true

        df -h /
|
||||
Reference in New Issue
Block a user