157 lines
5.6 KiB
YAML
157 lines
5.6 KiB
YAML
# =============================================================================
# AWOOOI - Self-hosted Runner Health Check Workflow
# =============================================================================
# Design principles:
#   - Check the runner's status every 10 minutes
#   - Use self-hosted runners only (ubuntu-latest is forbidden)
#
# 🔴 HARD RULE: never use ubuntu-latest (GitHub billing restriction)
# =============================================================================
|
|
|
|
name: Runner Health Check

on:
  schedule:
    # Run every 10 minutes.
    - cron: '*/10 * * * *'
  workflow_dispatch:
    inputs:
      # NOTE(review): no step visible in this file reads this input; either
      # wire it into a notification step or remove it.
      notify_telegram:
        description: '發送 Telegram 通知'
        required: false
        default: true
        type: boolean

# None of the steps in this workflow call the GitHub API or check out the
# repository, so the job token is granted no scopes at all (least privilege).
permissions: {}

# Shared endpoints. Quoted so they are always read as plain strings.
env:
  OPENCLAW_URL: "http://192.168.0.188:8088"
  RUNNER_HOST: "192.168.0.110"

jobs:
|
|
# ===========================================
# Internal health check - runs on the self-hosted runner
# ===========================================
health-check:
  name: "Runner Health Check"
  runs-on: [self-hosted, harbor, k8s]
  timeout-minutes: 5

  steps:
    # =================================================================
    # 2026-03-29 Claude Code: fix _diag/pages file conflicts
    # Root cause: the GitHub Actions runner's diagnostic files are not
    # cleaned up automatically
    # =================================================================
    - name: "Clean Runner Diagnostics"
      run: |
        # Two directory levels above $RUNNER_TEMP; presumably the runner's
        # install root (confirm against this runner's layout).
        RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
        # Best effort: failures here must never fail the health check itself.
        rm -rf "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
        mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
        echo "✅ Cleaned _diag/pages"

    # Output: docker_status = healthy | unhealthy
    - name: "Check Docker Service"
      id: docker
      run: |
        echo "=== Docker 版本 ==="
        # Informational only; the status below is decided by `docker info`.
        docker version --format '{{.Server.Version}}' || echo "DOCKER_FAILED"

        if docker info > /dev/null 2>&1; then
          echo "docker_status=healthy" >> "$GITHUB_OUTPUT"
        else
          echo "docker_status=unhealthy" >> "$GITHUB_OUTPUT"
        fi

    # Outputs: disk_usage (integer percent, only when parseable),
    #          disk_status = healthy | warning | critical | unknown
    - name: "Check Disk Space"
      id: disk
      run: |
        echo "=== 磁碟使用量 ==="
        df -h /

        # -P forces one POSIX-format record per filesystem, so a long device
        # name cannot wrap the row and shift the "Use%" column.
        USAGE=$(df -P / | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')

        # A non-numeric value would make `[ -gt ]` error out and fall through
        # to "healthy"; report it explicitly instead.
        case "$USAGE" in
          ''|*[!0-9]*)
            echo "::warning::Unable to parse disk usage from df output"
            echo "disk_status=unknown" >> "$GITHUB_OUTPUT"
            exit 0
            ;;
        esac

        echo "disk_usage=$USAGE" >> "$GITHUB_OUTPUT"

        if [ "$USAGE" -gt 95 ]; then
          echo "disk_status=critical" >> "$GITHUB_OUTPUT"
        elif [ "$USAGE" -gt 85 ]; then
          echo "disk_status=warning" >> "$GITHUB_OUTPUT"
        else
          echo "disk_status=healthy" >> "$GITHUB_OUTPUT"
        fi

    # Output: harbor_status = healthy | unhealthy
    - name: "Check Harbor Registry"
      id: harbor
      run: |
        # curl's %{http_code} already prints 000 when no response arrives, so
        # the fallback only covers curl producing no output at all (appending
        # a second "000" would yield "000000").
        HTTP_CODE=$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 \
          "http://192.168.0.110:5000/v2/" 2>/dev/null) || true
        HTTP_CODE=${HTTP_CODE:-000}

        # 401 counts as healthy: the registry answered, it just wants auth.
        if [ "$HTTP_CODE" = "200" ] || [ "$HTTP_CODE" = "401" ]; then
          echo "harbor_status=healthy" >> "$GITHUB_OUTPUT"
          echo "Harbor Registry: OK (HTTP $HTTP_CODE)"
        else
          echo "harbor_status=unhealthy" >> "$GITHUB_OUTPUT"
          echo "::error::Harbor Registry 無法連接"
        fi

    # Output: k8s_status = healthy | unhealthy
    - name: "Check K8s Connectivity"
      id: k8s
      run: |
        # BatchMode=yes makes ssh fail immediately instead of waiting on a
        # password/passphrase prompt that nobody can answer in CI.
        # NOTE(review): StrictHostKeyChecking=no skips host-key verification;
        # consider pinning the master's key in known_hosts.
        if timeout 10 ssh -o BatchMode=yes -o StrictHostKeyChecking=no \
          -o ConnectTimeout=5 \
          wooo@192.168.0.125 "kubectl get nodes" 2>/dev/null; then
          echo "k8s_status=healthy" >> "$GITHUB_OUTPUT"
        else
          echo "k8s_status=unhealthy" >> "$GITHUB_OUTPUT"
          echo "::warning::K8s Master 連接失敗"
        fi

    # Aggregates the step outputs and POSTs them to the OpenClaw webhook.
    # Runs even when an earlier step failed; outputs of steps that did not
    # run arrive as empty strings.
    - name: "Report to OpenClaw"
      if: always()
      env:
        DOCKER_STATUS: ${{ steps.docker.outputs.docker_status }}
        DISK_STATUS: ${{ steps.disk.outputs.disk_status }}
        DISK_USAGE: ${{ steps.disk.outputs.disk_usage }}
        HARBOR_STATUS: ${{ steps.harbor.outputs.harbor_status }}
        K8S_STATUS: ${{ steps.k8s.outputs.k8s_status }}
      run: |
        # critical: Docker or Harbor down, or the disk above the critical
        #           threshold (previously a critical disk was reported as
        #           overall "healthy").
        # warning:  any other non-healthy disk state, or K8s unreachable.
        OVERALL="healthy"
        if [ "$DOCKER_STATUS" != "healthy" ] \
          || [ "$HARBOR_STATUS" != "healthy" ] \
          || [ "$DISK_STATUS" = "critical" ]; then
          OVERALL="critical"
        elif [ "$DISK_STATUS" != "healthy" ] || [ "$K8S_STATUS" != "healthy" ]; then
          OVERALL="warning"
        fi

        # Values are read from the environment (workflow-level env plus the
        # runner-provided RUNNER_NAME) rather than spliced into the script
        # text. --max-time keeps a hung endpoint from eating the job timeout.
        curl -sf --max-time 10 -X POST \
          "${OPENCLAW_URL}/api/v1/webhook/runner-health" \
          -H "Content-Type: application/json" \
          -d "{
            \"runner\": \"${RUNNER_NAME}\",
            \"host\": \"${RUNNER_HOST}\",
            \"overall_status\": \"${OVERALL}\",
            \"checks\": {
              \"docker\": \"${DOCKER_STATUS}\",
              \"disk\": {\"status\": \"${DISK_STATUS}\", \"usage\": ${DISK_USAGE:-0}},
              \"harbor\": \"${HARBOR_STATUS}\",
              \"k8s\": \"${K8S_STATUS}\"
            },
            \"timestamp\": \"$(date -Iseconds)\"
          }" || echo "::warning::OpenClaw 通知失敗"

        # NOTE(review): no step in this job exits non-zero on an unhealthy
        # result, so a downstream job gated on failure() only runs when a
        # step crashes or the job times out. Confirm that is intended.
|
|
|
|
# ===========================================
# Auto repair (when something is wrong)
# ===========================================
auto-repair:
  name: "Auto Repair (if needed)"
  needs: [health-check]
  # NOTE(review): this only fires when the health-check job itself fails
  # (step crash or timeout). The checks in that job record "unhealthy" as
  # step outputs without exiting non-zero, so an unhealthy report alone
  # does not appear to trigger this job. Confirm the intended gate.
  if: failure()
  runs-on: [self-hosted, harbor, k8s]
  # Without an explicit limit the job would inherit the 360-minute default,
  # letting a hung prune occupy the self-hosted runner for hours.
  timeout-minutes: 15

  steps:
    - name: "Clean Runner Diagnostics"
      run: |
        # Two directory levels above $RUNNER_TEMP; presumably the runner's
        # install root (confirm against this runner's layout).
        RUNNER_ROOT=$(dirname "$(dirname "$RUNNER_TEMP")")
        # Best effort: never let cleanup failures abort the repair.
        rm -rf "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true
        mkdir -p "$RUNNER_ROOT/_diag/pages" 2>/dev/null || true

    - name: "Attempt Docker Cleanup"
      run: |
        echo "=== 嘗試清理 Docker ==="
        # NOTE(review): `system prune --volumes` also removes stopped
        # containers and unused volumes. If a service on this host is merely
        # stopped, its volume data may be deleted; verify this is acceptable.
        docker system prune -f --volumes || true
        # Remove all unused images older than 7 days (168h).
        docker image prune -a -f --filter "until=168h" || true
        # Show the disk usage after cleanup.
        df -h /
|