From ad4abefcd93b0e212a8c53e416647c3a08cda1ea Mon Sep 17 00:00:00 2001
From: OG T
Date: Sun, 5 Apr 2026 01:42:52 +0800
Subject: [PATCH] =?UTF-8?q?fix(k8s+ops):=20=E4=BF=AE=E5=BE=A9=E5=91=8A?=
 =?UTF-8?q?=E8=AD=A6=E9=8F=88=E8=B7=AF=20+=20Gitea=20runner=20=E8=87=AA?=
 =?UTF-8?q?=E5=8B=95=E5=95=9F=E5=8B=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## 修復項目

1. NetworkPolicy allow-nginx-ingress 加入 192.168.0.110
   - Alertmanager (在 110) 需要從 110 直接 POST webhook 到 API pod
   - 修復前: 110 被 NetworkPolicy default-deny 阻擋,webhook timeout
   - 修復後: 110 加入 ingress 白名單,告警鏈路恢復

2. awoooi-startup-110.sh 加入 Gitea Act Runner
   - Step 6: 啟動 /home/wooo/act-runner (gitea-runner container)
   - 修復前: 重開機後 runner 離線,CD pipeline 全面失效
   - 修復後: runner 自動重啟,若配置過期自動清除重新註冊

Co-Authored-By: Claude Sonnet 4.6
---
Review notes (v2). Text between "---" and the diffstat is ignored by git am.

- This patch was reflowed from a whitespace-collapsed copy. Indentation of
  context lines and the column alignment of the final summary `log` lines
  are reconstructed and may not match the tree byte-for-byte. Regenerate
  with `git format-patch`, or apply with `git apply --ignore-whitespace`.
  The post-image blob ids on the `index` lines are stale for the same reason.
- STEP 6: `.runner` is now removed only when its address was actually read
  and differs from the expected Gitea URL. Previously any python3 or JSON
  failure produced an empty string and wiped a valid registration. The file
  path is passed through argv instead of being pasted into Python source.
- STEP 6: the connectivity check merges stderr into the pipe (`2>&1`)
  instead of discarding it, and looks at the last 20 log lines, not 5.
- Alertmanager retry: `a && b || c` replaced with an explicit if/else.
- Comments on added lines are now in English. Log messages are unchanged.
- NOTE(review): 192.168.0.110 is added to the rule whose visible port list
  shows TCP 3000, while the commit message says the webhook targets the API
  pod (the preceding rule uses TCP 8000). Confirm this rule also admits the
  API port; the hunk context ends before the full port list is visible.

 k8s/awoooi-prod/02-network-policy.yaml        |  3 +
 scripts/reboot-recovery/awoooi-startup-110.sh | 65 +++++++++++++++++--
 2 files changed, 62 insertions(+), 6 deletions(-)

diff --git a/k8s/awoooi-prod/02-network-policy.yaml b/k8s/awoooi-prod/02-network-policy.yaml
index 038d4c24..e7b16b39 100644
--- a/k8s/awoooi-prod/02-network-policy.yaml
+++ b/k8s/awoooi-prod/02-network-policy.yaml
@@ -43,11 +43,14 @@ spec:
         - protocol: TCP
           port: 8000
     # 允許 K3s NodePort 流量 (SNAT 後源 IP 變為節點 IP)
+    # 2026-04-05 Claude Code: add 110 — Alertmanager must send webhooks from 110 to the API
     - from:
         - ipBlock:
             cidr: 192.168.0.120/32  # K3s Master (mon)
         - ipBlock:
             cidr: 192.168.0.121/32  # K3s Worker (mon1)
+        - ipBlock:
+            cidr: 192.168.0.110/32  # DevOps host (Alertmanager source)
       ports:
         - protocol: TCP
           port: 3000
diff --git a/scripts/reboot-recovery/awoooi-startup-110.sh b/scripts/reboot-recovery/awoooi-startup-110.sh
index 25b7e38b..ddc8eccf 100644
--- a/scripts/reboot-recovery/awoooi-startup-110.sh
+++ b/scripts/reboot-recovery/awoooi-startup-110.sh
@@ -100,7 +100,7 @@ fi
 # ──────────────────────────────────────────────
 # STEP 4: 其他服務(Gitea, Langfuse, Monitoring)
 # ──────────────────────────────────────────────
-log "[4/5] 啟動其他服務..."
+log "[4/6] 啟動其他服務(Gitea, Langfuse, Monitoring)..."

 GITEA_DIR="/home/wooo/gitea"
 if [ -f "$GITEA_DIR/docker-compose.yml" ]; then
@@ -120,13 +120,26 @@ MONITORING_DIR="/home/wooo/monitoring"
 if [ -f "$MONITORING_DIR/docker-compose.yml" ]; then
     cd "$MONITORING_DIR"
     docker compose up -d 2>&1 | tail -3
-    log "✅ Monitoring 啟動指令已發送"
+    log "✅ Monitoring (Prometheus/Grafana/Alertmanager) 啟動指令已發送"
+    sleep 10
+    # Check that Alertmanager came up; retry once after a longer wait
+    if curl -sf --max-time 5 http://localhost:9093/-/healthy >/dev/null 2>&1; then
+        log "✅ Alertmanager healthy"
+    else
+        log "⚠️ Alertmanager 尚未就緒,等待 20 秒..."
+        sleep 20
+        if curl -sf --max-time 5 http://localhost:9093/-/healthy >/dev/null 2>&1; then
+            log "✅ Alertmanager 就緒"
+        else
+            log "❌ Alertmanager 未就緒,需手動檢查"
+        fi
+    fi
 fi

 # ──────────────────────────────────────────────
 # STEP 5: SignOz
 # ──────────────────────────────────────────────
-log "[5/5] 啟動 SignOz..."
+log "[5/6] 啟動 SignOz..."
 SIGNOZ_DIR="/home/wooo/signoz/deploy/docker"
 if [ -f "$SIGNOZ_DIR/docker-compose.yaml" ]; then
     cd "$SIGNOZ_DIR"
@@ -134,12 +147,52 @@ if [ -f "$SIGNOZ_DIR/docker-compose.yaml" ]; then
     log "✅ SignOz 啟動指令已發送"
 fi

+# ──────────────────────────────────────────────
+# STEP 6: Gitea Act Runner (CI/CD core)
+# 2026-04-05 Claude Code: added — fixes the Gitea runner staying offline (CD broken) after a reboot
+# Important: the runner must be started only after the Gitea server is up
+# ──────────────────────────────────────────────
+log "[6/6] 啟動 Gitea Act Runner..."
+RUNNER_DIR="/home/wooo/act-runner"
+if [ -f "$RUNNER_DIR/docker-compose.yml" ]; then
+    # If the old .runner registration points at a stale Gitea address, remove it
+    # so the runner registers again. Delete only when the address was actually
+    # read: a python3 or JSON failure must not wipe a valid registration.
+    RUNNER_FILE="$RUNNER_DIR/data/.runner"
+    if [ -f "$RUNNER_FILE" ]; then
+        if OLD_URL=$(python3 -c 'import json, sys; print(json.load(open(sys.argv[1])).get("address", ""))' "$RUNNER_FILE" 2>/dev/null); then
+            if [ "$OLD_URL" != "http://192.168.0.110:3001" ]; then
+                log "⚠️ runner 配置過期 ($OLD_URL),清除重新註冊..."
+                rm -f "$RUNNER_FILE" || true
+            fi
+        else
+            log "⚠️ 無法解析 runner 配置 ($RUNNER_FILE),保留現有註冊"
+        fi
+    fi
+
+    cd "$RUNNER_DIR"
+    docker compose up -d 2>&1 | tail -3
+    sleep 15
+
+    # Verify the runner connected to Gitea. Merge stderr into the pipe: docker logs
+    # replays the container's stderr stream on its own stderr (non-TTY containers).
+    if docker logs gitea-runner --tail 20 2>&1 | grep -q "SUCCESS\|Connected\|Listening"; then
+        log "✅ Gitea Act Runner 已連線"
+    else
+        log "⚠️ Gitea Act Runner 可能尚未連線,查看: docker logs gitea-runner"
+    fi
+else
+    log "⚠️ 找不到 act-runner compose 檔案: $RUNNER_DIR/docker-compose.yml"
+fi
+
 # ──────────────────────────────────────────────
 # 完成
 # ──────────────────────────────────────────────
 log "=== 192.168.0.110 啟動序列完成 ==="
-log "Harbor:  http://192.168.0.110:5000"
-log "Gitea:   http://192.168.0.110:3001"
-log "Grafana: http://192.168.0.110:3002"
+log "Harbor:       http://192.168.0.110:5000"
+log "Gitea:        http://192.168.0.110:3001"
+log "Grafana:      http://192.168.0.110:3002"
+log "Alertmanager: http://192.168.0.110:9093"
+log "Gitea Runner: docker logs gitea-runner"

 exit 0