diff --git a/docs/runbooks/REBOOT-RECOVERY-SOP.md b/docs/runbooks/REBOOT-RECOVERY-SOP.md index 5f72cadf..12e3174f 100644 --- a/docs/runbooks/REBOOT-RECOVERY-SOP.md +++ b/docs/runbooks/REBOOT-RECOVERY-SOP.md @@ -1,9 +1,9 @@ # AWOOOI 重開機恢復 SOP -> **版本**: v4.0 +> **版本**: v5.0 > **最後更新**: 2026-04-05 下午 (台北時間) > **更新者**: Claude Code (首席架構師) -> **觸發事件**: Prometheus 規則統一部署 + Sentry 啟動自動化 + Sentry 損壞修復 SOP + 全系統自愈閉環 +> **觸發事件**: Harbor Exited(128) Race Condition 根治 + harbor-watchdog 常駐自愈 --- @@ -122,15 +122,36 @@ Git push → Gitea(110:3001) ### 已部署 (2026-04-05) -| 主機 | 腳本 | 部署位置 | systemd service | 狀態 | -|------|------|---------|----------------|------| -| **188** | `awoooi-startup.sh` | `/usr/local/bin/` | `awoooi-startup.service` | ✅ enabled | -| **110** | `awoooi-startup-110.sh` | `/usr/local/bin/` | `awoooi-startup-110.service` | ✅ enabled | -| **120** | K3s 原生 | 系統內建 | `k3s.service` | ✅ enabled | -| **121** | K3s 原生 | 系統內建 | `k3s-agent.service` | ✅ enabled | +| 主機 | 腳本 | 部署位置 | systemd service | 狀態 | 類型 | +|------|------|---------|----------------|------|------| +| **188** | `awoooi-startup.sh` | `/usr/local/bin/` | `awoooi-startup.service` | ✅ enabled | 重開機 oneshot | +| **110** | `awoooi-startup-110.sh` | `/usr/local/bin/` | `awoooi-startup-110.service` | ✅ enabled | 重開機 oneshot | +| **110** | `harbor-watchdog.sh` | `/usr/local/bin/` | `harbor-watchdog.service` | ✅ active | **常駐 watchdog** | +| **120** | K3s 原生 | 系統內建 | `k3s.service` | ✅ enabled | 系統服務 | +| **121** | K3s 原生 | 系統內建 | `k3s-agent.service` | ✅ enabled | 系統服務 | **本地原始碼**: `scripts/reboot-recovery/` +### harbor-watchdog(2026-04-05 新增) + +**問題背景**: Harbor 的 Exited (128) Race Condition +- `restart: always` 在 harbor-log 未就緒時不斷重試,最終進入 backoff 放棄 +- 即使 harbor-log 後來 healthy,其他容器不會自動重試 +- 只靠 startup 腳本無法處理**運行中崩潰**的情況 + +**設計**: +- `Type=simple` 常駐進程(不是 oneshot),systemd 永久監控 +- 每 60 秒輪詢 `http://127.0.0.1:5000/v2/`(401 = healthy) +- 偵測到不健康 → 等 5 秒再確認(避免誤報)→ 執行完整時序修復 +- 修復邏輯:清除 Exited 容器 → 只啟動 harbor-log → 等 
healthy → 啟動全部 + +**查看狀態**: +```bash +journalctl -u harbor-watchdog.service -f # 即時 log +systemctl status harbor-watchdog.service # 服務狀態 +cat /var/log/harbor-watchdog.log # 持久化 log +``` + ### 各腳本覆蓋清單 **188 (7 步驟)**: @@ -142,14 +163,15 @@ Git push → Gitea(110:3001) - Nginx 啟動 - SignOz + MinIO + aiops-network + OpenClaw -**110 (7 步驟)**: +**110 (7 步驟 + harbor-watchdog 常駐)**: - Docker BoltDB 損壞偵測與修復 - 孤兒容器清除 (Exited 128) -- Harbor (harbor-log healthy 後啟動全組件) +- Harbor(harbor-log first 策略:Phase 1 清除 → Phase 2 只啟 harbor-log → Phase 3 等 healthy → Phase 4 啟全部) - Gitea + Langfuse + Monitoring (含 Alertmanager 健康驗證) - SignOz - **Gitea Act Runner** (自動清除過期 .runner 配置) - **Sentry** (/opt/sentry,含 PostgreSQL WAL + Redis RDB 損壞自動修復) +- **harbor-watchdog.service** (常駐,每 60s 自動修復) --- @@ -593,3 +615,4 @@ echo "=== 結果: ${PASS} 通過, ${FAIL} 失敗 ===" | v2.0 | 2026-04-05 上午 | 188 啟動腳本 + 110 啟動腳本 + Harbor race condition 修復 | | v3.0 | 2026-04-05 下午 | 完整架構重盤 + Gitea Runner 自動化 + 告警鏈路根因修復 + NetworkPolicy 修正 | | v4.0 | 2026-04-05 下午 | Prometheus 規則統一部署 (28條) + Sentry startup (Step 7) + Sentry損壞修復SOP + 規則未部署診斷樹 + E2E腳本加入Sentry/Prometheus驗證 | +| v5.0 | 2026-04-05 下午 | Harbor Exited(128) Race Condition 根治:startup-110.sh 改用 harbor-log first 策略 + 新增 harbor-watchdog.service 常駐自愈(curl -f bug 修正)| diff --git a/scripts/reboot-recovery/awoooi-startup-110.sh b/scripts/reboot-recovery/awoooi-startup-110.sh index 1dfdc595..d8c936ba 100644 --- a/scripts/reboot-recovery/awoooi-startup-110.sh +++ b/scripts/reboot-recovery/awoooi-startup-110.sh @@ -69,29 +69,58 @@ else fi # ────────────────────────────────────────────── -# STEP 3: Harbor(注意:先等 harbor-log healthy) +# STEP 3: Harbor(嚴格時序:harbor-log first) +# 2026-04-05 Claude Code: 修復 Race Condition — +# 舊邏輯:compose up -d 全部 → 等 harbor-log healthy → compose up -d 全部 +# 問題:第一次 up 時 harbor-core/db/portal 嘗試連 syslog:1514 失敗 exit, +# restart:always 重試直到放棄,第二次 compose up 無效 +# 新邏輯:只啟動 harbor-log → 等 healthy → 啟動其餘全部 # ────────────────────────────────────────────── 
log "[3/5] 啟動 Harbor..."
 HARBOR_DIR="/home/wooo/harbor/harbor"
 if [ -f "$HARBOR_DIR/docker-compose.yml" ]; then
     cd "$HARBOR_DIR"
 
-    docker compose up -d 2>&1 | tail -5
-    # 等待 harbor-log 變 healthy(最多 60 秒)
-    log "等待 harbor-log healthy..."
-    for i in $(seq 1 12); do
+    # Phase 1: remove Exited harbor-* containers; a single awk replaces grep|grep|awk so "no match" is never a pipeline failure
+    HARBOR_EXITED=$(docker ps -a --format '{{.Names}} {{.Status}}' | awk '$1 ~ /^harbor-/ && $2 == "Exited" { print $1 }' || true)
+    if [ -n "$HARBOR_EXITED" ]; then
+        log "清除 Exited Harbor 容器: $(echo "$HARBOR_EXITED" | tr '\n' ' ')"
+        echo "$HARBOR_EXITED" | xargs docker rm -f 2>/dev/null || true
+    fi
+
+    # Phase 2: start harbor-log only (the syslog receiver the other containers depend on)
+    log "Phase 2: 啟動 harbor-log..."
+    docker compose up -d harbor-log 2>&1 | tail -3
+
+    # Phase 3: wait for harbor-log to become healthy (18 x 5s = 90s at most)
+    log "Phase 3: 等待 harbor-log healthy(最多 90s)..."
+    for i in $(seq 1 18); do
         STATUS=$(docker inspect --format='{{.State.Health.Status}}' harbor-log 2>/dev/null || echo "missing")
-        [ "$STATUS" = "healthy" ] && break
+        if [ "$STATUS" = "healthy" ]; then
+            log "✅ harbor-log healthy (${i}x5s)"
+            break
+        fi
         sleep 5
     done
 
     STATUS=$(docker inspect --format='{{.State.Health.Status}}' harbor-log 2>/dev/null || echo "unknown")
     if [ "$STATUS" = "healthy" ]; then
-        # harbor-log healthy 後重啟其他組件(它們依賴 1514 port)
+        # Phase 4: start the remaining components only after harbor-log is ready
+        log "Phase 4: 啟動 Harbor 全組件..."
         docker compose up -d 2>&1 | tail -5
-        log "✅ Harbor 啟動完成"
+
+        # Verify
+        sleep 5
+        # No -f: /v2/ answers 401 without credentials. On a curl failure -w has already printed 000, so nothing is appended (the old `|| echo "0"` logged "0000")
+        HTTP=$(curl -s --max-time 5 -o /dev/null -w "%{http_code}" http://127.0.0.1:5000/v2/ 2>/dev/null) || HTTP="${HTTP:-000}"
+        if [ "$HTTP" = "401" ] || [ "$HTTP" = "200" ]; then
+            log "✅ Harbor 就緒 (HTTP $HTTP)"
+        else
+            log "⚠️ Harbor API 未就緒 (HTTP $HTTP),通常需再等 10-20s"
+        fi
     else
-        log "⚠️ harbor-log 未 healthy,Harbor 可能需要手動檢查"
+        log "❌ harbor-log 未 healthy (status=$STATUS),跳過其餘 Harbor 容器"
+        log " 手動修復: cd $HARBOR_DIR && docker compose up -d"
     fi
 else
     log "⚠️ 找不到 Harbor compose 檔案"
diff --git a/scripts/reboot-recovery/deploy-to-110.sh b/scripts/reboot-recovery/deploy-to-110.sh
index 2583b4d4..c2cdd38e 100644
--- a/scripts/reboot-recovery/deploy-to-110.sh
+++ b/scripts/reboot-recovery/deploy-to-110.sh
@@ -1,41 +1,58 @@
 #!/bin/bash
-# 將 awoooi-startup-110 部署到 192.168.0.110
+# Deploy awoooi-startup-110 + harbor-watchdog to 192.168.0.110
+# 2026-04-05 Claude Code: added harbor-watchdog (automatic recovery from runtime crashes)
 # 執行方式: bash deploy-to-110.sh
 
 set -euo pipefail
 
 HOST="wooo@192.168.0.110"
+# SECURITY(review): plaintext sudo password committed to the repo — rotate it and read it from the environment or a prompt instead
 PASS="0936223270"
 
-echo "=== 部署 awoooi-startup-110 到 192.168.0.110 ==="
+echo "=== 部署 awoooi-startup-110 + harbor-watchdog 到 192.168.0.110 ==="
 
 # 1. 上傳腳本
-echo "[1/4] 上傳啟動腳本..."
+echo "[1/5] 上傳啟動腳本..."
 scp awoooi-startup-110.sh "$HOST:/tmp/awoooi-startup-110.sh"
-
-# 2. 上傳 systemd unit
-echo "[2/4] 上傳 systemd unit..."
 scp awoooi-startup-110.service "$HOST:/tmp/awoooi-startup-110.service"
 
-# 3. 安裝
-echo "[3/4] 安裝..."
-ssh "$HOST" "
-echo '$PASS' | sudo -S bash -c '
+# 2. Upload the watchdog
+echo "[2/5] 上傳 harbor-watchdog..."
+scp harbor-watchdog.sh "$HOST:/tmp/harbor-watchdog.sh"
+scp harbor-watchdog.service "$HOST:/tmp/harbor-watchdog.service"
+
+# 3. Install the startup service
+echo "[3/5] 安裝 startup service..."
+ssh "$HOST" "echo '$PASS' | sudo -S bash -c 'set -e
   cp /tmp/awoooi-startup-110.sh /usr/local/bin/awoooi-startup-110.sh
   chmod +x /usr/local/bin/awoooi-startup-110.sh
   cp /tmp/awoooi-startup-110.service /etc/systemd/system/awoooi-startup-110.service
   systemctl daemon-reload
   systemctl enable awoooi-startup-110.service
-  echo done
-'
-"
+  echo startup OK
+'"
 
-# 4. 驗證
-echo "[4/4] 驗證安裝..."
-ssh "$HOST" "echo '$PASS' | sudo -S systemctl is-enabled awoooi-startup-110.service && echo '✅ 已啟用' || echo '❌ 啟用失敗'"
+# 4. Install the watchdog service and start it right away (set -e stops the remote shell at the first failing command, here and in step 3)
+echo "[4/5] 安裝並啟動 harbor-watchdog..."
+ssh "$HOST" "echo '$PASS' | sudo -S bash -c 'set -e
+  cp /tmp/harbor-watchdog.sh /usr/local/bin/harbor-watchdog.sh
+  chmod +x /usr/local/bin/harbor-watchdog.sh
+  cp /tmp/harbor-watchdog.service /etc/systemd/system/harbor-watchdog.service
+  systemctl daemon-reload
+  systemctl enable harbor-watchdog.service
+  systemctl restart harbor-watchdog.service
+  echo watchdog OK
+'"
+
+# 5. Verify — the result is evaluated locally, so a failed check ends the deployment with exit 1 instead of falling through to the success banner
+echo "[5/5] 驗證..."
+ssh "$HOST" "echo '$PASS' | sudo -S systemctl is-active harbor-watchdog.service" && echo '✅ harbor-watchdog active' || { echo '❌ harbor-watchdog 失敗'; exit 1; }
+ssh "$HOST" "echo '$PASS' | sudo -S systemctl is-enabled awoooi-startup-110.service" && echo '✅ startup-110 enabled' || { echo '❌ startup-110 未啟用'; exit 1; }
 echo ""
 echo "✅ 部署完成!"
-echo "下次重開機後,110 會自動執行修復並啟動所有服務。"
 echo ""
-echo "手動測試執行:"
-echo " ssh $HOST 'echo $PASS | sudo -S /usr/local/bin/awoooi-startup-110.sh'"
+echo "harbor-watchdog: 每 60 秒檢查 Harbor 健康,不健康自動修復"
+echo "startup-110: 下次重開機後自動恢復所有服務"
+echo ""
+echo "查看 watchdog 狀態:"
+echo " journalctl -u harbor-watchdog.service -f"
diff --git a/scripts/reboot-recovery/harbor-watchdog.service b/scripts/reboot-recovery/harbor-watchdog.service
new file mode 100644
index 00000000..f00c0148
--- /dev/null
+++ b/scripts/reboot-recovery/harbor-watchdog.service
@@ -0,0 +1,22 @@
+[Unit]
+Description=Harbor Watchdog — 自動偵測並修復 Harbor 崩潰
+# 2026-04-05 Claude Code: works around Harbor containers stuck in Exited(128)
+After=network-online.target docker.service awoooi-startup-110.service
+Wants=network-online.target
+Requires=docker.service
+
+[Service]
+Type=simple
+# harbor-watchdog.sh is an endless loop that systemd keeps supervising
+ExecStart=/usr/local/bin/harbor-watchdog.sh
+# Restart 30s after the watchdog ends unexpectedly (e.g. a script bug)
+Restart=on-failure
+RestartSec=30
+# Logging
+StandardOutput=journal
+StandardError=journal
+# The watchdog needs root to drive docker (same as the startup script)
+User=root
+
+[Install]
+WantedBy=multi-user.target
diff --git a/scripts/reboot-recovery/harbor-watchdog.sh b/scripts/reboot-recovery/harbor-watchdog.sh
new file mode 100644
index 00000000..b002fa53
--- /dev/null
+++ b/scripts/reboot-recovery/harbor-watchdog.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+# Harbor Watchdog — automatic recovery when Harbor crashes at runtime
+# 2026-04-05 Claude Code: Harbor did not come back by itself after a crash
+# that happened outside of a reboot.
+#
+# Root cause, as recorded by the author:
+#   harbor-core/db exit at start-up when syslog:1514 is unreachable. Once
+#   Docker has stopped retrying them they stay Exited (128), even after
+#   harbor-log has turned healthy.
+#
+# Fix: poll the Harbor API every CHECK_INTERVAL seconds and, when it is
+#   unhealthy, run the ordered repair (harbor-log first).
+#
+# Deployed to:  /usr/local/bin/harbor-watchdog.sh (on 192.168.0.110)
+# systemd unit: /etc/systemd/system/harbor-watchdog.service (Type=simple, loop)
+#
+# The settings below may be overridden from the environment; the defaults are
+# the values that used to be hard-coded.
+
+set -u
+
+HARBOR_DIR="${HARBOR_DIR:-/home/wooo/harbor/harbor}"
+LOG="${HARBOR_WATCHDOG_LOG:-/var/log/harbor-watchdog.log}"
+CHECK_INTERVAL="${HARBOR_WATCHDOG_INTERVAL:-60}"  # seconds
+HARBOR_URL="${HARBOR_URL:-http://127.0.0.1:5000/v2/}"
+
+# Write a timestamped line to stdout and append it to $LOG.
+log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] [watchdog] $*" | tee -a "$LOG"; }
+
+# Succeed when the registry endpoint answers 401 or 200.
+harbor_is_healthy() {
+    local code
+    # No -f: /v2/ answers 401 without credentials and -f would report that as
+    # a failure. Any curl failure is normalised to 000.
+    code=$(curl -s --max-time 5 -o /dev/null -w "%{http_code}" "$HARBOR_URL" 2>/dev/null) || code="000"
+    [ "$code" = "401" ] || [ "$code" = "200" ]
+}
+
+# Print the names of harbor-* containers whose status is Exited, one per line.
+exited_harbor_containers() {
+    docker ps -a --format '{{.Names}} {{.Status}}' |
+        awk '$1 ~ /^harbor-/ && $2 == "Exited" { print $1 }'
+}
+
+# Ordered repair: remove Exited harbor-* containers, start harbor-log alone,
+# wait for it to become healthy, then start every component.
+# Returns 0 when the API answers afterwards, 1 otherwise.
+repair_harbor() {
+    local exited out status i
+
+    log "⚠️ Harbor 不健康,開始修復程序..."
+    cd "$HARBOR_DIR" || { log "❌ 找不到 Harbor 目錄"; return 1; }
+
+    # Step 1: remove Exited harbor-* containers
+    exited=$(exited_harbor_containers)
+    if [ -n "$exited" ]; then
+        log "清除 Exited 容器: $(echo "$exited" | tr '\n' ' ')"
+        # Best effort: a container may vanish between listing and removal.
+        echo "$exited" | xargs docker rm -f 2>/dev/null || true
+    fi
+
+    # Step 2: start harbor-log only. A failure is logged instead of hidden;
+    # the health wait below decides whether the repair goes on.
+    log "啟動 harbor-log..."
+    out=$(docker compose up -d harbor-log 2>&1) || log "❌ harbor-log 啟動失敗: $out"
+
+    # Step 3: wait up to 90s (18 x 5s) for harbor-log to become healthy
+    for i in $(seq 1 18); do
+        status=$(docker inspect --format='{{.State.Health.Status}}' harbor-log 2>/dev/null || echo "missing")
+        [ "$status" = "healthy" ] && break
+        sleep 5
+    done
+
+    status=$(docker inspect --format='{{.State.Health.Status}}' harbor-log 2>/dev/null || echo "unknown")
+    if [ "$status" != "healthy" ]; then
+        log "❌ harbor-log 未 healthy (status=$status),放棄本次修復"
+        return 1
+    fi
+
+    # Step 4: start everything. stderr used to be discarded, which hid every
+    # error message; it is now captured and logged when the command fails.
+    log "harbor-log healthy,啟動全組件..."
+    out=$(docker compose up -d 2>&1) || log "❌ docker compose up 失敗: $(echo "$out" | tail -5)"
+
+    sleep 10
+    if harbor_is_healthy; then
+        log "✅ Harbor 修復成功"
+        return 0
+    fi
+    log "❌ Harbor 修復後仍不健康,需人工介入"
+    log " 手動: cd $HARBOR_DIR && docker compose down && docker compose up -d harbor-log && sleep 30 && docker compose up -d"
+    return 1
+}
+
+log "=== Harbor Watchdog 啟動 (interval=${CHECK_INTERVAL}s) ==="
+
+while true; do
+    if ! harbor_is_healthy; then
+        # Check again after 5s so a short blip does not trigger a repair.
+        sleep 5
+        if ! harbor_is_healthy; then
+            repair_harbor
+        fi
+    fi
+    sleep "$CHECK_INTERVAL"
+done