Files
awoooi/scripts/reboot-recovery/harbor-watchdog.sh
OG T 3f7a742683 fix(infra): 首席架構師 Review 修正 — C1/I1/I2/I3/I4/S1
C1: 移除 deploy-to-110.sh 密碼明文,改用 SSH key + sudoers NOPASSWD
I1: 加入 /var/lock/harbor-repair.lock 防止 watchdog 與 startup 並行修復
I2: docker compose 的 stderr 不再靜默(改用 tee -a log | while read 輸出)
I3: watchdog while loop 包在子 shell + || true,子 shell 異常不終止 watchdog
I4: repair_harbor 關鍵指令(harbor-log 啟動)加入退出碼捕捉
S1: 修復後驗證等待從 5s/10s 改為 30s(harbor-core 初始化需要足夠時間)
S2: docker ps 改用 --filter status=exited 取代 grep/awk

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 12:18:41 +08:00

114 lines
4.4 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# Harbor Watchdog — 運行中崩潰自動恢復
# 2026-04-05 Claude Code: 解決 Harbor Exited(128) 死鎖問題
# 首席架構師 Review 2026-04-05: 修正 I1(lockfile) I2(stderr) I3(loop防護) I4(指令退出碼) S1(驗證等待)
#
# 問題根因:
# Docker restart:always 在 Exited(128) 情況下重試後進入 backoff 放棄
# harbor-core/db/portal 啟動時連 syslog:1514(harbor-log 未就緒)失敗 exit
#
# 設計:此 watchdog 每 60 秒輪詢 Harbor API
# 不健康 → 等 5s 確認 → 執行 4 Phase 修復(harbor-log first 策略)
# 使用 lockfile 防止與 startup-110.sh 並行修復(I1)
#
# 部署位置: /usr/local/bin/harbor-watchdog.sh (on 192.168.0.110)
# systemd unit: /etc/systemd/system/harbor-watchdog.service
# --- Configuration -----------------------------------------------------------
# Seconds to sleep between two health polls.
CHECK_INTERVAL=60
# Directory the docker compose commands are run from.
HARBOR_DIR='/home/wooo/harbor/harbor'
# Lock file used to serialise repair runs (review item I1).
LOCKFILE='/var/lock/harbor-repair.lock'
# File every log line is appended to.
LOG='/var/log/harbor-watchdog.log'

# --- Shell options -----------------------------------------------------------
# `set -e` is deliberately NOT enabled: the watchdog is a long-running loop and
# -e would terminate the script on any failing command. Exit codes of critical
# commands must therefore be checked explicitly (review item I4).
set -u
set -o pipefail
# Print one timestamped "[watchdog]" line to stdout and append it to $LOG.
# Arguments: the message words (joined with a single space).
# Returns:   the status of the printf | tee pipeline.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] [watchdog] %s\n' "$stamp" "$*" | tee -a "$LOG"
}
# Probe the local registry endpoint and report whether Harbor answers.
# Returns: 0 when the HTTP status is 200 or 401, 1 for anything else
#          (including a failed or timed-out request).
harbor_is_healthy() {
  local http_status
  # /v2/ answers 401 when authentication is required, so curl's -f must not be
  # used here: it would turn that 401 into a failure (exit 22).
  # 127.0.0.1 is used instead of a hostname to avoid IPv6 resolution issues.
  http_status=$(curl -s --max-time 5 -o /dev/null -w "%{http_code}" \
    http://127.0.0.1:5000/v2/ 2>/dev/null) || http_status="000"
  case "$http_status" in
    200 | 401) return 0 ;;
    *) return 1 ;;
  esac
}
#######################################
# Forward every line read from stdin to log(), prefixed with one space.
# A final line without a trailing newline is logged as well.
#######################################
_log_harbor_output() {
  local line
  while IFS= read -r line || [[ -n "$line" ]]; do
    log " $line"
  done
}

#######################################
# Run the four repair phases. Must be called with the repair lock held.
# Globals:  HARBOR_DIR (read)
# Returns:  0 when Harbor answers the health probe after the repair,
#           1 when any phase fails or Harbor is still unhealthy afterwards.
#######################################
_repair_harbor_phases() {
  log "⚠️ 開始修復 Harbor(4 Phase 策略)..."
  cd "$HARBOR_DIR" || { log "❌ 找不到 Harbor 目錄: $HARBOR_DIR"; return 1; }

  # Phase 1: remove every exited harbor-* container so that stale containers
  # no longer keep the stack stuck (the restart back-off deadlock described in
  # the file header). A failing `docker ps` simply yields an empty list.
  local -a exited=()
  mapfile -t exited < <(docker ps -a --filter name="harbor-" \
    --filter status=exited --format "{{.Names}}" 2>/dev/null)
  if ((${#exited[@]} > 0)); then
    log "Phase 1: 清除 Exited 容器: ${exited[*]}"
    # Best effort: a failed removal is logged but does not abort the repair.
    docker rm -f "${exited[@]}" 2>&1 | _log_harbor_output || true
  else
    log "Phase 1: 無 Exited 容器"
  fi

  # Phase 2: start harbor-log only. stderr is folded into the log (I2) and the
  # exit code of docker itself - not of the logging stage - is checked (I4).
  log "Phase 2: 啟動 harbor-log..."
  docker compose up -d harbor-log 2>&1 | _log_harbor_output
  if ((PIPESTATUS[0] != 0)); then
    log "❌ harbor-log 啟動指令失敗"
    return 1
  fi

  # Phase 3: poll the harbor-log health status, 18 probes 5 s apart (90 s).
  log "Phase 3: 等待 harbor-log healthy(最多 90s)..."
  local attempt status="missing"
  for ((attempt = 1; attempt <= 18; attempt++)); do
    status=$(docker inspect --format='{{.State.Health.Status}}' harbor-log \
      2>/dev/null) || status="missing"
    if [[ "$status" == "healthy" ]]; then
      log "✅ harbor-log healthy (${attempt}x5s)"
      break
    fi
    sleep 5
  done
  if [[ "$status" != "healthy" ]]; then
    # The loop ended on a sleep; take one last reading before giving up.
    status=$(docker inspect --format='{{.State.Health.Status}}' harbor-log \
      2>/dev/null) || status="unknown"
  fi
  if [[ "$status" != "healthy" ]]; then
    log "❌ harbor-log 未 healthy (status=$status),放棄本次修復"
    return 1
  fi

  # Phase 4: bring up the whole stack. The exit code is intentionally ignored;
  # the health probe below is what decides success.
  log "Phase 4: 啟動 Harbor 全組件..."
  docker compose up -d 2>&1 | _log_harbor_output || true

  # S1: give harbor-core 30 s to finish initialising before verifying.
  log "等待 30s 讓 harbor-core 完成初始化..."
  sleep 30
  if harbor_is_healthy; then
    log "✅ Harbor 修復成功"
    return 0
  fi
  log "❌ Harbor 修復後仍不健康,需人工介入"
  log " 手動: cd $HARBOR_DIR && docker compose down && docker compose up -d harbor-log && sleep 60 && docker compose up -d"
  return 1
}

#######################################
# Repair Harbor using the "harbor-log first" strategy, serialised by a lock.
#
# The whole repair runs in a subshell that owns file descriptor 9 on
# $LOCKFILE. Leaving the subshell closes the descriptor, so the flock is
# released on every exit path and the `cd` done during the repair never
# changes the caller's working directory.
#
# Globals:  LOCKFILE, HARBOR_DIR (read)
# Returns:  0 when Harbor is healthy after the repair, or when another
#           process holds the lock and this run is skipped;
#           non-zero when the lock file cannot be opened, a phase fails, or
#           Harbor is still unhealthy after the repair.
#######################################
repair_harbor() {
  (
    # I1: the lock prevents a repair running in parallel with startup-110.sh.
    # Check the open explicitly: otherwise flock would fail on a bad
    # descriptor and be misreported as "another repair in progress".
    if ! exec 9>"$LOCKFILE"; then
      log "❌ 無法開啟 lockfile: $LOCKFILE"
      exit 1
    fi
    if ! flock -n 9; then
      log "另一修復程序正在進行,跳過本次修復"
      exit 0
    fi
    _repair_harbor_phases
  )
}
log "=== Harbor Watchdog 啟動 (interval=${CHECK_INTERVAL}s) ==="

# I3: each check cycle runs in its own subshell, so an unexpected abort inside
# it (for example an unset variable under `set -u`) ends only that cycle and
# never the watchdog itself. A non-zero cycle is logged and polling continues.
until false; do
  if ! (
    harbor_is_healthy && exit 0
    # Probe again after 5 s so a brief blip does not trigger a repair.
    sleep 5
    harbor_is_healthy && exit 0
    repair_harbor
  ); then
    log "⚠️ 本次檢查週期發生未預期錯誤,繼續下一週期"
  fi
  sleep "$CHECK_INTERVAL"
done