C1: 移除 deploy-to-110.sh 密碼明文,改用 SSH key + sudoers NOPASSWD I1: 加入 /var/lock/harbor-repair.lock 防止 watchdog 與 startup 並行修復 I2: docker compose 的 stderr 不再靜默(改用 tee -a log | while read 輸出) I3: watchdog while loop 包在子 shell + || true,子 shell 異常不終止 watchdog I4: repair_harbor 關鍵指令(harbor-log 啟動)加入退出碼捕捉 S1: 修復後驗證等待從 5s/10s 改為 30s(harbor-core 初始化需要足夠時間) S2: docker ps 改用 --filter status=exited 取代 grep/awk Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
114 lines
4.4 KiB
Bash
114 lines
4.4 KiB
Bash
#!/bin/bash
# Harbor Watchdog — automatic recovery when Harbor crashes while running
# 2026-04-05 Claude Code: resolves the Harbor Exited(128) deadlock problem
# Chief architect review 2026-04-05: fixed I1 (lockfile), I2 (stderr), I3 (loop guard), I4 (command exit codes), S1 (verification wait)
#
# Root cause:
#   With Exited(128), Docker restart:always retries, then enters backoff and gives up
#   harbor-core/db/portal fail to connect to syslog:1514 at startup (harbor-log not ready) and exit
#
# Design: this watchdog polls the Harbor API every 60 seconds;
#   unhealthy → wait 5s and confirm → run the 4-phase repair (harbor-log-first strategy)
#   A lockfile prevents repairing concurrently with startup-110.sh (I1)
#
# Deployed at: /usr/local/bin/harbor-watchdog.sh (on 192.168.0.110)
# systemd unit: /etc/systemd/system/harbor-watchdog.service

# Watchdog configuration. Marked readonly because nothing in this script
# reassigns these values after start-up.
readonly HARBOR_DIR="/home/wooo/harbor/harbor"    # directory repair_harbor cd's into before running docker compose
readonly LOG="/var/log/harbor-watchdog.log"       # file that log() appends to
readonly LOCKFILE="/var/lock/harbor-repair.lock"  # file repair_harbor locks with flock
readonly CHECK_INTERVAL=60                        # seconds slept between health checks

# NOTE: set -e is deliberately NOT used. The watchdog is a long-running loop,
# and -e would terminate the script as soon as any command fails.
# The exit codes of all critical commands must be captured explicitly (I4).
#   -u           expanding an unset variable is an error
#   -o pipefail  a pipeline's status is that of the last stage that failed
set -uo pipefail

#######################################
# Write one timestamped, "[watchdog]"-tagged line to stdout and append the
# same line to the log file.
# Globals:   LOG (read) - path of the file tee appends to
# Arguments: message words; they are joined via "$*"
# Outputs:   the formatted line on stdout
# Returns:   status of the printf | tee pipeline
#######################################
log() {
  # printf '%s' prints the message verbatim regardless of its content.
  printf '[%s] [watchdog] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG"
}

#######################################
# Probe http://127.0.0.1:5000/v2/ and report whether Harbor is answering.
# Arguments: none
# Returns:   0 when the HTTP status code is 401 or 200, non-zero otherwise
#            (including when curl itself fails or times out after 5 seconds)
#######################################
harbor_is_healthy() {
  local code
  # NOTE: Harbor's /v2/ normally answers 401 (authentication required), so
  # curl -f cannot be used (-f treats 401 as a failure, exit 22).
  # 127.0.0.1 is used to avoid IPv6 resolution problems.
  # The "000" fallback is assigned from curl's exit status rather than echoed
  # inside the substitution, so it replaces whatever curl already printed
  # instead of being appended to it.
  code=$(curl -s --max-time 5 -o /dev/null -w '%{http_code}' \
    http://127.0.0.1:5000/v2/ 2>/dev/null) || code="000"
  [[ "$code" == "401" || "$code" == "200" ]]
}

#######################################
# Run a command, pass every line of its combined stdout/stderr to log()
# (prefixed with one space), and return the command's own exit status.
# PIPESTATUS is used so the result does not depend on the pipefail option.
# Arguments: the command and its arguments
# Returns:   exit status of the command (not of the logging loop)
#######################################
_harbor_run_logged() {
  local line
  "$@" 2>&1 | while IFS= read -r line || [[ -n "$line" ]]; do
    log " $line"
  done
  return "${PIPESTATUS[0]}"
}

#######################################
# The four repair phases. Called by repair_harbor after it has taken the lock.
# Globals:   HARBOR_DIR (read)
# Returns:   0 when Harbor is healthy after the repair, 1 otherwise
#######################################
_repair_harbor_phases() {
  log "⚠️ 開始修復 Harbor(4 Phase 策略)..."
  cd "$HARBOR_DIR" || { log "❌ 找不到 Harbor 目錄: $HARBOR_DIR"; return 1; }

  # Phase 1: remove every exited harbor-* container (breaks the backoff deadlock).
  # A failing "docker ps" is tolerated: an empty list just skips the cleanup.
  local exited name
  local -a names=()
  exited=$(docker ps -a --filter name="harbor-" --filter status=exited \
    --format "{{.Names}}" 2>/dev/null || true)
  if [[ -n "$exited" ]]; then
    log "Phase 1: 清除 Exited 容器: $(printf '%s\n' "$exited" | tr '\n' ' ')"
    while IFS= read -r name; do
      names+=("$name")
    done <<<"$exited"
    # Best effort: removal output is logged, a failure does not stop the repair.
    _harbor_run_logged docker rm -f "${names[@]}" || true
  else
    log "Phase 1: 無 Exited 容器"
  fi

  # Phase 2: start only harbor-log (I2: stderr is logged, I4: exit code checked).
  log "Phase 2: 啟動 harbor-log..."
  if ! _harbor_run_logged docker compose up -d harbor-log; then
    log "❌ harbor-log 啟動指令失敗"
    return 1
  fi

  # Phase 3: poll harbor-log's health status, 18 checks with 5 s between them.
  log "Phase 3: 等待 harbor-log healthy(最多 90s)..."
  local i status
  for (( i = 1; i <= 18; i++ )); do
    status=$(docker inspect --format='{{.State.Health.Status}}' harbor-log 2>/dev/null) \
      || status="missing"
    if [[ "$status" == "healthy" ]]; then
      log "✅ harbor-log healthy (${i}x5s)"
      break
    fi
    sleep 5
  done

  # Final check after the loop; also covers the case where the loop ran out.
  status=$(docker inspect --format='{{.State.Health.Status}}' harbor-log 2>/dev/null) \
    || status="unknown"
  if [[ "$status" != "healthy" ]]; then
    log "❌ harbor-log 未 healthy (status=$status),放棄本次修復"
    return 1
  fi

  # Phase 4: start all components (I2: stderr is logged). A failure here is
  # reported but not fatal; the health check below decides the outcome.
  log "Phase 4: 啟動 Harbor 全組件..."
  if ! _harbor_run_logged docker compose up -d; then
    log "⚠️ docker compose up -d 失敗,仍繼續進行健康檢查"
  fi

  # S1: wait 30s so harbor-core can finish initializing (the former 10s was not enough).
  log "等待 30s 讓 harbor-core 完成初始化..."
  sleep 30

  if harbor_is_healthy; then
    log "✅ Harbor 修復成功"
    return 0
  fi
  log "❌ Harbor 修復後仍不健康,需人工介入"
  log " 手動: cd $HARBOR_DIR && docker compose down && docker compose up -d harbor-log && sleep 60 && docker compose up -d"
  return 1
}

#######################################
# Repair Harbor with the 4-phase, harbor-log-first strategy, under an
# exclusive non-blocking flock on the repair lockfile.
# Globals:   LOCKFILE (read), HARBOR_DIR (read); uses file descriptor 9
# Arguments: none
# Returns:   0 when Harbor is healthy after the repair, or when the repair was
#            skipped because the lock is already held;
#            1 when the lockfile cannot be opened, a phase fails, or Harbor is
#            still unhealthy after the repair
# Side effects: changes the working directory to HARBOR_DIR
#######################################
repair_harbor() {
  # I1: the lockfile prevents repairing concurrently with startup-110.sh.
  if ! exec 9>"$LOCKFILE"; then
    log "❌ 無法開啟 lockfile: $LOCKFILE"
    return 1
  fi
  if ! flock -n 9; then
    log "另一修復程序正在進行,跳過本次修復"
    exec 9>&-
    return 0
  fi

  local rc=0
  _repair_harbor_phases || rc=$?
  # Closing the descriptor releases the lock on every exit path.
  exec 9>&-
  return "$rc"
}

log "=== Harbor Watchdog 啟動 (interval=${CHECK_INTERVAL}s) ==="

# I3: each check cycle runs in a subshell guarded by "||", so a failure inside
# one cycle does not terminate the watchdog. The subshell's status is that of
# repair_harbor when a repair ran, so the warning below is also logged when
# repair_harbor returns non-zero.
while true; do
  (
    if ! harbor_is_healthy; then
      # Check again after 5 seconds to avoid reacting to a brief instability.
      sleep 5
      if ! harbor_is_healthy; then
        repair_harbor
      fi
    fi
  ) || log "⚠️ 本次檢查週期發生未預期錯誤,繼續下一週期"

  sleep "$CHECK_INTERVAL"
done