Files
awoooi/scripts/reboot-recovery/awoooi-startup-110.sh
Your Name 9b465ee140
All checks were successful
Code Review / ai-code-review (push) Successful in 11s
ci(runner): drain legacy docker act runner safely
2026-05-21 18:53:45 +08:00

314 lines
14 KiB
Bash
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# Automatic post-reboot recovery script for 192.168.0.110.
# 2026-04-05 ogt: created after the second reboot incident.
# Deployed at:  /usr/local/bin/awoooi-startup-110.sh (on 192.168.0.110)
# systemd unit: /etc/systemd/system/awoooi-startup-110.service
#
# Known problems handled:
# - Docker BoltDB corruption (network/files/local-kv.db, volumes/metadata.db)
# - Old containers that reference Docker networks which no longer exist
#   (they all have to be removed with docker rm -f)
# - Harbor nginx depends on harbor-log (compose up must wait until harbor-log
#   is healthy)

# Note: -e is not enabled, so a failing command does not abort the script.
set -u
set -o pipefail

LOG="/var/log/awoooi-startup-110.log"

# Send stdout and stderr through tee so everything is also appended to $LOG.
exec > >(tee -a "$LOG") 2>&1

# Print all arguments on one line, prefixed with "[YYYY-mm-dd HH:MM:SS] ".
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo "[${stamp}] $*"
}

log "=== 192.168.0.110 啟動序列開始 ==="
# ──────────────────────────────────────────────
# STEP 1: Docker repair (when the BoltDB files are corrupted)
#
# Flow: if the docker unit is not active, try a plain start first. If it is
# still not active, back up and delete the known BoltDB files, delete the
# buildkit *.db caches, start docker again, and exit 1 when that also fails.
# ──────────────────────────────────────────────
log "[1/5] 檢查 Docker..."

if ! systemctl is-active docker >/dev/null 2>&1; then
  log "Docker 未啟動,嘗試啟動..."
  systemctl start docker || true
  sleep 8
fi

if systemctl is-active docker >/dev/null 2>&1; then
  log "✅ Docker 已 active"
else
  log "Docker 啟動失敗,修復 BoltDB..."

  # Every known BoltDB corruption point.
  NETWORK_DB="/var/lib/docker/network/files/local-kv.db"
  VOLUMES_DB="/var/lib/docker/volumes/metadata.db"

  for BOLT_DB in "$NETWORK_DB" "$VOLUMES_DB"; do
    if [ -f "$BOLT_DB" ]; then
      # Keep a timestamped copy next to the original before deleting it.
      cp "$BOLT_DB" "${BOLT_DB}.bak.$(date +%Y%m%d%H%M%S)"
      rm -f "$BOLT_DB"
      log "清除損壞的 ${BOLT_DB##*/}"
    fi
  done

  # The buildkit cache databases may be corrupted as well.
  find /var/lib/docker/buildkit -name "*.db" -delete 2>/dev/null || true

  systemctl start docker
  sleep 8

  # Explicit if/else instead of `A && B || C`: with the chained form the
  # failure branch (including exit 1) would also run if the success log call
  # itself returned non-zero. Output is silenced like the checks above so the
  # bare "active"/"failed" word does not end up in the log.
  if systemctl is-active docker >/dev/null 2>&1; then
    log "✅ Docker 修復成功"
  else
    log "❌ Docker 修復失敗"
    exit 1
  fi
fi
# ──────────────────────────────────────────────
# STEP 2: Remove orphaned containers (old containers whose network no
#         longer exists)
# ──────────────────────────────────────────────
log "[2/5] 清除孤兒容器..."

# Names of all containers whose status reads "Exited (128)" or "Exited (137)".
STALE=$(
  docker ps -a --format "{{.Names}}\t{{.Status}}" \
    | grep -E 'Exited \((128|137)\)' \
    | awk '{ print $1 }'
)

if [[ -z "$STALE" ]]; then
  log "✅ 無孤兒容器"
else
  # Word splitting is intentional here: it flattens the newline-separated
  # names into one space-separated line for the log message.
  # shellcheck disable=SC2086
  log "發現孤兒容器: $(printf '%s ' $STALE)"
  printf '%s\n' "$STALE" | xargs docker rm -f 2>/dev/null || true
  docker network prune -f 2>/dev/null || true
  log "✅ 孤兒容器清除完成"
fi
# ──────────────────────────────────────────────
# STEP 3: Harbor (strict ordering: harbor-log first)
# 2026-04-05 Claude Code: race-condition fix —
#   Old logic: compose up -d everything → wait for harbor-log healthy →
#              compose up -d everything again
#   Problem:   during the first up, harbor-core/db/portal fail to reach
#              syslog:1514 and exit; restart:always retries until it gives
#              up, and the second compose up has no effect
#   New logic: start only harbor-log → wait until healthy → start the rest
# Chief-architect review I1: a lock file keeps harbor-watchdog from running
# its repair in parallel.
# ──────────────────────────────────────────────
log "[3/5] 啟動 Harbor..."
HARBOR_DIR="/home/wooo/harbor/harbor"
HARBOR_LOCKFILE="/var/lock/harbor-repair.lock"

#######################################
# Bring Harbor up in strict order while holding the repair lock on fd 9.
# Wrapped in a function so that the early exits can use `return`; at the top
# level of an executed script `return` is an error and the code after it
# would keep running without the lock.
# Globals:   HARBOR_DIR, HARBOR_LOCKFILE (read); changes the working directory
# Outputs:   progress messages via log
# Returns:   always 0 — problems are logged and the script carries on
#######################################
start_harbor() {
  local exited_containers health http_code attempt

  if [ ! -f "$HARBOR_DIR/docker-compose.yml" ]; then
    log "⚠️ 找不到 Harbor compose 檔案"
    return 0
  fi
  if ! cd "$HARBOR_DIR"; then
    log "⚠️ 無法進入 Harbor 目錄: $HARBOR_DIR"
    return 0
  fi

  # I1: hold the lock file so harbor-watchdog cannot repair at the same time.
  exec 9>"$HARBOR_LOCKFILE"
  if ! flock -w 30 9; then
    log "⚠️ 無法取得 Harbor lock跳過watchdog 可能正在修復)"
    return 0
  fi

  # Phase 1: remove Harbor containers in Exited state (keeps restart:always
  # from interfering). --filter is used instead of grep/awk: more precise and
  # independent of the column layout.
  exited_containers=$(docker ps -a --filter name="harbor-" --filter status=exited --format "{{.Names}}" 2>/dev/null || true)
  if [ -n "$exited_containers" ]; then
    log "Phase 1: 清除 Exited Harbor 容器: $(echo "$exited_containers" | tr '\n' ' ')"
    echo "$exited_containers" | xargs docker rm -f 2>&1 | tail -3 || true
  else
    log "Phase 1: 無 Exited 容器"
  fi

  # Phase 2: start only harbor-log (the syslog receiver every other Harbor
  # container depends on).
  log "Phase 2: 啟動 harbor-log..."
  docker compose up -d harbor-log 2>&1 | tail -3

  # Phase 3: poll the harbor-log health status, 18 tries x 5 s = 90 s max.
  log "Phase 3: 等待 harbor-log healthy最多 90s..."
  for ((attempt = 1; attempt <= 18; attempt++)); do
    health=$(docker inspect --format='{{.State.Health.Status}}' harbor-log 2>/dev/null || echo "missing")
    if [ "$health" = "healthy" ]; then
      log "✅ harbor-log healthy (${attempt}x5s)"
      break
    fi
    sleep 5
  done

  health=$(docker inspect --format='{{.State.Health.Status}}' harbor-log 2>/dev/null || echo "unknown")
  if [ "$health" != "healthy" ]; then
    log "❌ harbor-log 未 healthy (status=$health),跳過其餘 Harbor 容器"
    log " harbor-watchdog 會在 60s 後自動偵測並修復"
    return 0
  fi

  # Phase 4: only now start the remaining components.
  log "Phase 4: 啟動 Harbor 全組件..."
  docker compose up -d 2>&1 | tail -5
  # S1: give harbor-core 30 s to finish initialising (5 s was not enough).
  sleep 30

  # Harbor answers /v2/ with 401 (authentication required), so curl must not
  # be run with -f here: -f would turn that 401 into a failure.
  # The fallback is assigned outside the command substitution; putting
  # `|| echo "0"` inside it appends "0" to the "000" curl already printed.
  http_code=$(curl -s --max-time 5 -o /dev/null -w "%{http_code}" http://127.0.0.1:5000/v2/ 2>/dev/null) || http_code="0"
  if [ "$http_code" = "401" ] || [ "$http_code" = "200" ]; then
    log "✅ Harbor 就緒 (HTTP $http_code)"
  else
    log "⚠️ Harbor API 未就緒 (HTTP $http_code)harbor-watchdog 會在 60s 後自動修復"
  fi
  return 0
}

start_harbor

# Release the Harbor lock by closing fd 9. If it stayed open, the lock would
# be held until this script exits and would be inherited by every process
# started afterwards, including anything launched in the background.
# Closing a descriptor that was never opened is harmless.
exec 9>&-
# ──────────────────────────────────────────────
# STEP 4: Other services (Gitea, Langfuse, Monitoring)
#
# Each stack is started only when its docker-compose.yml exists and its
# directory can be entered, so `docker compose` never runs in the wrong
# working directory.
# ──────────────────────────────────────────────
log "[4/6] 啟動其他服務Gitea, Langfuse, Monitoring..."

# Succeeds when the Alertmanager health endpoint answers successfully within 5 s.
alertmanager_healthy() {
  curl -sf --max-time 5 http://localhost:9093/-/healthy >/dev/null 2>&1
}

GITEA_DIR="/home/wooo/gitea"
if [ -f "$GITEA_DIR/docker-compose.yml" ] && cd "$GITEA_DIR"; then
  docker compose up -d 2>&1 | tail -3
  log "✅ Gitea 啟動指令已發送"
fi

LANGFUSE_DIR="/home/wooo/langfuse"
if [ -f "$LANGFUSE_DIR/docker-compose.yml" ] && cd "$LANGFUSE_DIR"; then
  docker compose up -d 2>&1 | tail -3
  log "✅ Langfuse 啟動指令已發送"
fi

MONITORING_DIR="/home/wooo/monitoring"
if [ -f "$MONITORING_DIR/docker-compose.yml" ] && cd "$MONITORING_DIR"; then
  docker compose up -d 2>&1 | tail -3
  log "✅ Monitoring (Prometheus/Grafana/Alertmanager) 啟動指令已發送"
  sleep 10

  # Check Alertmanager once, and once more after a further 20 s.
  # Written as if/else rather than `A && B || C`, where the failure message
  # would also be printed if the success log call returned non-zero.
  if alertmanager_healthy; then
    log "✅ Alertmanager healthy"
  else
    log "⚠️ Alertmanager 尚未就緒,等待 20 秒..."
    sleep 20
    if alertmanager_healthy; then
      log "✅ Alertmanager 就緒"
    else
      log "❌ Alertmanager 未就緒,需手動檢查"
    fi
  fi
fi
# ──────────────────────────────────────────────
# STEP 5: SignOz
# Runs `docker compose up -d` in the SignOz deploy directory when its
# docker-compose.yaml is present; otherwise does nothing.
# ──────────────────────────────────────────────
log "[5/6] 啟動 SignOz..."
SIGNOZ_DIR="/home/wooo/signoz/deploy/docker"

if test -f "${SIGNOZ_DIR}/docker-compose.yaml"; then
  cd "${SIGNOZ_DIR}"
  # Only the last five lines of the compose output are kept in the log.
  docker compose up -d 2>&1 \
    | tail -n 5
  log "✅ SignOz 啟動指令已發送"
fi
# ──────────────────────────────────────────────
# STEP 6: Gitea Act Runner (the core of CI/CD)
# 2026-04-05 Claude Code: added — fixes the Gitea runner staying offline
#   (and CD being broken) after a reboot.
# Important: the runner must be started only after the Gitea server is up.
# ──────────────────────────────────────────────
log "[6/6] 啟動 Gitea Act Runner..."
RUNNER_DIR="/home/wooo/act-runner"
RUNNER_SERVICE="gitea-act-runner-host.service"

if [ -x "$RUNNER_DIR/act_runner" ] && [ -f "$RUNNER_DIR/config.yaml" ]; then
  # If the saved .runner registration points at an outdated address, delete
  # it so the runner registers again.
  # The path is passed as an argument rather than pasted into the Python
  # source, so quotes in the path cannot break the snippet.
  # NOTE(review): a read/parse failure also yields an empty OLD_URL and thus
  # deletes the registration — confirm that is the intended behaviour.
  RUNNER_FILE="$RUNNER_DIR/data/.runner"
  if [ -f "$RUNNER_FILE" ]; then
    OLD_URL=$(python3 -c 'import json, sys; print(json.load(open(sys.argv[1])).get("address", ""))' "$RUNNER_FILE" 2>/dev/null || echo "")
    if [ "$OLD_URL" != "http://192.168.0.110:3001" ]; then
      log "⚠️ runner 配置過期 ($OLD_URL),清除重新註冊..."
      rm -f "$RUNNER_FILE" || true
    fi
  fi

  # act_runner defaults to shutdown_timeout=0s, which cancels running CD jobs
  # the moment the daemon restarts; the deployment then actually finishes
  # while the status shown in Gitea is wrong. Force shutdown_timeout to 1h.
  # The new line copies the indentation of the line it replaces (or of the
  # `timeout:` line it is inserted after) so it stays a valid YAML sibling.
  python3 - "$RUNNER_DIR/config.yaml" <<'PY' || true
import sys
from pathlib import Path

KEY = "shutdown_timeout:"
VALUE = "shutdown_timeout: 1h"


def indent_of(line):
    """Return the leading whitespace of line."""
    return line[: len(line) - len(line.lstrip())]


path = Path(sys.argv[1])
original = path.read_text()
lines = original.splitlines()

if any(line.strip().startswith(KEY) for line in lines):
    # Rewrite every existing shutdown_timeout entry in place.
    lines = [
        indent_of(line) + VALUE if line.strip().startswith(KEY) else line
        for line in lines
    ]
else:
    # No entry yet: add one right after the first `timeout:` line that is
    # not the first line of the file.
    for idx, line in enumerate(lines):
        if line.strip().startswith("timeout:") and idx > 0:
            lines.insert(idx + 1, indent_of(line) + VALUE)
            break

updated = "\n".join(lines) + "\n"
if updated != original:
    path.write_text(updated)
PY

  # Prefer the systemd unit; otherwise start the wrapper script directly
  # unless a daemon process is already running.
  # `systemctl cat` fails when the unit file does not exist.
  # NOTE(review): the exit status of `list-unit-files` for a unit that does
  # not exist appears to differ between systemd versions — confirm on the
  # target host.
  if systemctl cat "$RUNNER_SERVICE" >/dev/null 2>&1; then
    systemctl enable --now "$RUNNER_SERVICE" >/dev/null 2>&1 || true
  elif ! pgrep -f "$RUNNER_DIR/act_runner daemon" >/dev/null; then
    nohup "$RUNNER_DIR/run-host-runner.sh" >> "$RUNNER_DIR/host-runner.log" 2>&1 &
  fi

  # The Docker-wrapped runner is disabled so it cannot grab host-label jobs.
  # If a task container is still running when this recovery script is run by
  # hand, send SIGINT so act_runner drains: it stops taking new jobs and
  # waits for the jobs it already has.
  docker update --restart=no gitea-runner >/dev/null 2>&1 || true

  # Capture the container list first: piping `docker ps` straight into
  # `grep -q` under pipefail can report failure (SIGPIPE) despite a match.
  RUNNING_CONTAINERS=$(docker ps --format '{{.Names}}' || true)
  if grep -q '^GITEA-ACTIONS-TASK-' <<<"$RUNNING_CONTAINERS"; then
    log "⚠️ Gitea Actions task container still running; draining docker-wrapped gitea-runner"
    docker kill --signal=SIGINT gitea-runner >/dev/null 2>&1 || true
  else
    docker stop -t 3700 gitea-runner >/dev/null 2>&1 || true
  fi

  sleep 15

  # Check that a host act_runner daemon process exists. This only looks at
  # the process list; it does not prove the runner is connected to Gitea.
  if pgrep -f "$RUNNER_DIR/act_runner daemon" >/dev/null; then
    log "✅ Gitea host act_runner 已啟動"
  else
    log "⚠️ Gitea host act_runner 可能尚未啟動,查看: $RUNNER_DIR/host-runner.log"
  fi
else
  log "⚠️ 找不到 act-runner binary/config: $RUNNER_DIR"
fi
# ──────────────────────────────────────────────
# STEP 7: Sentry (error tracking)
# 2026-04-05 Claude Code: added — fixes Sentry not starting automatically
#   after a reboot.
# Install location: /opt/sentry (installed 2026-03-24)
# DSN: awoooi-web :2, awoooi-api :3
#   (see memory/project_sentry_full_integration.md)
#
# Known post-reboot corruption (2026-04-05 incident record):
# - sentry-postgres:   corrupted WAL → pg_resetwal -f
# - sentry-redis:      corrupted dump.rdb → delete and rebuild (redis is a
#                      session cache, losing it is acceptable)
# - sentry-clickhouse: corrupted system table parts → delete the matching
#                      store/ directory (not automated by this script)
# ──────────────────────────────────────────────
log "[7/7] 啟動 Sentry..."
SENTRY_DIR="/opt/sentry"

if [ -d "$SENTRY_DIR" ] && cd "$SENTRY_DIR"; then
  # Repair the Sentry PostgreSQL WAL when the container is stuck restarting.
  SENTRY_PG_VOL="sentry-postgres"
  PG_RUNNING=$(docker ps --filter name=sentry-self-hosted-postgres-1 --format "{{.Status}}" 2>/dev/null | head -1)
  if [[ "$PG_RUNNING" == *Restarting* ]]; then
    log "⚠️ sentry-postgres 損壞,嘗試 pg_resetwal 修復..."
    # Capture output and exit status separately so that success is only
    # reported when pg_resetwal actually succeeded.
    PG_REPAIR_OUT=$(docker run --rm -u 999 -v "${SENTRY_PG_VOL}:/var/lib/postgresql/data" \
      postgres:14 pg_resetwal -f /var/lib/postgresql/data 2>&1)
    PG_REPAIR_RC=$?
    printf '%s\n' "$PG_REPAIR_OUT" | head -3
    if [ "$PG_REPAIR_RC" -eq 0 ]; then
      log "✅ sentry-postgres WAL 修復完成"
    else
      log "❌ sentry-postgres WAL 修復失敗 (exit $PG_REPAIR_RC)"
    fi
  fi

  # Repair Sentry Redis when the container is stuck restarting: drop dump.rdb.
  REDIS_STATUS=$(docker inspect sentry-self-hosted-redis-1 --format='{{.State.Status}}' 2>/dev/null || echo "missing")
  if [ "$REDIS_STATUS" = "restarting" ]; then
    log "⚠️ sentry-redis 損壞,清除 dump.rdb..."
    if docker run --rm --user root -v sentry-redis:/data alpine \
      sh -c 'rm -f /data/dump.rdb && echo cleared' 2>/dev/null; then
      log "✅ sentry-redis dump.rdb 已清除"
    else
      log "❌ sentry-redis dump.rdb 清除失敗"
    fi
  fi

  docker compose up -d 2>&1 | tail -5
  log "✅ Sentry 啟動指令已發送 (啟動約需 2-3 分鐘)"
  sleep 20

  # Non-blocking verification: Sentry starts slowly, so only one quick probe.
  # curl runs without -f: with -f an HTTP 400 makes curl exit non-zero after
  # it has already printed "400", so a `|| echo "0"` fallback inside the
  # substitution produced "4000" and the accepted 400 case never matched.
  # The fallback is assigned outside the substitution for the same reason.
  HTTP_CODE=$(curl -s --max-time 10 -o /dev/null -w "%{http_code}" http://localhost:9000/ 2>/dev/null) || HTTP_CODE="0"
  case "$HTTP_CODE" in
    200 | 302 | 400)
      log "✅ Sentry 已回應 (HTTP $HTTP_CODE)"
      ;;
    *)
      log "⚠️ Sentry 尚未就緒(正常現象,通常需 2-3 分鐘HTTP=$HTTP_CODE"
      ;;
  esac
else
  log "⚠️ 找不到 Sentry 目錄: $SENTRY_DIR"
fi
# ──────────────────────────────────────────────
# Done: print the service endpoints and exit successfully.
# ──────────────────────────────────────────────
log "=== 192.168.0.110 啟動序列完成 ==="
log "Harbor: http://192.168.0.110:5000"
log "Gitea: http://192.168.0.110:3001"
log "Grafana: http://192.168.0.110:3002"
log "Alertmanager: http://192.168.0.110:9093"
# Point at the host runner: the runner step starts the host act_runner and
# stops the docker-wrapped "gitea-runner" container, so `docker logs
# gitea-runner` no longer shows the active runner. Defaults keep this line
# safe under `set -u` even if the runner variables were never assigned.
log "Gitea Runner: journalctl -u ${RUNNER_SERVICE:-gitea-act-runner-host.service} / ${RUNNER_DIR:-/home/wooo/act-runner}/host-runner.log"
log "Sentry: http://192.168.0.110:9000"
exit 0