#!/bin/bash
# Post-reboot auto-recovery script for 192.168.0.110
# 2026-04-05 ogt: created after the second reboot incident
# Deployed at:  /usr/local/bin/awoooi-startup-110.sh (on 192.168.0.110)
# systemd unit: /etc/systemd/system/awoooi-startup-110.service
#
# Known issues handled:
#   - Docker BoltDB corruption (network/files/local-kv.db, volumes/metadata.db)
#   - Old containers that reference a Docker network which no longer exists
#     (requires `docker rm -f` on all of them)
#   - Harbor nginx depends on harbor-log (wait until harbor-log is healthy
#     before running compose up)

# Abort on unset variables and make pipelines report the first failing stage.
# `-e` is deliberately not set: every recovery step is best-effort and the
# script must keep going when a single service fails to come up.
set -uo pipefail

LOG="/var/log/awoooi-startup-110.log"

# Mirror everything this script prints (stdout and stderr) into $LOG while
# still writing it to the original stdout.
exec > >(tee -a "$LOG") 2>&1

# Print a timestamped log line.
# Arguments: the message words; they are joined with single spaces.
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout.
log() {
  # %(...)T with argument -1 formats the current time using the printf
  # builtin, so no `date` process is forked per log line.
  printf '[%(%Y-%m-%d %H:%M:%S)T] %s\n' -1 "$*"
}

log "=== 192.168.0.110 啟動序列開始 ==="

# ──────────────────────────────────────────────
# STEP 1: Docker repair (when the BoltDB files are corrupted)
#
# Flow: make sure the docker unit is active; if it still is not active after
# a start attempt, move the known-corruptible BoltDB files aside, drop the
# buildkit *.db files and start Docker again. The script exits with status 1
# when Docker cannot be brought up, because every later step needs it.
# ──────────────────────────────────────────────
log "[1/7] 檢查 Docker..."

NETWORK_DB="/var/lib/docker/network/files/local-kv.db"
VOLUMES_DB="/var/lib/docker/volumes/metadata.db"
# Maximum number of seconds to wait for the docker unit after a start request.
DOCKER_START_WAIT=8

# Wait until the docker unit is active.
# Arguments: $1 - maximum number of seconds to wait
# Returns:   0 as soon as the unit is active, 1 when the time is used up
wait_for_docker() {
  local remaining=$1
  while ! systemctl is-active --quiet docker; do
    (( remaining > 0 )) || return 1
    sleep 1
    remaining=$(( remaining - 1 ))
  done
  return 0
}

# Move a suspected-corrupt BoltDB file out of Docker's way.
# The file is renamed to <file>.bak.<timestamp>; a rename needs no extra disk
# space and cannot leave a half-written backup. If the rename fails the file
# is deleted anyway, since Docker cannot start while it is in place.
# Arguments: $1 - path of the database file
quarantine_boltdb() {
  local db=$1
  [ -f "$db" ] || return 0
  if ! mv -- "$db" "${db}.bak.$(date +%Y%m%d%H%M%S)"; then
    log "⚠️ 無法備份 ${db##*/},直接刪除"
    rm -f -- "$db"
  fi
  log "清除損壞的 ${db##*/}"
}

if ! systemctl is-active --quiet docker; then
  log "Docker 未啟動,嘗試啟動..."
  systemctl start docker || true
  wait_for_docker "$DOCKER_START_WAIT" || true
fi

if systemctl is-active --quiet docker; then
  log "✅ Docker 已 active"
else
  log "Docker 啟動失敗,修復 BoltDB..."

  # Stop the unit first so a pending automatic restart of dockerd cannot
  # open the database files while they are being moved away.
  systemctl stop docker >/dev/null 2>&1 || true

  # Clear every known BoltDB corruption point.
  quarantine_boltdb "$NETWORK_DB"
  quarantine_boltdb "$VOLUMES_DB"

  # The buildkit cache databases can be corrupted as well.
  find /var/lib/docker/buildkit -name "*.db" -delete 2>/dev/null || true

  # Clear a possible failed / start-limit state left by the earlier attempts
  # so that the start request below is not refused.
  systemctl reset-failed docker >/dev/null 2>&1 || true
  systemctl start docker || true

  if wait_for_docker "$DOCKER_START_WAIT"; then
    log "✅ Docker 修復成功"
  else
    log "❌ Docker 修復失敗"
    exit 1
  fi
fi

# ──────────────────────────────────────────────
# STEP 2: Remove orphaned containers (their Docker network no longer exists)
#
# Containers that exited with code 128 or 137 are force-removed and unused
# networks are pruned afterwards.
# ──────────────────────────────────────────────
log "[2/7] 清除孤兒容器..."

# Let Docker do the selection with --filter instead of grepping the
# human-readable status column: repeated `exited=` filters are OR-ed and the
# `status=exited` filter is AND-ed with them.
STALE=$(docker ps -a \
  --filter status=exited \
  --filter exited=128 \
  --filter exited=137 \
  --format '{{.Names}}' 2>/dev/null || true)

if [ -n "$STALE" ]; then
  # One container name per line -> array, so the names are never re-split or
  # glob-expanded by the shell.
  mapfile -t STALE_NAMES <<<"$STALE"
  log "發現孤兒容器: ${STALE_NAMES[*]}"

  if docker rm -f "${STALE_NAMES[@]}" >/dev/null 2>&1; then
    log "✅ 孤兒容器清除完成"
  else
    # Best-effort: keep going, but do not report a success that did not happen.
    log "⚠️ 部分孤兒容器移除失敗"
  fi

  docker network prune -f 2>/dev/null || true
else
  log "✅ 無孤兒容器"
fi

# ──────────────────────────────────────────────
# STEP 3: Harbor (strict ordering: harbor-log first)
# 2026-04-05 Claude Code: race-condition fix —
#   Old logic: compose up -d everything → wait for harbor-log healthy
#              → compose up -d everything
#   Problem:   during the first `up`, harbor-core/db/portal tried to reach
#              syslog:1514, failed and exited; restart:always retried until
#              it gave up, and the second compose up had no effect.
#   New logic: start only harbor-log → wait until healthy → start the rest.
# Chief-architect review I1: a lock file prevents harbor-watchdog from
# repairing Harbor in parallel.
#
# The lock is taken on fd 9 of a subshell, so it is released as soon as the
# Harbor step ends and is never inherited by processes started later in this
# script. The subshell also keeps the `cd` into the Harbor directory local.
# ──────────────────────────────────────────────
log "[3/7] 啟動 Harbor..."
HARBOR_DIR="/home/wooo/harbor/harbor"
HARBOR_LOCKFILE="/var/lock/harbor-repair.lock"

# Print the health status of the harbor-log container.
# Arguments: $1 - text to print when the container cannot be inspected
harbor_log_health() {
  local health
  health=$(docker inspect --format='{{.State.Health.Status}}' harbor-log 2>/dev/null) \
    || health=$1
  printf '%s\n' "$health"
}

# Bring Harbor up in four phases. Must be called with the repair lock held.
# Returns: 1 when the Harbor directory cannot be entered, 0 otherwise
#          (an unhealthy Harbor is logged and left to harbor-watchdog).
start_harbor() {
  local exited health attempt http

  if ! cd "$HARBOR_DIR"; then
    log "⚠️ 無法進入 Harbor 目錄: $HARBOR_DIR"
    return 1
  fi

  # Phase 1: remove Harbor containers in Exited state so restart:always does
  # not interfere. --filter is used instead of grep/awk because it does not
  # depend on the column layout of `docker ps`.
  exited=$(docker ps -a --filter name="harbor-" --filter status=exited \
    --format "{{.Names}}" 2>/dev/null || true)
  if [ -n "$exited" ]; then
    log "Phase 1: 清除 Exited Harbor 容器: ${exited//$'\n'/ }"
    printf '%s\n' "$exited" | xargs -r docker rm -f 2>&1 | tail -3
  else
    log "Phase 1: 無 Exited 容器"
  fi

  # Phase 2: start only harbor-log (the syslog receiver every other Harbor
  # container depends on).
  # NOTE(review): the stock Harbor compose file names this service `log`
  # (container_name: harbor-log). Confirm that `harbor-log` is a valid
  # service name in $HARBOR_DIR/docker-compose.yml.
  log "Phase 2: 啟動 harbor-log..."
  docker compose up -d harbor-log 2>&1 | tail -3

  # Phase 3: wait for harbor-log to become healthy (18 checks, 5 s apart).
  log "Phase 3: 等待 harbor-log healthy(最多 90s)..."
  for (( attempt = 1; attempt <= 18; attempt++ )); do
    health=$(harbor_log_health "missing")
    if [ "$health" = "healthy" ]; then
      log "✅ harbor-log healthy (${attempt}x5s)"
      break
    fi
    sleep 5
  done

  # One last look after the final sleep.
  health=$(harbor_log_health "unknown")
  if [ "$health" != "healthy" ]; then
    log "❌ harbor-log 未 healthy (status=$health),跳過其餘 Harbor 容器"
    log " harbor-watchdog 會在 60s 後自動偵測並修復"
    return 0
  fi

  # Phase 4: harbor-log is ready, start the remaining components.
  log "Phase 4: 啟動 Harbor 全組件..."
  docker compose up -d 2>&1 | tail -5

  # S1: give harbor-core 30 s to finish initialising (5 s was not enough).
  sleep 30

  # Harbor answers /v2/ with 401 (authentication required), so curl must not
  # use -f here: that would turn the expected 401 into a failure. When curl
  # itself fails (no connection), report a plain "0".
  http=$(curl -s --max-time 5 -o /dev/null -w "%{http_code}" \
    http://127.0.0.1:5000/v2/ 2>/dev/null) || http="0"
  if [ "$http" = "401" ] || [ "$http" = "200" ]; then
    log "✅ Harbor 就緒 (HTTP $http)"
  else
    log "⚠️ Harbor API 未就緒 (HTTP $http),harbor-watchdog 會在 60s 後自動修復"
  fi
  return 0
}

if [ -f "$HARBOR_DIR/docker-compose.yml" ]; then
  (
    # I1: hold the lock so harbor-watchdog cannot repair at the same time.
    # If it cannot be obtained within 30 s, skip Harbor entirely.
    if flock -w 30 9; then
      start_harbor
    else
      log "⚠️ 無法取得 Harbor lock,跳過(watchdog 可能正在修復)"
    fi
  ) 9>"$HARBOR_LOCKFILE"
else
  log "⚠️ 找不到 Harbor compose 檔案"
fi

# ──────────────────────────────────────────────
# STEP 4: Other services (Gitea, Langfuse, Monitoring)
# ──────────────────────────────────────────────
log "[4/7] 啟動其他服務(Gitea, Langfuse, Monitoring)..."

# Run `docker compose up -d` for one compose project and log the outcome.
# Arguments: $1 - project directory (must contain docker-compose.yml)
#            $2 - label used in the log messages
# Returns:   0 when the project exists and a start was attempted (even if
#            compose reported an error), 1 when the project was skipped.
start_compose_project() {
  local dir=$1 label=$2

  # A project that is not installed on this host is skipped silently.
  [ -f "$dir/docker-compose.yml" ] || return 1

  # Without this check a failed cd would run compose in whatever directory
  # the previous step left behind, i.e. against the wrong project.
  if ! cd "$dir"; then
    log "⚠️ 無法進入目錄: $dir"
    return 1
  fi

  # pipefail is enabled for this script, so the pipeline status reflects
  # docker compose and not just tail.
  if docker compose up -d 2>&1 | tail -3; then
    log "✅ $label 啟動指令已發送"
  else
    log "❌ $label docker compose up 失敗"
  fi
  return 0
}

# Succeeds when Alertmanager answers its health endpoint within 5 seconds.
alertmanager_healthy() {
  curl -sf --max-time 5 http://localhost:9093/-/healthy >/dev/null 2>&1
}

GITEA_DIR="/home/wooo/gitea"
start_compose_project "$GITEA_DIR" "Gitea"

LANGFUSE_DIR="/home/wooo/langfuse"
start_compose_project "$LANGFUSE_DIR" "Langfuse"

MONITORING_DIR="/home/wooo/monitoring"
if start_compose_project "$MONITORING_DIR" "Monitoring (Prometheus/Grafana/Alertmanager)"; then
  sleep 10
  # Verify Alertmanager came up; allow one extra 20 s grace period.
  if alertmanager_healthy; then
    log "✅ Alertmanager healthy"
  else
    log "⚠️ Alertmanager 尚未就緒,等待 20 秒..."
    sleep 20
    if alertmanager_healthy; then
      log "✅ Alertmanager 就緒"
    else
      log "❌ Alertmanager 未就緒,需手動檢查"
    fi
  fi
fi

# ──────────────────────────────────────────────
# STEP 5: SignOz
# ──────────────────────────────────────────────
log "[5/7] 啟動 SignOz..."
SIGNOZ_DIR="/home/wooo/signoz/deploy/docker"

# Note: this project uses the .yaml extension for its compose file.
if [ -f "$SIGNOZ_DIR/docker-compose.yaml" ]; then
  # Check the cd: a failure would otherwise run compose against the project
  # of whatever directory the previous step left behind.
  if cd "$SIGNOZ_DIR"; then
    # pipefail is enabled for this script, so the pipeline status reflects
    # docker compose and not just tail.
    if docker compose up -d 2>&1 | tail -5; then
      log "✅ SignOz 啟動指令已發送"
    else
      log "❌ SignOz docker compose up 失敗"
    fi
  else
    log "⚠️ 無法進入 SignOz 目錄: $SIGNOZ_DIR"
  fi
fi

# ──────────────────────────────────────────────
# STEP 6: Gitea Act Runner (core of CI/CD)
# 2026-04-05 Claude Code: added — fixes the Gitea runner staying offline
#                         (and CD being dead) after a reboot.
# Important: the runner may only be started after the Gitea server is up.
# ──────────────────────────────────────────────
log "[6/7] 啟動 Gitea Act Runner..."
RUNNER_DIR="/home/wooo/act-runner"
RUNNER_SERVICE="gitea-act-runner-host.service"
# Address the runner registration is expected to point at.
RUNNER_EXPECTED_URL="http://192.168.0.110:3001"

if [ -x "$RUNNER_DIR/act_runner" ] && [ -f "$RUNNER_DIR/config.yaml" ]; then
  # If the stored .runner registration points at an outdated address, delete
  # it so the runner registers again.
  RUNNER_FILE="$RUNNER_DIR/data/.runner"
  if [ -f "$RUNNER_FILE" ]; then
    if command -v python3 >/dev/null 2>&1; then
      # The path is passed as an argument rather than pasted into the Python
      # source. An unreadable/invalid file yields an empty address and is
      # therefore treated as outdated.
      OLD_URL=$(python3 -c \
        'import json, sys; print(json.load(open(sys.argv[1])).get("address", ""))' \
        "$RUNNER_FILE" 2>/dev/null || echo "")
      if [ "$OLD_URL" != "$RUNNER_EXPECTED_URL" ]; then
        log "⚠️ runner 配置過期 ($OLD_URL),清除重新註冊..."
        rm -f -- "$RUNNER_FILE" || true
      fi
    else
      # Without python3 the address cannot be read; keep the registration
      # instead of throwing away one that may be perfectly valid.
      log "⚠️ 找不到 python3,略過 runner 配置檢查"
    fi
  fi

  # act_runner defaults to shutdown_timeout=0s, which cancels a running CD
  # job the moment the daemon restarts: the deployment actually finishes but
  # the status shown in Gitea is wrong. Force shutdown_timeout to 1h.
  python3 - "$RUNNER_DIR/config.yaml" <<'PY' || log "⚠️ 無法更新 act_runner shutdown_timeout"
import sys
from pathlib import Path

KEY = "shutdown_timeout:"
VALUE = "1h"

path = Path(sys.argv[1])
original = path.read_text()
lines = original.splitlines()


def indent_of(line):
    """Return the leading whitespace of a line."""
    return line[: len(line) - len(line.lstrip())]


if any(line.strip().startswith(KEY) for line in lines):
    # Key already present: replace its value, keeping the line's own indent.
    lines = [
        f"{indent_of(line)}{KEY} {VALUE}" if line.strip().startswith(KEY) else line
        for line in lines
    ]
else:
    # Key absent: add it right after the first `timeout:` line (never before
    # line 2), using that line's indent so both keys sit in the same mapping.
    for idx, line in enumerate(lines):
        if line.strip().startswith("timeout:") and idx > 0:
            lines.insert(idx + 1, f"{indent_of(line)}{KEY} {VALUE}")
            break

updated = "\n".join(lines) + "\n"
# Leave the file untouched when nothing changed.
if updated != original:
    path.write_text(updated)
PY

  # Prefer the systemd unit when it is installed. The listing output is
  # checked rather than the exit status, which is not reliable for "no such
  # unit" across systemd versions.
  if systemctl list-unit-files --no-legend --no-pager "$RUNNER_SERVICE" 2>/dev/null \
      | grep -F -- "$RUNNER_SERVICE" >/dev/null; then
    if ! systemctl enable --now "$RUNNER_SERVICE" >/dev/null 2>&1; then
      log "⚠️ 無法啟動 $RUNNER_SERVICE"
    fi
  elif ! pgrep -f "$RUNNER_DIR/act_runner daemon" >/dev/null; then
    # Fallback: start the host runner in the background. fd 9 is the Harbor
    # repair lock descriptor this script may have opened earlier; close it
    # for the child so the long-lived runner cannot keep that lock held.
    nohup "$RUNNER_DIR/run-host-runner.sh" >> "$RUNNER_DIR/host-runner.log" 2>&1 9>&- &
  fi

  # The Docker-wrapped runner is retired; keep it from grabbing host-label
  # jobs. If this recovery script is run by hand while a task container is
  # still active, send SIGINT so act_runner drains: it stops taking new jobs
  # and lets the current one finish.
  docker update --restart=no gitea-runner >/dev/null 2>&1 || true

  # Capture the list first and match on the variable: with pipefail,
  # `docker ps | grep -q` can report failure even on a match when grep exits
  # early and docker receives SIGPIPE.
  RUNNING_CONTAINERS=$(docker ps --format '{{.Names}}' 2>/dev/null || true)
  if grep -q '^GITEA-ACTIONS-TASK-' <<<"$RUNNING_CONTAINERS"; then
    log "⚠️ Gitea Actions task container still running; draining docker-wrapped gitea-runner"
    docker kill --signal=SIGINT gitea-runner >/dev/null 2>&1 || true
  else
    # Waits up to 3700 s for the container to stop before Docker kills it.
    docker stop -t 3700 gitea-runner >/dev/null 2>&1 || true
  fi

  sleep 15

  # Verify the host runner is running, either as the systemd unit or as a
  # plain daemon process.
  if systemctl is-active --quiet "$RUNNER_SERVICE" 2>/dev/null \
      || pgrep -f "$RUNNER_DIR/act_runner daemon" >/dev/null; then
    log "✅ Gitea host act_runner 已啟動"
  else
    log "⚠️ Gitea host act_runner 可能尚未啟動,查看: $RUNNER_DIR/host-runner.log"
  fi
else
  log "⚠️ 找不到 act-runner binary/config: $RUNNER_DIR"
fi

# ──────────────────────────────────────────────
# STEP 7: Sentry (error tracking)
# 2026-04-05 Claude Code: added — Sentry did not start by itself after reboot
# Install location: /opt/sentry (installed 2026-03-24)
# DSN: awoooi-web :2, awoooi-api :3 (see memory/project_sentry_full_integration.md)
#
# Known post-reboot corruption (incident record 2026-04-05):
#   - sentry-postgres:   corrupted WAL → pg_resetwal -f
#   - sentry-redis:      corrupted dump.rdb → delete and rebuild
#                        (redis is a session cache, losing it is acceptable)
#   - sentry-clickhouse: corrupted system table parts → delete the matching
#                        store/ directory (not automated by this script)
# ──────────────────────────────────────────────
log "[7/7] 啟動 Sentry..."
SENTRY_DIR="/opt/sentry"
SENTRY_PG_VOL="sentry-postgres"
SENTRY_PG_CONTAINER="sentry-self-hosted-postgres-1"
SENTRY_REDIS_CONTAINER="sentry-self-hosted-redis-1"

# Print the Docker state of a container ("running", "restarting", ...), or
# "missing" when it cannot be inspected.
# Arguments: $1 - exact container name
sentry_container_state() {
  local state
  state=$(docker inspect --format='{{.State.Status}}' "$1" 2>/dev/null) \
    || state="missing"
  printf '%s\n' "$state"
}

if [ ! -d "$SENTRY_DIR" ]; then
  log "⚠️ 找不到 Sentry 目錄: $SENTRY_DIR"
elif ! cd "$SENTRY_DIR"; then
  log "⚠️ 無法進入 Sentry 目錄: $SENTRY_DIR"
else
  # Repair the Sentry PostgreSQL WAL when the container is restart-looping.
  if [ "$(sentry_container_state "$SENTRY_PG_CONTAINER")" = "restarting" ]; then
    log "⚠️ sentry-postgres 損壞,嘗試 pg_resetwal 修復..."
    # Stop the restart loop first so the server is not started on the data
    # directory while pg_resetwal is working on it.
    docker stop "$SENTRY_PG_CONTAINER" >/dev/null 2>&1 || true
    # NOTE(review): the image tag must match the PostgreSQL major version of
    # the data directory — confirm postgres:14 is still correct.
    if PG_REPAIR_OUT=$(docker run --rm -u 999 \
        -v "${SENTRY_PG_VOL}:/var/lib/postgresql/data" \
        postgres:14 pg_resetwal -f /var/lib/postgresql/data 2>&1); then
      printf '%s\n' "$PG_REPAIR_OUT" | head -n 3
      log "✅ sentry-postgres WAL 修復完成"
    else
      printf '%s\n' "$PG_REPAIR_OUT" | head -n 3
      log "❌ sentry-postgres WAL 修復失敗,需手動檢查"
    fi
  fi

  # Repair Sentry Redis when the container is restart-looping (dump.rdb).
  if [ "$(sentry_container_state "$SENTRY_REDIS_CONTAINER")" = "restarting" ]; then
    log "⚠️ sentry-redis 損壞,清除 dump.rdb..."
    docker stop "$SENTRY_REDIS_CONTAINER" >/dev/null 2>&1 || true
    if docker run --rm --user root -v sentry-redis:/data alpine \
        sh -c 'rm -f /data/dump.rdb && echo cleared' 2>/dev/null; then
      log "✅ sentry-redis dump.rdb 已清除"
    else
      log "❌ sentry-redis dump.rdb 清除失敗,需手動檢查"
    fi
  fi

  docker compose up -d 2>&1 | tail -5
  log "✅ Sentry 啟動指令已發送 (啟動約需 2-3 分鐘)"
  sleep 20

  # Non-blocking verification: Sentry starts slowly, so only one quick probe.
  # curl runs without -f on purpose: with -f an HTTP 400 makes curl exit
  # non-zero, which previously garbled the captured status code. When curl
  # itself fails (no connection), report a plain "0".
  HTTP_CODE=$(curl -s --max-time 10 -o /dev/null -w "%{http_code}" \
    http://localhost:9000/ 2>/dev/null) || HTTP_CODE="0"
  case "$HTTP_CODE" in
    200 | 302 | 400)
      log "✅ Sentry 已回應 (HTTP $HTTP_CODE)"
      ;;
    *)
      log "⚠️ Sentry 尚未就緒(正常現象,通常需 2-3 分鐘,HTTP=$HTTP_CODE)"
      ;;
  esac
fi

# ──────────────────────────────────────────────
# Done: log the service endpoints for the operator and exit successfully.
# Earlier steps are best-effort, so reaching this point does not mean every
# service is healthy; check the log lines above.
# ──────────────────────────────────────────────
log "=== 192.168.0.110 啟動序列完成 ==="
log "Harbor: http://192.168.0.110:5000"
log "Gitea: http://192.168.0.110:3001"
log "Grafana: http://192.168.0.110:3002"
log "Alertmanager: http://192.168.0.110:9093"
# The Docker-wrapped gitea-runner container is stopped by the runner step, so
# point at the host runner's logs instead of `docker logs gitea-runner`.
log "Gitea Runner: journalctl -u gitea-act-runner-host.service (或 /home/wooo/act-runner/host-runner.log)"
log "Sentry: http://192.168.0.110:9000"

exit 0