MinIO 重開機後不會自動啟動,導致 Velero BackupStorageLocation 變成 Unavailable。

修正:將 MinIO 的 `docker compose up -d` 加入 STEP 7(Docker Compose 服務區段)。

⚠️ 統帥需要手動執行以下指令,讓 188 上的 startup script 生效:

    sudo cp /tmp/awoooi-startup.sh /usr/local/bin/awoooi-startup.sh
    sudo chmod +x /usr/local/bin/awoooi-startup.sh

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
192 lines
8.3 KiB
Bash
192 lines
8.3 KiB
Bash
#!/bin/bash
# AWOOOI post-reboot automatic recovery script.
# 2026-04-04 ogt: written after a real incident; addresses PostgreSQL WAL
#   corruption, Docker BoltDB corruption and slow K3s Kine queries.
# Deployed at:  /usr/local/bin/awoooi-startup.sh (on 192.168.0.188)
# systemd unit: /etc/systemd/system/awoooi-startup.service

# Treat unset variables as errors and make a pipeline fail when any stage
# fails. errexit (-e) is intentionally not enabled here.
set -u
set -o pipefail

# From this point on, everything written to stdout or stderr is shown as
# before and additionally appended to the log file.
LOG="/var/log/awoooi-startup.log"
exec > >(tee -a "${LOG}") 2>&1

# Print all arguments as a single line prefixed with a
# "[YYYY-MM-DD HH:MM:SS]" timestamp taken from the current local time.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
|
||
|
||
log "=== AWOOOI 啟動序列開始 ==="

# ──────────────────────────────────────────────
# STEP 1: containerd repair (if corrupted)
# ──────────────────────────────────────────────
# Flow: if containerd is not active, try a plain start. If it is still not
# active afterwards, the BoltDB metadata file is assumed to be corrupted: it
# is moved aside under a timestamped name and containerd is started again.
# containerd is treated as critical; the script exits 1 when it cannot be
# brought up.
log "[1/7] 檢查 containerd..."

if ! systemctl is-active containerd >/dev/null 2>&1; then
  log "containerd 未啟動,嘗試啟動..."
  # A failed start is detected by the is-active check below.
  systemctl start containerd || true
  sleep 5
fi

if systemctl is-active containerd >/dev/null 2>&1; then
  log "✅ containerd 已 active"
else
  log "containerd 啟動失敗,檢查 BoltDB 損壞..."
  BOLT_DB="/var/lib/containerd/io.containerd.metadata.v1.bolt/meta.db"
  if [ -f "$BOLT_DB" ]; then
    log "備份並刪除損壞的 meta.db..."
    # Rename instead of "cp then rm -f": the old sequence deleted the file
    # even when the copy had failed (e.g. disk full), losing the only copy.
    # A rename within the same directory needs no extra space and either
    # keeps the original or produces the backup.
    if ! mv -- "$BOLT_DB" "${BOLT_DB}.bak.$(date +%Y%m%d%H%M%S)"; then
      log "⚠️ meta.db 備份失敗,保留原檔"
    fi
  fi
  systemctl start containerd || true
  sleep 5
  if systemctl is-active containerd >/dev/null 2>&1; then
    log "✅ containerd 修復成功"
  else
    log "❌ containerd 修復失敗"
    exit 1
  fi
fi

# ──────────────────────────────────────────────
# STEP 2: Docker repair (if corrupted)
# ──────────────────────────────────────────────
# Flow: if docker is not active, try a plain start. If it is still not active,
# the network BoltDB file is assumed to be corrupted: it is moved aside under
# a timestamped name, containerd is restarted and docker is started again.
# Docker is treated as critical; the script exits 1 when it cannot be brought
# up.
log "[2/7] 檢查 Docker..."

if ! systemctl is-active docker >/dev/null 2>&1; then
  log "Docker 未啟動,嘗試啟動..."
  # A failed start is detected by the is-active check below.
  systemctl start docker || true
  sleep 8
fi

if systemctl is-active docker >/dev/null 2>&1; then
  log "✅ Docker 已 active"
else
  log "Docker 啟動失敗,修復 network BoltDB..."
  NETWORK_DB="/var/lib/docker/network/files/local-kv.db"
  if [ -f "$NETWORK_DB" ]; then
    log "備份並刪除損壞的 local-kv.db..."
    # Rename instead of "cp then rm -f": the old sequence deleted the file
    # even when the copy had failed, leaving no backup at all.
    if ! mv -- "$NETWORK_DB" "${NETWORK_DB}.bak.$(date +%Y%m%d%H%M%S)"; then
      log "⚠️ local-kv.db 備份失敗,保留原檔"
    fi
  fi
  systemctl restart containerd
  sleep 5
  systemctl start docker || true
  sleep 8
  if systemctl is-active docker >/dev/null 2>&1; then
    log "✅ Docker 修復成功"
  else
    log "❌ Docker 修復失敗"
    exit 1
  fi
fi

# ──────────────────────────────────────────────
# STEP 3: PostgreSQL repair (if corrupted)
# ──────────────────────────────────────────────
# Flow: if postgresql@14-main is not active, try a plain start. If it is still
# not active and the unit's recent journal contains "could not locate a valid
# checkpoint", reset the WAL with pg_resetwal and start again. PostgreSQL is
# treated as critical; the script exits 1 when it cannot be brought up.
# NOTE(review): pg_resetwal discards WAL; per the PostgreSQL documentation the
# database may contain inconsistent data afterwards. Running it unattended is
# the file's existing policy and is kept as is.
log "[3/7] 檢查 PostgreSQL..."

if ! systemctl is-active postgresql@14-main >/dev/null 2>&1; then
  log "PostgreSQL 未啟動,嘗試啟動..."
  # A failed start is detected by the is-active check below.
  systemctl start postgresql@14-main || true
  sleep 8
fi

if systemctl is-active postgresql@14-main >/dev/null 2>&1; then
  log "✅ PostgreSQL 已 active"
else
  log "PostgreSQL 啟動失敗,檢查是否 WAL 損壞..."
  # Capture the journal first and grep the captured text. Piping journalctl
  # straight into "grep -q" under "set -o pipefail" can report failure even on
  # a match, because grep exits at the first hit and the writer may then die
  # from SIGPIPE.
  PG_JOURNAL="$(journalctl -u postgresql@14-main -n 20 2>&1 || true)"
  if grep -q "could not locate a valid checkpoint" <<<"$PG_JOURNAL"; then
    log "⚠️ WAL 損壞!執行 pg_resetwal..."
    # pg_resetwal refuses to run as root ("cannot be executed by root"), so it
    # must run as the postgres OS user, like the psql calls in this script.
    if sudo -u postgres /usr/lib/postgresql/14/bin/pg_resetwal -f /var/lib/postgresql/14/main; then
      log "WAL 重置完成,重啟 PostgreSQL..."
      systemctl start postgresql@14-main || true
      sleep 8
    else
      log "❌ pg_resetwal 執行失敗"
    fi
  fi
  if systemctl is-active postgresql@14-main >/dev/null 2>&1; then
    log "✅ PostgreSQL 修復成功"
  else
    log "❌ PostgreSQL 修復失敗"
    exit 1
  fi
fi

# Wait until PostgreSQL accepts connections: up to 30 silent probes, pausing
# 2 seconds after each failed one, followed by one final probe whose output
# is shown. The script exits 1 when that final probe fails.
log "等待 PostgreSQL 就緒..."
for ((attempt = 1; attempt <= 30; attempt++)); do
  if pg_isready -h localhost -p 5432 >/dev/null 2>&1; then
    break
  fi
  sleep 2
done

if pg_isready -h localhost -p 5432; then
  log "✅ PostgreSQL accepting connections"
else
  log "❌ PostgreSQL 無法接受連線"
  exit 1
fi

# kine table maintenance (for stale connections, or right after a WAL reset).
# Both statements are best effort: psql errors are discarded and never abort
# the script.
log "維護 k3s_datastore kine 表..."

# Terminates every other backend on k3s_datastore that is in state 'active'
# and whose current query started more than 5 minutes ago.
STALE_BACKEND_SQL="
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE datname='k3s_datastore' AND pid!=pg_backend_pid() AND state='active'
AND query_start < now() - interval '5 minutes';
"
sudo -u postgres psql -d k3s_datastore -c "$STALE_BACKEND_SQL" 2>/dev/null || true

if sudo -u postgres psql -d k3s_datastore -c "VACUUM ANALYZE kine;" 2>/dev/null; then
  log "✅ kine VACUUM ANALYZE 完成"
else
  log "⚠️ kine VACUUM 跳過(可能無法連線)"
fi

# ──────────────────────────────────────────────
# STEP 4: Redis
# ──────────────────────────────────────────────
# Start redis-server, give it 3 seconds, then probe it with PING. A missing
# PONG only produces a warning; the script continues either way.
log "[4/7] 啟動 Redis..."
systemctl start redis-server || true
sleep 3

if redis-cli ping 2>/dev/null | grep -q PONG; then
  log "✅ Redis UP"
else
  log "⚠️ Redis 可能未就緒"
fi

# ──────────────────────────────────────────────
# STEP 5: Ollama
# ──────────────────────────────────────────────
log "[5/7] 啟動 Ollama..."
# Ollama needs time to load its models, so it is not verified right away.
systemctl start ollama || :

# ──────────────────────────────────────────────
# STEP 6: Nginx
# ──────────────────────────────────────────────
# Start nginx and report whether the unit is active; an inactive unit only
# produces a warning.
log "[6/7] 啟動 Nginx..."
systemctl start nginx || :

if systemctl is-active nginx >/dev/null 2>&1; then
  log "✅ Nginx UP"
else
  log "⚠️ Nginx 未就緒"
fi

# ──────────────────────────────────────────────
# STEP 7: Docker Compose services
# ──────────────────────────────────────────────

#######################################
# Bring up one Docker Compose stack with "docker compose up -d" and log the
# outcome. Only the last 3 lines of compose output are shown.
# The compose command runs in a subshell, so the caller's working directory
# is left untouched and a failed "cd" can never make compose run in some
# other directory. The compose exit status is read from PIPESTATUS, so the
# result does not depend on "set -o pipefail" being active.
# Arguments:
#   $1 - label used in log messages
#   $2 - directory containing the compose file
#   $3 - compose file name expected inside $2
#   $4 - optional text appended to the label in the "starting" message
# Outputs: log lines and the tail of the compose output on stdout
# Returns: 0 in all cases (a stack that fails to start is only a warning)
#######################################
start_compose_stack() {
  local label="$1"
  local dir="$2"
  local compose_file="$3"
  local note="${4:-}"
  local rc

  if [ ! -f "$dir/$compose_file" ]; then
    log "⚠️ 找不到 $label compose 檔案"
    return 0
  fi

  log "啟動 ${label}${note}..."
  (
    cd "$dir" || exit 1
    docker compose up -d 2>&1 | tail -3
    exit "${PIPESTATUS[0]}"
  )
  rc=$?

  if [ "$rc" -eq 0 ]; then
    log "✅ $label 啟動指令已發送"
  else
    # Previously the success line was logged even when compose had failed.
    log "⚠️ $label 啟動失敗"
  fi
  return 0
}

log "[7/7] 啟動 Docker Compose 服務..."

# SignOz
SIGNOZ_DIR="/home/ollama/signoz/deploy/docker"
start_compose_stack "SignOz" "$SIGNOZ_DIR" "docker-compose.yaml"

# MinIO (Velero backup storage)
# 2026-04-05 Claude Code: added MinIO to fix the Velero BSL Unavailable problem
MINIO_DIR="/home/ollama/minio"
start_compose_stack "MinIO" "$MINIO_DIR" "docker-compose.yml" " (Velero 備份存儲)"

# ClawBot (depends on aiops-network)
# Make sure aiops-network exists (after a reboot the external network has to
# be created by hand).
# "docker network inspect" checks the exact name; the previous
# "docker network ls | grep -q" also matched any network whose name merely
# contained "aiops-network".
if ! docker network inspect aiops-network >/dev/null 2>&1; then
  log "建立 aiops-network..."
  if docker network create aiops-network; then
    log "✅ aiops-network 建立"
  else
    log "⚠️ aiops-network 建立失敗"
  fi
fi

CLAWBOT_DIR="/home/ollama/clawbot-v5"
if [ ! -f "$CLAWBOT_DIR/docker-compose.yml" ]; then
  log "⚠️ 找不到 ClawBot compose 檔案"
else
  log "啟動 ClawBot..."
  if ! cd "$CLAWBOT_DIR"; then
    # Never run compose from whatever directory happens to be current.
    log "⚠️ 無法進入 ${CLAWBOT_DIR},跳過 ClawBot"
  else
    # Run compose to completion and capture its output and exit status before
    # inspecting them. The previous "up | tee | grep -q" pipeline let grep
    # exit at the first match, which could kill tee/compose with SIGPIPE and,
    # with pipefail on, make a successful start look like a failure and
    # trigger a needless --no-cache rebuild.
    CLAWBOT_UP_OUT="$(docker compose up -d 2>&1)"
    CLAWBOT_UP_RC=$?
    # Keep the output in the same file as before for troubleshooting.
    # NOTE(review): this is a fixed, predictable path under /tmp.
    printf '%s\n' "$CLAWBOT_UP_OUT" >/tmp/clawbot-up.log || true

    if [ "$CLAWBOT_UP_RC" -eq 0 ] \
      && grep -q "Started\|Running\|healthy" <<<"$CLAWBOT_UP_OUT"; then
      log "✅ ClawBot 啟動指令已發送"
    else
      # If the image snapshot is corrupted, rebuild first and try again.
      log "ClawBot 啟動失敗,嘗試 rebuild..."
      docker compose build --no-cache 2>&1 | tail -5 || true
      docker compose up -d 2>&1 | tail -3
      if [ "${PIPESTATUS[0]}" -eq 0 ]; then
        log "✅ ClawBot 啟動指令已發送"
      else
        log "⚠️ ClawBot rebuild 也失敗,跳過"
      fi
    fi
  fi
fi

# ──────────────────────────────────────────────
# Done
# ──────────────────────────────────────────────
# Closing messages: completion banner, a reminder that K3s on 120/121 must be
# confirmed by hand (or is started by k3s.service), and a pointer to the SOP.
for closing_msg in \
  "=== AWOOOI 啟動序列完成 ===" \
  "K3s 需在 120/121 手動確認啟動(或由 k3s.service 自動啟動)" \
  "詳細 SOP: docs/runbooks/REBOOT-RECOVERY-SOP.md"; do
  log "$closing_msg"
done

# Reaching the end always exits 0 so that systemd does not treat the run as
# failed.
exit 0