Files
awoooi/scripts/reboot-recovery/awoooi-startup.sh
OG T c0c903dc48 fix(startup): 188 啟動腳本加入 MinIO — 解決 Velero BSL Unavailable
MinIO 重開機後不會自動啟動,導致 Velero BackupStorageLocation Unavailable
加入 MinIO docker compose up -d 到 STEP 7 Docker Compose 服務區段

⚠️ 統帥需要手動執行以下指令讓 188 上的 startup script 生效:
  sudo cp /tmp/awoooi-startup.sh /usr/local/bin/awoooi-startup.sh
  sudo chmod +x /usr/local/bin/awoooi-startup.sh

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 00:52:13 +08:00

192 lines
8.3 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# AWOOOI post-reboot automatic recovery script.
# 2026-04-04 ogt: written after a real incident; addresses PostgreSQL WAL
#   corruption, Docker BoltDB corruption and slow K3s Kine queries.
# Deployed to: /usr/local/bin/awoooi-startup.sh (on 192.168.0.188)
# systemd unit: /etc/systemd/system/awoooi-startup.service

# Treat unset variables as errors and let a pipeline fail when any stage fails.
# `-e` is intentionally not set: individual commands are allowed to fail.
set -u
set -o pipefail

LOG="/var/log/awoooi-startup.log"

# From here on, stdout and stderr both pass through tee, which appends
# everything to $LOG while still forwarding it to the original stdout.
exec > >(tee -a "$LOG") 2>&1

# log MESSAGE...
# Print the arguments on one line, prefixed with "[YYYY-MM-DD HH:MM:SS] ".
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

log "=== AWOOOI 啟動序列開始 ==="
# ──────────────────────────────────────────────
# STEP 1: containerd repair (if corrupted)
# ──────────────────────────────────────────────
# Make sure containerd is active. If a plain start does not bring it up, the
# BoltDB metadata file is assumed to be corrupted: it is moved aside under a
# timestamped .bak name so containerd can recreate it, and containerd is
# started again. The script exits with status 1 if containerd is still down.
log "[1/7] 檢查 containerd..."
if ! systemctl is-active containerd >/dev/null 2>&1; then
  log "containerd 未啟動,嘗試啟動..."
  systemctl start containerd || true   # outcome is re-checked right below
  sleep 5
fi

if systemctl is-active containerd >/dev/null 2>&1; then
  log "✅ containerd 已 active"
else
  log "containerd 啟動失敗,檢查 BoltDB 損壞..."
  BOLT_DB="/var/lib/containerd/io.containerd.metadata.v1.bolt/meta.db"
  if [ -f "$BOLT_DB" ]; then
    log "備份並刪除損壞的 meta.db..."
    # Rename instead of copy-then-delete: a rename within the same directory
    # needs no extra disk space, and the original is only removed if the
    # backup actually exists. Deleting without a backup is kept purely as a
    # logged last resort so recovery can still proceed.
    if ! mv -- "$BOLT_DB" "${BOLT_DB}.bak.$(date +%Y%m%d%H%M%S)"; then
      log "⚠️ meta.db 備份失敗,直接刪除"
      rm -f -- "$BOLT_DB"
    fi
  fi
  systemctl start containerd
  sleep 5
  # Explicit if/else: with `a && b || c`, c would also run if b failed.
  if systemctl is-active containerd >/dev/null 2>&1; then
    log "✅ containerd 修復成功"
  else
    log "❌ containerd 修復失敗"
    exit 1
  fi
fi
# ──────────────────────────────────────────────
# STEP 2: Docker repair (if corrupted)
# ──────────────────────────────────────────────
# Make sure the Docker daemon is active. If a plain start fails, the network
# BoltDB (local-kv.db) is assumed to be corrupted: it is moved aside under a
# timestamped .bak name, containerd is restarted and Docker is started again.
# The script exits with status 1 if Docker is still down afterwards.
log "[2/7] 檢查 Docker..."
if ! systemctl is-active docker >/dev/null 2>&1; then
  log "Docker 未啟動,嘗試啟動..."
  systemctl start docker || true   # outcome is re-checked right below
  sleep 8
fi

if systemctl is-active docker >/dev/null 2>&1; then
  log "✅ Docker 已 active"
else
  log "Docker 啟動失敗,修復 network BoltDB..."
  NETWORK_DB="/var/lib/docker/network/files/local-kv.db"
  if [ -f "$NETWORK_DB" ]; then
    log "備份並刪除損壞的 local-kv.db..."
    # Rename instead of copy-then-delete so the file is never removed after a
    # failed backup; plain deletion is only a logged last resort.
    if ! mv -- "$NETWORK_DB" "${NETWORK_DB}.bak.$(date +%Y%m%d%H%M%S)"; then
      log "⚠️ local-kv.db 備份失敗,直接刪除"
      rm -f -- "$NETWORK_DB"
    fi
  fi
  systemctl restart containerd
  sleep 5
  systemctl start docker
  sleep 8
  # Explicit if/else: with `a && b || c`, c would also run if b failed.
  if systemctl is-active docker >/dev/null 2>&1; then
    log "✅ Docker 修復成功"
  else
    log "❌ Docker 修復失敗"
    exit 1
  fi
fi
# ──────────────────────────────────────────────
# STEP 3: PostgreSQL repair (if corrupted)
# ──────────────────────────────────────────────
# Make sure postgresql@14-main is active. If it cannot be started and the unit
# journal shows a missing checkpoint, reset the WAL and start again. Then wait
# until the server accepts connections and run maintenance on the kine table.
# The script exits with status 1 if PostgreSQL cannot be brought up.
log "[3/7] 檢查 PostgreSQL..."
if ! systemctl is-active postgresql@14-main >/dev/null 2>&1; then
  log "PostgreSQL 未啟動,嘗試啟動..."
  systemctl start postgresql@14-main || true   # outcome is re-checked below
  sleep 8
fi

if ! systemctl is-active postgresql@14-main >/dev/null 2>&1; then
  log "PostgreSQL 啟動失敗,檢查是否 WAL 損壞..."
  # Capture the journal first instead of piping into `grep -q`: under
  # `set -o pipefail` an early grep exit can make the producer die of SIGPIPE
  # and turn a real match into a "no match".
  pg_journal="$(journalctl -u postgresql@14-main -n 20)"
  if [[ "$pg_journal" == *"could not locate a valid checkpoint"* ]]; then
    log "⚠️ WAL 損壞!執行 pg_resetwal..."
    # pg_resetwal refuses to run as root, so run it as the postgres user,
    # the same way the psql calls below do. Only report success and restart
    # when the reset actually succeeded.
    if sudo -u postgres /usr/lib/postgresql/14/bin/pg_resetwal -f /var/lib/postgresql/14/main; then
      log "WAL 重置完成,重啟 PostgreSQL..."
      systemctl start postgresql@14-main
      sleep 8
    else
      log "❌ pg_resetwal 執行失敗"
    fi
  fi
  if systemctl is-active postgresql@14-main >/dev/null 2>&1; then
    log "✅ PostgreSQL 修復成功"
  else
    log "❌ PostgreSQL 修復失敗"
    exit 1
  fi
fi

# Wait for PostgreSQL to accept connections: up to 30 probes, 2 seconds apart.
log "等待 PostgreSQL 就緒..."
for ((attempt = 1; attempt <= 30; attempt++)); do
  pg_isready -h localhost -p 5432 >/dev/null 2>&1 && break
  sleep 2
done
# Final probe; its own status line is left visible in the log.
if pg_isready -h localhost -p 5432; then
  log "✅ PostgreSQL accepting connections"
else
  log "❌ PostgreSQL 無法接受連線"
  exit 1
fi

# kine table maintenance (for stale connections or a freshly reset WAL).
# Both statements are best-effort: errors are silenced and never abort the script.
log "維護 k3s_datastore kine 表..."
# Terminate other backends on k3s_datastore whose query has been in state
# 'active' for more than 5 minutes.
sudo -u postgres psql -d k3s_datastore -c "
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE datname='k3s_datastore' AND pid!=pg_backend_pid() AND state='active'
AND query_start < now() - interval '5 minutes';
" 2>/dev/null || true
if sudo -u postgres psql -d k3s_datastore -c "VACUUM ANALYZE kine;" 2>/dev/null; then
  log "✅ kine VACUUM ANALYZE 完成"
else
  log "⚠️ kine VACUUM 跳過(可能無法連線)"
fi
# ──────────────────────────────────────────────
# STEP 4: Redis
# ──────────────────────────────────────────────
# Start redis-server (start errors ignored), wait 3 seconds, then report
# whether `redis-cli ping` answers PONG. A missing PONG is only a warning.
log "[4/7] 啟動 Redis..."
systemctl start redis-server || :
sleep 3
if redis-cli ping 2>/dev/null | grep -q PONG; then
  log "✅ Redis UP"
else
  log "⚠️ Redis 可能未就緒"
fi

# ──────────────────────────────────────────────
# STEP 5: Ollama
# ──────────────────────────────────────────────
# Ollama needs time to load its models, so it is started but not verified here.
log "[5/7] 啟動 Ollama..."
systemctl start ollama || :

# ──────────────────────────────────────────────
# STEP 6: Nginx
# ──────────────────────────────────────────────
# Start nginx (start errors ignored) and report its unit state; a unit that is
# not active is only a warning.
log "[6/7] 啟動 Nginx..."
systemctl start nginx || :
if systemctl is-active nginx >/dev/null 2>&1; then
  log "✅ Nginx UP"
else
  log "⚠️ Nginx 未就緒"
fi
# ──────────────────────────────────────────────
# STEP 7: Docker Compose services
# ──────────────────────────────────────────────
log "[7/7] 啟動 Docker Compose 服務..."

# compose_in DIR ARGS...
# Run `docker compose ARGS...` with DIR as the working directory. The work is
# done in a subshell, so the script's own working directory never changes and
# a failed `cd` can never run compose against a different project.
# Returns: the status of `cd` or of `docker compose`.
compose_in() {
  local project_dir=$1
  shift
  ( cd "$project_dir" && docker compose "$@" )
}

# The `if compose_in ... | tail` checks below see a compose failure through
# the pipe because the script runs with `set -o pipefail`.

# SignOz
SIGNOZ_DIR="/home/ollama/signoz/deploy/docker"
if [ -f "$SIGNOZ_DIR/docker-compose.yaml" ]; then
  log "啟動 SignOz..."
  if compose_in "$SIGNOZ_DIR" up -d 2>&1 | tail -3; then
    log "✅ SignOz 啟動指令已發送"
  else
    log "⚠️ SignOz 啟動失敗"
  fi
else
  log "⚠️ 找不到 SignOz compose 檔案"
fi

# MinIO (Velero backup storage)
# 2026-04-05 Claude Code: added MinIO to fix the Velero BSL Unavailable issue.
MINIO_DIR="/home/ollama/minio"
if [ -f "$MINIO_DIR/docker-compose.yml" ]; then
  log "啟動 MinIO (Velero 備份存儲)..."
  if compose_in "$MINIO_DIR" up -d 2>&1 | tail -3; then
    log "✅ MinIO 啟動指令已發送"
  else
    log "⚠️ MinIO 啟動失敗"
  fi
else
  log "⚠️ 找不到 MinIO compose 檔案"
fi

# ClawBot (depends on aiops-network)
# Ensure aiops-network exists; per the original author's note, this external
# network has to be created by hand after a reboot. `network inspect` matches
# the exact name, unlike grepping `network ls`, which also matches substrings.
if ! docker network inspect aiops-network >/dev/null 2>&1; then
  log "建立 aiops-network..."
  if docker network create aiops-network; then
    log "✅ aiops-network 建立"
  else
    log "⚠️ aiops-network 建立失敗"
  fi
fi

CLAWBOT_DIR="/home/ollama/clawbot-v5"
if [ -f "$CLAWBOT_DIR/docker-compose.yml" ]; then
  log "啟動 ClawBot..."
  # NOTE(review): fixed path under /tmp kept from the original script;
  # consider moving it to a root-owned directory.
  clawbot_up_log="/tmp/clawbot-up.log"
  # Let compose finish writing to the file, then inspect the file. Piping
  # compose through `grep -q` lets grep exit on the first match, which can kill
  # compose with SIGPIPE mid-start and, under pipefail, report a false failure.
  if compose_in "$CLAWBOT_DIR" up -d >"$clawbot_up_log" 2>&1 \
    && grep -q "Started\|Running\|healthy" "$clawbot_up_log"; then
    log "✅ ClawBot 啟動指令已發送"
  else
    # First attempt failed (the original author's note names a corrupted image
    # snapshot as the expected cause): rebuild without cache and retry once.
    tail -3 "$clawbot_up_log" 2>/dev/null || true
    log "ClawBot 啟動失敗,嘗試 rebuild..."
    compose_in "$CLAWBOT_DIR" build --no-cache 2>&1 | tail -5 || true
    if compose_in "$CLAWBOT_DIR" up -d 2>&1 | tail -3; then
      log "✅ ClawBot 啟動指令已發送"
    else
      log "⚠️ ClawBot rebuild 也失敗,跳過"
    fi
  fi
else
  log "⚠️ 找不到 ClawBot compose 檔案"
fi
# ──────────────────────────────────────────────
# Done
# ──────────────────────────────────────────────
# Emit the closing banner and the operator reminders, one log line each.
for closing_msg in \
  "=== AWOOOI 啟動序列完成 ===" \
  "K3s 需在 120/121 手動確認啟動(或由 k3s.service 自動啟動)" \
  "詳細 SOP: docs/runbooks/REBOOT-RECOVERY-SOP.md"; do
  log "$closing_msg"
done

# Always finish with exit 0 so systemd does not treat the unit as failed.
exit 0