From 43edff184d2edc7686fd68543d618f49ccd61dd5 Mon Sep 17 00:00:00 2001 From: OG T Date: Sat, 11 Apr 2026 03:04:18 +0800 Subject: [PATCH] =?UTF-8?q?feat(dr):=20Sprint=20C=20=E2=80=94=20Host=20rsy?= =?UTF-8?q?nc=20=E5=82=99=E4=BB=BD=20+=20DR=20SOP=20=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit C-1 Velero: 已確認運作中(daily-awoooi-prod schedule, 13d, MinIO Available) C-2 Host rsync 備份: scripts/ops/backup-from-110.sh — 188 每日凌晨 1:00 rsync 備份 110 - Harbor registry data(最高優先) - Gitea repos - bitan-pharmacy.git(若存在) - 成功寫入 /var/run/backup-110.last_success 供 Prometheus 監控 - 失敗時 Telegram 告警 ops/monitoring/alerts-unified.yml — 新增 HostBackupFailed 告警規則 C-3 DR SOP 文件: docs/runbooks/disaster-recovery/DR-K8s-awoooi.md (<15分鐘) docs/runbooks/disaster-recovery/DR-Nginx.md (<5分鐘) docs/runbooks/disaster-recovery/DR-Harbor.md (<30分鐘) docs/runbooks/disaster-recovery/DR-Bitan.md (<5分鐘) docs/runbooks/disaster-recovery/DR-Stock.md (<5分鐘) 部署備份腳本說明 (需手動執行): scp scripts/ops/backup-from-110.sh ollama@192.168.0.188:~/bin/backup-from-110.sh ssh ollama@192.168.0.188 "chmod +x ~/bin/backup-from-110.sh && mkdir -p /backup/110/{harbor,gitea}" ssh ollama@192.168.0.188 "echo '0 1 * * * /home/ollama/bin/backup-from-110.sh' | crontab -" Co-Authored-By: Claude Sonnet 4.6 --- docs/runbooks/disaster-recovery/DR-Bitan.md | 46 +++++++ docs/runbooks/disaster-recovery/DR-Harbor.md | 83 +++++++++++++ .../disaster-recovery/DR-K8s-awoooi.md | 112 ++++++++++++++++++ docs/runbooks/disaster-recovery/DR-Nginx.md | 69 +++++++++++ docs/runbooks/disaster-recovery/DR-Stock.md | 47 ++++++++ ops/monitoring/alerts-unified.yml | 18 +++ scripts/ops/backup-from-110.sh | 97 +++++++++++++++ 7 files changed, 472 insertions(+) create mode 100644 docs/runbooks/disaster-recovery/DR-Bitan.md create mode 100644 docs/runbooks/disaster-recovery/DR-Harbor.md create mode 100644 docs/runbooks/disaster-recovery/DR-K8s-awoooi.md create mode 100644 
docs/runbooks/disaster-recovery/DR-Nginx.md create mode 100644 docs/runbooks/disaster-recovery/DR-Stock.md create mode 100644 scripts/ops/backup-from-110.sh diff --git a/docs/runbooks/disaster-recovery/DR-Bitan.md b/docs/runbooks/disaster-recovery/DR-Bitan.md new file mode 100644 index 00000000..c991fa07 --- /dev/null +++ b/docs/runbooks/disaster-recovery/DR-Bitan.md @@ -0,0 +1,46 @@ +# DR-Bitan — bitan-pharmacy 容器崩潰復原 SOP + +**目標時間**: < 5 分鐘 +**觸發場景**: bitan-pharmacy 容器停止、崩潰,或 Docker daemon 重啟後未自動啟動 +**工具**: docker compose, Ansible +**最後更新**: 2026-04-11 (Claude Sonnet 4.6 Asia/Taipei) + +--- + +## 快速復原 + +```bash +ssh wooo@192.168.0.110 "cd /home/wooo/apps/bitan-pharmacy && docker compose up -d" + +# 驗收(30 秒內) +curl -s -o /dev/null -w '%{http_code}' https://bitan.wooo.work +# 期望: 200 +``` + +--- + +## 診斷步驟(若快速復原失敗) + +```bash +# 查看容器狀態 +ssh wooo@192.168.0.110 "docker ps -a | grep bitan" + +# 查看最近 log +ssh wooo@192.168.0.110 "docker logs bitan-pharmacy --tail 50" + +# 常見問題: +# 1. Port 3003 被佔用 → 找佔用程序: ss -tlnp | grep 3003 +# 2. 磁碟空間不足 → df -h +# 3. 
Image 損壞 → docker compose build && docker compose up -d +``` + +--- + +## 用 Ansible 確認狀態 + +```bash +# 在 MacBook 執行 +ansible-playbook -i infra/ansible/inventory/hosts.yml \ + infra/ansible/playbooks/110-devops.yml \ + --tags bitan +``` diff --git a/docs/runbooks/disaster-recovery/DR-Harbor.md b/docs/runbooks/disaster-recovery/DR-Harbor.md new file mode 100644 index 00000000..99ee533b --- /dev/null +++ b/docs/runbooks/disaster-recovery/DR-Harbor.md @@ -0,0 +1,83 @@ +# DR-Harbor — Harbor Registry 資料毀損還原 SOP + +**目標時間**: < 30 分鐘 +**觸發場景**: Harbor 資料毀損、container image 消失,或 harbor-data volume 損壞 +**工具**: rsync, docker compose +**最後更新**: 2026-04-11 (Claude Sonnet 4.6 Asia/Taipei) + +--- + +## 前提確認 + +```bash +# 確認 188 上的 rsync 備份存在 +ls -lh /backup/110/harbor/ +# 期望:有檔案,最近修改時間 < 25 小時 + +# 確認最後備份時間 +cat /var/run/backup-110.last_success | xargs -I{} date -d @{} +``` + +--- + +## 步驟 1:停止 Harbor(2 分鐘) + +```bash +ssh wooo@192.168.0.110 "cd /opt/harbor && docker compose down" +``` + +--- + +## 步驟 2:備份現有損壞資料(1 分鐘) + +```bash +ssh wooo@192.168.0.110 \ + "sudo mv /var/lib/docker/volumes/harbor_harbor-data \ + /var/lib/docker/volumes/harbor_harbor-data.bak.$(date +%Y%m%d) && sudo mkdir -p /var/lib/docker/volumes/harbor_harbor-data/_data" +``` + +--- + +## 步驟 3:從 188 rsync 備份還原到 110(15 分鐘) + +```bash +# 在 188 執行(rsync 方向:188 備份 → 110;目標目錄為 root 所有,需以 sudo rsync 寫入) +rsync -avz --rsync-path="sudo rsync" \ + /backup/110/harbor/ \ + wooo@192.168.0.110:/var/lib/docker/volumes/harbor_harbor-data/_data/ +``` + +--- + +## 步驟 4:重啟 Harbor(5 分鐘) + +```bash +ssh wooo@192.168.0.110 "cd /opt/harbor && docker compose up -d" + +# 等待 Harbor 就緒 +for i in $(seq 1 12); do + CODE=$(curl -s -o /dev/null -w '%{http_code}' http://192.168.0.110:5000 2>/dev/null) + [ "$CODE" = "200" ] || [ "$CODE" = "301" ] && echo "✅ Harbor ready" && break + echo "⏳ 等待 Harbor... 
($i/12)"; sleep 10 +done +``` + +--- + +## 步驟 5:驗收 + +```bash +# 確認 images 存在 +curl -s http://192.168.0.110:5000/v2/_catalog | python3 -m json.tool + +# 驗證 CD pipeline 可 pull(下一次部署自動驗證) +docker pull 192.168.0.110:5000/awoooi/api:latest +``` + +--- + +## 預防 + +- `scripts/ops/backup-from-110.sh` 每天凌晨 1:00 在 188 執行 +- 備份失敗時觸發 Telegram 告警 +- 若 rsync 備份超過 25 小時未更新,Prometheus `HostBackupFailed` 告警觸發 diff --git a/docs/runbooks/disaster-recovery/DR-K8s-awoooi.md b/docs/runbooks/disaster-recovery/DR-K8s-awoooi.md new file mode 100644 index 00000000..7d0e0e5f --- /dev/null +++ b/docs/runbooks/disaster-recovery/DR-K8s-awoooi.md @@ -0,0 +1,112 @@ +# DR-K8s-awoooi — K8s Pod 全掛復原 SOP + +**目標時間**: < 15 分鐘 +**觸發場景**: awoooi-prod namespace 所有 Pod 無法啟動,或 K3s 節點失效 +**工具**: Velero, kubectl, ArgoCD +**最後更新**: 2026-04-11 (Claude Sonnet 4.6 Asia/Taipei) + +--- + +## 前提確認 + +```bash +# 確認 Velero 可存取備份 +kubectl get backups -n velero --sort-by='.metadata.creationTimestamp' | tail -5 + +# 確認 BackupStorageLocation 狀態 +kubectl get backupstoragelocations -n velero +# 期望: default Available +``` + +--- + +## 步驟 1:確認最新可用備份(1 分鐘) + +```bash +# 列出最近備份 +kubectl get backups -n velero | grep daily-awoooi-prod | tail -3 + +# 確認備份完整 +BACKUP_NAME=daily-awoooi-prod-$(date -d yesterday +%Y%m%d)020005 +kubectl describe backup $BACKUP_NAME -n velero | grep -E "Phase|Errors|Warnings" +# 期望: Phase: Completed, Errors: 0 +``` + +--- + +## 步驟 2:暫停 ArgoCD 自動 Sync(2 分鐘) + +> ⚠️ 必須先暫停,否則 ArgoCD selfHeal 會與 Velero restore 競爭 + +```bash +kubectl patch application awoooi-prod -n argocd \ + --type merge \ + -p '{"spec":{"syncPolicy":{"automated":null}}}' + +echo "✅ ArgoCD auto-sync 已暫停" +``` + +--- + +## 步驟 3:執行 Velero Restore(5 分鐘) + +```bash +# 使用最新備份恢復 +BACKUP_NAME=$(kubectl get backups -n velero --sort-by='.metadata.creationTimestamp' \ + -o jsonpath='{.items[-1:].metadata.name}') +echo "使用備份: $BACKUP_NAME" +RESTORE_NAME="awoooi-restore-$(date +%H%M)" +velero restore create "$RESTORE_NAME" \ + --from-backup $BACKUP_NAME \ + --include-namespaces 
awoooi-prod \ + --wait + +# 監控進度 +velero restore describe "$RESTORE_NAME" --details +``` + +--- + +## 步驟 4:驗證服務恢復(3 分鐘) + +```bash +# 確認 Pod 全部 Running +kubectl get pods -n awoooi-prod + +# Health Check +curl -f http://192.168.0.121:32334/api/v1/health +# 期望: {"status": "healthy"} + +# Web 服務 +curl -s -o /dev/null -w '%{http_code}' https://awoooi.wooo.work +# 期望: 200 +``` + +--- + +## 步驟 5:恢復 ArgoCD 自動 Sync(1 分鐘) + +```bash +kubectl patch application awoooi-prod -n argocd \ + --type merge \ + -p '{"spec":{"syncPolicy":{"automated":{"prune":true,"selfHeal":true}}}}' + +# 觸發一次同步確認一致 +kubectl annotate application awoooi-prod -n argocd \ + argocd.argoproj.io/refresh=hard + +echo "✅ ArgoCD auto-sync 已恢復" +``` + +--- + +## 回滾方案 + +若 restore 失敗,直接從 Git 重新部署: + +```bash +# 強制 ArgoCD sync(從 Git 最新狀態重建) +kubectl patch application awoooi-prod -n argocd \ + --type merge \ + -p '{"operation":{"sync":{"force":true}}}' +``` diff --git a/docs/runbooks/disaster-recovery/DR-Nginx.md b/docs/runbooks/disaster-recovery/DR-Nginx.md new file mode 100644 index 00000000..49f6dc8f --- /dev/null +++ b/docs/runbooks/disaster-recovery/DR-Nginx.md @@ -0,0 +1,69 @@ +# DR-Nginx — Nginx 設定錯誤回滾 SOP + +**目標時間**: < 5 分鐘 +**觸發場景**: nginx conf 錯誤導致 reload 失敗,或手動修改造成設定漂移 +**工具**: Ansible, git +**最後更新**: 2026-04-11 (Claude Sonnet 4.6 Asia/Taipei) + +--- + +## 場景 A:nginx conf 語法錯誤(最常見) + +```bash +# 在 MacBook 執行 +# 1. 確認問題 +ssh wooo@192.168.0.188 "sudo nginx -t 2>&1" + +# 2. 用 Ansible 從 Git 重新部署正確版本 +cd /Users/ogt/awoooi +ansible-playbook -i infra/ansible/inventory/hosts.yml \ + infra/ansible/playbooks/nginx-sync.yml \ + --tags 188 + +# 3. 
驗收 +curl -s -o /dev/null -w '%{http_code}' https://awoooi.wooo.work +# 期望: 200 +``` + +--- + +## 場景 B:需要回滾到前一版本 + +```bash +# 查看最近 nginx conf 變更 +git log --oneline -- infra/ansible/roles/nginx/templates/188-all-sites.conf.j2 | head -5 + +# 回滾到前一個 commit +git revert HEAD --no-edit # 或 git checkout <commit> -- infra/ansible/roles/nginx/templates/188-all-sites.conf.j2 +git push gitea main + +# 立即用 Ansible 套用回滾版本 +ansible-playbook -i infra/ansible/inventory/hosts.yml \ + infra/ansible/playbooks/nginx-sync.yml \ + --tags 188 +``` + +--- + +## 場景 C:188 nginx 完全無法啟動,需要 110 接管 + +```bash +# 在 192.168.0.110 或 188 強制 keepalived failover +# (僅在 188 完全失效時使用) +ssh wooo@192.168.0.110 "sudo systemctl status keepalived" + +# 若 110 已自動接管 VIP:200,確認: +ssh wooo@192.168.0.110 "ip addr show | grep 192.168.0.200" +# 期望:顯示 VIP 已在 110 + +# 確認對外服務正常(透過 VIP) +curl -s -o /dev/null -w '%{http_code}' https://awoooi.wooo.work +``` + +--- + +## 預防 + +- 所有 nginx conf 變更必須透過 `infra/ansible/playbooks/nginx-sync.yml` 部署 +- 禁止直接 SSH 到 188 修改 `/etc/nginx/sites-enabled/` +- Ansible 會自動 `backup: true`(在部署前備份 `.bak.TIMESTAMP`) diff --git a/docs/runbooks/disaster-recovery/DR-Stock.md b/docs/runbooks/disaster-recovery/DR-Stock.md new file mode 100644 index 00000000..c32e08b8 --- /dev/null +++ b/docs/runbooks/disaster-recovery/DR-Stock.md @@ -0,0 +1,47 @@ +# DR-Stock — stock 服務崩潰復原 SOP + +**目標時間**: < 5 分鐘 +**觸發場景**: stock K8s Service/Pod 無法訪問,或 NodePort 31235 無回應 +**工具**: kubectl, ArgoCD +**最後更新**: 2026-04-11 (Claude Sonnet 4.6 Asia/Taipei) + +--- + +## 快速復原 + +```bash +# 確認 stock Pod 狀態 +kubectl get pods -n awoooi-prod -l app=stock 2>/dev/null || \ + echo "stock 可能不在 K8s,確認是否為 Docker 服務" + +# 若是 K8s Pod +kubectl rollout restart deployment/stock -n awoooi-prod +kubectl rollout status deployment/stock -n awoooi-prod --timeout=60s + +# 驗收 +curl -s -o /dev/null -w '%{http_code}' https://stock.wooo.work +# 期望: 200 +``` + +--- + +## NodePort 31235 確認 + +```bash +# 確認 NodePort Service 存在 +kubectl get svc -n awoooi-prod | grep 31235 + +# 直連 NodePort 測試 +curl -s -o /dev/null -w 
'%{http_code}' http://192.168.0.121:31235 +``` + +--- + +## 資料完整性確認 + +stock 資料存放在 PostgreSQL。確認資料庫連接正常: + +```bash +# 透過 API 健康檢查確認 DB 正常 +curl http://192.168.0.121:32334/api/v1/health | python3 -m json.tool +``` diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml index 566d3408..cac21f29 100644 --- a/ops/monitoring/alerts-unified.yml +++ b/ops/monitoring/alerts-unified.yml @@ -156,6 +156,24 @@ groups: summary: "Velero 超過 24 小時未成功備份" description: "最後一次成功備份超過 24 小時" + # Sprint C-2 host rsync backup alert (2026-04-11 Claude Sonnet 4.6) + # Fires when backup_110_last_success_timestamp is missing or older than 25 h (90000 s). + # NOTE(review): backup-from-110.sh only writes a bare epoch to /var/run/backup-110.last_success; node-exporter's textfile collector reads *.prom files only — confirm something exports this metric. + - alert: HostBackupFailed + expr: absent(backup_110_last_success_timestamp) or (time() - backup_110_last_success_timestamp > 90000) + for: 10m + labels: + severity: warning + layer: docker-188 + team: ops + component: backup + host: "188" + auto_repair: "false" + alert_category: host_resource + annotations: + summary: "188 Host 備份超過 25 小時未成功" + description: "backup-from-110.sh 可能失敗,/backup/110 資料可能過舊" + # ========================================================================= # 資料庫告警 (database_alerts) # ========================================================================= diff --git a/scripts/ops/backup-from-110.sh b/scripts/ops/backup-from-110.sh new file mode 100644 index 00000000..baad500a --- /dev/null +++ b/scripts/ops/backup-from-110.sh @@ -0,0 +1,97 @@ +#!/bin/bash +# ============================================================================= +# backup-from-110.sh — host-level backup for 188 (rsync pull from 110 to 188) +# ============================================================================= +# Deployed at: /home/ollama/bin/backup-from-110.sh (on 188) +# Runs as: ollama (primary account on 188) +# Cron: 0 1 * * * /home/ollama/bin/backup-from-110.sh +# +# Backup items: +# 1. Harbor registry data (highest priority) +# 2. Gitea repos +# 3. 
bitan-pharmacy git bare repo (if present) +# +# Prerequisites: +# - The ollama account on 188 is in the authorized_keys of the wooo account on 110 +# - /backup/110/{harbor,gitea} exist (mkdir -p /backup/110/{harbor,gitea}) +# - 188 has enough disk space (> 50GB free recommended) +# +# Success/failure reporting: +# - On success, writes the epoch timestamp to /var/run/backup-110.last_success +# - On failure, sends a Telegram alert (only if TG_BOT_TOKEN and TG_CHAT_ID are both set; cron runs with a minimal environment, so define them in the crontab) +# NOTE(review): rsync --delete mirrors deletions on 110 into this single copy — confirm that is acceptable for a DR backup. +# Sprint C ADR-069 (2026-04-11 Claude Sonnet 4.6 Asia/Taipei) +# ============================================================================= +set -euo pipefail + +LOG="/var/log/backup-from-110.log" +LAST_SUCCESS_FILE="/var/run/backup-110.last_success" +DATE=$(date +%Y%m%d-%H%M%S) +ERRORS=0 +# log MESSAGE...: print a line stamped with the current time and append it to $LOG; an unwritable $LOG must not abort the backup under set -e/pipefail. +log() { + printf '[%s] %s\n' "$(date +%Y%m%d-%H%M%S)" "$*" | tee -a "$LOG" || true +} + +log "=== Starting backup from 110 ===" + +# ── Harbor registry data ────────────────────────────────────────────────────── +log "Backing up Harbor registry..." +if rsync -avz --delete \ + -e "ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10" \ + wooo@192.168.0.110:/var/lib/docker/volumes/harbor_harbor-data/_data/ \ + /backup/110/harbor/ >> "$LOG" 2>&1; then + log "✅ Harbor backup OK" +else + log "❌ ERROR: Harbor backup failed" + ERRORS=$((ERRORS + 1)) +fi + +# ── Gitea repos ─────────────────────────────────────────────────────────────── +log "Backing up Gitea repos..." +if rsync -avz --delete \ + -e "ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10" \ + wooo@192.168.0.110:/var/lib/docker/volumes/gitea_gitea-data/_data/ \ + /backup/110/gitea/ >> "$LOG" 2>&1; then + log "✅ Gitea backup OK" +else + log "❌ ERROR: Gitea backup failed" + ERRORS=$((ERRORS + 1)) +fi + +# ── bitan-pharmacy git bare repo (optional) ────────────────────────────────── +if ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=5 \ + wooo@192.168.0.110 "test -d /home/wooo/bitan-pharmacy.git" 2>/dev/null; then + log "Backing up bitan-pharmacy.git..." 
+ if rsync -avz \ + -e "ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10" \ + wooo@192.168.0.110:/home/wooo/bitan-pharmacy.git/ \ + /backup/110/bitan-pharmacy.git/ >> "$LOG" 2>&1; then + log "✅ bitan-pharmacy.git backup OK" + else + log "⚠️ bitan-pharmacy.git backup failed (non-fatal)" + fi +else + log "⚠️ bitan-pharmacy.git not found on 110, skipping" +fi + +# ── Result handling ─────────────────────────────────────────────────────────── +if [ "$ERRORS" -eq 0 ]; then + # Record the success time (epoch seconds) behind the backup_110_last_success_timestamp alert; log the reason before exiting if the marker cannot be written. + date +%s > "$LAST_SUCCESS_FILE" || { log "❌ ERROR: cannot write $LAST_SUCCESS_FILE"; exit 1; } + log "=== Backup completed successfully ===" + exit 0 +else + log "=== Backup FAILED ($ERRORS errors) ===" + + # Telegram alert (sent only when both token and chat id are set); fields are URL-encoded and the request is time-capped so a cron run cannot hang. + TG_TOKEN="${TG_BOT_TOKEN:-}" + TG_CHAT="${TG_CHAT_ID:-}" + if [ -n "$TG_TOKEN" ] && [ -n "$TG_CHAT" ]; then + curl -s --max-time 15 -X POST "https://api.telegram.org/bot${TG_TOKEN}/sendMessage" \ + --data-urlencode "chat_id=${TG_CHAT}" \ + --data-urlencode "text=🚨 backup-from-110.sh FAILED on 188 — ${ERRORS} error(s) at ${DATE}" \ + > /dev/null || true + fi + exit 1 +fi