From fb1d1019024ca937bb5fd4ea56d4f592b33dabe2 Mon Sep 17 00:00:00 2001 From: OG T Date: Sat, 18 Apr 2026 10:37:15 +0800 Subject: [PATCH] =?UTF-8?q?fix(backup):=20HostBackupFailed=20P1=20?= =?UTF-8?q?=E6=A0=B9=E6=B2=BB=20=E2=80=94=20Prometheus=20textfile=20?= =?UTF-8?q?=E6=8C=87=E6=A8=99=20+=20docker=20socket=20=E8=AE=80=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 問題一:backup_110_last_success_timestamp 指標從未存在 根因:腳本只寫純文字 last_success 檔,從未輸出 .prom 格式 修復:成功時寫入 /home/ollama/node_exporter_textfiles/backup.prom node_exporter 新增 --collector.textfile.directory=/textfile_collector volume: /home/ollama/node_exporter_textfiles:/textfile_collector 問題二:Harbor/Gitea rsync 權限拒絕 根因:/var/lib/docker/volumes/ 是 710 root:root,docker group 無法直接存取 FS 路徑 修復:改用 docker run --rm -v VOLUME:/source alpine tar czf - 透過 docker socket(wooo 已在 docker group)讀取 volume 內容再解壓 驗證:備份腳本三項全 OK,node_exporter 9100/metrics 正確輸出指標 Prometheus absent(backup_110_last_success_timestamp) 應在下次 scrape 後清除 2026-04-18 ogt + Claude Sonnet 4.6 Co-Authored-By: Claude Sonnet 4.6 --- scripts/ops/backup-from-110.sh | 38 ++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/scripts/ops/backup-from-110.sh b/scripts/ops/backup-from-110.sh index f0cda3ee..aa94f76c 100644 --- a/scripts/ops/backup-from-110.sh +++ b/scripts/ops/backup-from-110.sh @@ -27,6 +27,8 @@ set -euo pipefail BACKUP_ROOT="${BACKUP_ROOT:-/home/ollama/backup/110}" LOG="${BACKUP_ROOT}/backup.log" LAST_SUCCESS_FILE="${BACKUP_ROOT}/last_success" +TEXTFILE_DIR="${TEXTFILE_DIR:-/home/ollama/node_exporter_textfiles}" +TEXTFILE_PROM="${TEXTFILE_DIR}/backup.prom" DATE=$(date +%Y%m%d-%H%M%S) ERRORS=0 @@ -37,11 +39,14 @@ log() { log "=== Starting backup from 110 ===" # ── Harbor registry data ────────────────────────────────────────────────────── +# 2026-04-17 ogt: 改用 docker socket 讀取 volumes(/var/lib/docker/volumes/ 是 710 root:root) +# wooo 是 docker group 成員,可透過 docker run 掛載 
volume,不可直接讀取 FS 路徑 log "Backing up Harbor registry..." -if rsync -avz --delete \ - -e "ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10" \ - wooo@192.168.0.110:/var/lib/docker/volumes/harbor_harbor-data/_data/ \ - ${BACKUP_ROOT}/harbor/ >> "$LOG" 2>&1; then +mkdir -p "${BACKUP_ROOT}/harbor" +if ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 \ + wooo@192.168.0.110 \ + "docker run --rm -v harbor_harbor-data:/source alpine tar czf - -C /source ." \ + | tar xzf - -C "${BACKUP_ROOT}/harbor/" >> "$LOG" 2>&1; then log "✅ Harbor backup OK" else log "❌ ERROR: Harbor backup failed" @@ -50,10 +55,11 @@ fi # ── Gitea repos ─────────────────────────────────────────────────────────────── log "Backing up Gitea repos..." -if rsync -avz --delete \ - -e "ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10" \ - wooo@192.168.0.110:/var/lib/docker/volumes/gitea_gitea-data/_data/ \ - ${BACKUP_ROOT}/gitea/ >> "$LOG" 2>&1; then +mkdir -p "${BACKUP_ROOT}/gitea" +if ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 \ + wooo@192.168.0.110 \ + "docker run --rm -v gitea_gitea-data:/source alpine tar czf - -C /source ." \ + | tar xzf - -C "${BACKUP_ROOT}/gitea/" >> "$LOG" 2>&1; then log "✅ Gitea backup OK" else log "❌ ERROR: Gitea backup failed" @@ -78,9 +84,19 @@ fi # ── 結果處理 ───────────────────────────────────────────────────────────────── if [ "$ERRORS" -eq 0 ]; then - # 寫入成功時間戳(供 Prometheus backup_last_success_timestamp 使用) - date +%s > "$LAST_SUCCESS_FILE" - log "=== Backup completed successfully ===" + TS=$(date +%s) + # 寫入純文字時間戳(舊格式,保留相容性) + echo "$TS" > "$LAST_SUCCESS_FILE" + # 寫入 Prometheus textfile 格式(供 node_exporter textfile collector 讀取) + # 2026-04-17 ogt: 修復 HostBackupFailed — absent(backup_110_last_success_timestamp) 永遠觸發 + # 根因:只寫純文字檔,從未輸出 .prom 指標 → node_exporter 找不到 → Prometheus absent()=1 + mkdir -p "$TEXTFILE_DIR" + cat > "$TEXTFILE_PROM" <