fix(backup): HostBackupFailed P1 根治 — Prometheus textfile 指標 + docker socket 讀取

問題一:backup_110_last_success_timestamp 指標從未存在
根因:腳本只寫純文字 last_success 檔,從未輸出 .prom 格式
修復:成功時寫入 /home/ollama/node_exporter_textfiles/backup.prom
      node_exporter 新增 --collector.textfile.directory=/textfile_collector
      volume: /home/ollama/node_exporter_textfiles:/textfile_collector

問題二:Harbor/Gitea rsync 權限拒絕
根因:/var/lib/docker/volumes/ 是 710 root:root,docker group 無法直接存取 FS 路徑
修復:改用 docker run --rm -v <volume>:/source alpine tar czf -
      透過 docker socket(wooo 已在 docker group)讀取 volume 內容再解壓

驗證:備份腳本三項全 OK,node_exporter 9100/metrics 正確輸出指標
      Prometheus absent(backup_110_last_success_timestamp) 告警應在下次 scrape 後解除

2026-04-18 ogt + Claude Sonnet 4.6

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-18 10:37:15 +08:00
parent d23343ac69
commit fb1d101902

View File

@@ -27,6 +27,8 @@ set -euo pipefail
BACKUP_ROOT="${BACKUP_ROOT:-/home/ollama/backup/110}"
LOG="${BACKUP_ROOT}/backup.log"
LAST_SUCCESS_FILE="${BACKUP_ROOT}/last_success"
TEXTFILE_DIR="${TEXTFILE_DIR:-/home/ollama/node_exporter_textfiles}"
TEXTFILE_PROM="${TEXTFILE_DIR}/backup.prom"
DATE=$(date +%Y%m%d-%H%M%S)
ERRORS=0
@@ -37,11 +39,14 @@ log() {
log "=== Starting backup from 110 ==="
# ── Harbor registry data ──────────────────────────────────────────────────────
# 2026-04-17 ogt: 改用 docker socket 讀取 volumes — /var/lib/docker/volumes/ 是 710 root:root,
# wooo 是 docker group 成員,可透過 docker run 掛載 volume,但不可直接讀取 FS 路徑
log "Backing up Harbor registry..."
if rsync -avz --delete \
-e "ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10" \
wooo@192.168.0.110:/var/lib/docker/volumes/harbor_harbor-data/_data/ \
${BACKUP_ROOT}/harbor/ >> "$LOG" 2>&1; then
mkdir -p "${BACKUP_ROOT}/harbor"
if ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 \
wooo@192.168.0.110 \
"docker run --rm -v harbor_harbor-data:/source alpine tar czf - -C /source ." \
| tar xzf - -C "${BACKUP_ROOT}/harbor/" >> "$LOG" 2>&1; then
log "✅ Harbor backup OK"
else
log "❌ ERROR: Harbor backup failed"
@@ -50,10 +55,11 @@ fi
# ── Gitea repos ───────────────────────────────────────────────────────────────
log "Backing up Gitea repos..."
if rsync -avz --delete \
-e "ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10" \
wooo@192.168.0.110:/var/lib/docker/volumes/gitea_gitea-data/_data/ \
${BACKUP_ROOT}/gitea/ >> "$LOG" 2>&1; then
mkdir -p "${BACKUP_ROOT}/gitea"
if ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 \
wooo@192.168.0.110 \
"docker run --rm -v gitea_gitea-data:/source alpine tar czf - -C /source ." \
| tar xzf - -C "${BACKUP_ROOT}/gitea/" >> "$LOG" 2>&1; then
log "✅ Gitea backup OK"
else
log "❌ ERROR: Gitea backup failed"
@@ -78,9 +84,19 @@ fi
# ── 結果處理 ─────────────────────────────────────────────────────────────────
if [ "$ERRORS" -eq 0 ]; then
# 寫入成功時間戳(供 Prometheus backup_last_success_timestamp 使用)
date +%s > "$LAST_SUCCESS_FILE"
log "=== Backup completed successfully ==="
TS=$(date +%s)
# 寫入純文字時間戳(舊格式,保留相容性)
echo "$TS" > "$LAST_SUCCESS_FILE"
# 寫入 Prometheus textfile 格式(供 node_exporter textfile collector 讀取)
# 2026-04-17 ogt: 修復 HostBackupFailed — absent(backup_110_last_success_timestamp) 永遠觸發
# 根因:只寫純文字檔,從未輸出 .prom 指標 → node_exporter 找不到 → Prometheus absent()=1
mkdir -p "$TEXTFILE_DIR"
cat > "$TEXTFILE_PROM" <<EOF
# HELP backup_110_last_success_timestamp Unix timestamp of last successful backup from 110
# TYPE backup_110_last_success_timestamp gauge
backup_110_last_success_timestamp $TS
EOF
log "=== Backup completed successfully (ts=$TS) ==="
exit 0
else
log "=== Backup FAILED ($ERRORS errors) ==="