Files
awoooi/scripts/backup/backup-monitoring.sh
OG T f51bf5a6a8 feat(backup): 全服務備份覆蓋 + 告警機制 — 9/9 服務完整
新增備份(已部署到 110,首次執行全部通過):
- backup-langfuse.sh: Langfuse AI 追蹤/評測 DB (7238 traces)
- backup-monitoring.sh: Prometheus + Grafana + Alertmanager volumes + configs
- backup-signoz.sh: SigNoz ClickHouse + SQLite (分散式追蹤/日誌)
- backup-open-webui.sh: Open-WebUI LLM 對話紀錄 (SSH 188 volume)
- backup-clawbot.sh: ClawBot Redis 狀態/快取 (SSH 188 volume)
- backup-all.sh v3.0: 整合至 9/9 服務

告警機制:
- common.sh: notify_clawbot 改用 /webhook/custom 正確格式
- failed → severity:critical → Telegram 🔴 立即告警
- 告警測試通過:{"status":"ok","alert_id":"878c4c59..."}

GFS 保留:30日/12週/24月 (AWOOOI 額外 28h 高頻)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 11:12:42 +08:00

110 lines
4.1 KiB
Bash
Executable File

#!/bin/bash
# =============================================================================
# WOOO AIOps - Monitoring backup script (Prometheus + Grafana + Alertmanager)
# Version: 1.1.0
# Created: 2026-04-05
# 2026-04-05 Claude Code: added monitoring data backup — chief architect backup audit
# 2026-04-05 Claude Code: v1.1 fixed tar pipeline exit-code handling for the 1.1GB Prometheus volume
# =============================================================================
set -euo pipefail
# NOTE(review): BACKUP_BASE, RESTIC_PASSWORD_FILE, the log_* helpers,
# notify_clawbot, build_tags and cleanup_old_backups are used by this script
# but not defined in it — presumably they all come from common.sh; confirm.
source "$(dirname "$0")/common.sh"
# Service name passed to notify_clawbot and build_tags.
SERVICE="monitoring"
# Restic repository that receives the snapshots.
LOCAL_REPO="${BACKUP_BASE}/monitoring"
# Staging directory for the volume/config tarballs. Created atomically with
# mktemp (unique name, private permissions) instead of a predictable
# /tmp/...-$$ path, so a pre-existing directory or symlink cannot be reused
# for backup data. Under `set -e` the script aborts if creation fails.
DUMP_DIR="$(mktemp -d /tmp/monitoring-backup-XXXXXX)"
# Host directory holding the monitoring stack configuration files.
MONITORING_CONFIG_DIR="/home/wooo/monitoring"
# Delete the staging directory (DUMP_DIR) together with every dump inside it.
# Succeeds silently when the directory is already gone.
cleanup() {
    rm -r -f "${DUMP_DIR}"
}
#######################################
# Archive a Docker named volume into a gzip-compressed tarball.
# The volume is mounted at /data in a throwaway alpine container and tar'ed
# to stdout, which the host shell redirects into the output file.
# Arguments:
#   $1 - Docker volume name
#   $2 - path of the .tar.gz file to write
# Returns:
#   0 when the container exited 0 or 1 and the archive is non-empty;
#   1 otherwise (the incomplete archive is removed).
#######################################
backup_volume() {
    local volume_name="$1"
    local output_file="$2"
    local rc=0
    local size
    log_info "備份 volume: ${volume_name}"
    # tar may exit 1 on a large live volume because mmap/lock files change
    # while being read; that is tolerated as a warning. Any other non-zero
    # status (docker errors, container killed, ...) means the stream is
    # incomplete and must not be reported as a successful backup.
    docker run --rm -v "${volume_name}:/data" alpine tar czf - /data 2>/dev/null > "${output_file}" || rc=$?
    if (( rc > 1 )); then
        log_error "  Volume ${volume_name} 備份失敗 (exit code ${rc})"
        # Drop the partial archive so it is not swept into the Restic snapshot.
        rm -f -- "${output_file}"
        return 1
    fi
    if (( rc == 1 )); then
        log_warn "  Volume ${volume_name}: tar 回報警告 (exit code 1),繼續驗證檔案"
    fi
    if [[ ! -s "${output_file}" ]]; then
        log_error "  Volume ${volume_name} 備份失敗 (空檔案)"
        rm -f -- "${output_file}"
        return 1
    fi
    # Size is informational only; never fail the backup because du failed.
    size=$(du -h "${output_file}" | cut -f1) || size="unknown"
    log_success "  Volume ${volume_name} 備份完成 (${size})"
    return 0
}
#######################################
# Run the monitoring backup end to end:
#   1-3. dump the Prometheus, Grafana and Alertmanager volumes,
#   4.   archive the monitoring config directory,
#   5.   initialise the Restic repository when it has no data/ directory,
#   6.   snapshot the staging directory with Restic,
#   7.   apply retention via cleanup_old_backups.
# Globals (read): SERVICE, LOCAL_REPO, DUMP_DIR, MONITORING_CONFIG_DIR,
#                 RESTIC_PASSWORD_FILE
# Exits:
#   1 when the Prometheus dump, the Restic init or the Restic backup fails,
#     or when the snapshot was taken but Grafana/Alertmanager were missing.
#   A "failed" notification is sent in each of those cases.
#######################################
main() {
    local start_time
    start_time=$(date +%s)
    log_info "========== 開始 Monitoring 備份 =========="
    mkdir -p "${DUMP_DIR}"
    local timestamp
    timestamp=$(date "+%Y%m%d_%H%M%S")
    # Names of the optional volumes that could not be dumped.
    local failed_parts=""

    # Step 1: Prometheus volume (TSDB data, ~1GB+). Mandatory: abort on failure.
    # The staging directory is removed before notifying so that a failing
    # notifier cannot leave it behind under `set -e`.
    backup_volume "monitoring_prometheus_data" "${DUMP_DIR}/prometheus_${timestamp}.tar.gz" || {
        cleanup
        notify_clawbot "failed" "${SERVICE}" "Prometheus volume 備份失敗"
        exit 1
    }

    # Step 2: Grafana volume (dashboards/alert settings). Best effort.
    backup_volume "monitoring_grafana_data" "${DUMP_DIR}/grafana_${timestamp}.tar.gz" || {
        log_warn "Grafana volume 備份失敗,繼續..."
        failed_parts+=" Grafana"
    }

    # Step 3: Alertmanager volume (silences/routing settings). Best effort.
    backup_volume "monitoring_alertmanager_data" "${DUMP_DIR}/alertmanager_${timestamp}.tar.gz" || {
        log_warn "Alertmanager volume 備份失敗,繼續..."
        failed_parts+=" Alertmanager"
    }

    # Step 4: monitoring config directory. Best effort; only warned about.
    log_info "備份 monitoring 設定檔 (${MONITORING_CONFIG_DIR})"
    if [[ -d "${MONITORING_CONFIG_DIR}" ]]; then
        tar czf "${DUMP_DIR}/monitoring-configs_${timestamp}.tar.gz" \
            -C "$(dirname "${MONITORING_CONFIG_DIR}")" \
            "$(basename "${MONITORING_CONFIG_DIR}")" 2>/dev/null || true
        if [[ -s "${DUMP_DIR}/monitoring-configs_${timestamp}.tar.gz" ]]; then
            log_success "設定檔備份完成"
        else
            log_warn "設定檔備份失敗或為空"
        fi
    else
        log_warn "monitoring 設定目錄不存在: ${MONITORING_CONFIG_DIR}"
    fi

    # Step 5: initialise the Restic repository on first use.
    if [[ ! -d "${LOCAL_REPO}/data" ]]; then
        log_info "初始化 Restic 倉庫: ${LOCAL_REPO}"
        restic -r "${LOCAL_REPO}" init --password-file "${RESTIC_PASSWORD_FILE}" 2>&1 || {
            log_error "Restic 倉庫初始化失敗"
            cleanup
            notify_clawbot "failed" "${SERVICE}" "Restic 倉庫初始化失敗"
            exit 1
        }
    fi

    # Step 6: Restic snapshot of the staging directory.
    log_info "建立 Restic 備份..."
    local tags
    # A build_tags failure keeps its previous effect (empty/partial tag list)
    # instead of aborting under `set -e`.
    tags=$(build_tags "${SERVICE}") || true
    # tags is left unquoted on purpose — presumably build_tags emits several
    # CLI words that must be split into separate arguments (confirm).
    # shellcheck disable=SC2086
    restic -r "${LOCAL_REPO}" backup "${DUMP_DIR}" --password-file "${RESTIC_PASSWORD_FILE}" ${tags} 2>&1 || {
        log_error "Restic 備份失敗"
        cleanup
        notify_clawbot "failed" "${SERVICE}" "Restic 備份失敗"
        exit 1
    }
    # NOTE(review): restic applies --latest per host+path group and appears to
    # list snapshots oldest-first. The staging path differs on every run, so
    # several entries may come back; the last one is the snapshot just taken.
    local snapshot_id
    snapshot_id=$(restic -r "${LOCAL_REPO}" snapshots --latest 1 --json --password-file "${RESTIC_PASSWORD_FILE}" 2>/dev/null \
        | grep -oP '"short_id":"\K[^"]+' \
        | tail -n 1) || true
    log_success "Restic 備份完成: ${snapshot_id}"

    # The dumps are in the repository now; free the staging space before
    # pruning so a retention failure cannot leave it behind.
    cleanup

    # Step 7: GFS retention.
    cleanup_old_backups "${LOCAL_REPO}"

    local end_time duration
    end_time=$(date +%s)
    duration=$((end_time - start_time))

    # A snapshot without Grafana/Alertmanager is incomplete: raise an alert
    # rather than reporting success.
    if [[ -n "${failed_parts}" ]]; then
        log_error "========== Monitoring 備份部分失敗 (${duration}s),未備份:${failed_parts} =========="
        notify_clawbot "failed" "${SERVICE}" "Monitoring 備份部分失敗,未備份:${failed_parts}"
        exit 1
    fi

    log_success "========== Monitoring 備份完成 (${duration}s) =========="
    notify_clawbot "success" "${SERVICE}" "Monitoring 備份完成 (Prometheus+Grafana+Alertmanager)" "${duration}"
}
# Script entry point: run main with all command-line arguments forwarded.
main "$@"