新增備份(已部署到 110,首次執行全部通過):
- backup-langfuse.sh: Langfuse AI 追蹤/評測 DB (7238 traces)
- backup-monitoring.sh: Prometheus + Grafana + Alertmanager volumes + configs
- backup-signoz.sh: SignOz ClickHouse + SQLite (分散式追蹤/日誌)
- backup-open-webui.sh: Open-WebUI LLM 對話紀錄 (SSH 188 volume)
- backup-clawbot.sh: ClawBot Redis 狀態/快取 (SSH 188 volume)
- backup-all.sh v3.0: 整合至 9/9 服務

告警機制:
- common.sh: notify_clawbot 改用 /webhook/custom 正確格式
- failed → severity:critical → Telegram 🔴 立即告警
- 告警測試通過:{"status":"ok","alert_id":"878c4c59..."}

GFS 保留:30日/12週/24月 (AWOOOI 額外 28h 高頻)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
104 lines
3.8 KiB
Bash
Executable File
104 lines
3.8 KiB
Bash
Executable File
#!/bin/bash
# =============================================================================
# WOOO AIOps - SignOz backup script (ClickHouse + SQLite)
# Version: 1.1.0
# Created: 2026-04-05
# 2026-04-05 Claude Code: added SignOz distributed-tracing backup
#                         (chief-architect backup audit)
# 2026-04-05 Claude Code: v1.1 fixed tar pipeline exit-code handling + || true
# =============================================================================

# Abort on unhandled errors, unset variables and failing pipeline stages.
set -euo pipefail

# Shared helpers live next to this script. This file relies on names it does
# not define itself (BACKUP_BASE, RESTIC_PASSWORD_FILE, log_info / log_warn /
# log_error / log_success, notify_clawbot, build_tags, cleanup_old_backups);
# they are expected to come from common.sh.
source "$(dirname "$0")/common.sh"

# Service name used for tagging and notifications.
SERVICE="signoz"
# Local restic repository that receives the snapshots.
LOCAL_REPO="${BACKUP_BASE}/signoz"
# Scratch directory for the volume tarballs. Created with mktemp so the name
# is unpredictable and the directory is private (a fixed /tmp/<name>-$$ path
# can be pre-created or symlinked by another local user).
DUMP_DIR="$(mktemp -d /tmp/signoz-backup-XXXXXX)"

#######################################
# Best-effort cleanup: bring the OTEL collector back up and delete the
# temporary dump directory. Safe to call more than once.
# Globals:   DUMP_DIR (read)
# Arguments: none
# Returns:   0
#######################################
cleanup() {
    # Make sure the collector is running again; a failure here (already
    # running, container missing) must never abort the cleanup.
    docker start signoz-otel-collector 2>/dev/null || true

    # Only delete when DUMP_DIR is set and non-empty, so the function cannot
    # trip `set -u` or hand an empty path to rm -rf.
    if [[ -n "${DUMP_DIR:-}" ]]; then
        rm -rf -- "${DUMP_DIR}"
    fi
    return 0
}

#######################################
# Archive a Docker volume into a gzip-compressed tarball by running tar
# inside a throw-away alpine container with the volume mounted at /data.
# Arguments:
#   $1 - Docker volume name
#   $2 - path of the .tar.gz file to write
#   $3 - optional single extra tar option (e.g. an --exclude=... pattern)
# Returns:
#   0 when a non-empty archive was written and the container exited 0 or 1,
#   1 otherwise (the incomplete output file is removed).
#######################################
backup_volume() {
    local volume_name="$1"
    local output_file="$2"
    local extra_exclude="${3:-}"
    local rc=0
    local size
    local -a cmd

    log_info "備份 volume: ${volume_name}"

    # Build the command as an array so the optional tar option is passed as
    # exactly one argument without duplicating the whole invocation.
    cmd=(docker run --rm -v "${volume_name}:/data" alpine tar czf -)
    if [[ -n "${extra_exclude}" ]]; then
        cmd+=("${extra_exclude}")
    fi
    cmd+=(/data)

    # Capture the exit status instead of discarding it with `|| true`.
    "${cmd[@]}" > "${output_file}" 2>/dev/null || rc=$?

    # Exit status 1 is tolerated, as the original script did for tar warnings
    # raised while archiving a volume that is still in use. Anything above 1
    # (docker-level errors, container killed by a signal) means the archive
    # cannot be trusted even if some bytes were written.
    if (( rc > 1 )); then
        log_error "  Volume ${volume_name} 備份失敗 (exit code ${rc})"
        rm -f -- "${output_file}"
        return 1
    fi

    if [[ ! -s "${output_file}" ]]; then
        log_error "  Volume ${volume_name} 備份失敗 (空檔案)"
        # Do not leave an empty archive behind for later steps to pick up.
        rm -f -- "${output_file}"
        return 1
    fi

    if (( rc == 1 )); then
        log_warn "  Volume ${volume_name}: tar exit 1 (volume 使用中),繼續"
    fi

    # Declared separately from the assignment so `local` cannot mask a failure.
    size=$(du -h "${output_file}" | cut -f1) || size="?"
    log_success "  Volume ${volume_name} 備份完成 (${size})"
    return 0
}

#######################################
# Run the full SignOz backup: pause the collector, archive the ClickHouse
# and SQLite volumes, restart the collector, store the archives in the
# restic repository, prune old snapshots and send a notification.
# Globals:   SERVICE, LOCAL_REPO, DUMP_DIR, RESTIC_PASSWORD_FILE (read)
# Arguments: none used
# Returns:   0 on success; exits 1 (after a "failed" notification) when the
#            ClickHouse archive, the restic init or the restic backup fails.
#######################################
main() {
    local start_time end_time duration timestamp tags snapshot_id
    local summary="SignOz 備份完成 (ClickHouse+SQLite)"

    start_time=$(date +%s)
    log_info "========== 開始 SignOz 備份 =========="

    # Run cleanup on every exit path (including aborts caused by `set -e`),
    # so the collector is never left stopped and the dump directory is never
    # left behind in /tmp.
    trap cleanup EXIT

    mkdir -p "${DUMP_DIR}"
    timestamp=$(date "+%Y%m%d_%H%M%S")

    # Step 1: stop the OTEL collector to keep the data consistent.
    log_info "暫停 signoz-otel-collector 以確保數據一致性..."
    docker stop signoz-otel-collector 2>/dev/null || log_warn "signoz-otel-collector 未在運行,繼續"
    docker stop signoz-telemetrystore-migrator 2>/dev/null || true

    # Step 2: archive the ClickHouse volume (tmp directory excluded to reduce size).
    # NOTE(review): tar stores members without the leading slash; confirm that
    # the alpine tar actually honours a pattern starting with "/data/".
    backup_volume "signoz-clickhouse" "${DUMP_DIR}/clickhouse_${timestamp}.tar.gz" "--exclude=/data/tmp" || {
        log_error "ClickHouse volume 備份失敗"
        # Restart the collector right away instead of waiting for the
        # notification round-trip; the EXIT trap repeating this is harmless.
        cleanup
        notify_clawbot "failed" "${SERVICE}" "SignOz ClickHouse 備份失敗"
        exit 1
    }

    # Step 3: archive the SQLite volume (SignOz metadata). Non-fatal, but the
    # final notification must not claim it succeeded.
    backup_volume "signoz-sqlite" "${DUMP_DIR}/sqlite_${timestamp}.tar.gz" || {
        log_warn "SQLite volume 備份失敗,繼續..."
        summary="SignOz 備份完成 (僅 ClickHouse,SQLite 備份失敗)"
    }

    # Step 4: restart the collector.
    log_info "重啟 signoz-otel-collector..."
    docker start signoz-otel-collector 2>/dev/null || log_warn "signoz-otel-collector 重啟失敗"

    # Step 5: initialise the restic repository on first use.
    if [[ ! -d "${LOCAL_REPO}/data" ]]; then
        log_info "初始化 Restic 倉庫: ${LOCAL_REPO}"
        restic -r "${LOCAL_REPO}" init --password-file "${RESTIC_PASSWORD_FILE}" 2>&1 || {
            log_error "Restic 倉庫初始化失敗"
            notify_clawbot "failed" "${SERVICE}" "SignOz Restic 倉庫初始化失敗"
            exit 1
        }
    fi

    # Step 6: restic backup.
    log_info "建立 Restic 備份..."
    # As before, a failing build_tags does not abort the run.
    tags=$(build_tags "${SERVICE}") || true
    # ${tags} is intentionally unquoted: it is expanded into separate
    # restic arguments by word splitting.
    # shellcheck disable=SC2086
    restic -r "${LOCAL_REPO}" backup "${DUMP_DIR}" --password-file "${RESTIC_PASSWORD_FILE}" ${tags} 2>&1 || {
        # Without this branch `set -e` would end the script silently and no
        # alert would be raised for the most important step.
        log_error "Restic 備份失敗"
        notify_clawbot "failed" "${SERVICE}" "SignOz Restic 備份失敗"
        exit 1
    }

    # The snapshot id is informational only; a lookup failure must not abort.
    snapshot_id=$(restic -r "${LOCAL_REPO}" snapshots --latest 1 --json --password-file "${RESTIC_PASSWORD_FILE}" 2>/dev/null \
        | grep -oP '"short_id":"\K[^"]+' | head -1) || true
    log_success "Restic 備份完成: ${snapshot_id}"

    # Step 7: GFS retention cleanup.
    cleanup_old_backups "${LOCAL_REPO}"

    # The dump directory is removed by the EXIT trap (cleanup).

    end_time=$(date +%s)
    duration=$((end_time - start_time))
    log_success "========== SignOz 備份完成 (${duration}s) =========="
    notify_clawbot "success" "${SERVICE}" "${summary}" "${duration}"
}

# Script entry point: forward all command-line arguments to main.
main "$@"