#!/bin/bash
# =============================================================================
# WOOO AIOps - SignOz backup script (ClickHouse + SQLite)
# Version: 1.1.0
# Created: 2026-04-05
# 2026-04-05 Claude Code: added SignOz distributed-tracing backup — chief architect backup audit
# 2026-04-05 Claude Code: v1.1 fixed tar pipeline exit code handling + || true
#
# Flow: stop the OTEL collector -> tar the ClickHouse and SQLite docker
# volumes into a temp dir -> restart the collector -> store the temp dir in a
# restic repository -> prune old snapshots -> notify.
#
# Expected from common.sh (not visible in this file — verify there):
#   variables: BACKUP_BASE, RESTIC_PASSWORD_FILE
#   functions: log_info, log_warn, log_error, log_success, build_tags,
#              cleanup_old_backups, notify_clawbot
# =============================================================================

set -euo pipefail

source "$(dirname "$0")/common.sh"

SERVICE="signoz"
LOCAL_REPO="${BACKUP_BASE}/signoz"
# mktemp gives an unpredictable, private (0700) directory instead of a
# guessable /tmp/<name>-<pid> path.
DUMP_DIR="$(mktemp -d /tmp/signoz-backup-XXXXXX)"
# Set to 1 once this script has attempted to stop the collector, so that the
# exit trap only restarts it when we were the ones who stopped it.
COLLECTOR_STOPPED=0

#######################################
# Restart the collector (if this script stopped it) and remove the temp dir.
# Installed as the EXIT trap, so it runs on every exit path, including
# failures triggered by `set -e`. Safe to call more than once.
# Globals:   COLLECTOR_STOPPED (read/written), DUMP_DIR (read)
# Arguments: none
#######################################
cleanup() {
  if [[ "${COLLECTOR_STOPPED}" -eq 1 ]]; then
    # Best effort: a failed restart must not mask the original exit status.
    docker start signoz-otel-collector 2>/dev/null || true
    COLLECTOR_STOPPED=0
  fi
  rm -rf -- "${DUMP_DIR:?}"
}
trap cleanup EXIT

#######################################
# Archive a docker named volume into a gzip-compressed tarball.
# Arguments:
#   $1 - docker volume name
#   $2 - path of the .tar.gz file to write on the host
#   $3 - optional single extra tar argument (e.g. --exclude=/data/tmp)
# Returns:
#   0 if a non-empty archive was written, 1 otherwise.
#######################################
backup_volume() {
  local volume_name="$1"
  local output_file="$2"
  local extra_exclude="${3:-}"
  local -a tar_args=(czf -)
  local rc=0
  local size

  log_info "備份 volume: ${volume_name}"

  # `docker run -v name:/data` silently creates a missing named volume, which
  # would yield a non-empty archive of an empty directory. Refuse up front.
  if ! docker volume inspect "${volume_name}" >/dev/null 2>&1; then
    log_error "  Volume ${volume_name} 備份失敗 (volume 不存在)"
    return 1
  fi

  if [[ -n "${extra_exclude}" ]]; then
    tar_args+=("${extra_exclude}")
  fi
  tar_args+=(/data)

  # tar may exit non-zero when files change while being read from a live
  # volume; that is tolerated (best effort), but the status is recorded
  # instead of being discarded. The volume is mounted read-only because the
  # helper container only needs to read it.
  docker run --rm -v "${volume_name}:/data:ro" alpine tar "${tar_args[@]}" \
    2>/dev/null >"${output_file}" || rc=$?
  if (( rc != 0 )); then
    log_warn "  tar 結束代碼 ${rc} (volume: ${volume_name}),以輸出檔案大小判定結果"
  fi

  # Success criterion (unchanged from the original): the archive is non-empty.
  if [[ -s "${output_file}" ]]; then
    size=$(du -h "${output_file}" | cut -f1)
    log_success "  Volume ${volume_name} 備份完成 (${size})"
    return 0
  fi

  log_error "  Volume ${volume_name} 備份失敗 (空檔案)"
  return 1
}

#######################################
# Run the full SignOz backup.
# Globals:   SERVICE, LOCAL_REPO, DUMP_DIR, RESTIC_PASSWORD_FILE (read),
#            COLLECTOR_STOPPED (written)
# Arguments: none used
# Exits:     1 if the ClickHouse archive, restic init or restic backup fails
#            (a "failed" notification is sent first). A failed SQLite
#            archive only logs a warning.
#######################################
main() {
  local start_time end_time duration timestamp tags snapshot_id

  start_time=$(date +%s)
  log_info "========== 開始 SignOz 備份 =========="

  mkdir -p "${DUMP_DIR}"
  timestamp=$(date "+%Y%m%d_%H%M%S")

  # Step 1: stop the OTEL collector for data consistency.
  log_info "暫停 signoz-otel-collector 以確保數據一致性..."
  COLLECTOR_STOPPED=1
  docker stop signoz-otel-collector 2>/dev/null \
    || log_warn "signoz-otel-collector 未在運行,繼續"
  # The migrator container may not exist or may already have exited.
  docker stop signoz-telemetrystore-migrator 2>/dev/null || true

  # Step 2: back up the ClickHouse volume (tmp directory excluded to reduce size).
  backup_volume "signoz-clickhouse" "${DUMP_DIR}/clickhouse_${timestamp}.tar.gz" "--exclude=/data/tmp" || {
    log_error "ClickHouse volume 備份失敗"
    cleanup
    notify_clawbot "failed" "${SERVICE}" "SignOz ClickHouse 備份失敗"
    exit 1
  }

  # Step 3: back up the SQLite volume (SignOz metadata); failure is non-fatal.
  backup_volume "signoz-sqlite" "${DUMP_DIR}/sqlite_${timestamp}.tar.gz" || {
    log_warn "SQLite volume 備份失敗,繼續..."
  }

  # Step 4: restart the collector. If this fails the flag stays set and the
  # exit trap tries once more.
  log_info "重啟 signoz-otel-collector..."
  if docker start signoz-otel-collector 2>/dev/null; then
    COLLECTOR_STOPPED=0
  else
    log_warn "signoz-otel-collector 重啟失敗"
  fi

  # Step 5: initialise the restic repository on first use.
  if [[ ! -d "${LOCAL_REPO}/data" ]]; then
    log_info "初始化 Restic 倉庫: ${LOCAL_REPO}"
    restic -r "${LOCAL_REPO}" init --password-file "${RESTIC_PASSWORD_FILE}" 2>&1 || {
      log_error "Restic 倉庫初始化失敗"
      notify_clawbot "failed" "${SERVICE}" "SignOz Restic 倉庫初始化失敗"
      exit 1
    }
  fi

  # Step 6: restic backup.
  log_info "建立 Restic 備份..."
  # build_tags is defined in common.sh; its exit status was previously masked
  # by `local`, so it is still tolerated here.
  tags=$(build_tags "${SERVICE}") || true
  # ${tags} is intentionally unquoted: it is expected to expand to several
  # words (presumably `--tag x` pairs — confirm in common.sh).
  # shellcheck disable=SC2086
  restic -r "${LOCAL_REPO}" backup "${DUMP_DIR}" --password-file "${RESTIC_PASSWORD_FILE}" ${tags} 2>&1 || {
    log_error "Restic 備份失敗"
    notify_clawbot "failed" "${SERVICE}" "SignOz Restic 備份失敗"
    exit 1
  }

  # The snapshot id is informational only; a lookup failure (or no grep
  # match, which is non-zero under pipefail) must not fail a finished backup.
  snapshot_id=$(restic -r "${LOCAL_REPO}" snapshots --latest 1 --json \
    --password-file "${RESTIC_PASSWORD_FILE}" 2>/dev/null \
    | grep -oP '"short_id":"\K[^"]+' | head -1) || true
  log_success "Restic 備份完成: ${snapshot_id}"

  # Step 7: GFS retention cleanup.
  cleanup_old_backups "${LOCAL_REPO}"

  rm -rf -- "${DUMP_DIR:?}"

  end_time=$(date +%s)
  duration=$((end_time - start_time))
  log_success "========== SignOz 備份完成 (${duration}s) =========="
  notify_clawbot "success" "${SERVICE}" "SignOz 備份完成 (ClickHouse+SQLite)" "${duration}"
}

main "$@"