#!/bin/bash
# =============================================================================
# WOOO AIOps - host and service configuration backup
# 2026-05-06 ogt + Codex: added configuration-state backup after the reboot
# incident.
#
# Purpose:
#   DB/volume backups can only restore data. What actually decides whether a
#   service can start is its configuration: nginx, systemd drop-ins, Docker
#   Compose files, cron, K8s Secrets/ConfigMaps, Prometheus and Alertmanager
#   settings. This script collects configuration state only; it does not
#   collect large data directories.
#
# Security:
#   - Secrets/ConfigMaps only go into the encrypted restic snapshot and are
#     never printed to the log.
#   - The restic password file is not backed up into the same restic repo.
#   - The staging directory is created with mktemp (mode 0700, unpredictable
#     name) and is removed on every exit path via an EXIT trap.
#
# Dependencies (expected from common.sh, which is not visible in this file):
#   BACKUP_BASE, RESTIC_PASSWORD_FILE, log_info, log_success, log_warn,
#   log_error, build_tags, cleanup_old_backups, notify_clawbot.
# =============================================================================

set -euo pipefail

source "$(dirname "$0")/common.sh"

SERVICE="configs"
LOCAL_REPO="${BACKUP_BASE}/configs"
STATUS_DIR="${BACKUP_BASE}/status"
CONFIG_STATUS_FILE="${STATUS_DIR}/backup-configs-last-status.json"

# Both are assigned in main() once the staging directory has been created.
DUMP_DIR=""
STATUS_ITEMS_FILE=""

SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=8 -o StrictHostKeyChecking=accept-new)

# Space-separated list of hosts tried in order for kubectl exports.
K8S_BACKUP_HOSTS="${K8S_BACKUP_HOSTS:-192.168.0.120 192.168.0.121 192.168.0.125}"

# 2026-05-19 ogt + Codex: retention policy is delegated to common.sh.
# The default is latest-only (keep-last=1) so configuration backups do not
# accumulate over time.

# Directories/files that must never end up in a configuration archive.
tar_excludes=(
  --exclude="*/node_modules"
  --exclude="*/.next"
  --exclude="*/.venv"
  --exclude="*/venv"
  --exclude="*/__pycache__"
  --exclude="*/logs"
  --exclude="*/log"
  --exclude="*/backup"
  --exclude="*/backups"
  --exclude="*/data"
  --exclude="*/tmp"
  --exclude=".restic-password"
)

#######################################
# Remove the staging directory. Safe to call more than once.
# Globals: DUMP_DIR (read)
#######################################
cleanup_dump_dir() {
  # Written as an if-statement so a false test never yields a non-zero status
  # inside the EXIT trap.
  if [[ -n "${DUMP_DIR}" && -d "${DUMP_DIR}" ]]; then
    rm -rf -- "${DUMP_DIR}"
  fi
}

#######################################
# Check that a gzip'd tar archive is non-empty and lists at least one member.
# A gzip'd tar with no members still has a non-zero size, so a size check
# alone cannot detect "nothing was archived".
# Arguments: $1 - path to the .tar.gz file
# Returns:   0 if at least one member is listed, non-zero otherwise
#######################################
archive_has_entries() {
  local archive="$1"
  [ -s "${archive}" ] || return 1
  # Only the captured text is tested; the pipeline status (tar may be killed
  # by SIGPIPE once head exits) is deliberately ignored.
  [ -n "$(tar -tzf "${archive}" 2>/dev/null | head -n 1)" ]
}

#######################################
# Run a local command and store stdout+stderr in ${DUMP_DIR}/<label>.txt.
# Arguments: $1 - label (output file stem); remaining args - command to run
# Returns:   0 on success, 1 if the command failed
#######################################
write_cmd_output() {
  local label="$1"
  shift
  if "$@" > "${DUMP_DIR}/${label}.txt" 2>&1; then
    log_success "設定盤點完成: ${label}"
  else
    log_warn "設定盤點失敗或無權限: ${label}"
    return 1
  fi
}

#######################################
# Archive local paths into ${DUMP_DIR}/<label>.tar.gz.
# tar's own exit status only produces a warning (paths missing on this host
# are expected); the return value reflects whether anything was archived.
# Arguments: $1 - label; remaining args - paths to archive
# Returns:   0 if the archive contains at least one member
#######################################
tar_local() {
  local label="$1"
  shift
  local paths=("$@")
  local -a tar_cmd
  # local_tar_command prints either "tar" or "sudo -n tar"; split it into
  # words explicitly instead of relying on unquoted expansion.
  IFS=' ' read -r -a tar_cmd <<< "$(local_tar_command)"
  if "${tar_cmd[@]}" czf "${DUMP_DIR}/${label}.tar.gz" \
    --ignore-failed-read \
    --warning=no-file-changed \
    "${tar_excludes[@]}" \
    "${paths[@]}" 2>"${DUMP_DIR}/${label}.tar.stderr"; then
    log_success "本機設定封存完成: ${label}"
  else
    log_warn "本機設定封存部分失敗: ${label}"
  fi
  archive_has_entries "${DUMP_DIR}/${label}.tar.gz"
}

#######################################
# Print the tar invocation to use locally: "sudo -n tar" when passwordless
# sudo is available, plain "tar" otherwise.
#######################################
local_tar_command() {
  if sudo -n true >/dev/null 2>&1; then
    printf 'sudo -n tar'
  else
    printf 'tar'
  fi
}

#######################################
# Archive paths on a remote host over ssh into ${DUMP_DIR}/<label>.tar.gz.
# Arguments: $1 - ssh target (user@host); $2 - label; remaining args - paths
# Returns:   0 if the archive contains at least one member
#######################################
tar_remote() {
  local host="$1"
  local label="$2"
  shift 2
  local paths=("$@")
  local remote_script
  local exclude path
  # Single-quoted on purpose: $tar_cmd must be expanded by the remote shell.
  remote_script='if sudo -n true >/dev/null 2>&1; then tar_cmd="sudo -n tar"; else tar_cmd="tar"; fi; $tar_cmd czf - --ignore-failed-read --warning=no-file-changed'
  # %q keeps glob characters in excludes/paths literal for the remote shell.
  for exclude in "${tar_excludes[@]}"; do
    remote_script+=" $(printf '%q' "$exclude")"
  done
  for path in "${paths[@]}"; do
    remote_script+=" $(printf '%q' "$path")"
  done
  if ssh "${SSH_OPTS[@]}" "$host" "$remote_script" > "${DUMP_DIR}/${label}.tar.gz" 2>"${DUMP_DIR}/${label}.tar.stderr"; then
    log_success "遠端設定封存完成: ${label}"
  else
    log_warn "遠端設定封存部分失敗: ${label}"
  fi
  archive_has_entries "${DUMP_DIR}/${label}.tar.gz"
}

#######################################
# Run a command on a remote host and store stdout+stderr in
# ${DUMP_DIR}/<label>.txt.
# Arguments: $1 - ssh target; $2 - label; $3 - remote command string
# Returns:   0 on success, 1 if ssh or the remote command failed
#######################################
capture_remote_cmd() {
  local host="$1"
  local label="$2"
  local cmd="$3"
  if ssh "${SSH_OPTS[@]}" "$host" "$cmd" > "${DUMP_DIR}/${label}.txt" 2>&1; then
    log_success "遠端設定盤點完成: ${label}"
  else
    log_warn "遠端設定盤點失敗或無權限: ${label}"
    return 1
  fi
}

#######################################
# Export K8s resources (all namespaces) as YAML to ${DUMP_DIR}/<label>.yaml.
# Each host in K8S_BACKUP_HOSTS is tried in order as user "wooo"; the first
# success wins and is recorded in ${DUMP_DIR}/<label>.source.
# Arguments: $1 - label; $2 - comma-separated resource list for kubectl get
# Returns:   0 on the first successful export, 1 if every host failed
#######################################
capture_k8s_yaml() {
  local label="$1"
  local resource="$2"
  local cmd k8s_host
  # Try with sudo first and fall back to the invoking user's kubectl.
  cmd="sudo -n kubectl get ${resource} -A -o yaml 2>/dev/null || kubectl get ${resource} -A -o yaml"
  # Intentionally unquoted: K8S_BACKUP_HOSTS is a space-separated list.
  for k8s_host in ${K8S_BACKUP_HOSTS}; do
    if ssh "${SSH_OPTS[@]}" "wooo@${k8s_host}" "$cmd" > "${DUMP_DIR}/${label}.yaml" 2>"${DUMP_DIR}/${label}.stderr"; then
      printf 'source_host=%s\n' "${k8s_host}" > "${DUMP_DIR}/${label}.source"
      log_success "K8s 設定備份完成: ${label} (source=${k8s_host})"
      return 0
    fi
  done
  log_warn "K8s 設定備份失敗: ${label}"
  return 1
}

#######################################
# Append one JSON line describing a capture result to STATUS_ITEMS_FILE.
# Arguments: $1 - target name; $2 - critical (true|false); $3 - ok (true|false)
#            $4 - optional source identifier (defaults to empty)
# NOTE(review): values are interpolated without JSON escaping; current callers
# only pass plain labels and host strings.
#######################################
record_config_status() {
  local target="$1"
  local critical="$2"
  local ok="$3"
  local source="${4:-}"
  printf '{"target":"%s","critical":%s,"ok":%s,"source":"%s"}\n' \
    "${target}" "${critical}" "${ok}" "${source}" >> "${STATUS_ITEMS_FILE}"
}

#######################################
# Aggregate the per-item JSONL file into CONFIG_STATUS_FILE.
# Arguments: $1 - failed item count; $2 - duration in seconds; $3 - snapshot id
#######################################
write_config_status_file() {
  local failed_count="$1"
  local duration="$2"
  local snapshot_id="$3"
  install -d -m 700 "${STATUS_DIR}"
  python3 - "${STATUS_ITEMS_FILE}" "${CONFIG_STATUS_FILE}" "${failed_count}" "${duration}" "${snapshot_id}" <<'PY'
import json
import os
import sys
import time
from pathlib import Path

items_path = Path(sys.argv[1])
status_path = Path(sys.argv[2])
failed_count = int(sys.argv[3])
duration = int(sys.argv[4])
snapshot_id = sys.argv[5]

# One JSON object per non-blank line, as written by record_config_status.
items = []
if items_path.exists():
    for line in items_path.read_text(encoding="utf-8", errors="replace").splitlines():
        if not line.strip():
            continue
        items.append(json.loads(line))

critical_failed_count = sum(1 for item in items if item.get("critical") and not item.get("ok"))
document = {
    "timestamp": int(time.time()),
    "failed_count": failed_count,
    "critical_failed_count": critical_failed_count,
    "duration_seconds": duration,
    "snapshot_id": snapshot_id,
    "items": items,
}

# Write to a sibling temp file, set its mode, then rename over the target so
# the status file is never visible half-written or with default permissions.
tmp_path = status_path.with_suffix(status_path.suffix + ".tmp")
tmp_path.write_text(json.dumps(document, ensure_ascii=False, sort_keys=True) + "\n", encoding="utf-8")
os.chmod(tmp_path, 0o640)
os.replace(tmp_path, status_path)
PY
}

#######################################
# Collect configuration state from this host, 192.168.0.188, the k3s nodes
# and the cluster API, store it in the restic repo, and write the status file.
# Returns: number of failed capture items (0 means everything succeeded)
#######################################
main() {
  local start_time
  local failed=0
  local timestamp
  start_time=$(date +%s)
  timestamp=$(date "+%Y%m%d_%H%M%S")

  log_info "========== 開始主機與服務設定檔備份 (${timestamp}) =========="

  # The staging directory will contain plaintext K8s Secrets: use an
  # unpredictable, 0700 directory and guarantee removal even when set -e
  # aborts the script part-way (e.g. restic failure).
  DUMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/configs-backup.XXXXXX")
  STATUS_ITEMS_FILE="${DUMP_DIR}/config-capture-status.jsonl"
  # NOTE(review): this replaces any EXIT trap installed by common.sh —
  # confirm none exists.
  trap cleanup_dump_dir EXIT
  : > "${STATUS_ITEMS_FILE}"

  # ---- local host (source id "110") -----------------------------------------
  write_cmd_output "110-crontab-current-user" crontab -l || failed=$((failed + 1))
  write_cmd_output "110-systemd-unit-files" systemctl list-unit-files || failed=$((failed + 1))
  # Docker inventory is best-effort and never counted as a failure.
  write_cmd_output "110-docker-containers" docker ps --format 'table {{.Names}}\t{{.Image}}\t{{.Status}}\t{{.Ports}}' || true

  if tar_local "110-host-configs" \
    /etc/nginx \
    /etc/systemd/system \
    /etc/cron.d \
    /etc/crontab \
    /etc/letsencrypt \
    /etc/ssh \
    /etc/fstab \
    /etc/hosts \
    /etc/netplan \
    /etc/docker \
    /etc/containerd \
    /etc/keepalived \
    /opt/harbor/harbor.yml \
    /opt/harbor/docker-compose.yml \
    /opt/sentry/.env \
    /opt/sentry/docker-compose.yml \
    /opt/sentry/docker-compose.override.yml \
    /opt/sentry/sentry \
    /home/wooo/monitoring \
    /home/wooo/scripts \
    /home/wooo/awoooi \
    /home/wooo/awoooi-ops \
    /backup/scripts; then
    record_config_status "110-host-configs" true true "110"
  else
    record_config_status "110-host-configs" true false "110"
    failed=$((failed + 1))
  fi

  # ---- 192.168.0.188 --------------------------------------------------------
  capture_remote_cmd "ollama@192.168.0.188" "188-crontab-ollama" "crontab -l" || failed=$((failed + 1))
  capture_remote_cmd "ollama@192.168.0.188" "188-systemd-unit-files" "systemctl list-unit-files" || failed=$((failed + 1))
  capture_remote_cmd "ollama@192.168.0.188" "188-docker-containers" "docker ps --format 'table {{.Names}}\t{{.Image}}\t{{.Status}}\t{{.Ports}}'" || true

  if tar_remote "ollama@192.168.0.188" "188-host-configs" \
    /etc/nginx \
    /etc/systemd/system \
    /etc/cron.d \
    /etc/crontab \
    /etc/letsencrypt \
    /etc/ssh \
    /etc/fstab \
    /etc/hosts \
    /etc/netplan \
    /etc/docker \
    /etc/containerd \
    /etc/keepalived \
    /opt/n8n \
    /opt/open-webui \
    /opt/litellm \
    /opt/signoz \
    /opt/minio \
    /opt/registry \
    /home/ollama/bin \
    /home/ollama/scripts \
    /home/ollama/momo-pro \
    /home/ollama/awoooi-ops \
    /home/ollama/node_exporter_textfiles; then
    record_config_status "188-host-configs" true true "188"
  else
    record_config_status "188-host-configs" true false "188"
    failed=$((failed + 1))
  fi

  # ---- 192.168.0.120 (archives the k3s server manifests) --------------------
  capture_remote_cmd "wooo@192.168.0.120" "120-crontab-wooo" "crontab -l" || true
  if tar_remote "wooo@192.168.0.120" "120-k3s-host-configs" \
    /etc/rancher/k3s \
    /var/lib/rancher/k3s/server/manifests \
    /etc/systemd/system \
    /etc/cron.d \
    /etc/crontab \
    /etc/ssh \
    /etc/fstab \
    /etc/hosts \
    /etc/netplan \
    /etc/containerd \
    /etc/keepalived; then
    record_config_status "120-k3s-host-configs" true true "120"
  else
    record_config_status "120-k3s-host-configs" true false "120"
    failed=$((failed + 1))
  fi

  # ---- 192.168.0.121 (archives the k3s agent etc directory) -----------------
  capture_remote_cmd "wooo@192.168.0.121" "121-crontab-wooo" "crontab -l" || true
  if tar_remote "wooo@192.168.0.121" "121-k3s-host-configs" \
    /etc/rancher/k3s \
    /var/lib/rancher/k3s/agent/etc \
    /etc/systemd/system \
    /etc/cron.d \
    /etc/crontab \
    /etc/ssh \
    /etc/fstab \
    /etc/hosts \
    /etc/netplan \
    /etc/containerd \
    /etc/keepalived; then
    record_config_status "121-k3s-host-configs" true true "121"
  else
    record_config_status "121-k3s-host-configs" true false "121"
    failed=$((failed + 1))
  fi

  # ---- cluster-level exports ------------------------------------------------
  if capture_k8s_yaml "cluster-k8s-workloads" "deployments,statefulsets,daemonsets,services,ingress,configmaps,cronjobs,jobs,persistentvolumeclaims,persistentvolumes,storageclasses,networkpolicies,serviceaccounts,roles,rolebindings,clusterroles,clusterrolebindings,customresourcedefinitions"; then
    record_config_status "cluster-k8s-workloads" true true "$(sed -n 's/^source_host=//p' "${DUMP_DIR}/cluster-k8s-workloads.source" | head -n 1)"
  else
    record_config_status "cluster-k8s-workloads" true false ""
    failed=$((failed + 1))
  fi

  if capture_k8s_yaml "cluster-k8s-secrets" "secrets"; then
    record_config_status "cluster-k8s-secrets" true true "$(sed -n 's/^source_host=//p' "${DUMP_DIR}/cluster-k8s-secrets.source" | head -n 1)"
  else
    record_config_status "cluster-k8s-secrets" true false ""
    failed=$((failed + 1))
  fi

  # Velero objects are recorded as non-critical and do not bump the counter.
  if capture_k8s_yaml "cluster-velero-backups" "backups.velero.io,schedules.velero.io"; then
    record_config_status "cluster-velero-backups" false true "$(sed -n 's/^source_host=//p' "${DUMP_DIR}/cluster-velero-backups.source" | head -n 1)"
  else
    record_config_status "cluster-velero-backups" false false ""
  fi

  # ---- restic snapshot ------------------------------------------------------
  if [ ! -d "${LOCAL_REPO}/data" ]; then
    log_info "初始化 Restic 倉庫 ${LOCAL_REPO}..."
    restic -r "${LOCAL_REPO}" init --password-file "${RESTIC_PASSWORD_FILE}" 2>&1
  fi

  local tags
  tags=$(build_tags "${SERVICE}")

  # ${tags} is intentionally unquoted — presumably build_tags prints several
  # "--tag ..." words; confirm against common.sh.
  restic -r "${LOCAL_REPO}" backup "${DUMP_DIR}" \
    --password-file "${RESTIC_PASSWORD_FILE}" \
    ${tags} \
    --tag "scope:host-configs" \
    --tag "contains:k8s-secrets" 2>&1

  local snapshot_id
  snapshot_id=$(restic -r "${LOCAL_REPO}" snapshots --latest 1 --json \
    --password-file "${RESTIC_PASSWORD_FILE}" 2>/dev/null | \
    python3 -c 'import json,sys; rows=json.load(sys.stdin); print(rows[-1].get("short_id","unknown") if rows else "unknown")' 2>/dev/null || echo "unknown")

  log_success "設定檔 Restic 備份完成: ${snapshot_id}"

  cleanup_old_backups "${LOCAL_REPO}"

  local duration
  duration=$(($(date +%s) - start_time))
  write_config_status_file "${failed}" "${duration}" "${snapshot_id}"
  # Remove staged secrets as soon as they are no longer needed; the EXIT trap
  # remains as a safety net for earlier aborts.
  cleanup_dump_dir

  if [ "${failed}" -eq 0 ]; then
    log_success "========== 設定檔備份完成 (${duration}s) =========="
    notify_clawbot "success" "${SERVICE}" "主機與服務設定檔備份完成" "${duration}"
  else
    log_error "========== 設定檔備份完成但有 ${failed} 個項目失敗 (${duration}s) =========="
    notify_clawbot "warning" "${SERVICE}" "設定檔備份有 ${failed} 個項目失敗" "${duration}"
  fi

  return "${failed}"
}

main "$@"