Files
awoooi/scripts/backup/backup-configs.sh
Your Name cfb866d055
Some checks failed
Ansible Lint / lint (push) Successful in 35s
CD Pipeline / tests (push) Failing after 13s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Failing after 11s
feat(governance): add agent market automation surfaces
2026-06-04 21:50:55 +08:00

360 lines
12 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
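# =============================================================================
# WOOO AIOps - host and service configuration backup
# 2026-05-06 ogt + Codex: added configuration-state backup after the reboot incident.
#
# Purpose:
# DB/volume backups can only restore data; what actually decides whether the
# services can start is the configuration of nginx, systemd drop-ins, Docker
# Compose, cron, K8s Secrets/ConfigMaps, Prometheus and Alertmanager. This
# script collects configuration state only, not large data directories.
#
# Security:
# - Secrets/ConfigMaps only go into the encrypted restic snapshot; they are never printed to the log.
# - The restic password file is not backed up into the same restic repo.
# - The temporary directory has mode 0700 and is removed when the run finishes.
# =============================================================================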
set -euo pipefail
source "$(dirname "$0")/common.sh"

# Service name passed to build_tags / notify_clawbot.
SERVICE="configs"
# Local restic repository that receives the configuration snapshot.
# NOTE(review): BACKUP_BASE is presumably exported by common.sh — confirm.
LOCAL_REPO="${BACKUP_BASE}/configs"
# Staging directory for everything captured in this run. It ends up holding
# K8s Secret dumps, so it is created atomically with an unpredictable name
# (mktemp -d creates it with mode 0700) instead of the guessable "/tmp/...-$$",
# which another local user could pre-create. The "configs-backup-" prefix is
# kept so existing cleanup globs still match.
DUMP_DIR="$(mktemp -d /tmp/configs-backup-XXXXXX)"
STATUS_DIR="${BACKUP_BASE}/status"
CONFIG_STATUS_FILE="${STATUS_DIR}/backup-configs-last-status.json"
# One JSON object per line, appended by record_config_status.
STATUS_ITEMS_FILE="${DUMP_DIR}/config-capture-status.jsonl"
SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=8 -o StrictHostKeyChecking=accept-new)
# Space-separated list of hosts tried in order by capture_k8s_yaml.
K8S_BACKUP_HOSTS="${K8S_BACKUP_HOSTS:-192.168.0.120 192.168.0.121 192.168.0.125}"

# 2026-05-19 ogt + Codex: the retention policy is delegated to common.sh.
# The default is latest-only (keep-last=1) so configuration backups do not
# pile up over time.

# Exclusions shared by the local and remote tar invocations: dependency trees,
# logs, data/backup directories and the restic password file.
tar_excludes=(
  --exclude="*/node_modules"
  --exclude="*/.next"
  --exclude="*/.venv"
  --exclude="*/venv"
  --exclude="*/__pycache__"
  --exclude="*/logs"
  --exclude="*/log"
  --exclude="*/backup"
  --exclude="*/backups"
  --exclude="*/data"
  --exclude="*/tmp"
  --exclude=".restic-password"
)
#######################################
# Run a local command and store its combined stdout/stderr as an inventory file.
# Globals:   DUMP_DIR (read)
# Arguments: $1 - label, used as "<label>.txt" inside DUMP_DIR
#            $2.. - command and arguments to execute
# Returns:   1 when the command fails; otherwise the status of log_success
#######################################
write_cmd_output() {
  local label="$1"
  shift
  local report="${DUMP_DIR}/${label}.txt"

  # Failure (or missing permission) is reported to the caller, who decides
  # whether it counts toward the failure total.
  if ! "$@" > "${report}" 2>&1; then
    log_warn "設定盤點失敗或無權限: ${label}"
    return 1
  fi

  log_success "設定盤點完成: ${label}"
}
#######################################
# Archive local configuration paths into "<label>.tar.gz" inside DUMP_DIR.
# tar's stderr is kept next to the archive as "<label>.tar.stderr".
# Globals:   DUMP_DIR (read), tar_excludes (read)
# Arguments: $1 - label for the archive
#            $2.. - paths to archive
# Returns:   0 when the resulting archive is non-empty, non-zero otherwise.
#            A non-zero tar exit only produces a warning, so a partially
#            successful archive still counts as success.
#######################################
tar_local() {
  local label="$1"
  shift
  local sources=("$@")
  local archive="${DUMP_DIR}/${label}.tar.gz"
  local errlog="${DUMP_DIR}/${label}.tar.stderr"
  local tar_cmd
  tar_cmd=$(local_tar_command)

  # tar_cmd is expanded unquoted on purpose: it may be the multi-word
  # string "sudo -n tar" and must be split into separate words.
  if ! ${tar_cmd} czf "${archive}" \
    --ignore-failed-read \
    --warning=no-file-changed \
    "${tar_excludes[@]}" \
    "${sources[@]}" 2>"${errlog}"; then
    log_warn "本機設定封存部分失敗: ${label}"
  else
    log_success "本機設定封存完成: ${label}"
  fi

  # The archive's existence and size, not tar's exit code, decide the result.
  [ -s "${archive}" ]
}
#######################################
# Print the tar invocation to use on this host.
# Outputs: "sudo -n tar" when passwordless sudo works, otherwise "tar"
#          (no trailing newline).
#######################################
local_tar_command() {
  local prefix=""

  # Probe non-interactive sudo quietly; only prepend it when it succeeds.
  if sudo -n true >/dev/null 2>&1; then
    prefix="sudo -n "
  fi

  printf '%s' "${prefix}tar"
}
#######################################
# Archive configuration paths on a remote host over SSH and stream the
# tarball into "<label>.tar.gz" inside DUMP_DIR. The remote side uses
# "sudo -n tar" when passwordless sudo is available, plain tar otherwise.
# Globals:   DUMP_DIR (read), SSH_OPTS (read), tar_excludes (read)
# Arguments: $1 - ssh destination (user@host)
#            $2 - label for the archive
#            $3.. - remote paths to archive
# Returns:   0 when the received archive is non-empty, non-zero otherwise.
#            A non-zero ssh/tar exit only produces a warning.
#######################################
tar_remote() {
  local host="$1"
  local label="$2"
  shift 2
  local paths=("$@")
  local archive="${DUMP_DIR}/${label}.tar.gz"
  local errlog="${DUMP_DIR}/${label}.tar.stderr"
  local remote_script
  # Loop variables are local so they cannot overwrite same-named globals
  # in the caller's shell.
  local exclude path

  remote_script='if sudo -n true >/dev/null 2>&1; then tar_cmd="sudo -n tar"; else tar_cmd="tar"; fi; $tar_cmd czf - --ignore-failed-read --warning=no-file-changed'
  # Every argument is shell-quoted with %q so the remote shell neither expands
  # the glob in the exclude patterns nor splits paths containing whitespace.
  for exclude in "${tar_excludes[@]}"; do
    remote_script+=" $(printf '%q' "$exclude")"
  done
  for path in "${paths[@]}"; do
    remote_script+=" $(printf '%q' "$path")"
  done

  if ssh "${SSH_OPTS[@]}" "$host" "$remote_script" > "${archive}" 2>"${errlog}"; then
    log_success "遠端設定封存完成: ${label}"
  else
    log_warn "遠端設定封存部分失敗: ${label}"
  fi

  # The archive's existence and size, not the ssh exit code, decide the result.
  [ -s "${archive}" ]
}
#######################################
# Run a command on a remote host over SSH and store its combined
# stdout/stderr as "<label>.txt" inside DUMP_DIR.
# Globals:   DUMP_DIR (read), SSH_OPTS (read)
# Arguments: $1 - ssh destination (user@host)
#            $2 - label for the output file
#            $3 - command line to execute remotely
# Returns:   1 when ssh/the remote command fails; otherwise the status of
#            log_success
#######################################
capture_remote_cmd() {
  local remote_target="$1"
  local label="$2"
  local remote_cmd="$3"
  local report="${DUMP_DIR}/${label}.txt"

  if ! ssh "${SSH_OPTS[@]}" "${remote_target}" "${remote_cmd}" > "${report}" 2>&1; then
    log_warn "遠端設定盤點失敗或無權限: ${label}"
    return 1
  fi

  log_success "遠端設定盤點完成: ${label}"
}
#######################################
# Dump Kubernetes resources of all namespaces as YAML, trying each host in
# K8S_BACKUP_HOSTS (as user wooo) until one succeeds.
# On success "<label>.yaml" holds the dump and "<label>.source" records the
# host that produced it; "<label>.stderr" holds stderr of the last attempt.
# Globals:   DUMP_DIR (read), SSH_OPTS (read), K8S_BACKUP_HOSTS (read)
# Arguments: $1 - label for the output files
#            $2 - resource list passed to "kubectl get"
# Returns:   0 on the first successful host, 1 when every host fails
#######################################
capture_k8s_yaml() {
  local label="$1"
  local resource="$2"
  local yaml_out="${DUMP_DIR}/${label}.yaml"
  local err_out="${DUMP_DIR}/${label}.stderr"
  local kubectl_cmd node

  # Try kubectl through passwordless sudo first, then as the login user.
  kubectl_cmd="sudo -n kubectl get ${resource} -A -o yaml 2>/dev/null || kubectl get ${resource} -A -o yaml"

  # K8S_BACKUP_HOSTS is a whitespace-separated list; word splitting is intended.
  for node in ${K8S_BACKUP_HOSTS}; do
    ssh "${SSH_OPTS[@]}" "wooo@${node}" "${kubectl_cmd}" > "${yaml_out}" 2>"${err_out}" || continue

    printf 'source_host=%s\n' "${node}" > "${DUMP_DIR}/${label}.source"
    log_success "K8s 設定備份完成: ${label} (source=${node})"
    return 0
  done

  log_warn "K8s 設定備份失敗: ${label}"
  return 1
}
#######################################
# Append one capture result as a JSON object line to STATUS_ITEMS_FILE.
# Globals:   STATUS_ITEMS_FILE (appended to)
# Arguments: $1 - target name (string)
#            $2 - critical flag, written verbatim as a JSON literal (true/false)
#            $3 - ok flag, written verbatim as a JSON literal (true/false)
#            $4 - optional source identifier (string, defaults to empty)
#######################################
record_config_status() {
  local target="$1"
  local critical="$2"
  local ok="$3"
  local source="${4:-}"

  # Escape backslashes first, then double quotes, so a stray character in a
  # name or host cannot produce an invalid JSON line (which would make the
  # status writer's json.loads fail). Control characters are not handled.
  target=${target//\\/\\\\}
  target=${target//\"/\\\"}
  source=${source//\\/\\\\}
  source=${source//\"/\\\"}

  printf '{"target":"%s","critical":%s,"ok":%s,"source":"%s"}\n' \
    "${target}" "${critical}" "${ok}" "${source}" >> "${STATUS_ITEMS_FILE}"
}
#######################################
# Aggregate the per-target status lines into a single JSON status document
# and publish it atomically at CONFIG_STATUS_FILE.
# Globals:   STATUS_DIR (created with mode 0700), STATUS_ITEMS_FILE (read),
#            CONFIG_STATUS_FILE (written)
# Arguments: $1 - number of failed items (integer)
#            $2 - run duration in seconds (integer)
#            $3 - restic snapshot id (string)
# Returns:   non-zero when the directory cannot be created or the embedded
#            Python step fails (e.g. an unparsable status line)
#######################################
write_config_status_file() {
  local failed_count="$1"
  local duration="$2"
  local snapshot_id="$3"

  install -d -m 700 "${STATUS_DIR}"

  # The heredoc delimiter is quoted, so the Python source is passed literally;
  # all inputs travel via argv. Heredoc content and the closing delimiter must
  # stay at column 0.
  python3 - "${STATUS_ITEMS_FILE}" "${CONFIG_STATUS_FILE}" "${failed_count}" "${duration}" "${snapshot_id}" <<'PY'
import json
import os
import sys
import time
from pathlib import Path

items_path = Path(sys.argv[1])
status_path = Path(sys.argv[2])
failed_count = int(sys.argv[3])
duration = int(sys.argv[4])
snapshot_id = sys.argv[5]

# Load one JSON object per non-blank line; a missing items file means no items.
items = []
if items_path.exists():
    for line in items_path.read_text(encoding="utf-8", errors="replace").splitlines():
        if not line.strip():
            continue
        items.append(json.loads(line))

# A critical failure is an item flagged critical whose capture did not succeed.
critical_failed_count = sum(1 for item in items if item.get("critical") and not item.get("ok"))

document = {
    "timestamp": int(time.time()),
    "failed_count": failed_count,
    "critical_failed_count": critical_failed_count,
    "duration_seconds": duration,
    "snapshot_id": snapshot_id,
    "items": items,
}

# Write to a sibling temp file and rename it into place so readers never see
# a half-written document. Permissions are tightened before the rename so the
# published file never exists with umask-default permissions.
tmp_path = status_path.with_suffix(status_path.suffix + ".tmp")
tmp_path.write_text(json.dumps(document, ensure_ascii=False, sort_keys=True) + "\n", encoding="utf-8")
os.chmod(tmp_path, 0o640)
os.replace(tmp_path, status_path)
os.chmod(status_path, 0o640)
PY
}
#######################################
# Collect configuration state from the local host, the 188 host, the two k3s
# nodes and the Kubernetes API, store it in a restic snapshot, write the
# status document and send a notification.
# Globals:   DUMP_DIR, STATUS_ITEMS_FILE, LOCAL_REPO, SERVICE,
#            RESTIC_PASSWORD_FILE (read); installs an EXIT trap
# Returns:   the number of failed capture items (0 on full success).
#            Under `set -e` a failing restic/init/status step aborts the
#            script; the EXIT trap still removes the staging directory.
#######################################
main() {
  local start_time
  local failed=0
  local timestamp
  start_time=$(date +%s)
  timestamp=$(date "+%Y%m%d_%H%M%S")
  log_info "========== 開始主機與服務設定檔備份 (${timestamp}) =========="

  # The staging directory ends up containing K8s Secret dumps. Remove it on
  # every exit path, not only after a successful run, so an aborted backup
  # never leaves secrets behind in /tmp.
  # NOTE(review): this replaces any EXIT trap common.sh may have installed —
  # confirm there is none.
  trap 'rm -rf -- "${DUMP_DIR:?}"' EXIT
  install -d -m 700 "${DUMP_DIR}"
  : > "${STATUS_ITEMS_FILE}"

  # --- Local host (110): inventories and config archive ----------------------
  write_cmd_output "110-crontab-current-user" crontab -l || failed=$((failed + 1))
  write_cmd_output "110-systemd-unit-files" systemctl list-unit-files || failed=$((failed + 1))
  # The docker inventory is best-effort and never counted as a failure.
  write_cmd_output "110-docker-containers" docker ps --format 'table {{.Names}}\t{{.Image}}\t{{.Status}}\t{{.Ports}}' || true
  if tar_local "110-host-configs" \
    /etc/nginx \
    /etc/systemd/system \
    /etc/cron.d \
    /etc/crontab \
    /etc/letsencrypt \
    /etc/ssh \
    /etc/fstab \
    /etc/hosts \
    /etc/netplan \
    /etc/docker \
    /etc/containerd \
    /etc/keepalived \
    /opt/harbor/harbor.yml \
    /opt/harbor/docker-compose.yml \
    /opt/sentry/.env \
    /opt/sentry/docker-compose.yml \
    /opt/sentry/docker-compose.override.yml \
    /opt/sentry/sentry \
    /home/wooo/monitoring \
    /home/wooo/scripts \
    /home/wooo/awoooi \
    /home/wooo/awoooi-ops \
    /backup/scripts; then
    record_config_status "110-host-configs" true true "110"
  else
    record_config_status "110-host-configs" true false "110"
    failed=$((failed + 1))
  fi

  # --- Host 188: inventories and config archive ------------------------------
  capture_remote_cmd "ollama@192.168.0.188" "188-crontab-ollama" "crontab -l" || failed=$((failed + 1))
  capture_remote_cmd "ollama@192.168.0.188" "188-systemd-unit-files" "systemctl list-unit-files" || failed=$((failed + 1))
  capture_remote_cmd "ollama@192.168.0.188" "188-docker-containers" "docker ps --format 'table {{.Names}}\t{{.Image}}\t{{.Status}}\t{{.Ports}}'" || true
  if tar_remote "ollama@192.168.0.188" "188-host-configs" \
    /etc/nginx \
    /etc/systemd/system \
    /etc/cron.d \
    /etc/crontab \
    /etc/letsencrypt \
    /etc/ssh \
    /etc/fstab \
    /etc/hosts \
    /etc/netplan \
    /etc/docker \
    /etc/containerd \
    /etc/keepalived \
    /opt/n8n \
    /opt/open-webui \
    /opt/litellm \
    /opt/signoz \
    /opt/minio \
    /opt/registry \
    /home/ollama/bin \
    /home/ollama/scripts \
    /home/ollama/momo-pro \
    /home/ollama/awoooi-ops \
    /home/ollama/node_exporter_textfiles; then
    record_config_status "188-host-configs" true true "188"
  else
    record_config_status "188-host-configs" true false "188"
    failed=$((failed + 1))
  fi

  # --- k3s node 120 (archives server manifests) ------------------------------
  capture_remote_cmd "wooo@192.168.0.120" "120-crontab-wooo" "crontab -l" || true
  if tar_remote "wooo@192.168.0.120" "120-k3s-host-configs" \
    /etc/rancher/k3s \
    /var/lib/rancher/k3s/server/manifests \
    /etc/systemd/system \
    /etc/cron.d \
    /etc/crontab \
    /etc/ssh \
    /etc/fstab \
    /etc/hosts \
    /etc/netplan \
    /etc/containerd \
    /etc/keepalived; then
    record_config_status "120-k3s-host-configs" true true "120"
  else
    record_config_status "120-k3s-host-configs" true false "120"
    failed=$((failed + 1))
  fi

  # --- k3s node 121 (archives agent config) ----------------------------------
  capture_remote_cmd "wooo@192.168.0.121" "121-crontab-wooo" "crontab -l" || true
  if tar_remote "wooo@192.168.0.121" "121-k3s-host-configs" \
    /etc/rancher/k3s \
    /var/lib/rancher/k3s/agent/etc \
    /etc/systemd/system \
    /etc/cron.d \
    /etc/crontab \
    /etc/ssh \
    /etc/fstab \
    /etc/hosts \
    /etc/netplan \
    /etc/containerd \
    /etc/keepalived; then
    record_config_status "121-k3s-host-configs" true true "121"
  else
    record_config_status "121-k3s-host-configs" true false "121"
    failed=$((failed + 1))
  fi

  # --- Kubernetes API objects -------------------------------------------------
  # The recorded source is the host written to "<label>.source" by
  # capture_k8s_yaml.
  if capture_k8s_yaml "cluster-k8s-workloads" "deployments,statefulsets,daemonsets,services,ingress,configmaps,cronjobs,jobs,persistentvolumeclaims,persistentvolumes,storageclasses,networkpolicies,serviceaccounts,roles,rolebindings,clusterroles,clusterrolebindings,customresourcedefinitions"; then
    record_config_status "cluster-k8s-workloads" true true "$(sed -n 's/^source_host=//p' "${DUMP_DIR}/cluster-k8s-workloads.source" | head -n 1)"
  else
    record_config_status "cluster-k8s-workloads" true false ""
    failed=$((failed + 1))
  fi
  if capture_k8s_yaml "cluster-k8s-secrets" "secrets"; then
    record_config_status "cluster-k8s-secrets" true true "$(sed -n 's/^source_host=//p' "${DUMP_DIR}/cluster-k8s-secrets.source" | head -n 1)"
  else
    record_config_status "cluster-k8s-secrets" true false ""
    failed=$((failed + 1))
  fi
  # Velero objects are recorded as non-critical and do not count as a failure.
  if capture_k8s_yaml "cluster-velero-backups" "backups.velero.io,schedules.velero.io"; then
    record_config_status "cluster-velero-backups" false true "$(sed -n 's/^source_host=//p' "${DUMP_DIR}/cluster-velero-backups.source" | head -n 1)"
  else
    record_config_status "cluster-velero-backups" false false ""
  fi

  # --- Restic snapshot ---------------------------------------------------------
  # A repository without a data/ directory is treated as uninitialised.
  if [ ! -d "${LOCAL_REPO}/data" ]; then
    log_info "初始化 Restic 倉庫 ${LOCAL_REPO}..."
    restic -r "${LOCAL_REPO}" init --password-file "${RESTIC_PASSWORD_FILE}" 2>&1
  fi
  local tags
  tags=$(build_tags "${SERVICE}")
  # ${tags} is expanded unquoted on purpose so it splits into separate
  # arguments. NOTE(review): presumably build_tags emits "--tag ..." pairs —
  # confirm in common.sh.
  restic -r "${LOCAL_REPO}" backup "${DUMP_DIR}" \
    --password-file "${RESTIC_PASSWORD_FILE}" \
    ${tags} \
    --tag "scope:host-configs" \
    --tag "contains:k8s-secrets" 2>&1

  # Resolve the short id of the newest snapshot; fall back to "unknown" when
  # the listing or the JSON parsing fails.
  local snapshot_id
  snapshot_id=$(restic -r "${LOCAL_REPO}" snapshots --latest 1 --json \
    --password-file "${RESTIC_PASSWORD_FILE}" 2>/dev/null | \
    python3 -c 'import json,sys; rows=json.load(sys.stdin); print(rows[-1].get("short_id","unknown") if rows else "unknown")' 2>/dev/null || echo "unknown")
  log_success "設定檔 Restic 備份完成: ${snapshot_id}"
  cleanup_old_backups "${LOCAL_REPO}"

  local duration
  duration=$(($(date +%s) - start_time))
  write_config_status_file "${failed}" "${duration}" "${snapshot_id}"

  # Normal-path cleanup; the EXIT trap covers every other exit path.
  # ":?" aborts instead of expanding to an empty path.
  rm -rf -- "${DUMP_DIR:?}"

  if [ "${failed}" -eq 0 ]; then
    log_success "========== 設定檔備份完成 (${duration}s) =========="
    notify_clawbot "success" "${SERVICE}" "主機與服務設定檔備份完成" "${duration}"
  else
    log_error "========== 設定檔備份完成但有 ${failed} 個項目失敗 (${duration}s) =========="
    notify_clawbot "warning" "${SERVICE}" "設定檔備份有 ${failed} 個項目失敗" "${duration}"
  fi
  return "${failed}"
}
# Script entry point: invoke main with the script's command-line arguments.
# main's return value (the failed-item count) becomes the script's exit status.
main "$@"