#!/bin/bash
# =============================================================================
# WOOO AIOps - 主機與服務設定檔備份
# 2026-05-06 ogt + Codex: 重開機事故後補齊 configuration-state backup。
#
# 目的:
# DB/volume backup 只能還原資料;真正決定服務能否啟動的是 nginx、
# systemd drop-in、Docker Compose、cron、K8s Secret/ConfigMap、Prometheus
# 與 Alertmanager 設定。此腳本只收集設定狀態,不收集大型資料目錄。
#
# 安全:
# - Secret/ConfigMap 只進入 restic 加密快照,不印到 log。
# - 不把 restic password file 備份進同一個 restic repo。
# - 暫存目錄權限 0700,結束後清除。
# =============================================================================

# Strict mode: abort on unhandled errors, unset variables and failed pipeline
# stages.
set -euo pipefail

# Shared helpers live in common.sh next to this script.
# NOTE(review): BACKUP_BASE, RESTIC_PASSWORD_FILE, log_info/log_success/
# log_warn/log_error, build_tags, cleanup_old_backups and notify_clawbot are
# used below but not defined in this file; presumably common.sh provides
# them — confirm.
source "$(dirname "$0")/common.sh"

# Service name passed to build_tags and notify_clawbot.
SERVICE="configs"
# Local restic repository that receives the configuration snapshot.
LOCAL_REPO="${BACKUP_BASE}/configs"
# Staging directory for everything collected during one run. It holds
# plaintext K8s Secret dumps, so it is created with mktemp (unpredictable
# name, mode 0700, fails if the path already exists) instead of a
# PID-derived name that another local user could pre-create in /tmp.
DUMP_DIR="$(mktemp -d /tmp/configs-backup-XXXXXXXXXX)"
# Directory and file that receive the JSON summary of the last run.
STATUS_DIR="${BACKUP_BASE}/status"
CONFIG_STATUS_FILE="${STATUS_DIR}/backup-configs-last-status.json"
# Per-target capture results, one JSON object per line, collected while the
# run is in progress.
STATUS_ITEMS_FILE="${DUMP_DIR}/config-capture-status.jsonl"

# Non-interactive SSH: never prompt, give up after 8 s, and accept the host
# key of a host seen for the first time.
SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=8 -o StrictHostKeyChecking=accept-new)
# Space-separated hosts tried in order when dumping K8s resources;
# overridable through the environment.
K8S_BACKUP_HOSTS="${K8S_BACKUP_HOSTS:-192.168.0.120 192.168.0.121 192.168.0.125}"

# 2026-05-19 ogt + Codex: 保留策略統一交給 common.sh。
# 預設 latest-only keep-last=1,避免設定檔備份長期堆積。

# tar --exclude options shared by tar_local and tar_remote. They keep
# dependency trees, virtualenvs, caches, logs, backup/data/tmp directories
# and the restic password file out of the configuration archives.
tar_excludes=(
  --exclude="*/node_modules"
  --exclude="*/.next"
  --exclude="*/.venv"
  --exclude="*/venv"
  --exclude="*/__pycache__"
  --exclude="*/logs"
  --exclude="*/log"
  --exclude="*/backup"
  --exclude="*/backups"
  --exclude="*/data"
  --exclude="*/tmp"
  --exclude=".restic-password"
)

#######################################
# Run a local command and store its combined stdout/stderr in the staging
# directory as <label>.txt.
# Globals:   DUMP_DIR (read)
# Arguments: $1 - label used as the output file name (without extension)
#            $2.. - command and arguments to execute
# Returns:   0 when the command succeeded, 1 when it failed
#######################################
write_cmd_output() {
  local label="$1"
  shift
  local out_file="${DUMP_DIR}/${label}.txt"

  # Output is captured even on failure so the error text ends up in the
  # snapshot next to the successful inventories.
  if ! "$@" >"${out_file}" 2>&1; then
    log_warn "設定盤點失敗或無權限: ${label}"
    return 1
  fi
  log_success "設定盤點完成: ${label}"
}

#######################################
# Archive local paths into <label>.tar.gz inside the staging directory.
# tar's stderr is kept in <label>.tar.stderr. A non-zero tar exit is only
# logged as a partial failure; the function result depends solely on whether
# a non-empty archive was produced.
# Globals:   DUMP_DIR (read), tar_excludes (read)
# Arguments: $1 - archive label
#            $2.. - paths to archive
# Returns:   0 when the archive exists and is non-empty, non-zero otherwise
#######################################
tar_local() {
  local label="$1"
  shift
  local paths=("$@")
  local archive="${DUMP_DIR}/${label}.tar.gz"
  local tar_cmd_str
  local -a tar_cmd

  # local_tar_command prints either "tar" or "sudo -n tar". Split it into an
  # array explicitly so the invocation does not depend on the global IFS.
  tar_cmd_str=$(local_tar_command)
  IFS=' ' read -r -a tar_cmd <<<"${tar_cmd_str}"

  if "${tar_cmd[@]}" czf "${archive}" \
    --ignore-failed-read \
    --warning=no-file-changed \
    "${tar_excludes[@]}" \
    "${paths[@]}" 2>"${DUMP_DIR}/${label}.tar.stderr"; then
    log_success "本機設定封存完成: ${label}"
  else
    log_warn "本機設定封存部分失敗: ${label}"
  fi

  [ -s "${archive}" ]
}

#######################################
# Print the tar invocation to use on this host: "sudo -n tar" when
# passwordless sudo is available, plain "tar" otherwise.
# Outputs:   the command string on stdout (no trailing newline)
#######################################
local_tar_command() {
  if sudo -n true >/dev/null 2>&1; then
    printf 'sudo -n tar'
  else
    printf 'tar'
  fi
}

#######################################
# Archive paths on a remote host over SSH and stream the gzip'd tarball into
# <label>.tar.gz inside the staging directory. stderr of the SSH session is
# kept in <label>.tar.stderr. A non-zero exit is only logged as a partial
# failure; the function result depends solely on whether a non-empty archive
# was received.
# Globals:   DUMP_DIR (read), SSH_OPTS (read), tar_excludes (read)
# Arguments: $1 - SSH destination (user@host)
#            $2 - archive label
#            $3.. - remote paths to archive
# Returns:   0 when the archive exists and is non-empty, non-zero otherwise
#######################################
tar_remote() {
  local host="$1"
  local label="$2"
  shift 2
  local paths=("$@")
  local archive="${DUMP_DIR}/${label}.tar.gz"
  local remote_script
  # Loop variables are declared local so they do not leak into the caller's
  # scope.
  local exclude path

  # The remote side decides on its own whether passwordless sudo is usable.
  remote_script='if sudo -n true >/dev/null 2>&1; then tar_cmd="sudo -n tar"; else tar_cmd="tar"; fi; $tar_cmd czf - --ignore-failed-read --warning=no-file-changed'
  # Every option and path is shell-quoted with %q because the remote shell
  # parses the script again.
  for exclude in "${tar_excludes[@]}"; do
    remote_script+=" $(printf '%q' "$exclude")"
  done
  for path in "${paths[@]}"; do
    remote_script+=" $(printf '%q' "$path")"
  done

  if ssh "${SSH_OPTS[@]}" "$host" "$remote_script" >"${archive}" 2>"${DUMP_DIR}/${label}.tar.stderr"; then
    log_success "遠端設定封存完成: ${label}"
  else
    log_warn "遠端設定封存部分失敗: ${label}"
  fi

  [ -s "${archive}" ]
}

#######################################
# Run a command on a remote host over SSH and store its combined
# stdout/stderr in the staging directory as <label>.txt.
# Globals:   DUMP_DIR (read), SSH_OPTS (read)
# Arguments: $1 - SSH destination (user@host)
#            $2 - label used as the output file name (without extension)
#            $3 - command line executed by the remote shell
# Returns:   0 when the SSH command succeeded, 1 when it failed
#######################################
capture_remote_cmd() {
  local host="$1"
  local label="$2"
  local cmd="$3"
  local out_file="${DUMP_DIR}/${label}.txt"

  if ! ssh "${SSH_OPTS[@]}" "$host" "$cmd" >"${out_file}" 2>&1; then
    log_warn "遠端設定盤點失敗或無權限: ${label}"
    return 1
  fi
  log_success "遠端設定盤點完成: ${label}"
}

#######################################
# Dump Kubernetes resources from all namespaces as YAML into <label>.yaml in
# the staging directory. Hosts from K8S_BACKUP_HOSTS are tried in order (as
# user wooo) and the first one that delivers a non-empty dump wins; its
# address is recorded in <label>.source. stderr of the last attempt is kept
# in <label>.stderr.
# Globals:   DUMP_DIR (read), SSH_OPTS (read), K8S_BACKUP_HOSTS (read)
# Arguments: $1 - label used for the output file names
#            $2 - resource list passed to `kubectl get`
# Returns:   0 when one host produced a dump, 1 when every host failed
#######################################
capture_k8s_yaml() {
  local label="$1"
  local resource="$2"
  local cmd k8s_host
  local yaml_file="${DUMP_DIR}/${label}.yaml"

  # Prefer kubectl through passwordless sudo and fall back to the plain user.
  cmd="sudo -n kubectl get ${resource} -A -o yaml 2>/dev/null || kubectl get ${resource} -A -o yaml"

  # K8S_BACKUP_HOSTS is a space-separated list; word splitting is intended.
  for k8s_host in ${K8S_BACKUP_HOSTS}; do
    # An exit status of 0 with an empty file is not a usable backup, so such
    # a host is skipped and the next one is tried.
    if ssh "${SSH_OPTS[@]}" "wooo@${k8s_host}" "$cmd" >"${yaml_file}" 2>"${DUMP_DIR}/${label}.stderr" \
      && [ -s "${yaml_file}" ]; then
      printf 'source_host=%s\n' "${k8s_host}" >"${DUMP_DIR}/${label}.source"
      log_success "K8s 設定備份完成: ${label} (source=${k8s_host})"
      return 0
    fi
  done

  log_warn "K8s 設定備份失敗: ${label}"
  return 1
}

# Escape a string for use inside a JSON string literal (backslash, double
# quote, newline, carriage return and tab).
_json_escape() {
  local s="$1"
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "${s}"
}

#######################################
# Append one capture result as a JSON object line to STATUS_ITEMS_FILE.
# Globals:   STATUS_ITEMS_FILE (appended to)
# Arguments: $1 - target name
#            $2 - JSON boolean literal: whether the target is critical
#            $3 - JSON boolean literal: whether the capture succeeded
#            $4 - optional source host identifier (default: empty)
#######################################
record_config_status() {
  local target="$1"
  local critical="$2"
  local ok="$3"
  local source="${4:-}"

  # target and source are string fields; escaping them keeps every line
  # parseable by the json.loads call in write_config_status_file even when a
  # value contains a quote or backslash.
  printf '{"target":"%s","critical":%s,"ok":%s,"source":"%s"}\n' \
    "$(_json_escape "${target}")" "${critical}" "${ok}" \
    "$(_json_escape "${source}")" >>"${STATUS_ITEMS_FILE}"
}

#######################################
# Aggregate the per-target result lines into one JSON status document and
# publish it atomically at CONFIG_STATUS_FILE (mode 0640).
# Globals:   STATUS_DIR (created with mode 0700), STATUS_ITEMS_FILE (read),
#            CONFIG_STATUS_FILE (written)
# Arguments: $1 - number of failed items counted by the caller
#            $2 - run duration in seconds
#            $3 - restic snapshot id (or "unknown")
# Returns:   non-zero when python3 fails, e.g. on an unparseable item line
#######################################
write_config_status_file() {
  local failed_count="$1"
  local duration="$2"
  local snapshot_id="$3"

  install -d -m 700 "${STATUS_DIR}"
  python3 - "${STATUS_ITEMS_FILE}" "${CONFIG_STATUS_FILE}" "${failed_count}" "${duration}" "${snapshot_id}" <<'PY'
import json
import os
import sys
import time
from pathlib import Path

items_path = Path(sys.argv[1])
status_path = Path(sys.argv[2])
failed_count = int(sys.argv[3])
duration = int(sys.argv[4])
snapshot_id = sys.argv[5]

# One JSON object per non-blank line; a missing items file yields no items.
items = []
if items_path.exists():
    for line in items_path.read_text(encoding="utf-8", errors="replace").splitlines():
        if not line.strip():
            continue
        items.append(json.loads(line))

# Only targets flagged critical that did not succeed count here.
critical_failed_count = sum(1 for item in items if item.get("critical") and not item.get("ok"))
document = {
    "timestamp": int(time.time()),
    "failed_count": failed_count,
    "critical_failed_count": critical_failed_count,
    "duration_seconds": duration,
    "snapshot_id": snapshot_id,
    "items": items,
}

# Write to a sibling temp file and rename it into place so readers never see
# a half-written document. The mode is set before the rename so the published
# file never appears with umask-default permissions.
tmp_path = status_path.with_suffix(status_path.suffix + ".tmp")
tmp_path.write_text(json.dumps(document, ensure_ascii=False, sort_keys=True) + "\n", encoding="utf-8")
os.chmod(tmp_path, 0o640)
os.replace(tmp_path, status_path)
PY
}

#######################################
# Collect configuration state from the local host (labelled 110), the
# 192.168.0.188 host and the 192.168.0.120/121 k3s hosts, dump cluster
# resources, store everything in one restic snapshot, write the status file
# and send a notification.
# Globals:   DUMP_DIR, STATUS_ITEMS_FILE, LOCAL_REPO, RESTIC_PASSWORD_FILE,
#            SERVICE (all read)
# Returns:   the number of failed items (0 means everything was captured)
#######################################
main() {
  local start_time
  local failed=0
  local timestamp
  start_time=$(date +%s)
  timestamp=$(date "+%Y%m%d_%H%M%S")

  log_info "========== 開始主機與服務設定檔備份 (${timestamp}) =========="

  # The staging directory holds plaintext K8s Secret dumps. Under `set -e`
  # any later failure (restic init/backup, status write, ...) would end the
  # script before the explicit cleanup below, so removal is also tied to
  # script exit. An EXIT trap that is already installed is left untouched.
  # NOTE(review): whether common.sh installs its own EXIT trap is not visible
  # here — confirm.
  if [ -z "$(trap -p EXIT)" ]; then
    trap 'rm -rf -- "${DUMP_DIR}"' EXIT
  fi

  install -d -m 700 "${DUMP_DIR}"
  : >"${STATUS_ITEMS_FILE}"

  # --- Local host (110) ------------------------------------------------------
  write_cmd_output "110-crontab-current-user" crontab -l || failed=$((failed + 1))
  write_cmd_output "110-systemd-unit-files" systemctl list-unit-files || failed=$((failed + 1))
  # The Docker inventory is best-effort and never counted as a failure.
  write_cmd_output "110-docker-containers" docker ps --format 'table {{.Names}}\t{{.Image}}\t{{.Status}}\t{{.Ports}}' || true

  if tar_local "110-host-configs" \
    /etc/nginx \
    /etc/systemd/system \
    /etc/cron.d \
    /etc/crontab \
    /etc/letsencrypt \
    /etc/ssh \
    /etc/fstab \
    /etc/hosts \
    /etc/netplan \
    /etc/docker \
    /etc/containerd \
    /etc/keepalived \
    /opt/harbor/harbor.yml \
    /opt/harbor/docker-compose.yml \
    /opt/sentry/.env \
    /opt/sentry/docker-compose.yml \
    /opt/sentry/docker-compose.override.yml \
    /opt/sentry/sentry \
    /home/wooo/monitoring \
    /home/wooo/scripts \
    /home/wooo/awoooi \
    /home/wooo/awoooi-ops \
    /backup/scripts; then
    record_config_status "110-host-configs" true true "110"
  else
    record_config_status "110-host-configs" true false "110"
    failed=$((failed + 1))
  fi

  # --- Host 192.168.0.188 ----------------------------------------------------
  capture_remote_cmd "ollama@192.168.0.188" "188-crontab-ollama" "crontab -l" || failed=$((failed + 1))
  capture_remote_cmd "ollama@192.168.0.188" "188-systemd-unit-files" "systemctl list-unit-files" || failed=$((failed + 1))
  capture_remote_cmd "ollama@192.168.0.188" "188-docker-containers" "docker ps --format 'table {{.Names}}\t{{.Image}}\t{{.Status}}\t{{.Ports}}'" || true
  if tar_remote "ollama@192.168.0.188" "188-host-configs" \
    /etc/nginx \
    /etc/systemd/system \
    /etc/cron.d \
    /etc/crontab \
    /etc/letsencrypt \
    /etc/ssh \
    /etc/fstab \
    /etc/hosts \
    /etc/netplan \
    /etc/docker \
    /etc/containerd \
    /etc/keepalived \
    /opt/n8n \
    /opt/open-webui \
    /opt/litellm \
    /opt/signoz \
    /opt/minio \
    /opt/registry \
    /home/ollama/bin \
    /home/ollama/scripts \
    /home/ollama/momo-pro \
    /home/ollama/awoooi-ops \
    /home/ollama/node_exporter_textfiles; then
    record_config_status "188-host-configs" true true "188"
  else
    record_config_status "188-host-configs" true false "188"
    failed=$((failed + 1))
  fi

  # --- k3s host 192.168.0.120 ------------------------------------------------
  # A failed crontab listing on the k3s hosts is not counted.
  capture_remote_cmd "wooo@192.168.0.120" "120-crontab-wooo" "crontab -l" || true
  if tar_remote "wooo@192.168.0.120" "120-k3s-host-configs" \
    /etc/rancher/k3s \
    /var/lib/rancher/k3s/server/manifests \
    /etc/systemd/system \
    /etc/cron.d \
    /etc/crontab \
    /etc/ssh \
    /etc/fstab \
    /etc/hosts \
    /etc/netplan \
    /etc/containerd \
    /etc/keepalived; then
    record_config_status "120-k3s-host-configs" true true "120"
  else
    record_config_status "120-k3s-host-configs" true false "120"
    failed=$((failed + 1))
  fi

  # --- k3s host 192.168.0.121 ------------------------------------------------
  capture_remote_cmd "wooo@192.168.0.121" "121-crontab-wooo" "crontab -l" || true
  if tar_remote "wooo@192.168.0.121" "121-k3s-host-configs" \
    /etc/rancher/k3s \
    /var/lib/rancher/k3s/agent/etc \
    /etc/systemd/system \
    /etc/cron.d \
    /etc/crontab \
    /etc/ssh \
    /etc/fstab \
    /etc/hosts \
    /etc/netplan \
    /etc/containerd \
    /etc/keepalived; then
    record_config_status "121-k3s-host-configs" true true "121"
  else
    record_config_status "121-k3s-host-configs" true false "121"
    failed=$((failed + 1))
  fi

  # --- Cluster resources -----------------------------------------------------
  # The recorded source is the host that actually answered, read back from
  # the .source file written by capture_k8s_yaml.
  if capture_k8s_yaml "cluster-k8s-workloads" "deployments,statefulsets,daemonsets,services,ingress,configmaps,cronjobs,jobs,persistentvolumeclaims,persistentvolumes,storageclasses,networkpolicies,serviceaccounts,roles,rolebindings,clusterroles,clusterrolebindings,customresourcedefinitions"; then
    record_config_status "cluster-k8s-workloads" true true "$(sed -n 's/^source_host=//p' "${DUMP_DIR}/cluster-k8s-workloads.source" | head -n 1)"
  else
    record_config_status "cluster-k8s-workloads" true false ""
    failed=$((failed + 1))
  fi
  if capture_k8s_yaml "cluster-k8s-secrets" "secrets"; then
    record_config_status "cluster-k8s-secrets" true true "$(sed -n 's/^source_host=//p' "${DUMP_DIR}/cluster-k8s-secrets.source" | head -n 1)"
  else
    record_config_status "cluster-k8s-secrets" true false ""
    failed=$((failed + 1))
  fi
  # Velero objects are recorded as non-critical and do not raise the failure
  # counter.
  if capture_k8s_yaml "cluster-velero-backups" "backups.velero.io,schedules.velero.io"; then
    record_config_status "cluster-velero-backups" false true "$(sed -n 's/^source_host=//p' "${DUMP_DIR}/cluster-velero-backups.source" | head -n 1)"
  else
    record_config_status "cluster-velero-backups" false false ""
  fi

  # --- Restic snapshot -------------------------------------------------------
  # A repository without a data/ directory is treated as not yet initialised.
  if [ ! -d "${LOCAL_REPO}/data" ]; then
    log_info "初始化 Restic 倉庫 ${LOCAL_REPO}..."
    restic -r "${LOCAL_REPO}" init --password-file "${RESTIC_PASSWORD_FILE}" 2>&1
  fi

  local tags
  tags=$(build_tags "${SERVICE}")
  # ${tags} is expanded unquoted on purpose so that it splits into separate
  # restic arguments.
  # NOTE(review): assumes build_tags prints space-separated restic options —
  # confirm in common.sh.
  restic -r "${LOCAL_REPO}" backup "${DUMP_DIR}" \
    --password-file "${RESTIC_PASSWORD_FILE}" \
    ${tags} \
    --tag "scope:host-configs" \
    --tag "contains:k8s-secrets" 2>&1

  # Look up the short id of the newest snapshot; any error in the lookup
  # degrades to "unknown" instead of failing the run.
  local snapshot_id
  snapshot_id=$(restic -r "${LOCAL_REPO}" snapshots --latest 1 --json \
    --password-file "${RESTIC_PASSWORD_FILE}" 2>/dev/null \
    | python3 -c 'import json,sys; rows=json.load(sys.stdin); print(rows[-1].get("short_id","unknown") if rows else "unknown")' 2>/dev/null || echo "unknown")
  log_success "設定檔 Restic 備份完成: ${snapshot_id}"

  cleanup_old_backups "${LOCAL_REPO}"

  # --- Status, cleanup and notification ---------------------------------------
  local duration
  duration=$(($(date +%s) - start_time))
  # The status file must be written before the staging directory is removed,
  # because the per-item results live inside it.
  write_config_status_file "${failed}" "${duration}" "${snapshot_id}"
  rm -rf "${DUMP_DIR}"
  if [ "${failed}" -eq 0 ]; then
    log_success "========== 設定檔備份完成 (${duration}s) =========="
    notify_clawbot "success" "${SERVICE}" "主機與服務設定檔備份完成" "${duration}"
  else
    log_error "========== 設定檔備份完成但有 ${failed} 個項目失敗 (${duration}s) =========="
    notify_clawbot "warning" "${SERVICE}" "設定檔備份有 ${failed} 個項目失敗" "${duration}"
  fi

  return "${failed}"
}

# Script entry point: forward all command-line arguments to main.
main "$@"