#!/bin/bash
# =============================================================================
# WOOO AIOps - AI 工具與模型 manifest 備份
# 2026-05-06 ogt + Codex: 補齊 188 Ollama / AI tooling metadata backup。
#
# 安全原則:
# - 每日只備份模型清單、manifest、Modelfile 與工具狀態證據。
# - 不預設備份 /home/ollama/.ollama/models/blobs,避免每日拉 10GB+
#   可重新下載模型;自製或不可重下的 blobs 需先人工標記後另做 offsite。
# - 所有輸出只進 encrypted restic repo;不把 Secret 值印到 log。
# =============================================================================

# Strict mode: abort on command errors, unset variables and failed pipeline
# stages.
set -euo pipefail

# Shared helpers/settings. Names used in this file but not defined in it
# (BACKUP_BASE, RESTIC_PASSWORD_FILE, log_*, build_tags, cleanup_old_backups,
# notify_clawbot) presumably come from here — verify against common.sh.
source "$(dirname "$0")/common.sh"

# Service name passed to build_tags and notify_clawbot.
SERVICE="ai-artifacts"
# Restic repository that receives the staged files.
LOCAL_REPO="${BACKUP_BASE}/ai-artifacts"
# Local staging directory. mktemp creates it atomically with a random suffix
# and owner-only permissions, so another local user cannot pre-create or
# symlink a predictable PID-based path under /tmp.
DUMP_DIR="$(mktemp -d "/tmp/ai-artifacts-backup-XXXXXX")"
# SSH target to inventory; override with AI_ARTIFACTS_REMOTE_HOST.
REMOTE_HOST="${AI_ARTIFACTS_REMOTE_HOST:-ollama@192.168.0.188}"
# Non-interactive SSH: never prompt, give up connecting after 8 seconds.
SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=8)

#######################################
# Remove the local staging directory.
# Globals:   DUMP_DIR (read)
# Arguments: none
# Returns:   0 when DUMP_DIR is unset or empty, otherwise the status of rm
#######################################
cleanup() {
  # Never hand an empty path to `rm -rf`; this also keeps the function usable
  # from an EXIT trap under `set -u` when DUMP_DIR was never assigned.
  if [[ -n "${DUMP_DIR:-}" ]]; then
    # `--` stops option parsing in case the path ever begins with a dash.
    rm -rf -- "${DUMP_DIR}"
  fi
}

#######################################
# Run a command at lowered CPU priority and, when possible, lowered I/O
# priority.
# Arguments: $@ - the command to run followed by its arguments
# Returns:   the exit status reported by nice/ionice for the wrapped command
#######################################
low_priority() {
  # Probe ionice with a no-op first: when the binary exists but cannot set the
  # requested class (restricted container, old kernel), fall back to plain
  # `nice` instead of failing before the wrapped command ever runs.
  if command -v ionice >/dev/null 2>&1 \
    && ionice -c2 -n7 true >/dev/null 2>&1; then
    # Best-effort I/O class, lowest priority level, plus CPU niceness 10.
    ionice -c2 -n7 nice -n 10 "$@"
  else
    nice -n 10 "$@"
  fi
}

#######################################
# Run one command on the remote host and store its combined stdout/stderr as
# evidence in the staging directory.
# Globals:   SSH_OPTS, REMOTE_HOST, DUMP_DIR (read)
# Arguments: $1 - label; the output is written to ${DUMP_DIR}/<label>.txt
#            $2 - command line executed on the remote host
# Outputs:   one log_success or log_warn message
# Returns:   0 when ssh succeeds, 1 otherwise (the output file is kept either
#            way, so it holds whatever the failed command printed)
#######################################
capture_remote_cmd() {
  local label="$1"
  local cmd="$2"
  local out_file="${DUMP_DIR}/${label}.txt"

  # stdin comes from /dev/null so ssh cannot swallow input that belongs to the
  # caller (for example a surrounding `while read` loop or a piped script).
  if ssh "${SSH_OPTS[@]}" "${REMOTE_HOST}" "${cmd}" \
    </dev/null >"${out_file}" 2>&1; then
    log_success "AI artifacts 盤點完成: ${label}"
  else
    log_warn "AI artifacts 盤點失敗: ${label}"
    return 1
  fi
}

#######################################
# Collect Ollama / AI tooling metadata from the remote host into DUMP_DIR,
# store the staging directory in the restic repository and send a
# notification with the outcome.
# Globals:   DUMP_DIR, REMOTE_HOST, SSH_OPTS, LOCAL_REPO, SERVICE,
#            RESTIC_PASSWORD_FILE (read)
# Arguments: none used
# Returns:   the number of required components that failed (0 = success)
#######################################
main() {
  local start_time
  local timestamp
  local failed=0
  start_time=$(date +%s)
  timestamp=$(date "+%Y%m%d_%H%M%S")

  # Register the cleanup before anything is written to the staging directory.
  trap cleanup EXIT
  install -d -m 700 "${DUMP_DIR}"

  log_info "========== 開始 AI artifacts 備份 (${timestamp}) =========="

  # Inventory steps. Those followed by `|| true` are optional; the others
  # increment the failure counter when they fail.
  capture_remote_cmd "188-ollama-version" "ollama --version" || true
  capture_remote_cmd "188-ollama-list" "ollama list" || failed=$((failed + 1))
  capture_remote_cmd "188-ollama-ps" "ollama ps" || true
  capture_remote_cmd "188-ollama-manifest-inventory" \
    "find /home/ollama/.ollama/models/manifests -type f -printf '%P\t%s\t%TY-%Tm-%Td %TH:%TM:%TS\n' | sort" \
    || failed=$((failed + 1))
  capture_remote_cmd "188-ollama-manifest-sha256" \
    "cd /home/ollama/.ollama/models/manifests && find . -type f -print0 | sort -z | xargs -0 sha256sum" \
    || failed=$((failed + 1))
  # Only the count and total size of the blobs are recorded, not the blobs.
  capture_remote_cmd "188-ollama-blob-summary" \
    "find /home/ollama/.ollama/models/blobs -type f -printf '%s\n' 2>/dev/null | awk 'BEGIN{count=0;bytes=0}{count++;bytes+=\$1}END{printf \"blob_count=%d\\nblob_bytes=%d\\n\", count, bytes}'" \
    || true
  capture_remote_cmd "188-ai-containers" \
    "docker ps --format 'table {{.Names}}\t{{.Image}}\t{{.Status}}' | grep -Ei 'ollama|open-webui|litellm|openclaw|clawbot|langfuse|n8n' || true" \
    || true

  log_info "匯出 Ollama manifest tree(不含 blobs)"
  if ssh "${SSH_OPTS[@]}" "${REMOTE_HOST}" \
    "tar czf - -C /home/ollama/.ollama/models manifests 2>/dev/null" \
    > "${DUMP_DIR}/ollama-manifests_${timestamp}.tar.gz"; then
    log_success "Ollama manifests 備份完成 ($(du -h "${DUMP_DIR}/ollama-manifests_${timestamp}.tar.gz" | cut -f1))"
  else
    log_error "Ollama manifests 備份失敗"
    failed=$((failed + 1))
  fi

  log_info "匯出 Ollama Modelfile 摘要"
  # The single-quoted script runs on the remote host: it writes one Modelfile
  # per listed model into a temporary directory and streams that directory
  # back as a tarball. A failure here is only logged as a warning; it does not
  # increment the failure counter.
  ssh "${SSH_OPTS[@]}" "${REMOTE_HOST}" 'set -euo pipefail
    tmp="$(mktemp -d)"
    trap "rm -rf \"$tmp\"" EXIT
    ollama list 2>/dev/null | awk "NR>1 {print \$1}" | while read -r model; do
      safe="$(printf "%s" "$model" | tr "/:" "__")"
      ollama show "$model" --modelfile > "$tmp/${safe}.Modelfile" 2>&1 || true
    done
    tar czf - -C "$tmp" .
    ' > "${DUMP_DIR}/ollama-modelfiles_${timestamp}.tar.gz" \
    2>"${DUMP_DIR}/ollama-modelfiles_${timestamp}.stderr" \
    || log_warn "Ollama Modelfile 匯出部分失敗"

  # Describe this run inside the snapshot itself. The heredoc body and its
  # terminator must stay at column 0 because `<<EOF` does not strip indentation.
  cat > "${DUMP_DIR}/backup-manifest.txt" <<EOF
service=ai-artifacts
timestamp=${timestamp}
remote_host=${REMOTE_HOST}
contains=ollama_list,ollama_ps,ollama_manifests,ollama_modelfiles,ai_container_inventory
blob_policy=manifest_only_no_model_blobs_by_default
failed_components=${failed}
EOF

  # A repository without a data/ directory is treated as not yet initialised.
  if [[ ! -d "${LOCAL_REPO}/data" ]]; then
    log_info "初始化 Restic 倉庫: ${LOCAL_REPO}"
    low_priority restic -r "${LOCAL_REPO}" init --password-file "${RESTIC_PASSWORD_FILE}" 2>&1
  fi

  log_info "建立 AI artifacts Restic 備份..."
  local tags
  tags=$(build_tags "${SERVICE}")
  # NOTE(review): with `set -e` active at file level, a failing restic command
  # below ends the script before any notify_clawbot call — confirm this is
  # intended or handled in common.sh.
  # ${tags} is deliberately unquoted so the string from build_tags splits into
  # separate restic arguments.
  # shellcheck disable=SC2086
  low_priority restic -r "${LOCAL_REPO}" backup "${DUMP_DIR}" \
    --password-file "${RESTIC_PASSWORD_FILE}" \
    ${tags} \
    --tag "scope:ai-artifacts" \
    --tag "contains:ollama-manifests-no-blobs" 2>&1

  # Look up the short id of the newest snapshot for the log line; any failure
  # in the lookup degrades to "unknown" instead of failing the run.
  local snapshot_id
  snapshot_id=$(restic -r "${LOCAL_REPO}" snapshots --latest 1 --json \
    --password-file "${RESTIC_PASSWORD_FILE}" 2>/dev/null \
    | python3 -c 'import json,sys; rows=json.load(sys.stdin); print(rows[-1].get("short_id","unknown") if rows else "unknown")' 2>/dev/null \
    || echo "unknown")
  log_success "AI artifacts Restic 備份完成: ${snapshot_id}"

  cleanup_old_backups "${LOCAL_REPO}"

  local duration
  duration=$(($(date +%s) - start_time))
  if [[ "${failed}" -eq 0 ]]; then
    log_success "========== AI artifacts 備份完成 (${duration}s) =========="
    notify_clawbot "success" "${SERVICE}" "AI artifacts 備份完成" "${duration}"
  else
    log_error "========== AI artifacts 備份有 ${failed} 個必要項目失敗 (${duration}s) =========="
    notify_clawbot "failed" "${SERVICE}" "AI artifacts 備份有 ${failed} 個必要項目失敗" "${duration}"
  fi

  return "${failed}"
}

# Entry point: run the backup; the script's exit status is the value returned
# by main.
main "$@"