#!/bin/bash
# =============================================================================
# WOOO AIOps - AI tooling and model manifest backup
# 2026-05-06 ogt + Codex: fill in the 188 Ollama / AI tooling metadata backup.
#
# Safety principles:
# - The daily run backs up only the model list, manifests, Modelfiles and
#   tool-status evidence.
# - /home/ollama/.ollama/models/blobs is NOT backed up by default, to avoid
#   pulling 10GB+ of re-downloadable models every day; custom or
#   non-re-downloadable blobs must be flagged manually first and shipped
#   offsite separately.
# - All output goes only into the encrypted restic repo; secret values are
#   never printed to the log.
#
# Helpers used but not defined here (expected from common.sh): log_info,
# log_success, log_warn, log_error, build_tags, cleanup_old_backups,
# notify_clawbot, plus the BACKUP_BASE and RESTIC_PASSWORD_FILE variables.
# =============================================================================

set -euo pipefail

source "$(dirname "$0")/common.sh"

SERVICE="ai-artifacts"
LOCAL_REPO="${BACKUP_BASE}/ai-artifacts"
# Staging directory; created with mktemp inside main() so the name is not
# predictable. Empty until main() has run.
DUMP_DIR=""
REMOTE_HOST="${AI_ARTIFACTS_REMOTE_HOST:-ollama@192.168.0.188}"
# BatchMode makes ssh fail instead of prompting; short connect timeout so an
# unreachable host does not stall the run.
SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=8)

# Remove the staging directory. Installed as the EXIT trap by main().
cleanup() {
    # Guard against an empty path in case the trap fires before mktemp ran.
    if [[ -n "${DUMP_DIR}" ]]; then
        rm -rf -- "${DUMP_DIR}"
    fi
}

# Run "$@" at reduced CPU priority, and reduced I/O priority when ionice is
# available.
low_priority() {
    if command -v ionice >/dev/null 2>&1; then
        ionice -c2 -n7 nice -n 10 "$@"
    else
        nice -n 10 "$@"
    fi
}

# Run a command on REMOTE_HOST over ssh and store its combined stdout/stderr
# in ${DUMP_DIR}/<label>.txt.
# Arguments: $1 - label used for the output file name and log messages
#            $2 - command line executed by the remote shell
# Returns:   0 when ssh/the remote command succeeded, 1 otherwise.
capture_remote_cmd() {
    local label="$1"
    local cmd="$2"

    if ssh "${SSH_OPTS[@]}" "${REMOTE_HOST}" "${cmd}" > "${DUMP_DIR}/${label}.txt" 2>&1; then
        log_success "AI artifacts 盤點完成: ${label}"
    else
        log_warn "AI artifacts 盤點失敗: ${label}"
        return 1
    fi
}

# Collect inventory evidence and manifest/Modelfile archives from the remote
# host, store them in the restic repository and report the outcome.
# Returns: the number of required steps that failed (0 on full success);
#          1 when the restic backup itself failed.
main() {
    local start_time
    local timestamp
    local failed=0
    local duration

    start_time=$(date +%s)
    timestamp=$(date "+%Y%m%d_%H%M%S")

    trap cleanup EXIT
    # mktemp -d creates a private (0700) directory with an unpredictable name,
    # replacing the former fixed /tmp/ai-artifacts-backup-$$ path.
    DUMP_DIR=$(mktemp -d /tmp/ai-artifacts-backup.XXXXXX)

    log_info "========== 開始 AI artifacts 備份 (${timestamp}) =========="

    # Steps followed by "|| true" are best-effort evidence; steps that bump
    # "failed" are required for the run to count as successful.
    capture_remote_cmd "188-ollama-version" "ollama --version" || true
    capture_remote_cmd "188-ollama-list" "ollama list" || failed=$((failed + 1))
    capture_remote_cmd "188-ollama-ps" "ollama ps" || true
    capture_remote_cmd "188-ollama-manifest-inventory" \
        "find /home/ollama/.ollama/models/manifests -type f -printf '%P\t%s\t%TY-%Tm-%Td %TH:%TM:%TS\n' | sort" \
        || failed=$((failed + 1))
    # xargs -r: do not run sha256sum at all when there are no manifest files;
    # without it sha256sum would be started with no arguments and read stdin.
    capture_remote_cmd "188-ollama-manifest-sha256" \
        "cd /home/ollama/.ollama/models/manifests && find . -type f -print0 | sort -z | xargs -0 -r sha256sum" \
        || failed=$((failed + 1))
    capture_remote_cmd "188-ollama-blob-summary" \
        "find /home/ollama/.ollama/models/blobs -type f -printf '%s\n' 2>/dev/null | awk 'BEGIN{count=0;bytes=0}{count++;bytes+=\$1}END{printf \"blob_count=%d\\nblob_bytes=%d\\n\", count, bytes}'" \
        || true
    capture_remote_cmd "188-ai-containers" \
        "docker ps --format 'table {{.Names}}\t{{.Image}}\t{{.Status}}' | grep -Ei 'ollama|open-webui|litellm|openclaw|clawbot|langfuse|n8n' || true" \
        || true

    log_info "匯出 Ollama manifest tree(不含 blobs)"
    local manifests_archive="${DUMP_DIR}/ollama-manifests_${timestamp}.tar.gz"
    if ssh "${SSH_OPTS[@]}" "${REMOTE_HOST}" \
        "tar czf - -C /home/ollama/.ollama/models manifests 2>/dev/null" \
        > "${manifests_archive}"; then
        log_success "Ollama manifests 備份完成 ($(du -h "${manifests_archive}" | cut -f1))"
    else
        log_error "Ollama manifests 備份失敗"
        failed=$((failed + 1))
    fi

    log_info "匯出 Ollama Modelfile 摘要"
    # The single-quoted script runs in the remote login shell: it dumps the
    # Modelfile of every model reported by "ollama list" into a temp dir and
    # streams that dir back as a tarball. Per-model failures are tolerated
    # (their error text ends up in the corresponding .Modelfile).
    ssh "${SSH_OPTS[@]}" "${REMOTE_HOST}" 'set -euo pipefail
tmp="$(mktemp -d)"
trap "rm -rf \"$tmp\"" EXIT
ollama list 2>/dev/null | awk "NR>1 {print \$1}" | while read -r model; do
    safe="$(printf "%s" "$model" | tr "/:" "__")"
    ollama show "$model" --modelfile > "$tmp/${safe}.Modelfile" 2>&1 || true
done
tar czf - -C "$tmp" .
' > "${DUMP_DIR}/ollama-modelfiles_${timestamp}.tar.gz" \
        2> "${DUMP_DIR}/ollama-modelfiles_${timestamp}.stderr" \
        || log_warn "Ollama Modelfile 匯出部分失敗"

    # NOTE(review): the body of this here-document and the guard that follows
    # it were unreadable in the reviewed copy of this file. Both are minimal
    # reconstructions - restore the original text from version control before
    # merging.
    cat > "${DUMP_DIR}/backup-manifest.txt" <<EOF
service=${SERVICE}
timestamp=${timestamp}
remote_host=${REMOTE_HOST}
scope=ollama-manifests-no-blobs
failed_required=${failed}
EOF

    # NOTE(review): presumably the original initialised the repository on
    # first use; the exact condition is an assumption - confirm.
    if [[ ! -f "${LOCAL_REPO}/config" ]]; then
        restic init -r "${LOCAL_REPO}" \
            --password-file "${RESTIC_PASSWORD_FILE}" 2>&1
    fi

    log_info "建立 AI artifacts Restic 備份..."
    local tags
    tags=$(build_tags "${SERVICE}")
    # ${tags} is intentionally unquoted: build_tags presumably returns several
    # space-separated restic arguments in one string - confirm in common.sh.
    # The backup is checked explicitly so that a failure is reported through
    # notify_clawbot instead of silently aborting under "set -e".
    # shellcheck disable=SC2086
    if ! low_priority restic -r "${LOCAL_REPO}" backup "${DUMP_DIR}" \
        --password-file "${RESTIC_PASSWORD_FILE}" \
        ${tags} \
        --tag "scope:ai-artifacts" \
        --tag "contains:ollama-manifests-no-blobs" 2>&1; then
        duration=$(($(date +%s) - start_time))
        log_error "AI artifacts Restic 備份失敗"
        notify_clawbot "failed" "${SERVICE}" "AI artifacts Restic 備份失敗" "${duration}"
        return 1
    fi

    # Best-effort lookup of the snapshot id for the log line; any failure in
    # restic or the JSON parsing falls back to "unknown".
    local snapshot_id
    snapshot_id=$(restic -r "${LOCAL_REPO}" snapshots --latest 1 --json \
        --password-file "${RESTIC_PASSWORD_FILE}" 2>/dev/null | \
        python3 -c 'import json,sys; rows=json.load(sys.stdin); print(rows[-1].get("short_id","unknown") if rows else "unknown")' 2>/dev/null || echo "unknown")
    log_success "AI artifacts Restic 備份完成: ${snapshot_id}"

    cleanup_old_backups "${LOCAL_REPO}"

    duration=$(($(date +%s) - start_time))

    if [ "${failed}" -eq 0 ]; then
        log_success "========== AI artifacts 備份完成 (${duration}s) =========="
        notify_clawbot "success" "${SERVICE}" "AI artifacts 備份完成" "${duration}"
    else
        log_error "========== AI artifacts 備份有 ${failed} 個必要項目失敗 (${duration}s) =========="
        notify_clawbot "failed" "${SERVICE}" "AI artifacts 備份有 ${failed} 個必要項目失敗" "${duration}"
    fi

    return "${failed}"
}

main "$@"