#!/bin/bash
# =============================================================================
# WOOO AIOps - daily backup heartbeat with low-noise Telegram reporting
# =============================================================================
#
# Reads the backup_health.prom textfiles of hosts "110" (local) and "188"
# (fetched over ssh), aggregates them into one status (success / warning /
# failed), appends a summary to ${LOG_DIR}/backup-status.log and decides
# whether to call notify_clawbot based on a small on-disk state marker.
#
# Environment overrides (all optional; defaults are shown in the assignments):
#   BACKUP_BASE, BACKUP_LOG_DIR, BACKUP_HEALTH_110_PROM, BACKUP_HEALTH_188_TMP,
#   BACKUP_STATUS_SSH_188, BACKUP_STATUS_SSH_OPTS, BACKUP_STATUS_STATE_DIR,
#   BACKUP_STATUS_NOTIFY_SUCCESS, BACKUP_STATUS_SUCCESS_INTERVAL_HOURS,
#   BACKUP_STATUS_WARNING_INTERVAL_HOURS, BACKUP_STATUS_FAILED_INTERVAL_HOURS,
#   BACKUP_STATUS_WARNING_EXIT_CODE
#
# Exit codes: 0 success, ${BACKUP_STATUS_WARNING_EXIT_CODE:-0} warning,
#             2 failed (also used for bad arguments), 3 unexpected status.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
if [ -f "${SCRIPT_DIR}/common.sh" ]; then
    # shellcheck disable=SC1091
    source "${SCRIPT_DIR}/common.sh"
else
    # No shared helpers available: make notifications a silent no-op.
    notify_clawbot() { return 0; }
fi

BACKUP_BASE="${BACKUP_BASE:-/backup}"
LOG_DIR="${BACKUP_LOG_DIR:-${BACKUP_BASE}/logs}"
TEXTFILE_110="${BACKUP_HEALTH_110_PROM:-/home/wooo/node_exporter_textfiles/backup_health.prom}"
TEXTFILE_188_TMP="${BACKUP_HEALTH_188_TMP:-/tmp/awoooi-backup-health-188.prom}"
SSH_188="${BACKUP_STATUS_SSH_188:-ollama@192.168.0.188}"
SSH_OPTS="${BACKUP_STATUS_SSH_OPTS:--o BatchMode=yes -o ConnectTimeout=8 -o StrictHostKeyChecking=accept-new}"
NOTIFY=1
REFRESH=1
FORCE_NOTIFY=0

# Print the command-line help text to stdout.
usage() {
    cat <<'USAGE'
Usage: backup-status.sh [--no-notify] [--no-refresh] [--force-notify]

每日備份心跳報告:
- 讀取 110 / 188 backup_health.prom
- 彙整 cron、script、freshness、last aggregate failure、integrity、restore drill
- 每日寫本機 log;Telegram 只在狀態變化、失敗、或低頻提醒時發送
USAGE
}

while [ "$#" -gt 0 ]; do
    case "$1" in
        --no-notify)
            NOTIFY=0
            ;;
        --no-refresh)
            REFRESH=0
            ;;
        --force-notify)
            FORCE_NOTIFY=1
            ;;
        -h|--help)
            usage
            exit 0
            ;;
        *)
            echo "Unknown argument: $1" >&2
            usage >&2
            exit 2
            ;;
    esac
    shift
done

mkdir -p "${LOG_DIR}"

# Print the arguments prefixed with a "[YYYY-mm-dd HH:MM:SS]" local timestamp.
log_line() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Re-run the local textfile exporter for host 110 unless --no-refresh was given.
# Best effort: the exporter's output and exit status are deliberately ignored;
# a stale or missing textfile is detected by the checks further down.
refresh_110() {
    [ "${REFRESH}" -eq 1 ] || return 0
    if [ -x /home/wooo/scripts/backup-health-textfile-exporter.py ]; then
        AIOPS_HOST_LABEL=110 \
        NODE_EXPORTER_TEXTFILE_DIR=/home/wooo/node_exporter_textfiles \
            /home/wooo/scripts/backup-health-textfile-exporter.py >/dev/null 2>&1 || true
    fi
}

# Fetch host 188's backup_health.prom over ssh into ${TEXTFILE_188_TMP}.
# The target file is truncated first, so any ssh failure leaves it empty and
# is later reported as "host 188 missing". With refresh enabled the remote
# exporter is run before the file is read.
refresh_188() {
    local remote_cmd='cat /home/ollama/node_exporter_textfiles/backup_health.prom 2>/dev/null'

    : > "${TEXTFILE_188_TMP}"
    if [ "${REFRESH}" -eq 1 ]; then
        # Keep this read-only from 110's perspective: refresh the textfile exporter, then read the metric file.
        remote_cmd='AIOPS_HOST_LABEL=188 NODE_EXPORTER_TEXTFILE_DIR=/home/ollama/node_exporter_textfiles /home/ollama/scripts/backup-health-textfile-exporter.py >/dev/null 2>&1 || true; cat /home/ollama/node_exporter_textfiles/backup_health.prom 2>/dev/null'
    fi
    # SSH_OPTS is a whitespace-separated option string and must be word-split.
    # shellcheck disable=SC2086
    ssh ${SSH_OPTS} "${SSH_188}" "${remote_cmd}" \
        > "${TEXTFILE_188_TMP}" 2>/dev/null || true
}

# Count labelled samples of a metric in a textfile.
#   $1 file, $2 metric name, $3 optional value filter (empty = count all).
# Prints 0 when the file is missing or empty.
metric_count() {
    local file="$1"
    local metric="$2"
    local expected="${3:-}"
    if [ ! -s "${file}" ]; then
        echo 0
        return 0
    fi
    awk -v metric="${metric}" -v expected="${expected}" '
        $1 ~ ("^" metric "\\{") {
            if (expected == "" || $2 == expected) count += 1
        }
        END { print count + 0 }
    ' "${file}"
}

# Sum the values of all labelled samples of a metric.
#   $1 file, $2 metric name. Prints 0 when the file is missing or empty.
metric_sum() {
    local file="$1"
    local metric="$2"
    if [ ! -s "${file}" ]; then
        echo 0
        return 0
    fi
    awk -v metric="${metric}" '
        $1 ~ ("^" metric "\\{") { sum += $2 }
        END { print sum + 0 }
    ' "${file}"
}

# Print the raw value of the first labelled sample of a metric, or 0 when the
# file is missing/empty or no sample matches. The value is NOT normalised, so
# it may be float-formatted (e.g. "2.0").
#   $1 file, $2 metric name.
metric_first() {
    local file="$1"
    local metric="$2"
    if [ ! -s "${file}" ]; then
        echo 0
        return 0
    fi
    awk -v metric="${metric}" '
        $1 ~ ("^" metric "\\{") { print $2; found = 1; exit }
        END { if (!found) print 0 }
    ' "${file}"
}

# Print a comma-separated list of the given label's values for every sample of
# the metric whose value is 0. Prints "textfile_missing" when the file is
# missing or empty, and an empty line when nothing matches.
#   $1 file, $2 metric name, $3 label name.
label_list_for_zero() {
    local file="$1"
    local metric="$2"
    local label="$3"
    if [ ! -s "${file}" ]; then
        echo "textfile_missing"
        return 0
    fi
    awk -v metric="${metric}" -v label="${label}" '
        $1 ~ ("^" metric "\\{") && $2 == 0 {
            pattern = label "=\"[^\"]+\""
            if (match($1, pattern)) {
                # Strip the leading <label>=" and the trailing quote.
                value = substr($1, RSTART + length(label) + 2, RLENGTH - length(label) - 3)
                if (out == "") out = value
                else out = out "," value
            }
        }
        END { print out }
    ' "${file}"
}

# Print the first sample of a timestamp metric as local time, or "unknown".
#   $1 file, $2 metric name.
human_metric_time() {
    local file="$1"
    local metric="$2"
    local ts
    ts="$(metric_first "${file}" "${metric}")"
    human_timestamp "${ts}"
}

# Print the value of the first sample of a metric whose label set contains
# <label>="<value>", or 0 when the file is missing/empty or nothing matches.
#   $1 file, $2 metric name, $3 label name, $4 label value.
metric_value_for_label() {
    local file="$1"
    local metric="$2"
    local label="$3"
    local value="$4"
    if [ ! -s "${file}" ]; then
        echo 0
        return 0
    fi
    awk -v metric="${metric}" -v label="${label}" -v value="${value}" '
        $1 ~ ("^" metric "\\{") && $1 ~ (label "=\"" value "\"") {
            print $2
            found = 1
            exit
        }
        END { if (!found) print 0 }
    ' "${file}"
}

# Format an epoch value (fractional part dropped) as "YYYY-mm-dd HH:MM:SS".
# Prints "unknown" for empty/zero input and echoes the raw value back when
# date(1) cannot parse it.
#   $1 epoch seconds.
human_timestamp() {
    local ts="$1"
    case "${ts}" in
        ''|0|0.0)
            echo "unknown"
            ;;
        *)
            date -d "@${ts%.*}" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || echo "${ts}"
            ;;
    esac
}

refresh_110
refresh_188

# A missing or empty textfile counts as a core blocker for that host.
host_110_missing=0
host_188_missing=0
[ -s "${TEXTFILE_110}" ] || host_110_missing=1
[ -s "${TEXTFILE_188_TMP}" ] || host_188_missing=1

configured_missing_110="$(metric_count "${TEXTFILE_110}" "awoooi_backup_job_configured" 0)"
configured_missing_188="$(metric_count "${TEXTFILE_188_TMP}" "awoooi_backup_job_configured" 0)"
script_missing_110="$(metric_count "${TEXTFILE_110}" "awoooi_backup_script_present" 0)"
script_missing_188="$(metric_count "${TEXTFILE_188_TMP}" "awoooi_backup_script_present" 0)"
# NOTE(review): fresh_total_* counts every awoooi_backup_job_fresh sample
# regardless of its value, yet the message renders it as "<n>/13 fresh".
# Confirm whether the number of samples equal to 1 was intended.
fresh_total_110="$(metric_count "${TEXTFILE_110}" "awoooi_backup_job_fresh")"
fresh_total_188="$(metric_count "${TEXTFILE_188_TMP}" "awoooi_backup_job_fresh")"
stale_110="$(metric_count "${TEXTFILE_110}" "awoooi_backup_job_fresh" 0)"
stale_188="$(metric_count "${TEXTFILE_188_TMP}" "awoooi_backup_job_fresh" 0)"
failed_total_110="$(metric_sum "${TEXTFILE_110}" "awoooi_backup_last_run_failed_count")"
failed_total_188="$(metric_sum "${TEXTFILE_188_TMP}" "awoooi_backup_last_run_failed_count")"
integrity_stale_110="$(metric_count "${TEXTFILE_110}" "awoooi_backup_integrity_fresh" 0)"
offsite_configured="$(metric_sum "${TEXTFILE_110}" "awoooi_backup_offsite_configured")"
offsite_fresh="$(metric_sum "${TEXTFILE_110}" "awoooi_backup_offsite_fresh")"
# Use the guarded helper rather than a bare awk call: with a missing textfile
# a bare "awk ... || echo 0" can emit the fallback in addition to awk's own
# END output, leaving a multi-line value in the report.
offsite_rclone_configured="$(metric_value_for_label "${TEXTFILE_110}" "awoooi_backup_offsite_configured" "provider" "rclone")"
offsite_rclone_fresh="$(metric_value_for_label "${TEXTFILE_110}" "awoooi_backup_offsite_fresh" "provider" "rclone")"
escrow_missing="$(metric_first "${TEXTFILE_110}" "awoooi_backup_dr_credential_escrow_missing_count")"

core_blockers=$((host_110_missing + host_188_missing + configured_missing_110 + configured_missing_188 + script_missing_110 + script_missing_188 + stale_110 + stale_188 + failed_total_110 + failed_total_188 + integrity_stale_110))

# DR warnings never fail the heartbeat; they only downgrade success to warning.
# The "2>/dev/null" on each test swallows the "integer expression expected"
# error for non-numeric values, in which case no warning is added.
dr_warnings=0
if [ "${offsite_configured%.*}" -lt 1 ] 2>/dev/null; then
    dr_warnings=$((dr_warnings + 1))
fi
if [ "${offsite_fresh%.*}" -lt 1 ] 2>/dev/null; then
    dr_warnings=$((dr_warnings + 1))
fi
# metric_first returns the raw sample, which may be float-formatted ("2.0").
# Shell arithmetic is integer-only, so add the truncated value that the guard
# already validated; adding the raw value would abort the script under set -e.
escrow_missing_int="${escrow_missing%.*}"
if [ "${escrow_missing_int}" -gt 0 ] 2>/dev/null; then
    dr_warnings=$((dr_warnings + escrow_missing_int))
fi

status="success"
headline="每日備份心跳正常"
if [ "${core_blockers}" -gt 0 ]; then
    status="failed"
    headline="每日備份心跳失敗"
elif [ "${dr_warnings}" -gt 0 ]; then
    status="warning"
    headline="每日備份心跳核心正常但 DR 未完成"
fi

stale_jobs_110="$(label_list_for_zero "${TEXTFILE_110}" "awoooi_backup_job_fresh" "job")"
stale_jobs_188="$(label_list_for_zero "${TEXTFILE_188_TMP}" "awoooi_backup_job_fresh" "job")"
missing_scripts_110="$(label_list_for_zero "${TEXTFILE_110}" "awoooi_backup_script_present" "script")"
missing_scripts_188="$(label_list_for_zero "${TEXTFILE_188_TMP}" "awoooi_backup_script_present" "script")"
backup_all_ts="$(metric_value_for_label "${TEXTFILE_110}" "awoooi_backup_job_last_success_timestamp" "job" "backup_all")"
last_backup_all="$(human_timestamp "${backup_all_ts}")"

message="${headline}; 110備份=${fresh_total_110}/13 fresh failed=${failed_total_110}; 188備份=${fresh_total_188}/2 fresh failed=${failed_total_188}; integrity_stale=${integrity_stale_110}; offsite_configured=${offsite_configured}; offsite_fresh=${offsite_fresh}; rclone_gdrive_configured=${offsite_rclone_configured}; rclone_gdrive_fresh=${offsite_rclone_fresh}; escrow_missing=${escrow_missing}; last_backup_all=${last_backup_all}"
if [ "${core_blockers}" -gt 0 ]; then
    # Only name the offending jobs/scripts when something is actually broken.
    message="${message}; stale110=${stale_jobs_110:-none}; stale188=${stale_jobs_188:-none}; missing_script110=${missing_scripts_110:-none}; missing_script188=${missing_scripts_188:-none}"
fi

{
    log_line "${message}"
    log_line "DETAIL core_blockers=${core_blockers} dr_warnings=${dr_warnings} configured_missing_110=${configured_missing_110} configured_missing_188=${configured_missing_188} script_missing_110=${script_missing_110} script_missing_188=${script_missing_188}"
} | tee -a "${LOG_DIR}/backup-status.log"

if [ "${NOTIFY}" -eq 1 ]; then
    state_dir="${BACKUP_STATUS_STATE_DIR:-${BACKUP_BASE}/state}"
    notify_marker="${state_dir}/backup-status-last-notified"
    notify_success="${BACKUP_STATUS_NOTIFY_SUCCESS:-0}"
    success_interval_hours="${BACKUP_STATUS_SUCCESS_INTERVAL_HOURS:-168}"
    warning_interval_hours="${BACKUP_STATUS_WARNING_INTERVAL_HOURS:-168}"
    failed_interval_hours="${BACKUP_STATUS_FAILED_INTERVAL_HOURS:-6}"
    now_ts="$(date +%s)"

    # Checksum of the state that matters, so a change in *which* things are
    # broken re-triggers a notification even when the status word is the same.
    notify_fingerprint="$(
        printf '%s' "status=${status};core=${core_blockers};dr=${dr_warnings};cm110=${configured_missing_110};cm188=${configured_missing_188};sm110=${script_missing_110};sm188=${script_missing_188};stale110=${stale_jobs_110:-none};stale188=${stale_jobs_188:-none};offsite=${offsite_configured}:${offsite_fresh};escrow=${escrow_missing}" \
            | cksum \
            | awk '{print $1}'
    )"

    last_status=""
    last_fingerprint=""
    last_timestamp=0
    if [ -f "${notify_marker}" ]; then
        last_status="$(awk -F= '$1=="status" {print $2; exit}' "${notify_marker}" 2>/dev/null || true)"
        last_fingerprint="$(awk -F= '$1=="fingerprint" {print $2; exit}' "${notify_marker}" 2>/dev/null || true)"
        last_timestamp="$(awk -F= '$1=="timestamp" {value=int($2)} END {print value + 0}' "${notify_marker}" 2>/dev/null || echo 0)"
        # A marker holding only a YYYY-MM-DD line has no key=value fields:
        # fall back to the file's mtime and treat the state as unchanged.
        if [ "${last_timestamp}" -eq 0 ] && grep -Eq '^[0-9]{4}-[0-9]{2}-[0-9]{2}$' "${notify_marker}" 2>/dev/null; then
            last_timestamp="$(stat -c '%Y' "${notify_marker}" 2>/dev/null || stat -f '%m' "${notify_marker}" 2>/dev/null || echo 0)"
            last_status="${status}"
            last_fingerprint="${notify_fingerprint}"
        fi
    fi

    # Re-notification interval depends on the current status.
    case "${status}" in
        success) interval_hours="${success_interval_hours}" ;;
        failed)  interval_hours="${failed_interval_hours}" ;;
        *)       interval_hours="${warning_interval_hours}" ;;
    esac
    interval_seconds=$((interval_hours * 3600))
    elapsed=$((now_ts - last_timestamp))
    should_notify=0
    notify_reason="throttled"

    mkdir -p "${state_dir}"

    # Decision order matters: force > quiet success > status change >
    # fingerprint change (non-success only) > interval elapsed.
    if [ "${FORCE_NOTIFY}" -eq 1 ]; then
        should_notify=1
        notify_reason="force"
    elif [ "${status}" = "success" ] && [ "${notify_success}" != "1" ] && [ "${last_status}" != "warning" ] && [ "${last_status}" != "failed" ]; then
        notify_reason="success_quiet"
    elif [ "${last_status}" != "" ] && [ "${last_status}" != "${status}" ]; then
        should_notify=1
        notify_reason="status_changed_${last_status}_to_${status}"
    elif [ "${status}" != "success" ] && [ "${last_fingerprint}" != "" ] && [ "${last_fingerprint}" != "${notify_fingerprint}" ]; then
        should_notify=1
        notify_reason="fingerprint_changed"
    elif [ "${last_timestamp}" -eq 0 ] || [ "${elapsed}" -ge "${interval_seconds}" ]; then
        if [ "${status}" != "success" ] || [ "${notify_success}" = "1" ]; then
            should_notify=1
            notify_reason="interval_${interval_hours}h"
        else
            notify_reason="success_quiet"
        fi
    fi

    if [ "${should_notify}" -eq 1 ]; then
        if [ "${status}" = "success" ]; then
            # BACKUP_NOTIFY_SUCCESS is set only for this call; presumably
            # notify_clawbot suppresses success messages without it - confirm
            # against common.sh.
            BACKUP_NOTIFY_SUCCESS=1 notify_clawbot "${status}" "backup-daily-heartbeat" "${message}" 0
        else
            notify_clawbot "${status}" "backup-daily-heartbeat" "${message}" 0
        fi
        # The marker is written only after notify_clawbot returned successfully.
        {
            printf 'timestamp=%s\n' "${now_ts}"
            printf 'status=%s\n' "${status}"
            printf 'fingerprint=%s\n' "${notify_fingerprint}"
            printf 'reason=%s\n' "${notify_reason}"
        } > "${notify_marker}"
    else
        log_line "SKIP_NOTIFY reason=${notify_reason} status=${status} elapsed_seconds=${elapsed} interval_hours=${interval_hours}" | tee -a "${LOG_DIR}/backup-status.log"
    fi
fi

case "${status}" in
    success) exit 0 ;;
    warning) exit "${BACKUP_STATUS_WARNING_EXIT_CODE:-0}" ;;
    failed) exit 2 ;;
    *) exit 3 ;;
esac