Files
awoooi/scripts/backup/backup-status.sh
Your Name cfb866d055
Some checks failed
Ansible Lint / lint (push) Successful in 35s
CD Pipeline / tests (push) Failing after 13s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Failing after 11s
feat(governance): add agent market automation surfaces
2026-06-04 21:50:55 +08:00

343 lines
13 KiB
Bash
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# =============================================================================
# WOOO AIOps - daily backup heartbeat with low-noise Telegram reporting
# =============================================================================
set -euo pipefail

# Directory holding this script; used to locate the optional helper library.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Without common.sh next to the script, fall back to a no-op notify_clawbot so
# the rest of the script can call it unconditionally.
if [[ ! -f "${SCRIPT_DIR}/common.sh" ]]; then
  notify_clawbot() { return 0; }
else
  # shellcheck disable=SC1091
  source "${SCRIPT_DIR}/common.sh"
fi

# --- Paths (each one can be overridden through the environment) --------------
BACKUP_BASE="${BACKUP_BASE:-/backup}"
LOG_DIR="${BACKUP_LOG_DIR:-${BACKUP_BASE}/logs}"
TEXTFILE_110="${BACKUP_HEALTH_110_PROM:-/home/wooo/node_exporter_textfiles/backup_health.prom}"
TEXTFILE_188_TMP="${BACKUP_HEALTH_188_TMP:-/tmp/awoooi-backup-health-188.prom}"

# --- SSH target and options for the second host -------------------------------
SSH_188="${BACKUP_STATUS_SSH_188:-ollama@192.168.0.188}"
SSH_OPTS="${BACKUP_STATUS_SSH_OPTS:--o BatchMode=yes -o ConnectTimeout=8 -o StrictHostKeyChecking=accept-new}"

# --- Flag defaults (changed by the command-line switches) ---------------------
NOTIFY=1
REFRESH=1
FORCE_NOTIFY=0
# Print the command-line help text to stdout.
# The heredoc delimiter is quoted, so the text is emitted literally.
usage() {
  cat <<'USAGE'
Usage: backup-status.sh [--no-notify] [--no-refresh] [--force-notify]
每日備份心跳報告:
- 讀取 110 / 188 backup_health.prom
- 彙整 cron、script、freshness、last aggregate failure、integrity、restore drill
- 每日寫本機 log; Telegram 只在狀態變化、失敗、或低頻提醒時發送
USAGE
}
# Consume the command-line switches one at a time. Each recognised switch
# flips one of the NOTIFY / REFRESH / FORCE_NOTIFY flags; help prints the usage
# text and exits 0; anything else is reported on stderr and exits 2.
until [ "$#" -eq 0 ]; do
  case "$1" in
    --no-notify) NOTIFY=0 ;;
    --no-refresh) REFRESH=0 ;;
    --force-notify) FORCE_NOTIFY=1 ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      printf '%s\n' "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
  shift
done

# The log directory must exist before the first report line is appended.
mkdir -p "${LOG_DIR}"
# Write one log record to stdout: "[YYYY-MM-DD HH:MM:SS] <all arguments>".
# Arguments: any number of words; they are joined with single spaces.
log_line() {
  local stamp
  # A failing date(1) must not abort the caller; the stamp is just left empty.
  stamp="$(date '+%Y-%m-%d %H:%M:%S')" || true
  printf '[%s] %s\n' "${stamp}" "$*"
}
# Re-run the local textfile exporter so the 110 metrics file is current.
# Globals: REFRESH (read) - anything other than 1 skips the refresh.
# Returns: always 0; the exporter is best-effort and its output is discarded.
refresh_110() {
  local exporter=/home/wooo/scripts/backup-health-textfile-exporter.py

  if ! [ "${REFRESH}" -eq 1 ]; then
    return 0
  fi
  if ! [ -x "${exporter}" ]; then
    return 0
  fi

  AIOPS_HOST_LABEL=110 \
    NODE_EXPORTER_TEXTFILE_DIR=/home/wooo/node_exporter_textfiles \
    "${exporter}" >/dev/null 2>&1 || true
}
# Fetch the backup health metrics of the remote host into TEXTFILE_188_TMP.
# Globals:
#   TEXTFILE_188_TMP (read) - local file that receives the remote metrics
#   REFRESH          (read) - 1 = re-run the remote exporter before reading
#   SSH_OPTS         (read) - whitespace-separated ssh options
#   SSH_188          (read) - ssh destination
# Returns: always 0 for the ssh step; an unreachable host simply leaves the
#          local file empty, which callers detect with `[ -s ... ]`.
refresh_188() {
  local -a ssh_opts=()
  local remote_cmd='cat /home/ollama/node_exporter_textfiles/backup_health.prom 2>/dev/null'

  # Truncate first so a failed fetch can never leave stale metrics behind.
  : > "${TEXTFILE_188_TMP}"

  # Split the option string into words without pathname expansion, so an
  # option containing *, ? or [ is passed to ssh verbatim. read returns
  # non-zero at end of input with -d '', hence the "|| true".
  IFS=$' \t\n' read -r -d '' -a ssh_opts <<< "${SSH_OPTS}" || true

  if [ "${REFRESH}" -eq 1 ]; then
    # Keep this read-only from 110's perspective: refresh the textfile
    # exporter, then read the metric file.
    remote_cmd="AIOPS_HOST_LABEL=188 NODE_EXPORTER_TEXTFILE_DIR=/home/ollama/node_exporter_textfiles /home/ollama/scripts/backup-health-textfile-exporter.py >/dev/null 2>&1 || true; ${remote_cmd}"
  fi

  # The ${arr[@]+...} form keeps an empty option list safe under `set -u`
  # on older bash releases.
  ssh ${ssh_opts[@]+"${ssh_opts[@]}"} "${SSH_188}" "${remote_cmd}" \
    > "${TEXTFILE_188_TMP}" 2>/dev/null || true
}
# Count the labelled samples of one metric in a Prometheus textfile.
# Arguments:
#   $1 - path of the textfile
#   $2 - metric name (only "name{...}" samples are considered)
#   $3 - optional value; when given, only samples equal to it are counted
# Outputs: the count on stdout; 0 when the file is missing or empty.
metric_count() {
  local prom_file="$1" name="$2" want="${3:-}"

  if [[ -s "${prom_file}" ]]; then
    awk -v metric="${name}" -v expected="${want}" '
      BEGIN { series = "^" metric "\\{"; hits = 0 }
      $1 !~ series { next }
      expected == "" || $2 == expected { hits++ }
      END { print hits + 0 }
    ' "${prom_file}"
    return
  fi

  echo 0
}
# Add up the values of every labelled sample of one metric.
# Arguments:
#   $1 - path of the textfile
#   $2 - metric name (only "name{...}" samples are considered)
# Outputs: the sum on stdout; 0 when the file is missing or empty.
metric_sum() {
  local prom_file="$1" name="$2"

  if [[ -s "${prom_file}" ]]; then
    awk -v metric="${name}" '
      BEGIN { series = "^" metric "\\{"; total = 0 }
      $1 ~ series { total += $2 }
      END { print total + 0 }
    ' "${prom_file}"
    return
  fi

  echo 0
}
# Print the value of the first labelled sample of one metric.
# Arguments:
#   $1 - path of the textfile
#   $2 - metric name (only "name{...}" samples are considered)
# Outputs: the raw value field on stdout; 0 when the file is missing or empty
#          or holds no such sample.
metric_first() {
  local prom_file="$1" name="$2"

  [[ -s "${prom_file}" ]] || {
    echo 0
    return 0
  }

  awk -v metric="${name}" '
    BEGIN { series = "^" metric "\\{" }
    $1 ~ series { print $2; seen = 1; exit }
    END { if (seen != 1) print 0 }
  ' "${prom_file}"
}
# List one label's value for every sample of a metric whose value is 0.
# Arguments:
#   $1 - path of the textfile
#   $2 - metric name (only "name{...}" samples are considered)
#   $3 - label name whose value should be reported
# Outputs: comma-separated label values on stdout (empty line when no sample
#          is 0); the literal "textfile_missing" when the file is missing or
#          empty.
label_list_for_zero() {
  local file="$1"
  local metric="$2"
  local label="$3"
  if [ ! -s "${file}" ]; then
    echo "textfile_missing"
    return 0
  fi
  awk -v metric="${metric}" -v label="${label}" '
    $1 ~ ("^" metric "\\{") && $2 == 0 {
      # Anchor on the preceding "{" or "," so that e.g. label "job" cannot
      # match inside a longer label name such as "cron_job".
      pattern = "[{,]" label "=\"[^\"]+\""
      if (match($1, pattern)) {
        # Skip the anchor character, the label name, "=" and the opening
        # quote; drop the closing quote from the length.
        value = substr($1, RSTART + length(label) + 3, RLENGTH - length(label) - 4)
        if (out == "") out = value
        else out = out "," value
      }
    }
    END { print out }
  ' "${file}"
}
# Render the first sample of a timestamp metric as a local date/time string.
# Arguments:
#   $1 - path of the textfile
#   $2 - metric name
# Outputs: "YYYY-MM-DD HH:MM:SS", "unknown" for an empty or zero value, or the
#          raw value when date(1) cannot convert it.
human_metric_time() {
  local file="$1"
  local metric="$2"
  local raw_ts

  raw_ts="$(metric_first "${file}" "${metric}")"
  # Formatting rules are shared with human_timestamp instead of duplicated.
  human_timestamp "${raw_ts}"
}
# Print the value of the first sample of a metric that carries label="value".
# Arguments:
#   $1 - path of the textfile
#   $2 - metric name (only "name{...}" samples are considered)
#   $3 - label name
#   $4 - label value, compared literally
# Outputs: the raw value field on stdout; 0 when the file is missing or empty
#          or no sample matches.
metric_value_for_label() {
  local file="$1"
  local metric="$2"
  local label="$3"
  local value="$4"
  if [ ! -s "${file}" ]; then
    echo 0
    return 0
  fi
  awk -v metric="${metric}" -v label="${label}" -v value="${value}" '
    BEGIN { needle = label "=\"" value "\"" }
    index($1, metric "{") == 1 {
      labels = substr($1, length(metric) + 1)
      # Literal comparison anchored on "{" or ",": label "job" must not match
      # "cron_job", and "." in the value must not act as a regex wildcard.
      if (index(labels, "{" needle) || index(labels, "," needle)) {
        print $2
        found = 1
        exit
      }
    }
    END { if (!found) print 0 }
  ' "${file}"
}
# Render a Unix timestamp as a local "YYYY-MM-DD HH:MM:SS" string.
# Arguments:
#   $1 - seconds since the epoch; may carry a fraction ("1700000000.5") or be
#        written in exponent form ("1.7e+09")
# Outputs: the formatted time; "unknown" for an empty or zero value; the raw
#          argument when it cannot be converted.
human_timestamp() {
  local ts="$1"
  local epoch
  local number_re='^[+-]?[0-9]+(\.[0-9]*)?([eE][+-]?[0-9]+)?$'

  case "${ts}" in
    '' | 0 | 0.0)
      echo "unknown"
      return 0
      ;;
  esac

  if [[ "${ts}" =~ ${number_re} ]]; then
    # Truncate to whole seconds through awk: plain "${ts%.*}" would turn the
    # exponent form "1.7e+09" into "1" and report a date in 1970.
    epoch="$(awk -v t="${ts}" 'BEGIN { printf "%.0f", int(t) }' 2>/dev/null)" \
      || epoch="${ts%.*}"
  else
    epoch="${ts%.*}"
  fi

  # Other spellings of zero ("0.00", "0e+00") also mean "never recorded".
  if [ "${epoch#-}" = "0" ]; then
    echo "unknown"
    return 0
  fi

  date -d "@${epoch}" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || echo "${ts}"
}
# -----------------------------------------------------------------------------
# Collect the metrics of both hosts, derive the heartbeat status and log it.
# -----------------------------------------------------------------------------
refresh_110
refresh_188

# A missing or empty textfile is a core blocker for that host.
host_110_missing=0
host_188_missing=0
[ -s "${TEXTFILE_110}" ] || host_110_missing=1
[ -s "${TEXTFILE_188_TMP}" ] || host_188_missing=1

# Samples with value 0 flag an unconfigured job, an absent script or a stale job.
configured_missing_110="$(metric_count "${TEXTFILE_110}" "awoooi_backup_job_configured" 0)"
configured_missing_188="$(metric_count "${TEXTFILE_188_TMP}" "awoooi_backup_job_configured" 0)"
script_missing_110="$(metric_count "${TEXTFILE_110}" "awoooi_backup_script_present" 0)"
script_missing_188="$(metric_count "${TEXTFILE_188_TMP}" "awoooi_backup_script_present" 0)"
# Count only samples reporting 1, so the "N/13 fresh" figure in the message
# does not include stale jobs.
fresh_total_110="$(metric_count "${TEXTFILE_110}" "awoooi_backup_job_fresh" 1)"
fresh_total_188="$(metric_count "${TEXTFILE_188_TMP}" "awoooi_backup_job_fresh" 1)"
stale_110="$(metric_count "${TEXTFILE_110}" "awoooi_backup_job_fresh" 0)"
stale_188="$(metric_count "${TEXTFILE_188_TMP}" "awoooi_backup_job_fresh" 0)"
failed_total_110="$(metric_sum "${TEXTFILE_110}" "awoooi_backup_last_run_failed_count")"
failed_total_188="$(metric_sum "${TEXTFILE_188_TMP}" "awoooi_backup_last_run_failed_count")"
integrity_stale_110="$(metric_count "${TEXTFILE_110}" "awoooi_backup_integrity_fresh" 0)"

# Disaster-recovery metrics are read from the 110 textfile only.
offsite_configured="$(metric_sum "${TEXTFILE_110}" "awoooi_backup_offsite_configured")"
offsite_fresh="$(metric_sum "${TEXTFILE_110}" "awoooi_backup_offsite_fresh")"
# Go through the shared helper: it prints a single 0 for a missing textfile,
# whereas an inline awk with "|| echo 0" can emit the fallback twice.
offsite_rclone_configured="$(metric_value_for_label "${TEXTFILE_110}" "awoooi_backup_offsite_configured" "provider" "rclone")"
offsite_rclone_fresh="$(metric_value_for_label "${TEXTFILE_110}" "awoooi_backup_offsite_fresh" "provider" "rclone")"
escrow_missing="$(metric_first "${TEXTFILE_110}" "awoooi_backup_dr_credential_escrow_missing_count")"

# Any non-zero term below turns the heartbeat into "failed".
core_blockers=$((host_110_missing + host_188_missing + configured_missing_110 + configured_missing_188 + script_missing_110 + script_missing_188 + stale_110 + stale_188 + failed_total_110 + failed_total_188 + integrity_stale_110))

# DR gaps only downgrade the status to "warning". The "2>/dev/null" hides the
# complaint of [ when a value is not an integer; the test is then simply false.
dr_warnings=0
if [ "${offsite_configured%.*}" -lt 1 ] 2>/dev/null; then
  dr_warnings=$((dr_warnings + 1))
fi
if [ "${offsite_fresh%.*}" -lt 1 ] 2>/dev/null; then
  dr_warnings=$((dr_warnings + 1))
fi
if [ "${escrow_missing%.*}" -gt 0 ] 2>/dev/null; then
  # Add the integer part only: shell arithmetic rejects a value such as "2.0".
  dr_warnings=$((dr_warnings + ${escrow_missing%.*}))
fi

status="success"
headline="每日備份心跳正常"
if [ "${core_blockers}" -gt 0 ]; then
  status="failed"
  headline="每日備份心跳失敗"
elif [ "${dr_warnings}" -gt 0 ]; then
  status="warning"
  headline="每日備份心跳核心正常但 DR 未完成"
fi

# Names of the offending jobs and scripts; appended to the message on failure.
stale_jobs_110="$(label_list_for_zero "${TEXTFILE_110}" "awoooi_backup_job_fresh" "job")"
stale_jobs_188="$(label_list_for_zero "${TEXTFILE_188_TMP}" "awoooi_backup_job_fresh" "job")"
missing_scripts_110="$(label_list_for_zero "${TEXTFILE_110}" "awoooi_backup_script_present" "script")"
missing_scripts_188="$(label_list_for_zero "${TEXTFILE_188_TMP}" "awoooi_backup_script_present" "script")"
backup_all_ts="$(metric_value_for_label "${TEXTFILE_110}" "awoooi_backup_job_last_success_timestamp" "job" "backup_all")"
last_backup_all="$(human_timestamp "${backup_all_ts}")"

message="${headline}; 110備份=${fresh_total_110}/13 fresh failed=${failed_total_110}; 188備份=${fresh_total_188}/2 fresh failed=${failed_total_188}; integrity_stale=${integrity_stale_110}; offsite_configured=${offsite_configured}; offsite_fresh=${offsite_fresh}; rclone_gdrive_configured=${offsite_rclone_configured}; rclone_gdrive_fresh=${offsite_rclone_fresh}; escrow_missing=${escrow_missing}; last_backup_all=${last_backup_all}"
if [ "${core_blockers}" -gt 0 ]; then
  message="${message}; stale110=${stale_jobs_110:-none}; stale188=${stale_jobs_188:-none}; missing_script110=${missing_scripts_110:-none}; missing_script188=${missing_scripts_188:-none}"
fi

# The report is written on every run, independent of Telegram throttling.
{
  log_line "${message}"
  log_line "DETAIL core_blockers=${core_blockers} dr_warnings=${dr_warnings} configured_missing_110=${configured_missing_110} configured_missing_188=${configured_missing_188} script_missing_110=${script_missing_110} script_missing_188=${script_missing_188}"
} | tee -a "${LOG_DIR}/backup-status.log"
# -----------------------------------------------------------------------------
# Throttled notification, followed by the status-derived exit code.
# -----------------------------------------------------------------------------
if [ "${NOTIFY}" -eq 1 ]; then
  state_dir="${BACKUP_STATUS_STATE_DIR:-${BACKUP_BASE}/state}"
  notify_marker="${state_dir}/backup-status-last-notified"
  notify_success="${BACKUP_STATUS_NOTIFY_SUCCESS:-0}"
  success_interval_hours="${BACKUP_STATUS_SUCCESS_INTERVAL_HOURS:-168}"
  warning_interval_hours="${BACKUP_STATUS_WARNING_INTERVAL_HOURS:-168}"
  failed_interval_hours="${BACKUP_STATUS_FAILED_INTERVAL_HOURS:-6}"
  now_ts="$(date +%s)"

  # Checksum over the facts that define "the same situation"; a different
  # checksum on a non-success run triggers a fresh notification.
  fingerprint_input="status=${status};core=${core_blockers};dr=${dr_warnings};cm110=${configured_missing_110};cm188=${configured_missing_188};sm110=${script_missing_110};sm188=${script_missing_188};stale110=${stale_jobs_110:-none};stale188=${stale_jobs_188:-none};offsite=${offsite_configured}:${offsite_fresh};escrow=${escrow_missing}"
  notify_fingerprint="$(printf '%s' "${fingerprint_input}" | cksum | awk '{print $1}')"

  # What was recorded when the last notification went out, if anything.
  last_status=""
  last_fingerprint=""
  last_timestamp=0
  if [[ -f "${notify_marker}" ]]; then
    last_status="$(awk -F= '$1=="status" {print $2; exit}' "${notify_marker}" 2>/dev/null || true)"
    last_fingerprint="$(awk -F= '$1=="fingerprint" {print $2; exit}' "${notify_marker}" 2>/dev/null || true)"
    last_timestamp="$(awk -F= '$1=="timestamp" {value=int($2)} END {print value + 0}' "${notify_marker}" 2>/dev/null || echo 0)"
    # A marker holding just a YYYY-MM-DD line has no key=value fields: take its
    # modification time and treat the current state as already notified.
    if [ "${last_timestamp}" -eq 0 ] && grep -Eq '^[0-9]{4}-[0-9]{2}-[0-9]{2}$' "${notify_marker}" 2>/dev/null; then
      last_timestamp="$(stat -c '%Y' "${notify_marker}" 2>/dev/null || stat -f '%m' "${notify_marker}" 2>/dev/null || echo 0)"
      last_status="${status}"
      last_fingerprint="${notify_fingerprint}"
    fi
  fi

  # Reminder interval for the current status.
  case "${status}" in
    success) interval_hours="${success_interval_hours}" ;;
    failed) interval_hours="${failed_interval_hours}" ;;
    *) interval_hours="${warning_interval_hours}" ;;
  esac
  interval_seconds=$((interval_hours * 3600))
  elapsed=$((now_ts - last_timestamp))

  should_notify=0
  notify_reason="throttled"
  mkdir -p "${state_dir}"

  # Order matters: the first matching rule decides.
  if [ "${FORCE_NOTIFY}" -eq 1 ]; then
    should_notify=1
    notify_reason="force"
  elif [[ "${status}" == "success" && "${notify_success}" != "1" && "${last_status}" != "warning" && "${last_status}" != "failed" ]]; then
    # Healthy, and not a recovery from a reported problem: stay silent.
    notify_reason="success_quiet"
  elif [[ -n "${last_status}" && "${last_status}" != "${status}" ]]; then
    should_notify=1
    notify_reason="status_changed_${last_status}_to_${status}"
  elif [[ "${status}" != "success" && -n "${last_fingerprint}" && "${last_fingerprint}" != "${notify_fingerprint}" ]]; then
    should_notify=1
    notify_reason="fingerprint_changed"
  elif [ "${last_timestamp}" -eq 0 ] || [ "${elapsed}" -ge "${interval_seconds}" ]; then
    if [[ "${status}" != "success" || "${notify_success}" == "1" ]]; then
      should_notify=1
      notify_reason="interval_${interval_hours}h"
    else
      notify_reason="success_quiet"
    fi
  fi

  if [ "${should_notify}" -eq 1 ]; then
    if [[ "${status}" == "success" ]]; then
      BACKUP_NOTIFY_SUCCESS=1 notify_clawbot "${status}" "backup-daily-heartbeat" "${message}" 0
    else
      notify_clawbot "${status}" "backup-daily-heartbeat" "${message}" 0
    fi
    # Record what was sent so later runs can throttle against it.
    printf 'timestamp=%s\nstatus=%s\nfingerprint=%s\nreason=%s\n' \
      "${now_ts}" "${status}" "${notify_fingerprint}" "${notify_reason}" \
      > "${notify_marker}"
  else
    log_line "SKIP_NOTIFY reason=${notify_reason} status=${status} elapsed_seconds=${elapsed} interval_hours=${interval_hours}" | tee -a "${LOG_DIR}/backup-status.log"
  fi
fi

# Exit code: 0 success, configurable (default 0) for warning, 2 failed,
# 3 for any other status value.
if [[ "${status}" == "success" ]]; then
  exit 0
elif [[ "${status}" == "warning" ]]; then
  exit "${BACKUP_STATUS_WARNING_EXIT_CODE:-0}"
elif [[ "${status}" == "failed" ]]; then
  exit 2
else
  exit 3
fi