#!/bin/bash
# =============================================================================
# WOOO AIOps - daily backup heartbeat with low-noise Telegram reporting
# =============================================================================
#
# Reads the backup-health metric files of hosts 110 and 188, condenses them
# into one status line, appends it to a local log and (optionally) notifies.
# Every path and SSH setting below can be overridden through the environment.

set -euo pipefail

# Directory containing this script; used to find the optional helper library.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
if [ -f "${SCRIPT_DIR}/common.sh" ]; then
  # shellcheck disable=SC1091
  source "${SCRIPT_DIR}/common.sh"
else
  # Without common.sh there is no notifier; install a no-op so the report
  # itself still runs and is logged.
  notify_clawbot() { return 0; }
fi

BACKUP_BASE="${BACKUP_BASE:-/backup}"
LOG_DIR="${BACKUP_LOG_DIR:-${BACKUP_BASE}/logs}"
# Metric file of host 110, read from the local filesystem.
TEXTFILE_110="${BACKUP_HEALTH_110_PROM:-/home/wooo/node_exporter_textfiles/backup_health.prom}"
# Local copy of host 188's metric file, filled in over SSH.
# NOTE(review): fixed name under /tmp; consider a private directory if the
# host is shared with untrusted users.
TEXTFILE_188_TMP="${BACKUP_HEALTH_188_TMP:-/tmp/awoooi-backup-health-188.prom}"
SSH_188="${BACKUP_STATUS_SSH_188:-ollama@192.168.0.188}"
# Kept as a single string so one environment variable can override it; it is
# split into words where ssh is invoked.
SSH_OPTS="${BACKUP_STATUS_SSH_OPTS:--o BatchMode=yes -o ConnectTimeout=8 -o StrictHostKeyChecking=accept-new}"

# Command-line switch defaults (see usage).
NOTIFY=1
REFRESH=1
FORCE_NOTIFY=0

# Print the command-line synopsis and a short description of the report.
# Outputs: help text on stdout (callers redirect it to stderr on errors).
usage() {
  cat <<'USAGE'
Usage: backup-status.sh [--no-notify] [--no-refresh] [--force-notify]

每日備份心跳報告:
- 讀取 110 / 188 backup_health.prom
- 彙整 cron、script、freshness、last aggregate failure、integrity、restore drill
- 每日寫本機 log;Telegram 只在狀態變化、失敗、或低頻提醒時發送
USAGE
}

# Translate command-line switches into the NOTIFY / REFRESH / FORCE_NOTIFY
# globals.
# Arguments: the script's own "$@"
# Exits:     0 after printing help for -h/--help; 2 (with help on stderr)
#            for any unrecognised argument.
parse_cli_args() {
  while [ "$#" -gt 0 ]; do
    case "$1" in
      --no-notify)
        NOTIFY=0
        ;;
      --no-refresh)
        REFRESH=0
        ;;
      --force-notify)
        FORCE_NOTIFY=1
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown argument: $1" >&2
        usage >&2
        exit 2
        ;;
    esac
    shift
  done
}

parse_cli_args "$@"

# The log directory must exist before any report line is appended to it.
mkdir -p -- "${LOG_DIR}"

# Write one timestamped record to stdout: "[YYYY-MM-DD HH:MM:SS] <text>".
# Arguments: $* - message words, joined by the first character of IFS
# Outputs:   the formatted line on stdout; callers pipe it to the log file
log_line() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Regenerate host 110's backup-health metric file by running the local
# textfile exporter.
# Globals: REFRESH (read) - the exporter is only run when it equals 1
# Returns: always 0; the exporter's output and failures are discarded on
#          purpose so a broken exporter shows up as stale or missing metrics
#          rather than aborting the heartbeat.
refresh_110() {
  local exporter=/home/wooo/scripts/backup-health-textfile-exporter.py

  [ "${REFRESH}" -eq 1 ] || return 0
  if [ -x "${exporter}" ]; then
    AIOPS_HOST_LABEL=110 \
      NODE_EXPORTER_TEXTFILE_DIR=/home/wooo/node_exporter_textfiles \
      "${exporter}" >/dev/null 2>&1 || true
  fi
}

# Copy host 188's backup-health metric file into TEXTFILE_188_TMP over SSH,
# optionally asking the remote exporter to regenerate it first.
# Globals: REFRESH (read), SSH_OPTS (read), SSH_188 (read),
#          TEXTFILE_188_TMP (file is truncated, then rewritten)
# Returns: 0 even when ssh fails; the output file is then empty or partial
#          and the caller treats an empty file as "host missing".
refresh_188() {
  local read_cmd='cat /home/ollama/node_exporter_textfiles/backup_health.prom 2>/dev/null'
  local refresh_cmd='AIOPS_HOST_LABEL=188 NODE_EXPORTER_TEXTFILE_DIR=/home/ollama/node_exporter_textfiles /home/ollama/scripts/backup-health-textfile-exporter.py >/dev/null 2>&1 || true'
  local remote_cmd="${read_cmd}"
  local -a ssh_opts=()

  # Start from an empty file so stale metrics from a previous run never leak
  # into this report.
  : > "${TEXTFILE_188_TMP}"

  # SSH_OPTS is one whitespace-separated string; split it into words without
  # exposing it to pathname expansion. read returns non-zero at end of input
  # with -d '', hence the || true.
  read -r -d '' -a ssh_opts <<<"${SSH_OPTS}" || true

  if [ "${REFRESH}" -eq 1 ]; then
    # Keep this read-only from 110's perspective: refresh the textfile
    # exporter on 188, then read the metric file.
    remote_cmd="${refresh_cmd}; ${read_cmd}"
  fi

  # The ${arr[@]+...} form keeps an empty option list safe under `set -u`
  # on older bash releases.
  ssh ${ssh_opts[@]+"${ssh_opts[@]}"} "${SSH_188}" "${remote_cmd}" \
    > "${TEXTFILE_188_TMP}" 2>/dev/null || true
}

# Count the samples of one metric in a Prometheus text-format file.
# Arguments: $1 - metric file
#            $2 - metric name
#            $3 - optional value filter; when given, only samples whose value
#                 equals it are counted (awk compares numeric-looking fields
#                 numerically, so "0" also matches "0.0")
# Outputs:   the count on stdout; 0 when the file is missing or empty
metric_count() {
  local file="$1"
  local metric="$2"
  local expected="${3:-}"

  if [ ! -s "${file}" ]; then
    echo 0
    return 0
  fi

  awk -v metric="${metric}" -v expected="${expected}" '
    # A sample belongs to the metric when its series is "metric{...}" or the
    # label-less form "metric"; comment lines start with "#" and never match.
    $1 == metric || index($1, metric "{") == 1 {
      if (expected == "" || $2 == expected) count += 1
    }
    END { print count + 0 }
  ' "${file}"
}

# Add up the values of every sample of one metric.
# Arguments: $1 - metric file
#            $2 - metric name
# Outputs:   the sum on stdout; 0 when the file is missing or empty or the
#            metric has no samples
metric_sum() {
  local file="$1"
  local metric="$2"

  if [ ! -s "${file}" ]; then
    echo 0
    return 0
  fi

  awk -v metric="${metric}" '
    # Accept both "metric{...}" and the label-less "metric" series form.
    $1 == metric || index($1, metric "{") == 1 { sum += $2 }
    END { print sum + 0 }
  ' "${file}"
}

# Print the value of the first sample of one metric, exactly as written in
# the file (so it may be a float such as "3.0").
# Arguments: $1 - metric file
#            $2 - metric name
# Outputs:   the raw value on stdout; 0 when the file is missing or empty or
#            the metric has no samples
metric_first() {
  local file="$1"
  local metric="$2"

  if [ ! -s "${file}" ]; then
    echo 0
    return 0
  fi

  awk -v metric="${metric}" '
    # Accept both "metric{...}" and the label-less "metric" series form.
    $1 == metric || index($1, metric "{") == 1 { print $2; found = 1; exit }
    END { if (!found) print 0 }
  ' "${file}"
}

# List the values of one label across every sample of a metric whose value
# is zero, e.g. the names of all jobs that are not fresh.
# Arguments: $1 - metric file
#            $2 - metric name
#            $3 - label whose value should be reported
# Outputs:   comma-separated label values on stdout (an empty line when no
#            sample is zero); the literal "textfile_missing" when the file is
#            missing or empty
label_list_for_zero() {
  local file="$1"
  local metric="$2"
  local label="$3"

  if [ ! -s "${file}" ]; then
    echo "textfile_missing"
    return 0
  fi

  awk -v metric="${metric}" -v label="${label}" '
    index($1, metric "{") == 1 && $2 == 0 {
      # Require "{" or "," in front of the label name so that e.g. "job"
      # cannot match inside a different label such as "cronjob".
      pattern = "[{,]" label "=\"[^\"]+\""
      if (match($1, pattern)) {
        # Skip the delimiter, the label name and the two characters ="
        # then drop the closing quote.
        value = substr($1, RSTART + length(label) + 3, RLENGTH - length(label) - 4)
        out = (out == "") ? value : out "," value
      }
    }
    END { print out }
  ' "${file}"
}

# Render the first sample of a timestamp metric as local wall-clock time.
# Arguments: $1 - metric file
#            $2 - metric name whose value is a Unix timestamp
# Outputs:   whatever human_timestamp prints for that value ("unknown" when
#            the metric is absent, because metric_first then yields 0)
human_metric_time() {
  local file="$1"
  local metric="$2"
  local ts

  # Assigned on its own line so a failing lookup is not masked by `local`.
  ts="$(metric_first "${file}" "${metric}")"
  # Formatting lives in human_timestamp so both entry points stay in step.
  human_timestamp "${ts}"
}

# Print the value of the first sample of a metric that carries a given
# label="value" pair.
# Arguments: $1 - metric file
#            $2 - metric name
#            $3 - label name
#            $4 - label value, matched literally (not as a regular expression)
# Outputs:   the raw sample value on stdout; 0 when the file is missing or
#            empty or no sample matches
metric_value_for_label() {
  local file="$1"
  local metric="$2"
  local label="$3"
  local value="$4"

  if [ ! -s "${file}" ]; then
    echo 0
    return 0
  fi

  awk -v metric="${metric}" -v label="${label}" -v value="${value}" '
    # True when the series has exactly name="val"; the pair must follow "{"
    # or "," so a label whose name merely ends in `name` does not count.
    function has_label(series, name, val) {
      return index(series, "{" name "=\"" val "\"") > 0 ||
             index(series, "," name "=\"" val "\"") > 0
    }
    index($1, metric "{") == 1 && has_label($1, label, value) {
      print $2
      found = 1
      exit
    }
    END { if (!found) print 0 }
  ' "${file}"
}

# Format a Unix timestamp taken from a metric file as "YYYY-MM-DD HH:MM:SS"
# in the local time zone.
# Arguments: $1 - timestamp; may be an integer, a decimal ("1700000000.5")
#                 or scientific notation ("1.7e+09")
# Outputs:   the formatted time; "unknown" for an empty or zero timestamp;
#            the input unchanged when it cannot be interpreted as a time
human_timestamp() {
  local ts="${1:-}"
  local epoch

  case "${ts}" in
    '' | 0 | 0.0)
      echo "unknown"
      return 0
      ;;
  esac

  # Let printf parse the float so exponent notation becomes plain digits,
  # then drop the fraction. Cutting at the first "." of the raw text would
  # turn "1.7e+09" into "1".
  if epoch="$(LC_ALL=C printf '%.6f' "${ts}" 2>/dev/null)"; then
    epoch="${epoch%.*}"
  else
    epoch="${ts%.*}"
  fi

  case "${epoch}" in
    0 | -0)
      # Any spelling of zero (0.00, 0e0, ...) means "never recorded".
      echo "unknown"
      return 0
      ;;
  esac

  date -d "@${epoch}" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || echo "${ts}"
}

# Bring both hosts' metric files up to date before they are evaluated.
refresh_110
refresh_188

# Reduce a Prometheus sample value to a plain base-10 integer that is safe to
# use in shell arithmetic. Metric files may carry floats ("3.0") or exponent
# notation, which $(( )) rejects.
# Arguments: $1 - raw sample value
# Outputs:   the value truncated towards zero; 0 for empty or non-numeric
#            input (including NaN and Inf)
metric_int() {
  local raw="${1:-0}"
  local whole

  if ! whole="$(LC_ALL=C printf '%.6f' "${raw}" 2>/dev/null)"; then
    whole=0
  fi
  whole="${whole%.*}"
  case "${whole}" in
    '' | - | *[!0-9-]*) whole=0 ;;
  esac
  printf '%s\n' "${whole}"
}

# A metric file that is absent or empty counts as a blocker of its own.
host_110_missing=0
host_188_missing=0
[ -s "${TEXTFILE_110}" ] || host_110_missing=1
[ -s "${TEXTFILE_188_TMP}" ] || host_188_missing=1

# Per-host counts of samples reporting 0 (not configured / script absent /
# not fresh), plus the number of freshness series and the failure totals.
configured_missing_110="$(metric_count "${TEXTFILE_110}" "awoooi_backup_job_configured" 0)"
configured_missing_188="$(metric_count "${TEXTFILE_188_TMP}" "awoooi_backup_job_configured" 0)"
script_missing_110="$(metric_count "${TEXTFILE_110}" "awoooi_backup_script_present" 0)"
script_missing_188="$(metric_count "${TEXTFILE_188_TMP}" "awoooi_backup_script_present" 0)"
fresh_total_110="$(metric_count "${TEXTFILE_110}" "awoooi_backup_job_fresh")"
fresh_total_188="$(metric_count "${TEXTFILE_188_TMP}" "awoooi_backup_job_fresh")"
stale_110="$(metric_count "${TEXTFILE_110}" "awoooi_backup_job_fresh" 0)"
stale_188="$(metric_count "${TEXTFILE_188_TMP}" "awoooi_backup_job_fresh" 0)"
failed_total_110="$(metric_sum "${TEXTFILE_110}" "awoooi_backup_last_run_failed_count")"
failed_total_188="$(metric_sum "${TEXTFILE_188_TMP}" "awoooi_backup_last_run_failed_count")"
integrity_stale_110="$(metric_count "${TEXTFILE_110}" "awoooi_backup_integrity_fresh" 0)"

# Disaster-recovery metrics are only read from host 110's file.
offsite_configured="$(metric_sum "${TEXTFILE_110}" "awoooi_backup_offsite_configured")"
offsite_fresh="$(metric_sum "${TEXTFILE_110}" "awoooi_backup_offsite_fresh")"
# Looked up through the shared helper so a missing file yields a single "0".
offsite_rclone_configured="$(metric_value_for_label "${TEXTFILE_110}" "awoooi_backup_offsite_configured" "provider" "rclone")"
offsite_rclone_fresh="$(metric_value_for_label "${TEXTFILE_110}" "awoooi_backup_offsite_fresh" "provider" "rclone")"
escrow_missing="$(metric_first "${TEXTFILE_110}" "awoooi_backup_dr_credential_escrow_missing_count")"

# Raw values stay untouched for the report text; arithmetic uses integer
# copies so a float such as "3.0" cannot abort the script under `set -e`.
failed_int_110="$(metric_int "${failed_total_110}")"
failed_int_188="$(metric_int "${failed_total_188}")"
offsite_configured_int="$(metric_int "${offsite_configured}")"
offsite_fresh_int="$(metric_int "${offsite_fresh}")"
escrow_missing_int="$(metric_int "${escrow_missing}")"

# Anything counted here turns the heartbeat into "failed".
core_blockers=$((host_110_missing + host_188_missing + configured_missing_110 + configured_missing_188 + script_missing_110 + script_missing_188 + stale_110 + stale_188 + failed_int_110 + failed_int_188 + integrity_stale_110))

# DR gaps only downgrade an otherwise healthy heartbeat to "warning".
dr_warnings=0
if [ "${offsite_configured_int}" -lt 1 ]; then
  dr_warnings=$((dr_warnings + 1))
fi
if [ "${offsite_fresh_int}" -lt 1 ]; then
  dr_warnings=$((dr_warnings + 1))
fi
if [ "${escrow_missing_int}" -gt 0 ]; then
  dr_warnings=$((dr_warnings + escrow_missing_int))
fi

# Overall verdict: any core blocker means "failed"; DR gaps alone mean
# "warning"; otherwise "success".
status="success"
headline="每日備份心跳正常"
if [ "${core_blockers}" -gt 0 ]; then
  status="failed"
  headline="每日備份心跳失敗"
elif [ "${dr_warnings}" -gt 0 ]; then
  status="warning"
  headline="每日備份心跳核心正常但 DR 未完成"
fi

# Names behind the counts, for the failure detail and the notify fingerprint.
stale_jobs_110="$(label_list_for_zero "${TEXTFILE_110}" "awoooi_backup_job_fresh" "job")"
stale_jobs_188="$(label_list_for_zero "${TEXTFILE_188_TMP}" "awoooi_backup_job_fresh" "job")"
missing_scripts_110="$(label_list_for_zero "${TEXTFILE_110}" "awoooi_backup_script_present" "script")"
missing_scripts_188="$(label_list_for_zero "${TEXTFILE_188_TMP}" "awoooi_backup_script_present" "script")"
backup_all_ts="$(metric_value_for_label "${TEXTFILE_110}" "awoooi_backup_job_last_success_timestamp" "job" "backup_all")"
last_backup_all="$(human_timestamp "${backup_all_ts}")"

# fresh_total_* counts every freshness series, stale ones included, so the
# number of jobs that really are fresh is that total minus the stale ones.
fresh_ok_110=$((fresh_total_110 - stale_110))
fresh_ok_188=$((fresh_total_188 - stale_188))

# NOTE(review): 13 and 2 look like the expected job counts per host; they are
# hard-coded here and must be kept in step with the exporters - confirm.
message="${headline}; 110備份=${fresh_ok_110}/13 fresh failed=${failed_total_110}; 188備份=${fresh_ok_188}/2 fresh failed=${failed_total_188}; integrity_stale=${integrity_stale_110}; offsite_configured=${offsite_configured}; offsite_fresh=${offsite_fresh}; rclone_gdrive_configured=${offsite_rclone_configured}; rclone_gdrive_fresh=${offsite_rclone_fresh}; escrow_missing=${escrow_missing}; last_backup_all=${last_backup_all}"

# Only a failing heartbeat spells out which jobs and scripts are affected.
if [ "${core_blockers}" -gt 0 ]; then
  message="${message}; stale110=${stale_jobs_110:-none}; stale188=${stale_jobs_188:-none}; missing_script110=${missing_scripts_110:-none}; missing_script188=${missing_scripts_188:-none}"
fi

# Record the report on every run: the summary line plus one DETAIL line with
# the individual counters. tee echoes both to stdout and appends them to the
# log file.
{
  log_line "${message}"
  log_line "DETAIL core_blockers=${core_blockers} dr_warnings=${dr_warnings} configured_missing_110=${configured_missing_110} configured_missing_188=${configured_missing_188} script_missing_110=${script_missing_110} script_missing_188=${script_missing_188}"
} | tee -a "${LOG_DIR}/backup-status.log"

# Decide whether this run should send a notification, and remember the last
# one that was sent. A marker file stores timestamp, status, fingerprint and
# reason of the previous notification.
if [ "${NOTIFY}" -eq 1 ]; then
  state_dir="${BACKUP_STATUS_STATE_DIR:-${BACKUP_BASE}/state}"
  notify_marker="${state_dir}/backup-status-last-notified"
  notify_success="${BACKUP_STATUS_NOTIFY_SUCCESS:-0}"
  success_interval_hours="${BACKUP_STATUS_SUCCESS_INTERVAL_HOURS:-168}"
  warning_interval_hours="${BACKUP_STATUS_WARNING_INTERVAL_HOURS:-168}"
  failed_interval_hours="${BACKUP_STATUS_FAILED_INTERVAL_HOURS:-6}"
  now_ts="$(date +%s)"

  # Checksum over the fields that describe *what* is wrong, so a change in
  # the problem set can be told apart from the same problem persisting.
  notify_fingerprint="$(
    printf '%s' "status=${status};core=${core_blockers};dr=${dr_warnings};cm110=${configured_missing_110};cm188=${configured_missing_188};sm110=${script_missing_110};sm188=${script_missing_188};stale110=${stale_jobs_110:-none};stale188=${stale_jobs_188:-none};offsite=${offsite_configured}:${offsite_fresh};escrow=${escrow_missing}" \
      | cksum \
      | awk '{print $1}'
  )"

  last_status=""
  last_fingerprint=""
  last_timestamp=0
  if [ -f "${notify_marker}" ]; then
    last_status="$(awk -F= '$1=="status" {print $2; exit}' "${notify_marker}" 2>/dev/null || true)"
    last_fingerprint="$(awk -F= '$1=="fingerprint" {print $2; exit}' "${notify_marker}" 2>/dev/null || true)"
    last_timestamp="$(awk -F= '$1=="timestamp" {value=int($2)} END {print value + 0}' "${notify_marker}" 2>/dev/null || echo 0)"
    # A marker holding only a YYYY-MM-DD line has no key=value fields: use
    # the file's mtime as the last notification time and assume nothing has
    # changed since, so it does not trigger an immediate notification.
    if [ "${last_timestamp}" -eq 0 ] && grep -Eq '^[0-9]{4}-[0-9]{2}-[0-9]{2}$' "${notify_marker}" 2>/dev/null; then
      last_timestamp="$(stat -c '%Y' "${notify_marker}" 2>/dev/null || stat -f '%m' "${notify_marker}" 2>/dev/null || echo 0)"
      last_status="${status}"
      last_fingerprint="${notify_fingerprint}"
    fi
  fi

  # Reminder interval for the current status.
  interval_hours="${warning_interval_hours}"
  default_interval_hours=168
  if [ "${status}" = "success" ]; then
    interval_hours="${success_interval_hours}"
  elif [ "${status}" = "failed" ]; then
    interval_hours="${failed_interval_hours}"
    default_interval_hours=6
  fi
  # An override that is not a plain non-negative integer would abort the
  # arithmetic below; fall back to the built-in default and say so.
  case "${interval_hours}" in
    '' | *[!0-9]*)
      log_line "WARN invalid notify interval '${interval_hours}' for status=${status}; using ${default_interval_hours}h" | tee -a "${LOG_DIR}/backup-status.log"
      interval_hours="${default_interval_hours}"
      ;;
  esac
  # 10# forces base 10 so a zero-padded value such as "08" is not read as octal.
  interval_seconds=$((10#${interval_hours} * 3600))
  elapsed=$((now_ts - last_timestamp))
  should_notify=0
  notify_reason="throttled"
  mkdir -p "${state_dir}"

  # First matching rule wins:
  #   1. --force-notify always sends.
  #   2. success stays quiet unless enabled or recovering from a bad state.
  #   3. a status change sends.
  #   4. a non-success status whose details changed sends.
  #   5. otherwise send once the reminder interval has elapsed.
  if [ "${FORCE_NOTIFY}" -eq 1 ]; then
    should_notify=1
    notify_reason="force"
  elif [ "${status}" = "success" ] && [ "${notify_success}" != "1" ] && [ "${last_status}" != "warning" ] && [ "${last_status}" != "failed" ]; then
    notify_reason="success_quiet"
  elif [ "${last_status}" != "" ] && [ "${last_status}" != "${status}" ]; then
    should_notify=1
    notify_reason="status_changed_${last_status}_to_${status}"
  elif [ "${status}" != "success" ] && [ "${last_fingerprint}" != "" ] && [ "${last_fingerprint}" != "${notify_fingerprint}" ]; then
    should_notify=1
    notify_reason="fingerprint_changed"
  elif [ "${last_timestamp}" -eq 0 ] || [ "${elapsed}" -ge "${interval_seconds}" ]; then
    if [ "${status}" != "success" ] || [ "${notify_success}" = "1" ]; then
      should_notify=1
      notify_reason="interval_${interval_hours}h"
    else
      notify_reason="success_quiet"
    fi
  fi

  if [ "${should_notify}" -eq 1 ]; then
    # With `set -e` active, a failing notify_clawbot stops the script here,
    # before the marker is updated, so the next run tries again.
    if [ "${status}" = "success" ]; then
      BACKUP_NOTIFY_SUCCESS=1 notify_clawbot "${status}" "backup-daily-heartbeat" "${message}" 0
    else
      notify_clawbot "${status}" "backup-daily-heartbeat" "${message}" 0
    fi
    # Write to a sibling file and rename it so the marker is never seen
    # half-written.
    marker_tmp="${notify_marker}.tmp.$$"
    {
      printf 'timestamp=%s\n' "${now_ts}"
      printf 'status=%s\n' "${status}"
      printf 'fingerprint=%s\n' "${notify_fingerprint}"
      printf 'reason=%s\n' "${notify_reason}"
    } > "${marker_tmp}"
    mv -f -- "${marker_tmp}" "${notify_marker}"
  else
    log_line "SKIP_NOTIFY reason=${notify_reason} status=${status} elapsed_seconds=${elapsed} interval_hours=${interval_hours}" | tee -a "${LOG_DIR}/backup-status.log"
  fi
fi

# Map the verdict to the process exit code: success is 0, failed is 2, and
# warning is 0 unless BACKUP_STATUS_WARNING_EXIT_CODE overrides it. Any other
# status value is unexpected and exits 3.
case "${status}" in
  success) exit 0 ;;
  warning) exit "${BACKUP_STATUS_WARNING_EXIT_CODE:-0}" ;;
  failed) exit 2 ;;
  *) exit 3 ;;
esac