Files
awoooi/scripts/reboot-recovery/cold-start-textfile-exporter.sh
2026-05-29 12:41:34 +08:00

179 lines
7.2 KiB
Bash
Executable File

#!/usr/bin/env bash
# Export AWOOOI full-stack cold-start gate status as node-exporter textfile metrics.
#
# This wrapper is read-only: it never sends the Alertmanager smoke event and
# never mutates remote host/service state.
# Shell options: referencing an unset variable is an error (-u) and a pipeline
# reports failure if any stage fails (pipefail). errexit (-e) is NOT enabled,
# so a failing command does not abort the script on its own.
set -u -o pipefail

# Tunables. Each of these keeps a non-empty value inherited from the
# environment and otherwise falls back to the default shown here.
: "${CHECK_SCRIPT:=/home/wooo/scripts/full-stack-cold-start-check.sh}"
# TEXTFILE_DIR wins over NODE_EXPORTER_TEXTFILE_DIR, which wins over the default.
: "${TEXTFILE_DIR:=${NODE_EXPORTER_TEXTFILE_DIR:-/home/wooo/node_exporter_textfiles}}"
: "${OUTPUT_NAME:=cold_start_recovery.prom}"
: "${LOG_DIR:=/home/wooo/reboot-recovery}"
: "${CHECK_TIMEOUT_SECONDS:=240}"
: "${CHECK_WATCH_INTERVAL_SECONDS:=10}"
: "${CHECK_WATCH_MAX_ATTEMPTS:=3}"
: "${LOCK_FILE:=/tmp/awoooi-cold-start-textfile-exporter.lock}"

# Label values attached to every exported sample. These are always derived
# from the AIOPS_* variables; a HOST_LABEL / SCOPE_LABEL already present in
# the environment is overwritten.
HOST_LABEL=${AIOPS_HOST_LABEL:-110}
SCOPE_LABEL=${AIOPS_SCOPE_LABEL:-110_120_121_188}
# Escape a string for use as a Prometheus label value.
#
# The text exposition format requires three characters to be escaped inside a
# quoted label value: backslash (\\), double quote (\") and line feed (\n).
# A raw newline in a label would split the sample across two lines and make
# the whole textfile unparseable, so it is escaped here as well.
#
# Arguments: $1 - raw label value
# Outputs:   escaped value on stdout, without a trailing newline
escape_label() {
  local value=$1
  local bs='\' dq='"' nl=$'\n'
  # Backslashes first, so the backslashes added by the later steps are not
  # escaped a second time. Pattern and replacement are quoted so they are
  # taken literally.
  value=${value//"$bs"/"$bs$bs"}
  value=${value//"$dq"/"$bs$dq"}
  value=${value//"$nl"/"${bs}n"}
  printf '%s' "$value"
}
# Render the cold-start gauges into a file in Prometheus text format.
#
# Globals:
#   HOST_LABEL, SCOPE_LABEL (read) - run through escape_label and attached to
#   every sample as host="..." and scope="...".
# Arguments (all positional, all required):
#   $1  destination file (created or truncated)
#   $2  awoooi_cold_start_last_run_timestamp
#   $3  awoooi_cold_start_last_run_duration_seconds
#   $4  awoooi_cold_start_last_exit_code
#   $5  awoooi_cold_start_monitor_up
#   $6  awoooi_cold_start_pass_gates
#   $7  awoooi_cold_start_warn_gates
#   $8  awoooi_cold_start_blocked_gates
#   $9  awoooi_cold_start_last_result{result="green"}
#   $10 awoooi_cold_start_last_result{result="degraded"}
#   $11 awoooi_cold_start_last_result{result="blocked"}
#   $12 awoooi_cold_start_last_result{result="check_failed"}
#   $13 awoooi_cold_start_last_green_timestamp
#   $14 blocker_reason k3s_node_filesystem_error (target 120)
#   $15 blocker_reason public_route_tls_failure (target public_https)
#   $16 blocker_reason host_unreachable (target 120)
#   $17 blocker_reason backup_health_blocked (target 110)
# Values are written verbatim; no validation is performed here.
write_metric_file() {
  local dest="$1" run_ts="$2" elapsed="$3" rc="$4" up="$5"
  local n_pass="$6" n_warn="$7" n_blocked="$8"
  local is_green="$9" is_degraded="${10}" is_blocked="${11}" is_failed="${12}"
  local green_ts="${13}"
  local fs_flag="${14}" tls_flag="${15}" unreach_flag="${16}" backup_flag="${17}"

  # Shared metric-name prefix and the label pair common to every sample.
  local m=awoooi_cold_start
  local ids
  ids="host=\"$(escape_label "$HOST_LABEL")\",scope=\"$(escape_label "$SCOPE_LABEL")\""

  # One argument per output line; '%s\n' is re-applied to each argument.
  printf '%s\n' \
    "# HELP ${m}_monitor_up Whether the cold-start monitor produced a parseable summary." \
    "# TYPE ${m}_monitor_up gauge" \
    "${m}_monitor_up{${ids},mode=\"read_only\"} ${up}" \
    "# HELP ${m}_pass_gates Last cold-start check pass gate count." \
    "# TYPE ${m}_pass_gates gauge" \
    "${m}_pass_gates{${ids}} ${n_pass}" \
    "# HELP ${m}_warn_gates Last cold-start check warning gate count." \
    "# TYPE ${m}_warn_gates gauge" \
    "${m}_warn_gates{${ids}} ${n_warn}" \
    "# HELP ${m}_blocked_gates Last cold-start check blocked gate count." \
    "# TYPE ${m}_blocked_gates gauge" \
    "${m}_blocked_gates{${ids}} ${n_blocked}" \
    "# HELP ${m}_last_run_timestamp Unix timestamp of the last cold-start monitor run." \
    "# TYPE ${m}_last_run_timestamp gauge" \
    "${m}_last_run_timestamp{${ids}} ${run_ts}" \
    "# HELP ${m}_last_green_timestamp Unix timestamp of the last GREEN cold-start monitor run." \
    "# TYPE ${m}_last_green_timestamp gauge" \
    "${m}_last_green_timestamp{${ids}} ${green_ts}" \
    "# HELP ${m}_last_run_duration_seconds Last cold-start monitor run duration in seconds." \
    "# TYPE ${m}_last_run_duration_seconds gauge" \
    "${m}_last_run_duration_seconds{${ids}} ${elapsed}" \
    "# HELP ${m}_last_exit_code Last cold-start monitor process exit code." \
    "# TYPE ${m}_last_exit_code gauge" \
    "${m}_last_exit_code{${ids}} ${rc}" \
    "# HELP ${m}_last_result Last cold-start result as one-hot labels." \
    "# TYPE ${m}_last_result gauge" \
    "${m}_last_result{${ids},result=\"green\"} ${is_green}" \
    "${m}_last_result{${ids},result=\"degraded\"} ${is_degraded}" \
    "${m}_last_result{${ids},result=\"blocked\"} ${is_blocked}" \
    "${m}_last_result{${ids},result=\"check_failed\"} ${is_failed}" \
    "# HELP ${m}_blocker_reason Whether a known cold-start blocker reason was detected in the last log." \
    "# TYPE ${m}_blocker_reason gauge" \
    "${m}_blocker_reason{${ids},reason=\"k3s_node_filesystem_error\",target=\"120\"} ${fs_flag}" \
    "${m}_blocker_reason{${ids},reason=\"public_route_tls_failure\",target=\"public_https\"} ${tls_flag}" \
    "${m}_blocker_reason{${ids},reason=\"host_unreachable\",target=\"120\"} ${unreach_flag}" \
    "${m}_blocker_reason{${ids},reason=\"backup_health_blocked\",target=\"110\"} ${backup_flag}" \
    >"$dest"
}
# Single-instance guard. When flock is installed, open LOCK_FILE on fd 9 and
# try to take a non-blocking exclusive lock on it; if the lock cannot be
# taken, leave quietly with status 0. The fd is never closed explicitly, so
# it stays open for the rest of the run. When flock is not installed, no
# locking is attempted at all.
if command -v flock >/dev/null 2>&1; then
  exec 9>"$LOCK_FILE"
  flock -n 9 || exit 0
fi
# Working locations: the textfile directory receives the .prom file, LOG_DIR
# receives the captured check output and the "last green" state file.
mkdir -p "$TEXTFILE_DIR" "$LOG_DIR"
start_ts=$(date +%s)
log_tmp="$LOG_DIR/cold-start-last.log.tmp"
log_file="$LOG_DIR/cold-start-last.log"
state_file="$LOG_DIR/cold-start-last-green.timestamp"

# The check script is missing or not executable: publish a "check_failed"
# metric set instead of running anything, then stop with status 0.
if [ ! -x "$CHECK_SCRIPT" ]; then
  end_ts=$(date +%s)

  # Last GREEN timestamp from the state file. A missing, empty or non-numeric
  # file counts as "never" (0); an empty value would otherwise produce a
  # sample line without a value and make the whole textfile unparseable.
  last_green=$(cat "$state_file" 2>/dev/null || true)
  case "$last_green" in
    '' | *[!0-9]*) last_green=0 ;;
  esac

  printf 'CHECK_SCRIPT not executable: %s\n' "$CHECK_SCRIPT" >"$log_file"

  # Positional values after the duration, in write_metric_file order:
  #   exit_code=127 monitor_up=0 pass=0 warn=0 blocked=1
  #   green=0 degraded=0 blocked_state=0 check_failed=1
  #   last_green, then the four blocker-reason flags (all 0).
  # Write to a temp file in the same directory and rename it into place only
  # when it was written completely, so a failed write never replaces the
  # previously published metrics with a truncated file.
  if tmp_metric=$(mktemp "$TEXTFILE_DIR/.cold_start_recovery.XXXXXX") \
    && write_metric_file "$tmp_metric" "$end_ts" "$((end_ts - start_ts))" 127 0 0 0 1 0 0 0 1 "$last_green" 0 0 0 0 \
    && chmod 0644 "$tmp_metric"; then
    mv "$tmp_metric" "$TEXTFILE_DIR/$OUTPUT_NAME"
  else
    printf 'cold-start exporter: could not write metrics; keeping previous %s\n' \
      "$TEXTFILE_DIR/$OUTPUT_NAME" >&2
    if [ -n "$tmp_metric" ]; then
      rm -f -- "$tmp_metric"
    fi
  fi
  exit 0
fi
# Run the check script under a wall-clock limit, capturing stdout and stderr
# together in the temporary log. The command line is assembled as an array so
# every argument stays a single word.
check_cmd=(
  timeout "$CHECK_TIMEOUT_SECONDS"
  bash "$CHECK_SCRIPT"
  --monitor-read-only
  --no-color
  --watch
  --interval "$CHECK_WATCH_INTERVAL_SECONDS"
  --max-attempts "$CHECK_WATCH_MAX_ATTEMPTS"
)
"${check_cmd[@]}" >"$log_tmp" 2>&1
# Status as reported by timeout: the script's own status, or timeout's status
# when the limit was hit (124 for GNU timeout).
exit_code=$?
# The log only replaces the previous one once the run has ended.
mv "$log_tmp" "$log_file"
# Turn one summary line plus the check's exit status into the result gauges.
#
# The three counters are taken from a single regex anchored at the start of
# the line, i.e. exactly the "PASS=n WARN=n BLOCKED=n" triple that the grep
# below selects. Any later PASS=/WARN=/BLOCKED= text on the same line is
# ignored.
#
# Globals written:
#   monitor_up pass warn blocked green degraded blocked_state check_failed
# Arguments:
#   $1 - summary line; empty when the log contained none
#   $2 - exit status of the check run
# Result precedence: blocked > degraded (warnings) > green (exit 0) >
# check_failed. A missing summary is always check_failed with monitor_up=0.
classify_cold_start_summary() {
  local line=$1 rc=$2
  local summary_re='^PASS=([0-9]+) WARN=([0-9]+) BLOCKED=([0-9]+)'

  monitor_up=0
  pass=0
  warn=0
  blocked=0
  green=0
  degraded=0
  blocked_state=0
  check_failed=0

  if [[ $line =~ $summary_re ]]; then
    monitor_up=1
    pass=${BASH_REMATCH[1]}
    warn=${BASH_REMATCH[2]}
    blocked=${BASH_REMATCH[3]}
    # Single-bracket test on purpose: it compares in decimal, so a counter
    # with a leading zero (e.g. 08) is not rejected as invalid octal.
    if [ "$blocked" -gt 0 ]; then
      blocked_state=1
    elif [ "$warn" -gt 0 ]; then
      degraded=1
    elif [ "$rc" -eq 0 ]; then
      green=1
    else
      check_failed=1
    fi
  else
    check_failed=1
  fi
}

# The last summary line in the log wins; "|| true" keeps a log without any
# summary from counting as a pipeline failure.
summary_line=$(grep -E '^PASS=[0-9]+ WARN=[0-9]+ BLOCKED=[0-9]+' "$log_file" | tail -1 || true)
classify_cold_start_summary "$summary_line" "$exit_code"

# Blocker-reason flags start cleared; they are raised by the log scan.
k3s_node_fs_blocker=0
public_route_tls_blocker=0
host_120_unreachable_blocker=0
backup_health_blocker=0
# Scan the whole log for known blocker signatures. Each flag is only ever
# raised here, never cleared.

# K3s node filesystem errors: a non-zero NODE_FS_ERROR_EVENTS count or the
# explicit message.
grep -Eq 'NODE_FS_ERROR_EVENTS[[:space:]]+[1-9][0-9]*|K3s node filesystem error events present' "$log_file" \
  && k3s_node_fs_blocker=1

# Public route / TLS failures.
# NOTE(review): the first alternative matches "000" or any 5xx-looking digits
# anywhere after "PUBLIC_ROUTE_TLS " on the line, not only in a status field -
# confirm against the check script's log format.
grep -Eq 'PUBLIC_ROUTE_TLS .*(000|5[0-9][0-9])|public route .* TLS certificate verification failed' "$log_file" \
  && public_route_tls_blocker=1

# 192.168.0.120 unreachable via ping, via the SSH port, or in the k3s check.
grep -Eq 'BLOCKED (ping 192\.168\.0\.120|ssh port 192\.168\.0\.120:22|ssh 120 k3s read-only check)' "$log_file" \
  && host_120_unreachable_blocker=1

# Backup health on 110 reports stale expected jobs.
grep -Eq 'BLOCKED 110 backup health has stale expected jobs' "$log_file" \
  && backup_health_blocker=1
end_ts=$(date +%s)

# Remember when the stack was last fully GREEN.
if [ "$green" -eq 1 ]; then
  printf '%s\n' "$end_ts" >"$state_file"
fi

# Last GREEN timestamp from the state file. A missing, empty or non-numeric
# file counts as "never" (0); an empty value would otherwise produce a sample
# line without a value and make the whole textfile unparseable.
last_green=$(cat "$state_file" 2>/dev/null || true)
case "$last_green" in
  '' | *[!0-9]*) last_green=0 ;;
esac

# Publish atomically: write a temp file in the target directory and rename it
# into place only when it was written completely. On any failure the
# previously published file is left untouched and the script exits non-zero.
if tmp_metric=$(mktemp "$TEXTFILE_DIR/.cold_start_recovery.XXXXXX") \
  && write_metric_file "$tmp_metric" "$end_ts" "$((end_ts - start_ts))" "$exit_code" "$monitor_up" "$pass" "$warn" "$blocked" "$green" "$degraded" "$blocked_state" "$check_failed" "$last_green" "$k3s_node_fs_blocker" "$public_route_tls_blocker" "$host_120_unreachable_blocker" "$backup_health_blocker" \
  && chmod 0644 "$tmp_metric"; then
  mv "$tmp_metric" "$TEXTFILE_DIR/$OUTPUT_NAME"
else
  printf 'cold-start exporter: could not write metrics; keeping previous %s\n' \
    "$TEXTFILE_DIR/$OUTPUT_NAME" >&2
  if [ -n "$tmp_metric" ]; then
    rm -f -- "$tmp_metric"
  fi
  exit 1
fi