#!/usr/bin/env bash
# Export AWOOOI full-stack cold-start gate status as node-exporter textfile metrics.
#
# This wrapper is read-only: it never sends the Alertmanager smoke event and
# never mutates remote host/service state.
#
# Environment overrides (each falls back to the default assigned below):
#   CHECK_SCRIPT                  path of the cold-start check script to run
#   TEXTFILE_DIR                  directory the metrics file is written to;
#                                 NODE_EXPORTER_TEXTFILE_DIR is consulted when
#                                 TEXTFILE_DIR is unset or empty
#   OUTPUT_NAME                   file name of the published metrics file
#   LOG_DIR                       directory for the last-run log and state file
#   CHECK_TIMEOUT_SECONDS         duration handed to timeout(1) for the check
#   CHECK_WATCH_INTERVAL_SECONDS  value passed to the check's --interval option
#   CHECK_WATCH_MAX_ATTEMPTS      value passed to the check's --max-attempts option
#   AIOPS_HOST_LABEL              value of the "host" label on every metric
#   AIOPS_SCOPE_LABEL             value of the "scope" label on every metric
#   LOCK_FILE                     lock file used with flock(1)

# errexit (-e) is not enabled: the check's exit status is captured and exported
# as a metric instead of aborting the wrapper.
set -uo pipefail

CHECK_SCRIPT="${CHECK_SCRIPT:-/home/wooo/scripts/full-stack-cold-start-check.sh}"
TEXTFILE_DIR="${TEXTFILE_DIR:-${NODE_EXPORTER_TEXTFILE_DIR:-/home/wooo/node_exporter_textfiles}}"
OUTPUT_NAME="${OUTPUT_NAME:-cold_start_recovery.prom}"
LOG_DIR="${LOG_DIR:-/home/wooo/reboot-recovery}"
CHECK_TIMEOUT_SECONDS="${CHECK_TIMEOUT_SECONDS:-240}"
CHECK_WATCH_INTERVAL_SECONDS="${CHECK_WATCH_INTERVAL_SECONDS:-10}"
CHECK_WATCH_MAX_ATTEMPTS="${CHECK_WATCH_MAX_ATTEMPTS:-3}"
HOST_LABEL="${AIOPS_HOST_LABEL:-110}"
SCOPE_LABEL="${AIOPS_SCOPE_LABEL:-110_120_121_188}"
LOCK_FILE="${LOCK_FILE:-/tmp/awoooi-cold-start-textfile-exporter.lock}"

#######################################
# Escape a string for use as a Prometheus label value.
# Arguments: $1 - raw label value
# Outputs:   the value on stdout (no trailing newline) with backslash,
#            double quote and line feed rewritten as \\, \" and \n
#######################################
escape_label() {
  local value=${1-}
  # Backslashes first, so the escapes added below are not escaped again.
  value=${value//\\/\\\\}
  value=${value//\"/\\\"}
  # A raw line feed inside a label value would split the sample line.
  value=${value//$'\n'/\\n}
  printf '%s' "$value"
}

#######################################
# Render the cold-start metrics in Prometheus text format into a file.
# Globals:
#   HOST_LABEL, SCOPE_LABEL (read) - label values, passed through escape_label
# Arguments:
#   $1  - path of the file to (over)write
#   $2  - timestamp of this run          $3  - run duration in seconds
#   $4  - check exit code                $5  - monitor_up flag
#   $6  - pass gate count                $7  - warn gate count
#   $8  - blocked gate count             $9  - result=green flag
#   $10 - result=degraded flag           $11 - result=blocked flag
#   $12 - result=check_failed flag       $13 - last green timestamp
#   $14 - k3s node filesystem blocker    $15 - public route TLS blocker
#   $16 - host 120 unreachable blocker   $17 - backup health blocker
# Returns:
#   exit status of the cat that writes the file
#######################################
write_metric_file() {
  local tmp="$1"
  local now="$2"
  local duration="$3"
  local exit_code="$4"
  local monitor_up="$5"
  local pass="$6"
  local warn="$7"
  local blocked="$8"
  local green="$9"
  local degraded="${10}"
  local blocked_state="${11}"
  local check_failed="${12}"
  local last_green="${13}"
  local k3s_node_fs_blocker="${14}"
  local public_route_tls_blocker="${15}"
  local host_120_unreachable_blocker="${16}"
  local backup_health_blocker="${17}"
  local host scope field
  local numeric_re='^[+-]?[0-9]+([.][0-9]+)?$'

  # A sample line with an empty or non-numeric value makes the whole file
  # unparseable, so any such value is coerced to 0 before rendering.
  for field in now duration exit_code monitor_up pass warn blocked green \
    degraded blocked_state check_failed last_green k3s_node_fs_blocker \
    public_route_tls_blocker host_120_unreachable_blocker backup_health_blocker; do
    [[ ${!field} =~ $numeric_re ]] || printf -v "$field" '%s' 0
  done

  host=$(escape_label "$HOST_LABEL")
  scope=$(escape_label "$SCOPE_LABEL")

  # Unquoted delimiter: the $variables below are expanded by the shell.
  cat >"$tmp" <<METRICS
# HELP awoooi_cold_start_monitor_up Whether the cold-start monitor produced a parseable summary.
# TYPE awoooi_cold_start_monitor_up gauge
awoooi_cold_start_monitor_up{host="$host",scope="$scope",mode="read_only"} $monitor_up
# HELP awoooi_cold_start_pass_gates Last cold-start check pass gate count.
# TYPE awoooi_cold_start_pass_gates gauge
awoooi_cold_start_pass_gates{host="$host",scope="$scope"} $pass
# HELP awoooi_cold_start_warn_gates Last cold-start check warning gate count.
# TYPE awoooi_cold_start_warn_gates gauge
awoooi_cold_start_warn_gates{host="$host",scope="$scope"} $warn
# HELP awoooi_cold_start_blocked_gates Last cold-start check blocked gate count.
# TYPE awoooi_cold_start_blocked_gates gauge
awoooi_cold_start_blocked_gates{host="$host",scope="$scope"} $blocked
# HELP awoooi_cold_start_last_run_timestamp Unix timestamp of the last cold-start monitor run.
# TYPE awoooi_cold_start_last_run_timestamp gauge
awoooi_cold_start_last_run_timestamp{host="$host",scope="$scope"} $now
# HELP awoooi_cold_start_last_green_timestamp Unix timestamp of the last GREEN cold-start monitor run.
# TYPE awoooi_cold_start_last_green_timestamp gauge
awoooi_cold_start_last_green_timestamp{host="$host",scope="$scope"} $last_green
# HELP awoooi_cold_start_last_run_duration_seconds Last cold-start monitor run duration in seconds.
# TYPE awoooi_cold_start_last_run_duration_seconds gauge
awoooi_cold_start_last_run_duration_seconds{host="$host",scope="$scope"} $duration
# HELP awoooi_cold_start_last_exit_code Last cold-start monitor process exit code.
# TYPE awoooi_cold_start_last_exit_code gauge
awoooi_cold_start_last_exit_code{host="$host",scope="$scope"} $exit_code
# HELP awoooi_cold_start_last_result Last cold-start result as one-hot labels.
# TYPE awoooi_cold_start_last_result gauge
awoooi_cold_start_last_result{host="$host",scope="$scope",result="green"} $green
awoooi_cold_start_last_result{host="$host",scope="$scope",result="degraded"} $degraded
awoooi_cold_start_last_result{host="$host",scope="$scope",result="blocked"} $blocked_state
awoooi_cold_start_last_result{host="$host",scope="$scope",result="check_failed"} $check_failed
# HELP awoooi_cold_start_blocker_reason Whether a known cold-start blocker reason was detected in the last log.
# TYPE awoooi_cold_start_blocker_reason gauge
awoooi_cold_start_blocker_reason{host="$host",scope="$scope",reason="k3s_node_filesystem_error",target="120"} $k3s_node_fs_blocker
awoooi_cold_start_blocker_reason{host="$host",scope="$scope",reason="public_route_tls_failure",target="public_https"} $public_route_tls_blocker
awoooi_cold_start_blocker_reason{host="$host",scope="$scope",reason="host_unreachable",target="120"} $host_120_unreachable_blocker
awoooi_cold_start_blocker_reason{host="$host",scope="$scope",reason="backup_health_blocked",target="110"} $backup_health_blocker
METRICS
}

# Take a non-blocking exclusive lock so that overlapping invocations do not run
# the check at the same time. When flock(1) is not installed the run proceeds
# without a lock.
if command -v flock >/dev/null 2>&1; then
  # fd 9 is never closed afterwards, so the lock is held until the script exits.
  if ! exec 9>"$LOCK_FILE"; then
    # An unusable lock file is reported instead of being mistaken for a busy
    # lock, which would make every run exit 0 without doing anything.
    printf 'cannot open lock file: %s\n' "$LOCK_FILE" >&2
    exit 1
  fi
  if ! flock -n 9; then
    # Another instance holds the lock: skip this run quietly.
    exit 0
  fi
fi

# Make sure the output and log directories exist before anything is written.
mkdir -p "$TEXTFILE_DIR" "$LOG_DIR"

start_ts=$(date +%s)
log_tmp="$LOG_DIR/cold-start-last.log.tmp"
log_file="$LOG_DIR/cold-start-last.log"
state_file="$LOG_DIR/cold-start-last-green.timestamp"

# Remove a staged metrics file that was never published, so hidden temp files
# do not pile up in the textfile directory. After a successful mv the path no
# longer exists and the rm is a no-op.
tmp_metric=""
cleanup_tmp_metric() {
  if [ -n "${tmp_metric:-}" ]; then
    rm -f -- "$tmp_metric"
  fi
}
trap cleanup_tmp_metric EXIT

# Fallback when the check script cannot be run: publish a "check_failed" result
# with exit code 127 and one blocked gate, then stop.
if [ ! -x "$CHECK_SCRIPT" ]; then
  end_ts=$(date +%s)
  printf 'CHECK_SCRIPT not executable: %s\n' "$CHECK_SCRIPT" >"$log_file"

  # A missing, empty or corrupt state file counts as "never green" (0).
  last_green=$(cat "$state_file" 2>/dev/null || true)
  case "$last_green" in
    '' | *[!0-9]*) last_green=0 ;;
  esac

  if ! tmp_metric=$(mktemp "$TEXTFILE_DIR/.cold_start_recovery.XXXXXX"); then
    printf 'cannot create temporary metrics file in %s\n' "$TEXTFILE_DIR" >&2
    exit 1
  fi
  write_metric_file "$tmp_metric" "$end_ts" "$((end_ts - start_ts))" 127 0 0 0 1 0 0 0 1 "$last_green" 0 0 0 0
  chmod 0644 "$tmp_metric"
  # Rename within the same directory so readers never see a half-written file.
  mv "$tmp_metric" "$TEXTFILE_DIR/$OUTPUT_NAME"
  exit 0
fi

# Run the check under a time limit and capture stdout and stderr into a staging
# log. The exit status (timeout's own status when the limit is hit) is kept for
# the metrics.
timeout "$CHECK_TIMEOUT_SECONDS" bash "$CHECK_SCRIPT" \
  --monitor-read-only \
  --no-color \
  --watch \
  --interval "$CHECK_WATCH_INTERVAL_SECONDS" \
  --max-attempts "$CHECK_WATCH_MAX_ATTEMPTS" \
  >"$log_tmp" 2>&1
exit_code=$?

# If the staging log could not be moved into place (for example it was never
# created), the previous run's log must not be parsed as if it were current.
# Best effort: overwrite it with a note that contains no summary line.
if ! mv "$log_tmp" "$log_file"; then
  printf 'cold-start log capture failed (exit code %s)\n' "$exit_code" >"$log_file"
fi

#######################################
# Turn a "PASS=<n> WARN=<n> BLOCKED=<n>" summary line plus the check's exit
# status into gate counts and a one-hot result.
# Globals (written): monitor_up pass warn blocked green degraded blocked_state
#                    check_failed
# Arguments: $1 - summary line (may be empty)
#            $2 - exit status of the check run
#######################################
classify_cold_start_summary() {
  local line="$1"
  local status="$2"
  local summary_re='^PASS=([0-9]+) WARN=([0-9]+) BLOCKED=([0-9]+)'

  monitor_up=0
  pass=0
  warn=0
  blocked=0
  green=0
  degraded=0
  blocked_state=0
  check_failed=0

  # No parseable summary: the monitor itself failed.
  if [[ ! $line =~ $summary_re ]]; then
    check_failed=1
    return 0
  fi

  monitor_up=1
  # Anchored captures take the leading fields even when the same keys appear
  # again later on the line; 10# keeps zero-padded counts decimal.
  pass=$((10#${BASH_REMATCH[1]}))
  warn=$((10#${BASH_REMATCH[2]}))
  blocked=$((10#${BASH_REMATCH[3]}))

  # Precedence: blocked > degraded > green; a clean summary with a non-zero
  # exit status is reported as check_failed.
  if [ "$blocked" -gt 0 ]; then
    blocked_state=1
  elif [ "$warn" -gt 0 ]; then
    degraded=1
  elif [ "$status" -eq 0 ]; then
    green=1
  else
    check_failed=1
  fi
}

# The last matching line in the log wins; "|| true" tolerates no match under
# pipefail.
summary_line=$(grep -E '^PASS=[0-9]+ WARN=[0-9]+ BLOCKED=[0-9]+' "$log_file" | tail -1 || true)

# Blocker flags start at 0 and are raised later by the log scans.
k3s_node_fs_blocker=0
public_route_tls_blocker=0
host_120_unreachable_blocker=0
backup_health_blocker=0

classify_cold_start_summary "$summary_line" "$exit_code"

#######################################
# Scan a check log for known blocker signatures.
# Globals (written): k3s_node_fs_blocker public_route_tls_blocker
#                    host_120_unreachable_blocker backup_health_blocker
#                    - each is set to 1 on a match and otherwise left untouched
# Arguments: $1 - path of the log file to scan
#######################################
detect_cold_start_blockers() {
  local log="$1"

  # A non-zero NODE_FS_ERROR_EVENTS count, or the explicit message.
  if grep -Eq 'NODE_FS_ERROR_EVENTS[[:space:]]+[1-9][0-9]*|K3s node filesystem error events present' "$log"; then
    k3s_node_fs_blocker=1
  fi

  # NOTE(review): "(000|5[0-9][0-9])" is not tied to a status field, so such
  # digits anywhere later on a PUBLIC_ROUTE_TLS line also match - confirm
  # against the check script's output format.
  if grep -Eq 'PUBLIC_ROUTE_TLS .*(000|5[0-9][0-9])|public route .* TLS certificate verification failed' "$log"; then
    public_route_tls_blocker=1
  fi

  # Any of the three reachability gates for 192.168.0.120 reported BLOCKED.
  if grep -Eq 'BLOCKED (ping 192\.168\.0\.120|ssh port 192\.168\.0\.120:22|ssh 120 k3s read-only check)' "$log"; then
    host_120_unreachable_blocker=1
  fi

  if grep -Eq 'BLOCKED 110 backup health has stale expected jobs' "$log"; then
    backup_health_blocker=1
  fi
}

detect_cold_start_blockers "$log_file"

end_ts=$(date +%s)

# Record the time of this run as the most recent GREEN run.
if [ "$green" -eq 1 ]; then
  printf '%s\n' "$end_ts" >"$state_file"
fi

# A missing, empty or corrupt state file counts as "never green" (0), so the
# metric always carries a numeric value.
last_green=$(cat "$state_file" 2>/dev/null || true)
case "$last_green" in
  '' | *[!0-9]*) last_green=0 ;;
esac

# Stage the metrics in a hidden temp file inside the target directory, then
# rename it into place so readers never see a half-written file.
if ! tmp_metric=$(mktemp "$TEXTFILE_DIR/.cold_start_recovery.XXXXXX"); then
  printf 'cannot create temporary metrics file in %s\n' "$TEXTFILE_DIR" >&2
  exit 1
fi

# Publish only when every step succeeded; otherwise drop the staged file and
# leave the previously published metrics in place.
if ! {
  write_metric_file "$tmp_metric" "$end_ts" "$((end_ts - start_ts))" "$exit_code" "$monitor_up" "$pass" "$warn" "$blocked" "$green" "$degraded" "$blocked_state" "$check_failed" "$last_green" "$k3s_node_fs_blocker" "$public_route_tls_blocker" "$host_120_unreachable_blocker" "$backup_health_blocker" \
    && chmod 0644 "$tmp_metric" \
    && mv "$tmp_metric" "$TEXTFILE_DIR/$OUTPUT_NAME"
}; then
  rm -f -- "$tmp_metric"
  printf 'failed to publish %s\n' "$TEXTFILE_DIR/$OUTPUT_NAME" >&2
  exit 1
fi