#!/usr/bin/env bash
# AWOOOI full-stack cold-start readiness check.
# Read-only by design. It never restarts, deletes, repairs, or writes remote state.
#
# Exit status: 0 = GREEN, 1 = DEGRADED (warnings only), 2 = BLOCKED, 64 = usage error.
# Optional environment:
#   REMOTE_SUDO_PASSWORD  forwarded to SSH targets so kubectl can run under "sudo -S".
#   NO_COLOR              any non-empty value disables ANSI colors (same as --no-color).
#
# errexit is intentionally not enabled: every gate must run and be counted even
# when an earlier probe fails.
set -uo pipefail

SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=6)
SEND_ALERT_TEST=0
MONITOR_READ_ONLY=0
NO_COLOR_FLAG=0
WATCH_MODE=0
WATCH_INTERVAL=60
WATCH_MAX_ATTEMPTS=30

# Print command-line help to stdout.
usage() {
  cat <<'USAGE'
Usage:
  bash scripts/reboot-recovery/full-stack-cold-start-check.sh [options]

Options:
  --send-alert-test       POST one Alertmanager webhook test after AWOOOI API is ready.
  --monitor-read-only     Skip the webhook POST without warning; intended for cron/textfile monitors.
  --watch                 Repeat checks until all gates are GREEN or max attempts is reached.
  --interval SECONDS      Retry interval for --watch. Default: 60.
  --max-attempts COUNT    Max attempts for --watch. Default: 30. Use 0 for unlimited.
  --no-color              Disable ANSI colors in output.
  -h, --help              Show this help.

Default mode is read-only and does not POST an Alertmanager test event.
Use --send-alert-test for the final release gate after AWOOOI API is expected to be ready.
USAGE
}

while [ "$#" -gt 0 ]; do
  arg="$1"
  case "$arg" in
    --send-alert-test)
      SEND_ALERT_TEST=1
      ;;
    --monitor-read-only)
      MONITOR_READ_ONLY=1
      SEND_ALERT_TEST=0
      ;;
    --no-color)
      NO_COLOR_FLAG=1
      ;;
    --watch)
      WATCH_MODE=1
      ;;
    --interval)
      shift
      if ! [[ "${1:-}" =~ ^[0-9]+$ ]] || [ "${1:-0}" -lt 1 ]; then
        echo "--interval requires a positive integer number of seconds" >&2
        exit 64
      fi
      WATCH_INTERVAL="$1"
      ;;
    --max-attempts)
      shift
      if ! [[ "${1:-}" =~ ^[0-9]+$ ]]; then
        echo "--max-attempts requires a non-negative integer" >&2
        exit 64
      fi
      WATCH_MAX_ATTEMPTS="$1"
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $arg" >&2
      usage >&2
      exit 64
      ;;
  esac
  shift
done

# Read-only monitor mode always wins, regardless of the order the flags were given in.
if [ "$MONITOR_READ_ONLY" -eq 1 ]; then
  SEND_ALERT_TEST=0
fi

if [ -n "${NO_COLOR:-}" ] || [ "$NO_COLOR_FLAG" -eq 1 ]; then
  RED=""
  GREEN=""
  YELLOW=""
  BLUE=""
  NC=""
else
  RED=$'\033[0;31m'
  GREEN=$'\033[0;32m'
  YELLOW=$'\033[1;33m'
  BLUE=$'\033[0;34m'
  NC=$'\033[0m'
fi

# Gate counters, reported by summary().
PASS=0
WARN=0
FAIL=0

# Print a section banner. $1 = section title.
log_section() {
  printf "\n%s=== %s ===%s\n" "$BLUE" "$1" "$NC"
}

# Record a passed gate. $1 = message. Always returns 0, which is what makes the
# "probe && ok ... || warn ..." one-liners below behave like if/else.
ok() {
  printf "%sOK%s %s\n" "$GREEN" "$NC" "$1"
  PASS=$((PASS + 1))
}

# Record a non-blocking warning. $1 = message.
warn() {
  printf "%sWARN%s %s\n" "$YELLOW" "$NC" "$1"
  WARN=$((WARN + 1))
}

# Record a blocking failure. $1 = message.
fail() {
  printf "%sBLOCKED%s %s\n" "$RED" "$NC" "$1"
  FAIL=$((FAIL + 1))
}

# Run a local command as a gate: print OK/BLOCKED with the label, then the
# command's combined stdout/stderr.
# Arguments: $1 = label, remaining = command and its arguments.
# Returns:   0 when the command succeeded, 1 otherwise.
run_local() {
  local label="$1"
  shift
  local capture rc
  # Private temp file instead of a fixed, predictable /tmp path.
  if ! capture=$(mktemp "${TMPDIR:-/tmp}/awoooi-cold-start-check.XXXXXX"); then
    fail "$label"
    return 1
  fi
  if "$@" >"$capture" 2>&1; then
    ok "$label"
    rc=0
  else
    fail "$label"
    rc=1
  fi
  cat "$capture"
  rm -f -- "$capture"
  return "$rc"
}

# Run a command string on a remote host over SSH.
# Arguments: $1 = user@host, $2 = shell command text.
# When REMOTE_SUDO_PASSWORD is set it is prepended as a shell-quoted assignment.
# The multi-line remote scripts in this file start with a newline, so the
# assignment becomes a statement of its own and stays visible to the whole script.
ssh_cmd() {
  local user_host="$1"
  local cmd="$2"
  local prefix=""
  if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
    printf -v prefix 'REMOTE_SUDO_PASSWORD=%q ' "$REMOTE_SUDO_PASSWORD"
  fi
  ssh "${SSH_OPTS[@]}" "$user_host" "${prefix}${cmd}"
}

# Succeed when this machine owns the IPv4 address $1.
# The consumers read their whole input, so no pipeline stage can be killed by
# SIGPIPE (which would surface as a failure under pipefail), and the address is
# compared literally instead of as a regex.
host_has_ip() {
  local expected_ip="$1"
  if command -v ip >/dev/null 2>&1; then
    ip -o -4 addr show 2>/dev/null \
      | awk -v want="$expected_ip" '{ split($4, a, "/"); if (a[1] == want) found = 1 } END { exit !found }' \
      && return 0
  fi
  hostname -I 2>/dev/null \
    | awk -v want="$expected_ip" '{ for (i = 1; i <= NF; i++) if ($i == want) found = 1 } END { exit !found }'
}

# Run a command string on one of the stack hosts: locally through a login shell
# when this machine already is that host, otherwise through ssh_cmd.
# Arguments: $1 = user@host, $2 = shell command text.
host_cmd() {
  local user_host="$1"
  local cmd="$2"
  case "$user_host" in
    *@192.168.0.110 | *@192.168.0.120 | *@192.168.0.121 | *@192.168.0.188)
      if host_has_ip "${user_host##*@}"; then
        bash -lc "$cmd"
        return
      fi
      ;;
  esac
  ssh_cmd "$user_host" "$cmd"
}

# Print the HTTP status code for URL $1, retrying once; prints 000 when no
# response was obtained.
probe_http_code() {
  local url="$1"
  local attempt code
  for attempt in 1 2; do
    code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 12 "$url" 2>/dev/null || true)
    if [[ "$code" =~ ^[0-9]{3}$ ]] && [ "$code" != "000" ]; then
      echo "$code"
      return
    fi
    sleep 1
  done
  echo "${code:-000}"
}

# Succeed when a TCP connection to host $1 port $2 can be opened.
# Tries the "-G" connect-timeout flag first and falls back to "-w" for nc
# builds that reject it.
probe_tcp() {
  local host="$1"
  local port="$2"
  nc -G 3 -z "$host" "$port" >/dev/null 2>&1 || nc -w 3 -z "$host" "$port" >/dev/null 2>&1
}

# Send a single ICMP echo to $1 with a two-second reply wait.
# BSD/macOS ping takes -W in milliseconds while Linux iputils takes seconds, so
# the value is scaled per platform.
ping_once() {
  local target="$1"
  case "$(uname -s 2>/dev/null)" in
    Darwin | FreeBSD) ping -c 1 -W 2000 "$target" ;;
    *) ping -c 1 -W 2 "$target" ;;
  esac >/dev/null 2>&1
}

# Print ARP/neighbor table rows for the four stack hosts; non-zero when none
# were printed or no suitable tool exists.
print_neighbor_rows() {
  if command -v arp >/dev/null 2>&1; then
    arp -an | grep -E '192\.168\.0\.(110|120|121|188)'
    return
  fi
  if command -v ip >/dev/null 2>&1; then
    ip neigh show | grep -E '192\.168\.0\.(110|120|121|188)'
    return
  fi
  return 1
}

# Succeed when TEXT has a line "TAG FILE age=N" with N below LIMIT.
# Arguments: $1 = TEXT, $2 = TAG, $3 = FILE, $4 = LIMIT (seconds).
# A missing line (for example "TAG FILE missing") is a failure, never a pass.
textfile_age_below() {
  awk -v tag="$2" -v file="$3" -v limit="$4" '
    $1 == tag && $2 == file && $3 ~ /^age=-?[0-9]+$/ {
      seen = 1
      fresh = (substr($3, 5) + 0 < limit + 0)
      exit
    }
    END { exit !(seen && fresh) }
  ' <<<"$1"
}

# Succeed when TEXT has a line "MARKER <integer>" and "<integer> OP LIMIT" holds.
# Arguments: $1 = TEXT, $2 = MARKER, $3 = test(1) operator such as -lt, $4 = LIMIT.
# A missing marker or a non-numeric value is a failure, never a pass.
marker_number_is() {
  local value
  value=$(awk -v marker="$2" '
    $1 == marker && $2 ~ /^-?[0-9]+$/ { print $2; found = 1; exit }
    END { exit !found }
  ' <<<"$1") || return 1
  [ "$value" "$3" "$4" ] 2>/dev/null
}

# Print the run banner.
print_header() {
  echo "AWOOOI full-stack cold-start check"
  date '+%Y-%m-%d %H:%M:%S %Z'
  echo "Scope: 110 / 120 / 121 / 188. 112 Kali is intentionally skipped."
  echo "Baseline: ops/reboot-recovery/full-stack-cold-start-baseline.yml"
}

# P0: ICMP and SSH-port reachability of every host, plus neighbor-table evidence.
check_network() {
  log_section "P0-NETWORK"
  local host
  for host in 110 120 121 188; do
    if ping_once "192.168.0.$host"; then
      ok "ping 192.168.0.$host"
    else
      fail "ping 192.168.0.$host"
    fi
    if probe_tcp "192.168.0.$host" 22; then
      ok "ssh port 192.168.0.$host:22"
    else
      fail "ssh port 192.168.0.$host:22"
    fi
  done
  if print_neighbor_rows; then
    ok "neighbor evidence printed"
  elif [ "$MONITOR_READ_ONLY" -eq 1 ]; then
    ok "neighbor evidence unavailable in monitor mode; ping and TCP gates provide primary signal"
  else
    warn "no neighbor rows printed for one or more hosts"
  fi
}

# P0: data host 188 (PostgreSQL, Redis, SignOz, momo).
check_188() {
  log_section "P0-188-DATA"
  local out
  if ! out=$(host_cmd "ollama@192.168.0.188" '
    echo "HOST $(hostname) $(uptime)"
    echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
    echo "SYSTEMD $(systemctl is-active containerd docker postgresql@14-main redis-server ollama nginx 2>/dev/null | tr "\n" " ")"
    echo "PG $(pg_isready -h localhost -p 5432 2>&1)"
    echo "REDIS $(redis-cli -p 6380 ping 2>/dev/null || redis-cli ping 2>/dev/null || true)"
    echo "PORT5432 $(nc -z -w 2 127.0.0.1 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
    echo "SIGNOZ_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3301/ || true)"
    echo "MOMO_HEALTH_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5003/health || true)"
    docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -80
  ' 2>&1); then
    fail "ssh 188 read-only check"
    echo "$out"
    return
  fi
  echo "$out"
  grep -q "PORT5432 OPEN" <<<"$out" && ok "188 PostgreSQL port open" || fail "188 PostgreSQL port closed"
  grep -q "accepting connections" <<<"$out" && ok "188 PostgreSQL accepting connections" || fail "188 PostgreSQL not accepting connections"
  grep -q "REDIS PONG" <<<"$out" && ok "188 Redis PONG" || warn "188 Redis not confirmed"
  grep -q "momo-db.*Restarting" <<<"$out" && warn "188 momo-db restarting" || ok "188 momo-db not in visible restart loop"
  grep -Eq "SIGNOZ_CODE (200|302|307)" <<<"$out" && ok "188 SignOz HTTP reachable" || warn "188 SignOz HTTP not confirmed"
  grep -q "MOMO_HEALTH_CODE 200" <<<"$out" && ok "188 momo health reachable" || warn "188 momo health not confirmed"
}

# P0: registry/observability host 110 (Harbor, Gitea, Prometheus, Alertmanager,
# Sentry, Actions runner units).
check_110() {
  log_section "P0-110-REGISTRY-OBSERVABILITY"
  local out
  if ! out=$(host_cmd "wooo@192.168.0.110" '
    echo "HOST $(hostname) $(uptime)"
    echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
    echo "DOCKER_SYSTEMD $(systemctl is-active docker 2>/dev/null || true)"
    echo "HARBOR_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5000/v2/ || true)"
    echo "GITEA_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3001/ || true)"
    echo "PROM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9090/-/ready || true)"
    echo "AM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9093/-/healthy || true)"
    echo "SENTRY_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 8 http://127.0.0.1:9000/ || true)"
    for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
      systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /"
    done
    docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120
  ' 2>&1); then
    fail "ssh 110 read-only check"
    echo "$out"
    return
  fi
  echo "$out"
  grep -Eq "HARBOR_CODE (200|401)" <<<"$out" && ok "110 Harbor /v2 healthy code" || fail "110 Harbor not healthy"
  grep -Eq "GITEA_CODE (200|302)" <<<"$out" && ok "110 Gitea reachable" || warn "110 Gitea not confirmed"
  grep -q "PROM_CODE 200" <<<"$out" && ok "110 Prometheus ready" || warn "110 Prometheus not ready"
  grep -q "AM_CODE 200" <<<"$out" && ok "110 Alertmanager healthy" || warn "110 Alertmanager not healthy"
  grep -Eq "SENTRY_CODE (200|302|400)" <<<"$out" && ok "110 Sentry HTTP reachable" || warn "110 Sentry HTTP not confirmed"
  grep -q "WatchdogUSec=0" <<<"$out" && ok "runner watchdog disabled on at least one unit" || warn "runner watchdog state not confirmed"
  grep -q "sentry-self-hosted-clickhouse-1.*Restarting" <<<"$out" && warn "Sentry ClickHouse restarting" || ok "Sentry ClickHouse not visibly restarting"
}

# P1: K3s on host 120 (node readiness, reachability of 188 PostgreSQL, VIP).
check_k3s() {
  log_section "P1-K3S"
  local out local_kubectl_out
  if ! out=$(host_cmd "wooo@192.168.0.120" '
    echo "HOST $(hostname) $(uptime)"
    echo "PG188_PORT $(nc -z -w 2 192.168.0.188 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
    echo "SYSTEMD $(systemctl is-active k3s k3s-agent keepalived 2>/dev/null | tr "\n" " ")"
    kcmd() {
      if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
        printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
      else
        sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
      fi
    }
    kcmd get nodes -o wide 2>/dev/null || true
    kcmd get pods -n awoooi-prod -o wide 2>/dev/null || true
    ip addr show | grep 192.168.0.125 || true
  ' 2>&1); then
    fail "ssh 120 k3s read-only check"
    echo "$out"
    return
  fi
  echo "$out"
  # When the remote kubectl printed no Ready node, try a kubectl on this machine.
  if ! grep -q " Ready " <<<"$out"; then
    local_kubectl_out=$(kubectl get nodes -o wide 2>/dev/null || true)
    if [ -n "$local_kubectl_out" ]; then
      echo "LOCAL_KUBECTL_FALLBACK"
      echo "$local_kubectl_out"
    fi
  else
    local_kubectl_out=""
  fi
  grep -q "PG188_PORT OPEN" <<<"$out" && ok "120 can reach 188 PostgreSQL port" || fail "120 cannot reach 188 PostgreSQL"
  grep -q " Ready " <<<"$out$local_kubectl_out" && ok "K3s has Ready node output" || fail "K3s nodes not Ready or kubectl unavailable"
  grep -q "192.168.0.125" <<<"$out" && ok "VIP 192.168.0.125 present on 120" || warn "VIP not confirmed on 120"
}

# P2: AWOOOI API/Web through the VIP and, only with --send-alert-test, one
# Alertmanager webhook POST.
check_workload_and_alertchain() {
  log_section "P2-WORKLOAD-ALERTCHAIN"
  local api_code web_code alert_code
  local out
  if out=$(host_cmd "wooo@192.168.0.120" '
    api_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32334/api/v1/health 2>/dev/null || true)
    web_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32335/ 2>/dev/null || true)
    echo "API_CODE ${api_code:-000}"
    echo "WEB_CODE ${web_code:-000}"
  ' 2>/dev/null); then
    api_code=$(awk '/^API_CODE / {print $2}' <<<"$out")
    web_code=$(awk '/^WEB_CODE / {print $2}' <<<"$out")
  else
    # Host 120 was not usable as a vantage point; probe the VIP from here.
    api_code=$(probe_http_code "http://192.168.0.125:32334/api/v1/health")
    web_code=$(probe_http_code "http://192.168.0.125:32335/")
    printf -v out 'API_CODE %s\nWEB_CODE %s' "$api_code" "$web_code"
  fi
  echo "$out"
  [[ "$api_code" =~ ^[23] ]] && ok "AWOOOI API reachable" || fail "AWOOOI API not reachable"
  [[ "$web_code" =~ ^[23] ]] && ok "AWOOOI Web reachable" || warn "AWOOOI Web not confirmed"

  if [ "$SEND_ALERT_TEST" -eq 1 ]; then
    # The response body is discarded so nothing is written on the remote host.
    # curl already prints 000 on failure, so the fallback must not print a second code.
    alert_code=$(host_cmd "wooo@192.168.0.120" 'curl -s -o /dev/null -w "%{http_code}" --max-time 8 \
      -X POST "http://192.168.0.125:32334/api/v1/webhooks/alertmanager" \
      -H '"'"'Content-Type: application/json'"'"' \
      -d '"'"'{"receiver":"cold-start-check","status":"firing","alerts":[{"status":"firing","labels":{"alertname":"ColdStartCheck","severity":"info"},"annotations":{"summary":"Cold start check"},"startsAt":"2026-05-05T11:00:00Z","endsAt":"0001-01-01T00:00:00Z","generatorURL":""}],"groupLabels":{},"commonLabels":{},"commonAnnotations":{},"externalURL":"","version":"4","groupKey":"cold-start-check"}'"'"' 2>/dev/null || true')
    alert_code="${alert_code:-000}"
    echo "ALERTCHAIN_CODE $alert_code"
    [[ "$alert_code" =~ ^2 ]] && ok "Alertmanager webhook endpoint accepts POST" || warn "Alertmanager webhook E2E not confirmed"
  elif [ "$MONITOR_READ_ONLY" -eq 1 ]; then
    ok "Alertmanager webhook POST intentionally skipped in read-only monitor mode"
  else
    warn "Alertmanager webhook POST skipped; rerun with --send-alert-test after API is ready"
  fi
}

# P2: public HTTPS routes. Each route is probed for reachability (WARN on
# failure) and again for the TLS gate (BLOCKED on failure).
check_public_routes() {
  log_section "P2-PUBLIC-ROUTES"
  local item name url code tls_code
  local routes=(
    "awoooi_api|https://awoooi.wooo.work/api/v1/health"
    "awoooi_web|https://awoooi.wooo.work/"
    "momo_web|https://mo.wooo.work/"
    "momo_health|https://mo.wooo.work/health"
    "gitea|https://gitea.wooo.work/"
    "harbor|https://harbor.wooo.work/"
    "registry|https://registry.wooo.work/"
    "sentry|https://sentry.wooo.work/"
    "signoz|https://signoz.wooo.work/"
    "stock|https://stock.wooo.work/"
    "langfuse|https://langfuse.wooo.work/"
    "bitan|https://bitan.wooo.work/"
    "aiops|https://aiops.wooo.work/"
  )
  for item in "${routes[@]}"; do
    name="${item%%|*}"
    url="${item#*|}"
    code=$(probe_http_code "$url")
    echo "PUBLIC_ROUTE $name $code $url"
    [[ "$code" =~ ^[23] ]] && ok "public route $name reachable" || warn "public route $name not confirmed"
    # NOTE(review): this gate keys off the HTTP status of a second, certificate-
    # verifying request, so any non-2xx/3xx answer is reported as a TLS failure
    # even when the handshake succeeded. Confirm whether that is intended.
    tls_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 8 "$url" 2>/dev/null || true)
    tls_code="${tls_code:-000}"
    echo "PUBLIC_ROUTE_TLS $name $tls_code $url"
    [[ "$tls_code" =~ ^[23] ]] && ok "public route $name TLS certificate verified" || fail "public route $name TLS certificate verification failed"
  done
}

# P2: cron, exporter textfile freshness, backup health and scheduled workloads
# on hosts 188, 110, 120 and 121. Every finding here is a WARN at most.
check_schedules() {
  log_section "P2-SCHEDULES"
  local out

  if out=$(host_cmd "ollama@192.168.0.188" '
    now=$(date +%s)
    echo "CRON_188 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    for f in /home/ollama/node_exporter_textfiles/backup.prom /home/ollama/node_exporter_textfiles/backup_health.prom /home/ollama/node_exporter_textfiles/docker_restart_count.prom /home/ollama/node_exporter_textfiles/docker_stats.prom /home/ollama/node_exporter_textfiles/storage_health.prom; do
      if [ -f "$f" ]; then
        mt=$(stat -c %Y "$f")
        echo "TEXTFILE_188 $(basename "$f") age=$((now - mt))"
      else
        echo "TEXTFILE_188 $(basename "$f") missing"
      fi
    done
    if [ -f /home/ollama/node_exporter_textfiles/backup.prom ]; then
      awk -v now="$now" "/^backup_110_last_success_timestamp / {printf \"BACKUP_110_AGE %d\\n\", now - int(\$2)}" /home/ollama/node_exporter_textfiles/backup.prom
    fi
    if [ -f /home/ollama/node_exporter_textfiles/backup_health.prom ]; then
      awk "/^awoooi_backup_job_fresh/ {total++; if (int(\$2) == 0) stale++}
        /^awoooi_backup_job_configured/ {if (int(\$2) == 0) missing_cron++}
        /^awoooi_backup_script_present/ {if (int(\$2) == 0) missing_script++}
        END {printf \"BACKUP_HEALTH_188 total=%d stale=%d missing_cron=%d missing_script=%d\\n\", total+0, stale+0, missing_cron+0, missing_script+0}" /home/ollama/node_exporter_textfiles/backup_health.prom
    fi
    if [ -f /home/ollama/node_exporter_textfiles/storage_health.prom ]; then
      awk "/^awoooi_host_storage_root_readonly/ {readonly=int(\$2)}
        /^awoooi_host_storage_current_boot_error_count/ {current=int(\$2)}
        END {printf \"STORAGE_HEALTH_188 root_readonly=%d current=%d\\n\", readonly+0, current+0}" /home/ollama/node_exporter_textfiles/storage_health.prom
    fi
    echo "SCHEDULER_CONTAINER_RUNNING $(docker inspect -f "{{.State.Running}}" momo-scheduler 2>/dev/null || true)"
    echo "SCHEDULER_CONTAINER_HEALTH $(docker inspect -f "{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}" momo-scheduler 2>/dev/null || true)"
    echo "SCHEDULER_REGISTERED $(docker logs --tail 200 momo-scheduler 2>&1 | grep -c "全部排程任務已註冊" || true)"
    echo "SCHEDULER_RECENT_ACTIVITY $(docker logs --since 2h momo-scheduler 2>&1 | grep -Ec "AutoImport|Meta-Analysis|Scheduler" || true)"
    momo_sync=$(docker exec momo-db sh -c "psql -U \"\$POSTGRES_USER\" -d \"\$POSTGRES_DB\" -Atc \"WITH scope AS (SELECT min(snapshot_date::date) dmin, max(snapshot_date::date) dmax, count(*) sc FROM daily_sales_snapshot WHERE snapshot_date::date >= make_date(extract(year from current_date)::int, extract(month from current_date)::int, 1)), monthly AS (SELECT count(*) mc, min(\\\"日期\\\"::date) mmin, max(\\\"日期\\\"::date) mmax FROM realtime_sales_monthly, scope WHERE scope.sc > 0 AND \\\"日期\\\"::date BETWEEN scope.dmin AND scope.dmax) SELECT coalesce(scope.sc,0)::text || chr(124) || coalesce(monthly.mc,0)::text || chr(124) || coalesce(scope.dmin::text,chr(45)) || chr(124) || coalesce(scope.dmax::text,chr(45)) || chr(124) || coalesce(monthly.mmin::text,chr(45)) || chr(124) || coalesce(monthly.mmax::text,chr(45)) FROM scope, monthly;\"" 2>/dev/null || true)
    echo "MOMO_MONTHLY_SYNC ${momo_sync:-unavailable}"
  ' 2>&1); then
    echo "$out"
    grep -q "CRON_188 active" <<<"$out" && ok "188 cron active" || warn "188 cron not confirmed"
    textfile_age_below "$out" TEXTFILE_188 backup.prom 90000 && ok "188 backup textfile fresh enough" || warn "188 backup textfile stale or missing"
    textfile_age_below "$out" TEXTFILE_188 backup_health.prom 900 && ok "188 backup health exporter fresh" || warn "188 backup health exporter stale"
    textfile_age_below "$out" TEXTFILE_188 docker_restart_count.prom 300 && ok "188 docker restart exporter fresh" || warn "188 docker restart exporter stale"
    textfile_age_below "$out" TEXTFILE_188 docker_stats.prom 300 && ok "188 docker stats exporter fresh" || warn "188 docker stats exporter stale"
    textfile_age_below "$out" TEXTFILE_188 storage_health.prom 300 && ok "188 storage health exporter fresh" || warn "188 storage health exporter stale"
    grep -q "STORAGE_HEALTH_188 root_readonly=0 current=0" <<<"$out" && ok "188 current boot storage health clean" || warn "188 storage health not clean"
    marker_number_is "$out" BACKUP_110_AGE -lt 90000 && ok "188 backup-from-110 success within 25h" || warn "188 backup-from-110 success not confirmed"
    # Fields: $3 stale, $4 missing_cron, $5 missing_script.
    awk '
      $1 == "BACKUP_HEALTH_188" && $2 ~ /^total=/ {
        seen = 1
        split($3, a, "="); split($4, b, "="); split($5, c, "=")
        good = ((a[2] + b[2] + c[2]) == 0)
        exit
      }
      END { exit !(seen && good) }
    ' <<<"$out" && ok "188 backup health has no stale expected jobs" || warn "188 backup health has stale expected jobs"
    if grep -q "SCHEDULER_CONTAINER_HEALTH healthy" <<<"$out" && marker_number_is "$out" SCHEDULER_RECENT_ACTIVITY -gt 0; then
      ok "188 momo scheduler healthy with recent task activity"
    elif marker_number_is "$out" SCHEDULER_REGISTERED -gt 0; then
      ok "188 momo scheduler registered jobs"
    else
      warn "188 momo scheduler registration/activity not confirmed"
    fi
    # Payload: snapshot_count|monthly_count|snap_min|snap_max|monthly_min|monthly_max.
    awk '
      $1 == "MOMO_MONTHLY_SYNC" {
        seen = 1
        split($2, a, "|")
        good = (a[1] > 0 && a[1] == a[2] && a[3] == a[5] && a[4] == a[6])
        exit
      }
      END { exit !(seen && good) }
    ' <<<"$out" && ok "188 momo current-month snapshot and realtime tables match" || warn "188 momo current-month snapshot/realtime sync not confirmed"
  else
    warn "188 schedule check unavailable"
    echo "$out"
  fi

  if out=$(host_cmd "wooo@192.168.0.110" '
    now=$(date +%s)
    echo "CRON_110 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    echo "FAILED_UNITS_110 $(systemctl --failed --no-legend --plain 2>/dev/null | wc -l)"
    echo "MOMO_STARTUP_ENABLED $(systemctl is-enabled momo-startup-complete.service 2>/dev/null || true)"
    echo "STAGGERED_STARTUP_ENABLED $(systemctl is-enabled wooo-staggered-startup.service 2>/dev/null || true)"
    for f in /home/wooo/node_exporter_textfiles/docker_stats.prom /home/wooo/node_exporter_textfiles/systemd_units.prom /home/wooo/node_exporter_textfiles/storage_health.prom /home/wooo/node_exporter_textfiles/backup_health.prom; do
      if [ -f "$f" ]; then
        mt=$(stat -c %Y "$f")
        echo "TEXTFILE_110 $(basename "$f") age=$((now - mt))"
      else
        echo "TEXTFILE_110 $(basename "$f") missing"
      fi
    done
    if [ -f /home/wooo/node_exporter_textfiles/storage_health.prom ]; then
      awk "/^awoooi_host_storage_root_readonly/ {readonly=int(\$2)}
        /^awoooi_host_storage_current_boot_error_count/ {current=int(\$2)}
        END {printf \"STORAGE_HEALTH_110 root_readonly=%d current=%d\\n\", readonly+0, current+0}" /home/wooo/node_exporter_textfiles/storage_health.prom
    fi
    if [ -f /home/wooo/node_exporter_textfiles/backup_health.prom ]; then
      awk "/^awoooi_backup_job_fresh/ {total++; if (int(\$2) == 0) stale++}
        /^awoooi_backup_job_configured/ {if (int(\$2) == 0) missing_cron++}
        /^awoooi_backup_script_present/ {if (int(\$2) == 0) missing_script++}
        /^awoooi_backup_last_run_failed_count/ {if (\$0 ~ /(exported_job|job)=\"backup_all\"/) failed=int(\$2)}
        /^awoooi_backup_config_capture_critical_failed_count/ {config_failed=int(\$2)}
        /^awoooi_backup_integrity_fresh/ {integrity_total++; if (int(\$2) == 0) integrity_stale++}
        END {printf \"BACKUP_HEALTH_110 total=%d stale=%d missing_cron=%d missing_script=%d failed_count=%d config_failed=%d integrity_total=%d integrity_stale=%d\\n\", total+0, stale+0, missing_cron+0, missing_script+0, failed+0, config_failed+0, integrity_total+0, integrity_stale+0}" /home/wooo/node_exporter_textfiles/backup_health.prom
    fi
  ' 2>&1); then
    echo "$out"
    grep -q "CRON_110 active" <<<"$out" && ok "110 cron active" || warn "110 cron not confirmed"
    grep -q "FAILED_UNITS_110 0" <<<"$out" && ok "110 systemd has no failed units" || warn "110 systemd failed units remain"
    grep -q "MOMO_STARTUP_ENABLED disabled" <<<"$out" && ok "110 stale momo startup unit disabled" || warn "110 stale momo startup unit not disabled"
    grep -q "STAGGERED_STARTUP_ENABLED disabled" <<<"$out" && ok "110 stale staggered startup unit disabled" || warn "110 stale staggered startup unit not disabled"
    textfile_age_below "$out" TEXTFILE_110 docker_stats.prom 300 && ok "110 docker stats exporter fresh" || warn "110 docker stats exporter stale"
    textfile_age_below "$out" TEXTFILE_110 systemd_units.prom 300 && ok "110 systemd units exporter fresh" || warn "110 systemd units exporter stale"
    textfile_age_below "$out" TEXTFILE_110 storage_health.prom 300 && ok "110 storage health exporter fresh" || warn "110 storage health exporter stale"
    textfile_age_below "$out" TEXTFILE_110 backup_health.prom 900 && ok "110 backup health exporter fresh" || warn "110 backup health exporter stale"
    grep -q "STORAGE_HEALTH_110 root_readonly=0 current=0" <<<"$out" && ok "110 current boot storage health clean" || warn "110 storage health not clean"
    # Fields: $3 stale, $4 missing_cron, $5 missing_script, $6 failed_count, $7 config_failed.
    awk '
      $1 == "BACKUP_HEALTH_110" && $2 ~ /^total=/ {
        seen = 1
        split($3, a, "="); split($4, b, "="); split($5, c, "="); split($6, d, "="); split($7, e, "=")
        good = ((a[2] + b[2] + c[2]) == 0 && d[2] == 0 && e[2] == 0)
        exit
      }
      END { exit !(seen && good) }
    ' <<<"$out" && ok "110 backup health has no stale expected jobs" || warn "110 latest aggregate/config backup had failed components; rerun backup-all after 120 recovers"
    # Field $9 is integrity_stale. A missing summary line must not pass.
    awk '
      $1 == "BACKUP_HEALTH_110" {
        seen = 1
        split($9, a, "=")
        good = (a[2] == 0)
        exit
      }
      END { exit !(seen && good) }
    ' <<<"$out" && ok "110 backup integrity and restore drill fresh" || warn "110 backup integrity or restore drill stale"
  else
    warn "110 schedule check unavailable"
    echo "$out"
  fi

  if out=$(host_cmd "wooo@192.168.0.120" '
    kcmd() {
      if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
        printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
      else
        sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
      fi
    }
    echo "CRON_120 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    kcmd get cronjobs -n awoooi-prod -o json | python3 -c "import json,sys; d=json.load(sys.stdin); items=d.get(\"items\", []); print(\"CRONJOB_COUNT\", len(items)); print(\"CRONJOB_SUSPENDED\", sum(1 for i in items if i.get(\"spec\",{}).get(\"suspend\")))"
    kcmd get jobs -n awoooi-prod -o json | python3 -c "import json,sys; d=json.load(sys.stdin); print(\"FAILED_JOBS\", sum(1 for j in d.get(\"items\", []) if any(c.get(\"type\")==\"Failed\" and c.get(\"status\")==\"True\" for c in (j.get(\"status\",{}).get(\"conditions\",[]) or []))))"
    kcmd get pods -n awoooi-prod --no-headers 2>/dev/null | awk "\$3 !~ /^(Running|Completed)$/ {bad++} END {print \"BAD_PODS\", bad+0}"
  ' 2>&1); then
    echo "$out"
    grep -q "CRON_120 active" <<<"$out" && ok "120 cron active" || warn "120 cron not confirmed"
    marker_number_is "$out" CRONJOB_COUNT -ge 4 && ok "K8s AWOOOI CronJobs present" || warn "K8s AWOOOI CronJobs missing"
    grep -q "CRONJOB_SUSPENDED 0" <<<"$out" && ok "K8s AWOOOI CronJobs unsuspended" || warn "K8s AWOOOI CronJob suspended"
    grep -q "FAILED_JOBS 0" <<<"$out" && ok "K8s AWOOOI has no failed Jobs" || warn "K8s AWOOOI failed Jobs remain"
    grep -q "BAD_PODS 0" <<<"$out" && ok "K8s AWOOOI pods Running/Completed only" || warn "K8s AWOOOI bad pod status remains"
  else
    warn "120 K8s schedule check unavailable"
    echo "$out"
  fi

  if out=$(host_cmd "wooo@192.168.0.121" '
    echo "CRON_121 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    crontab -l 2>/dev/null | grep -q "dr-drill.sh" && echo "DR_DRILL_CRON present" || echo "DR_DRILL_CRON missing"
  ' 2>&1); then
    echo "$out"
    grep -q "CRON_121 active" <<<"$out" && ok "121 cron active" || warn "121 cron not confirmed"
    grep -q "DR_DRILL_CRON present" <<<"$out" && ok "121 DR drill cron present" || warn "121 DR drill cron missing"
  else
    warn "121 schedule check unavailable"
    echo "$out"
  fi
}

# Print the totals and terminate: exit 2 when any gate is BLOCKED, exit 1 when
# only warnings remain, otherwise fall through (status 0).
summary() {
  log_section "SUMMARY"
  echo "PASS=$PASS WARN=$WARN BLOCKED=$FAIL"
  if [ "$FAIL" -gt 0 ]; then
    echo "Result: BLOCKED. Fix the first blocked gate before releasing runner/CD/AI auto-remediation."
    exit 2
  fi
  if [ "$WARN" -gt 0 ]; then
    echo "Result: DEGRADED. Core gates passed but warnings remain."
    exit 1
  fi
  echo "Result: GREEN. Full stack is ready for controlled runner/CD release."
}

# Watch mode: re-run this script (without --watch) until it reports GREEN or
# the attempt budget is used up, then exit with the last run's status.
if [ "$WATCH_MODE" -eq 1 ]; then
  args=()
  [ "$MONITOR_READ_ONLY" -eq 1 ] && args+=(--monitor-read-only)
  [ "$NO_COLOR_FLAG" -eq 1 ] && args+=(--no-color)
  [ "$SEND_ALERT_TEST" -eq 1 ] && args+=(--send-alert-test)
  attempt=1
  while true; do
    echo "WATCH_ATTEMPT=$attempt"
    # The ${args[@]+...} form keeps an empty array from tripping "set -u" on
    # bash releases older than 4.4 (for example the bash 3.2 shipped with macOS).
    bash "$0" ${args[@]+"${args[@]}"}
    rc=$?
    [ "$rc" -eq 0 ] && exit 0
    if [ "$WATCH_MAX_ATTEMPTS" -gt 0 ] && [ "$attempt" -ge "$WATCH_MAX_ATTEMPTS" ]; then
      exit "$rc"
    fi
    attempt=$((attempt + 1))
    sleep "$WATCH_INTERVAL"
  done
fi

print_header
check_network
check_188
check_110
check_k3s
check_workload_and_alertchain
check_public_routes
check_schedules
summary