Files
awoooi/scripts/reboot-recovery/full-stack-cold-start-check.sh
2026-05-29 12:41:34 +08:00

588 lines
27 KiB
Bash
Executable File

#!/usr/bin/env bash
# AWOOOI full-stack cold-start readiness check.
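# Read-only by default: it never restarts, deletes, or repairs anything.
# The only remote write is opt-in: --send-alert-test POSTs one test alert and
# saves the response body to /tmp/awoooi-alertchain.out on the host running curl.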
# Shell options: unset variables are errors and a pipeline fails if any stage
# fails. errexit is not enabled, so a failing command does not abort the script.
set -u
set -o pipefail

# Options passed to every ssh invocation made through ssh_cmd.
SSH_OPTS=(
  -o BatchMode=yes
  -o ConnectTimeout=6
)

# Command-line switches (0 = off, 1 = on); the argument parser may change them.
SEND_ALERT_TEST=0
MONITOR_READ_ONLY=0
NO_COLOR_FLAG=0
WATCH_MODE=0

# --watch tuning: seconds between attempts and the attempt cap.
WATCH_INTERVAL=60
WATCH_MAX_ATTEMPTS=30
# Print the command-line help text to stdout (callers redirect it to stderr
# themselves when reporting a usage error).
usage() {
  printf '%s\n' \
    'Usage: bash scripts/reboot-recovery/full-stack-cold-start-check.sh [options]' \
    'Options:' \
    '--send-alert-test POST one Alertmanager webhook test after AWOOOI API is ready.' \
    '--monitor-read-only Skip the webhook POST without warning; intended for cron/textfile monitors.' \
    '--watch Repeat checks until all gates are GREEN or max attempts is reached.' \
    '--interval SECONDS Retry interval for --watch. Default: 60.' \
    '--max-attempts COUNT Max attempts for --watch. Default: 30. Use 0 for unlimited.' \
    '--no-color Disable ANSI colors in output.' \
    '-h, --help Show this help.' \
    'Default mode is read-only and does not POST an Alertmanager test event.' \
    'Use --send-alert-test for the final release gate after AWOOOI API is expected to be ready.'
}
# Parse command-line options into the global option variables.
# Arguments: the script's positional parameters.
# Globals written: SEND_ALERT_TEST, MONITOR_READ_ONLY, NO_COLOR_FLAG,
#                  WATCH_MODE, WATCH_INTERVAL, WATCH_MAX_ATTEMPTS.
# Exits 0 after printing help, 64 on an unknown option or invalid value.
parse_args() {
  local arg
  while [ "$#" -gt 0 ]; do
    arg="$1"
    case "$arg" in
      --send-alert-test)
        SEND_ALERT_TEST=1
        ;;
      --monitor-read-only)
        MONITOR_READ_ONLY=1
        SEND_ALERT_TEST=0
        ;;
      --no-color)
        NO_COLOR_FLAG=1
        ;;
      --watch)
        WATCH_MODE=1
        ;;
      --interval)
        # The value is the next word; a missing value fails the regex below.
        shift
        if ! [[ "${1:-}" =~ ^[0-9]+$ ]] || [ "${1:-0}" -lt 1 ]; then
          echo "--interval requires a positive integer number of seconds" >&2
          exit 64
        fi
        WATCH_INTERVAL="$1"
        ;;
      --max-attempts)
        shift
        if ! [[ "${1:-}" =~ ^[0-9]+$ ]]; then
          echo "--max-attempts requires a non-negative integer" >&2
          exit 64
        fi
        WATCH_MAX_ATTEMPTS="$1"
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown argument: $arg" >&2
        usage >&2
        exit 64
        ;;
    esac
    shift
  done
  # --monitor-read-only documents "skip the webhook POST"; enforce that even
  # when --send-alert-test appears later on the command line.
  if [ "${MONITOR_READ_ONLY:-0}" -eq 1 ]; then
    SEND_ALERT_TEST=0
  fi
}
parse_args "$@"
# ANSI palette used by the report helpers. Start from the coloured values and
# blank all of them when NO_COLOR is set in the environment or --no-color was
# given on the command line.
RED=$'\033[0;31m'
GREEN=$'\033[0;32m'
YELLOW=$'\033[1;33m'
BLUE=$'\033[0;34m'
NC=$'\033[0m'
if [ -n "${NO_COLOR:-}" ] || [ "$NO_COLOR_FLAG" -eq 1 ]; then
  RED="" GREEN="" YELLOW="" BLUE="" NC=""
fi
# Gate tallies, incremented by ok / warn / fail and read by summary.
PASS=0 WARN=0 FAIL=0
# Print a section banner: a blank line, then "=== <title> ===" wrapped in the
# BLUE / NC colour codes.
# Arguments: $1 - section title.
log_section() {
  local title="$1"
  printf '\n%s\n' "${BLUE}=== ${title} ===${NC}"
}
# Report a passed gate as "OK <message>" and count it in PASS.
# Arguments: $1 - message.
ok() {
  local message="$1"
  printf '%s\n' "${GREEN}OK${NC} ${message}"
  PASS=$((PASS + 1))
}
# Report a degraded gate as "WARN <message>" and count it in WARN.
# Arguments: $1 - message.
warn() {
  local message="$1"
  printf '%s\n' "${YELLOW}WARN${NC} ${message}"
  WARN=$((WARN + 1))
}
# Report a blocking gate as "BLOCKED <message>" and count it in FAIL.
# Arguments: $1 - message.
fail() {
  local message="$1"
  printf '%s\n' "${RED}BLOCKED${NC} ${message}"
  FAIL=$((FAIL + 1))
}
# Run a local command, report it through ok/fail, then print what it wrote.
# Arguments: $1 - gate label; remaining arguments - the command and its args.
# Outputs:   the ok/fail line followed by the command's combined stdout+stderr.
# Returns:   0 when the command succeeded, 1 otherwise.
run_local() {
  local label="$1"
  shift
  local capture status=0
  # Use a private temp file instead of a fixed /tmp name, so concurrent runs
  # cannot clobber each other and nothing is left behind afterwards.
  if ! capture=$(mktemp "${TMPDIR:-/tmp}/awoooi-cold-start-check.XXXXXX"); then
    fail "$label"
    echo "run_local: mktemp failed; command not executed" >&2
    return 1
  fi
  "$@" >"$capture" 2>&1 || status=$?
  if [ "$status" -eq 0 ]; then
    ok "$label"
  else
    fail "$label"
  fi
  cat "$capture"
  rm -f -- "$capture"
  [ "$status" -eq 0 ]
}
# Run a command string on a remote host over ssh using SSH_OPTS.
# Arguments: $1 - user@host, $2 - command string for the remote shell.
# When REMOTE_SUDO_PASSWORD is set locally, a shell-quoted assignment of it is
# prepended to the remote command so the remote script can read it.
# NOTE(review): the value therefore travels inside the remote command line —
# confirm that exposure is acceptable for the target hosts.
ssh_cmd() {
  local target="$1"
  local remote_cmd="$2"
  if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
    local quoted_secret
    printf -v quoted_secret '%q' "$REMOTE_SUDO_PASSWORD"
    remote_cmd="REMOTE_SUDO_PASSWORD=${quoted_secret} ${remote_cmd}"
  fi
  ssh "${SSH_OPTS[@]}" "$target" "$remote_cmd"
}
# Tell whether this machine currently holds the given IPv4 address.
# Arguments: $1 - IPv4 address in dotted form.
# Returns:   0 when the address is listed by `ip -o -4 addr show` or, failing
#            that, by `hostname -I`; non-zero otherwise.
host_has_ip() {
  local expected_ip="$1"
  # Both lookups compare the address as an exact string (a regex would let the
  # dots match any character) and let awk read all input, so no stage is cut
  # short by an early-exiting reader while pipefail is active.
  if command -v ip >/dev/null 2>&1; then
    if ip -o -4 addr show 2>/dev/null \
      | awk -v want="$expected_ip" '
          { split($4, cidr, "/"); if (cidr[1] == want) found = 1 }
          END { exit !found }
        '; then
      return 0
    fi
  fi
  hostname -I 2>/dev/null \
    | awk -v want="$expected_ip" '
        { for (i = 1; i <= NF; i++) if ($i == want) found = 1 }
        END { exit !found }
      '
}
# Run a command string "on" a stack host: locally through a login shell when
# this machine holds that host's address, otherwise remotely through ssh_cmd.
# Arguments: $1 - user@host, $2 - command string.
# Returns:   the status of whichever execution path ran.
host_cmd() {
  local target="$1"
  local script="$2"
  local addr
  for addr in 192.168.0.110 192.168.0.120 192.168.0.121 192.168.0.188; do
    if [[ "$target" == *@"$addr" ]]; then
      if host_has_ip "$addr"; then
        bash -lc "$script"
        return
      fi
      # Known host, but not this machine: fall through to ssh.
      break
    fi
  done
  ssh_cmd "$target" "$script"
}
# Fetch a URL with curl and print the HTTP status code.
# Arguments: $1 - URL.
# Outputs:   a three-digit code on stdout; "000" when no HTTP response was
#            obtained in any attempt.
# Makes up to two attempts, pausing one second only between attempts (there is
# no pause after the last one).
probe_http_code() {
  local url="$1"
  local max_attempts=2
  local attempt code=""
  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    # curl's own failure is tolerated here; the code it printed decides.
    code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 12 "$url" 2>/dev/null || true)
    if [[ "$code" =~ ^[0-9]{3}$ ]] && [ "$code" != "000" ]; then
      echo "$code"
      return
    fi
    if ((attempt < max_attempts)); then
      sleep 1
    fi
  done
  # Normalise anything that is not a three-digit code (empty, garbage) to 000.
  [[ "$code" =~ ^[0-9]{3}$ ]] || code="000"
  echo "$code"
}
# Check that a TCP port accepts connections, using nc in zero-I/O (-z) mode.
# Arguments: $1 - host, $2 - port.
# Returns:   0 when either nc invocation succeeds.
# NOTE(review): -G 3 looks like the BSD/macOS nc connect-timeout flag, with
# -w 3 as the fallback for other nc builds — confirm against the nc in use.
probe_tcp() {
  local target="$1"
  local port_no="$2"
  if nc -G 3 -z "$target" "$port_no" >/dev/null 2>&1; then
    return 0
  fi
  nc -w 3 -z "$target" "$port_no" >/dev/null 2>&1
}
# Print neighbour-table rows for the four stack hosts.
# Uses `arp -an` when arp is installed, otherwise `ip neigh show`.
# Returns: the filter pipeline's status (non-zero when no row matched), or 1
#          when neither tool is available.
print_neighbor_rows() {
  local hosts_re='192\.168\.0\.(110|120|121|188)'
  if command -v arp >/dev/null 2>&1; then
    arp -an | grep -E "$hosts_re"
  elif command -v ip >/dev/null 2>&1; then
    ip neigh show | grep -E "$hosts_re"
  else
    return 1
  fi
}
# Print the report banner: title, current local timestamp, scope and baseline.
print_header() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S %Z')
  printf '%s\n' \
    "AWOOOI full-stack cold-start check" \
    "$stamp" \
    "Scope: 110 / 120 / 121 / 188. 112 Kali is intentionally skipped." \
    "Baseline: ops/reboot-recovery/full-stack-cold-start-baseline.yml"
}
# P0 gate: basic reachability of the four stack hosts.
# For each of 192.168.0.{110,120,121,188}: one ICMP ping and a TCP probe of
# port 22, each reported as ok or fail. Afterwards neighbour-table rows are
# printed as supporting evidence; their absence is a warning, except in
# monitor mode where it is reported as ok.
check_network() {
  log_section "P0-NETWORK"
  local octet addr
  for octet in 110 120 121 188; do
    addr="192.168.0.${octet}"
    if ! ping -c 1 -W 2 "$addr" >/dev/null 2>&1; then
      fail "ping $addr"
    else
      ok "ping $addr"
    fi
    if ! probe_tcp "$addr" 22; then
      fail "ssh port $addr:22"
    else
      ok "ssh port $addr:22"
    fi
  done
  if print_neighbor_rows; then
    ok "neighbor evidence printed"
    return
  fi
  if [ "$MONITOR_READ_ONLY" -eq 1 ]; then
    ok "neighbor evidence unavailable in monitor mode; ping and TCP gates provide primary signal"
  else
    warn "no neighbor rows printed for one or more hosts"
  fi
}
# P0 gate for host 188 (data services).
# Runs one read-only probe script on ollama@192.168.0.188 through host_cmd,
# echoes its raw output, then grades individual lines of that output:
# PostgreSQL port/readiness are blocking; Redis, momo-db restart state, SignOz
# and momo health are warnings. If the probe itself fails, the gate is blocked
# and nothing else is graded.
check_188() {
  log_section "P0-188-DATA"
  local report rc=0
  # The remote script is a single-quoted string passed verbatim; keep it flush-left.
  report=$(host_cmd "ollama@192.168.0.188" '
echo "HOST $(hostname) $(uptime)"
echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
echo "SYSTEMD $(systemctl is-active containerd docker postgresql@14-main redis-server ollama nginx 2>/dev/null | tr "\n" " ")"
echo "PG $(pg_isready -h localhost -p 5432 2>&1)"
echo "REDIS $(redis-cli -p 6380 ping 2>/dev/null || redis-cli ping 2>/dev/null || true)"
echo "PORT5432 $(nc -z -w 2 127.0.0.1 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
echo "SIGNOZ_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3301/ || true)"
echo "MOMO_HEALTH_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5003/health || true)"
docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -80
' 2>&1) || rc=$?
  if [ "$rc" -ne 0 ]; then
    fail "ssh 188 read-only check"
    echo "$report"
    return
  fi
  echo "$report"

  if grep -q "PORT5432 OPEN" <<<"$report"; then
    ok "188 PostgreSQL port open"
  else
    fail "188 PostgreSQL port closed"
  fi
  if grep -q "accepting connections" <<<"$report"; then
    ok "188 PostgreSQL accepting connections"
  else
    fail "188 PostgreSQL not accepting connections"
  fi
  if grep -q "REDIS PONG" <<<"$report"; then
    ok "188 Redis PONG"
  else
    warn "188 Redis not confirmed"
  fi
  # Inverted gate: a match here is the bad case.
  if grep -q "momo-db.*Restarting" <<<"$report"; then
    warn "188 momo-db restarting"
  else
    ok "188 momo-db not in visible restart loop"
  fi
  if grep -Eq "SIGNOZ_CODE (200|302|307)" <<<"$report"; then
    ok "188 SignOz HTTP reachable"
  else
    warn "188 SignOz HTTP not confirmed"
  fi
  if grep -q "MOMO_HEALTH_CODE 200" <<<"$report"; then
    ok "188 momo health reachable"
  else
    warn "188 momo health not confirmed"
  fi
}
# P0 gate for host 110 (registry and observability).
# Runs one read-only probe script on wooo@192.168.0.110 through host_cmd,
# echoes its raw output, then grades individual lines: Harbor is blocking;
# Gitea, Prometheus, Alertmanager, Sentry, runner watchdog state and the Sentry
# ClickHouse restart state are warnings. A failed probe blocks the gate.
check_110() {
  log_section "P0-110-REGISTRY-OBSERVABILITY"
  local report rc=0
  # The remote script is a single-quoted string passed verbatim; keep it flush-left.
  report=$(host_cmd "wooo@192.168.0.110" '
echo "HOST $(hostname) $(uptime)"
echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
echo "DOCKER_SYSTEMD $(systemctl is-active docker 2>/dev/null || true)"
echo "HARBOR_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5000/v2/ || true)"
echo "GITEA_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3001/ || true)"
echo "PROM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9090/-/ready || true)"
echo "AM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9093/-/healthy || true)"
echo "SENTRY_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 8 http://127.0.0.1:9000/ || true)"
for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /"
done
docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120
' 2>&1) || rc=$?
  if [ "$rc" -ne 0 ]; then
    fail "ssh 110 read-only check"
    echo "$report"
    return
  fi
  echo "$report"

  if grep -Eq "HARBOR_CODE (200|401)" <<<"$report"; then
    ok "110 Harbor /v2 healthy code"
  else
    fail "110 Harbor not healthy"
  fi
  if grep -Eq "GITEA_CODE (200|302)" <<<"$report"; then
    ok "110 Gitea reachable"
  else
    warn "110 Gitea not confirmed"
  fi
  if grep -q "PROM_CODE 200" <<<"$report"; then
    ok "110 Prometheus ready"
  else
    warn "110 Prometheus not ready"
  fi
  if grep -q "AM_CODE 200" <<<"$report"; then
    ok "110 Alertmanager healthy"
  else
    warn "110 Alertmanager not healthy"
  fi
  if grep -Eq "SENTRY_CODE (200|302|400)" <<<"$report"; then
    ok "110 Sentry HTTP reachable"
  else
    warn "110 Sentry HTTP not confirmed"
  fi
  if grep -q "WatchdogUSec=0" <<<"$report"; then
    ok "runner watchdog disabled on at least one unit"
  else
    warn "runner watchdog state not confirmed"
  fi
  # Inverted gate: a match here is the bad case.
  if grep -q "sentry-self-hosted-clickhouse-1.*Restarting" <<<"$report"; then
    warn "Sentry ClickHouse restarting"
  else
    ok "Sentry ClickHouse not visibly restarting"
  fi
}
# P1 gate for the K3s cluster, probed from host 120.
# Runs one read-only script on wooo@192.168.0.120 through host_cmd (node and
# pod listings, reachability of 188:5432, presence of the 192.168.0.125
# address) and echoes its output. When that output shows no " Ready " node
# row, a local `kubectl get nodes` is tried as a fallback and printed too.
# Blocking: 188 PostgreSQL reachability and a Ready node; warning: the VIP.
check_k3s() {
  log_section "P1-K3S"
  local remote_out fallback_out="" rc=0
  # The remote script is a single-quoted string passed verbatim; keep it flush-left.
  remote_out=$(host_cmd "wooo@192.168.0.120" '
echo "HOST $(hostname) $(uptime)"
echo "PG188_PORT $(nc -z -w 2 192.168.0.188 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
echo "SYSTEMD $(systemctl is-active k3s k3s-agent keepalived 2>/dev/null | tr "\n" " ")"
kcmd() {
if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
else
sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
fi
}
kcmd get nodes -o wide 2>/dev/null || true
kcmd get pods -n awoooi-prod -o wide 2>/dev/null || true
ip addr show | grep 192.168.0.125 || true
' 2>&1) || rc=$?
  if [ "$rc" -ne 0 ]; then
    fail "ssh 120 k3s read-only check"
    echo "$remote_out"
    return
  fi
  echo "$remote_out"

  # Only consult the local kubectl when the remote listing shows no Ready node.
  if ! grep -q " Ready " <<<"$remote_out"; then
    fallback_out=$(kubectl get nodes -o wide 2>/dev/null || true)
    if [ -n "$fallback_out" ]; then
      echo "LOCAL_KUBECTL_FALLBACK"
      echo "$fallback_out"
    fi
  fi

  if grep -q "PG188_PORT OPEN" <<<"$remote_out"; then
    ok "120 can reach 188 PostgreSQL port"
  else
    fail "120 cannot reach 188 PostgreSQL"
  fi
  if grep -q " Ready " <<<"$remote_out$fallback_out"; then
    ok "K3s has Ready node output"
  else
    fail "K3s nodes not Ready or kubectl unavailable"
  fi
  if grep -q "192.168.0.125" <<<"$remote_out"; then
    ok "VIP 192.168.0.125 present on 120"
  else
    warn "VIP not confirmed on 120"
  fi
}
# P2 gate: AWOOOI API / Web reachability and, optionally, the alert webhook.
# The API and Web endpoints on 192.168.0.125 are probed from host 120; when
# that probe cannot run, the same URLs are probed from this machine instead.
# API failure is blocking, Web failure is a warning.
# With SEND_ALERT_TEST=1 one test alert is POSTed to the Alertmanager webhook
# from host 120; otherwise the POST is skipped (silently in monitor mode, with
# a warning in the default mode).
check_workload_and_alertchain() {
  log_section "P2-WORKLOAD-ALERTCHAIN"
  local api_status web_status webhook_status
  local report
  # Remote scripts are single-quoted strings passed verbatim; keep them flush-left.
  local probe_script='
api_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32334/api/v1/health 2>/dev/null || true)
web_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32335/ 2>/dev/null || true)
echo "API_CODE ${api_code:-000}"
echo "WEB_CODE ${web_code:-000}"
'
  if report=$(host_cmd "wooo@192.168.0.120" "$probe_script" 2>/dev/null); then
    api_status=$(awk '/^API_CODE / {print $2}' <<<"$report")
    web_status=$(awk '/^WEB_CODE / {print $2}' <<<"$report")
  else
    api_status=$(probe_http_code "http://192.168.0.125:32334/api/v1/health")
    web_status=$(probe_http_code "http://192.168.0.125:32335/")
    printf -v report 'API_CODE %s\nWEB_CODE %s' "$api_status" "$web_status"
  fi
  echo "$report"

  if [[ "$api_status" =~ ^[23] ]]; then
    ok "AWOOOI API reachable"
  else
    fail "AWOOOI API not reachable"
  fi
  if [[ "$web_status" =~ ^[23] ]]; then
    ok "AWOOOI Web reachable"
  else
    warn "AWOOOI Web not confirmed"
  fi

  if [ "$SEND_ALERT_TEST" -eq 1 ]; then
    local webhook_script='curl -s -o /tmp/awoooi-alertchain.out -w "%{http_code}" --max-time 8 \
-X POST "http://192.168.0.125:32334/api/v1/webhooks/alertmanager" \
-H '"'"'Content-Type: application/json'"'"' \
-d '"'"'{"receiver":"cold-start-check","status":"firing","alerts":[{"status":"firing","labels":{"alertname":"ColdStartCheck","severity":"info"},"annotations":{"summary":"Cold start check"},"startsAt":"2026-05-05T11:00:00Z","endsAt":"0001-01-01T00:00:00Z","generatorURL":""}],"groupLabels":{},"commonLabels":{},"commonAnnotations":{},"externalURL":"","version":"4","groupKey":"cold-start-check"}'"'"' 2>/dev/null || echo "000"'
    webhook_status=$(host_cmd "wooo@192.168.0.120" "$webhook_script")
    echo "ALERTCHAIN_CODE $webhook_status"
    if [[ "$webhook_status" =~ ^2 ]]; then
      ok "Alertmanager webhook endpoint accepts POST"
    else
      warn "Alertmanager webhook E2E not confirmed"
    fi
  elif [ "$MONITOR_READ_ONLY" -eq 1 ]; then
    ok "Alertmanager webhook POST intentionally skipped in read-only monitor mode"
  else
    warn "Alertmanager webhook POST skipped; rerun with --send-alert-test after API is ready"
  fi
}
# P2 gate: public HTTPS routes.
# For every "name|url" entry two things are reported:
#   1. reachability via probe_http_code (2xx/3xx is ok, anything else warns);
#   2. a single certificate-verifying curl request (2xx/3xx is ok, anything
#      else is blocking).
# The blocking message names the actual cause, taken from curl's exit status:
# a TLS failure, an HTTP status outside 2xx/3xx received over a working TLS
# session, or no response at all. Which results block is unchanged.
check_public_routes() {
  log_section "P2-PUBLIC-ROUTES"
  local item name url code tls_code tls_rc
  local routes=(
    "awoooi_api|https://awoooi.wooo.work/api/v1/health"
    "awoooi_web|https://awoooi.wooo.work/"
    "momo_web|https://mo.wooo.work/"
    "momo_health|https://mo.wooo.work/health"
    "gitea|https://gitea.wooo.work/"
    "harbor|https://harbor.wooo.work/"
    "registry|https://registry.wooo.work/"
    "sentry|https://sentry.wooo.work/"
    "signoz|https://signoz.wooo.work/"
    "stock|https://stock.wooo.work/"
    "langfuse|https://langfuse.wooo.work/"
    "bitan|https://bitan.wooo.work/"
    "aiops|https://aiops.wooo.work/"
  )
  for item in "${routes[@]}"; do
    name="${item%%|*}"
    url="${item#*|}"

    code=$(probe_http_code "$url")
    echo "PUBLIC_ROUTE $name $code $url"
    if [[ "$code" =~ ^[23] ]]; then
      ok "public route $name reachable"
    else
      warn "public route $name not confirmed"
    fi

    # Keep curl's exit status: it is what tells a TLS failure apart from an
    # HTTP-level or network-level one.
    tls_rc=0
    tls_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 8 "$url" 2>/dev/null) || tls_rc=$?
    [[ "$tls_code" =~ ^[0-9]{3}$ ]] || tls_code="000"
    echo "PUBLIC_ROUTE_TLS $name $tls_code $url"
    if [[ "$tls_code" =~ ^[23] ]]; then
      ok "public route $name TLS certificate verified"
      continue
    fi
    case "$tls_rc" in
      35|51|58|59|60|77|83|90|91)
        # curl exit codes for TLS handshake / certificate problems.
        fail "public route $name TLS certificate verification failed"
        ;;
      *)
        if [ "$tls_code" != "000" ]; then
          fail "public route $name TLS handshake OK but HTTP $tls_code is not 2xx/3xx"
        else
          fail "public route $name unreachable; TLS not evaluated (curl exit $tls_rc)"
        fi
        ;;
    esac
  done
}
# Report the outcome of the immediately preceding test.
# Arguments: $1 - exit status, $2 - ok message, $3 - warn message.
_schedule_report() {
  if [ "$1" -eq 0 ]; then
    ok "$2"
  else
    warn "$3"
  fi
}

# Succeed only when the text in $1 contains a "<tag> <file> age=<n>" line whose
# n is below the limit. A "missing" line, or no line at all, fails.
# Arguments: $1 - text, $2 - tag, $3 - file name, $4 - limit in seconds.
_schedule_textfile_fresh() {
  awk -v tag="$2" -v file="$3" -v limit="$4" '
    $1 == tag && $2 == file && $3 ~ /^age=/ {
      seen = 1
      split($3, kv, "=")
      fresh = (kv[2] + 0 < limit + 0)
      exit
    }
    END { exit ((seen && fresh) ? 0 : 1) }
  ' <<<"$1"
}

# Succeed only when the text in $1 contains a "<key> <integer>" line whose
# integer satisfies the comparison. An absent or non-numeric line fails.
# Arguments: $1 - text, $2 - key, $3 - lt | gt | ge, $4 - threshold.
_schedule_metric_ok() {
  awk -v key="$2" -v op="$3" -v limit="$4" '
    $1 == key && $2 ~ /^-?[0-9]+$/ {
      seen = 1
      v = $2 + 0
      t = limit + 0
      pass = (op == "lt" && v < t) || (op == "gt" && v > t) || (op == "ge" && v >= t)
      exit
    }
    END { exit ((seen && pass) ? 0 : 1) }
  ' <<<"$1"
}

# P2 gate: scheduled work on hosts 188, 110, 120 and 121.
# Each host gets one read-only probe script through host_cmd; its raw output is
# echoed and individual lines are graded as ok or warn (nothing here blocks).
# A host whose probe cannot run yields a single "unavailable" warning.
# Evidence that is absent from the output counts as not confirmed.
check_schedules() {
  log_section "P2-SCHEDULES"
  local out

  # ---- 188: cron, exporter textfiles, backups, momo scheduler, momo sync ----
  # Remote scripts are single-quoted strings passed verbatim; keep them flush-left.
  if out=$(host_cmd "ollama@192.168.0.188" '
now=$(date +%s)
echo "CRON_188 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
for f in /home/ollama/node_exporter_textfiles/backup.prom /home/ollama/node_exporter_textfiles/backup_health.prom /home/ollama/node_exporter_textfiles/docker_restart_count.prom /home/ollama/node_exporter_textfiles/docker_stats.prom /home/ollama/node_exporter_textfiles/storage_health.prom; do
if [ -f "$f" ]; then
mt=$(stat -c %Y "$f")
echo "TEXTFILE_188 $(basename "$f") age=$((now - mt))"
else
echo "TEXTFILE_188 $(basename "$f") missing"
fi
done
if [ -f /home/ollama/node_exporter_textfiles/backup.prom ]; then
awk -v now="$now" "/^backup_110_last_success_timestamp / {printf \"BACKUP_110_AGE %d\\n\", now - int(\$2)}" /home/ollama/node_exporter_textfiles/backup.prom
fi
if [ -f /home/ollama/node_exporter_textfiles/backup_health.prom ]; then
awk "/^awoooi_backup_job_fresh/ {total++; if (int(\$2) == 0) stale++} /^awoooi_backup_job_configured/ {if (int(\$2) == 0) missing_cron++} /^awoooi_backup_script_present/ {if (int(\$2) == 0) missing_script++} END {printf \"BACKUP_HEALTH_188 total=%d stale=%d missing_cron=%d missing_script=%d\\n\", total+0, stale+0, missing_cron+0, missing_script+0}" /home/ollama/node_exporter_textfiles/backup_health.prom
fi
if [ -f /home/ollama/node_exporter_textfiles/storage_health.prom ]; then
awk "/^awoooi_host_storage_root_readonly/ {readonly=int(\$2)} /^awoooi_host_storage_current_boot_error_count/ {current=int(\$2)} END {printf \"STORAGE_HEALTH_188 root_readonly=%d current=%d\\n\", readonly+0, current+0}" /home/ollama/node_exporter_textfiles/storage_health.prom
fi
echo "SCHEDULER_CONTAINER_RUNNING $(docker inspect -f "{{.State.Running}}" momo-scheduler 2>/dev/null || true)"
echo "SCHEDULER_CONTAINER_HEALTH $(docker inspect -f "{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}" momo-scheduler 2>/dev/null || true)"
echo "SCHEDULER_REGISTERED $(docker logs --tail 200 momo-scheduler 2>&1 | grep -c "全部排程任務已註冊" || true)"
echo "SCHEDULER_RECENT_ACTIVITY $(docker logs --since 2h momo-scheduler 2>&1 | grep -Ec "AutoImport|Meta-Analysis|Scheduler" || true)"
momo_sync=$(docker exec momo-db sh -c "psql -U \"\$POSTGRES_USER\" -d \"\$POSTGRES_DB\" -Atc \"WITH scope AS (SELECT min(snapshot_date::date) dmin, max(snapshot_date::date) dmax, count(*) sc FROM daily_sales_snapshot WHERE snapshot_date::date >= make_date(extract(year from current_date)::int, extract(month from current_date)::int, 1)), monthly AS (SELECT count(*) mc, min(\\\"日期\\\"::date) mmin, max(\\\"日期\\\"::date) mmax FROM realtime_sales_monthly, scope WHERE scope.sc > 0 AND \\\"日期\\\"::date BETWEEN scope.dmin AND scope.dmax) SELECT coalesce(scope.sc,0)::text || chr(124) || coalesce(monthly.mc,0)::text || chr(124) || coalesce(scope.dmin::text,chr(45)) || chr(124) || coalesce(scope.dmax::text,chr(45)) || chr(124) || coalesce(monthly.mmin::text,chr(45)) || chr(124) || coalesce(monthly.mmax::text,chr(45)) FROM scope, monthly;\"" 2>/dev/null || true)
echo "MOMO_MONTHLY_SYNC ${momo_sync:-unavailable}"
' 2>&1); then
    echo "$out"
    grep -q "CRON_188 active" <<<"$out"
    _schedule_report $? "188 cron active" "188 cron not confirmed"
    _schedule_textfile_fresh "$out" TEXTFILE_188 backup.prom 90000
    _schedule_report $? "188 backup textfile fresh enough" "188 backup textfile stale or missing"
    _schedule_textfile_fresh "$out" TEXTFILE_188 backup_health.prom 900
    _schedule_report $? "188 backup health exporter fresh" "188 backup health exporter stale"
    _schedule_textfile_fresh "$out" TEXTFILE_188 docker_restart_count.prom 300
    _schedule_report $? "188 docker restart exporter fresh" "188 docker restart exporter stale"
    _schedule_textfile_fresh "$out" TEXTFILE_188 docker_stats.prom 300
    _schedule_report $? "188 docker stats exporter fresh" "188 docker stats exporter stale"
    _schedule_textfile_fresh "$out" TEXTFILE_188 storage_health.prom 300
    _schedule_report $? "188 storage health exporter fresh" "188 storage health exporter stale"
    grep -q "STORAGE_HEALTH_188 root_readonly=0 current=0" <<<"$out"
    _schedule_report $? "188 current boot storage health clean" "188 storage health not clean"
    _schedule_metric_ok "$out" BACKUP_110_AGE lt 90000
    _schedule_report $? "188 backup-from-110 success within 25h" "188 backup-from-110 success not confirmed"
    grep -q "BACKUP_HEALTH_188 total=" <<<"$out" \
      && awk '/BACKUP_HEALTH_188/ {split($3,a,"="); split($4,b,"="); split($5,c,"="); exit !((a[2]+b[2]+c[2]) == 0)}' <<<"$out"
    _schedule_report $? "188 backup health has no stale expected jobs" "188 backup health has stale expected jobs"
    if grep -q "SCHEDULER_CONTAINER_HEALTH healthy" <<<"$out" \
      && _schedule_metric_ok "$out" SCHEDULER_RECENT_ACTIVITY gt 0; then
      ok "188 momo scheduler healthy with recent task activity"
    elif _schedule_metric_ok "$out" SCHEDULER_REGISTERED gt 0; then
      ok "188 momo scheduler registered jobs"
    else
      warn "188 momo scheduler registration/activity not confirmed"
    fi
    # Fields are count|count|min|max|min|max; both tables must agree.
    awk '/MOMO_MONTHLY_SYNC / {split($2,a,"|"); exit !(a[1] > 0 && a[1] == a[2] && a[3] == a[5] && a[4] == a[6])}' <<<"$out"
    _schedule_report $? "188 momo current-month snapshot and realtime tables match" "188 momo current-month snapshot/realtime sync not confirmed"
  else
    warn "188 schedule check unavailable"
    echo "$out"
  fi

  # ---- 110: cron, systemd state, exporter textfiles, backup health ----
  if out=$(host_cmd "wooo@192.168.0.110" '
now=$(date +%s)
echo "CRON_110 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
echo "FAILED_UNITS_110 $(systemctl --failed --no-legend --plain 2>/dev/null | wc -l)"
echo "MOMO_STARTUP_ENABLED $(systemctl is-enabled momo-startup-complete.service 2>/dev/null || true)"
echo "STAGGERED_STARTUP_ENABLED $(systemctl is-enabled wooo-staggered-startup.service 2>/dev/null || true)"
for f in /home/wooo/node_exporter_textfiles/docker_stats.prom /home/wooo/node_exporter_textfiles/systemd_units.prom /home/wooo/node_exporter_textfiles/storage_health.prom /home/wooo/node_exporter_textfiles/backup_health.prom; do
if [ -f "$f" ]; then
mt=$(stat -c %Y "$f")
echo "TEXTFILE_110 $(basename "$f") age=$((now - mt))"
else
echo "TEXTFILE_110 $(basename "$f") missing"
fi
done
if [ -f /home/wooo/node_exporter_textfiles/storage_health.prom ]; then
awk "/^awoooi_host_storage_root_readonly/ {readonly=int(\$2)} /^awoooi_host_storage_current_boot_error_count/ {current=int(\$2)} END {printf \"STORAGE_HEALTH_110 root_readonly=%d current=%d\\n\", readonly+0, current+0}" /home/wooo/node_exporter_textfiles/storage_health.prom
fi
if [ -f /home/wooo/node_exporter_textfiles/backup_health.prom ]; then
awk "/^awoooi_backup_job_fresh/ {total++; if (int(\$2) == 0) stale++} /^awoooi_backup_job_configured/ {if (int(\$2) == 0) missing_cron++} /^awoooi_backup_script_present/ {if (int(\$2) == 0) missing_script++} /^awoooi_backup_last_run_failed_count/ {if (\$0 ~ /(exported_job|job)=\"backup_all\"/) failed=int(\$2)} /^awoooi_backup_config_capture_critical_failed_count/ {config_failed=int(\$2)} /^awoooi_backup_integrity_fresh/ {integrity_total++; if (int(\$2) == 0) integrity_stale++} END {printf \"BACKUP_HEALTH_110 total=%d stale=%d missing_cron=%d missing_script=%d failed_count=%d config_failed=%d integrity_total=%d integrity_stale=%d\\n\", total+0, stale+0, missing_cron+0, missing_script+0, failed+0, config_failed+0, integrity_total+0, integrity_stale+0}" /home/wooo/node_exporter_textfiles/backup_health.prom
fi
' 2>&1); then
    echo "$out"
    grep -q "CRON_110 active" <<<"$out"
    _schedule_report $? "110 cron active" "110 cron not confirmed"
    grep -q "FAILED_UNITS_110 0" <<<"$out"
    _schedule_report $? "110 systemd has no failed units" "110 systemd failed units remain"
    grep -q "MOMO_STARTUP_ENABLED disabled" <<<"$out"
    _schedule_report $? "110 stale momo startup unit disabled" "110 stale momo startup unit not disabled"
    grep -q "STAGGERED_STARTUP_ENABLED disabled" <<<"$out"
    _schedule_report $? "110 stale staggered startup unit disabled" "110 stale staggered startup unit not disabled"
    _schedule_textfile_fresh "$out" TEXTFILE_110 docker_stats.prom 300
    _schedule_report $? "110 docker stats exporter fresh" "110 docker stats exporter stale"
    _schedule_textfile_fresh "$out" TEXTFILE_110 systemd_units.prom 300
    _schedule_report $? "110 systemd units exporter fresh" "110 systemd units exporter stale"
    _schedule_textfile_fresh "$out" TEXTFILE_110 storage_health.prom 300
    _schedule_report $? "110 storage health exporter fresh" "110 storage health exporter stale"
    _schedule_textfile_fresh "$out" TEXTFILE_110 backup_health.prom 900
    _schedule_report $? "110 backup health exporter fresh" "110 backup health exporter stale"
    grep -q "STORAGE_HEALTH_110 root_readonly=0 current=0" <<<"$out"
    _schedule_report $? "110 current boot storage health clean" "110 storage health not clean"
    grep -q "BACKUP_HEALTH_110 total=" <<<"$out" \
      && awk '/BACKUP_HEALTH_110/ {split($3,a,"="); split($4,b,"="); split($5,c,"="); split($6,d,"="); split($7,e,"="); exit !((a[2]+b[2]+c[2]) == 0 && d[2] == 0 && e[2] == 0)}' <<<"$out"
    _schedule_report $? "110 backup health has no stale expected jobs" "110 latest aggregate/config backup had failed components; rerun backup-all after 120 recovers"
    # Field 9 is integrity_stale=<n>; require the summary line to be present.
    awk '$1 == "BACKUP_HEALTH_110" && NF >= 9 {seen = 1; split($9,a,"="); clean = (a[2] == 0); exit} END {exit ((seen && clean) ? 0 : 1)}' <<<"$out"
    _schedule_report $? "110 backup integrity and restore drill fresh" "110 backup integrity or restore drill stale"
  else
    warn "110 schedule check unavailable"
    echo "$out"
  fi

  # ---- 120: cron plus Kubernetes CronJobs, Jobs and pod states ----
  # The embedded Python snippets are single expressions so they do not depend
  # on line breaks or indentation surviving inside the quoted remote script.
  if out=$(host_cmd "wooo@192.168.0.120" '
kcmd() {
if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
else
sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
fi
}
echo "CRON_120 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
kcmd get cronjobs -n awoooi-prod -o json | python3 -c "import json,sys; d=json.load(sys.stdin); items=d.get(\"items\", []); print(\"CRONJOB_COUNT\", len(items)); print(\"CRONJOB_SUSPENDED\", sum(1 for i in items if i.get(\"spec\",{}).get(\"suspend\")))"
kcmd get jobs -n awoooi-prod -o json | python3 -c "import json,sys; d=json.load(sys.stdin); print(\"FAILED_JOBS\", sum(1 for j in d.get(\"items\", []) if any(c.get(\"type\")==\"Failed\" and c.get(\"status\")==\"True\" for c in (j.get(\"status\",{}).get(\"conditions\",[]) or []))))"
kcmd get pods -n awoooi-prod --no-headers 2>/dev/null | awk "\$3 !~ /^(Running|Completed)$/ {bad++} END {print \"BAD_PODS\", bad+0}"
' 2>&1); then
    echo "$out"
    grep -q "CRON_120 active" <<<"$out"
    _schedule_report $? "120 cron active" "120 cron not confirmed"
    _schedule_metric_ok "$out" CRONJOB_COUNT ge 4
    _schedule_report $? "K8s AWOOOI CronJobs present" "K8s AWOOOI CronJobs missing"
    grep -q "CRONJOB_SUSPENDED 0" <<<"$out"
    _schedule_report $? "K8s AWOOOI CronJobs unsuspended" "K8s AWOOOI CronJob suspended"
    grep -q "FAILED_JOBS 0" <<<"$out"
    _schedule_report $? "K8s AWOOOI has no failed Jobs" "K8s AWOOOI failed Jobs remain"
    grep -q "BAD_PODS 0" <<<"$out"
    _schedule_report $? "K8s AWOOOI pods Running/Completed only" "K8s AWOOOI bad pod status remains"
  else
    warn "120 K8s schedule check unavailable"
    echo "$out"
  fi

  # ---- 121: cron and the DR drill crontab entry ----
  if out=$(host_cmd "wooo@192.168.0.121" '
echo "CRON_121 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
crontab -l 2>/dev/null | grep -q "dr-drill.sh" && echo "DR_DRILL_CRON present" || echo "DR_DRILL_CRON missing"
' 2>&1); then
    echo "$out"
    grep -q "CRON_121 active" <<<"$out"
    _schedule_report $? "121 cron active" "121 cron not confirmed"
    grep -q "DR_DRILL_CRON present" <<<"$out"
    _schedule_report $? "121 DR drill cron present" "121 DR drill cron missing"
  else
    warn "121 schedule check unavailable"
    echo "$out"
  fi
}
# Print the tallies and the overall verdict.
# Exits the script with 2 when any gate is blocked, with 1 when only warnings
# remain; otherwise prints the GREEN verdict and returns normally.
summary() {
  log_section "SUMMARY"
  printf 'PASS=%s WARN=%s BLOCKED=%s\n' "$PASS" "$WARN" "$FAIL"
  if ((FAIL > 0)); then
    echo "Result: BLOCKED. Fix the first blocked gate before releasing runner/CD/AI auto-remediation."
    exit 2
  elif ((WARN > 0)); then
    echo "Result: DEGRADED. Core gates passed but warnings remain."
    exit 1
  fi
  echo "Result: GREEN. Full stack is ready for controlled runner/CD release."
}
# --watch driver: re-run this script (without --watch) until it exits 0 or the
# attempt cap is reached, sleeping WATCH_INTERVAL seconds between attempts.
# Forwards --monitor-read-only / --no-color / --send-alert-test when enabled.
# Never returns: exits 0 on the first GREEN run, otherwise with the status of
# the last attempt once WATCH_MAX_ATTEMPTS (0 = unlimited) is exhausted.
run_watch_mode() {
  local attempt=1
  local rc=2
  local -a args=()
  # The forwarded flags do not change between attempts, so build them once.
  if [ "$MONITOR_READ_ONLY" -eq 1 ]; then args+=(--monitor-read-only); fi
  if [ "$NO_COLOR_FLAG" -eq 1 ]; then args+=(--no-color); fi
  if [ "$SEND_ALERT_TEST" -eq 1 ]; then args+=(--send-alert-test); fi
  while true; do
    echo "WATCH_ATTEMPT=$attempt"
    # Guarded expansion: a plain "${args[@]}" on an empty array is an
    # unbound-variable error under `set -u` in bash older than 4.4.
    bash "$0" ${args[@]+"${args[@]}"}
    rc=$?
    [ "$rc" -eq 0 ] && exit 0
    if [ "$WATCH_MAX_ATTEMPTS" -gt 0 ] && [ "$attempt" -ge "$WATCH_MAX_ATTEMPTS" ]; then
      exit "$rc"
    fi
    attempt=$((attempt + 1))
    sleep "$WATCH_INTERVAL"
  done
}
if [ "${WATCH_MODE:-0}" -eq 1 ]; then
  run_watch_mode
fi
# Single-pass run: banner, every gate in priority order, then the verdict
# (summary sets the exit status for blocked and degraded results).
main() {
  print_header
  check_network
  check_188
  check_110
  check_k3s
  check_workload_and_alertchain
  check_public_routes
  check_schedules
  summary
}
main