Files
awoooi/scripts/reboot-recovery/full-stack-cold-start-check.sh

678 lines
33 KiB
Bash
Executable File

#!/usr/bin/env bash
# AWOOOI full-stack cold-start readiness check.
# Read-only by design: it never restarts, deletes, or repairs anything. The
# only opt-in exception is --send-alert-test, which POSTs one webhook event.
#
# Shell options: unset variables are errors and a pipeline fails when any of
# its stages fails. errexit is not enabled.
set -u
set -o pipefail

# Option defaults; the argument parser further down may override them.
WATCH_MAX_ATTEMPTS=30
WATCH_INTERVAL=60
WATCH_MODE=0
NO_COLOR_FLAG=0
MONITOR_READ_ONLY=0
SEND_ALERT_TEST=0

# Options shared by every ssh invocation: never prompt, give up after 6 s.
SSH_OPTS=(
  -o BatchMode=yes
  -o ConnectTimeout=6
)
# Print the command-line help text to stdout.
usage() {
  printf '%s\n' \
    'Usage: bash scripts/reboot-recovery/full-stack-cold-start-check.sh [options]' \
    'Options:' \
    '--send-alert-test POST one Alertmanager webhook test after AWOOOI API is ready.' \
    '--monitor-read-only Skip the webhook POST without warning; intended for cron/textfile monitors.' \
    '--watch Repeat checks until all gates are GREEN or max attempts is reached.' \
    '--interval SECONDS Retry interval for --watch. Default: 60.' \
    '--max-attempts COUNT Max attempts for --watch. Default: 30. Use 0 for unlimited.' \
    '--no-color Disable ANSI colors in output.' \
    '-h, --help Show this help.' \
    'Default mode is read-only and does not POST an Alertmanager test event.' \
    'Use --send-alert-test for the final release gate after AWOOOI API is expected to be ready.'
}
# Parse command-line options into the global mode flags.
# Globals written: SEND_ALERT_TEST, MONITOR_READ_ONLY, NO_COLOR_FLAG,
#                  WATCH_MODE, WATCH_INTERVAL, WATCH_MAX_ATTEMPTS.
# Arguments: the script's own argument list.
# Exits 64 on an unknown option or an invalid option value, 0 after --help.
parse_args() {
  local arg
  while [ "$#" -gt 0 ]; do
    arg="$1"
    case "$arg" in
      --send-alert-test)
        SEND_ALERT_TEST=1
        ;;
      --monitor-read-only)
        MONITOR_READ_ONLY=1
        ;;
      --no-color)
        NO_COLOR_FLAG=1
        ;;
      --watch)
        WATCH_MODE=1
        ;;
      --interval)
        # The value is the next argument; an absent value fails the regex.
        shift
        if ! [[ "${1:-}" =~ ^[0-9]+$ ]] || [ "${1:-0}" -lt 1 ]; then
          echo "--interval requires a positive integer number of seconds" >&2
          exit 64
        fi
        WATCH_INTERVAL="$1"
        ;;
      --max-attempts)
        shift
        if ! [[ "${1:-}" =~ ^[0-9]+$ ]]; then
          echo "--max-attempts requires a non-negative integer" >&2
          exit 64
        fi
        WATCH_MAX_ATTEMPTS="$1"
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown argument: $arg" >&2
        usage >&2
        exit 64
        ;;
    esac
    shift
  done
  # Monitor mode always wins over --send-alert-test, whatever the option
  # order, so a read-only monitor run can never POST a webhook event.
  if [ "${MONITOR_READ_ONLY:-0}" -eq 1 ]; then
    SEND_ALERT_TEST=0
  fi
}
parse_args "$@"
# ANSI colour palette used by the reporting helpers. Start with colours on,
# then blank every code when NO_COLOR is set in the environment or --no-color
# was given.
RED=$'\033[0;31m'
GREEN=$'\033[0;32m'
YELLOW=$'\033[1;33m'
BLUE=$'\033[0;34m'
NC=$'\033[0m'
if [[ -n "${NO_COLOR:-}" || "$NO_COLOR_FLAG" -eq 1 ]]; then
  RED="" GREEN="" YELLOW="" BLUE="" NC=""
fi

# Gate tallies, incremented by ok / warn / fail and read by summary.
FAIL=0
WARN=0
PASS=0
# Print a blank line followed by a coloured "=== TITLE ===" banner.
# Arguments: $1 - section title.
log_section() {
  local title="$1"
  echo
  echo "${BLUE}=== ${title} ===${NC}"
}
# Report a passed gate and bump the PASS tally.
# Arguments: $1 - gate description.
# Returns 0, which the "check && ok ... || fail ..." call sites rely on.
ok() {
  local message="$1"
  echo "${GREEN}OK${NC} ${message}"
  : "$(( PASS += 1 ))"
}
# Report a degraded (non-blocking) gate and bump the WARN tally.
# Arguments: $1 - gate description.
# Returns 0.
warn() {
  local message="$1"
  echo "${YELLOW}WARN${NC} ${message}"
  : "$(( WARN += 1 ))"
}
# Report a blocking gate failure and bump the FAIL tally.
# Arguments: $1 - gate description.
# Returns 0.
fail() {
  local message="$1"
  echo "${RED}BLOCKED${NC} ${message}"
  : "$(( FAIL += 1 ))"
}
# Run a local command, report it as one gate, then print what it wrote.
# Arguments: $1 - gate label; remaining arguments - the command and its args.
# Outputs:   the ok/fail line followed by the command's combined stdout+stderr.
# Returns:   0 when the command succeeded, 1 otherwise.
run_local() {
  local label="$1"
  shift
  local capture status=0
  # A private mktemp file replaces the old fixed /tmp path, so concurrent
  # runs (watch mode plus a cron monitor) cannot clobber each other's output
  # and a pre-planted file or symlink at a known name cannot be written to.
  if ! capture=$(mktemp "${TMPDIR:-/tmp}/awoooi-cold-start-check.XXXXXX"); then
    fail "$label"
    echo "mktemp failed; command was not run"
    return 1
  fi
  "$@" >"$capture" 2>&1 || status=1
  if [ "$status" -eq 0 ]; then
    ok "$label"
  else
    fail "$label"
  fi
  cat "$capture"
  rm -f -- "$capture"
  return "$status"
}
# Run a script on a remote host over ssh using the shared SSH_OPTS.
# Arguments: $1 - user@host target; $2 - remote shell script text.
# When REMOTE_SUDO_PASSWORD is set locally, it is shell-quoted and prepended
# as an environment assignment so the remote script can read it. Note that it
# thereby becomes part of the command string handed to ssh.
ssh_cmd() {
  local target="$1" remote_script="$2"
  local env_prefix=""
  [ -z "${REMOTE_SUDO_PASSWORD:-}" ] \
    || printf -v env_prefix 'REMOTE_SUDO_PASSWORD=%q ' "$REMOTE_SUDO_PASSWORD"
  ssh "${SSH_OPTS[@]}" "$target" "${env_prefix}${remote_script}"
}
# Succeed when this machine owns the given IPv4 address.
# Arguments: $1 - IPv4 address to look for.
# Tries "ip -o -4 addr show" first when ip is available; if that finds
# nothing (or ip is absent) it falls back to "hostname -I".
host_has_ip() {
  local wanted="$1"
  if command -v ip >/dev/null 2>&1 \
    && ip -o -4 addr show 2>/dev/null | awk '{print $4}' | grep -q "^${wanted}/"; then
    return 0
  fi
  hostname -I 2>/dev/null | tr ' ' '\n' | grep -qx "$wanted"
}
# Run a script on one of the managed hosts, locally when possible.
# Arguments: $1 - user@host target; $2 - shell script text.
# If the target is user@<ip> for 192.168.0.110/120/121/188 and this machine
# owns that address, the script runs in a local login shell and its status is
# returned. Every other case goes through ssh_cmd.
host_cmd() {
  local target="$1" script="$2"
  local addr="${target##*@}"
  # Require an explicit "user@" part, as the original patterns did.
  if [[ "$target" == *@* ]]; then
    case "$addr" in
      192.168.0.110 | 192.168.0.120 | 192.168.0.121 | 192.168.0.188)
        if host_has_ip "$addr"; then
          bash -lc "$script"
          return
        fi
        ;;
    esac
  fi
  ssh_cmd "$target" "$script"
}
# Print the HTTP status code for a URL, trying at most twice.
# Arguments: $1 - URL.
# Outputs:   the first three-digit code other than 000; otherwise whatever the
#            last attempt produced, or 000 when it produced nothing.
probe_http_code() {
  local url="$1"
  local status="" tries_left=2
  while (( tries_left > 0 )); do
    status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 12 "$url" 2>/dev/null || true)
    if [[ "$status" != "000" && "$status" =~ ^[0-9]{3}$ ]]; then
      printf '%s\n' "$status"
      return
    fi
    # One-second pause after every unsuccessful attempt.
    sleep 1
    tries_left=$(( tries_left - 1 ))
  done
  echo "${status:-000}"
}
# Succeed when a TCP connection to host:port can be opened within 3 seconds.
# Arguments: $1 - host; $2 - port.
# nc is tried with -G first and then with -w, since nc builds differ in which
# timeout flag they accept. The status of the last attempt is returned.
probe_tcp() {
  local host="$1" port="$2"
  local timeout_flag rc=1
  for timeout_flag in -G -w; do
    nc "$timeout_flag" 3 -z "$host" "$port" >/dev/null 2>&1
    rc=$?
    if [ "$rc" -eq 0 ]; then
      break
    fi
  done
  return "$rc"
}
# Print neighbour-table rows for the four managed hosts.
# Uses arp when it is installed, otherwise "ip neigh"; only one tool is
# consulted. Returns the pipeline status (non-zero when no row matched) or 1
# when neither tool exists.
print_neighbor_rows() {
  if command -v arp >/dev/null 2>&1; then
    arp -an | grep -E '192\.168\.0\.(110|120|121|188)'
  elif command -v ip >/dev/null 2>&1; then
    ip neigh show | grep -E '192\.168\.0\.(110|120|121|188)'
  else
    false
  fi
}
# Print the report banner: title, current timestamp, scope and baseline path.
print_header() {
  printf '%s\n' \
    "AWOOOI full-stack cold-start check" \
    "$(date '+%Y-%m-%d %H:%M:%S %Z')" \
    "Scope: 110 / 120 / 121 / 188. 112 Kali is intentionally skipped." \
    "Baseline: ops/reboot-recovery/full-stack-cold-start-baseline.yml"
}
# P0 gate: basic reachability of the four managed hosts.
# For each host, one ICMP ping and one TCP probe of port 22 are each reported
# as a blocking gate. Neighbour-table evidence is printed afterwards; its
# absence is a warning, or informational only in monitor mode.
check_network() {
  log_section "P0-NETWORK"
  local octet addr
  for octet in 110 120 121 188; do
    addr="192.168.0.$octet"
    if ! ping -c 1 -W 2 "$addr" >/dev/null 2>&1; then
      fail "ping $addr"
    else
      ok "ping $addr"
    fi
    if ! probe_tcp "$addr" 22; then
      fail "ssh port $addr:22"
    else
      ok "ssh port $addr:22"
    fi
  done
  if print_neighbor_rows; then
    ok "neighbor evidence printed"
    return
  fi
  if [ "$MONITOR_READ_ONLY" -eq 1 ]; then
    ok "neighbor evidence unavailable in monitor mode; ping and TCP gates provide primary signal"
  else
    warn "no neighbor rows printed for one or more hosts"
  fi
}
# P0 gate for host 188 (data tier).
# Runs one read-only probe script on ollama@192.168.0.188, echoes its raw
# output, then grades the marker lines it printed: the PostgreSQL port and
# readiness are blocking; Redis, the momo-db restart loop, SignOz and the momo
# health endpoint are warnings. If the probe itself fails, a single blocking
# failure is recorded and grading is skipped.
check_188() {
  log_section "P0-188-DATA"
  local report
  if ! report=$(host_cmd "ollama@192.168.0.188" '
echo "HOST $(hostname) $(uptime)"
echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
echo "SYSTEMD $(systemctl is-active containerd docker postgresql@14-main redis-server ollama nginx 2>/dev/null | tr "\n" " ")"
echo "PG $(pg_isready -h localhost -p 5432 2>&1)"
echo "REDIS $(redis-cli -p 6380 ping 2>/dev/null || redis-cli ping 2>/dev/null || true)"
echo "PORT5432 $(nc -z -w 2 127.0.0.1 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
echo "SIGNOZ_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3301/ || true)"
echo "MOMO_HEALTH_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5003/health || true)"
docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -80
' 2>&1); then
    fail "ssh 188 read-only check"
    echo "$report"
    return
  fi
  echo "$report"

  if grep -q "PORT5432 OPEN" <<<"$report"; then
    ok "188 PostgreSQL port open"
  else
    fail "188 PostgreSQL port closed"
  fi
  if grep -q "accepting connections" <<<"$report"; then
    ok "188 PostgreSQL accepting connections"
  else
    fail "188 PostgreSQL not accepting connections"
  fi
  if grep -q "REDIS PONG" <<<"$report"; then
    ok "188 Redis PONG"
  else
    warn "188 Redis not confirmed"
  fi
  # Inverted gate: finding the restart marker is the bad outcome.
  if grep -q "momo-db.*Restarting" <<<"$report"; then
    warn "188 momo-db restarting"
  else
    ok "188 momo-db not in visible restart loop"
  fi
  if grep -Eq "SIGNOZ_CODE (200|302|307)" <<<"$report"; then
    ok "188 SignOz HTTP reachable"
  else
    warn "188 SignOz HTTP not confirmed"
  fi
  if grep -q "MOMO_HEALTH_CODE 200" <<<"$report"; then
    ok "188 momo health reachable"
  else
    warn "188 momo health not confirmed"
  fi
}
# P0 gate for host 110 (registry and observability).
# Runs one read-only probe script on wooo@192.168.0.110, echoes its raw
# output, then grades the marker lines: Harbor is blocking; Gitea, Prometheus,
# Alertmanager, Sentry, the runner watchdog setting and the Sentry ClickHouse
# restart loop are warnings. If the probe itself fails, a single blocking
# failure is recorded and grading is skipped.
check_110() {
  log_section "P0-110-REGISTRY-OBSERVABILITY"
  local report
  if ! report=$(host_cmd "wooo@192.168.0.110" '
echo "HOST $(hostname) $(uptime)"
echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
echo "DOCKER_SYSTEMD $(systemctl is-active docker 2>/dev/null || true)"
echo "HARBOR_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5000/v2/ || true)"
echo "GITEA_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3001/ || true)"
echo "PROM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9090/-/ready || true)"
echo "AM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9093/-/healthy || true)"
echo "SENTRY_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 8 http://127.0.0.1:9000/ || true)"
for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /"
done
docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120
' 2>&1); then
    fail "ssh 110 read-only check"
    echo "$report"
    return
  fi
  echo "$report"

  if grep -Eq "HARBOR_CODE (200|401)" <<<"$report"; then
    ok "110 Harbor /v2 healthy code"
  else
    fail "110 Harbor not healthy"
  fi
  if grep -Eq "GITEA_CODE (200|302)" <<<"$report"; then
    ok "110 Gitea reachable"
  else
    warn "110 Gitea not confirmed"
  fi
  if grep -q "PROM_CODE 200" <<<"$report"; then
    ok "110 Prometheus ready"
  else
    warn "110 Prometheus not ready"
  fi
  if grep -q "AM_CODE 200" <<<"$report"; then
    ok "110 Alertmanager healthy"
  else
    warn "110 Alertmanager not healthy"
  fi
  if grep -Eq "SENTRY_CODE (200|302|400)" <<<"$report"; then
    ok "110 Sentry HTTP reachable"
  else
    warn "110 Sentry HTTP not confirmed"
  fi
  if grep -q "WatchdogUSec=0" <<<"$report"; then
    ok "runner watchdog disabled on at least one unit"
  else
    warn "runner watchdog state not confirmed"
  fi
  # Inverted gate: finding the restart marker is the bad outcome.
  if grep -q "sentry-self-hosted-clickhouse-1.*Restarting" <<<"$report"; then
    warn "Sentry ClickHouse restarting"
  else
    ok "Sentry ClickHouse not visibly restarting"
  fi
}
# P1 gate for the K3s control plane on host 120.
# Runs one read-only probe script on wooo@192.168.0.120 that prints:
#   PG188_PORT OPEN|CLOSED          - TCP reachability of 188:5432 from 120
#   kubectl node / pod listings     - via sudo when possible
#   NODE_NOT_READY / NODE_READONLY_FILESYSTEM_TRUE / NODE_DISK_PRESSURE_TRUE
#                                   - counts derived from "get nodes -o json"
#   NODE_FS_ERROR_EVENTS N          - count of Node events matching severe
#                                     filesystem error patterns
#   the "ip addr" line holding VIP 192.168.0.125, if present
# The output is echoed and graded. If the remote output shows no " Ready "
# node row, a local "kubectl get nodes" is tried as a fallback for that one
# gate. Every gate here is blocking except the VIP check, which only warns.
#
# The embedded Python is indented inside the quoted script on purpose: with no
# indentation python3 rejects it with IndentationError, the summary lines are
# never printed and the Ready/storage gates always fail.
check_k3s() {
  log_section "P1-K3S"
  local out local_kubectl_out
  if ! out=$(host_cmd "wooo@192.168.0.120" '
echo "HOST $(hostname) $(uptime)"
echo "PG188_PORT $(nc -z -w 2 192.168.0.188 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
echo "SYSTEMD $(systemctl is-active k3s k3s-agent keepalived 2>/dev/null | tr "\n" " ")"
kcmd() {
if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
else
sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
fi
}
kcmd get nodes -o wide 2>/dev/null || true
kcmd get pods -n awoooi-prod -o wide 2>/dev/null || true
node_condition_summary=$(kcmd get nodes -o json 2>/dev/null | python3 -c "import json,sys
try:
    d=json.load(sys.stdin)
except Exception:
    d={\"items\": []}
not_ready=readonly=disk_pressure=0
for node in d.get(\"items\", []):
    conds={c.get(\"type\"): c.get(\"status\") for c in node.get(\"status\",{}).get(\"conditions\",[]) or []}
    if conds.get(\"Ready\") != \"True\":
        not_ready += 1
    if conds.get(\"ReadonlyFilesystem\") == \"True\":
        readonly += 1
    if conds.get(\"DiskPressure\") == \"True\":
        disk_pressure += 1
print(f\"NODE_NOT_READY {not_ready}\")
print(f\"NODE_READONLY_FILESYSTEM_TRUE {readonly}\")
print(f\"NODE_DISK_PRESSURE_TRUE {disk_pressure}\")" || true)
printf "%s\n" "$node_condition_summary"
node_fs_events=$(kcmd get events -A --field-selector involvedObject.kind=Node --sort-by=.lastTimestamp 2>/dev/null \
| grep -Eiv "InvalidDiskCapacity|image filesystem" \
| grep -Eic "fsck|I/O error|read-only file system|Structure needs cleaning|orphan linked list|EXT4-fs.*error|XFS.*(corruption|metadata)|Remounting filesystem read-only" || true)
echo "NODE_FS_ERROR_EVENTS ${node_fs_events:-0}"
ip addr show | grep 192.168.0.125 || true
' 2>&1); then
    fail "ssh 120 k3s read-only check"
    echo "$out"
    return
  fi
  echo "$out"

  # Local kubectl is consulted only when the remote listing shows no Ready
  # node; its output feeds the "Ready node output" gate below and nothing else.
  local_kubectl_out=""
  if ! grep -q " Ready " <<<"$out"; then
    local_kubectl_out=$(kubectl get nodes -o wide 2>/dev/null || true)
    if [ -n "$local_kubectl_out" ]; then
      echo "LOCAL_KUBECTL_FALLBACK"
      echo "$local_kubectl_out"
    fi
  fi

  if grep -q "PG188_PORT OPEN" <<<"$out"; then
    ok "120 can reach 188 PostgreSQL port"
  else
    fail "120 cannot reach 188 PostgreSQL"
  fi
  if grep -q " Ready " <<<"$out$local_kubectl_out"; then
    ok "K3s has Ready node output"
  else
    fail "K3s nodes not Ready or kubectl unavailable"
  fi
  if grep -q "NODE_NOT_READY 0" <<<"$out"; then
    ok "K3s node Ready condition clean"
  else
    fail "K3s node Ready condition not clean"
  fi
  if grep -q "NODE_FS_ERROR_EVENTS 0" <<<"$out" \
    && grep -q "NODE_READONLY_FILESYSTEM_TRUE 0" <<<"$out" \
    && grep -q "NODE_DISK_PRESSURE_TRUE 0" <<<"$out"; then
    ok "K3s node storage conditions clean"
  else
    fail "K3s node storage condition or severe filesystem event present"
  fi
  if grep -q "192.168.0.125" <<<"$out"; then
    ok "VIP 192.168.0.125 present on 120"
  else
    warn "VIP not confirmed on 120"
  fi
}
# P2 gate: AWOOOI API/Web reachability and, optionally, the alert webhook.
# The API and Web status codes are fetched from host 120 against the VIP; if
# that remote probe fails they are fetched from this machine instead. An
# unreachable API is blocking, an unreachable Web UI is a warning.
# With SEND_ALERT_TEST=1 one Alertmanager-format test event is POSTed from
# host 120 (curl stores the response body in /tmp/awoooi-alertchain.out
# there). Otherwise the POST is skipped: silently OK in monitor mode, a
# warning in the default mode.
check_workload_and_alertchain() {
  log_section "P2-WORKLOAD-ALERTCHAIN"
  local api_status web_status alert_status
  local report
  if report=$(host_cmd "wooo@192.168.0.120" '
api_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32334/api/v1/health 2>/dev/null || true)
web_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32335/ 2>/dev/null || true)
echo "API_CODE ${api_code:-000}"
echo "WEB_CODE ${web_code:-000}"
' 2>/dev/null); then
    api_status=$(awk '/^API_CODE / {print $2}' <<<"$report")
    web_status=$(awk '/^WEB_CODE / {print $2}' <<<"$report")
  else
    # Remote probe unavailable: measure from here and synthesise the same
    # two marker lines.
    api_status=$(probe_http_code "http://192.168.0.125:32334/api/v1/health")
    web_status=$(probe_http_code "http://192.168.0.125:32335/")
    report="API_CODE ${api_status}"$'\n'"WEB_CODE ${web_status}"
  fi
  echo "$report"

  if [[ "$api_status" =~ ^[23] ]]; then
    ok "AWOOOI API reachable"
  else
    fail "AWOOOI API not reachable"
  fi
  if [[ "$web_status" =~ ^[23] ]]; then
    ok "AWOOOI Web reachable"
  else
    warn "AWOOOI Web not confirmed"
  fi

  if [ "$SEND_ALERT_TEST" -ne 1 ]; then
    if [ "$MONITOR_READ_ONLY" -eq 1 ]; then
      ok "Alertmanager webhook POST intentionally skipped in read-only monitor mode"
    else
      warn "Alertmanager webhook POST skipped; rerun with --send-alert-test after API is ready"
    fi
    return
  fi

  alert_status=$(host_cmd "wooo@192.168.0.120" 'curl -s -o /tmp/awoooi-alertchain.out -w "%{http_code}" --max-time 8 \
-X POST "http://192.168.0.125:32334/api/v1/webhooks/alertmanager" \
-H '"'"'Content-Type: application/json'"'"' \
-d '"'"'{"receiver":"cold-start-check","status":"firing","alerts":[{"status":"firing","labels":{"alertname":"ColdStartCheck","severity":"info"},"annotations":{"summary":"Cold start check"},"startsAt":"2026-05-05T11:00:00Z","endsAt":"0001-01-01T00:00:00Z","generatorURL":""}],"groupLabels":{},"commonLabels":{},"commonAnnotations":{},"externalURL":"","version":"4","groupKey":"cold-start-check"}'"'"' 2>/dev/null || echo "000"')
  echo "ALERTCHAIN_CODE $alert_status"
  if [[ "$alert_status" =~ ^2 ]]; then
    ok "Alertmanager webhook endpoint accepts POST"
  else
    warn "Alertmanager webhook E2E not confirmed"
  fi
}
# P2 gate: public HTTPS routes.
# Each "name|url" entry is probed twice: once through probe_http_code (a
# non-2xx/3xx answer is a warning) and once with a single direct curl whose
# result drives the blocking "TLS certificate" gate.
# NOTE(review): the TLS gate is decided purely from the HTTP status class, so
# a route that answers 4xx/5xx over a valid certificate is still reported as a
# TLS verification failure - confirm that this is intended.
check_public_routes() {
  log_section "P2-PUBLIC-ROUTES"
  local entry label target status tls_status
  local routes=(
    "awoooi_api|https://awoooi.wooo.work/api/v1/health"
    "awoooi_web|https://awoooi.wooo.work/"
    "momo_web|https://mo.wooo.work/"
    "momo_health|https://mo.wooo.work/health"
    "gitea|https://gitea.wooo.work/"
    "harbor|https://harbor.wooo.work/"
    "registry|https://registry.wooo.work/"
    "sentry|https://sentry.wooo.work/"
    "signoz|https://signoz.wooo.work/"
    "stock|https://stock.wooo.work/"
    "langfuse|https://langfuse.wooo.work/"
    "bitan|https://bitan.wooo.work/"
    "aiops|https://aiops.wooo.work/"
  )
  for entry in "${routes[@]}"; do
    # Split "name|url" at the first pipe.
    IFS='|' read -r label target <<<"$entry"

    status=$(probe_http_code "$target")
    echo "PUBLIC_ROUTE $label $status $target"
    if [[ "$status" =~ ^[23] ]]; then
      ok "public route $label reachable"
    else
      warn "public route $label not confirmed"
    fi

    tls_status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 8 "$target" 2>/dev/null || true)
    if [ -z "$tls_status" ]; then
      tls_status="000"
    fi
    echo "PUBLIC_ROUTE_TLS $label $tls_status $target"
    if [[ "$tls_status" =~ ^[23] ]]; then
      ok "public route $label TLS certificate verified"
    else
      fail "public route $label TLS certificate verification failed"
    fi
  done
}
# P2 gate: schedulers, exporters, backups and CronJobs on all four hosts.
# Four independent read-only probes run in turn (188, 110, 120, 121). Each one
# prints "KEY value" marker lines that are echoed and then graded locally. A
# probe that cannot run yields one "schedule check unavailable" warning and its
# gates are skipped. Every gate warns, except the momo import-config drift and
# stale daily-sales gates on 188, which are blocking.
#
# Grading convention for the awk gates: "pass" is set only when the expected
# marker line is present AND its predicate holds, and END turns it into the
# exit status. Exiting straight from the pattern action would make a MISSING
# marker line (e.g. "TEXTFILE_188 backup.prom missing", or no CRONJOB_COUNT
# because kubectl failed) exit 0 and be reported as OK.
#
# The Job-analysis Python for host 120 is indented inside the quoted script on
# purpose: without indentation python3 rejects it with IndentationError.
check_schedules() {
  log_section "P2-SCHEDULES"
  local out

  # --- 188: cron, textfile exporters, backups, momo scheduler and data -----
  if out=$(host_cmd "ollama@192.168.0.188" '
now=$(date +%s)
echo "CRON_188 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
for f in /home/ollama/node_exporter_textfiles/backup.prom /home/ollama/node_exporter_textfiles/backup_health.prom /home/ollama/node_exporter_textfiles/docker_restart_count.prom /home/ollama/node_exporter_textfiles/docker_stats.prom /home/ollama/node_exporter_textfiles/storage_health.prom; do
if [ -f "$f" ]; then
mt=$(stat -c %Y "$f")
echo "TEXTFILE_188 $(basename "$f") age=$((now - mt))"
else
echo "TEXTFILE_188 $(basename "$f") missing"
fi
done
if [ -f /home/ollama/node_exporter_textfiles/backup.prom ]; then
awk -v now="$now" "/^backup_110_last_success_timestamp / {printf \"BACKUP_110_AGE %d\\n\", now - int(\$2)}" /home/ollama/node_exporter_textfiles/backup.prom
fi
if [ -f /home/ollama/node_exporter_textfiles/backup_health.prom ]; then
awk "/^awoooi_backup_job_fresh/ {total++; if (int(\$2) == 0) stale++} /^awoooi_backup_job_configured/ {if (int(\$2) == 0) missing_cron++} /^awoooi_backup_script_present/ {if (int(\$2) == 0) missing_script++} END {printf \"BACKUP_HEALTH_188 total=%d stale=%d missing_cron=%d missing_script=%d\\n\", total+0, stale+0, missing_cron+0, missing_script+0}" /home/ollama/node_exporter_textfiles/backup_health.prom
fi
if [ -f /home/ollama/node_exporter_textfiles/storage_health.prom ]; then
awk "/^awoooi_host_storage_root_readonly/ {readonly=int(\$2)} /^awoooi_host_storage_current_boot_error_count/ {current=int(\$2)} END {printf \"STORAGE_HEALTH_188 root_readonly=%d current=%d\\n\", readonly+0, current+0}" /home/ollama/node_exporter_textfiles/storage_health.prom
fi
echo "SCHEDULER_CONTAINER_RUNNING $(docker inspect -f "{{.State.Running}}" momo-scheduler 2>/dev/null || true)"
echo "SCHEDULER_CONTAINER_HEALTH $(docker inspect -f "{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}" momo-scheduler 2>/dev/null || true)"
echo "SCHEDULER_REGISTERED $(docker logs --tail 400 momo-scheduler 2>&1 | grep -Ec "全部排程任務已註冊|排程任務已註冊|Scheduler started|APScheduler" || true)"
echo "SCHEDULER_RECENT_ACTIVITY $(docker logs --since 2h momo-scheduler 2>&1 | grep -Ec "AutoImport|Meta-Analysis|Scheduler|排程|任務|批次 [0-9]+: 取得|\\[Feeder\\]|HITL|候選屬" || true)"
echo "MOMO_SOURCE_EMPTY_EVIDENCE_LINES $(docker logs --since 6h momo-scheduler 2>&1 | grep -Ec "找到 0 個 Excel|沒有找到待匯入" || true)"
token_stat=$(stat -c "%u:%g:%a" /home/ollama/momo-pro/config/google_token.json 2>/dev/null || true)
scheduler_uid=$(docker top momo-scheduler -eo pid,user,uid 2>/dev/null | awk "NR==2 {print \$3}" || true)
echo "MOMO_GDRIVE_TOKEN_STAT ${token_stat:-missing} scheduler_uid=${scheduler_uid:-unknown}"
db_user=$(docker exec momo-pro-system printenv POSTGRES_USER 2>/dev/null || true)
db_name=$(docker exec momo-pro-system printenv POSTGRES_DB 2>/dev/null || true)
db_pass=$(docker exec momo-pro-system printenv POSTGRES_PASSWORD 2>/dev/null || true)
if [ -n "$db_user" ] && [ -n "$db_name" ] && [ -n "$db_pass" ]; then
momo_sync=$(docker exec -e PGPASSWORD="$db_pass" -e PGCONNECT_TIMEOUT=5 momo-db psql -h 127.0.0.1 -U "$db_user" -d "$db_name" -Atc "WITH scope AS (SELECT min(snapshot_date::date) dmin, max(snapshot_date::date) dmax, count(*) sc FROM daily_sales_snapshot WHERE snapshot_date::date >= make_date(extract(year from current_date)::int, extract(month from current_date)::int, 1)), monthly AS (SELECT count(*) mc, min(\"日期\"::date) mmin, max(\"日期\"::date) mmax FROM realtime_sales_monthly, scope WHERE scope.sc > 0 AND \"日期\"::date BETWEEN scope.dmin AND scope.dmax) SELECT coalesce(scope.sc,0)::text || chr(124) || coalesce(monthly.mc,0)::text || chr(124) || coalesce(scope.dmin::text,chr(45)) || chr(124) || coalesce(scope.dmax::text,chr(45)) || chr(124) || coalesce(monthly.mmin::text,chr(45)) || chr(124) || coalesce(monthly.mmax::text,chr(45)) FROM scope, monthly;" 2>/dev/null || true)
momo_freshness=$(docker exec -e PGPASSWORD="$db_pass" -e PGCONNECT_TIMEOUT=5 momo-db psql -h 127.0.0.1 -U "$db_user" -d "$db_name" -Atc "SELECT coalesce((current_date - max(snapshot_date::date))::text, chr(45)) || chr(124) || coalesce(max(snapshot_date::date)::text, chr(45)) FROM daily_sales_snapshot;" 2>/dev/null || true)
momo_import_config=$(docker exec -e PGPASSWORD="$db_pass" -e PGCONNECT_TIMEOUT=5 momo-db psql -h 127.0.0.1 -U "$db_user" -d "$db_name" -Atc "SELECT config_key || chr(61) || config_value FROM import_config;" 2>/dev/null | awk -F= "\$1 == \"gdrive_folder_path\" {folder=\$2} \$1 == \"gdrive_file_pattern\" {pattern=\$2} END {if (folder || pattern) print folder \"|\" pattern}" || true)
momo_latest_import_job=$(docker exec -e PGPASSWORD="$db_pass" -e PGCONNECT_TIMEOUT=5 momo-db psql -h 127.0.0.1 -U "$db_user" -d "$db_name" -Atc "SELECT coalesce(id::text, chr(45)) || chr(124) || coalesce(job_type, chr(45)) || chr(124) || coalesce(status, chr(45)) || chr(124) || coalesce(drive_file_name, chr(45)) || chr(124) || coalesce(replace(created_at::text, chr(32), chr(84)), chr(45)) || chr(124) || coalesce(replace(completed_at::text, chr(32), chr(84)), chr(45)) || chr(124) || coalesce(total_rows::text, chr(45)) || chr(124) || coalesce(success_rows::text, chr(45)) || chr(124) || coalesce(error_rows::text, chr(45)) FROM import_jobs ORDER BY created_at DESC LIMIT 20;" 2>/dev/null | awk "BEGIN {FS=sprintf(\"%c\",124)} \$2 == \"daily_sales\" {print \$1 \"|\" \$3 \"|\" \$4 \"|\" \$5 \"|\" \$6 \"|\" \$7 \"|\" \$8 \"|\" \$9; exit}" || true)
else
momo_sync=""
momo_freshness=""
momo_import_config=""
momo_latest_import_job=""
fi
echo "MOMO_MONTHLY_SYNC ${momo_sync:-unavailable}"
echo "MOMO_DAILY_FRESHNESS ${momo_freshness:-unavailable}"
echo "MOMO_IMPORT_CONFIG ${momo_import_config:-unavailable}"
echo "MOMO_LATEST_IMPORT_JOB ${momo_latest_import_job:-unavailable}"
' 2>&1); then
    echo "$out"
    grep -q "CRON_188 active" <<<"$out" && ok "188 cron active" || warn "188 cron not confirmed"
    # Exporter freshness thresholds are in seconds (90000 s = 25 h).
    awk '/TEXTFILE_188 backup.prom age=/ {split($3,a,"="); pass=(a[2] < 90000); exit} END {exit !pass}' <<<"$out" && ok "188 backup textfile fresh enough" || warn "188 backup textfile stale or missing"
    awk '/TEXTFILE_188 backup_health.prom age=/ {split($3,a,"="); pass=(a[2] < 900); exit} END {exit !pass}' <<<"$out" && ok "188 backup health exporter fresh" || warn "188 backup health exporter stale"
    awk '/TEXTFILE_188 docker_restart_count.prom age=/ {split($3,a,"="); pass=(a[2] < 300); exit} END {exit !pass}' <<<"$out" && ok "188 docker restart exporter fresh" || warn "188 docker restart exporter stale"
    awk '/TEXTFILE_188 docker_stats.prom age=/ {split($3,a,"="); pass=(a[2] < 300); exit} END {exit !pass}' <<<"$out" && ok "188 docker stats exporter fresh" || warn "188 docker stats exporter stale"
    awk '/TEXTFILE_188 storage_health.prom age=/ {split($3,a,"="); pass=(a[2] < 300); exit} END {exit !pass}' <<<"$out" && ok "188 storage health exporter fresh" || warn "188 storage health exporter stale"
    grep -q "STORAGE_HEALTH_188 root_readonly=0 current=0" <<<"$out" && ok "188 current boot storage health clean" || warn "188 storage health not clean"
    awk '/BACKUP_110_AGE / {pass=($2 < 90000); exit} END {exit !pass}' <<<"$out" && ok "188 backup-from-110 success within 25h" || warn "188 backup-from-110 success not confirmed"
    # Fields 3-5 are stale / missing_cron / missing_script; all must be zero.
    grep -q "BACKUP_HEALTH_188 total=" <<<"$out" && awk '/BACKUP_HEALTH_188/ {split($3,a,"="); split($4,b,"="); split($5,c,"="); pass=((a[2]+b[2]+c[2]) == 0); exit} END {exit !pass}' <<<"$out" && ok "188 backup health has no stale expected jobs" || warn "188 backup health has stale expected jobs"
    if grep -q "SCHEDULER_CONTAINER_HEALTH healthy" <<<"$out" && awk '/SCHEDULER_RECENT_ACTIVITY / {pass=($2 > 0); exit} END {exit !pass}' <<<"$out"; then
      ok "188 momo scheduler healthy with recent task activity"
    elif awk '/SCHEDULER_REGISTERED / {pass=($2 > 0); exit} END {exit !pass}' <<<"$out"; then
      ok "188 momo scheduler registered jobs"
    else
      warn "188 momo scheduler registration/activity not confirmed"
    fi
    # Token owner uid must equal the scheduler process uid, mode at most 600.
    awk '/MOMO_GDRIVE_TOKEN_STAT / {split($2,a,":"); split($3,b,"="); pass=(a[1] == b[2] && a[3] <= 600); exit} END {exit !pass}' <<<"$out" && ok "188 momo Google Drive token ownership matches scheduler userns" || warn "188 momo Google Drive token ownership/writeback not confirmed"
    grep -Fq "MOMO_IMPORT_CONFIG 當日業績匯入|即時業績_當日" <<<"$out" && ok "188 momo Drive import config points to expected daily-sales intake" || fail "188 momo Drive import config drifted from expected daily-sales intake"
    awk '/MOMO_LATEST_IMPORT_JOB / {split($2,a,"|"); pass=(a[1] ~ /^[0-9]+$/ && a[2] == "completed" && a[6] == a[7] && a[8] == 0); exit} END {exit !pass}' <<<"$out" && ok "188 momo latest daily import job completed cleanly" || warn "188 momo latest daily import job not confirmed clean"
    awk '/MOMO_MONTHLY_SYNC / {split($2,a,"|"); pass=(a[1] > 0 && a[1] == a[2] && a[3] == a[5] && a[4] == a[6]); exit} END {exit !pass}' <<<"$out" && ok "188 momo current-month snapshot and realtime tables match" || warn "188 momo current-month snapshot/realtime sync not confirmed"
    # Freshness is the age in days of the newest snapshot: 0-2 is fresh,
    # 3 or more is blocking, anything non-numeric or absent is a warning.
    if awk '/MOMO_DAILY_FRESHNESS / {split($2,a,"|"); pass=(a[1] ~ /^[0-9]+$/ && a[1] >= 0 && a[1] <= 2); exit} END {exit !pass}' <<<"$out"; then
      ok "188 momo daily sales data fresh enough"
    elif awk '/MOMO_DAILY_FRESHNESS / {split($2,a,"|"); pass=(a[1] ~ /^[0-9]+$/ && a[1] >= 3); exit} END {exit !pass}' <<<"$out"; then
      if awk '/MOMO_SOURCE_EMPTY_EVIDENCE_LINES / {pass=($2 > 0); exit} END {exit !pass}' <<<"$out"; then
        fail "188 momo source file absent while daily sales data stale"
      else
        fail "188 momo daily sales data stale beyond 3 days"
      fi
    else
      warn "188 momo daily sales freshness not confirmed"
    fi
  else
    warn "188 schedule check unavailable"
    echo "$out"
  fi

  # --- 110: cron, failed units, stale startup units, exporters, backups ----
  if out=$(host_cmd "wooo@192.168.0.110" '
now=$(date +%s)
echo "CRON_110 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
echo "FAILED_UNITS_110 $(systemctl --failed --no-legend --plain 2>/dev/null | wc -l)"
echo "MOMO_STARTUP_ENABLED $(systemctl is-enabled momo-startup-complete.service 2>/dev/null || true)"
echo "STAGGERED_STARTUP_ENABLED $(systemctl is-enabled wooo-staggered-startup.service 2>/dev/null || true)"
for f in /home/wooo/node_exporter_textfiles/docker_stats.prom /home/wooo/node_exporter_textfiles/systemd_units.prom /home/wooo/node_exporter_textfiles/storage_health.prom /home/wooo/node_exporter_textfiles/backup_health.prom; do
if [ -f "$f" ]; then
mt=$(stat -c %Y "$f")
echo "TEXTFILE_110 $(basename "$f") age=$((now - mt))"
else
echo "TEXTFILE_110 $(basename "$f") missing"
fi
done
if [ -f /home/wooo/node_exporter_textfiles/storage_health.prom ]; then
awk "/^awoooi_host_storage_root_readonly/ {readonly=int(\$2)} /^awoooi_host_storage_current_boot_error_count/ {current=int(\$2)} END {printf \"STORAGE_HEALTH_110 root_readonly=%d current=%d\\n\", readonly+0, current+0}" /home/wooo/node_exporter_textfiles/storage_health.prom
fi
if [ -f /home/wooo/node_exporter_textfiles/backup_health.prom ]; then
awk "/^awoooi_backup_job_fresh/ {total++; if (int(\$2) == 0) stale++} /^awoooi_backup_job_configured/ {if (int(\$2) == 0) missing_cron++} /^awoooi_backup_script_present/ {if (int(\$2) == 0) missing_script++} /^awoooi_backup_last_run_failed_count/ {if (\$0 ~ /(exported_job|job)=\"backup_all\"/) failed=int(\$2)} /^awoooi_backup_config_capture_critical_failed_count/ {config_failed=int(\$2)} /^awoooi_backup_integrity_fresh/ {integrity_total++; if (int(\$2) == 0) integrity_stale++} END {printf \"BACKUP_HEALTH_110 total=%d stale=%d missing_cron=%d missing_script=%d failed_count=%d config_failed=%d integrity_total=%d integrity_stale=%d\\n\", total+0, stale+0, missing_cron+0, missing_script+0, failed+0, config_failed+0, integrity_total+0, integrity_stale+0}" /home/wooo/node_exporter_textfiles/backup_health.prom
fi
' 2>&1); then
    echo "$out"
    grep -q "CRON_110 active" <<<"$out" && ok "110 cron active" || warn "110 cron not confirmed"
    grep -q "FAILED_UNITS_110 0" <<<"$out" && ok "110 systemd has no failed units" || warn "110 systemd failed units remain"
    grep -q "MOMO_STARTUP_ENABLED disabled" <<<"$out" && ok "110 stale momo startup unit disabled" || warn "110 stale momo startup unit not disabled"
    grep -q "STAGGERED_STARTUP_ENABLED disabled" <<<"$out" && ok "110 stale staggered startup unit disabled" || warn "110 stale staggered startup unit not disabled"
    awk '/TEXTFILE_110 docker_stats.prom age=/ {split($3,a,"="); pass=(a[2] < 300); exit} END {exit !pass}' <<<"$out" && ok "110 docker stats exporter fresh" || warn "110 docker stats exporter stale"
    awk '/TEXTFILE_110 systemd_units.prom age=/ {split($3,a,"="); pass=(a[2] < 300); exit} END {exit !pass}' <<<"$out" && ok "110 systemd units exporter fresh" || warn "110 systemd units exporter stale"
    awk '/TEXTFILE_110 storage_health.prom age=/ {split($3,a,"="); pass=(a[2] < 300); exit} END {exit !pass}' <<<"$out" && ok "110 storage health exporter fresh" || warn "110 storage health exporter stale"
    awk '/TEXTFILE_110 backup_health.prom age=/ {split($3,a,"="); pass=(a[2] < 900); exit} END {exit !pass}' <<<"$out" && ok "110 backup health exporter fresh" || warn "110 backup health exporter stale"
    grep -q "STORAGE_HEALTH_110 root_readonly=0 current=0" <<<"$out" && ok "110 current boot storage health clean" || warn "110 storage health not clean"
    # Fields 3-7: stale, missing_cron, missing_script, failed_count, config_failed.
    grep -q "BACKUP_HEALTH_110 total=" <<<"$out" && awk '/BACKUP_HEALTH_110/ {split($3,a,"="); split($4,b,"="); split($5,c,"="); split($6,d,"="); split($7,e,"="); pass=((a[2]+b[2]+c[2]) == 0 && d[2] == 0 && e[2] == 0); exit} END {exit !pass}' <<<"$out" && ok "110 backup health has no stale expected jobs" || warn "110 latest aggregate/config backup had failed components; rerun backup-all after 120 recovers"
    # Field 9 is integrity_stale.
    awk '/BACKUP_HEALTH_110/ {split($9,a,"="); pass=(a[2] == 0); exit} END {exit !pass}' <<<"$out" && ok "110 backup integrity and restore drill fresh" || warn "110 backup integrity or restore drill stale"
  else
    warn "110 schedule check unavailable"
    echo "$out"
  fi

  # --- 120: cron plus Kubernetes CronJobs, Jobs and pods in awoooi-prod -----
  if out=$(host_cmd "wooo@192.168.0.120" '
kcmd() {
if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
else
sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
fi
}
echo "CRON_120 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
kcmd get cronjobs -n awoooi-prod -o json | python3 -c "import json,sys; d=json.load(sys.stdin); items=d.get(\"items\", []); print(\"CRONJOB_COUNT\", len(items)); print(\"CRONJOB_SUSPENDED\", sum(1 for i in items if i.get(\"spec\",{}).get(\"suspend\")))"
kcmd get jobs -n awoooi-prod -o json | python3 -c "import json,sys,re; d=json.load(sys.stdin)
def owner(job):
    for ref in job.get(\"metadata\",{}).get(\"ownerReferences\",[]) or []:
        if ref.get(\"kind\") == \"CronJob\" and ref.get(\"name\"):
            return ref.get(\"name\")
    name = job.get(\"metadata\",{}).get(\"name\", \"\")
    return re.sub(r\"-[0-9]+$\", \"\", name)
def has_condition(job, kind):
    return any(c.get(\"type\") == kind and c.get(\"status\") == \"True\" for c in job.get(\"status\",{}).get(\"conditions\",[]) or [])
def job_time(job):
    status = job.get(\"status\",{})
    return status.get(\"completionTime\") or status.get(\"startTime\") or \"\"
latest_success = {}
failed_jobs = []
for job in d.get(\"items\", []):
    own = owner(job)
    ts = job_time(job)
    if has_condition(job, \"Complete\"):
        latest_success[own] = max(latest_success.get(own, \"\"), ts)
    if has_condition(job, \"Failed\"):
        failed_jobs.append((own, job.get(\"metadata\",{}).get(\"name\", \"\"), ts))
active_failed = 0
stale_failed = 0
for own, name, ts in failed_jobs:
    if ts and latest_success.get(own, \"\") > ts:
        stale_failed += 1
    else:
        active_failed += 1
print(\"FAILED_JOBS\", len(failed_jobs))
print(\"STALE_FAILED_JOBS\", stale_failed)
print(\"ACTIVE_FAILED_JOBS\", active_failed)"
kcmd get pods -n awoooi-prod --no-headers 2>/dev/null | awk "\$3 !~ /^(Running|Completed)$/ {bad++} END {print \"BAD_PODS\", bad+0}"
' 2>&1); then
    echo "$out"
    grep -q "CRON_120 active" <<<"$out" && ok "120 cron active" || warn "120 cron not confirmed"
    awk '/CRONJOB_COUNT / {pass=($2 >= 4); exit} END {exit !pass}' <<<"$out" && ok "K8s AWOOOI CronJobs present" || warn "K8s AWOOOI CronJobs missing"
    grep -q "CRONJOB_SUSPENDED 0" <<<"$out" && ok "K8s AWOOOI CronJobs unsuspended" || warn "K8s AWOOOI CronJob suspended"
    grep -q "ACTIVE_FAILED_JOBS 0" <<<"$out" && ok "K8s AWOOOI has no active failed Jobs" || warn "K8s AWOOOI active failed Jobs remain"
    grep -q "BAD_PODS 0" <<<"$out" && ok "K8s AWOOOI pods Running/Completed only" || warn "K8s AWOOOI bad pod status remains"
  else
    warn "120 K8s schedule check unavailable"
    echo "$out"
  fi

  # --- 121: cron and the DR drill crontab entry ----------------------------
  if out=$(host_cmd "wooo@192.168.0.121" '
echo "CRON_121 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
crontab -l 2>/dev/null | grep -q "dr-drill.sh" && echo "DR_DRILL_CRON present" || echo "DR_DRILL_CRON missing"
' 2>&1); then
    echo "$out"
    grep -q "CRON_121 active" <<<"$out" && ok "121 cron active" || warn "121 cron not confirmed"
    grep -q "DR_DRILL_CRON present" <<<"$out" && ok "121 DR drill cron present" || warn "121 DR drill cron missing"
  else
    warn "121 schedule check unavailable"
    echo "$out"
  fi
}
# Print the gate tallies and the overall verdict, then set the exit status.
# Exits 2 when any gate is blocked, exits 1 when only warnings remain, and
# returns normally when everything passed.
summary() {
  log_section "SUMMARY"
  echo "PASS=$PASS WARN=$WARN BLOCKED=$FAIL"
  local verdict rc
  if (( FAIL > 0 )); then
    rc=2
    verdict="Result: BLOCKED. Fix the first blocked gate before releasing runner/CD/AI auto-remediation."
  elif (( WARN > 0 )); then
    rc=1
    verdict="Result: DEGRADED. Core gates passed but warnings remain."
  else
    rc=0
    verdict="Result: GREEN. Full stack is ready for controlled runner/CD release."
  fi
  echo "$verdict"
  [ "$rc" -eq 0 ] || exit "$rc"
}
# Watch mode: re-run this script (without --watch) until one run exits 0 or
# the attempt budget is spent.
# Globals read: MONITOR_READ_ONLY, NO_COLOR_FLAG, SEND_ALERT_TEST,
#               WATCH_MAX_ATTEMPTS (0 = unlimited), WATCH_INTERVAL (seconds).
# Never returns: exits 0 on the first GREEN run, otherwise exits with the
# status of the last attempt.
run_watch_mode() {
  local attempt=1 rc=2
  local -a args
  while true; do
    echo "WATCH_ATTEMPT=$attempt"
    # Forward only the mode flags; the watch options stay with this process.
    args=()
    [ "$MONITOR_READ_ONLY" -eq 1 ] && args+=(--monitor-read-only)
    [ "$NO_COLOR_FLAG" -eq 1 ] && args+=(--no-color)
    [ "$SEND_ALERT_TEST" -eq 1 ] && args+=(--send-alert-test)
    # ${args[@]+...} expands to nothing for an empty array. A bare
    # "${args[@]}" under "set -u" aborts with "unbound variable" on bash
    # older than 4.4 whenever no flag is forwarded.
    bash "$0" ${args[@]+"${args[@]}"}
    rc=$?
    [ "$rc" -eq 0 ] && exit 0
    if [ "$WATCH_MAX_ATTEMPTS" -gt 0 ] && [ "$attempt" -ge "$WATCH_MAX_ATTEMPTS" ]; then
      exit "$rc"
    fi
    attempt=$((attempt + 1))
    sleep "$WATCH_INTERVAL"
  done
}
if [ "${WATCH_MODE:-0}" -eq 1 ]; then
  run_watch_mode
fi
# Single pass: print the banner, run every gate group in priority order, then
# let summary print the verdict and choose the exit status.
for step in \
  print_header \
  check_network \
  check_188 \
  check_110 \
  check_k3s \
  check_workload_and_alertchain \
  check_public_routes \
  check_schedules \
  summary; do
  "$step"
done