#!/usr/bin/env bash
# AWOOOI full-stack cold-start readiness check.
# Read-only by design. It never restarts, deletes, repairs, or writes remote state.

# -u: expanding an unset variable is an error.
# pipefail: a pipeline fails when any of its stages fails.
# -e is not enabled; the check functions record failing gates through
# fail()/warn() and then carry on with the next gate.
set -uo pipefail

# Options handed to every ssh invocation: never prompt, 6s connect timeout.
SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=6)
# 1 when --send-alert-test was requested (POST one Alertmanager webhook test).
SEND_ALERT_TEST=0
# 1 when --monitor-read-only was requested.
MONITOR_READ_ONLY=0
# 1 when --no-color was requested.
NO_COLOR_FLAG=0
# 1 when --watch was requested.
WATCH_MODE=0
# Seconds between --watch attempts.
WATCH_INTERVAL=60
# Maximum number of --watch attempts; 0 means unlimited.
WATCH_MAX_ATTEMPTS=30

# Print the command-line help text to stdout.
# Callers redirect to stderr themselves when reporting a usage error.
usage() {
  cat <<'USAGE'
Usage: bash scripts/reboot-recovery/full-stack-cold-start-check.sh [options]

Options:
  --send-alert-test     POST one Alertmanager webhook test after AWOOOI API is ready.
  --monitor-read-only   Skip the webhook POST without warning; intended for cron/textfile monitors.
  --watch               Repeat checks until all gates are GREEN or max attempts is reached.
  --interval SECONDS    Retry interval for --watch. Default: 60.
  --max-attempts COUNT  Max attempts for --watch. Default: 30. Use 0 for unlimited.
  --no-color            Disable ANSI colors in output.
  -h, --help            Show this help.

Default mode is read-only and does not POST an Alertmanager test event.
Use --send-alert-test for the final release gate after AWOOOI API is expected to be ready.
USAGE
}

# Parse command-line options into the global flag variables
# (SEND_ALERT_TEST, MONITOR_READ_ONLY, NO_COLOR_FLAG, WATCH_MODE,
# WATCH_INTERVAL, WATCH_MAX_ATTEMPTS).
# Exits 64 on an unknown option or an invalid option value, and 0 after
# printing help for -h/--help.
parse_args() {
  local arg
  while [ "$#" -gt 0 ]; do
    arg="$1"
    case "$arg" in
      --send-alert-test)
        SEND_ALERT_TEST=1
        ;;
      --monitor-read-only)
        MONITOR_READ_ONLY=1
        SEND_ALERT_TEST=0
        ;;
      --no-color)
        NO_COLOR_FLAG=1
        ;;
      --watch)
        WATCH_MODE=1
        ;;
      --interval)
        # Move on to the option's value; a missing value fails the check below.
        shift
        if ! [[ "${1:-}" =~ ^[0-9]+$ ]] || [ "${1:-0}" -lt 1 ]; then
          echo "--interval requires a positive integer number of seconds" >&2
          exit 64
        fi
        WATCH_INTERVAL="$1"
        ;;
      --max-attempts)
        shift
        if ! [[ "${1:-}" =~ ^[0-9]+$ ]]; then
          echo "--max-attempts requires a non-negative integer" >&2
          exit 64
        fi
        WATCH_MAX_ATTEMPTS="$1"
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown argument: $arg" >&2
        usage >&2
        exit 64
        ;;
    esac
    shift
  done

  # Read-only monitor mode always wins, whatever order the flags came in;
  # otherwise "--monitor-read-only --send-alert-test" would re-enable the POST.
  if [ "${MONITOR_READ_ONLY:-0}" -eq 1 ]; then
    SEND_ALERT_TEST=0
  fi
}

parse_args "$@"

# ANSI colour codes used by the output helpers. They stay empty when the
# NO_COLOR environment variable is non-empty or --no-color was given.
RED=""
GREEN=""
YELLOW=""
BLUE=""
NC=""
if [ -z "${NO_COLOR:-}" ] && [ "${NO_COLOR_FLAG:-0}" -ne 1 ]; then
  RED=$'\033[0;31m'
  GREEN=$'\033[0;32m'
  YELLOW=$'\033[1;33m'
  BLUE=$'\033[0;34m'
  NC=$'\033[0m'
fi

# Gate counters, incremented by ok(), warn() and fail() and read by summary().
PASS=0
WARN=0
FAIL=0

# Print a section banner "=== <title> ===" preceded by a blank line.
# Arguments: $1 - section title
log_section() {
  printf "\n%s=== %s ===%s\n" "$BLUE" "$1" "$NC"
}

# Report a passed gate and count it in PASS.
# Arguments: $1 - message
# Always returns 0 (the final assignment succeeds), which is what makes the
# "test && ok ... || warn ..." one-liners used by the check functions safe.
ok() {
  printf "%sOK%s %s\n" "$GREEN" "$NC" "$1"
  PASS=$((PASS + 1))
}

# Report a degraded (non-blocking) gate and count it in WARN.
# Arguments: $1 - message
warn() {
  printf "%sWARN%s %s\n" "$YELLOW" "$NC" "$1"
  WARN=$((WARN + 1))
}

# Report a blocking gate (printed as BLOCKED) and count it in FAIL.
# Arguments: $1 - message
fail() {
  printf "%sBLOCKED%s %s\n" "$RED" "$NC" "$1"
  FAIL=$((FAIL + 1))
}

# Run a local command as a gate: report ok/fail under LABEL, then print the
# command's combined stdout/stderr.
# Arguments: $1 - label; remaining arguments - the command and its arguments
# Returns:   0 when the command succeeded, 1 otherwise
run_local() {
  local label="$1"
  shift
  local capture
  local status=0

  # A private temp file instead of a fixed /tmp name, so concurrent runs and
  # pre-created files or symlinks cannot interfere with the captured output.
  if ! capture=$(mktemp "${TMPDIR:-/tmp}/awoooi-cold-start-check.XXXXXX"); then
    fail "$label"
    echo "run_local: could not create a temporary file" >&2
    return 1
  fi

  "$@" >"$capture" 2>&1 || status=$?
  if [ "$status" -eq 0 ]; then
    ok "$label"
  else
    fail "$label"
  fi
  cat "$capture"
  rm -f -- "$capture"
  [ "$status" -eq 0 ]
}

# Run a command string on a remote host over ssh using SSH_OPTS.
# Arguments: $1 - user@host
#            $2 - command string executed by the remote shell
# Environment: when REMOTE_SUDO_PASSWORD is non-empty, a shell-quoted
#              "REMOTE_SUDO_PASSWORD=<value> " assignment is prepended to the
#              remote command string.
# Returns: the exit status of ssh.
# NOTE(review): the password therefore travels inside the ssh argument list,
# where it is presumably visible in process listings - confirm this is
# acceptable for the environments this runs in.
ssh_cmd() {
  local user_host="$1"
  local cmd="$2"
  local prefix=""
  if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
    # %q quotes the value so the remote shell reads it back as one word.
    printf -v prefix 'REMOTE_SUDO_PASSWORD=%q ' "$REMOTE_SUDO_PASSWORD"
  fi
  ssh "${SSH_OPTS[@]}" "$user_host" "${prefix}${cmd}"
}

# Succeed when this machine has the given IPv4 address configured.
# Arguments: $1 - IPv4 address to look for
# Tries "ip -o -4 addr show" first (when ip exists), then "hostname -I".
# The comparison is an exact string match done inside awk: the address is not
# used as a regular expression (its dots would match any character), and awk
# reads all of its input, so the producer cannot be killed by SIGPIPE and
# make the pipeline fail under `set -o pipefail`.
host_has_ip() {
  local expected_ip="$1"
  if command -v ip >/dev/null 2>&1; then
    # Field 4 of the one-line output is "<address>/<prefix length>".
    if ip -o -4 addr show 2>/dev/null \
      | awk -v want="$expected_ip" \
        '{ split($4, cidr, "/"); if (cidr[1] == want) hit = 1 } END { exit !hit }'; then
      return 0
    fi
  fi
  hostname -I 2>/dev/null \
    | awk -v want="$expected_ip" \
      '{ for (i = 1; i <= NF; i++) if ($i == want) hit = 1 } END { exit !hit }'
}

# Run a command string on one of the stack hosts.
# Arguments: $1 - user@host
#            $2 - command string
# When the target is one of the four known stack addresses and this machine
# owns that address (host_has_ip), the command runs locally in a login shell;
# in every other case it is sent through ssh_cmd.
# Returns: the exit status of the local shell or of ssh_cmd.
# NOTE(review): the local path does not add the REMOTE_SUDO_PASSWORD prefix
# that ssh_cmd adds, so the command only sees that variable if it is exported
# in the calling environment - confirm this is intended.
host_cmd() {
  local user_host="$1"
  local cmd="$2"
  case "$user_host" in
    *@192.168.0.110 | *@192.168.0.120 | *@192.168.0.121 | *@192.168.0.188)
      # Everything after the last "@" is the address matched above.
      if host_has_ip "${user_host##*@}"; then
        bash -lc "$cmd"
        return
      fi
      ;;
  esac
  ssh_cmd "$user_host" "$cmd"
}

# Print the HTTP status code returned for a URL, trying at most twice.
# Arguments: $1 - URL
# Outputs:   the first three-digit code other than 000 that curl reports;
#            otherwise whatever the last attempt produced, or 000 when that
#            is empty.
probe_http_code() {
  local url="$1"
  local attempt code
  for attempt in 1 2; do
    code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 12 "$url" 2>/dev/null || true)
    if [[ "$code" =~ ^[0-9]{3}$ ]] && [ "$code" != "000" ]; then
      echo "$code"
      return
    fi
    # Pause only between attempts; waiting after the last one gains nothing.
    if [ "$attempt" -lt 2 ]; then
      sleep 1
    fi
  done
  echo "${code:-000}"
}

# Succeed when a TCP connection to host:port can be opened.
# Arguments: $1 - host, $2 - port
# Tries "nc -G 3" first and falls back to "nc -w 3" when that fails
# (presumably to cover nc variants with different timeout flags - confirm).
probe_tcp() {
  local host="$1"
  local port="$2"
  nc -G 3 -z "$host" "$port" >/dev/null 2>&1 \
    || nc -w 3 -z "$host" "$port" >/dev/null 2>&1
}

# Print the neighbour-table rows that mention one of the four stack hosts.
# Uses "arp -an" when arp exists, otherwise "ip neigh show".
# Returns: the status of the arp/ip | grep pipeline (0 when at least one row
#          was printed), or 1 when neither tool is available.
print_neighbor_rows() {
  local stack_hosts='192\.168\.0\.(110|120|121|188)'
  if command -v arp >/dev/null 2>&1; then
    arp -an | grep -E "$stack_hosts"
    return
  fi
  if command -v ip >/dev/null 2>&1; then
    ip neigh show | grep -E "$stack_hosts"
    return
  fi
  return 1
}

# Print the report header: title, current local time, scope and baseline file.
print_header() {
  echo "AWOOOI full-stack cold-start check"
  date '+%Y-%m-%d %H:%M:%S %Z'
  echo "Scope: 110 / 120 / 121 / 188. 112 Kali is intentionally skipped."
  echo "Baseline: ops/reboot-recovery/full-stack-cold-start-baseline.yml"
}

# P0 gate: basic reachability of the four stack hosts.
# For 192.168.0.{110,120,121,188}: one ping and a TCP probe of port 22; each
# failure is a blocking gate. Afterwards neighbour-table rows are printed as
# supporting evidence; their absence is a warning, or OK in monitor mode.
check_network() {
  log_section "P0-NETWORK"
  local host addr
  for host in 110 120 121 188; do
    addr="192.168.0.$host"
    if ping -c 1 -W 2 "$addr" >/dev/null 2>&1; then
      ok "ping $addr"
    else
      fail "ping $addr"
    fi

    if probe_tcp "$addr" 22; then
      ok "ssh port $addr:22"
    else
      fail "ssh port $addr:22"
    fi
  done

  if print_neighbor_rows; then
    ok "neighbor evidence printed"
  elif [ "$MONITOR_READ_ONLY" -eq 1 ]; then
    ok "neighbor evidence unavailable in monitor mode; ping and TCP gates provide primary signal"
  else
    warn "no neighbor rows printed for one or more hosts"
  fi
}

# P0 gate for the data host 192.168.0.188.
# Runs one read-only command batch on the host (uptime, memory, systemd unit
# states, PostgreSQL, Redis, SignOz and momo HTTP codes, docker ps), prints
# the raw output, then grades it. PostgreSQL problems and a failed command
# batch are blocking; the remaining findings are warnings.
check_188() {
  log_section "P0-188-DATA"
  local out
  if ! out=$(host_cmd "ollama@192.168.0.188" '
    echo "HOST $(hostname) $(uptime)"
    echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
    echo "SYSTEMD $(systemctl is-active containerd docker postgresql@14-main redis-server ollama nginx 2>/dev/null | tr "\n" " ")"
    echo "PG $(pg_isready -h localhost -p 5432 2>&1)"
    echo "REDIS $(redis-cli -p 6380 ping 2>/dev/null || redis-cli ping 2>/dev/null || true)"
    echo "PORT5432 $(nc -z -w 2 127.0.0.1 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
    echo "SIGNOZ_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3301/ || true)"
    echo "MOMO_HEALTH_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5003/health || true)"
    docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -80
  ' 2>&1); then
    fail "ssh 188 read-only check"
    printf '%s\n' "$out"
    return
  fi
  printf '%s\n' "$out"

  # ok() always returns 0, so "test && ok ... || fail ..." behaves as if/else.
  grep -q "PORT5432 OPEN" <<<"$out" && ok "188 PostgreSQL port open" || fail "188 PostgreSQL port closed"
  grep -q "accepting connections" <<<"$out" && ok "188 PostgreSQL accepting connections" || fail "188 PostgreSQL not accepting connections"
  grep -q "REDIS PONG" <<<"$out" && ok "188 Redis PONG" || warn "188 Redis not confirmed"
  grep -q "momo-db.*Restarting" <<<"$out" && warn "188 momo-db restarting" || ok "188 momo-db not in visible restart loop"
  grep -Eq "SIGNOZ_CODE (200|302|307)" <<<"$out" && ok "188 SignOz HTTP reachable" || warn "188 SignOz HTTP not confirmed"
  grep -q "MOMO_HEALTH_CODE 200" <<<"$out" && ok "188 momo health reachable" || warn "188 momo health not confirmed"
}

# P0 gate for the registry/observability host 192.168.0.110.
# Runs one read-only command batch on the host (uptime, memory, docker unit
# state, HTTP codes of Harbor, Gitea, Prometheus, Alertmanager and Sentry,
# systemd properties of every actions.runner.* unit, docker ps), prints the
# raw output, then grades it. Harbor and a failed command batch are blocking;
# the remaining findings are warnings.
check_110() {
  log_section "P0-110-REGISTRY-OBSERVABILITY"
  local out
  if ! out=$(host_cmd "wooo@192.168.0.110" '
    echo "HOST $(hostname) $(uptime)"
    echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
    echo "DOCKER_SYSTEMD $(systemctl is-active docker 2>/dev/null || true)"
    echo "HARBOR_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5000/v2/ || true)"
    echo "GITEA_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3001/ || true)"
    echo "PROM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9090/-/ready || true)"
    echo "AM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9093/-/healthy || true)"
    echo "SENTRY_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 8 http://127.0.0.1:9000/ || true)"
    for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
      systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /"
    done
    docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120
  ' 2>&1); then
    fail "ssh 110 read-only check"
    printf '%s\n' "$out"
    return
  fi
  printf '%s\n' "$out"

  # ok() always returns 0, so "test && ok ... || warn ..." behaves as if/else.
  grep -Eq "HARBOR_CODE (200|401)" <<<"$out" && ok "110 Harbor /v2 healthy code" || fail "110 Harbor not healthy"
  grep -Eq "GITEA_CODE (200|302)" <<<"$out" && ok "110 Gitea reachable" || warn "110 Gitea not confirmed"
  grep -q "PROM_CODE 200" <<<"$out" && ok "110 Prometheus ready" || warn "110 Prometheus not ready"
  grep -q "AM_CODE 200" <<<"$out" && ok "110 Alertmanager healthy" || warn "110 Alertmanager not healthy"
  grep -Eq "SENTRY_CODE (200|302|400)" <<<"$out" && ok "110 Sentry HTTP reachable" || warn "110 Sentry HTTP not confirmed"
  grep -q "WatchdogUSec=0" <<<"$out" && ok "runner watchdog disabled on at least one unit" || warn "runner watchdog state not confirmed"
  grep -q "sentry-self-hosted-clickhouse-1.*Restarting" <<<"$out" && warn "Sentry ClickHouse restarting" || ok "Sentry ClickHouse not visibly restarting"
}

# P1 gate for the K3s node 192.168.0.120.
# Runs one read-only command batch on the host (uptime, reachability of
# PostgreSQL on 188, k3s/keepalived unit states, kubectl node and pod
# listings, presence of VIP 192.168.0.125), prints the raw output, then
# grades it. When the remote output shows no Ready node, a local
# "kubectl get nodes" is tried as a fallback source for the Ready check.
check_k3s() {
  log_section "P1-K3S"
  local out
  local local_kubectl_out=""
  if ! out=$(host_cmd "wooo@192.168.0.120" '
    echo "HOST $(hostname) $(uptime)"
    echo "PG188_PORT $(nc -z -w 2 192.168.0.188 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
    echo "SYSTEMD $(systemctl is-active k3s k3s-agent keepalived 2>/dev/null | tr "\n" " ")"
    kcmd() {
      if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
        printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
      else
        sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
      fi
    }
    kcmd get nodes -o wide 2>/dev/null || true
    kcmd get pods -n awoooi-prod -o wide 2>/dev/null || true
    ip addr show | grep 192.168.0.125 || true
  ' 2>&1); then
    fail "ssh 120 k3s read-only check"
    printf '%s\n' "$out"
    return
  fi
  printf '%s\n' "$out"

  if ! grep -q " Ready " <<<"$out"; then
    local_kubectl_out=$(kubectl get nodes -o wide 2>/dev/null || true)
    if [ -n "$local_kubectl_out" ]; then
      echo "LOCAL_KUBECTL_FALLBACK"
      printf '%s\n' "$local_kubectl_out"
    fi
  fi

  # ok() always returns 0, so "test && ok ... || fail ..." behaves as if/else.
  grep -q "PG188_PORT OPEN" <<<"$out" && ok "120 can reach 188 PostgreSQL port" || fail "120 cannot reach 188 PostgreSQL"
  grep -q " Ready " <<<"$out$local_kubectl_out" && ok "K3s has Ready node output" || fail "K3s nodes not Ready or kubectl unavailable"
  grep -q "192.168.0.125" <<<"$out" && ok "VIP 192.168.0.125 present on 120" || warn "VIP not confirmed on 120"
}

# P2 gate: AWOOOI API/Web reachability through the VIP, plus the optional
# Alertmanager webhook test.
# The two HTTP codes are fetched from host 120; when that command fails they
# are probed from this machine instead. API failure is blocking, Web failure
# is a warning. The webhook POST is only sent when SEND_ALERT_TEST is 1;
# otherwise it is skipped, silently in monitor mode and with a warning in
# the default mode.
check_workload_and_alertchain() {
  log_section "P2-WORKLOAD-ALERTCHAIN"
  local api_code web_code alert_code
  local out
  if out=$(host_cmd "wooo@192.168.0.120" '
    api_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32334/api/v1/health 2>/dev/null || true)
    web_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32335/ 2>/dev/null || true)
    echo "API_CODE ${api_code:-000}"
    echo "WEB_CODE ${web_code:-000}"
  ' 2>/dev/null); then
    api_code=$(awk '/^API_CODE / {print $2}' <<<"$out")
    web_code=$(awk '/^WEB_CODE / {print $2}' <<<"$out")
  else
    api_code=$(probe_http_code "http://192.168.0.125:32334/api/v1/health")
    web_code=$(probe_http_code "http://192.168.0.125:32335/")
    # Same two-line shape as the remote output, built without indentation
    # leaking into the second line.
    out="API_CODE $api_code"$'\n'"WEB_CODE $web_code"
  fi

  printf '%s\n' "$out"

  # ok() always returns 0, so "test && ok ... || fail ..." behaves as if/else.
  [[ "$api_code" =~ ^[23] ]] && ok "AWOOOI API reachable" || fail "AWOOOI API not reachable"
  [[ "$web_code" =~ ^[23] ]] && ok "AWOOOI Web reachable" || warn "AWOOOI Web not confirmed"

  if [ "$SEND_ALERT_TEST" -eq 1 ]; then
    # NOTE(review): the curl below stores the response body in
    # /tmp/awoooi-alertchain.out on host 120; nothing in this script reads it.
    alert_code=$(host_cmd "wooo@192.168.0.120" 'curl -s -o /tmp/awoooi-alertchain.out -w "%{http_code}" --max-time 8 \
      -X POST "http://192.168.0.125:32334/api/v1/webhooks/alertmanager" \
      -H '"'"'Content-Type: application/json'"'"' \
      -d '"'"'{"receiver":"cold-start-check","status":"firing","alerts":[{"status":"firing","labels":{"alertname":"ColdStartCheck","severity":"info"},"annotations":{"summary":"Cold start check"},"startsAt":"2026-05-05T11:00:00Z","endsAt":"0001-01-01T00:00:00Z","generatorURL":""}],"groupLabels":{},"commonLabels":{},"commonAnnotations":{},"externalURL":"","version":"4","groupKey":"cold-start-check"}'"'"' 2>/dev/null || echo "000"')
    echo "ALERTCHAIN_CODE $alert_code"
    [[ "$alert_code" =~ ^2 ]] && ok "Alertmanager webhook endpoint accepts POST" || warn "Alertmanager webhook E2E not confirmed"
  elif [ "$MONITOR_READ_ONLY" -eq 1 ]; then
    ok "Alertmanager webhook POST intentionally skipped in read-only monitor mode"
  else
    warn "Alertmanager webhook POST skipped; rerun with --send-alert-test after API is ready"
  fi
}

# P2 gate: public HTTPS routes.
# Each "name|url" entry is probed twice: once through probe_http_code (a
# non-2xx/3xx code is a warning) and once with a single curl call reported
# as the TLS gate (a non-2xx/3xx code is blocking).
# NOTE(review): neither request disables certificate verification, so the
# second request cannot tell a certificate problem from an ordinary HTTP
# error; a 401/404/5xx answer is reported as "TLS certificate verification
# failed". Behaviour is kept as-is - confirm the intended pass criterion.
check_public_routes() {
  log_section "P2-PUBLIC-ROUTES"
  local item name url code tls_code
  local routes=(
    "awoooi_api|https://awoooi.wooo.work/api/v1/health"
    "awoooi_web|https://awoooi.wooo.work/"
    "momo_web|https://mo.wooo.work/"
    "momo_health|https://mo.wooo.work/health"
    "gitea|https://gitea.wooo.work/"
    "harbor|https://harbor.wooo.work/"
    "registry|https://registry.wooo.work/"
    "sentry|https://sentry.wooo.work/"
    "signoz|https://signoz.wooo.work/"
    "stock|https://stock.wooo.work/"
    "langfuse|https://langfuse.wooo.work/"
    "bitan|https://bitan.wooo.work/"
    "aiops|https://aiops.wooo.work/"
  )

  for item in "${routes[@]}"; do
    # Split "name|url" at the first "|".
    name="${item%%|*}"
    url="${item#*|}"

    code=$(probe_http_code "$url")
    echo "PUBLIC_ROUTE $name $code $url"
    # ok() always returns 0, so "test && ok ... || warn ..." behaves as if/else.
    [[ "$code" =~ ^[23] ]] && ok "public route $name reachable" || warn "public route $name not confirmed"

    tls_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 8 "$url" 2>/dev/null || true)
    tls_code="${tls_code:-000}"
    echo "PUBLIC_ROUTE_TLS $name $tls_code $url"
    [[ "$tls_code" =~ ^[23] ]] && ok "public route $name TLS certificate verified" || fail "public route $name TLS certificate verification failed"
  done
}

# Succeed only when TEXT contains a line "<TAG> <NAME> age=<seconds>" whose
# age is numerically below LIMIT. A missing line (including the
# "<TAG> <NAME> missing" form) fails, so an absent textfile is never
# reported as fresh.
# Arguments: $1 - text, $2 - tag (first field), $3 - file name, $4 - limit
_cs_textfile_fresh() {
  local text="$1" tag="$2" name="$3" limit="$4"
  awk -v tag="$tag" -v name="$name" -v limit="$limit" '
    $1 == tag && $2 == name && $3 ~ /^age=/ {
      found = 1
      split($3, kv, "=")
      stale = !((kv[2] + 0) < (limit + 0))
      exit
    }
    END { exit (found ? stale : 1) }
  ' <<<"$text"
}

# Succeed only when TEXT contains a line whose first field is KEY and whose
# second field is an integer satisfying "<value> OP BOUND".
# A missing line or a non-integer value fails.
# Arguments: $1 - text, $2 - key, $3 - operator (lt, ge or gt), $4 - bound
_cs_number_ok() {
  local text="$1" key="$2" op="$3" bound="$4"
  awk -v key="$key" -v op="$op" -v bound="$bound" '
    $1 == key {
      found = 1
      if ($2 !~ /^-?[0-9]+$/) { bad = 1; exit }
      v = $2 + 0
      b = bound + 0
      bad = !((op == "lt" && v < b) || (op == "ge" && v >= b) || (op == "gt" && v > b))
      exit
    }
    END { exit (found ? bad : 1) }
  ' <<<"$text"
}

# P2 gate: scheduled work on all four hosts.
# For 188, 110, 120 and 121 in turn, one read-only command batch is run on
# the host, its raw output is printed, and the output is graded. Everything
# in this gate is graded as OK or WARN; an unreachable host is a warning.
check_schedules() {
  log_section "P2-SCHEDULES"
  local out

  # ok() always returns 0, so the "test && ok ... || warn ..." one-liners
  # below behave as if/else.

  # --- 188: cron, node_exporter textfiles, backups, momo scheduler and sync.
  if out=$(host_cmd "ollama@192.168.0.188" '
    now=$(date +%s)
    echo "CRON_188 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    for f in /home/ollama/node_exporter_textfiles/backup.prom /home/ollama/node_exporter_textfiles/backup_health.prom /home/ollama/node_exporter_textfiles/docker_restart_count.prom /home/ollama/node_exporter_textfiles/docker_stats.prom /home/ollama/node_exporter_textfiles/storage_health.prom; do
      if [ -f "$f" ]; then
        mt=$(stat -c %Y "$f")
        echo "TEXTFILE_188 $(basename "$f") age=$((now - mt))"
      else
        echo "TEXTFILE_188 $(basename "$f") missing"
      fi
    done
    if [ -f /home/ollama/node_exporter_textfiles/backup.prom ]; then
      awk -v now="$now" "/^backup_110_last_success_timestamp / {printf \"BACKUP_110_AGE %d\\n\", now - int(\$2)}" /home/ollama/node_exporter_textfiles/backup.prom
    fi
    if [ -f /home/ollama/node_exporter_textfiles/backup_health.prom ]; then
      awk "/^awoooi_backup_job_fresh/ {total++; if (int(\$2) == 0) stale++} /^awoooi_backup_job_configured/ {if (int(\$2) == 0) missing_cron++} /^awoooi_backup_script_present/ {if (int(\$2) == 0) missing_script++} END {printf \"BACKUP_HEALTH_188 total=%d stale=%d missing_cron=%d missing_script=%d\\n\", total+0, stale+0, missing_cron+0, missing_script+0}" /home/ollama/node_exporter_textfiles/backup_health.prom
    fi
    if [ -f /home/ollama/node_exporter_textfiles/storage_health.prom ]; then
      awk "/^awoooi_host_storage_root_readonly/ {readonly=int(\$2)} /^awoooi_host_storage_current_boot_error_count/ {current=int(\$2)} END {printf \"STORAGE_HEALTH_188 root_readonly=%d current=%d\\n\", readonly+0, current+0}" /home/ollama/node_exporter_textfiles/storage_health.prom
    fi
    echo "SCHEDULER_CONTAINER_RUNNING $(docker inspect -f "{{.State.Running}}" momo-scheduler 2>/dev/null || true)"
    echo "SCHEDULER_CONTAINER_HEALTH $(docker inspect -f "{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}" momo-scheduler 2>/dev/null || true)"
    echo "SCHEDULER_REGISTERED $(docker logs --tail 200 momo-scheduler 2>&1 | grep -c "全部排程任務已註冊" || true)"
    echo "SCHEDULER_RECENT_ACTIVITY $(docker logs --since 2h momo-scheduler 2>&1 | grep -Ec "AutoImport|Meta-Analysis|Scheduler" || true)"
    momo_sync=$(docker exec momo-db sh -c "psql -U \"\$POSTGRES_USER\" -d \"\$POSTGRES_DB\" -Atc \"WITH scope AS (SELECT min(snapshot_date::date) dmin, max(snapshot_date::date) dmax, count(*) sc FROM daily_sales_snapshot WHERE snapshot_date::date >= make_date(extract(year from current_date)::int, extract(month from current_date)::int, 1)), monthly AS (SELECT count(*) mc, min(\\\"日期\\\"::date) mmin, max(\\\"日期\\\"::date) mmax FROM realtime_sales_monthly, scope WHERE scope.sc > 0 AND \\\"日期\\\"::date BETWEEN scope.dmin AND scope.dmax) SELECT coalesce(scope.sc,0)::text || chr(124) || coalesce(monthly.mc,0)::text || chr(124) || coalesce(scope.dmin::text,chr(45)) || chr(124) || coalesce(scope.dmax::text,chr(45)) || chr(124) || coalesce(monthly.mmin::text,chr(45)) || chr(124) || coalesce(monthly.mmax::text,chr(45)) FROM scope, monthly;\"" 2>/dev/null || true)
    echo "MOMO_MONTHLY_SYNC ${momo_sync:-unavailable}"
  ' 2>&1); then
    printf '%s\n' "$out"
    grep -q "CRON_188 active" <<<"$out" && ok "188 cron active" || warn "188 cron not confirmed"
    _cs_textfile_fresh "$out" TEXTFILE_188 backup.prom 90000 && ok "188 backup textfile fresh enough" || warn "188 backup textfile stale or missing"
    _cs_textfile_fresh "$out" TEXTFILE_188 backup_health.prom 900 && ok "188 backup health exporter fresh" || warn "188 backup health exporter stale"
    _cs_textfile_fresh "$out" TEXTFILE_188 docker_restart_count.prom 300 && ok "188 docker restart exporter fresh" || warn "188 docker restart exporter stale"
    _cs_textfile_fresh "$out" TEXTFILE_188 docker_stats.prom 300 && ok "188 docker stats exporter fresh" || warn "188 docker stats exporter stale"
    _cs_textfile_fresh "$out" TEXTFILE_188 storage_health.prom 300 && ok "188 storage health exporter fresh" || warn "188 storage health exporter stale"
    grep -q "STORAGE_HEALTH_188 root_readonly=0 current=0" <<<"$out" && ok "188 current boot storage health clean" || warn "188 storage health not clean"
    _cs_number_ok "$out" BACKUP_110_AGE lt 90000 && ok "188 backup-from-110 success within 25h" || warn "188 backup-from-110 success not confirmed"
    # Fields 3-5 are stale=, missing_cron= and missing_script=; all must be 0.
    grep -q "BACKUP_HEALTH_188 total=" <<<"$out" && awk '/BACKUP_HEALTH_188/ {split($3,a,"="); split($4,b,"="); split($5,c,"="); exit !((a[2]+b[2]+c[2]) == 0)}' <<<"$out" && ok "188 backup health has no stale expected jobs" || warn "188 backup health has stale expected jobs"
    if grep -q "SCHEDULER_CONTAINER_HEALTH healthy" <<<"$out" && _cs_number_ok "$out" SCHEDULER_RECENT_ACTIVITY gt 0; then
      ok "188 momo scheduler healthy with recent task activity"
    elif _cs_number_ok "$out" SCHEDULER_REGISTERED gt 0; then
      ok "188 momo scheduler registered jobs"
    else
      warn "188 momo scheduler registration/activity not confirmed"
    fi
    # Value is "sc|mc|dmin|dmax|mmin|mmax": both counts and both date ranges must agree.
    awk '/MOMO_MONTHLY_SYNC / {split($2,a,"|"); exit !(a[1] > 0 && a[1] == a[2] && a[3] == a[5] && a[4] == a[6])}' <<<"$out" && ok "188 momo current-month snapshot and realtime tables match" || warn "188 momo current-month snapshot/realtime sync not confirmed"
  else
    warn "188 schedule check unavailable"
    printf '%s\n' "$out"
  fi

  # --- 110: cron, failed units, stale startup units, textfiles, backups.
  if out=$(host_cmd "wooo@192.168.0.110" '
    now=$(date +%s)
    echo "CRON_110 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    echo "FAILED_UNITS_110 $(systemctl --failed --no-legend --plain 2>/dev/null | wc -l)"
    echo "MOMO_STARTUP_ENABLED $(systemctl is-enabled momo-startup-complete.service 2>/dev/null || true)"
    echo "STAGGERED_STARTUP_ENABLED $(systemctl is-enabled wooo-staggered-startup.service 2>/dev/null || true)"
    for f in /home/wooo/node_exporter_textfiles/docker_stats.prom /home/wooo/node_exporter_textfiles/systemd_units.prom /home/wooo/node_exporter_textfiles/storage_health.prom /home/wooo/node_exporter_textfiles/backup_health.prom; do
      if [ -f "$f" ]; then
        mt=$(stat -c %Y "$f")
        echo "TEXTFILE_110 $(basename "$f") age=$((now - mt))"
      else
        echo "TEXTFILE_110 $(basename "$f") missing"
      fi
    done
    if [ -f /home/wooo/node_exporter_textfiles/storage_health.prom ]; then
      awk "/^awoooi_host_storage_root_readonly/ {readonly=int(\$2)} /^awoooi_host_storage_current_boot_error_count/ {current=int(\$2)} END {printf \"STORAGE_HEALTH_110 root_readonly=%d current=%d\\n\", readonly+0, current+0}" /home/wooo/node_exporter_textfiles/storage_health.prom
    fi
    if [ -f /home/wooo/node_exporter_textfiles/backup_health.prom ]; then
      awk "/^awoooi_backup_job_fresh/ {total++; if (int(\$2) == 0) stale++} /^awoooi_backup_job_configured/ {if (int(\$2) == 0) missing_cron++} /^awoooi_backup_script_present/ {if (int(\$2) == 0) missing_script++} /^awoooi_backup_last_run_failed_count/ {if (\$0 ~ /(exported_job|job)=\"backup_all\"/) failed=int(\$2)} /^awoooi_backup_config_capture_critical_failed_count/ {config_failed=int(\$2)} /^awoooi_backup_integrity_fresh/ {integrity_total++; if (int(\$2) == 0) integrity_stale++} END {printf \"BACKUP_HEALTH_110 total=%d stale=%d missing_cron=%d missing_script=%d failed_count=%d config_failed=%d integrity_total=%d integrity_stale=%d\\n\", total+0, stale+0, missing_cron+0, missing_script+0, failed+0, config_failed+0, integrity_total+0, integrity_stale+0}" /home/wooo/node_exporter_textfiles/backup_health.prom
    fi
  ' 2>&1); then
    printf '%s\n' "$out"
    grep -q "CRON_110 active" <<<"$out" && ok "110 cron active" || warn "110 cron not confirmed"
    grep -q "FAILED_UNITS_110 0" <<<"$out" && ok "110 systemd has no failed units" || warn "110 systemd failed units remain"
    grep -q "MOMO_STARTUP_ENABLED disabled" <<<"$out" && ok "110 stale momo startup unit disabled" || warn "110 stale momo startup unit not disabled"
    grep -q "STAGGERED_STARTUP_ENABLED disabled" <<<"$out" && ok "110 stale staggered startup unit disabled" || warn "110 stale staggered startup unit not disabled"
    _cs_textfile_fresh "$out" TEXTFILE_110 docker_stats.prom 300 && ok "110 docker stats exporter fresh" || warn "110 docker stats exporter stale"
    _cs_textfile_fresh "$out" TEXTFILE_110 systemd_units.prom 300 && ok "110 systemd units exporter fresh" || warn "110 systemd units exporter stale"
    _cs_textfile_fresh "$out" TEXTFILE_110 storage_health.prom 300 && ok "110 storage health exporter fresh" || warn "110 storage health exporter stale"
    _cs_textfile_fresh "$out" TEXTFILE_110 backup_health.prom 900 && ok "110 backup health exporter fresh" || warn "110 backup health exporter stale"
    grep -q "STORAGE_HEALTH_110 root_readonly=0 current=0" <<<"$out" && ok "110 current boot storage health clean" || warn "110 storage health not clean"
    # Fields 3-5 (stale, missing_cron, missing_script) must sum to 0 and
    # fields 6-7 (failed_count, config_failed) must both be 0.
    grep -q "BACKUP_HEALTH_110 total=" <<<"$out" && awk '/BACKUP_HEALTH_110/ {split($3,a,"="); split($4,b,"="); split($5,c,"="); split($6,d,"="); split($7,e,"="); exit !((a[2]+b[2]+c[2]) == 0 && d[2] == 0 && e[2] == 0)}' <<<"$out" && ok "110 backup health has no stale expected jobs" || warn "110 latest aggregate/config backup had failed components; rerun backup-all after 120 recovers"
    # Field 9 is integrity_stale=; a missing BACKUP_HEALTH_110 line fails.
    awk '/BACKUP_HEALTH_110/ {found=1; split($9,a,"="); bad = !(a[2] == 0); exit} END {exit (found ? bad : 1)}' <<<"$out" && ok "110 backup integrity and restore drill fresh" || warn "110 backup integrity or restore drill stale"
  else
    warn "110 schedule check unavailable"
    printf '%s\n' "$out"
  fi

  # --- 120: cron plus Kubernetes CronJobs, Jobs and pods in awoooi-prod.
  # The embedded Python below must keep its own indentation (for at column 0).
  if out=$(host_cmd "wooo@192.168.0.120" '
    kcmd() {
      if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
        printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
      else
        sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
      fi
    }
    echo "CRON_120 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    kcmd get cronjobs -n awoooi-prod -o json | python3 -c "import json,sys; d=json.load(sys.stdin); items=d.get(\"items\", []); print(\"CRONJOB_COUNT\", len(items)); print(\"CRONJOB_SUSPENDED\", sum(1 for i in items if i.get(\"spec\",{}).get(\"suspend\")))"
    kcmd get jobs -n awoooi-prod -o json | python3 -c "import json,sys; d=json.load(sys.stdin); failed=0
for j in d.get(\"items\", []):
    if any(c.get(\"type\")==\"Failed\" and c.get(\"status\")==\"True\" for c in j.get(\"status\",{}).get(\"conditions\",[]) or []):
        failed += 1
print(\"FAILED_JOBS\", failed)"
    kcmd get pods -n awoooi-prod --no-headers 2>/dev/null | awk "\$3 !~ /^(Running|Completed)$/ {bad++} END {print \"BAD_PODS\", bad+0}"
  ' 2>&1); then
    printf '%s\n' "$out"
    grep -q "CRON_120 active" <<<"$out" && ok "120 cron active" || warn "120 cron not confirmed"
    _cs_number_ok "$out" CRONJOB_COUNT ge 4 && ok "K8s AWOOOI CronJobs present" || warn "K8s AWOOOI CronJobs missing"
    grep -q "CRONJOB_SUSPENDED 0" <<<"$out" && ok "K8s AWOOOI CronJobs unsuspended" || warn "K8s AWOOOI CronJob suspended"
    grep -q "FAILED_JOBS 0" <<<"$out" && ok "K8s AWOOOI has no failed Jobs" || warn "K8s AWOOOI failed Jobs remain"
    grep -q "BAD_PODS 0" <<<"$out" && ok "K8s AWOOOI pods Running/Completed only" || warn "K8s AWOOOI bad pod status remains"
  else
    warn "120 K8s schedule check unavailable"
    printf '%s\n' "$out"
  fi

  # --- 121: cron and the DR drill crontab entry.
  if out=$(host_cmd "wooo@192.168.0.121" '
    echo "CRON_121 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
    crontab -l 2>/dev/null | grep -q "dr-drill.sh" && echo "DR_DRILL_CRON present" || echo "DR_DRILL_CRON missing"
  ' 2>&1); then
    printf '%s\n' "$out"
    grep -q "CRON_121 active" <<<"$out" && ok "121 cron active" || warn "121 cron not confirmed"
    grep -q "DR_DRILL_CRON present" <<<"$out" && ok "121 DR drill cron present" || warn "121 DR drill cron missing"
  else
    warn "121 schedule check unavailable"
    printf '%s\n' "$out"
  fi
}

# Print the counters and the overall verdict.
# Exits the script with 2 when any gate is blocked and with 1 when only
# warnings remain; returns normally (GREEN) when both counters are zero.
summary() {
  log_section "SUMMARY"
  echo "PASS=$PASS WARN=$WARN BLOCKED=$FAIL"

  if [ "$FAIL" -gt 0 ]; then
    echo "Result: BLOCKED. Fix the first blocked gate before releasing runner/CD/AI auto-remediation."
    exit 2
  fi

  if [ "$WARN" -gt 0 ]; then
    echo "Result: DEGRADED. Core gates passed but warnings remain."
    exit 1
  fi

  echo "Result: GREEN. Full stack is ready for controlled runner/CD release."
}

# --watch driver: re-run this script without --watch until a run exits 0 or
# WATCH_MAX_ATTEMPTS is reached (0 means no limit), sleeping WATCH_INTERVAL
# seconds between runs. Never returns: it exits 0 on the first GREEN run, or
# with the last child's exit status once the attempts are used up.
run_watch_loop() {
  local attempt=1
  local rc=2
  local -a child_args
  while true; do
    echo "WATCH_ATTEMPT=$attempt"
    # Forward the mode flags; --watch itself is left out so the child runs once.
    child_args=()
    if [ "$MONITOR_READ_ONLY" -eq 1 ]; then
      child_args+=(--monitor-read-only)
    fi
    if [ "$NO_COLOR_FLAG" -eq 1 ]; then
      child_args+=(--no-color)
    fi
    if [ "$SEND_ALERT_TEST" -eq 1 ]; then
      child_args+=(--send-alert-test)
    fi
    # ${arr[@]+"${arr[@]}"} expands to nothing for an empty array. A bare
    # "${arr[@]}" is an unbound-variable error under `set -u` on bash < 4.4.
    bash "$0" ${child_args[@]+"${child_args[@]}"}
    rc=$?
    if [ "$rc" -eq 0 ]; then
      exit 0
    fi
    if [ "$WATCH_MAX_ATTEMPTS" -gt 0 ] && [ "$attempt" -ge "$WATCH_MAX_ATTEMPTS" ]; then
      exit "$rc"
    fi
    attempt=$((attempt + 1))
    sleep "$WATCH_INTERVAL"
  done
}

if [ "${WATCH_MODE:-0}" -eq 1 ]; then
  run_watch_loop
fi

# Single pass over every gate in the fixed order P0 (network, 188, 110),
# P1 (k3s), P2 (workload, public routes, schedules), followed by the verdict.
# summary() exits non-zero itself when a gate is blocked or degraded.
main() {
  print_header
  check_network
  check_188
  check_110
  check_k3s
  check_workload_and_alertchain
  check_public_routes
  check_schedules
  summary
}

main "$@"