awoooi/scripts/reboot-recovery/full-stack-cold-start-check.sh

#!/usr/bin/env bash
# AWOOOI full-stack cold-start readiness check.
# Read-only by design. It never restarts, deletes, repairs, or writes remote state.

# Unset variables are errors and a failing pipeline stage fails the pipeline.
# errexit is not enabled: the check functions count failures and keep going.
set -u -o pipefail

# Options handed to every ssh invocation.
SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=6)

# Command-line switches, 0 = off / 1 = on.
SEND_ALERT_TEST=0 MONITOR_READ_ONLY=0 NO_COLOR_FLAG=0 WATCH_MODE=0

# --watch tuning: seconds between attempts, and attempt cap (0 = unlimited).
WATCH_INTERVAL=60 WATCH_MAX_ATTEMPTS=30

# Print the command-line help text to stdout, one argument per output line.
usage() {
  printf '%s\n' \
    'Usage: bash scripts/reboot-recovery/full-stack-cold-start-check.sh [options]' \
    '' \
    'Options:' \
    '  --send-alert-test       POST one Alertmanager webhook test after AWOOOI API is ready.' \
    '  --monitor-read-only     Skip the webhook POST without warning; intended for cron/textfile monitors.' \
    '  --watch                 Repeat checks until all gates are GREEN or max attempts is reached.' \
    '  --interval SECONDS      Retry interval for --watch. Default: 60.' \
    '  --max-attempts COUNT    Max attempts for --watch. Default: 30. Use 0 for unlimited.' \
    '  --no-color              Disable ANSI colors in output.' \
    '  -h, --help              Show this help.' \
    '' \
    'Default mode is read-only and does not POST an Alertmanager test event.' \
    'Use --send-alert-test for the final release gate after AWOOOI API is expected to be ready.'
}

# Consume the command line one option at a time. Options that take a value
# shift once to reach it; the shift at the bottom of the loop then drops it.
# Any usage error exits with status 64.
while (( $# > 0 )); do
  arg="$1"
  case "$arg" in
    --send-alert-test) SEND_ALERT_TEST=1 ;;
    --monitor-read-only)
      # Monitor mode also cancels a --send-alert-test given earlier.
      MONITOR_READ_ONLY=1
      SEND_ALERT_TEST=0
      ;;
    --no-color) NO_COLOR_FLAG=1 ;;
    --watch) WATCH_MODE=1 ;;
    --interval)
      shift
      if [[ "${1:-}" =~ ^[0-9]+$ ]] && ! [ "${1:-0}" -lt 1 ]; then
        WATCH_INTERVAL="$1"
      else
        echo "--interval requires a positive integer number of seconds" >&2
        exit 64
      fi
      ;;
    --max-attempts)
      shift
      if [[ "${1:-}" =~ ^[0-9]+$ ]]; then
        WATCH_MAX_ATTEMPTS="$1"
      else
        echo "--max-attempts requires a non-negative integer" >&2
        exit 64
      fi
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $arg" >&2
      usage >&2
      exit 64
      ;;
  esac
  shift
done

# ANSI palette used by the log helpers. It is blanked when NO_COLOR is set to
# a non-empty value in the environment or --no-color was given.
if [ -n "${NO_COLOR:-}" ] || [ "$NO_COLOR_FLAG" -eq 1 ]; then
  RED="" GREEN="" YELLOW="" BLUE="" NC=""
else
  printf -v RED '\033[0;31m'
  printf -v GREEN '\033[0;32m'
  printf -v YELLOW '\033[1;33m'
  printf -v BLUE '\033[0;34m'
  printf -v NC '\033[0m'
fi

# Gate tallies, incremented by ok / warn / fail.
PASS=0 WARN=0 FAIL=0

# Print a section banner ("=== title ===") preceded by a blank line.
#   $1 - section title
log_section() {
  local title="$1"
  printf '\n%s\n' "${BLUE}=== ${title} ===${NC}"
}

# Report a passed gate and bump the PASS tally.
#   $1 - message printed after the OK tag
ok() {
  local note="$1"
  printf '%s\n' "${GREEN}OK${NC} ${note}"
  : "$((PASS += 1))"
}

# Report a non-blocking finding and bump the WARN tally.
#   $1 - message printed after the WARN tag
warn() {
  local note="$1"
  printf '%s\n' "${YELLOW}WARN${NC} ${note}"
  : "$((WARN += 1))"
}

# Report a blocking finding (printed as BLOCKED) and bump the FAIL tally.
#   $1 - message printed after the BLOCKED tag
fail() {
  local note="$1"
  printf '%s\n' "${RED}BLOCKED${NC} ${note}"
  : "$((FAIL += 1))"
}

# Run a local command, record it as a passed or blocked gate, then replay its
# combined stdout/stderr.
#   $1   - gate label passed to ok / fail
#   $2.. - command and arguments to execute
# Returns 0 when the command succeeded, 1 otherwise.
run_local() {
  local label="$1"
  shift
  local capture status=0

  # A private mktemp file replaces the former fixed /tmp path, which was
  # predictable, shared between concurrent runs, and never removed.
  if ! capture=$(mktemp "${TMPDIR:-/tmp}/awoooi-cold-start-check.XXXXXX"); then
    fail "$label"
    return 1
  fi

  "$@" >"$capture" 2>&1 || status=$?

  if [ "$status" -eq 0 ]; then
    ok "$label"
  else
    fail "$label"
  fi
  cat "$capture"
  rm -f -- "$capture"

  if [ "$status" -eq 0 ]; then
    return 0
  fi
  return 1
}

# Run a command string on a remote host over ssh using SSH_OPTS.
#   $1 - ssh destination (user@host)
#   $2 - command string executed remotely
# When REMOTE_SUDO_PASSWORD is non-empty it is shell-quoted with %q and
# prepended to the remote command as an environment assignment.
# NOTE(review): this places the password in the remote command line; confirm
# that is acceptable for the target hosts.
ssh_cmd() {
  local target="$1" remote_script="$2" env_prefix=""
  [ -z "${REMOTE_SUDO_PASSWORD:-}" ] \
    || printf -v env_prefix 'REMOTE_SUDO_PASSWORD=%q ' "$REMOTE_SUDO_PASSWORD"
  ssh "${SSH_OPTS[@]}" "$target" "${env_prefix}${remote_script}"
}

# Succeed when the local machine owns the given IPv4 address.
#   $1 - dotted-quad address to look for
# Tries `ip -o -4 addr show` first (address is field 4, with a /prefix) and
# falls back to `hostname -I`.
host_has_ip() {
  local expected_ip="$1"
  if command -v ip >/dev/null 2>&1; then
    # Exact string comparison: the address is never used as a regex, so "."
    # cannot match arbitrary characters. awk also reads all of its input,
    # which keeps the pipeline status clean under `set -o pipefail`.
    ip -o -4 addr show 2>/dev/null \
      | awk -v want="$expected_ip" '
          { split($4, cidr, "/"); if ((cidr[1] "") == (want "")) hit = 1 }
          END { exit !hit }
        ' \
      && return 0
  fi
  hostname -I 2>/dev/null | tr ' ' '\n' | grep -Fqx -- "$expected_ip"
}

# Run a command string against one of the stack hosts.
#   $1 - destination in user@host form
#   $2 - command string
# For the four known addresses (110/120/121/188) the command runs locally via
# `bash -lc` when this machine owns that address; in every other case it is
# sent through ssh_cmd.
host_cmd() {
  local target="$1" script="$2"
  case "$target" in
    *@192.168.0.110 | *@192.168.0.120 | *@192.168.0.121 | *@192.168.0.188)
      # Everything after the last "@" is the address the pattern matched.
      if host_has_ip "${target##*@}"; then
        bash -lc "$script"
        return
      fi
      ;;
  esac
  ssh_cmd "$target" "$script"
}

# Print the HTTP status code for a URL, trying up to two times.
#   $1 - URL to request
# A three-digit code other than 000 is printed immediately. Otherwise the
# function sleeps one second after each attempt and finally prints the last
# value seen, or 000 when curl produced nothing.
probe_http_code() {
  local url="$1"
  local tries_left=2 status=""
  while [ "$tries_left" -gt 0 ]; do
    status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 12 "$url" 2>/dev/null || true)
    case "$status" in
      000) ;;
      [0-9][0-9][0-9])
        echo "$status"
        return
        ;;
    esac
    sleep 1
    tries_left=$((tries_left - 1))
  done
  echo "${status:-000}"
}

# Succeed when a TCP connection to host:port can be opened.
#   $1 - host, $2 - port
# First tries `nc -G 3`; if that invocation fails for any reason, retries
# with `nc -w 3`. All nc output is discarded.
probe_tcp() {
  local host="$1" port="$2"
  if nc -G 3 -z "$host" "$port" >/dev/null 2>&1; then
    return 0
  fi
  nc -w 3 -z "$host" "$port" >/dev/null 2>&1
}

# Print neighbour-table rows that mention 192.168.0.110/120/121/188.
# Uses `arp -an` when arp exists, otherwise `ip neigh show`. Returns the
# status of that pipeline, or 1 when neither tool is available.
print_neighbor_rows() {
  local wanted='192\.168\.0\.(110|120|121|188)'
  if command -v arp >/dev/null 2>&1; then
    arp -an | grep -E "$wanted"
  elif command -v ip >/dev/null 2>&1; then
    ip neigh show | grep -E "$wanted"
  else
    return 1
  fi
}

# Print the report banner: title, current local timestamp, the host scope,
# and the path of the baseline document.
print_header() {
  printf '%s\n' "AWOOOI full-stack cold-start check"
  date '+%Y-%m-%d %H:%M:%S %Z'
  printf '%s\n' \
    "Scope: 110 / 120 / 121 / 188. 112 Kali is intentionally skipped." \
    "Baseline: ops/reboot-recovery/full-stack-cold-start-baseline.yml"
}

# P0 gate: basic reachability of the four stack hosts.
# For each host, a single ping and a TCP probe of port 22 are each recorded
# as ok or fail. Neighbour-table rows are then printed as supporting
# evidence; their absence is a warning, or an ok in monitor mode.
check_network() {
  log_section "P0-NETWORK"
  local octet addr
  for octet in 110 120 121 188; do
    addr="192.168.0.$octet"

    if ! ping -c 1 -W 2 "$addr" >/dev/null 2>&1; then
      fail "ping $addr"
    else
      ok "ping $addr"
    fi

    if ! probe_tcp "$addr" 22; then
      fail "ssh port $addr:22"
    else
      ok "ssh port $addr:22"
    fi
  done

  if print_neighbor_rows; then
    ok "neighbor evidence printed"
    return
  fi
  if [ "$MONITOR_READ_ONLY" -eq 1 ]; then
    ok "neighbor evidence unavailable in monitor mode; ping and TCP gates provide primary signal"
  else
    warn "no neighbor rows printed for one or more hosts"
  fi
}

# P0 gate for host 188 (data tier).
# Runs one read-only script on the host that prints tagged lines (HOST, MEM,
# SYSTEMD, PG, REDIS, PORT5432, SIGNOZ_CODE, MOMO_HEALTH_CODE, DOCKER ...),
# echoes that report, then grades it. PostgreSQL findings are blocking; the
# remaining findings are warnings.
check_188() {
  log_section "P0-188-DATA"
  local report
  report=$(host_cmd "ollama@192.168.0.188" '
echo "HOST $(hostname) $(uptime)"
echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
echo "SYSTEMD $(systemctl is-active containerd docker postgresql@14-main redis-server ollama nginx 2>/dev/null | tr "\n" " ")"
echo "PG $(pg_isready -h localhost -p 5432 2>&1)"
echo "REDIS $(redis-cli -p 6380 ping 2>/dev/null || redis-cli ping 2>/dev/null || true)"
echo "PORT5432 $(nc -z -w 2 127.0.0.1 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
echo "SIGNOZ_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3301/ || true)"
echo "MOMO_HEALTH_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5003/health || true)"
docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -80
' 2>&1) || {
    # The host could not be queried at all: record one blocking finding.
    fail "ssh 188 read-only check"
    echo "$report"
    return
  }
  echo "$report"

  if grep -q "PORT5432 OPEN" <<<"$report"; then
    ok "188 PostgreSQL port open"
  else
    fail "188 PostgreSQL port closed"
  fi

  if grep -q "accepting connections" <<<"$report"; then
    ok "188 PostgreSQL accepting connections"
  else
    fail "188 PostgreSQL not accepting connections"
  fi

  if grep -q "REDIS PONG" <<<"$report"; then
    ok "188 Redis PONG"
  else
    warn "188 Redis not confirmed"
  fi

  if grep -q "momo-db.*Restarting" <<<"$report"; then
    warn "188 momo-db restarting"
  else
    ok "188 momo-db not in visible restart loop"
  fi

  if grep -Eq "SIGNOZ_CODE (200|302|307)" <<<"$report"; then
    ok "188 SignOz HTTP reachable"
  else
    warn "188 SignOz HTTP not confirmed"
  fi

  if grep -q "MOMO_HEALTH_CODE 200" <<<"$report"; then
    ok "188 momo health reachable"
  else
    warn "188 momo health not confirmed"
  fi
}

# P0 gate for host 110 (registry and observability).
# Runs one read-only script on the host that prints HTTP status codes for
# Harbor, Gitea, Prometheus, Alertmanager and Sentry, properties of every
# actions.runner.* unit, and docker container statuses. Only an unhealthy
# Harbor (or an unreachable host) is blocking; the rest are warnings.
check_110() {
  log_section "P0-110-REGISTRY-OBSERVABILITY"
  local report
  report=$(host_cmd "wooo@192.168.0.110" '
echo "HOST $(hostname) $(uptime)"
echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
echo "DOCKER_SYSTEMD $(systemctl is-active docker 2>/dev/null || true)"
echo "HARBOR_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5000/v2/ || true)"
echo "GITEA_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3001/ || true)"
echo "PROM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9090/-/ready || true)"
echo "AM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9093/-/healthy || true)"
echo "SENTRY_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 8 http://127.0.0.1:9000/ || true)"
for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
  systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /"
done
docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120
' 2>&1) || {
    # The host could not be queried at all: record one blocking finding.
    fail "ssh 110 read-only check"
    echo "$report"
    return
  }
  echo "$report"

  if grep -Eq "HARBOR_CODE (200|401)" <<<"$report"; then
    ok "110 Harbor /v2 healthy code"
  else
    fail "110 Harbor not healthy"
  fi

  if grep -Eq "GITEA_CODE (200|302)" <<<"$report"; then
    ok "110 Gitea reachable"
  else
    warn "110 Gitea not confirmed"
  fi

  if grep -q "PROM_CODE 200" <<<"$report"; then
    ok "110 Prometheus ready"
  else
    warn "110 Prometheus not ready"
  fi

  if grep -q "AM_CODE 200" <<<"$report"; then
    ok "110 Alertmanager healthy"
  else
    warn "110 Alertmanager not healthy"
  fi

  if grep -Eq "SENTRY_CODE (200|302|400)" <<<"$report"; then
    ok "110 Sentry HTTP reachable"
  else
    warn "110 Sentry HTTP not confirmed"
  fi

  if grep -q "WatchdogUSec=0" <<<"$report"; then
    ok "runner watchdog disabled on at least one unit"
  else
    warn "runner watchdog state not confirmed"
  fi

  if grep -q "sentry-self-hosted-clickhouse-1.*Restarting" <<<"$report"; then
    warn "Sentry ClickHouse restarting"
  else
    ok "Sentry ClickHouse not visibly restarting"
  fi
}

# P1 gate for the K3s cluster, queried from host 120.
# The remote script prints the reachability of 188:5432, systemd unit states,
# `kubectl get nodes/pods`, node-condition counters computed by an inline
# python3 program, a count of filesystem-error node events, and any interface
# line carrying 192.168.0.125. When the remote report shows no " Ready " node
# row, a local `kubectl get nodes` is tried as a fallback source.
check_k3s() {
  log_section "P1-K3S"
  local report fallback_nodes=""
  report=$(host_cmd "wooo@192.168.0.120" '
echo "HOST $(hostname) $(uptime)"
echo "PG188_PORT $(nc -z -w 2 192.168.0.188 5432 >/dev/null 2>&1 && echo OPEN || echo CLOSED)"
echo "SYSTEMD $(systemctl is-active k3s k3s-agent keepalived 2>/dev/null | tr "\n" " ")"
kcmd() {
  if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
    printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
  else
    sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
  fi
}
kcmd get nodes -o wide 2>/dev/null || true
kcmd get pods -n awoooi-prod -o wide 2>/dev/null || true
node_condition_summary=$(kcmd get nodes -o json 2>/dev/null | python3 -c "import json,sys
try:
  d=json.load(sys.stdin)
except Exception:
  d={\"items\": []}
not_ready=readonly=disk_pressure=0
for node in d.get(\"items\", []):
  conds={c.get(\"type\"): c.get(\"status\") for c in node.get(\"status\",{}).get(\"conditions\",[]) or []}
  if conds.get(\"Ready\") != \"True\":
    not_ready += 1
  if conds.get(\"ReadonlyFilesystem\") == \"True\":
    readonly += 1
  if conds.get(\"DiskPressure\") == \"True\":
    disk_pressure += 1
print(f\"NODE_NOT_READY {not_ready}\")
print(f\"NODE_READONLY_FILESYSTEM_TRUE {readonly}\")
print(f\"NODE_DISK_PRESSURE_TRUE {disk_pressure}\")" || true)
printf "%s\n" "$node_condition_summary"
node_fs_events=$(kcmd get events -A --field-selector involvedObject.kind=Node --sort-by=.lastTimestamp 2>/dev/null \
  | grep -Eiv "InvalidDiskCapacity|image filesystem" \
  | grep -Eic "fsck|I/O error|read-only file system|Structure needs cleaning|orphan linked list|EXT4-fs.*error|XFS.*(corruption|metadata)|Remounting filesystem read-only" || true)
echo "NODE_FS_ERROR_EVENTS ${node_fs_events:-0}"
ip addr show | grep 192.168.0.125 || true
' 2>&1) || {
    # The host could not be queried at all: record one blocking finding.
    fail "ssh 120 k3s read-only check"
    echo "$report"
    return
  }
  echo "$report"

  # Fall back to a local kubectl only when the remote report has no Ready row.
  if ! grep -q " Ready " <<<"$report"; then
    fallback_nodes=$(kubectl get nodes -o wide 2>/dev/null || true)
    if [ -n "$fallback_nodes" ]; then
      echo "LOCAL_KUBECTL_FALLBACK"
      echo "$fallback_nodes"
    fi
  fi

  if grep -q "PG188_PORT OPEN" <<<"$report"; then
    ok "120 can reach 188 PostgreSQL port"
  else
    fail "120 cannot reach 188 PostgreSQL"
  fi

  if grep -q " Ready " <<<"$report$fallback_nodes"; then
    ok "K3s has Ready node output"
  else
    fail "K3s nodes not Ready or kubectl unavailable"
  fi

  if grep -q "NODE_NOT_READY 0" <<<"$report"; then
    ok "K3s node Ready condition clean"
  else
    fail "K3s node Ready condition not clean"
  fi

  # All three storage counters must be zero for the storage gate to pass.
  if ! grep -q "NODE_FS_ERROR_EVENTS 0" <<<"$report" \
    || ! grep -q "NODE_READONLY_FILESYSTEM_TRUE 0" <<<"$report" \
    || ! grep -q "NODE_DISK_PRESSURE_TRUE 0" <<<"$report"; then
    fail "K3s node storage condition or severe filesystem event present"
  else
    ok "K3s node storage conditions clean"
  fi

  if grep -q "192.168.0.125" <<<"$report"; then
    ok "VIP 192.168.0.125 present on 120"
  else
    warn "VIP not confirmed on 120"
  fi
}

# P2 gate: AWOOOI API/Web reachability through the VIP, plus the optional
# Alertmanager webhook POST.
# The HTTP codes are collected from host 120; if that host cannot be queried
# they are probed from this machine instead. The webhook POST happens only
# when SEND_ALERT_TEST is 1; otherwise the skip is recorded as ok in monitor
# mode and as a warning in the default mode.
check_workload_and_alertchain() {
  log_section "P2-WORKLOAD-ALERTCHAIN"
  local api_status web_status hook_status
  local report
  if report=$(host_cmd "wooo@192.168.0.120" '
api_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32334/api/v1/health 2>/dev/null || true)
web_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://192.168.0.125:32335/ 2>/dev/null || true)
echo "API_CODE ${api_code:-000}"
echo "WEB_CODE ${web_code:-000}"
' 2>/dev/null); then
    api_status=$(awk '/^API_CODE / {print $2}' <<<"$report")
    web_status=$(awk '/^WEB_CODE / {print $2}' <<<"$report")
  else
    # Host 120 unavailable: probe directly and synthesise the same report.
    api_status=$(probe_http_code "http://192.168.0.125:32334/api/v1/health")
    web_status=$(probe_http_code "http://192.168.0.125:32335/")
    printf -v report 'API_CODE %s\nWEB_CODE %s' "$api_status" "$web_status"
  fi

  echo "$report"

  if [[ "$api_status" =~ ^[23] ]]; then
    ok "AWOOOI API reachable"
  else
    fail "AWOOOI API not reachable"
  fi

  if [[ "$web_status" =~ ^[23] ]]; then
    ok "AWOOOI Web reachable"
  else
    warn "AWOOOI Web not confirmed"
  fi

  if [ "$SEND_ALERT_TEST" -ne 1 ]; then
    if [ "$MONITOR_READ_ONLY" -eq 1 ]; then
      ok "Alertmanager webhook POST intentionally skipped in read-only monitor mode"
    else
      warn "Alertmanager webhook POST skipped; rerun with --send-alert-test after API is ready"
    fi
    return
  fi

  hook_status=$(host_cmd "wooo@192.168.0.120" 'curl -s -o /tmp/awoooi-alertchain.out -w "%{http_code}" --max-time 8 \
      -X POST "http://192.168.0.125:32334/api/v1/webhooks/alertmanager" \
      -H '"'"'Content-Type: application/json'"'"' \
      -d '"'"'{"receiver":"cold-start-check","status":"firing","alerts":[{"status":"firing","labels":{"alertname":"ColdStartCheck","severity":"info"},"annotations":{"summary":"Cold start check"},"startsAt":"2026-05-05T11:00:00Z","endsAt":"0001-01-01T00:00:00Z","generatorURL":""}],"groupLabels":{},"commonLabels":{},"commonAnnotations":{},"externalURL":"","version":"4","groupKey":"cold-start-check"}'"'"' 2>/dev/null || echo "000"')
  echo "ALERTCHAIN_CODE $hook_status"
  if [[ "$hook_status" =~ ^2 ]]; then
    ok "Alertmanager webhook endpoint accepts POST"
  else
    warn "Alertmanager webhook E2E not confirmed"
  fi
}

# P2 gate: public HTTPS routes.
# For every "name|url" entry two findings are recorded:
#   * reachability - a 2xx/3xx status from probe_http_code, else a warning;
#   * TLS          - whether a certificate-verifying HTTPS request obtained
#                    any HTTP response at all, else a blocking finding.
check_public_routes() {
  log_section "P2-PUBLIC-ROUTES"
  local item name url code tls_code
  local routes=(
    "awoooi_api|https://awoooi.wooo.work/api/v1/health"
    "awoooi_web|https://awoooi.wooo.work/"
    "momo_web|https://mo.wooo.work/"
    "momo_health|https://mo.wooo.work/health"
    "gitea|https://gitea.wooo.work/"
    "harbor|https://harbor.wooo.work/"
    "registry|https://registry.wooo.work/"
    "sentry|https://sentry.wooo.work/"
    "signoz|https://signoz.wooo.work/"
    "stock|https://stock.wooo.work/"
    "langfuse|https://langfuse.wooo.work/"
    "bitan|https://bitan.wooo.work/"
    "aiops|https://aiops.wooo.work/"
  )

  for item in "${routes[@]}"; do
    name="${item%%|*}"
    url="${item#*|}"
    code=$(probe_http_code "$url")
    echo "PUBLIC_ROUTE $name $code $url"
    if [[ "$code" =~ ^[23] ]]; then
      ok "public route $name reachable"
    else
      warn "public route $name not confirmed"
    fi

    tls_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 8 "$url" 2>/dev/null || true)
    tls_code="${tls_code:-000}"
    echo "PUBLIC_ROUTE_TLS $name $tls_code $url"
    # curl is run without -k, so it verifies the certificate before any HTTP
    # exchange. Any real status code (401/404/5xx included) therefore proves
    # the handshake succeeded; only "000" (no response) leaves TLS
    # unconfirmed. Requiring 2xx/3xx here mislabelled application errors as
    # certificate failures.
    if [[ "$tls_code" =~ ^[0-9]{3}$ ]] && [ "$tls_code" != "000" ]; then
      ok "public route $name TLS certificate verified"
    else
      fail "public route $name TLS certificate verification failed"
    fi
  done
}

# Succeed only when TEXT contains a "<tag> <file> age=<seconds>" row whose age
# is below LIMIT. A "<tag> <file> missing" row, or no row at all, is treated
# as not fresh.
#   $1 - report text, $2 - row tag, $3 - textfile name, $4 - age limit (s)
_textfile_age_under() {
  local text="$1" tag="$2" file="$3" limit="$4"
  awk -v tag="$tag" -v file="$file" -v limit="$limit" '
    $1 == tag && $2 == file && $3 ~ /^age=/ {
      split($3, kv, "=")
      seen = 1
      fresh = (kv[2] + 0 < limit + 0)
      exit
    }
    END { exit !(seen && fresh) }
  ' <<<"$text"
}

# P2 gate: schedulers, exporters and backup freshness on 188, 110, 120, 121.
# Each host is queried with one read-only script that prints tagged rows; the
# rows are echoed and then graded. A host that cannot be queried yields a
# single "schedule check unavailable" warning.
# Gates that depend on a numeric row (textfile ages, BACKUP_110_AGE, the 110
# backup-integrity counter, CRONJOB_COUNT) require the row to be present; an
# absent row is never counted as a pass.
check_schedules() {
  log_section "P2-SCHEDULES"
  local out

  if out=$(host_cmd "ollama@192.168.0.188" '
now=$(date +%s)
echo "CRON_188 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
for f in /home/ollama/node_exporter_textfiles/backup.prom /home/ollama/node_exporter_textfiles/backup_health.prom /home/ollama/node_exporter_textfiles/docker_restart_count.prom /home/ollama/node_exporter_textfiles/docker_stats.prom /home/ollama/node_exporter_textfiles/storage_health.prom; do
  if [ -f "$f" ]; then
    mt=$(stat -c %Y "$f")
    echo "TEXTFILE_188 $(basename "$f") age=$((now - mt))"
  else
    echo "TEXTFILE_188 $(basename "$f") missing"
  fi
done
if [ -f /home/ollama/node_exporter_textfiles/backup.prom ]; then
  awk -v now="$now" "/^backup_110_last_success_timestamp / {printf \"BACKUP_110_AGE %d\\n\", now - int(\$2)}" /home/ollama/node_exporter_textfiles/backup.prom
fi
if [ -f /home/ollama/node_exporter_textfiles/backup_health.prom ]; then
  awk "/^awoooi_backup_job_fresh/ {total++; if (int(\$2) == 0) stale++} /^awoooi_backup_job_configured/ {if (int(\$2) == 0) missing_cron++} /^awoooi_backup_script_present/ {if (int(\$2) == 0) missing_script++} END {printf \"BACKUP_HEALTH_188 total=%d stale=%d missing_cron=%d missing_script=%d\\n\", total+0, stale+0, missing_cron+0, missing_script+0}" /home/ollama/node_exporter_textfiles/backup_health.prom
fi
if [ -f /home/ollama/node_exporter_textfiles/storage_health.prom ]; then
  awk "/^awoooi_host_storage_root_readonly/ {readonly=int(\$2)} /^awoooi_host_storage_current_boot_error_count/ {current=int(\$2)} END {printf \"STORAGE_HEALTH_188 root_readonly=%d current=%d\\n\", readonly+0, current+0}" /home/ollama/node_exporter_textfiles/storage_health.prom
fi
echo "SCHEDULER_CONTAINER_RUNNING $(docker inspect -f "{{.State.Running}}" momo-scheduler 2>/dev/null || true)"
echo "SCHEDULER_CONTAINER_HEALTH $(docker inspect -f "{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}" momo-scheduler 2>/dev/null || true)"
echo "SCHEDULER_REGISTERED $(docker logs --tail 400 momo-scheduler 2>&1 | grep -Ec "全部排程任務已註冊|排程任務已註冊|Scheduler started|APScheduler" || true)"
echo "SCHEDULER_RECENT_ACTIVITY $(docker logs --since 2h momo-scheduler 2>&1 | grep -Ec "AutoImport|Meta-Analysis|Scheduler|排程|任務|批次 [0-9]+: 取得|\\[Feeder\\]|HITL|候選屬" || true)"
echo "MOMO_SOURCE_EMPTY_EVIDENCE_LINES $(docker logs --since 6h momo-scheduler 2>&1 | grep -Ec "找到 0 個 Excel|沒有找到待匯入" || true)"
token_stat=$(stat -c "%u:%g:%a" /home/ollama/momo-pro/config/google_token.json 2>/dev/null || true)
scheduler_uid=$(docker top momo-scheduler -eo pid,user,uid 2>/dev/null | awk "NR==2 {print \$3}" || true)
echo "MOMO_GDRIVE_TOKEN_STAT ${token_stat:-missing} scheduler_uid=${scheduler_uid:-unknown}"
db_user=$(docker exec momo-pro-system printenv POSTGRES_USER 2>/dev/null || true)
db_name=$(docker exec momo-pro-system printenv POSTGRES_DB 2>/dev/null || true)
db_pass=$(docker exec momo-pro-system printenv POSTGRES_PASSWORD 2>/dev/null || true)
if [ -n "$db_user" ] && [ -n "$db_name" ] && [ -n "$db_pass" ]; then
  momo_sync=$(docker exec -e PGPASSWORD="$db_pass" -e PGCONNECT_TIMEOUT=5 momo-db psql -h 127.0.0.1 -U "$db_user" -d "$db_name" -Atc "WITH scope AS (SELECT min(snapshot_date::date) dmin, max(snapshot_date::date) dmax, count(*) sc FROM daily_sales_snapshot WHERE snapshot_date::date >= make_date(extract(year from current_date)::int, extract(month from current_date)::int, 1)), monthly AS (SELECT count(*) mc, min(\"日期\"::date) mmin, max(\"日期\"::date) mmax FROM realtime_sales_monthly, scope WHERE scope.sc > 0 AND \"日期\"::date BETWEEN scope.dmin AND scope.dmax) SELECT coalesce(scope.sc,0)::text || chr(124) || coalesce(monthly.mc,0)::text || chr(124) || coalesce(scope.dmin::text,chr(45)) || chr(124) || coalesce(scope.dmax::text,chr(45)) || chr(124) || coalesce(monthly.mmin::text,chr(45)) || chr(124) || coalesce(monthly.mmax::text,chr(45)) FROM scope, monthly;" 2>/dev/null || true)
  momo_freshness=$(docker exec -e PGPASSWORD="$db_pass" -e PGCONNECT_TIMEOUT=5 momo-db psql -h 127.0.0.1 -U "$db_user" -d "$db_name" -Atc "SELECT coalesce((current_date - max(snapshot_date::date))::text, chr(45)) || chr(124) || coalesce(max(snapshot_date::date)::text, chr(45)) FROM daily_sales_snapshot;" 2>/dev/null || true)
  momo_import_config=$(docker exec -e PGPASSWORD="$db_pass" -e PGCONNECT_TIMEOUT=5 momo-db psql -h 127.0.0.1 -U "$db_user" -d "$db_name" -Atc "SELECT config_key || chr(61) || config_value FROM import_config;" 2>/dev/null | awk -F= "\$1 == \"gdrive_folder_path\" {folder=\$2} \$1 == \"gdrive_file_pattern\" {pattern=\$2} END {if (folder || pattern) print folder \"|\" pattern}" || true)
  momo_latest_import_job=$(docker exec -e PGPASSWORD="$db_pass" -e PGCONNECT_TIMEOUT=5 momo-db psql -h 127.0.0.1 -U "$db_user" -d "$db_name" -Atc "SELECT coalesce(id::text, chr(45)) || chr(124) || coalesce(job_type, chr(45)) || chr(124) || coalesce(status, chr(45)) || chr(124) || coalesce(drive_file_name, chr(45)) || chr(124) || coalesce(replace(created_at::text, chr(32), chr(84)), chr(45)) || chr(124) || coalesce(replace(completed_at::text, chr(32), chr(84)), chr(45)) || chr(124) || coalesce(total_rows::text, chr(45)) || chr(124) || coalesce(success_rows::text, chr(45)) || chr(124) || coalesce(error_rows::text, chr(45)) FROM import_jobs ORDER BY created_at DESC LIMIT 20;" 2>/dev/null | awk "BEGIN {FS=sprintf(\"%c\",124)} \$2 == \"daily_sales\" {print \$1 \"|\" \$3 \"|\" \$4 \"|\" \$5 \"|\" \$6 \"|\" \$7 \"|\" \$8 \"|\" \$9; exit}" || true)
else
  momo_sync=""
  momo_freshness=""
  momo_import_config=""
  momo_latest_import_job=""
fi
echo "MOMO_MONTHLY_SYNC ${momo_sync:-unavailable}"
echo "MOMO_DAILY_FRESHNESS ${momo_freshness:-unavailable}"
echo "MOMO_IMPORT_CONFIG ${momo_import_config:-unavailable}"
echo "MOMO_LATEST_IMPORT_JOB ${momo_latest_import_job:-unavailable}"
' 2>&1); then
    echo "$out"
    grep -q "CRON_188 active" <<<"$out" && ok "188 cron active" || warn "188 cron not confirmed"
    # Textfile gates: a "missing" row or an absent row must warn, not pass.
    _textfile_age_under "$out" "TEXTFILE_188" "backup.prom" 90000 && ok "188 backup textfile fresh enough" || warn "188 backup textfile stale or missing"
    _textfile_age_under "$out" "TEXTFILE_188" "backup_health.prom" 900 && ok "188 backup health exporter fresh" || warn "188 backup health exporter stale"
    _textfile_age_under "$out" "TEXTFILE_188" "docker_restart_count.prom" 300 && ok "188 docker restart exporter fresh" || warn "188 docker restart exporter stale"
    _textfile_age_under "$out" "TEXTFILE_188" "docker_stats.prom" 300 && ok "188 docker stats exporter fresh" || warn "188 docker stats exporter stale"
    _textfile_age_under "$out" "TEXTFILE_188" "storage_health.prom" 300 && ok "188 storage health exporter fresh" || warn "188 storage health exporter stale"
    grep -q "STORAGE_HEALTH_188 root_readonly=0 current=0" <<<"$out" && ok "188 current boot storage health clean" || warn "188 storage health not clean"
    # BACKUP_110_AGE is only printed when the metric exists; require the row.
    awk '/BACKUP_110_AGE / {seen=1; recent=($2 < 90000); exit} END {exit !(seen && recent)}' <<<"$out" && ok "188 backup-from-110 success within 25h" || warn "188 backup-from-110 success not confirmed"
    grep -q "BACKUP_HEALTH_188 total=" <<<"$out" && awk '/BACKUP_HEALTH_188/ {split($3,a,"="); split($4,b,"="); split($5,c,"="); exit !((a[2]+b[2]+c[2]) == 0)}' <<<"$out" && ok "188 backup health has no stale expected jobs" || warn "188 backup health has stale expected jobs"
    if grep -q "SCHEDULER_CONTAINER_HEALTH healthy" <<<"$out" && awk '/SCHEDULER_RECENT_ACTIVITY / {exit !($2 > 0)}' <<<"$out"; then
      ok "188 momo scheduler healthy with recent task activity"
    elif awk '/SCHEDULER_REGISTERED / {exit !($2 > 0)}' <<<"$out"; then
      ok "188 momo scheduler registered jobs"
    else
      warn "188 momo scheduler registration/activity not confirmed"
    fi
    awk '/MOMO_GDRIVE_TOKEN_STAT / {split($2,a,":"); split($3,b,"="); exit !(a[1] == b[2] && a[3] <= 600)}' <<<"$out" && ok "188 momo Google Drive token ownership matches scheduler userns" || warn "188 momo Google Drive token ownership/writeback not confirmed"
    grep -Fq "MOMO_IMPORT_CONFIG 當日業績匯入|即時業績_當日" <<<"$out" && ok "188 momo Drive import config points to expected daily-sales intake" || fail "188 momo Drive import config drifted from expected daily-sales intake"
    awk '/MOMO_LATEST_IMPORT_JOB / {split($2,a,"|"); exit !(a[1] ~ /^[0-9]+$/ && a[2] == "completed" && a[6] == a[7] && a[8] == 0)}' <<<"$out" && ok "188 momo latest daily import job completed cleanly" || warn "188 momo latest daily import job not confirmed clean"
    awk '/MOMO_MONTHLY_SYNC / {split($2,a,"|"); exit !(a[1] > 0 && a[1] == a[2] && a[3] == a[5] && a[4] == a[6])}' <<<"$out" && ok "188 momo current-month snapshot and realtime tables match" || warn "188 momo current-month snapshot/realtime sync not confirmed"
    if awk '/MOMO_DAILY_FRESHNESS / {split($2,a,"|"); exit !(a[1] ~ /^[0-9]+$/ && a[1] >= 0 && a[1] <= 2)}' <<<"$out"; then
      ok "188 momo daily sales data fresh enough"
    elif awk '/MOMO_DAILY_FRESHNESS / {split($2,a,"|"); exit !(a[1] ~ /^[0-9]+$/ && a[1] >= 3)}' <<<"$out"; then
      if awk '/MOMO_SOURCE_EMPTY_EVIDENCE_LINES / {exit !($2 > 0)}' <<<"$out"; then
        fail "188 momo source file absent while daily sales data stale"
      else
        fail "188 momo daily sales data stale beyond 3 days"
      fi
    else
      warn "188 momo daily sales freshness not confirmed"
    fi
  else
    warn "188 schedule check unavailable"
    echo "$out"
  fi

  if out=$(host_cmd "wooo@192.168.0.110" '
now=$(date +%s)
echo "CRON_110 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
echo "FAILED_UNITS_110 $(systemctl --failed --no-legend --plain 2>/dev/null | wc -l)"
echo "MOMO_STARTUP_ENABLED $(systemctl is-enabled momo-startup-complete.service 2>/dev/null || true)"
echo "STAGGERED_STARTUP_ENABLED $(systemctl is-enabled wooo-staggered-startup.service 2>/dev/null || true)"
for f in /home/wooo/node_exporter_textfiles/docker_stats.prom /home/wooo/node_exporter_textfiles/systemd_units.prom /home/wooo/node_exporter_textfiles/storage_health.prom /home/wooo/node_exporter_textfiles/backup_health.prom; do
  if [ -f "$f" ]; then
    mt=$(stat -c %Y "$f")
    echo "TEXTFILE_110 $(basename "$f") age=$((now - mt))"
  else
    echo "TEXTFILE_110 $(basename "$f") missing"
  fi
done
if [ -f /home/wooo/node_exporter_textfiles/storage_health.prom ]; then
  awk "/^awoooi_host_storage_root_readonly/ {readonly=int(\$2)} /^awoooi_host_storage_current_boot_error_count/ {current=int(\$2)} END {printf \"STORAGE_HEALTH_110 root_readonly=%d current=%d\\n\", readonly+0, current+0}" /home/wooo/node_exporter_textfiles/storage_health.prom
fi
if [ -f /home/wooo/node_exporter_textfiles/backup_health.prom ]; then
  awk "/^awoooi_backup_job_fresh/ {total++; if (int(\$2) == 0) stale++} /^awoooi_backup_job_configured/ {if (int(\$2) == 0) missing_cron++} /^awoooi_backup_script_present/ {if (int(\$2) == 0) missing_script++} /^awoooi_backup_last_run_failed_count/ {if (\$0 ~ /(exported_job|job)=\"backup_all\"/) failed=int(\$2)} /^awoooi_backup_config_capture_critical_failed_count/ {config_failed=int(\$2)} /^awoooi_backup_integrity_fresh/ {integrity_total++; if (int(\$2) == 0) integrity_stale++} END {printf \"BACKUP_HEALTH_110 total=%d stale=%d missing_cron=%d missing_script=%d failed_count=%d config_failed=%d integrity_total=%d integrity_stale=%d\\n\", total+0, stale+0, missing_cron+0, missing_script+0, failed+0, config_failed+0, integrity_total+0, integrity_stale+0}" /home/wooo/node_exporter_textfiles/backup_health.prom
fi
' 2>&1); then
    echo "$out"
    grep -q "CRON_110 active" <<<"$out" && ok "110 cron active" || warn "110 cron not confirmed"
    grep -q "FAILED_UNITS_110 0" <<<"$out" && ok "110 systemd has no failed units" || warn "110 systemd failed units remain"
    grep -q "MOMO_STARTUP_ENABLED disabled" <<<"$out" && ok "110 stale momo startup unit disabled" || warn "110 stale momo startup unit not disabled"
    grep -q "STAGGERED_STARTUP_ENABLED disabled" <<<"$out" && ok "110 stale staggered startup unit disabled" || warn "110 stale staggered startup unit not disabled"
    _textfile_age_under "$out" "TEXTFILE_110" "docker_stats.prom" 300 && ok "110 docker stats exporter fresh" || warn "110 docker stats exporter stale"
    _textfile_age_under "$out" "TEXTFILE_110" "systemd_units.prom" 300 && ok "110 systemd units exporter fresh" || warn "110 systemd units exporter stale"
    _textfile_age_under "$out" "TEXTFILE_110" "storage_health.prom" 300 && ok "110 storage health exporter fresh" || warn "110 storage health exporter stale"
    _textfile_age_under "$out" "TEXTFILE_110" "backup_health.prom" 900 && ok "110 backup health exporter fresh" || warn "110 backup health exporter stale"
    grep -q "STORAGE_HEALTH_110 root_readonly=0 current=0" <<<"$out" && ok "110 current boot storage health clean" || warn "110 storage health not clean"
    grep -q "BACKUP_HEALTH_110 total=" <<<"$out" && awk '/BACKUP_HEALTH_110/ {split($3,a,"="); split($4,b,"="); split($5,c,"="); split($6,d,"="); split($7,e,"="); exit !((a[2]+b[2]+c[2]) == 0 && d[2] == 0 && e[2] == 0)}' <<<"$out" && ok "110 backup health has no stale expected jobs" || warn "110 latest aggregate/config backup had failed components; rerun backup-all after 120 recovers"
    # The integrity gate needs the BACKUP_HEALTH_110 row to exist.
    awk '/BACKUP_HEALTH_110/ {split($9,a,"="); seen=1; clean=(a[2] == 0); exit} END {exit !(seen && clean)}' <<<"$out" && ok "110 backup integrity and restore drill fresh" || warn "110 backup integrity or restore drill stale"
  else
    warn "110 schedule check unavailable"
    echo "$out"
  fi

  if out=$(host_cmd "wooo@192.168.0.120" '
kcmd() {
  if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then
    printf "%s\n" "$REMOTE_SUDO_PASSWORD" | sudo -S -p "" kubectl "$@"
  else
    sudo -n kubectl "$@" 2>/dev/null || kubectl "$@"
  fi
}
echo "CRON_120 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
kcmd get cronjobs -n awoooi-prod -o json | python3 -c "import json,sys; d=json.load(sys.stdin); items=d.get(\"items\", []); print(\"CRONJOB_COUNT\", len(items)); print(\"CRONJOB_SUSPENDED\", sum(1 for i in items if i.get(\"spec\",{}).get(\"suspend\")))"
kcmd get jobs -n awoooi-prod -o json | python3 -c "import json,sys,re; d=json.load(sys.stdin)
def owner(job):
  for ref in job.get(\"metadata\",{}).get(\"ownerReferences\",[]) or []:
    if ref.get(\"kind\") == \"CronJob\" and ref.get(\"name\"):
      return ref.get(\"name\")
  name = job.get(\"metadata\",{}).get(\"name\", \"\")
  return re.sub(r\"-[0-9]+$\", \"\", name)
def has_condition(job, kind):
  return any(c.get(\"type\") == kind and c.get(\"status\") == \"True\" for c in job.get(\"status\",{}).get(\"conditions\",[]) or [])
def job_time(job):
  status = job.get(\"status\",{})
  return status.get(\"completionTime\") or status.get(\"startTime\") or \"\"
latest_success = {}
failed_jobs = []
for job in d.get(\"items\", []):
  own = owner(job)
  ts = job_time(job)
  if has_condition(job, \"Complete\"):
    latest_success[own] = max(latest_success.get(own, \"\"), ts)
  if has_condition(job, \"Failed\"):
    failed_jobs.append((own, job.get(\"metadata\",{}).get(\"name\", \"\"), ts))
active_failed = 0
stale_failed = 0
for own, name, ts in failed_jobs:
  if ts and latest_success.get(own, \"\") > ts:
    stale_failed += 1
  else:
    active_failed += 1
print(\"FAILED_JOBS\", len(failed_jobs))
print(\"STALE_FAILED_JOBS\", stale_failed)
print(\"ACTIVE_FAILED_JOBS\", active_failed)"
kcmd get pods -n awoooi-prod --no-headers 2>/dev/null | awk "\$3 !~ /^(Running|Completed)$/ {bad++} END {print \"BAD_PODS\", bad+0}"
' 2>&1); then
    echo "$out"
    grep -q "CRON_120 active" <<<"$out" && ok "120 cron active" || warn "120 cron not confirmed"
    # No CRONJOB_COUNT row (kubectl/python failed) must not read as "present".
    awk '/CRONJOB_COUNT / {seen=1; enough=($2 >= 4); exit} END {exit !(seen && enough)}' <<<"$out" && ok "K8s AWOOOI CronJobs present" || warn "K8s AWOOOI CronJobs missing"
    grep -q "CRONJOB_SUSPENDED 0" <<<"$out" && ok "K8s AWOOOI CronJobs unsuspended" || warn "K8s AWOOOI CronJob suspended"
    grep -q "ACTIVE_FAILED_JOBS 0" <<<"$out" && ok "K8s AWOOOI has no active failed Jobs" || warn "K8s AWOOOI active failed Jobs remain"
    grep -q "BAD_PODS 0" <<<"$out" && ok "K8s AWOOOI pods Running/Completed only" || warn "K8s AWOOOI bad pod status remains"
  else
    warn "120 K8s schedule check unavailable"
    echo "$out"
  fi

  if out=$(host_cmd "wooo@192.168.0.121" '
echo "CRON_121 $(systemctl is-active cron 2>/dev/null || systemctl is-active crond 2>/dev/null || true)"
crontab -l 2>/dev/null | grep -q "dr-drill.sh" && echo "DR_DRILL_CRON present" || echo "DR_DRILL_CRON missing"
' 2>&1); then
    echo "$out"
    grep -q "CRON_121 active" <<<"$out" && ok "121 cron active" || warn "121 cron not confirmed"
    grep -q "DR_DRILL_CRON present" <<<"$out" && ok "121 DR drill cron present" || warn "121 DR drill cron missing"
  else
    warn "121 schedule check unavailable"
    echo "$out"
  fi
}

# Print the tallies and the overall verdict.
# Exits the shell with 2 when any gate is blocked, or with 1 when only
# warnings remain; returns normally when everything is green.
summary() {
  log_section "SUMMARY"
  echo "PASS=$PASS WARN=$WARN BLOCKED=$FAIL"

  local verdict code=0
  if [ "$FAIL" -gt 0 ]; then
    verdict="Result: BLOCKED. Fix the first blocked gate before releasing runner/CD/AI auto-remediation."
    code=2
  elif [ "$WARN" -gt 0 ]; then
    verdict="Result: DEGRADED. Core gates passed but warnings remain."
    code=1
  else
    verdict="Result: GREEN. Full stack is ready for controlled runner/CD release."
  fi

  echo "$verdict"
  [ "$code" -eq 0 ] || exit "$code"
}

# --watch: re-run this script (without --watch, so each child does a single
# pass) until a pass exits 0 or the attempt cap is reached. The exit status
# of the last pass is propagated; WATCH_MAX_ATTEMPTS=0 means no cap.
if [ "$WATCH_MODE" -eq 1 ]; then
  attempt=1
  rc=2
  while true; do
    echo "WATCH_ATTEMPT=$attempt"
    # Forward the flags that affect a single pass.
    args=()
    [ "$MONITOR_READ_ONLY" -eq 1 ] && args+=(--monitor-read-only)
    [ "$NO_COLOR_FLAG" -eq 1 ] && args+=(--no-color)
    [ "$SEND_ALERT_TEST" -eq 1 ] && args+=(--send-alert-test)
    # The ${args[@]+...} form expands to nothing when args is empty. A plain
    # "${args[@]}" is an unbound-variable error under `set -u` on bash older
    # than 4.4, which made a bare --watch abort on those shells.
    bash "$0" ${args[@]+"${args[@]}"}
    rc=$?
    [ "$rc" -eq 0 ] && exit 0
    if [ "$WATCH_MAX_ATTEMPTS" -gt 0 ] && [ "$attempt" -ge "$WATCH_MAX_ATTEMPTS" ]; then
      exit "$rc"
    fi
    attempt=$((attempt + 1))
    sleep "$WATCH_INTERVAL"
  done
fi

# Single pass: print the banner, run every gate in order, then print the
# tallies. Each gate records its findings through ok / warn / fail; summary
# exits 2 when anything is blocked and 1 when only warnings remain.
print_header
check_network
check_188
check_110
check_k3s
check_workload_and_alertchain
check_public_routes
check_schedules
summary