Files
awoooi/scripts/reboot-recovery/post-start-quick-check.sh

407 lines
11 KiB
Bash
Executable File

#!/usr/bin/env bash
set -uo pipefail

# Single-entry, read-only check to run after a reboot. Deep inspection is
# delegated to the existing recovery scripts; this wrapper itself does not
# restart, patch, delete, import, reload, or write runtime state.

# Repository root, resolved as two directories above this script's directory.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"

# SSH connect timeout in seconds; an exported value overrides the default.
: "${SSH_CONNECT_TIMEOUT:=6}"

# Per-section toggles (1 = run). The --skip-* options clear them.
RUN_COLD_START=1
RUN_MOMO=1
RUN_STOCK=1
RUN_BACKUP=1
RUN_ROUTES=1
RUN_CPU=1
NO_COLOR_FLAG=0

# Result counters maintained by ok / warn / blocked and the *_warn helpers.
PASS_COUNT=0 WARN_COUNT=0 BLOCKED_COUNT=0
SERVICE_WARN_COUNT=0 BOUNDARY_WARN_COUNT=0 EVIDENCE_WARN_COUNT=0

# Hosts probed with ping and a TCP/22 check.
HOSTS=(
  "192.168.0.110"
  "192.168.0.120"
  "192.168.0.121"
  "192.168.0.188"
)

# Public URLs used for the HTTP smoke test.
ROUTES=(
  "https://awoooi.wooo.work/"
  "https://awoooi.wooo.work/api/v1/health"
  "https://awoooi.wooo.work/zh-TW/iwooos"
  "https://vibework.wooo.work/"
  "https://awooogo.wooo.work/"
  "https://2026fifa.wooo.work/"
  "https://agent.wooo.work/"
  "https://mo.wooo.work/"
  "https://mo.wooo.work/health"
  "https://stock.wooo.work/"
  "https://stock.wooo.work/healthz"
  "https://stock.wooo.work/api/healthz"
  "https://bitan.wooo.work/"
  "https://tsenyang.com/"
  "https://www.tsenyang.com/"
  "https://vtuber.wooo.work/"
  "https://gitea.wooo.work/"
  "https://harbor.wooo.work/"
  "https://registry.wooo.work/"
  "https://sentry.wooo.work/"
  "https://signoz.wooo.work/"
  "https://langfuse.wooo.work/"
  "https://aiops.wooo.work/"
)
# Print the command-line help text (options and exit-code legend) to stdout.
usage() {
  printf '%s\n' \
    'Usage: post-start-quick-check.sh [options]' \
    'Read-only post-reboot quick check for 110 / 120 / 121 / 188.' \
    'Options:' \
    '--skip-cold-start Do not run full-stack-cold-start-check.sh.' \
    '--skip-momo Do not run momo-drive-token-source-recovery-preflight.sh.' \
    '--skip-stock Do not query StockPlatform data freshness.' \
    '--skip-backup Do not run /backup/scripts/backup-status.sh on 110.' \
    '--skip-routes Do not curl public route smoke targets.' \
    '--skip-cpu Do not read 110 CPU / process summary.' \
    '--no-color Disable ANSI color.' \
    '-h, --help Show this help.' \
    'Exit codes:' \
    '0 = no service blockers. Boundary / evidence warnings may still be present.' \
    '1 = service warnings only.' \
    '2 = service blockers observed.' \
    'This script never reads token content and never writes runtime state.'
}
# Consume command-line switches one by one. Each --skip-* switch clears the
# matching RUN_* toggle; anything unrecognised prints the help text to stderr
# and exits with status 2.
until [[ $# -eq 0 ]]; do
  case "$1" in
    --skip-cold-start) RUN_COLD_START=0 ;;
    --skip-momo) RUN_MOMO=0 ;;
    --skip-stock) RUN_STOCK=0 ;;
    --skip-backup) RUN_BACKUP=0 ;;
    --skip-routes) RUN_ROUTES=0 ;;
    --skip-cpu) RUN_CPU=0 ;;
    --no-color) NO_COLOR_FLAG=1 ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      printf 'Unknown argument: %s\n' "$1" >&2
      usage >&2
      exit 2
      ;;
  esac
  shift
done
# ANSI palette used by the reporting helpers. Start with colors enabled, then
# blank every code when NO_COLOR is non-empty in the environment or --no-color
# was given.
RED=$'\033[0;31m'
GREEN=$'\033[0;32m'
YELLOW=$'\033[1;33m'
BLUE=$'\033[0;34m'
NC=$'\033[0m'
if [[ -n "${NO_COLOR:-}" || "$NO_COLOR_FLAG" -eq 1 ]]; then
  RED="" GREEN="" YELLOW="" BLUE="" NC=""
fi
# Print a section banner: a blank line, then "=== <title> ===" wrapped in the
# BLUE / NC color codes.
# Arguments: $1 - section title
section() {
  local title="$1"
  printf '\n%s=== %s ===%s\n' "$BLUE" "$title" "$NC"
}
# Record a passing check: bump PASS_COUNT and print an "OK" line.
# Arguments: message words, joined with spaces in the output
ok() {
  local message="$*"
  ((PASS_COUNT += 1))
  printf '%sOK%s %s\n' "$GREEN" "$NC" "$message"
}
# Record a warning: bump WARN_COUNT and print a "WARN" line.
# Arguments: message words, joined with spaces in the output
warn() {
  local message="$*"
  ((WARN_COUNT += 1))
  printf '%sWARN%s %s\n' "$YELLOW" "$NC" "$message"
}
# Warning that counts toward SERVICE_WARN_COUNT; the message is printed
# through warn.
service_warn() {
  ((SERVICE_WARN_COUNT += 1))
  warn "$@"
}
# Warning that counts toward BOUNDARY_WARN_COUNT; the message is printed
# through warn.
boundary_warn() {
  ((BOUNDARY_WARN_COUNT += 1))
  warn "$@"
}
# Warning that counts toward EVIDENCE_WARN_COUNT; the message is printed
# through warn.
evidence_warn() {
  ((EVIDENCE_WARN_COUNT += 1))
  warn "$@"
}
# Record a blocker: bump BLOCKED_COUNT and print a "BLOCKED" line.
# Arguments: message words, joined with spaces in the output
blocked() {
  local message="$*"
  ((BLOCKED_COUNT += 1))
  printf '%sBLOCKED%s %s\n' "$RED" "$NC" "$message"
}
# Run one command on a remote host over ssh, passing BatchMode=yes and the
# configured ConnectTimeout.
# Arguments: $1 - ssh destination (user@host)
#            $2 - command string handed to the remote side
# Returns:   the exit status of ssh
ssh_read() {
  local user_host="$1"
  local command="$2"
  local -a ssh_opts=(
    -o BatchMode=yes
    -o "ConnectTimeout=$SSH_CONNECT_TIMEOUT"
  )
  ssh "${ssh_opts[@]}" "$user_host" "$command"
}
# Run a command with stdout and stderr captured to a temp file, then replay
# the captured output. On success an OK line for the label is printed first.
# Arguments: $1   - label reported through ok when the command succeeds
#            $2.. - command and its arguments
# Outputs:   the OK line (success only) followed by the captured output
# Returns:   the command's own exit status; 1 if the temp file cannot be made
run_and_capture() {
  local label="$1"
  shift
  local tmp
  local rc=0
  if ! tmp="$(mktemp -t post-start-quick-check.XXXXXX)"; then
    printf 'run_and_capture: mktemp failed for %s\n' "$label" >&2
    return 1
  fi
  # Capture the status right at the command. Reading $? after an `if` with no
  # `else` yields 0 when the condition failed, which would hide the failure.
  "$@" >"$tmp" 2>&1 || rc=$?
  if [[ "$rc" -eq 0 ]]; then
    ok "$label"
  fi
  cat "$tmp"
  rm -f "$tmp"
  return "$rc"
}
# Reachability sweep over HOSTS: one ICMP echo and one TCP/22 probe per host.
# Each failed probe is counted as a blocker.
section "主機 / SSH"
for host in "${HOSTS[@]}"; do
  if ! ping -c 1 -W 1 "$host" &>/dev/null; then
    blocked "PING_FAIL $host"
  else
    ok "PING_OK $host"
  fi

  if ! nc -z -w 2 "$host" 22 &>/dev/null; then
    blocked "SSH_PORT_FAIL $host"
  else
    ok "SSH_PORT_OK $host"
  fi
done
# Cold-start scorecard: run full-stack-cold-start-check.sh for a single
# attempt, replay its output, and classify the result from the last
# "PASS=.. WARN=.. BLOCKED=.." line it printed.
if [[ "$RUN_COLD_START" -eq 1 ]]; then
  section "Cold-start scorecard"
  cold_tmp="$(mktemp -t post-start-cold-start.XXXXXX)"
  cold_rc=0
  bash "$ROOT_DIR/scripts/reboot-recovery/full-stack-cold-start-check.sh" \
    --monitor-read-only --no-color --watch --interval 1 --max-attempts 1 \
    >"$cold_tmp" 2>&1 || cold_rc=$?
  cat "$cold_tmp"

  cold_summary="$(grep -E 'PASS=[0-9]+ WARN=[0-9]+ BLOCKED=[0-9]+' "$cold_tmp" | tail -n 1 || true)"
  if [[ -z "$cold_summary" ]]; then
    # No summary line: the command's exit status is the only signal left.
    if [[ "$cold_rc" -ne 0 ]]; then
      blocked "cold-start command returned $cold_rc without summary"
    else
      service_warn "cold-start summary not found"
    fi
  else
    ok "cold-start summary: $cold_summary"
    cold_warn=0
    cold_blocked=0
    [[ "$cold_summary" =~ WARN=([0-9]+) ]] && cold_warn="${BASH_REMATCH[1]}"
    [[ "$cold_summary" =~ BLOCKED=([0-9]+) ]] && cold_blocked="${BASH_REMATCH[1]}"

    # Precedence: blockers, then warnings, then the command's exit status.
    if [[ "$cold_blocked" -gt 0 ]]; then
      blocked "cold-start has blockers: $cold_summary"
    elif [[ "$cold_warn" -gt 0 ]]; then
      service_warn "cold-start is warning-only, not blocked: $cold_summary"
    elif [[ "$cold_rc" -ne 0 ]]; then
      evidence_warn "cold-start exited $cold_rc but summary has no blockers: $cold_summary"
    else
      ok "cold-start command exited 0"
    fi
  fi
  rm -f "$cold_tmp"
fi
# MOMO freshness: run the drive-token-source preflight, replay its output,
# and map its exit status (0 / 1 / anything else) to ok / warning / blocker.
if [[ "$RUN_MOMO" -eq 1 ]]; then
  section "MOMO freshness"
  momo_tmp="$(mktemp -t post-start-momo.XXXXXX)"
  momo_rc=0
  bash "$ROOT_DIR/scripts/reboot-recovery/momo-drive-token-source-recovery-preflight.sh" \
    >"$momo_tmp" 2>&1 || momo_rc=$?
  cat "$momo_tmp"

  momo_summary="$(grep -E 'MOMO_DRIVE_TOKEN_SOURCE_PREFLIGHT PASS=[0-9]+ WARN=[0-9]+ BLOCKED=[0-9]+' "$momo_tmp" | tail -n 1 || true)"
  if [[ "$momo_rc" -eq 0 ]]; then
    ok "MOMO preflight clean"
  elif [[ "$momo_rc" -eq 1 ]]; then
    # Status 1 is an evidence-level warning only when the summary line shows
    # BLOCKED=0; otherwise it is counted as a service warning.
    if [[ "$momo_summary" =~ BLOCKED=0 ]]; then
      evidence_warn "MOMO preflight has non-service warnings"
    else
      service_warn "MOMO preflight has warnings and no clean summary"
    fi
  else
    blocked "MOMO preflight has blockers"
  fi

  # Repeat the key readback lines so they sit next to the verdict.
  grep -E 'MOMO_DRIVE_TOKEN_SOURCE_PREFLIGHT|MOMO_HEALTH_VERSION|DB_MONTHLY_SYNC|DB_DAILY_FRESHNESS|DB_LATEST_DAILY_IMPORT_JOB' "$momo_tmp" || true
  rm -f "$momo_tmp"
fi
# StockPlatform freshness: fetch the freshness endpoint, print a readable
# digest of the JSON payload, and report ok only when its "status" is "ok".
# A non-2xx response (or no response) is a blocker, and so is any other status.
if [[ "$RUN_STOCK" -eq 1 ]]; then
  section "StockPlatform freshness"
  stock_tmp="$(mktemp -t post-start-stock.XXXXXX)"
  stock_code="$(curl -k -sS -o "$stock_tmp" -w '%{http_code}' --max-time 12 "https://stock.wooo.work/api/v1/system/freshness" 2>/dev/null || true)"
  if [[ "$stock_code" != 2* ]]; then
    blocked "StockPlatform freshness endpoint returned ${stock_code:-curl_failed}"
    cat "$stock_tmp" || true
  else
    # The heredoc bodies below are Python, so their indentation is significant
    # and they must start at column 0 regardless of the surrounding shell
    # nesting.
    python3 - "$stock_tmp" <<'PY'
import json
import sys

path = sys.argv[1]
with open(path, "r", encoding="utf-8") as fh:
    payload = json.load(fh)
print(f"STOCK_FRESHNESS_STATUS {payload.get('status')}")
print(f"STOCK_LATEST_TRADING_DATE {payload.get('latest_trading_date')}")
# str() keeps the join from raising if a blocker entry is not a string.
print("STOCK_BLOCKERS " + ",".join(str(item) for item in (payload.get("blockers") or [])))
for source in payload.get("sources") or []:
    print(
        "STOCK_SOURCE "
        f"{source.get('source')}|{source.get('status')}|"
        f"{source.get('latest_date')}|{source.get('row_count')}"
    )
PY
    # If the payload cannot be parsed, python3 prints nothing on stdout, so
    # stock_status stays empty and the check is reported as "unknown".
    stock_status="$(python3 - "$stock_tmp" <<'PY'
import json
import sys

with open(sys.argv[1], "r", encoding="utf-8") as fh:
    print(json.load(fh).get("status") or "")
PY
)"
    if [[ "$stock_status" == "ok" ]]; then
      ok "StockPlatform freshness is ok"
    else
      stock_blockers="$(python3 - "$stock_tmp" <<'PY'
import json
import sys

with open(sys.argv[1], "r", encoding="utf-8") as fh:
    print(",".join(str(item) for item in (json.load(fh).get("blockers") or [])))
PY
)"
      blocked "StockPlatform freshness is ${stock_status:-unknown}: ${stock_blockers:-no_blocker_list}"
    fi
  fi
  rm -f "$stock_tmp"
fi
# Backup / offsite / escrow: run backup-status.sh on 110 over ssh with the
# --no-notify and --no-refresh flags, replay its output, then look for the
# core-blocker and escrow-missing counters in that output.
if [[ "$RUN_BACKUP" -eq 1 ]]; then
  section "Backup / offsite / escrow"
  backup_tmp="$(mktemp -t post-start-backup.XXXXXX)"
  if ! ssh_read "wooo@192.168.0.110" '/backup/scripts/backup-status.sh --no-notify --no-refresh' >"$backup_tmp" 2>&1; then
    blocked "backup-status readback failed"
  else
    ok "backup-status readback succeeded"
  fi
  cat "$backup_tmp"

  # Core blockers: the zero pattern is tested first, then a non-zero value;
  # finding neither is a service warning.
  if grep -Eq 'core_blockers=0|CORE_BLOCKERS[ =]0' "$backup_tmp"; then
    ok "backup core blockers are 0"
  elif grep -Eq 'core_blockers=[1-9]|CORE_BLOCKERS[ =][1-9]' "$backup_tmp"; then
    blocked "backup core blockers are non-zero"
  else
    service_warn "backup core blocker summary not confirmed"
  fi

  # Credential escrow: a non-zero count is a boundary warning, not a blocker.
  if grep -Eq 'escrow_missing=0|ESCROW_MISSING_COUNT[ =]0' "$backup_tmp"; then
    ok "credential escrow missing is 0"
  elif grep -Eq 'escrow_missing=[1-9]|ESCROW_MISSING_COUNT[ =][1-9]' "$backup_tmp"; then
    boundary_warn "credential escrow still missing; DR_COMPLETE is forbidden"
  else
    evidence_warn "credential escrow count not found"
  fi
  rm -f "$backup_tmp"
fi
# Public route smoke test: request each URL in ROUTES and treat any HTTP
# status starting with 2 or 3 as healthy; everything else is a blocker.
if [[ "$RUN_ROUTES" -eq 1 ]]; then
  section "Public routes"
  for url in "${ROUTES[@]}"; do
    code="$(curl -k -sS -o /dev/null -w '%{http_code}' --max-time 12 "$url" 2>/dev/null || true)"
    if [[ "$code" == [23]* ]]; then
      ok "$code $url"
    else
      blocked "${code:-curl_failed} $url"
    fi
  done
fi
# 110 CPU / process attribution: read uptime, a short vmstat sample and the
# top of the CPU-sorted process list over ssh, then flag process names of
# interest in that readback.
if [[ "$RUN_CPU" -eq 1 ]]; then
  section "110 CPU / process attribution"
  cpu_tmp="$(mktemp -t post-start-cpu.XXXXXX)"
  if ! ssh_read "wooo@192.168.0.110" 'uptime; vmstat 1 5; ps -eo pid,ppid,pgid,stat,pcpu,pmem,comm,args --sort=-pcpu | head -25' >"$cpu_tmp" 2>&1; then
    evidence_warn "110 CPU/process readback failed"
  else
    ok "110 CPU/process readback succeeded"
  fi
  cat "$cpu_tmp"

  # Browser / smoke-test processes are reported as an evidence warning.
  if grep -Eiq 'chrome|chromium|playwright' "$cpu_tmp"; then
    evidence_warn "browser/smoke process is visible; classify orphan vs active parent before action"
  fi

  # CI / build / test tooling in the list is reported as an OK line.
  if grep -Eiq 'gitea|actions|runner|npm|pnpm|pytest|pip-audit' "$cpu_tmp"; then
    ok "active CI/build/test load is visible"
  fi
  rm -f "$cpu_tmp"
fi
# Final summary: print the counters, then a single RESULT line and exit code.
# Precedence: blockers (exit 2), service warnings (exit 1), then boundary and
# evidence warnings, which both still exit 0.
section "總結"
printf 'POST_START_QUICK_CHECK PASS=%s WARN=%s BLOCKED=%s\n' "$PASS_COUNT" "$WARN_COUNT" "$BLOCKED_COUNT"
printf 'POST_START_QUICK_CHECK_WARNINGS SERVICE=%s BOUNDARY=%s EVIDENCE=%s\n' "$SERVICE_WARN_COUNT" "$BOUNDARY_WARN_COUNT" "$EVIDENCE_WARN_COUNT"

final_rc=0
if [[ "$BLOCKED_COUNT" -gt 0 ]]; then
  printf 'RESULT=BLOCKED\n'
  final_rc=2
elif [[ "$SERVICE_WARN_COUNT" -gt 0 ]]; then
  printf 'RESULT=DEGRADED\n'
  final_rc=1
elif [[ "$BOUNDARY_WARN_COUNT" -gt 0 ]]; then
  printf 'RESULT=FULL_STACK_GREEN_DR_ESCROW_BLOCKED\n'
elif [[ "$EVIDENCE_WARN_COUNT" -gt 0 ]]; then
  printf 'RESULT=GREEN_WITH_EVIDENCE_WARNINGS\n'
else
  printf 'RESULT=GREEN\n'
fi
exit "$final_rc"