docs(ops): add momo preflight and cpu triage evidence [skip ci]
This commit is contained in:
231
scripts/reboot-recovery/momo-drive-token-source-recovery-preflight.sh
Executable file
231
scripts/reboot-recovery/momo-drive-token-source-recovery-preflight.sh
Executable file
@@ -0,0 +1,231 @@
|
||||
#!/usr/bin/env bash
# errexit (-e) is not enabled: checks are tallied through ok / warn / blocked
# and the exit status is derived from those tallies at the end of the script.
set -u
set -o pipefail

# Read-only MOMO recovery preflight. This script must not import files, move
# Drive artifacts, restart containers, change token ownership, or print secrets.

# Settings; a non-empty value already present in the environment wins.
: "${MOMO_HOST:=ollama@192.168.0.188}"
: "${FRESHNESS_MAX_DAYS:=2}"
: "${SSH_CONNECT_TIMEOUT:=8}"

# Result tallies, incremented by ok / warn / blocked respectively.
PASS_COUNT=0
WARN_COUNT=0
BLOCKED_COUNT=0
|
||||
|
||||
# Record a passing check.
# Globals:   PASS_COUNT (incremented)
# Arguments: message words; joined into one string via "$*"
# Outputs:   "OK: <message>" on stdout
ok() {
  local message="$*"
  (( PASS_COUNT += 1 ))
  printf 'OK: %s\n' "$message"
}
|
||||
|
||||
# Record a non-blocking finding.
# Globals:   WARN_COUNT (incremented)
# Arguments: message words; joined into one string via "$*"
# Outputs:   "WARN: <message>" on stdout
warn() {
  local message="$*"
  (( WARN_COUNT += 1 ))
  printf 'WARN: %s\n' "$message"
}
|
||||
|
||||
# Record a blocking finding.
# Globals:   BLOCKED_COUNT (incremented)
# Arguments: message words; joined into one string via "$*"
# Outputs:   "BLOCKED: <message>" on stdout
blocked() {
  local message="$*"
  (( BLOCKED_COUNT += 1 ))
  printf 'BLOCKED: %s\n' "$message"
}
|
||||
|
||||
# Print the command-line help text (options, checks performed, exit codes)
# to stdout. Takes no arguments.
usage() {
  local -a help_lines=(
    'Usage: momo-drive-token-source-recovery-preflight.sh [--host user@host] [--freshness-max-days N]'
    ''
    'Read-only checks:'
    '- MOMO public health and local health endpoint on 188'
    '- momo-scheduler running / health / UID'
    '- Google token metadata only, never token content'
    '- scheduler fail-closed log evidence and notification evidence'
    '- daily_sales_snapshot / realtime_sales_monthly bounds'
    '- latest daily_sales import job'
    ''
    'Exit codes:'
    '0 = no warnings or blockers'
    '1 = warnings only'
    '2 = one or more blockers'
  )
  printf '%s\n' "${help_lines[@]}"
}
|
||||
|
||||
# Parse command-line options.
# Globals:   MOMO_HOST, FRESHNESS_MAX_DAYS (overwritten by --host and
#            --freshness-max-days respectively)
# Arguments: the script's positional parameters
# Exits:     0 after printing help for -h/--help; 2 on an unknown argument
#            or on an option that is missing its value
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --host | --freshness-max-days)
        # `shift 2` fails WITHOUT shifting when only one argument is left,
        # which would leave "$1" in place and spin this loop forever.
        # Reject the dangling option instead.
        if [[ $# -lt 2 ]]; then
          printf 'Missing value for %s\n' "$1" >&2
          usage >&2
          exit 2
        fi
        if [[ "$1" == "--host" ]]; then
          MOMO_HOST="$2"
        else
          FRESHNESS_MAX_DAYS="$2"
        fi
        shift 2
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        printf 'Unknown argument: %s\n' "$1" >&2
        usage >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
|
||||
|
||||
# FRESHNESS_MAX_DAYS must be a non-negative decimal integer; anything else is
# a usage error (exit 2).
if ! [[ "$FRESHNESS_MAX_DAYS" =~ ^[0-9]+$ ]]; then
  printf 'FRESHNESS_MAX_DAYS must be numeric: %s\n' "$FRESHNESS_MAX_DAYS" >&2
  exit 2
fi

# Force base 10. A value with a leading zero (e.g. "08") passes the regex
# above, but shell arithmetic would read it as an invalid octal constant and
# the later `-le` / `-gt` comparisons against this value would error out.
FRESHNESS_MAX_DAYS=$((10#$FRESHNESS_MAX_DAYS))
|
||||
|
||||
# Scratch file that receives the remote probe output. Abort right away when
# it cannot be created: continuing with an empty path would make the output
# redirection fail and be reported as an SSH failure instead.
if ! tmp_output="$(mktemp -t momo-drive-preflight.XXXXXX)"; then
  printf 'Unable to create temporary file for preflight output\n' >&2
  exit 2
fi

# Remove the scratch file on every exit path.
trap 'rm -f -- "$tmp_output"' EXIT
|
||||
|
||||
# Collect every probe in one SSH session. The quoted heredoc is sent verbatim
# to `bash -s` on the remote host (no local expansion) and its stdout is
# captured in $tmp_output as one "KEY value" line per probe. Log probes emit
# match counts only, and token probes emit `stat` metadata only.
ssh_status=0
ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -o ConnectTimeout="$SSH_CONNECT_TIMEOUT" "$MOMO_HOST" 'bash -s' >"$tmp_output" <<'REMOTE' || ssh_status=$?
set -uo pipefail

emit() {
  printf '%s %s\n' "$1" "${2:-}"
}

emit HOST "$(hostname 2>/dev/null || true)"
emit MOMO_HEALTH_CODE "$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 http://127.0.0.1:5003/health 2>/dev/null || true)"
emit MOMO_PUBLIC_HEALTH_CODE "$(curl -s -o /dev/null -w '%{http_code}' --max-time 8 https://mo.wooo.work/health 2>/dev/null || true)"
emit SCHEDULER_RUNNING "$(docker inspect -f '{{.State.Running}}' momo-scheduler 2>/dev/null || true)"
emit SCHEDULER_HEALTH "$(docker inspect -f '{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}' momo-scheduler 2>/dev/null || true)"
emit SCHEDULER_STARTED_AT "$(docker inspect -f '{{.State.StartedAt}}' momo-scheduler 2>/dev/null || true)"
emit SCHEDULER_UID "$(docker top momo-scheduler -eo pid,user,uid 2>/dev/null | awk 'NR==2 {print $3}' || true)"

token_stat="$(stat -c '%u:%g:%a' /home/ollama/momo-pro/config/google_token.json 2>/dev/null || true)"
emit TOKEN_STAT "${token_stat:-missing}"

container_token_stat="$(docker exec momo-scheduler sh -lc 'stat -c "%u:%g:%a" config/google_token.json 2>/dev/null || true' 2>/dev/null || true)"
emit CONTAINER_TOKEN_STAT "${container_token_stat:-missing}"

logs="$(docker logs --since 8h momo-scheduler 2>&1 || true)"
emit LOG_AUTH_FAILURE_COUNT "$(printf '%s\n' "$logs" | grep -Ec 'Google Drive 認證失敗|could not locate runnable browser|Permission denied.*google_token|連線或認證失敗' || true)"
emit LOG_FAIL_CLOSED_COUNT "$(printf '%s\n' "$logs" | grep -Ec '自動匯入失敗|未能確認來源資料夾是否有新檔案' || true)"
emit LOG_FAILURE_NOTIFY_SUCCESS_COUNT "$(printf '%s\n' "$logs" | grep -Ec '匯入失敗通知已發送|Telegram 通知發送成功' || true)"
emit LOG_EMPTY_SOURCE_COUNT "$(printf '%s\n' "$logs" | grep -Ec '找到 0 個 Excel|沒有找到待匯入' || true)"
emit LOG_SUCCESS_IMPORT_COUNT "$(printf '%s\n' "$logs" | grep -Ec '自動匯入完成|匯入成功|成功匯入' || true)"

psql_query() {
  docker exec momo-db psql -h 127.0.0.1 -U momo -d momo_analytics -Atc "$1" 2>/dev/null || true
}

emit DB_DAILY "$(psql_query "SELECT count(*) || chr(124) || coalesce(min(snapshot_date::date)::text, chr(45)) || chr(124) || coalesce(max(snapshot_date::date)::text, chr(45)) FROM daily_sales_snapshot;")"
emit DB_MONTHLY_CURRENT "$(psql_query "SELECT count(*) || chr(124) || coalesce(min(\"日期\"::date)::text, chr(45)) || chr(124) || coalesce(max(\"日期\"::date)::text, chr(45)) FROM realtime_sales_monthly WHERE \"日期\"::date >= make_date(extract(year from current_date)::int, extract(month from current_date)::int, 1);")"
emit DB_MONTHLY_SYNC "$(psql_query "WITH scope AS (SELECT min(snapshot_date::date) dmin, max(snapshot_date::date) dmax, count(*) sc FROM daily_sales_snapshot WHERE snapshot_date::date >= make_date(extract(year from current_date)::int, extract(month from current_date)::int, 1)), monthly AS (SELECT count(*) mc, min(\"日期\"::date) mmin, max(\"日期\"::date) mmax FROM realtime_sales_monthly, scope WHERE scope.sc > 0 AND \"日期\"::date BETWEEN scope.dmin AND scope.dmax) SELECT coalesce(scope.sc,0)::text || chr(124) || coalesce(monthly.mc,0)::text || chr(124) || coalesce(scope.dmin::text,chr(45)) || chr(124) || coalesce(scope.dmax::text,chr(45)) || chr(124) || coalesce(monthly.mmin::text,chr(45)) || chr(124) || coalesce(monthly.mmax::text,chr(45)) FROM scope, monthly;")"
emit DB_DAILY_FRESHNESS "$(psql_query "SELECT coalesce((current_date - max(snapshot_date::date))::text, chr(45)) || chr(124) || coalesce(max(snapshot_date::date)::text, chr(45)) FROM daily_sales_snapshot;")"
emit DB_LATEST_DAILY_IMPORT_JOB "$(psql_query "SELECT coalesce(id::text, chr(45)) || chr(124) || coalesce(status, chr(45)) || chr(124) || coalesce(drive_file_name, chr(45)) || chr(124) || coalesce(replace(created_at::text, chr(32), chr(84)), chr(45)) || chr(124) || coalesce(replace(completed_at::text, chr(32), chr(84)), chr(45)) || chr(124) || coalesce(total_rows::text, chr(45)) || chr(124) || coalesce(success_rows::text, chr(45)) || chr(124) || coalesce(error_rows::text, chr(45)) FROM import_jobs WHERE job_type = 'daily_sales' ORDER BY created_at DESC LIMIT 1;")"
emit IMPORT_CONFIG "$(psql_query "SELECT config_key || chr(61) || config_value FROM import_config;" | awk -F= '$1 == "gdrive_folder_path" {folder=$2} $1 == "gdrive_file_pattern" {pattern=$2} END {if (folder || pattern) print folder "|" pattern}')"
REMOTE

# Echo whatever was captured, whether or not the session succeeded, then
# record a blocker if the ssh command returned a non-zero status.
cat "$tmp_output"
if [[ "$ssh_status" -ne 0 ]]; then
  blocked "MOMO host read-only SSH preflight failed: $MOMO_HOST"
fi
|
||||
|
||||
# Print the value recorded for a probe key in $tmp_output.
# Globals:   tmp_output (read) - file of "KEY value" lines
# Arguments: $1 - key to look up
# Outputs:   everything after the first space of the first line whose key
#            equals $1; nothing when no line matches
value_for() {
  local wanted="$1"
  local line
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "${line%% *}" == "$wanted" ]]; then
      printf '%s\n' "${line#* }"
      return 0
    fi
  done <"$tmp_output"
}
|
||||
|
||||
# Print the value recorded for a probe key when it is a non-negative integer
# (digits only); print 0 otherwise, including when the key is absent.
# Arguments: $1 - key, forwarded to value_for
# Outputs:   one integer line on stdout
num_for() {
  local value
  value="$(value_for "$1")"
  # An explicit if/else prints exactly one line; the `A && B || C` shorthand
  # would also run C whenever B itself fails.
  if [[ "$value" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$value"
  else
    printf '0\n'
  fi
}
|
||||
|
||||
# HTTP health probes. A non-200 public endpoint is a blocker; a non-200 local
# endpoint is only a warning.
health_code="$(value_for MOMO_HEALTH_CODE)"
public_health_code="$(value_for MOMO_PUBLIC_HEALTH_CODE)"

# Explicit if/else so exactly one outcome is recorded per check; with
# `test && ok || blocked` a failing `ok` would also record a blocker.
if [[ "$public_health_code" == "200" ]]; then
  ok "MOMO public health endpoint returns 200"
else
  blocked "MOMO public health endpoint is not 200: ${public_health_code:-missing}"
fi

if [[ "$health_code" == "200" ]]; then
  ok "MOMO local health endpoint returns 200"
else
  warn "MOMO local health endpoint is not 200: ${health_code:-missing}"
fi
|
||||
|
||||
# momo-scheduler container state, as reported by `docker inspect` on the
# remote host. Not running is a blocker; an unhealthy status or a missing
# start time is a warning.
scheduler_running="$(value_for SCHEDULER_RUNNING)"
scheduler_health="$(value_for SCHEDULER_HEALTH)"
scheduler_started_at="$(value_for SCHEDULER_STARTED_AT)"

# Explicit if/else so exactly one outcome is recorded per check; with
# `test && ok || warn` a failing `ok` would also record the negative outcome.
if [[ "$scheduler_running" == "true" ]]; then
  ok "momo-scheduler container is running"
else
  blocked "momo-scheduler container is not running"
fi

if [[ "$scheduler_health" == "healthy" ]]; then
  ok "momo-scheduler container health is healthy"
else
  warn "momo-scheduler health is not healthy: ${scheduler_health:-missing}"
fi

if [[ -n "$scheduler_started_at" ]]; then
  ok "momo-scheduler started_at metadata is available: $scheduler_started_at"
else
  warn "momo-scheduler started_at metadata unavailable"
fi
|
||||
|
||||
# Succeeds when an octal permission string (the `%a` field of `stat`) grants
# nothing beyond owner read/write.
# Arguments: $1 - permission string such as 600 or 0640
# Returns:   0 when no bit outside 0600 is set; 1 otherwise or when $1 is
#            not an octal number
token_mode_is_restrictive() {
  local mode="$1"
  [[ "$mode" =~ ^[0-7]+$ ]] || return 1
  # Interpret the digits as octal and mask off the owner read/write bits.
  # A plain decimal `-le 600` would accept modes such as 444 or 440, which
  # are readable by group/other.
  (( (8#$mode & ~8#600) == 0 ))
}

# Google token checks use metadata only (uid:gid:mode from `stat`).
scheduler_uid="$(value_for SCHEDULER_UID)"
token_stat="$(value_for TOKEN_STAT)"
container_token_stat="$(value_for CONTAINER_TOKEN_STAT)"

if [[ "$token_stat" == "missing" || -z "$token_stat" ]]; then
  warn "host Google token artifact metadata is missing"
elif [[ "$scheduler_uid" =~ ^[0-9]+$ ]]; then
  token_uid="${token_stat%%:*}"   # first field: owner uid
  token_mode="${token_stat##*:}"  # last field: octal permission bits
  if [[ "$token_uid" == "$scheduler_uid" ]] && token_mode_is_restrictive "$token_mode"; then
    ok "host Google token metadata matches scheduler UID and restrictive mode"
  else
    warn "host Google token metadata does not match scheduler UID/mode: token=$token_stat scheduler_uid=$scheduler_uid"
  fi
else
  warn "scheduler UID unavailable; token metadata cannot be matched"
fi

# Inside the container only the presence of the token metadata is checked.
if [[ "$container_token_stat" == "missing" || -z "$container_token_stat" ]]; then
  warn "container Google token artifact metadata is missing"
else
  ok "container Google token artifact metadata exists"
fi
|
||||
|
||||
# Scheduler log evidence: counts of matching log lines from the last 8h, as
# gathered by the remote probe. num_for yields 0 for anything non-numeric.
auth_failures="$(num_for LOG_AUTH_FAILURE_COUNT)"
fail_closed="$(num_for LOG_FAIL_CLOSED_COUNT)"
notify_success="$(num_for LOG_FAILURE_NOTIFY_SUCCESS_COUNT)"

# Fail-closed evidence needs both an auth failure and a fail-closed message.
if (( auth_failures > 0 )) && (( fail_closed > 0 )); then
  ok "scheduler has recent Drive auth/API failure fail-closed evidence"
else
  warn "scheduler recent fail-closed evidence not observed in the last 8h"
fi

if (( notify_success > 0 )); then
  ok "scheduler failure notification success evidence exists"
else
  warn "scheduler failure notification success evidence not observed in the last 8h"
fi
|
||||
|
||||
# The Drive import configuration must contain the expected
# "<folder>|<file pattern>" pair; anything else (or no value) is a blocker.
import_config="$(value_for IMPORT_CONFIG)"

# Explicit if/else so exactly one outcome is recorded; with
# `test && ok || blocked` a failing `ok` would also record a blocker.
if [[ "$import_config" == *"當日業績匯入|即時業績_當日"* ]]; then
  ok "Drive import config points to expected daily-sales intake"
else
  blocked "Drive import config is unavailable or drifted: ${import_config:-missing}"
fi
|
||||
|
||||
# DB_MONTHLY_SYNC carries six '|'-separated fields: snapshot row count,
# monthly row count, snapshot min/max date, monthly min/max date.
monthly_sync="$(value_for DB_MONTHLY_SYNC)"
IFS='|' read -r sync_snapshot_count sync_monthly_count sync_dmin sync_dmax sync_mmin sync_mmax <<<"$monthly_sync"

# In sync means: a positive snapshot count, equal row counts, and identical
# date bounds on both sides. Anything else is a blocker.
if [[ "$sync_snapshot_count" =~ ^[0-9]+$ ]] \
  && [[ "$sync_snapshot_count" -gt 0 ]] \
  && [[ "$sync_snapshot_count" == "$sync_monthly_count" ]] \
  && [[ "$sync_dmin" == "$sync_mmin" ]] \
  && [[ "$sync_dmax" == "$sync_mmax" ]]; then
  ok "current-month daily snapshot and realtime tables are in sync"
else
  blocked "current-month daily snapshot and realtime sync is not proven: ${monthly_sync:-missing}"
fi
|
||||
|
||||
# DB_DAILY_FRESHNESS is "<age in days>|<latest snapshot date>".
freshness="$(value_for DB_DAILY_FRESHNESS)"
IFS='|' read -r freshness_days latest_daily_date <<<"$freshness"

# A non-numeric age is handled first; a numeric age is then compared against
# the FRESHNESS_MAX_DAYS threshold.
if ! [[ "$freshness_days" =~ ^[0-9]+$ ]]; then
  blocked "daily sales freshness is unavailable: ${freshness:-missing}"
elif [[ "$freshness_days" -le "$FRESHNESS_MAX_DAYS" ]]; then
  ok "daily sales data freshness is within ${FRESHNESS_MAX_DAYS} days: $freshness"
else
  blocked "daily sales data is stale: $freshness"
fi
|
||||
|
||||
# DB_LATEST_DAILY_IMPORT_JOB carries eight '|'-separated fields describing the
# newest daily_sales import job: id, status, file name, created, completed,
# total rows, success rows, error rows.
latest_job="$(value_for DB_LATEST_DAILY_IMPORT_JOB)"
IFS='|' read -r job_id job_status job_file job_created job_completed job_total job_success job_errors <<<"$latest_job"

# Clean means: numeric id, status "completed", every row succeeded, and zero
# error rows. Anything else is a warning.
if [[ "$job_id" =~ ^[0-9]+$ ]] \
  && [[ "$job_status" == "completed" ]] \
  && [[ "$job_total" == "$job_success" ]] \
  && [[ "$job_errors" == "0" ]]; then
  ok "latest daily import job completed cleanly: id=$job_id file=$job_file"
else
  warn "latest daily import job is not a clean completed job: ${latest_job:-missing}"
fi
|
||||
|
||||
# When the data is stale, record one more blocker naming the release gate;
# its wording depends on whether Drive auth failures were seen in the logs.
if [[ "$freshness_days" =~ ^[0-9]+$ ]] && [[ "$freshness_days" -gt "$FRESHNESS_MAX_DAYS" ]]; then
  if (( auth_failures > 0 )); then
    stale_reason="release blocker is stale business data with active Drive auth/source evidence gate"
  else
    stale_reason="release blocker is stale business data; source evidence must be refreshed"
  fi
  blocked "$stale_reason"
fi
|
||||
|
||||
# One-line summary of the tallies and the effective settings.
printf 'MOMO_DRIVE_TOKEN_SOURCE_PREFLIGHT PASS=%d WARN=%d BLOCKED=%d HOST=%s FRESHNESS_MAX_DAYS=%s\n' \
  "$PASS_COUNT" "$WARN_COUNT" "$BLOCKED_COUNT" "$MOMO_HOST" "$FRESHNESS_MAX_DAYS"

# Exit status: 2 when any blocker was recorded, else 1 when any warning was
# recorded, else 0.
if (( BLOCKED_COUNT > 0 )); then
  final_status=2
elif (( WARN_COUNT > 0 )); then
  final_status=1
else
  final_status=0
fi
exit "$final_status"
|
||||
Reference in New Issue
Block a user