From 709825a89f989276ecc7253fc2771b041907ce29 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 30 Jun 2026 20:22:35 +0800 Subject: [PATCH] fix(recovery): bound post reboot summary readbacks --- docs/LOGBOOK.md | 24 +++++++++++++++++++ docs/runbooks/FULL-STACK-COLD-START-SOP.md | 10 +++++--- ...oot-cold-start-backup-recovery-workplan.md | 4 ++-- .../188-host-hygiene-maintenance-checklist.sh | 12 +++++++++- .../post-reboot-readiness-summary.sh | 6 +++++ .../reboot-recovery/post-start-quick-check.sh | 15 +++++++++++- .../tests/test_188_host_hygiene_checklist.py | 9 ++++++- ..._post_reboot_readiness_summary_contract.py | 24 +++++++++++++++++++ .../test_post_start_quick_check_contract.py | 8 ++++++- 9 files changed, 103 insertions(+), 9 deletions(-) create mode 100644 scripts/reboot-recovery/tests/test_post_reboot_readiness_summary_contract.py diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 66d8552f..e621a473 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,27 @@ +## 2026-06-30 — 20:28 Post-reboot summary bounded SSH 與 no-false-green 修復 + +**照主線修正的問題**: +- `post-reboot-readiness-summary.sh` 先前會被 `post-start-quick-check.sh` / `188-host-hygiene-maintenance-checklist.sh` 內的 110 read-only SSH 卡住,導致全主機重啟後無法穩定產出 T+10 判定;現在兩支 delegated checklist 的 SSH helper 都有 command timeout、connection attempts、ServerAlive 與 no password prompt,遠端控制路徑卡住時會收斂成 blocker / evidence,而不是拖死整輪 summary。 +- 修正 summary 的 backup / credential escrow no-false-green:當 110 `backup-status` 讀不到、`ESCROW_MISSING_COUNT=unknown` 時,`DR_ESCROW_BLOCKED=1`、`DR_ESCROW_EVIDENCE_UNKNOWN=1`,且 `NEXT_REQUIRED_GATES` 會包含 `backup_core_readback_recovery` 與 `credential_escrow_evidence`;未知證據不得再被當成 DR 不阻塞。 + +**Live readback**: +- `SSH_COMMAND_TIMEOUT_SECONDS=8 scripts/reboot-recovery/post-reboot-readiness-summary.sh --no-color` 完成,artifact `/tmp/awoooi-post-reboot-readiness-20260630-201642/summary.txt`。 +- 
結果:`POST_START_RESULT=BLOCKED`、`POST_START_PASS=33`、`POST_START_WARN=6`、`POST_START_BLOCKED=8`、`SERVICE_GREEN=0`、`PRODUCT_DATA_GREEN=0`、`STOCK_FRESHNESS_STATUS=not_configured`、`STOCK_BLOCKERS=postgres_not_ready`、`BACKUP_CORE_GREEN=0`、`DR_ESCROW_BLOCKED=1`、`DR_ESCROW_EVIDENCE_UNKNOWN=1`、`HOST_188_HYGIENE_BLOCKED=1`、`WAZUH_MANAGER_REGISTRY_ACCEPTED=6`、`OVERALL_DECLARATION=SERVICE_BLOCKED`。 +- 下一 gates:`product_data_freshness_recovery,backup_core_readback_recovery,credential_escrow_evidence,host_188_hygiene_maintenance_window`。 + +**主要 blocker**: +- 110 registry `/v2`:external HTTP `502`、K3s registry pull refused by `110:5000`。 +- 110 control path:SSH read-only check / backup-status / CPU / runner readback 均失敗或不可確認。 +- StockPlatform freshness:`postgres_not_ready`。 +- 188 hygiene:systemd not fully running、host PostgreSQL failed unit visible;SigNoz public route `502`。 + +**驗證**: +- `bash -n post-reboot-readiness-summary.sh post-start-quick-check.sh 188-host-hygiene-maintenance-checklist.sh` 通過。 +- `pytest scripts/reboot-recovery/tests/test_post_reboot_readiness_summary_contract.py scripts/reboot-recovery/tests/test_post_start_quick_check_contract.py scripts/reboot-recovery/tests/test_188_host_hygiene_checklist.py -q`:`5 passed`。 +- `git diff --check` 通過。 + +**邊界**:只做本機 source / docs 與 read-only public / internal probes;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未使用 GitHub / `gh` / GitHub API;未 workflow_dispatch;未重啟主機,未 restart Docker daemon / Nginx / K3s / DB / Redis / firewall,未 restore / prune / DB write。 + ## 2026-06-30 — 20:22 Harbor gate CD self-heal 接入 **照主線修正的問題**: diff --git a/docs/runbooks/FULL-STACK-COLD-START-SOP.md b/docs/runbooks/FULL-STACK-COLD-START-SOP.md index a8f33be0..7847500c 100644 --- a/docs/runbooks/FULL-STACK-COLD-START-SOP.md +++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md @@ -1,7 +1,7 @@ # AWOOOI 全棧冷啟動與主機重啟 SOP -> Version: v1.81 -> Last updated: 2026-06-29 Asia/Taipei +> Version: v1.82 +> Last updated: 2026-06-30 Asia/Taipei > 
Scope: 110 / 120 / 121 / 188 full-stack reboot recovery. 112 Kali is recorded as P3 optional and is not part of this recovery path. --- @@ -18,7 +18,11 @@ v1.79 active owner response template rule:同一輪 owner packet 產生後,p v1.80 / v1.81 credential escrow intake scorecard rule:同一輪 owner response preflight 後,必須用 `scripts/reboot-recovery/post-reboot-credential-escrow-intake-scorecard.py --summary-file "$ARTIFACT_DIR/summary.txt" --owner-packet-file --response-file --offsite-report-file --escrow-status-file ` 收斂 DR escrow gate。scorecard 只讀 sanitized artifacts;不得讀 secret value、不得寫 marker、不得送 owner request、不得開 runtime gate。placeholder readback 期望 `STATUS=blocked_waiting_non_secret_credential_escrow_evidence`、`EFFECTIVE_ESCROW_MISSING_COUNT=5`、`OWNER_RESPONSE_RECEIVED_COUNT=0`、`OWNER_RESPONSE_ACCEPTED_COUNT=0`、`RUNTIME_GATE_COUNT=0`、`CREDENTIAL_MARKER_WRITE_AUTHORIZED_COUNT=0`。若未來收到合格 redacted owner response 並由 preflight 回 `ready_for_independent_reviewer_acceptance`,scorecard 應轉為 `STATUS=ready_for_independent_reviewer_acceptance`;即使 marker 尚未寫入,也只能進 `independent_reviewer_acceptance_then_marker_dry_run`,不得直接寫 marker 或宣稱 `DR_COMPLETE`。 -2026-06-29 09:13 latest live summary:`scripts/reboot-recovery/post-reboot-readiness-summary.sh --no-color` artifact `/tmp/awoooi-post-reboot-readiness-20260629-091918/summary.txt` 回傳 `POST_START_RESULT=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`、`POST_START_SERVICE_WARNINGS=0`、`SERVICE_GREEN=1`、`PRODUCT_DATA_GREEN=1`、`STOCK_FRESHNESS_STATUS=ok`、`STOCK_LATEST_TRADING_DATE=2026-06-26`、`BACKUP_CORE_GREEN=1`、`HOST_188_HYGIENE_BLOCKED=0`、`WAZUH_MANAGER_REGISTRY_ACCEPTED=6`、`RUNTIME_ACTION_AUTHORIZED=0`、`NEXT_REQUIRED_GATES=credential_escrow_evidence`。目前仍不可宣稱 `DR_COMPLETE`,因為 `ESCROW_MISSING_COUNT=5`;owner packet contract guard 期望 `gates=1`。 +2026-06-30 20:18 latest live summary:`SSH_COMMAND_TIMEOUT_SECONDS=8 scripts/reboot-recovery/post-reboot-readiness-summary.sh --no-color` artifact `/tmp/awoooi-post-reboot-readiness-20260630-201642/summary.txt` 回傳 
`POST_START_RESULT=BLOCKED`、`POST_START_PASS=33`、`POST_START_WARN=6`、`POST_START_BLOCKED=8`、`SERVICE_GREEN=0`、`PRODUCT_DATA_GREEN=0`、`STOCK_FRESHNESS_STATUS=not_configured`、`STOCK_BLOCKERS=postgres_not_ready`、`BACKUP_CORE_GREEN=0`、`DR_ESCROW_BLOCKED=1`、`DR_ESCROW_EVIDENCE_UNKNOWN=1`、`ESCROW_MISSING_COUNT=unknown`、`HOST_188_HYGIENE_BLOCKED=1`、`WAZUH_MANAGER_REGISTRY_ACCEPTED=6`、`RUNTIME_ACTION_AUTHORIZED=0`、`OVERALL_DECLARATION=SERVICE_BLOCKED`、`NEXT_REQUIRED_GATES=product_data_freshness_recovery,backup_core_readback_recovery,credential_escrow_evidence,host_188_hygiene_maintenance_window`。目前不可宣稱全服務 10 分鐘恢復、Stock 最新、backup core green、DR complete 或 188 hygiene green;下一步優先修 110 control path / Harbor registry `/v2`,再以同一 summary 重跑驗證。 + +v1.82 bounded summary rule:`post-start-quick-check.sh` 與 `188-host-hygiene-maintenance-checklist.sh` 的 SSH helper 必須有 command timeout、single connection attempt、ServerAlive 與 no password prompt;任何 110 / 188 read-only control path 卡住時,都要收斂成 blocker / evidence,而不是讓 `post-reboot-readiness-summary.sh` 無限等待。若 backup / escrow 證據讀不到,`ESCROW_MISSING_COUNT=unknown` 必須同時輸出 `DR_ESCROW_BLOCKED=1` 與 `DR_ESCROW_EVIDENCE_UNKNOWN=1`,並把 `backup_core_readback_recovery`、`credential_escrow_evidence` 放進 `NEXT_REQUIRED_GATES`;unknown 不得被解讀為 DR 或 backup green。 + +2026-06-29 09:13 previous live summary:`scripts/reboot-recovery/post-reboot-readiness-summary.sh --no-color` artifact `/tmp/awoooi-post-reboot-readiness-20260629-091918/summary.txt` 回傳 `POST_START_RESULT=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`、`POST_START_SERVICE_WARNINGS=0`、`SERVICE_GREEN=1`、`PRODUCT_DATA_GREEN=1`、`STOCK_FRESHNESS_STATUS=ok`、`STOCK_LATEST_TRADING_DATE=2026-06-26`、`BACKUP_CORE_GREEN=1`、`HOST_188_HYGIENE_BLOCKED=0`、`WAZUH_MANAGER_REGISTRY_ACCEPTED=6`、`RUNTIME_ACTION_AUTHORIZED=0`、`NEXT_REQUIRED_GATES=credential_escrow_evidence`。此 baseline 已被 2026-06-30 20:18 全主機重啟後 evidence 覆蓋,不得再拿來宣稱目前 green。 2026-06-27 11:51 最新 live revalidation:`scripts/reboot-recovery/post-reboot-readiness-summary.sh 
--no-color` artifact `/tmp/awoooi-post-reboot-readiness-20260627-115046/summary.txt` 回傳 `POST_START_RESULT=BLOCKED`、`POST_START_PASS=37`、`POST_START_WARN=3`、`POST_START_BLOCKED=2`、`SERVICE_GREEN=0`、`PRODUCT_DATA_GREEN=1`、`STOCK_FRESHNESS_STATUS=ok`、`STOCK_LATEST_TRADING_DATE=2026-06-26`、`STOCK_BLOCKERS=none`、`BACKUP_CORE_GREEN=1`、`HOST_188_HYGIENE_BLOCKED=0`、`WAZUH_MANAGER_REGISTRY_ACCEPTED=0`、`RUNTIME_ACTION_AUTHORIZED=0`。本輪已再次修復 188 `momo_pg_daily` crontab configured drift,`backup-status` 回 `core_blockers=0`、`configured_missing_188=0`;K3s / ArgoCD live readback 顯示 120 / 121 皆 `Ready`,`awoooi-prod` 為 `Synced / Healthy`,api/web/worker pods 均 Running。現在 hard blocker 是 MOMO business data freshness:`daily_sales_snapshot` 最新仍為 `2026-06-24`,`DRIVE_INTAKE_COUNT=0`,Drive archive / global latest `即時業績_當日` 均為 `2026-06-25T04:21:47Z`,最新 import job `57` 已 clean completed 且 `sync_success=true`。因此可宣稱主機、K3s、public routes、backup core 與 Stock freshness 已恢復;不可宣稱 full-stack green,直到 MOMO 來源檔補齊並由正式 import pipeline 更新 DB。DR complete 仍因 `ESCROW_MISSING_COUNT=5` 禁止宣稱,Wazuh 全主機納管仍因 manager registry accepted `0` 禁止宣稱。 diff --git a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md index f6f0b022..7ee0bf79 100644 --- a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md +++ b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md @@ -15,12 +15,12 @@ | 優先 | 狀態 | 工作項 | 2026-06-30 證據 | 下一步 / 完成條件 | |------|------|--------|------------------|-------------------| -| P0-1 | BLOCKED | 全主機 cold-start / 10 分鐘自動恢復 SLO | 最新 `full-stack-cold-start-check.sh --monitor-read-only --no-color` 回 `PASS=67 WARN=4 BLOCKED=5`;110 registry external `/v2`、110 SSH read-only check、K3s registry pull refused、AWOOOI internal API probe、SigNoz TLS/public route 仍 blocked。 | 先修第一個 cold-start blocker,重跑同一 scorecard 到 `BLOCKED=0`;不可只用 route 200 宣稱恢復。 | +| P0-1 | BLOCKED | 全主機 cold-start / 10 
分鐘自動恢復 SLO | 20:18 `post-reboot-readiness-summary.sh --no-color` artifact `/tmp/awoooi-post-reboot-readiness-20260630-201642/summary.txt` 回 `POST_START_PASS=33 WARN=6 BLOCKED=8`、`SERVICE_GREEN=0`、`OVERALL_DECLARATION=SERVICE_BLOCKED`;110 registry `/v2`、110 SSH / backup / CPU / runner readback、K3s registry pull refused、SigNoz 502/TLS、Stock `postgres_not_ready`、188 hygiene 仍 blocked。summary SSH 已 bounded,不再無限卡住。 | 先修第一個 runtime blocker:110 control path / Harbor registry `/v2`。重跑同一 summary 到 `SERVICE_GREEN=1` 且 `POST_START_BLOCKED=0`;不可只用 route 200 宣稱恢復。 | | P0-2 | DONE_THIS_INCIDENT | 使用者可見 502:Tsenyang | `www.tsenyang.com` / `tsenyang.com` 由 502 恢復為 200;188 `tsenyang-website` container running;local `127.0.0.1:3000` 回 200。 | 下次同類 502 先查 release symlink / image / container;不先動 Nginx、DNS、DB、主機重啟。 | | P0-3 | BLOCKED | StockPlatform data freshness | public `/healthz`、`/api/healthz` 回 200;freshness / ingestion 回 `not_configured`、`postgres_not_ready`。 | 恢復 110 control path 後,read-only 查 `/home/wooo/stockplatform-v2` compose / DB schema / migration status;禁止 fake freshness、manual DB rows、restore/prune。 | | P0-4 | BLOCKED | AWOOOI production 版本最新性 | Gitea `main` 已多次前進但 production runtime readback 仍為 `7890778b83`,`runtime_build_readback_status=runtime_build_diverges_from_committed_deploy_readback`。Public Gitea visible run `cd.yaml #4043` 是 Failure;jobs API 與 visible run `head_sha` 不一致,已標 `cd_jobs_stale_or_mismatched`。 | 補 deploy marker / runtime SHA / endpoint readback 一致;未一致前不可宣稱 AWOOOI 最新。 | | P0-5 | BLOCKED | 110 control path | `diagnose-110-ssh-publickey-auth.sh`:node-exporter / SSH banner 正常;`NODE_LOAD_CLASSIFIER=high_load`、`NODE_PROCS_BLOCKED=0`;`wooo` publickey `publickey_offer_timeout`,`root` publickey `permission_denied`,`git` / `ollama` `preauth_timeout`。 | 集中查 110 sshd publickey auth / authorized_keys / PAM / account lookup path,並把 load / runner pressure 視為同一 blocker 的共因;可在 110 console/local 跑 `repair-110-ssh-publickey-auth-local.sh --check` / `--apply` 修 metadata 
權限。恢復 SSH read-only command path 後才能驗證 Stock DB、Gitea dump、110 backup completeness。 | -| P0-6 | BLOCKED_BACKUP_COMPLETENESS | Gitea repo visibility 與完整備份 | Gitea version API 200;public repo search 只列 4 個 public repo;`stockplatform-v2` public page/API 404,但 internal `git ls-remote` 成功;188 `/home/ollama/backup/110/gitea` 起初為空。已建立 verified emergency bundle `/home/ollama/backup/110/gitea/git-bundles/20260630-190931`:4 個 public/internal repo bundle verify + checksum 成功,`AwoooGo`、`stockplatform-v2`、`vibework` 因 private auth fail-closed。 | 188 `gitea_repo_mirror_from_110` subtree metric / alert 已補;下一步仍是恢復 110 SSH command path 後跑正式 `gitea dump`、private repo 非互動備份、repo count 與 restore drill readback。 | +| P0-6 | BLOCKED_BACKUP_COMPLETENESS | Gitea repo visibility 與完整備份 | Gitea version API 200;public repo search 只列 4 個 public repo;`stockplatform-v2` public page/API 404,但 internal `git ls-remote` 成功;188 `/home/ollama/backup/110/gitea` 起初為空。已建立 verified emergency bundle `/home/ollama/backup/110/gitea/git-bundles/20260630-190931`:4 個 public/internal repo bundle verify + checksum 成功,`AwoooGo`、`stockplatform-v2`、`vibework` 因 private auth fail-closed。20:18 summary 因 110 `backup-status` 不可讀回,`BACKUP_CORE_GREEN=0`、`DR_ESCROW_BLOCKED=1`、`DR_ESCROW_EVIDENCE_UNKNOWN=1`。 | 188 `gitea_repo_mirror_from_110` subtree metric / alert 已補;下一步仍是恢復 110 SSH command path 後跑正式 `gitea dump`、private repo 非互動備份、repo count、backup-status 與 restore drill readback。unknown 不得當作 backup / DR green。 | | P0-7 | SOURCE_READY_RUNTIME_BLOCKED | 99 VMware / VM autostart | repo 已有 `windows99-vmware-autostart.ps1`;最新只讀 readback:99 ping OK、RDP 3389 OK、SSH 22 OK、WinRM 5985 fail,`administrator@192.168.0.99` SSH publickey denied;VM host 111 仍不可達。 | 恢復 99 可控通道或由 console 套用腳本;完成後讀回 111/188/120/121/112 boot evidence。 | | P0-8 | SOURCE_READY_RUNTIME_BLOCKED | 502 maintenance fallback / Telegram / backup alert | L0/L1 fallback runbook、Nginx snippet、reboot / backup alert rules 已在 source;runtime 尚需部署與外部 L1 provider readback。 | 
L0 以測試 vhost 驗證 `X-AWOOOI-Fallback`;L1 需外部雲端/CDN probe;Telegram 以脫敏 alert receipt 驗證。 | diff --git a/scripts/reboot-recovery/188-host-hygiene-maintenance-checklist.sh b/scripts/reboot-recovery/188-host-hygiene-maintenance-checklist.sh index edc515cb..ff8f23c8 100755 --- a/scripts/reboot-recovery/188-host-hygiene-maintenance-checklist.sh +++ b/scripts/reboot-recovery/188-host-hygiene-maintenance-checklist.sh @@ -9,6 +9,7 @@ REMOTE_110="${REMOTE_110:-wooo@192.168.0.110}" SSH_BATCH_MODE="${SSH_BATCH_MODE:-yes}" SSH_STRICT_HOST_KEY_CHECKING="${SSH_STRICT_HOST_KEY_CHECKING:-accept-new}" SSH_CONNECT_TIMEOUT="${SSH_CONNECT_TIMEOUT:-8}" +SSH_COMMAND_TIMEOUT_SECONDS="${SSH_COMMAND_TIMEOUT_SECONDS:-25}" NO_COLOR=0 usage() { @@ -26,6 +27,7 @@ Environment: SSH_BATCH_MODE=yes SSH_STRICT_HOST_KEY_CHECKING=accept-new SSH_CONNECT_TIMEOUT=8 + SSH_COMMAND_TIMEOUT_SECONDS=25 USAGE } @@ -70,6 +72,11 @@ HOST_HYGIENE_BLOCKED=0 ssh_opts=( -o BatchMode="$SSH_BATCH_MODE" -o ConnectTimeout="$SSH_CONNECT_TIMEOUT" + -o ConnectionAttempts=1 + -o ServerAliveInterval=5 + -o ServerAliveCountMax=1 + -o PreferredAuthentications=publickey + -o NumberOfPasswordPrompts=0 -o StrictHostKeyChecking="$SSH_STRICT_HOST_KEY_CHECKING" ) @@ -113,11 +120,14 @@ blocked() { ssh_cmd() { local target="$1" local command="$2" + local quoted_command="" if is_local_target "$target"; then bash -lc "$command" return $? 
fi - ssh "${ssh_opts[@]}" "$target" "$command" + printf -v quoted_command '%q' "$command" + ssh "${ssh_opts[@]}" "$target" \ + "if command -v timeout >/dev/null 2>&1; then timeout ${SSH_COMMAND_TIMEOUT_SECONDS}s bash -lc ${quoted_command}; else bash -lc ${quoted_command}; fi" } echo "AWOOOI 188 host hygiene maintenance checklist" diff --git a/scripts/reboot-recovery/post-reboot-readiness-summary.sh b/scripts/reboot-recovery/post-reboot-readiness-summary.sh index fb9d506f..de53d1db 100755 --- a/scripts/reboot-recovery/post-reboot-readiness-summary.sh +++ b/scripts/reboot-recovery/post-reboot-readiness-summary.sh @@ -128,10 +128,14 @@ stock_eod_final_retry_window_end="$(awk '$1 == "STOCK_EOD_FINAL_RETRY_WINDOW_END escrow_missing_count="$(grep -Eo 'escrow_missing=[0-9]+' "$post_start_log" | tail -n 1 | cut -d= -f2 || true)" dr_escrow_blocked=0 +escrow_evidence_unknown=0 if [[ -n "$escrow_missing_count" && "$escrow_missing_count" != "0" ]]; then dr_escrow_blocked=1 elif [[ "$post_result" == "FULL_STACK_GREEN_DR_ESCROW_BLOCKED" ]]; then dr_escrow_blocked=1 +elif [[ -z "$escrow_missing_count" ]]; then + dr_escrow_blocked=1 + escrow_evidence_unknown=1 fi host_188_hygiene_blocked="unknown" @@ -207,6 +211,7 @@ if [[ "$product_data_green" != "1" && "$stock_eod_window_pending" == "1" ]]; the elif [[ "$product_data_green" != "1" ]]; then next_required_gates+=("product_data_freshness_recovery") fi +[[ "$backup_core_green" != "1" ]] && next_required_gates+=("backup_core_readback_recovery") [[ "$dr_escrow_blocked" == "1" ]] && next_required_gates+=("credential_escrow_evidence") [[ "$host_188_hygiene_blocked" == "1" ]] && next_required_gates+=("host_188_hygiene_maintenance_window") [[ "$wazuh_registry_accepted" == "0" ]] && next_required_gates+=("wazuh_manager_registry_export") @@ -243,6 +248,7 @@ STOCK_EOD_FIRST_FULL_WINDOW_END_LOCAL=${stock_eod_first_full_window_end:-unknown STOCK_EOD_FINAL_RETRY_WINDOW_END_LOCAL=${stock_eod_final_retry_window_end:-unknown} 
BACKUP_CORE_GREEN=$backup_core_green DR_ESCROW_BLOCKED=$dr_escrow_blocked +DR_ESCROW_EVIDENCE_UNKNOWN=$escrow_evidence_unknown ESCROW_MISSING_COUNT=${escrow_missing_count:-unknown} HOST_188_SERVICE_GREEN=$host_188_service_green HOST_188_HYGIENE_BLOCKED=$host_188_hygiene_blocked diff --git a/scripts/reboot-recovery/post-start-quick-check.sh b/scripts/reboot-recovery/post-start-quick-check.sh index aaa8dd53..04d76e0d 100755 --- a/scripts/reboot-recovery/post-start-quick-check.sh +++ b/scripts/reboot-recovery/post-start-quick-check.sh @@ -7,6 +7,7 @@ set -uo pipefail ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" SSH_CONNECT_TIMEOUT="${SSH_CONNECT_TIMEOUT:-6}" +SSH_COMMAND_TIMEOUT_SECONDS="${SSH_COMMAND_TIMEOUT_SECONDS:-25}" ROUTE_RETRY_ATTEMPTS="${ROUTE_RETRY_ATTEMPTS:-3}" ROUTE_RETRY_DELAY_SECONDS="${ROUTE_RETRY_DELAY_SECONDS:-2}" STOCK_FRESHNESS_RETRY_ATTEMPTS="${STOCK_FRESHNESS_RETRY_ATTEMPTS:-6}" @@ -86,6 +87,7 @@ Options: -h, --help Show this help. Environment: + SSH_COMMAND_TIMEOUT_SECONDS Per remote SSH command timeout. Default: 25. ROUTE_RETRY_ATTEMPTS Public route attempts before blocking. Default: 3. ROUTE_RETRY_DELAY_SECONDS Delay between failed public route attempts. Default: 2. STOCK_FRESHNESS_RETRY_ATTEMPTS Stock freshness attempts before blocking. Default: 6. @@ -204,11 +206,22 @@ is_local_target() { ssh_read() { local user_host="$1" local command="$2" + local quoted_command="" if is_local_target "$user_host"; then bash -lc "$command" return $? 
fi - ssh -o BatchMode=yes -o ConnectTimeout="$SSH_CONNECT_TIMEOUT" "$user_host" "$command" + printf -v quoted_command '%q' "$command" + ssh \ + -o BatchMode=yes \ + -o ConnectTimeout="$SSH_CONNECT_TIMEOUT" \ + -o ConnectionAttempts=1 \ + -o ServerAliveInterval=5 \ + -o ServerAliveCountMax=1 \ + -o PreferredAuthentications=publickey \ + -o NumberOfPasswordPrompts=0 \ + "$user_host" \ + "if command -v timeout >/dev/null 2>&1; then timeout ${SSH_COMMAND_TIMEOUT_SECONDS}s bash -lc ${quoted_command}; else bash -lc ${quoted_command}; fi" } service_route_recovered() { diff --git a/scripts/reboot-recovery/tests/test_188_host_hygiene_checklist.py b/scripts/reboot-recovery/tests/test_188_host_hygiene_checklist.py index af8236da..61d54ba1 100644 --- a/scripts/reboot-recovery/tests/test_188_host_hygiene_checklist.py +++ b/scripts/reboot-recovery/tests/test_188_host_hygiene_checklist.py @@ -21,4 +21,11 @@ def test_110_self_check_can_run_locally_without_ssh_config_drift() -> None: assert 'ips="$(local_ip_list)"' in text assert 'local_ip_list | grep' not in text assert 'bash -lc "$command"' in text - assert 'ssh "${ssh_opts[@]}" "$target" "$command"' in text + assert 'SSH_COMMAND_TIMEOUT_SECONDS="${SSH_COMMAND_TIMEOUT_SECONDS:-25}"' in text + assert "printf -v quoted_command '%q' \"$command\"" in text + assert '-o ConnectionAttempts=1' in text + assert '-o ServerAliveInterval=5' in text + assert '-o ServerAliveCountMax=1' in text + assert '-o NumberOfPasswordPrompts=0' in text + assert 'ssh "${ssh_opts[@]}" "$target" \\' in text + assert "timeout ${SSH_COMMAND_TIMEOUT_SECONDS}s bash -lc ${quoted_command}" in text diff --git a/scripts/reboot-recovery/tests/test_post_reboot_readiness_summary_contract.py b/scripts/reboot-recovery/tests/test_post_reboot_readiness_summary_contract.py new file mode 100644 index 00000000..669194e2 --- /dev/null +++ b/scripts/reboot-recovery/tests/test_post_reboot_readiness_summary_contract.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +from 
pathlib import Path + + +ROOT = Path(__file__).resolve().parents[3] +SCRIPT = ROOT / "scripts" / "reboot-recovery" / "post-reboot-readiness-summary.sh" + + +def test_unknown_backup_escrow_evidence_stays_blocked() -> None: + text = SCRIPT.read_text(encoding="utf-8") + + assert "escrow_evidence_unknown=0" in text + assert 'elif [[ -z "$escrow_missing_count" ]]; then' in text + assert "escrow_evidence_unknown=1" in text + assert "DR_ESCROW_EVIDENCE_UNKNOWN=$escrow_evidence_unknown" in text + assert ( + '[[ "$backup_core_green" != "1" ]] ' + '&& next_required_gates+=("backup_core_readback_recovery")' + ) in text + assert ( + '[[ "$dr_escrow_blocked" == "1" ]] ' + '&& next_required_gates+=("credential_escrow_evidence")' + ) in text diff --git a/scripts/reboot-recovery/tests/test_post_start_quick_check_contract.py b/scripts/reboot-recovery/tests/test_post_start_quick_check_contract.py index 5bd7b1fe..dc0fdd44 100644 --- a/scripts/reboot-recovery/tests/test_post_start_quick_check_contract.py +++ b/scripts/reboot-recovery/tests/test_post_start_quick_check_contract.py @@ -14,7 +14,13 @@ def test_post_start_self_checks_can_run_locally_on_110() -> None: assert 'ips="$(local_ip_list)"' in text assert 'local_ip_list | grep' not in text assert 'bash -lc "$command"' in text - assert 'ssh -o BatchMode=yes -o ConnectTimeout="$SSH_CONNECT_TIMEOUT" "$user_host" "$command"' in text + assert 'SSH_COMMAND_TIMEOUT_SECONDS="${SSH_COMMAND_TIMEOUT_SECONDS:-25}"' in text + assert "printf -v quoted_command '%q' \"$command\"" in text + assert '-o ConnectionAttempts=1' in text + assert '-o ServerAliveInterval=5' in text + assert '-o ServerAliveCountMax=1' in text + assert '-o NumberOfPasswordPrompts=0' in text + assert "timeout ${SSH_COMMAND_TIMEOUT_SECONDS}s bash -lc ${quoted_command}" in text def test_runner_pressure_is_capacity_evidence_after_routes_recover() -> None: