From c45f274d5e85b13d6d6820ba2aac9f392345540d Mon Sep 17 00:00:00 2001 From: ogt Date: Fri, 26 Jun 2026 08:44:37 +0800 Subject: [PATCH] ops(reboot): guard post-reboot owner packets [skip ci] --- docs/LOGBOOK.md | 35 +++ docs/runbooks/FULL-STACK-COLD-START-SOP.md | 6 +- .../runbooks/REBOOT-POST-START-QUICK-CHECK.md | 13 +- ...oot-cold-start-backup-recovery-workplan.md | 4 +- ...post-reboot-owner-packet-contract-guard.py | 264 ++++++++++++++++++ 5 files changed, 317 insertions(+), 5 deletions(-) create mode 100755 scripts/reboot-recovery/post-reboot-owner-packet-contract-guard.py diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 3bc4db1e..41703584 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -45190,3 +45190,38 @@ production browser smoke: - 188 host hygiene 維護窗口仍未執行。 - Wazuh manager registry accepted remains `0`。 - 不得宣稱 owner request 已送出、owner response 已收到 / 接受、runtime 寫入已批准、`DR_COMPLETE`、188 host fully green、或 Wazuh registry recovered。 + +## 2026-06-26 — 08:40 post-reboot owner-packet contract guard / SOP v1.70 + +**時間與來源**: +- 2026-06-26 08:40 Asia/Taipei。 +- 來源:新增 `scripts/reboot-recovery/post-reboot-owner-packet-contract-guard.py`,驗收 `scripts/reboot-recovery/post-reboot-next-gate-owner-packets.py --no-color --output /tmp/awoooi-post-reboot-owner-packets.json` 產出的 owner packet JSON。 + +**完成內容**: +- 新增 post-reboot owner-packet contract guard,將 `awoooi_post_reboot_next_gate_owner_packets_v1` 的 fail-closed 欄位變成硬門檻。 +- guard 固定要求三個 P0 gate:`credential_escrow_evidence`、`host_188_hygiene_maintenance_window`、`wazuh_manager_registry_export`。 +- guard 固定要求 `request_sent_count=0`、`owner_response_received_count=0`、`owner_response_accepted_count=0`、`runtime_action_authorized_count=0`、`dispatch_authorized=0`、`host_write_authorized=0`、`secret_value_collection_allowed=0`、`runtime_gate_count=0`。 +- guard 會拒收缺少 credential escrow 禁用 payload、188 host hygiene 禁用維修動作、Wazuh 禁用 raw payload / active response / host write / Kali active scan,以及缺少 no-false-green 規則的 packet。 +- `docs/runbooks/REBOOT-POST-START-QUICK-CHECK.md` 升至 v1.10,加入 JSON artifact + contract guard 固定步驟。 +- `docs/runbooks/FULL-STACK-COLD-START-SOP.md` 升至 v1.70,將 contract guard 列為 owner review intake 前置條件。 +- `docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md` 更新為 `DONE_WITH_OWNER_PACKET_CONTRACT_GUARD_V170`。 + +**只讀驗證預期**: +- `POST_REBOOT_OWNER_PACKET_CONTRACT_GUARD_OK gates=3 request_sent=0 accepted=0 runtime_gate=0` + +**做過的命令類型**: +- 只讀:post-reboot readiness summary、next-gate dispatch checklist、owner-packet JSON generation、contract guard、source guard。 +- 寫入:repo script / docs-only。 +- 未做:host / Docker / systemd / Nginx / firewall / K8s / DB / Wazuh runtime 寫操作;未讀 secret 明文;未送 owner request;未寫 escrow marker;未執行 active response。 + +**目前判定**: +- Owner-packet contract guard automation:`0% -> 100%`。 +- Reboot service / data / backup readiness remains `GREEN`。 +- Overall declaration remains `FULL_STACK_GREEN_DR_ESCROW_BLOCKED`。 +- Runtime repair / owner request sent / credential marker write / Wazuh registry accepted:仍 `0%`。 + +**仍 blocked / 不得宣稱**: +- DR credential escrow evidence missing `5`。 +- 188 host hygiene 維護窗口仍未執行。 +- Wazuh manager registry accepted remains `0`。 +- 不得宣稱 owner request 已送出、owner response 已收到 / 接受、runtime 寫入已批准、`DR_COMPLETE`、188 host fully green、或 Wazuh registry recovered。 diff --git a/docs/runbooks/FULL-STACK-COLD-START-SOP.md b/docs/runbooks/FULL-STACK-COLD-START-SOP.md index f22d91d6..5118a39a 100644 --- a/docs/runbooks/FULL-STACK-COLD-START-SOP.md +++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md @@ -1,6 +1,6 @@ # AWOOOI 全棧冷啟動與主機重啟 SOP -> Version: v1.69 +> Version: v1.70 > Last updated: 2026-06-26 Asia/Taipei > Scope: 110 / 120 / 121 / 188 full-stack reboot recovery. 112 Kali is recorded as P3 optional and is not part of this recovery path. @@ -10,7 +10,7 @@ 本節是每次接手、開機、關機、重啟後的第一個判定錨點。若日期不是今天,必須先重跑 live check,再更新本節與 `docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md`。 -若只是重啟後要快速判斷能不能宣稱恢復,先跑機器可讀摘要:`scripts/reboot-recovery/post-reboot-readiness-summary.sh --no-color`。此腳本會呼叫一頁式總檢查、188 host hygiene checklist 與 Wazuh no-false-green repo gates,並把 delegated logs 留在 `/tmp/awoooi-post-reboot-readiness-*`。若 summary 顯示 `SERVICE_GREEN=1` 但 `NEXT_REQUIRED_GATES` 仍非空,接著跑 `scripts/reboot-recovery/post-reboot-next-gate-dispatch.sh --no-color`,把 DR escrow、188 hygiene、Wazuh registry 三條 blocker 轉成 owner / evidence / forbidden-action dispatch checklist;需要機器可讀 intake 時,再跑 `scripts/reboot-recovery/post-reboot-next-gate-owner-packets.py --no-color` 產生 `awoooi_post_reboot_next_gate_owner_packets_v1` JSON。dispatch / packet 均固定 `DISPATCH_AUTHORIZED=0`、`REQUEST_SENT_COUNT=0`、`OWNER_RESPONSE_ACCEPTED=0`、`HOST_WRITE_AUTHORIZED=0`、`SECRET_VALUE_COLLECTION_ALLOWED=0`。需要人工展開時,再跑 `scripts/reboot-recovery/post-start-quick-check.sh --no-color` 並以 `docs/runbooks/REBOOT-POST-START-QUICK-CHECK.md` 作為 fallback。長 SOP 保留完整背景、例外處理與 Plan B;短版 wrapper / checklist 負責每次 T+10 分鐘內的固定判定。 +若只是重啟後要快速判斷能不能宣稱恢復,先跑機器可讀摘要:`scripts/reboot-recovery/post-reboot-readiness-summary.sh --no-color`。此腳本會呼叫一頁式總檢查、188 host hygiene checklist 與 Wazuh no-false-green repo gates,並把 delegated logs 留在 `/tmp/awoooi-post-reboot-readiness-*`。若 summary 顯示 `SERVICE_GREEN=1` 但 `NEXT_REQUIRED_GATES` 仍非空,接著跑 `scripts/reboot-recovery/post-reboot-next-gate-dispatch.sh --no-color`,把 DR escrow、188 hygiene、Wazuh registry 三條 blocker 轉成 owner / evidence / forbidden-action dispatch checklist;需要機器可讀 intake 時,再跑 `scripts/reboot-recovery/post-reboot-next-gate-owner-packets.py --no-color --output /tmp/awoooi-post-reboot-owner-packets.json` 產生 `awoooi_post_reboot_next_gate_owner_packets_v1` JSON,並立刻跑 `scripts/reboot-recovery/post-reboot-owner-packet-contract-guard.py --packet-file /tmp/awoooi-post-reboot-owner-packets.json`。dispatch / packet / guard 均固定 `DISPATCH_AUTHORIZED=0`、`REQUEST_SENT_COUNT=0`、`OWNER_RESPONSE_ACCEPTED=0`、`HOST_WRITE_AUTHORIZED=0`、`SECRET_VALUE_COLLECTION_ALLOWED=0`、`RUNTIME_GATE=0`;guard 未通過時不得送 owner request、不得寫 escrow marker、不得進維護窗口、不得宣稱 DR / 188 host hygiene / Wazuh registry complete。需要人工展開時,再跑 `scripts/reboot-recovery/post-start-quick-check.sh --no-color` 並以 `docs/runbooks/REBOOT-POST-START-QUICK-CHECK.md` 作為 fallback。長 SOP 保留完整背景、例外處理與 Plan B;短版 wrapper / checklist 負責每次 T+10 分鐘內的固定判定。 2026-06-26 07:47 machine-readable readiness summary:`scripts/reboot-recovery/post-reboot-readiness-summary.sh --no-color` 已驗證可用,artifact dir `/tmp/awoooi-post-reboot-readiness-20260626-074702`。摘要輸出 `POST_START_RESULT=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`、`POST_START_PASS=38`、`POST_START_WARN=3`、`POST_START_BLOCKED=0`、`SERVICE_GREEN=1`、`PRODUCT_DATA_GREEN=1`、`BACKUP_CORE_GREEN=1`、`DR_ESCROW_BLOCKED=1`、`ESCROW_MISSING_COUNT=5`、`HOST_188_SERVICE_GREEN=1`、`HOST_188_HYGIENE_BLOCKED=1`、`WAZUH_ROUTE_CODE=200`、`WAZUH_TRANSPORT_COUNT=6`、`WAZUH_MANAGER_REGISTRY_ACCEPTED=0`、`WAZUH_RUNTIME_GATE=0`、`RUNTIME_ACTION_AUTHORIZED=0`。目前 `OVERALL_DECLARATION=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`,`NEXT_REQUIRED_GATES=credential_escrow_evidence,host_188_hygiene_maintenance_window,wazuh_manager_registry_export`。這是每次重啟後的第一層 operator / AI agent 判定格式。 @@ -18,6 +18,8 @@ 2026-06-26 08:29 owner-packet JSON baseline:`scripts/reboot-recovery/post-reboot-next-gate-owner-packets.py --no-color` 將 dispatch output 轉成 `schema_version=awoooi_post_reboot_next_gate_owner_packets_v1`,包含三個 `owner_packets`、`next_gate_count=3`、`p0_gate_count=3`、`request_sent_count=0`、`owner_response_received_count=0`、`owner_response_accepted_count=0`、`runtime_action_authorized_count=0`。此 JSON 是 AI / operator / owner review intake,不是外部 request,也不是維護窗口批准。 +2026-06-26 08:40 owner-packet contract guard baseline:`scripts/reboot-recovery/post-reboot-owner-packet-contract-guard.py --packet-file /tmp/awoooi-post-reboot-owner-packets.json` 鎖定 `schema_version=awoooi_post_reboot_next_gate_owner_packets_v1`、三個 P0 gate id、`next_gate_count=3`、`p0_gate_count=3`、`request_sent_count=0`、`owner_response_received_count=0`、`owner_response_accepted_count=0`、`runtime_action_authorized_count=0`、`dispatch_authorized=0`、`host_write_authorized=0`、`secret_value_collection_allowed=0`、`runtime_gate_count=0`。此 guard 也驗證 escrow 禁止 password / token / secret value / hash / prefix / suffix / raw credential,188 禁止 `pg_resetwal` / certbot renew / Nginx reload / DB restore / Docker restart / host file write,Wazuh 禁止 raw payload / internal IP / active response / re-enroll / restart / secret patch / host write / Kali active scan,並要求四條 no-false-green 規則存在。輸出必須是 `POST_REBOOT_OWNER_PACKET_CONTRACT_GUARD_OK gates=3 request_sent=0 accepted=0 runtime_gate=0`。 + 2026-06-26 07:39 live quick-check refresh:`scripts/reboot-recovery/post-start-quick-check.sh --no-color` 完整跑完,四主機 ping / SSH 全部 OK,delegated cold-start 為 `PASS=89 WARN=0 BLOCKED=0`,wrapper 總結為 `POST_START_QUICK_CHECK PASS=38 WARN=3 BLOCKED=0`、warning split `SERVICE=0 BOUNDARY=1 EVIDENCE=2`、`RESULT=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`。MOMO health `V10.701`,daily snapshot `109061` rows / `2025-07-01..2026-06-24`,current-month parity `15383|15383|2026-06-01|2026-06-24|2026-06-01|2026-06-24`,latest import job `57 completed`。StockPlatform freshness `status=ok`、latest trading date `2026-06-25`,price / chips / margin / AI recommendations 均為 `2026-06-25`。Backup-status 07:39 顯示 110 `13/13 fresh failed=0`、188 `2/2 fresh failed=0`、`core_blockers=0`、offsite/rclone fresh、`last_backup_all=2026-06-26 02:31:02`、`escrow_missing=5`。Public routes extended list 全部回 expected 2xx/3xx。110 CPU attribution 顯示 load 約 `5.19 / 4.66 / 4.91`,CPU idle 多數樣本 `80%+`,目前負載來自 Gitea / ClickHouse / Docker / Kafka / StockPlatform / AWOOOI API / Sentry 等正常平台工作,不是 orphan Chrome。這一輪 allowed declaration:主機、K3s、服務、網站、產品資料 freshness、備份核心與 offsite freshness 綠;forbidden declaration:DR complete、credential escrow complete、188 host fully green、Wazuh registry recovered。 2026-06-26 07:19 follow-up:`gitea/main` 已包含前一輪 SOP 文件 commit `1fd5e2a8`,ArgoCD `awoooi-prod` 讀回 `Synced / Healthy`,revision `1fd5e2a8b0f18d24eed16aa2a44286bcbf230603`,API `2/2`、Web `2/2`、Worker `1/1`,pods `restart=0`。重跑 full cold-start 仍是 `PASS=87 WARN=0 BLOCKED=0`,result `GREEN`。直接 public route 讀回:AWOOOI API `200`、AWOOOI Web `307`、VibeWork `200`、AwoooGo `200`、MOMO health `200`、Stock freshness `200`、Bitan `200`、Gitea `200`、Harbor `200`、Registry `/v2/` expected `401`、Sentry expected `302`、SigNoz `200`、Langfuse `200`。188 blocker 精準分類:`pg_lsclusters` 顯示 host PostgreSQL `14/main` down,`systemctl status postgresql@14-main` 顯示 `invalid primary checkpoint record` 與 `PANIC: could not locate a valid checkpoint record`;`certbot.service` 顯示 `sentry.wooo.work` renew rate-limited,`snap.certbot.renew.service` 顯示 challenge failed;`awoooi-startup.service` 曾嘗試以 root 執行 `pg_resetwal` 並失敗。本輪不執行 `pg_resetwal`、不 `reset-failed`、不重啟 service;188 需用獨立維護窗口、rollback owner、restore/source-of-truth plan 處理,詳見 `docs/runbooks/HOST-188-HYGIENE-MAINTENANCE-RUNBOOK.md`,並可先跑 `scripts/reboot-recovery/188-host-hygiene-maintenance-checklist.sh --no-color` 取得只讀 preflight。110 load 已降到約 `4.83 / 4.82 / 5.52`,top CPU 是 active AWOOOI Web `turbo build` / Docker buildx;Swap 仍滿但 memory available 約 `41Gi`,本輪不手動清 swap。整體宣告仍是 `FULL_STACK_GREEN_DR_ESCROW_BLOCKED`。 diff --git a/docs/runbooks/REBOOT-POST-START-QUICK-CHECK.md b/docs/runbooks/REBOOT-POST-START-QUICK-CHECK.md index ed5d46c8..599ac875 100644 --- a/docs/runbooks/REBOOT-POST-START-QUICK-CHECK.md +++ b/docs/runbooks/REBOOT-POST-START-QUICK-CHECK.md @@ -1,6 +1,6 @@ # 主機重啟後一頁式總檢查 -> Version: v1.9 +> Version: v1.10 > Last updated: 2026-06-26 Asia/Taipei > Scope: 110 / 120 / 121 / 188 post-reboot service recovery. 112 Kali / Wazuh / active scan 不屬於本流程。 @@ -10,7 +10,7 @@ 每次 110 / 120 / 121 / 188 任一台主機開機、關機、重啟、斷電恢復、VMware console fsck、Docker / K3s 大量重排後,都先跑本頁,再決定是否宣稱恢復。 -最新基準:2026-06-26 08:29 next-gate owner packets。`scripts/reboot-recovery/post-reboot-readiness-summary.sh --no-color` 回傳 `SERVICE_GREEN=1`、`PRODUCT_DATA_GREEN=1`、`BACKUP_CORE_GREEN=1`、`DR_ESCROW_BLOCKED=1`、`ESCROW_MISSING_COUNT=5`、`HOST_188_HYGIENE_BLOCKED=1`、`WAZUH_MANAGER_REGISTRY_ACCEPTED=0`、`RUNTIME_ACTION_AUTHORIZED=0`、`OVERALL_DECLARATION=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`。接著 `scripts/reboot-recovery/post-reboot-next-gate-dispatch.sh --no-color` 將 `NEXT_REQUIRED_GATES=credential_escrow_evidence,host_188_hygiene_maintenance_window,wazuh_manager_registry_export` 展成三個 owner / evidence / forbidden-action checklist;`scripts/reboot-recovery/post-reboot-next-gate-owner-packets.py --no-color` 進一步轉成 `awoooi_post_reboot_next_gate_owner_packets_v1` JSON,固定 `dispatch_authorized=0`、`request_sent_count=0`、`owner_response_accepted_count=0`、`host_write_authorized=0`、`secret_value_collection_allowed=0`、`runtime_gate_count=0`。Cold-start `PASS=89 WARN=0 BLOCKED=0`;MOMO `V10.701`、latest import job `57 completed`、`DB_DAILY_FRESHNESS 1|2026-06-24`;StockPlatform `/api/v1/system/freshness` 為 `status=ok`、`latest_trading_date=2026-06-25`、blockers `[]`;backup-status 110 `13/13 fresh failed=0`、188 `2/2 fresh failed=0`、`core_blockers=0`、`offsite_fresh=1`、`rclone_gdrive_fresh=1`、`last_backup_all=2026-06-26 02:31:02`。DR 仍因 `escrow_missing=5` 不可宣稱 complete。188 host hygiene 與 Wazuh manager registry 仍是 service green 之外的獨立 blocker。 +最新基準:2026-06-26 08:40 next-gate owner packet contract guard。`scripts/reboot-recovery/post-reboot-readiness-summary.sh --no-color` 回傳 `SERVICE_GREEN=1`、`PRODUCT_DATA_GREEN=1`、`BACKUP_CORE_GREEN=1`、`DR_ESCROW_BLOCKED=1`、`ESCROW_MISSING_COUNT=5`、`HOST_188_HYGIENE_BLOCKED=1`、`WAZUH_MANAGER_REGISTRY_ACCEPTED=0`、`RUNTIME_ACTION_AUTHORIZED=0`、`OVERALL_DECLARATION=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`。接著 `scripts/reboot-recovery/post-reboot-next-gate-dispatch.sh --no-color` 將 `NEXT_REQUIRED_GATES=credential_escrow_evidence,host_188_hygiene_maintenance_window,wazuh_manager_registry_export` 展成三個 owner / evidence / forbidden-action checklist;`scripts/reboot-recovery/post-reboot-next-gate-owner-packets.py --no-color` 進一步轉成 `awoooi_post_reboot_next_gate_owner_packets_v1` JSON,固定 `dispatch_authorized=0`、`request_sent_count=0`、`owner_response_accepted_count=0`、`host_write_authorized=0`、`secret_value_collection_allowed=0`、`runtime_gate_count=0`;`scripts/reboot-recovery/post-reboot-owner-packet-contract-guard.py --packet-file /tmp/awoooi-post-reboot-owner-packets.json` 鎖定三個 P0 gate、所有 `0 / false` 邊界、禁用 secret payload / runtime action 與 no-false-green 規則。Cold-start `PASS=89 WARN=0 BLOCKED=0`;MOMO `V10.701`、latest import job `57 completed`、`DB_DAILY_FRESHNESS 1|2026-06-24`;StockPlatform `/api/v1/system/freshness` 為 `status=ok`、`latest_trading_date=2026-06-25`、blockers `[]`;backup-status 110 `13/13 fresh failed=0`、188 `2/2 fresh failed=0`、`core_blockers=0`、`offsite_fresh=1`、`rclone_gdrive_fresh=1`、`last_backup_all=2026-06-26 02:31:02`。DR 仍因 `escrow_missing=5` 不可宣稱 complete。188 host hygiene 與 Wazuh manager registry 仍是 service green 之外的獨立 blocker。 本頁只回答四件事: @@ -76,6 +76,15 @@ scripts/reboot-recovery/post-reboot-next-gate-owner-packets.py --no-color 輸出 JSON 只能作為 intake / review packet,不是 request sent。必須看到 `request_sent_count=0`、`owner_response_accepted_count=0`、`runtime_action_authorized_count=0`,否則視為不合格。 +送入任何 owner review queue 前,必須先把 JSON 存成 artifact 並跑 contract guard: + +```bash +scripts/reboot-recovery/post-reboot-next-gate-owner-packets.py --no-color --output /tmp/awoooi-post-reboot-owner-packets.json +scripts/reboot-recovery/post-reboot-owner-packet-contract-guard.py --packet-file /tmp/awoooi-post-reboot-owner-packets.json +``` + +guard 必須輸出 `POST_REBOOT_OWNER_PACKET_CONTRACT_GUARD_OK gates=3 request_sent=0 accepted=0 runtime_gate=0`。若 gate 數量、P0 gate id、`0 / false` 欄位、禁用 secret payload、188 禁用維修動作、Wazuh 禁用 active response / host write,或 no-false-green 規則任何一項漂移,視為 `BLOCKED`,不得送 owner request、不得寫 escrow marker、不得進維護窗口、不得宣稱 DR / Wazuh / 188 host hygiene 完成。 + 需要展開細節時,再使用 repo-side wrapper: ```bash diff --git a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md index de70be92..cc84ca21 100644 --- a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md +++ b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md @@ -15,7 +15,7 @@ | P0 host / K3s recovery | DONE | 100% | 120 booted after console fsck at `2026-06-12 15:13`; latest 2026-06-26 07:19 readback shows 120 and 121 reachable, K3s active, `mon` and `mon1` both `Ready control-plane`, AWOOOI API/Web replicas split across both nodes, ArgoCD `awoooi-prod Synced / Healthy` at revision `1fd5e2a8b0f18d24eed16aa2a44286bcbf230603`, and `km-vectorize` official 03:00 台北時間 run succeeded with `lastSuccess=2026-06-25T19:00:14Z`. | | P1 backup / alert / escrow | BLOCKED_DR_ESCROW | 97% | 2026-06-26 06:58 backup readback shows 110 `13/13 fresh failed=0`, 188 `2/2 fresh failed=0`, `core_blockers=0`, `integrity_stale=0`, `offsite_fresh=1`, `rclone_gdrive_fresh=1`, `escrow_missing=5`, last aggregate `2026-06-26 02:31:02`。DR remains blocked on real non-secret credential escrow evidence IDs; do not write placeholder markers or paste secret values. | | P2 service / data truth | DONE | 100% | Service routes and core runtime are available, 110 current CPU pressure is attributable to active AWOOOI Web `turbo build` / Docker buildx, and previous orphan Chrome groups remain cleared. 2026-06-26 07:19 StockPlatform `/api/v1/system/freshness` returned `200`; 07:01 freshness payload was `status=ok`, `latest_trading_date=2026-06-25`, blockers `[]`; price / chips / margin / AI recommendations are all on `2026-06-25`. `ai.recommendations` row count is `2868`; `core.margin_short_daily` row count is `1976`. MOMO health `V10.699`, current-month parity `15383|15383|2026-06-01|2026-06-24|2026-06-01|2026-06-24`, and `MOMO_DAILY_FRESHNESS 1|2026-06-24` are green; expanded public routes are green. | -| P3 docs / automation contracts | DONE_WITH_OWNER_PACKET_JSON_V169 | 100% | Workplan, SOP v1.69, machine-readable post-reboot readiness summary, post-reboot next-gate dispatch checklist, owner-packet JSON generator, one-page post-start quick check v1.9, route retry gate, deploy warmup classification, expanded public route list, StockPlatform freshness gate, StockPlatform cron-source recovery evidence, StockPlatform natural schedule green evidence, 110 orphan Chrome recurrence cleanup evidence, 188 fail-closed startup data recovery gate, 188 host hygiene read-only checklist, baseline `stockplatform_system_freshness_ok`, BACKUP-STATUS, LOGBOOK, 120 console/fsck recovery, Gitea backup stale-dump hardening, reboot ledger/version-comparison SOP, escrow evidence audit, 188 nginx Ansible baseline, 110 cold-start detector script, startup judgment layers, GO/NO-GO tree, host recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline checks, host role / load-balancing assessment, CD `known_hosts` guardrail, `fwupd-refresh.timer` rollback note, K3s filesystem event blocker, AWOOOI backup no-direct-offsite-sync contract, 110/188 Ansible source-of-truth, Gitea self-hosted readiness validation workflow, post-CD no-regression readbacks, stale-vs-active K8s failed Job classification, 110 runaway browser / CI load AIOps exporter + alert + gated remediation PlayBook, Telegram / AI event packet mapping, healthy heartbeat Telegram suppression, MOMO scheduler / current-month detector fix, exporter restore helpers, 110 Docker disk pressure cleanup boundary, notification-noise readback, MOMO import-boundary / Drive-auth fail-closed deploys, product version/readback matrix, and stricter product-data / route retry gates are updated. Owner-packet JSON turns `credential_escrow_evidence`、`host_188_hygiene_maintenance_window`、`wazuh_manager_registry_export` into structured review packets while keeping request sent / owner accepted / host write / secret collection / runtime action at `0`. Live 110 script sync remains a separate approved live-write gate; do not claim it here. | +| P3 docs / automation contracts | DONE_WITH_OWNER_PACKET_CONTRACT_GUARD_V170 | 100% | Workplan, SOP v1.70, machine-readable post-reboot readiness summary, post-reboot next-gate dispatch checklist, owner-packet JSON generator, owner-packet contract guard, one-page post-start quick check v1.10, route retry gate, deploy warmup classification, expanded public route list, StockPlatform freshness gate, StockPlatform cron-source recovery evidence, StockPlatform natural schedule green evidence, 110 orphan Chrome recurrence cleanup evidence, 188 fail-closed startup data recovery gate, 188 host hygiene read-only checklist, baseline `stockplatform_system_freshness_ok`, BACKUP-STATUS, LOGBOOK, 120 console/fsck recovery, Gitea backup stale-dump hardening, reboot ledger/version-comparison SOP, escrow evidence audit, 188 nginx Ansible baseline, 110 cold-start detector script, startup judgment layers, GO/NO-GO tree, host recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline checks, host role / load-balancing assessment, CD `known_hosts` guardrail, `fwupd-refresh.timer` rollback note, K3s filesystem event blocker, AWOOOI backup no-direct-offsite-sync contract, 110/188 Ansible source-of-truth, Gitea self-hosted readiness validation workflow, post-CD no-regression readbacks, stale-vs-active K8s failed Job classification, 110 runaway browser / CI load AIOps exporter + alert + gated remediation PlayBook, Telegram / AI event packet mapping, healthy heartbeat Telegram suppression, MOMO scheduler / current-month detector fix, exporter restore helpers, 110 Docker disk pressure cleanup boundary, notification-noise readback, MOMO import-boundary / Drive-auth fail-closed deploys, product version/readback matrix, and stricter product-data / route retry gates are updated. Owner-packet JSON turns `credential_escrow_evidence`、`host_188_hygiene_maintenance_window`、`wazuh_manager_registry_export` into structured review packets while keeping request sent / owner accepted / host write / secret collection / runtime action at `0`; contract guard now rejects packet drift before owner review intake. Live 110 script sync remains a separate approved live-write gate; do not claim it here. | 2026-06-26 07:47 machine-readable summary baseline: `scripts/reboot-recovery/post-reboot-readiness-summary.sh --no-color` stores delegated logs under `/tmp/awoooi-post-reboot-readiness-20260626-074702` and returns `SERVICE_GREEN=1`, `PRODUCT_DATA_GREEN=1`, `BACKUP_CORE_GREEN=1`, `DR_ESCROW_BLOCKED=1`, `ESCROW_MISSING_COUNT=5`, `HOST_188_SERVICE_GREEN=1`, `HOST_188_HYGIENE_BLOCKED=1`, `WAZUH_ROUTE_CODE=200`, `WAZUH_TRANSPORT_COUNT=6`, `WAZUH_MANAGER_REGISTRY_ACCEPTED=0`, `WAZUH_RUNTIME_GATE=0`, `RUNTIME_ACTION_AUTHORIZED=0`, `OVERALL_DECLARATION=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`, and `NEXT_REQUIRED_GATES=credential_escrow_evidence,host_188_hygiene_maintenance_window,wazuh_manager_registry_export`. This is now the preferred first operator/AI-agent entrypoint after reboot because it separates service health from DR, host hygiene, and security registry evidence. @@ -23,6 +23,8 @@ 2026-06-26 08:29 owner-packet JSON baseline: `scripts/reboot-recovery/post-reboot-next-gate-owner-packets.py --no-color` emits `schema_version=awoooi_post_reboot_next_gate_owner_packets_v1` with `next_gate_count=3`, `p0_gate_count=3`, `request_sent_count=0`, `owner_response_received_count=0`, `owner_response_accepted_count=0`, `runtime_action_authorized_count=0`. This packet is for AI / operator / owner review intake only; it does not send request, write credential marker, read secret, or authorize runtime action. +2026-06-26 08:40 owner-packet contract guard baseline: `scripts/reboot-recovery/post-reboot-owner-packet-contract-guard.py --packet-file /tmp/awoooi-post-reboot-owner-packets.json` validates the generated JSON before any owner review intake. It requires exactly three P0 gates, preserves `request_sent=0`、`owner_response_received=0`、`owner_response_accepted=0`、`runtime_action_authorized=0`、`host_write_authorized=0`、`secret_value_collection_allowed=0`、`runtime_gate=0`, and rejects missing forbidden payload/action controls for credential escrow, 188 host hygiene, and Wazuh registry export. Expected success line: `POST_REBOOT_OWNER_PACKET_CONTRACT_GUARD_OK gates=3 request_sent=0 accepted=0 runtime_gate=0`. + 2026-06-26 07:39 live quick-check refresh supersedes the 07:19 row for current operator status. `scripts/reboot-recovery/post-start-quick-check.sh --no-color` returned `POST_START_QUICK_CHECK PASS=38 WARN=3 BLOCKED=0`, warning split `SERVICE=0 BOUNDARY=1 EVIDENCE=2`, result `FULL_STACK_GREEN_DR_ESCROW_BLOCKED`. Delegated cold-start returned `PASS=89 WARN=0 BLOCKED=0`; four reboot-scope hosts ping/SSH were OK; AWOOOI / VibeWork / AwoooGo / 2026FIFA / Agent Bounty / MOMO / Stock / Bitan / TsenYang / VTuber / Gitea / Harbor / Registry / Sentry / SigNoz / Langfuse / AIOps routes returned expected 2xx/3xx. MOMO `V10.701` has job `57 completed`, daily freshness `1|2026-06-24`, and current-month parity `15383|15383|2026-06-01|2026-06-24|2026-06-01|2026-06-24`. StockPlatform freshness is `ok` through `2026-06-25` with price / chips / margin / AI recommendations current. Backup core remains green: 110 `13/13 fresh failed=0`, 188 `2/2 fresh failed=0`, `core_blockers=0`, offsite/rclone fresh, `last_backup_all=2026-06-26 02:31:02`; DR still has `escrow_missing=5`. 110 load around `5.19 / 4.66 / 4.91` is attributable to normal platform processes, not orphan Chrome. 188 host hygiene remains blocked by failed host PostgreSQL / certbot / startup units and must use the dedicated maintenance runbook and read-only checklist. 2026-06-25 19:06 post-CD wrapper readback supersedes the 18:53 wording: consecutive main pushes created a deploy storm where older deploy markers were superseded by later commits. Latest production truth is deploy marker `d8ca8224 chore(cd): deploy 9dbe044 [skip ci]`, ArgoCD `Synced / Healthy`, API/Web/Worker image tag `9dbe044ea1e8e3894ccbeb5ed760bb124b87f7be`, direct route smoke 200 for AWOOOI API / IwoooS / VibeWork / AwoooGo / MOMO health / Stock / Bitan and expected route-gate statuses for MOMO / Gitea / Harbor / Registry / Sentry / SigNoz / Langfuse / AIOps, and wrapper `POST_START_QUICK_CHECK PASS=18 WARN=3 BLOCKED=0`. Repo-side cold-start returns `PASS=89 WARN=0 BLOCKED=0`; `/backup/scripts/backup-status.sh --no-notify --no-refresh` reports 110 `13/13 fresh failed=0`, 188 `2/2 fresh failed=0`, `core_blockers=0`, `integrity_stale=0`, `offsite_fresh=1`, `rclone_gdrive_fresh=1`, `escrow_missing=5`; MOMO dedicated preflight returns `PASS=19 WARN=2 BLOCKED=0`; MOMO health is `V10.690`; AwoooGo / Stock transient 502 reads cleared after upstream warmup and five consecutive route reads returned `200`; 110 load is around `14.51 / 12.34 / 11.42`, with Gitea Actions cache save / `zstdmt` / `tar`, StockPlatform headless Chrome smoke / CI, Gitea, AWOOOI API, ClickHouse, Docker, and platform services visible, not an AWOOOI service blocker. Wrapper result is `FULL_STACK_GREEN_DR_ESCROW_BLOCKED`, not `DEGRADED`, because service warnings are `0` and only DR boundary / evidence warnings remain. Wazuh route readback is now `200 disabled_waiting_iwooos_wazuh_owner_gate`, but manager registry accepted remains `0`, so Wazuh is a security registry evidence blocker rather than a reboot service blocker. diff --git a/scripts/reboot-recovery/post-reboot-owner-packet-contract-guard.py b/scripts/reboot-recovery/post-reboot-owner-packet-contract-guard.py new file mode 100755 index 00000000..31f21afc --- /dev/null +++ b/scripts/reboot-recovery/post-reboot-owner-packet-contract-guard.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python3 +"""Validate post-reboot owner packet JSON stays fail-closed. + +Read-only by design. The guard validates an owner packet artifact produced by +post-reboot-next-gate-owner-packets.py, or runs that generator when no packet +file is provided. It never sends requests, reads secrets, writes markers, or +modifies host/runtime state. +""" + +from __future__ import annotations + +import argparse +import json +import subprocess +import sys +from pathlib import Path +from typing import Any + + +ROOT = Path(__file__).resolve().parents[2] +OWNER_PACKET_GENERATOR = ( + ROOT / "scripts" / "reboot-recovery" / "post-reboot-next-gate-owner-packets.py" +) + +EXPECTED_SCHEMA = "awoooi_post_reboot_next_gate_owner_packets_v1" +EXPECTED_GATES = { + "credential_escrow_evidence", + "host_188_hygiene_maintenance_window", + "wazuh_manager_registry_export", +} +EXPECTED_NO_FALSE_GREEN_RULES = { + "service_green_does_not_equal_dr_complete", + "backup_fresh_does_not_equal_credential_escrow_complete", + "host_188_service_green_does_not_equal_host_hygiene_green", + "wazuh_route_or_transport_does_not_equal_manager_registry_accepted", +} + +GLOBAL_ZERO_FIELDS = { + ("status", "runtime_action_authorized"): 0, + ("status", "dispatch_authorized"): 0, + ("status", "request_sent_count"): 0, + ("status", "owner_response_received_count"): 0, + ("status", "owner_response_accepted_count"): 0, + ("status", "host_write_authorized"): 0, + ("status", "secret_value_collection_allowed"): 0, + ("status", "runtime_gate_count"): 0, + ("counts", "request_sent_count"): 0, + ("counts", "owner_response_received_count"): 0, + ("counts", "owner_response_accepted_count"): 0, + ("counts", "runtime_action_authorized_count"): 0, +} + +GATE_REQUIRED_FORBIDDEN_PAYLOADS = { + "credential_escrow_evidence": { + "password", + "token", + "secret_value", + "hash", + "prefix", + "suffix", + "raw_credential", + }, + "wazuh_manager_registry_export": { + "agent_real_name", + "internal_ip", + "client_keys", + "raw_wazuh_payload", + "token", + "password", + "authorization_header", + }, +} + +GATE_REQUIRED_FORBIDDEN_ACTIONS = { + "credential_escrow_evidence": { + "mark_placeholder", + "write_fake_marker", + "store_secret", + "disable_alert", + }, + "host_188_hygiene_maintenance_window": { + "pg_resetwal", + "certbot_renew", + "nginx_reload", + "db_restore", + "docker_restart", + "host_file_write", + }, + "wazuh_manager_registry_export": { + "active_response", + "agent_reenroll", + "wazuh_restart", + "secret_patch", + "host_write", + "kali_active_scan", + }, +} + +PACKET_FALSE_FIELDS = ( + "request_sent", + "response_received", + "response_accepted", + "runtime_action_authorized", +) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Validate post-reboot owner packet JSON contract.", + ) + parser.add_argument( + "--packet-file", + type=Path, + help="Validate an existing owner packet JSON file.", + ) + parser.add_argument( + "--no-color", + action="store_true", + help="Pass --no-color when running the owner packet generator.", + ) + return parser.parse_args() + + +def run_owner_packet_generator(no_color: bool) -> dict[str, Any]: + cmd = [str(OWNER_PACKET_GENERATOR)] + if no_color: + cmd.append("--no-color") + completed = subprocess.run( + cmd, + cwd=ROOT, + check=False, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + if completed.returncode != 0: + raise SystemExit( + "POST_REBOOT_OWNER_PACKET_CONTRACT_GUARD_FAILED " + f"generator_rc={completed.returncode}\n{completed.stdout}" + ) + try: + return json.loads(completed.stdout) + except json.JSONDecodeError as exc: + raise SystemExit( + "POST_REBOOT_OWNER_PACKET_CONTRACT_GUARD_FAILED " + f"generator_json_invalid={exc}" + ) from exc + + +def load_packet(args: argparse.Namespace) -> dict[str, Any]: + if args.packet_file: + try: + return json.loads(args.packet_file.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + raise SystemExit( + "POST_REBOOT_OWNER_PACKET_CONTRACT_GUARD_FAILED " + f"packet_json_invalid={exc}" + ) from exc + return run_owner_packet_generator(no_color=args.no_color) + + +def as_list(value: Any) -> list[Any]: + if value is None: + return [] + if isinstance(value, list): + return value + return [value] + + +def get_nested(packet: dict[str, Any], path: tuple[str, str]) -> Any: + parent = packet.get(path[0], {}) + if not isinstance(parent, dict): + return None + return parent.get(path[1]) + + +def validate_packet(packet: dict[str, Any]) -> list[str]: + failures: list[str] = [] + + if packet.get("schema_version") != EXPECTED_SCHEMA: + failures.append(f"schema_version={packet.get('schema_version')!r}") + + owner_packets = as_list(packet.get("owner_packets")) + counts = packet.get("counts", {}) + if not isinstance(counts, dict): + failures.append("counts_not_object") + counts = {} + + gate_ids = {str(item.get("packet_id", "")) for item in owner_packets if isinstance(item, dict)} + if gate_ids != EXPECTED_GATES: + failures.append(f"gate_ids={sorted(gate_ids)}") + + expected_counts = { + "next_gate_count": 3, + "p0_gate_count": 3, + } + for key, expected in expected_counts.items(): + if counts.get(key) != expected: + failures.append(f"{key}={counts.get(key)!r}") + + for path, expected in GLOBAL_ZERO_FIELDS.items(): + actual = get_nested(packet, path) + if actual != expected: + failures.append(f"{'.'.join(path)}={actual!r}") + + no_false_green_rules = set( + str(item) for item in as_list(packet.get("no_false_green_rules")) + ) + missing_rules = sorted(EXPECTED_NO_FALSE_GREEN_RULES - no_false_green_rules) + if missing_rules: + failures.append(f"missing_no_false_green_rules={missing_rules}") + + for raw_packet in owner_packets: + if not isinstance(raw_packet, dict): + failures.append("owner_packet_not_object") + continue + packet_id = str(raw_packet.get("packet_id", "")) + if raw_packet.get("priority") != "P0": + failures.append(f"{packet_id}.priority={raw_packet.get('priority')!r}") + + for key in PACKET_FALSE_FIELDS: + if raw_packet.get(key) is not False: + failures.append(f"{packet_id}.{key}={raw_packet.get(key)!r}") + + required_payloads = GATE_REQUIRED_FORBIDDEN_PAYLOADS.get(packet_id, set()) + actual_payloads = set(str(item) for item in as_list(raw_packet.get("forbidden_payloads"))) + missing_payloads = sorted(required_payloads - actual_payloads) + if missing_payloads: + failures.append(f"{packet_id}.missing_forbidden_payloads={missing_payloads}") + + required_actions = GATE_REQUIRED_FORBIDDEN_ACTIONS.get(packet_id, set()) + actual_actions = set(str(item) for item in as_list(raw_packet.get("forbidden_actions"))) + missing_actions = sorted(required_actions - actual_actions) + if missing_actions: + failures.append(f"{packet_id}.missing_forbidden_actions={missing_actions}") + + return failures + + +def main() -> int: + args = parse_args() + packet = load_packet(args) + failures = validate_packet(packet) + if failures: + print( + "POST_REBOOT_OWNER_PACKET_CONTRACT_GUARD_FAILED " + + " ".join(failures) + ) + return 1 + + counts = packet.get("counts", {}) + status = packet.get("status", {}) + print( + "POST_REBOOT_OWNER_PACKET_CONTRACT_GUARD_OK " + f"gates={counts.get('next_gate_count')} " + f"request_sent={counts.get('request_sent_count')} " + f"accepted={counts.get('owner_response_accepted_count')} " + f"runtime_gate={status.get('runtime_gate_count')}" + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main())