From 8fdcc0194f5a5a82e75d865d622ea3aff1a901f8 Mon Sep 17 00:00:00 2001 From: ogt Date: Sat, 27 Jun 2026 03:06:42 +0800 Subject: [PATCH] fix(ops): recover backup core after reboot [skip ci] --- docs/LOGBOOK.md | 42 ++++++- docs/runbooks/BACKUP-STATUS.md | 26 +++++ docs/runbooks/FULL-STACK-COLD-START-SOP.md | 6 +- .../runbooks/REBOOT-POST-START-QUICK-CHECK.md | 4 +- ...oot-cold-start-backup-recovery-workplan.md | 6 +- .../playbooks/188-momo-backup-user.yml | 8 +- .../ops/recovery-scorecard-contract-check.py | 105 +++++++++++++++++- 7 files changed, 184 insertions(+), 13 deletions(-) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index d5e094b8..db192572 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,44 @@ +## 2026-06-27|00:58 reboot SOP 實際修復:188 MOMO backup core 假紅收斂 + +**時間與來源**: +- 2026-06-27 00:11-00:58 Asia/Taipei。 +- 來源:`dr-offsite-operator-checklist.sh --check --no-color`、`recovery-scorecard-contract-check.py`、188 `ollama` crontab / textfile exporter、110 `/backup/scripts/backup-status.sh --no-notify --no-refresh`、`post-start-quick-check.sh --no-color`、`post-reboot-readiness-summary.sh --no-color`、Prometheus recovery recording rules。 + +**實際問題**: +- `dr-offsite-operator-checklist.sh` 原本會因 `scripts/ops/recovery-scorecard-contract-check.py` 直接 `import yaml` 而在 lean Python 環境中中斷,錯誤是 `ModuleNotFoundError: No module named 'yaml'`。 +- 00:16 post-reboot summary 進一步顯示 `SERVICE_GREEN=0`、`BACKUP_CORE_GREEN=0`、`POST_START_BLOCKED=2`。根因不是備份資料缺失,而是 188 `momo_pg_daily` 備份 fresh、cron 存在,但 exporter 仍判 `awoooi_backup_job_configured{host="188",job="momo_pg_daily"} 0`,導致 110 backup-status 回 `core_blockers=1`、`configured_missing_188=1`。 + +**修復內容**: +- `scripts/ops/recovery-scorecard-contract-check.py` 已改成 PyYAML optional;若沒有 PyYAML,使用標準 Python fallback 解析 recovery recording rules 與 baseline `monitoring_contract.prometheus_recording_rules`。 +- 188 上已做最小可逆 host 寫入:先備份 `ollama` crontab 到 
`/home/ollama/momo_backups/crontab-before-momo-pg-host-owned-20260627-001925.txt`,再把 `AWOOOI momo PostgreSQL daily backup` 收斂到 host-owned `/home/ollama/bin/momo-pg-backup.sh`。沒有重啟 Docker / systemd / Nginx / firewall / K3s / DB。 +- 188 textfile exporter 已手動刷新,讀回 `awoooi_backup_job_configured{host="188",job="momo_pg_daily"} 1`。 +- repo source-of-truth `infra/ansible/playbooks/188-momo-backup-user.yml` 已同步改用 host-owned `/home/ollama/bin/momo-pg-backup.sh`,避免未來再把 crontab 改回 app-side path。 + +**驗證結果**: +- `python3 scripts/ops/recovery-scorecard-contract-check.py`:`RECOVERY_SCORECARD_CONTRACT_OK`。 +- `python3 scripts/ops/recovery-scorecard-contract-check.py --prometheus-url http://192.168.0.110:9090 --expect-core-ready`:`awoooi_recovery_core_ready=1`、`awoooi_recovery_dr_offsite_ready=0`,core ready 已恢復,DR 因 escrow 仍正確為 0。 +- `python3 scripts/ops/recovery-scorecard-contract-check.py --prometheus-url http://192.168.0.110:9090 --expect-core-ready --expect-dr-ready`:正確失敗,原因 `expected DR offsite ready, got 0.0`。 +- 110 backup-status 00:56:`110備份=13/13 fresh failed=0`、`188備份=2/2 fresh failed=0`、`core_blockers=0`、`configured_missing_188=0`、`integrity_stale=0`、`offsite_fresh=1`、`rclone_gdrive_fresh=1`、`escrow_missing=5`。 +- `post-start-quick-check.sh` 00:57:`POST_START_QUICK_CHECK PASS=38 WARN=3 BLOCKED=0`、`SERVICE=0`、`RESULT=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`。 +- `post-reboot-readiness-summary.sh` 00:58 artifact `/tmp/awoooi-post-reboot-readiness-20260627-005728/summary.txt`:`POST_START_RESULT=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`、`SERVICE_GREEN=1`、`PRODUCT_DATA_GREEN=1`、`BACKUP_CORE_GREEN=1`、`HOST_188_HYGIENE_BLOCKED=0`、`ESCROW_MISSING_COUNT=5`、`WAZUH_MANAGER_REGISTRY_ACCEPTED=0`。 +- 02:42 live revalidation artifact 
`/tmp/awoooi-post-reboot-readiness-20260627-024151/summary.txt`:`POST_START_RESULT=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`、`POST_START_PASS=38`、`POST_START_WARN=3`、`POST_START_BLOCKED=0`、`SERVICE_GREEN=1`、`PRODUCT_DATA_GREEN=1`、`STOCK_FRESHNESS_STATUS=ok`、`STOCK_LATEST_TRADING_DATE=2026-06-26`、`BACKUP_CORE_GREEN=1`、`HOST_188_HYGIENE_BLOCKED=0`、`ESCROW_MISSING_COUNT=5`、`WAZUH_MANAGER_REGISTRY_ACCEPTED=0`、`RUNTIME_ACTION_AUTHORIZED=0`。 +- 02:41 DR checklist:`CORE_COLD_START_GREEN=1`、`RECOVERY_STATE=CORE_READY_DR_OFFSITE_PENDING`,Prometheus contract `awoooi_recovery_core_ready=1`、`awoooi_recovery_dr_offsite_ready=0`。 + +**做過的命令類型**: +- 只讀:scorecard / DR checklist / backup-status / post-start / post-reboot summary / Prometheus readback / route and process evidence。 +- 寫入:repo script / Ansible playbook / runbook / workplan / LOGBOOK;188 `ollama` crontab 單一備份排程路徑修正與 exporter 手動刷新。 +- 未做:沒有讀或保存 secret、沒有 credential marker write、沒有 backup restore / prune / remote delete、沒有 Docker/systemd/Nginx/firewall/K8s/DB/Wazuh restart 或 active response、沒有 Kali active scan。 + +**目前判定**: +- 主機 / K3s / public routes / AWOOOI / MOMO / Stock / backup core / 188 hygiene:`GREEN`。 +- Prometheus recovery core:`awoooi_recovery_core_ready=1`。 +- Overall recovery declaration:`FULL_STACK_GREEN_DR_ESCROW_BLOCKED`。 + +**仍 blocked / 不得宣稱**: +- DR credential escrow evidence 仍缺 `5`:`restic_repository_password`、`offsite_provider_credentials`、`break_glass_admin_credentials`、`dns_registrar_recovery`、`oauth_ai_provider_recovery`;不得宣稱 `DR_COMPLETE`。 +- Wazuh manager registry accepted 仍為 `0`;不得宣稱 Wazuh 全主機納管恢復。 +- Runtime action / host write 擴大授權 / Wazuh active response / Kali active scan 仍全部 `0 / false`。 + ## 2026-06-26|D1I 最新正式基線同步:Delivery workbench、controlled apply、Wazuh metadata gate smoke **背景**:D1H 後,平行 delivery workbench release 與 Wazuh live metadata gate 繼續推進;為避免正式環境再次落後 main,本段只做最新 `gitea/main` fast-forward、正式 API / Browser smoke 與證據補帳,不新增 runtime 執行權限。 @@ -72,7 +113,6 @@ - 
負責人回覆接受、機密來源中繼資料接受、唯讀範圍接受、啟用後讀回、Wazuh 即時查詢、主動回應、主機寫入、Kali 主動掃描、Telegram 實發、機密收集、執行期閘門:仍全部 `0 / false`。 **下一個 P0**:取得正式負責人回覆封包:即時中繼資料負責人、機密注入負責人、機密來源中繼資料參照、Wazuh 管理節點健康參照、TLS 驗證參照、唯讀帳號範圍參照、agent 別名映射政策、啟用後讀回指令、回滾負責人、維護窗口、驗證計畫,以及不提供機密明文 / 不提供原始載荷聲明。驗收前不得啟用 Wazuh 即時中繼資料環境變數、不得查 live Wazuh API、不得重啟 Wazuh / Docker / Nginx / firewall、不得重新註冊 agent、不得啟用主動回應。 - ## 2026-06-26|D1G IwoooS Wazuh live route 紅燈前移:Runtime board 與正式站讀回完成 **背景**:正式站已確認 `/api/iwooos/wazuh` 不是 registry empty,而是 `disabled_waiting_iwooos_wazuh_owner_gate`;過去這個狀態只在頁面下方 Wazuh 卡片可見,容易讓 Runtime 資安總板看起來像只剩靜態 snapshot。此段把 Wazuh 只讀路由的公開安全 aggregate 狀態接進 Runtime 資安讀回首屏,讓 disabled、misconfigured、empty、below expected、unavailable 都成為 P0 紅燈。 diff --git a/docs/runbooks/BACKUP-STATUS.md b/docs/runbooks/BACKUP-STATUS.md index 6991c01a..c883d621 100644 --- a/docs/runbooks/BACKUP-STATUS.md +++ b/docs/runbooks/BACKUP-STATUS.md @@ -25,9 +25,35 @@ > 2026-06-25 20:25 Codex 110 CPU cleanup: two orphan StockPlatform headless Chrome process groups were cleared by targeted approved `SIGTERM`; no Docker/systemd/Nginx/K8s/DB/backup write occurred. Backup/offsite remains green, DR still blocked by `escrow_missing=5`, and Stock freshness remains the only hard product-data blocker. > 2026-06-25 21:14 Codex full wrapper refresh: StockPlatform 21:00 `intelligence-sync` and 21:10 AI pipeline naturally caught up; `/api/v1/system/freshness` is `status=ok` with blockers `[]`. Backup/offsite remains 110 `13/13` and 188 `2/2` fresh, `core_blockers=0`, `offsite_fresh=1`, `rclone_gdrive_fresh=1`; full-stack service/data result is `FULL_STACK_GREEN_DR_ESCROW_BLOCKED`, with only `escrow_missing=5` blocking DR complete. > 2026-06-26 06:28 Codex隔日 backup readback: 110 `13/13 fresh failed=0`, 188 `2/2 fresh failed=0`, `core_blockers=0`, `integrity_stale=0`, `offsite_fresh=1`, `rclone_gdrive_fresh=1`, `last_backup_all=2026-06-26 02:31:02`, `escrow_missing=5`; full-stack service/data result remains `FULL_STACK_GREEN_DR_ESCROW_BLOCKED`. 
+> 2026-06-27 00:56 Codex backup core recovery: 188 `momo_pg_daily` was fresh but temporarily false-blocked by cron/config drift (`configured_missing_188=1`). 188 crontab was backed up to `/home/ollama/momo_backups/crontab-before-momo-pg-host-owned-20260627-001925.txt`, the daily MOMO PostgreSQL backup entry was restored to host-owned `/home/ollama/bin/momo-pg-backup.sh`, and the exporter now reports `awoooi_backup_job_configured{host="188",job="momo_pg_daily"} 1`. `backup-status` now reports 110 `13/13 fresh failed=0`, 188 `2/2 fresh failed=0`, `core_blockers=0`, `configured_missing_188=0`, `integrity_stale=0`, `offsite_fresh=1`, `rclone_gdrive_fresh=1`, `escrow_missing=5`; DR still blocked only by credential escrow evidence. +> 2026-06-27 02:42 Codex post-reboot revalidation: `post-reboot-readiness-summary.sh` remains `FULL_STACK_GREEN_DR_ESCROW_BLOCKED` with `SERVICE_GREEN=1`, `PRODUCT_DATA_GREEN=1`, `BACKUP_CORE_GREEN=1`, `HOST_188_HYGIENE_BLOCKED=0`, `STOCK_FRESHNESS_STATUS=ok`, and `ESCROW_MISSING_COUNT=5`. `dr-offsite-operator-checklist.sh --check` confirms `CORE_COLD_START_GREEN=1`, `RECOVERY_STATE=CORE_READY_DR_OFFSITE_PENDING`, live Prometheus `awoooi_recovery_core_ready=1`, and `awoooi_recovery_dr_offsite_ready=0`. --- +## 2026-06-27 00:56 Backup / Offsite / Escrow Live Status + +Read-only and minimal-write evidence sources: 00:56 `/backup/scripts/backup-status.sh --no-notify --no-refresh` from 110, 188 crontab backup / controlled MOMO backup path correction, 188 textfile exporter refresh, post-start quick check at 00:57, and Prometheus recovery recording-rule readback. 
+ +- 110 backup health: `13/13 fresh failed=0`。 +- 188 backup health: `2/2 fresh failed=0`。 +- Integrity / configured blockers: `core_blockers=0`、`configured_missing_110=0`、`configured_missing_188=0`、`script_missing_110=0`、`script_missing_188=0`、`integrity_stale=0`。 +- 188 MOMO backup config drift fix: crontab rollback file `/home/ollama/momo_backups/crontab-before-momo-pg-host-owned-20260627-001925.txt`; active cron now uses `/home/ollama/bin/momo-pg-backup.sh`; exporter reports `awoooi_backup_job_configured{host="188",job="momo_pg_daily"} 1`。 +- Offsite / GDrive freshness: `offsite_configured=1`、`offsite_fresh=1`、`rclone_gdrive_configured=1`、`rclone_gdrive_fresh=1`。 +- Last aggregate backup: `2026-06-26 02:31:02`。 +- Prometheus recovery rules: `awoooi_recovery_core_ready=1`、`awoooi_recovery_dr_offsite_ready=0`。 +- DR blocker remains: `escrow_missing=5`,不得偽造 evidence marker,也不得貼 secret value / hash / partial token。 +- Full-stack service state: `FULL_STACK_GREEN_DR_ESCROW_BLOCKED`。Post-start quick check `PASS=38 WARN=3 BLOCKED=0`;StockPlatform freshness `status=ok`;MOMO daily freshness `2|2026-06-24`。 + +| Gate | Status | Evidence | +|------|--------|----------| +| 110 backup freshness | VERIFIED | 13/13 fresh, failed count 0. | +| 188 backup freshness | VERIFIED | 2/2 fresh, failed count 0. | +| 188 MOMO backup cron/config | VERIFIED | Active crontab uses `/home/ollama/bin/momo-pg-backup.sh`; `configured_missing_188=0`. | +| Offsite / GDrive freshness | VERIFIED | `offsite_fresh=1`, `rclone_gdrive_fresh=1`. | +| Backup core blockers | GREEN | `core_blockers=0`; Prometheus `awoooi_recovery_core_ready=1`. | +| Full-stack service state | FULL_STACK_GREEN_DR_ESCROW_BLOCKED | `POST_START_QUICK_CHECK PASS=38 WARN=3 BLOCKED=0`; service/data/backup core green. | +| Credential escrow | BLOCKED | `escrow_missing=5`; only real non-secret owner evidence may close this. 
| + ## 2026-06-26 06:28 Backup / Offsite / Escrow Live Status Read-only evidence sources: 06:26 / 06:28 `post-start-quick-check.sh`, delegated `/backup/scripts/backup-status.sh --no-notify --no-refresh`, route-only wrapper retry validation, and direct StockPlatform / MOMO freshness readback. diff --git a/docs/runbooks/FULL-STACK-COLD-START-SOP.md b/docs/runbooks/FULL-STACK-COLD-START-SOP.md index 14ff7a17..46e91dcd 100644 --- a/docs/runbooks/FULL-STACK-COLD-START-SOP.md +++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md @@ -1,6 +1,6 @@ # AWOOOI 全棧冷啟動與主機重啟 SOP -> Version: v1.77 +> Version: v1.78 > Last updated: 2026-06-27 Asia/Taipei > Scope: 110 / 120 / 121 / 188 full-stack reboot recovery. 112 Kali is recorded as P3 optional and is not part of this recovery path. @@ -14,6 +14,10 @@ v1.76 owner gate replay rule:同一輪 summary 產生後,owner packet 與 owner response preflight 必須優先使用 `--summary-file "$ARTIFACT_DIR/summary.txt"`,例如 `scripts/reboot-recovery/post-reboot-next-gate-owner-packets.py --no-color --summary-file "$ARTIFACT_DIR/summary.txt" --output /tmp/awoooi-post-reboot-owner-packets.json` 與 `scripts/reboot-recovery/post-reboot-owner-response-preflight.py --no-color --summary-file "$ARTIFACT_DIR/summary.txt" --response-file `。只有在刻意要重新取 live evidence 時,才允許省略 `--summary-file`;否則 preflight 不得自己重跑 summary 造成同一輪狀態漂移。 +2026-06-27 02:42 最新 live revalidation:`scripts/reboot-recovery/post-reboot-readiness-summary.sh --no-color` artifact `/tmp/awoooi-post-reboot-readiness-20260627-024151/summary.txt` 回傳 `POST_START_RESULT=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`、`POST_START_PASS=38`、`POST_START_WARN=3`、`POST_START_BLOCKED=0`、`SERVICE_GREEN=1`、`PRODUCT_DATA_GREEN=1`、`STOCK_FRESHNESS_STATUS=ok`、`STOCK_LATEST_TRADING_DATE=2026-06-26`、`STOCK_BLOCKERS=none`、`BACKUP_CORE_GREEN=1`、`HOST_188_HYGIENE_BLOCKED=0`、`WAZUH_MANAGER_REGISTRY_ACCEPTED=0`、`RUNTIME_ACTION_AUTHORIZED=0`。同輪 DR checklist 回 `CORE_COLD_START_GREEN=1`、`RECOVERY_STATE=CORE_READY_DR_OFFSITE_PENDING`、Prometheus contract 
`awoooi_recovery_core_ready=1`、`awoooi_recovery_dr_offsite_ready=0`。因此目前服務 / 資料 / 備份核心可宣稱恢復;DR complete 仍因 `ESCROW_MISSING_COUNT=5` 禁止宣稱,Wazuh 全主機納管仍因 manager registry accepted `0` 禁止宣稱。 + +2026-06-27 00:58 最新 live summary:本輪先修復兩個實際 SOP blocker。第一,`scripts/ops/recovery-scorecard-contract-check.py` 已改成 PyYAML optional,標準 Python 環境也能驗證 recovery recording-rule contract,不會因 `ModuleNotFoundError: yaml` 中斷 DR/offsite checklist。第二,188 `ollama` crontab 已備份到 `/home/ollama/momo_backups/crontab-before-momo-pg-host-owned-20260627-001925.txt`,並把 `AWOOOI momo PostgreSQL daily backup` 從 app-side `/home/ollama/momo-pro/scripts/pg_backup.sh` 收斂回 host-owned `/home/ollama/bin/momo-pg-backup.sh`;刷新 188 textfile exporter 後 `awoooi_backup_job_configured{host="188",job="momo_pg_daily"} 1`。00:58 `scripts/reboot-recovery/post-reboot-readiness-summary.sh --no-color` artifact `/tmp/awoooi-post-reboot-readiness-20260627-005728/summary.txt` 回傳 `POST_START_RESULT=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`、`POST_START_PASS=38`、`POST_START_WARN=3`、`POST_START_BLOCKED=0`、`SERVICE_GREEN=1`、`PRODUCT_DATA_GREEN=1`、`BACKUP_CORE_GREEN=1`、`ESCROW_MISSING_COUNT=5`、`HOST_188_HYGIENE_BLOCKED=0`、`WAZUH_MANAGER_REGISTRY_ACCEPTED=0`、`RUNTIME_ACTION_AUTHORIZED=0`。同輪 `backup-status` 回 `core_blockers=0`、`configured_missing_188=0`;Prometheus live contract 回 `awoooi_recovery_core_ready=1`、`awoooi_recovery_dr_offsite_ready=0`,表示主機 / K3s / public routes / product data / backup core 已恢復,DR 仍只因 credential escrow 缺 5 個 non-secret evidence marker blocked,Wazuh 全主機 registry accepted 仍為 0。 + 2026-06-27 00:02 最新 live summary:`scripts/reboot-recovery/post-reboot-readiness-summary.sh --no-color` 回傳 
`POST_START_RESULT=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`、`POST_START_PASS=38`、`POST_START_WARN=4`、`POST_START_BLOCKED=0`、`SERVICE_GREEN=1`、`PRODUCT_DATA_GREEN=1`、`STOCK_FRESHNESS_STATUS=ok`、`STOCK_LATEST_TRADING_DATE=2026-06-26`、`STOCK_BLOCKERS=none`、`BACKUP_CORE_GREEN=1`、`ESCROW_MISSING_COUNT=5`、`HOST_188_HYGIENE_BLOCKED=0`、`WAZUH_MANAGER_REGISTRY_ACCEPTED=0`、`RUNTIME_ACTION_AUTHORIZED=0`。同一輪 production route smoke 回傳:IwoooS `200`、Wazuh read-only routes `200`、VibeWork `200`、AwoooGo `200`、MOMO health `200`、Stock `200`;AWOOOI API health `healthy / prod / mock_mode=false`,PostgreSQL / Redis / OpenClaw / SigNoz / GCP Ollama provider up,local Ollama endpoint 仍為 cooldown / degraded,由 provider fallback 承接,不是網站或 API service blocker。最新 deploy marker 為 `e506b9d5 chore(cd): deploy fe74d86 [skip ci]`;本輪 `89b9e67a` 是 SOP / scripts / docs source update,不是 runtime bundle deploy marker。112 Wazuh 與 120 K3s 的 23:56 脫敏 readback 仍作為本輪相鄰 evidence:120 ArgoCD `Synced / Healthy`、Pod 均 `Running` 或 `Completed`;Wazuh manager registry 並非全空,但 `WAZUH_MANAGER_REGISTRY_ACCEPTED=0` 維持,不能宣稱全主機納管恢復。 2026-06-26 23:56 live summary retained for comparison:`scripts/reboot-recovery/post-reboot-readiness-summary.sh --no-color` 回傳 `POST_START_RESULT=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`、`POST_START_PASS=38`、`POST_START_WARN=3`、`POST_START_BLOCKED=0`、`SERVICE_GREEN=1`、`PRODUCT_DATA_GREEN=1`、`STOCK_FRESHNESS_STATUS=ok`、`STOCK_LATEST_TRADING_DATE=2026-06-26`、`STOCK_BLOCKERS=none`、`BACKUP_CORE_GREEN=1`、`ESCROW_MISSING_COUNT=5`、`HOST_188_HYGIENE_BLOCKED=0`、`WAZUH_MANAGER_REGISTRY_ACCEPTED=0`、`RUNTIME_ACTION_AUTHORIZED=0`。同一時段只讀補查 120:ArgoCD `awoooi-prod` 為 `Synced / Healthy`,`awoooi-prod` Pod 均為 `Running` 或 `Completed`;歷史 `km-vectorize-29689620` failed Job 已被 2026-06-23、2026-06-24、2026-06-25 後續成功 Job 覆蓋,不是目前服務 blocker。同一時段只讀補查 112:systemd `running`,Wazuh manager / indexer / dashboard `active`,manager API root 回 `401`,Dashboard unauthenticated check endpoints 回 `401`,manager registry 脫敏讀回為 local manager `1`、受管 
agent `5`、active managed `5`、disconnected `0`、never connected `0`。此證據證明 registry 不再是「全空」,但仍不能宣稱 Wazuh 全主機納管恢復,因為 SOP expected scope 仍是 6、Dashboard API connection / version 尚未以登入或 owner evidence 驗收,owner response accepted 仍為 `0`。 diff --git a/docs/runbooks/REBOOT-POST-START-QUICK-CHECK.md b/docs/runbooks/REBOOT-POST-START-QUICK-CHECK.md index 5c2b1ad2..d45c62e3 100644 --- a/docs/runbooks/REBOOT-POST-START-QUICK-CHECK.md +++ b/docs/runbooks/REBOOT-POST-START-QUICK-CHECK.md @@ -1,6 +1,6 @@ # 主機重啟後一頁式總檢查 -> Version: v1.17 +> Version: v1.18 > Last updated: 2026-06-27 Asia/Taipei > Scope: 110 / 120 / 121 / 188 post-reboot service recovery. 112 Kali / Wazuh / active scan 不屬於本流程。 @@ -10,7 +10,7 @@ 每次 110 / 120 / 121 / 188 任一台主機開機、關機、重啟、斷電恢復、VMware console fsck、Docker / K3s 大量重排後,都先跑本頁,再決定是否宣稱恢復。 -最新基準:2026-06-27 00:02 single-summary replay / route + AWOOOI API warmup classifier。`scripts/reboot-recovery/post-reboot-readiness-summary.sh --no-color` 回傳 `POST_START_RESULT=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`、`POST_START_PASS=38`、`POST_START_WARN=4`、`POST_START_BLOCKED=0`、`SERVICE_GREEN=1`、`PRODUCT_DATA_GREEN=1`、`STOCK_FRESHNESS_STATUS=ok`、`STOCK_LATEST_TRADING_DATE=2026-06-26`、`STOCK_BLOCKERS=none`、`BACKUP_CORE_GREEN=1`、`DR_ESCROW_BLOCKED=1`、`ESCROW_MISSING_COUNT=5`、`HOST_188_HYGIENE_BLOCKED=0`、`HOST_188_RESULT=HOST_188_HYGIENE_GREEN.`、`WAZUH_MANAGER_REGISTRY_ACCEPTED=0`、`WAZUH_COVERAGE_SCOPE=6`、`WAZUH_DIRECT_ACTIVE=2`、`WAZUH_NO_TRANSPORT=1`、`WAZUH_SSH_BLOCKED=3`、`WAZUH_ROUTE_CODE=200`、`WAZUH_TRANSPORT_COUNT=6`、`WAZUH_DASHBOARD_API_CONNECTION=pending_or_spinning`、`WAZUH_DASHBOARD_INDEX_OK=3`、`RUNTIME_ACTION_AUTHORIZED=0`、`OVERALL_DECLARATION=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`,並自動把同一份 key/value 寫到 `$ARTIFACT_DIR/summary.txt`。Production route smoke 同輪確認 IwoooS、Wazuh read-only routes、VibeWork、AwoooGo、MOMO health、Stock 均為 `200`;AWOOOI API health 整體 `healthy`,local Ollama cooldown 由 GCP provider fallback 承接,不是網站或 API service blocker。同一輪後續 
`post-reboot-declaration-guard.py`、`post-reboot-next-gate-dispatch.sh`、`post-reboot-next-gate-owner-packets.py`、`post-reboot-owner-packet-contract-guard.py`、`post-reboot-owner-response-preflight.py` 必須使用這份 `summary.txt` 或由它產生的 dispatch / packet,不得混用多次 live probe 的不同時間點結果。`NEXT_REQUIRED_GATES=credential_escrow_evidence,wazuh_manager_registry_export` 仍是唯一目前 next gates;DR 仍因 `escrow_missing=5` 不可宣稱 complete;Wazuh manager registry accepted 仍是 `0`,不可把 route `200`、transport `6`、Dashboard index pattern `3` 或脫敏 registry 計數當成全主機納管完成。v1.17 維持 route/API warmup classifier:delegated cold-start 若只因 public route 單次 502 / TLS readback,或 K3s rollout 瞬間單次 `BLOCKED AWOOOI API not reachable`,但 wrapper route retry 已確認 public AWOOOI API health 為 2xx,該 blocker 會降級為 evidence warning;public API 仍失敗、其他 non-route blocker 或 retry 後未恢復仍為 hard blocked。 +最新基準:2026-06-27 02:42 live revalidation / backup core recovery。`scripts/reboot-recovery/post-reboot-readiness-summary.sh --no-color` artifact `/tmp/awoooi-post-reboot-readiness-20260627-024151/summary.txt` 回傳 `POST_START_RESULT=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`、`POST_START_PASS=38`、`POST_START_WARN=3`、`POST_START_BLOCKED=0`、`SERVICE_GREEN=1`、`PRODUCT_DATA_GREEN=1`、`STOCK_FRESHNESS_STATUS=ok`、`STOCK_LATEST_TRADING_DATE=2026-06-26`、`STOCK_BLOCKERS=none`、`BACKUP_CORE_GREEN=1`、`DR_ESCROW_BLOCKED=1`、`ESCROW_MISSING_COUNT=5`、`HOST_188_HYGIENE_BLOCKED=0`、`HOST_188_RESULT=HOST_188_HYGIENE_GREEN.`、`WAZUH_MANAGER_REGISTRY_ACCEPTED=0`、`WAZUH_COVERAGE_SCOPE=6`、`WAZUH_DIRECT_ACTIVE=2`、`WAZUH_NO_TRANSPORT=1`、`WAZUH_SSH_BLOCKED=3`、`WAZUH_ROUTE_CODE=200`、`WAZUH_TRANSPORT_COUNT=6`、`WAZUH_DASHBOARD_API_CONNECTION=pending_or_spinning`、`WAZUH_DASHBOARD_INDEX_OK=3`、`RUNTIME_ACTION_AUTHORIZED=0`、`OVERALL_DECLARATION=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`。本輪實際修復 188 `momo_pg_daily` backup configured drift:先前 00:16 summary 因 `configured_missing_188=1` 暫時 blocked;00:19 已備份 188 crontab 到 `/home/ollama/momo_backups/crontab-before-momo-pg-host-owned-20260627-001925.txt`,並把 
MOMO PostgreSQL daily backup 收斂到 host-owned `/home/ollama/bin/momo-pg-backup.sh`;刷新 exporter 後 `configured_missing_188=0`,00:56 `backup-status` 回 `core_blockers=0`。02:41 DR checklist 回 `CORE_COLD_START_GREEN=1`、`RECOVERY_STATE=CORE_READY_DR_OFFSITE_PENDING`;Prometheus recovery contract 回 `awoooi_recovery_core_ready=1`、`awoooi_recovery_dr_offsite_ready=0`。同一輪後續 `post-reboot-declaration-guard.py`、`post-reboot-next-gate-dispatch.sh`、`post-reboot-next-gate-owner-packets.py`、`post-reboot-owner-packet-contract-guard.py`、`post-reboot-owner-response-preflight.py` 必須使用這份 `summary.txt` 或由它產生的 dispatch / packet,不得混用多次 live probe 的不同時間點結果。`NEXT_REQUIRED_GATES=credential_escrow_evidence,wazuh_manager_registry_export` 仍是唯一目前 next gates;DR 仍因 `escrow_missing=5` 不可宣稱 complete;Wazuh manager registry accepted 仍是 `0`,不可把 route `200`、transport `6`、Dashboard index pattern `3` 或脫敏 registry 計數當成全主機納管完成。v1.18 維持 route/API warmup classifier:delegated cold-start 若只因 public route 單次 502 / TLS readback,或 K3s rollout 瞬間單次 `BLOCKED AWOOOI API not reachable`,但 wrapper route retry 已確認 public AWOOOI API health 為 2xx,該 blocker 會降級為 evidence warning;public API 仍失敗、其他 non-route blocker 或 retry 後未恢復仍為 hard blocked。 本頁只回答四件事: diff --git a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md index c5ba6fe5..22e830f1 100644 --- a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md +++ b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md @@ -11,11 +11,11 @@ | Area | Status | Completion | Evidence | |------|--------|------------|----------| -| Overall recovery readiness | FULL_STACK_GREEN_DR_ESCROW_BLOCKED | 99% | 2026-06-27 00:02 即時摘要覆蓋 2026-06-26 23:56 判讀。`post-reboot-readiness-summary.sh --no-color` 回傳 
`SERVICE_GREEN=1`、`PRODUCT_DATA_GREEN=1`、`POST_START_RESULT=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`、`POST_START_WARN=4`、`STOCK_FRESHNESS_STATUS=ok`、`STOCK_LATEST_TRADING_DATE=2026-06-26`、`STOCK_BLOCKERS=none`、`BACKUP_CORE_GREEN=1`、`ESCROW_MISSING_COUNT=5`、`WAZUH_MANAGER_REGISTRY_ACCEPTED=0`。Production route smoke:IwoooS / Wazuh read-only routes / VibeWork / AwoooGo / MOMO health / Stock 均 `200`;AWOOOI API health `healthy / prod / mock_mode=false`,local Ollama cooldown 由 GCP provider fallback 承接,不是網站或 API blocker。主機 / K3s / public routes / AWOOOI / MOMO / Stock / backup core / 188 hygiene 已恢復。DR 仍因 credential escrow 缺 5 不能宣稱 complete;Wazuh registry 已有脫敏 manager readback,但尚未 Dashboard API / owner acceptance。 | +| Overall recovery readiness | FULL_STACK_GREEN_DR_ESCROW_BLOCKED | 99% | 2026-06-27 02:42 live revalidation 覆蓋 00:16 暫時 blocked 判讀。`post-reboot-readiness-summary.sh --no-color` artifact `/tmp/awoooi-post-reboot-readiness-20260627-024151/summary.txt` 回傳 `SERVICE_GREEN=1`、`PRODUCT_DATA_GREEN=1`、`POST_START_RESULT=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`、`POST_START_WARN=3`、`POST_START_BLOCKED=0`、`STOCK_FRESHNESS_STATUS=ok`、`STOCK_LATEST_TRADING_DATE=2026-06-26`、`STOCK_BLOCKERS=none`、`BACKUP_CORE_GREEN=1`、`ESCROW_MISSING_COUNT=5`、`WAZUH_MANAGER_REGISTRY_ACCEPTED=0`。00:16 的 blocker 是 188 `momo_pg_daily` configured drift:備份 fresh,但 exporter 因 crontab 仍指 app-side path 判 `configured_missing_188=1`;00:19 已備份 188 crontab 到 `/home/ollama/momo_backups/crontab-before-momo-pg-host-owned-20260627-001925.txt` 並收斂到 host-owned `/home/ollama/bin/momo-pg-backup.sh`,刷新 exporter 後 `awoooi_backup_job_configured{host="188",job="momo_pg_daily"} 1`,00:56 `backup-status` 回 `core_blockers=0`。02:41 DR checklist 回 `CORE_COLD_START_GREEN=1`、`RECOVERY_STATE=CORE_READY_DR_OFFSITE_PENDING`;Prometheus live contract 回 `awoooi_recovery_core_ready=1`、`awoooi_recovery_dr_offsite_ready=0`。主機 / K3s / public routes / AWOOOI / MOMO / Stock / backup core / 188 hygiene 已恢復。DR 仍因 credential escrow 缺 5 不能宣稱 
complete;Wazuh registry 已有脫敏 manager readback,但尚未 Dashboard API / owner acceptance。 | | P0 host / K3s recovery | DONE | 100% | 120 booted after console fsck at `2026-06-12 15:13`; latest 2026-06-26 07:19 readback shows 120 and 121 reachable, K3s active, `mon` and `mon1` both `Ready control-plane`, AWOOOI API/Web replicas split across both nodes, ArgoCD `awoooi-prod Synced / Healthy` at revision `1fd5e2a8b0f18d24eed16aa2a44286bcbf230603`, and `km-vectorize` official 03:00 台北時間 run succeeded with `lastSuccess=2026-06-25T19:00:14Z`. | -| P1 backup / alert / escrow | BLOCKED_DR_ESCROW | 97% | 2026-06-26 06:58 backup readback shows 110 `13/13 fresh failed=0`, 188 `2/2 fresh failed=0`, `core_blockers=0`, `integrity_stale=0`, `offsite_fresh=1`, `rclone_gdrive_fresh=1`, `escrow_missing=5`, last aggregate `2026-06-26 02:31:02`。DR remains blocked on real non-secret credential escrow evidence IDs; do not write placeholder markers or paste secret values. | +| P1 backup / alert / escrow | BLOCKED_DR_ESCROW | 98% | 2026-06-27 00:56 backup readback shows 110 `13/13 fresh failed=0`, 188 `2/2 fresh failed=0`, `core_blockers=0`, `integrity_stale=0`, `offsite_fresh=1`, `rclone_gdrive_fresh=1`, `configured_missing_188=0`, `escrow_missing=5`, last aggregate `2026-06-26 02:31:02`。188 MOMO backup crontab drift 已修復並保留 rollback crontab。DR remains blocked on real non-secret credential escrow evidence IDs; do not write placeholder markers or paste secret values. 
| | P2 service / data truth | DONE | 100% | Public routes 與 service health 為綠燈,MOMO health `V10.719`,current-month parity 為 `15383|15383|2026-06-01|2026-06-24|2026-06-01|2026-06-24`。StockPlatform `/api/v1/system/freshness` 為 `ok`,latest trading date `2026-06-26`,blockers `none`;先前 Stock EOD blocker 已由官方來源與正式 cron 自然收斂。 | -| P3 docs / automation contracts | DONE_WITH_API_WARMUP_CLASSIFIER_V176 | 100% | Workplan, SOP v1.76, post-reboot declaration guard, machine-readable post-reboot readiness summary with Wazuh registry detail fields and auto-persisted `summary.txt`, post-reboot next-gate dispatch checklist, owner-packet JSON generator, dynamic owner-packet contract guard, post-reboot owner response preflight, owner response placeholder template, one-page post-start quick check v1.16, route retry gate, delegated cold-start public-route / AWOOOI API warmup classifier, deploy warmup classification, expanded public route list, StockPlatform freshness gate, StockPlatform cron-source recovery evidence, StockPlatform natural schedule green evidence, 110 orphan Chrome recurrence cleanup evidence, 188 fail-closed startup data recovery gate, 188 host hygiene read-only checklist, 188 PostgreSQL runtime-ready source-of-truth, 188 ACME route/timer hygiene, baseline `stockplatform_system_freshness_ok`, BACKUP-STATUS, LOGBOOK, 120 console/fsck recovery, Gitea backup stale-dump hardening, reboot ledger/version-comparison SOP, escrow evidence audit, 188 nginx Ansible baseline, 110 cold-start detector script, startup judgment layers, GO/NO-GO tree, host recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline checks, host role / load-balancing assessment, CD `known_hosts` guardrail, `fwupd-refresh.timer` rollback note, K3s filesystem event blocker, AWOOOI backup no-direct-offsite-sync contract, 110/188 Ansible source-of-truth, Gitea self-hosted readiness validation 
workflow, post-CD no-regression readbacks, stale-vs-active K8s failed Job classification, 110 runaway browser / CI load AIOps exporter + alert + gated remediation PlayBook, Telegram / AI event packet mapping, healthy heartbeat suppression, MOMO scheduler / current-month detector fix, exporter restore helpers, 110 Docker disk pressure cleanup boundary, notification-noise readback, MOMO import-boundary / Drive-auth fail-closed deploys, product version/readback matrix, and stricter product-data / route retry gates are updated. Declaration guard now machine-checks allowed / forbidden recovery statements from the same `summary.txt`: service/data/backup/188 host hygiene green may be declared when live summary says so, while `DR_COMPLETE`、`WAZUH_REGISTRY_RECOVERED` and `RUNTIME_ACTION_AUTHORIZED` remain forbidden until evidence gates close. | +| P3 docs / automation contracts | DONE_WITH_BACKUP_CORE_RECOVERY_V178 | 100% | Workplan, SOP v1.78, post-reboot declaration guard, machine-readable post-reboot readiness summary with Wazuh registry detail fields and auto-persisted `summary.txt`, post-reboot next-gate dispatch checklist, owner-packet JSON generator, dynamic owner-packet contract guard, post-reboot owner response preflight, owner response placeholder template, one-page post-start quick check v1.18, route retry gate, delegated cold-start public-route / AWOOOI API warmup classifier, backup-status core-blocker readback, PyYAML-optional recovery-scorecard contract check, 188 MOMO backup crontab host-owned rollback evidence, deploy warmup classification, expanded public route list, StockPlatform freshness gate, StockPlatform cron-source recovery evidence, StockPlatform natural schedule green evidence, 110 orphan Chrome recurrence cleanup evidence, 188 fail-closed startup data recovery gate, 188 host hygiene read-only checklist, 188 PostgreSQL runtime-ready source-of-truth, 188 ACME route/timer hygiene, baseline `stockplatform_system_freshness_ok`, BACKUP-STATUS, LOGBOOK, 
120 console/fsck recovery, Gitea backup stale-dump hardening, reboot ledger/version-comparison SOP, escrow evidence audit, 188 nginx Ansible baseline, 110 cold-start detector script, startup judgment layers, GO/NO-GO tree, host recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline checks, host role / load-balancing assessment, CD `known_hosts` guardrail, `fwupd-refresh.timer` rollback note, K3s filesystem event blocker, AWOOOI backup no-direct-offsite-sync contract, 110/188 Ansible source-of-truth, Gitea self-hosted readiness validation workflow, post-CD no-regression readbacks, stale-vs-active K8s failed Job classification, 110 runaway browser / CI load AIOps exporter + alert + gated remediation PlayBook, Telegram / AI event packet mapping, healthy heartbeat suppression, MOMO scheduler / current-month detector fix, exporter restore helpers, 110 Docker disk pressure cleanup boundary, notification-noise readback, MOMO import-boundary / Drive-auth fail-closed deploys, product version/readback matrix, and stricter product-data / route retry gates are updated. Declaration guard now machine-checks allowed / forbidden recovery statements from the same `summary.txt`: service/data/backup/188 host hygiene green may be declared when live summary says so, while `DR_COMPLETE`、`WAZUH_REGISTRY_RECOVERED` and `RUNTIME_ACTION_AUTHORIZED` remain forbidden until evidence gates close. 
| 2026-06-26 12:13 machine-readable summary baseline supersedes the 07:47 / 08:59 gate set: `scripts/reboot-recovery/post-reboot-readiness-summary.sh --no-color` stores delegated logs under `/tmp/awoooi-post-reboot-readiness-20260626-121303` and returns `SERVICE_GREEN=1`, `PRODUCT_DATA_GREEN=1`, `BACKUP_CORE_GREEN=1`, `DR_ESCROW_BLOCKED=1`, `ESCROW_MISSING_COUNT=5`, `HOST_188_SERVICE_GREEN=1`, `HOST_188_HYGIENE_BLOCKED=0`, `HOST_188_CHECK_RC=0`, `HOST_188_RESULT=HOST_188_HYGIENE_GREEN.`, `WAZUH_ROUTE_CODE=200`, `WAZUH_TRANSPORT_COUNT=6`, `WAZUH_COVERAGE_SCOPE=6`, `WAZUH_DIRECT_ACTIVE=2`, `WAZUH_NO_TRANSPORT=1`, `WAZUH_SSH_BLOCKED=3`, `WAZUH_DASHBOARD_API_CONNECTION=pending_or_spinning`, `WAZUH_DASHBOARD_INDEX_OK=3`, `WAZUH_MANAGER_REGISTRY_ACCEPTED=0`, `WAZUH_RUNTIME_GATE=0`, `RUNTIME_ACTION_AUTHORIZED=0`, `OVERALL_DECLARATION=FULL_STACK_GREEN_DR_ESCROW_BLOCKED`, and `NEXT_REQUIRED_GATES=credential_escrow_evidence,wazuh_manager_registry_export`. This is now the preferred first operator/AI-agent entrypoint after reboot because it separates service health from DR and security registry evidence; 188 host hygiene is no longer a next gate unless the live checklist regresses. 
diff --git a/infra/ansible/playbooks/188-momo-backup-user.yml b/infra/ansible/playbooks/188-momo-backup-user.yml index 6e5713b0..fcf1a188 100644 --- a/infra/ansible/playbooks/188-momo-backup-user.yml +++ b/infra/ansible/playbooks/188-momo-backup-user.yml @@ -11,14 +11,14 @@ vars: momo_backup_script_source: "{{ playbook_dir }}/../../../scripts/backup/backup-momo-188-pg.sh" momo_notify_helper_source: "{{ playbook_dir }}/../../../scripts/ops/notify-awoooi-ops.sh" - momo_scripts_dir: /home/ollama/momo-pro/scripts - momo_backup_script_path: /home/ollama/momo-pro/scripts/pg_backup.sh - momo_notify_helper_path: /home/ollama/momo-pro/scripts/notify-awoooi-ops.sh + momo_scripts_dir: /home/ollama/bin + momo_backup_script_path: /home/ollama/bin/momo-pg-backup.sh + momo_notify_helper_path: /home/ollama/bin/notify-awoooi-ops.sh momo_backup_dir: /home/ollama/momo_backups momo_backup_cron_name: AWOOOI momo PostgreSQL daily backup momo_backup_cron_job: >- PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin - /home/ollama/momo-pro/scripts/pg_backup.sh + /home/ollama/bin/momo-pg-backup.sh >> /home/ollama/momo_backups/backup.log 2>&1 momo_legacy_bin_cron_line: "0 2 * * * /home/ollama/bin/momo-pg-backup.sh >> /home/ollama/momo_backups/backup.log 2>&1" momo_legacy_direct_cron_line: "0 2 * * * /home/ollama/momo-pro/scripts/pg_backup.sh >> /home/ollama/momo_backups/backup.log 2>&1" diff --git a/scripts/ops/recovery-scorecard-contract-check.py b/scripts/ops/recovery-scorecard-contract-check.py index e4f59f3c..5be1f88c 100755 --- a/scripts/ops/recovery-scorecard-contract-check.py +++ b/scripts/ops/recovery-scorecard-contract-check.py @@ -5,13 +5,20 @@ from __future__ import annotations import argparse import json +import re import sys import urllib.parse import urllib.request from pathlib import Path from typing import Any -import yaml +try: + import yaml +except ModuleNotFoundError: # pragma: no cover - exercised on lean operator hosts + yaml = None + YAML_ERROR_TYPES: 
tuple[type[BaseException], ...] = () +else: + YAML_ERROR_TYPES = (yaml.YAMLError,) DEFAULT_RULES = Path("ops/monitoring/alerts-unified.yml") @@ -24,7 +31,99 @@ class ContractError(RuntimeError): pass +RECOVERABLE_ERRORS = (ContractError, OSError, json.JSONDecodeError) + YAML_ERROR_TYPES +_RECORD_RE = re.compile(r"^(?P<indent>\s*)-\s+record:\s*(?P<record>.+?)\s*$") +_RULE_START_RE = re.compile(r"^(?P<indent>\s*)-\s+(?:record|alert):\s*.+$") +_EXPR_RE = re.compile(r"^(?P<indent>\s*)expr:\s*(?P<tail>.*)$") +_PROM_RULES_RE = re.compile(r"^(?P<indent>\s*)prometheus_recording_rules:\s*$") +_LIST_ITEM_RE = re.compile(r"^(?P<indent>\s*)-\s+(?P<value>.+?)\s*$") + + +def _strip_yaml_scalar(value: str) -> str: + return value.strip().strip('"').strip("'") + + +def _indent_width(line: str) -> int: + return len(line) - len(line.lstrip(" ")) + + +def _fallback_rules(path: Path) -> list[dict[str, Any]]: + lines = path.read_text(encoding="utf-8").splitlines() + rules: list[dict[str, Any]] = [] + index = 0 + while index < len(lines): + record_match = _RECORD_RE.match(lines[index]) + if not record_match: + index += 1 + continue + + record_indent = len(record_match.group("indent")) + rule: dict[str, Any] = {"record": _strip_yaml_scalar(record_match.group("record"))} + index += 1 + + while index < len(lines): + next_rule = _RULE_START_RE.match(lines[index]) + if next_rule and len(next_rule.group("indent")) <= record_indent: + break + + expr_match = _EXPR_RE.match(lines[index]) + if not expr_match: + index += 1 + continue + + expr_indent = len(expr_match.group("indent")) + tail = expr_match.group("tail").strip() + if tail not in {"|", "|-", "|+"}: + rule["expr"] = _strip_yaml_scalar(tail) + index += 1 + continue + + block: list[str] = [] + index += 1 + while index < len(lines): + block_next_rule = _RULE_START_RE.match(lines[index]) + if block_next_rule and len(block_next_rule.group("indent")) <= record_indent: + break + if lines[index].strip() and _indent_width(lines[index]) <= expr_indent: + break + block.append(lines[index]) + index += 1 + 
rule["expr"] = "\n".join(block) + + rules.append(rule) + + if not rules: + raise ContractError(f"missing recording rules in {path}") + return rules + + +def _fallback_expected_recording_rules(path: Path) -> list[str]: + lines = path.read_text(encoding="utf-8").splitlines() + for index, line in enumerate(lines): + key_match = _PROM_RULES_RE.match(line) + if not key_match: + continue + + key_indent = len(key_match.group("indent")) + rules: list[str] = [] + for child in lines[index + 1 :]: + if not child.strip(): + continue + child_indent = _indent_width(child) + if child_indent <= key_indent: + break + item_match = _LIST_ITEM_RE.match(child) + if item_match and len(item_match.group("indent")) > key_indent: + rules.append(_strip_yaml_scalar(item_match.group("value"))) + if rules: + return rules + + raise ContractError(f"missing monitoring_contract.prometheus_recording_rules in {path}") + + def _rules(path: Path) -> list[dict[str, Any]]: + if yaml is None: + return _fallback_rules(path) data = yaml.safe_load(path.read_text(encoding="utf-8")) or {} rules: list[dict[str, Any]] = [] for group in data.get("groups") or []: @@ -33,6 +132,8 @@ def _rules(path: Path) -> list[dict[str, Any]]: def _expected_recording_rules(path: Path) -> list[str]: + if yaml is None: + return _fallback_expected_recording_rules(path) data = yaml.safe_load(path.read_text(encoding="utf-8")) or {} rules = data.get("monitoring_contract", {}).get("prometheus_recording_rules") or [] if not rules: @@ -136,7 +237,7 @@ def main() -> int: args.expect_dr_ready, ): print(line) - except (ContractError, OSError, yaml.YAMLError, json.JSONDecodeError) as exc: + except RECOVERABLE_ERRORS as exc: print(f"RECOVERY_SCORECARD_CONTRACT_FAILED {exc}", file=sys.stderr) return 1