From e9db7741dbcdd983e8cd80033529f7447d2ef265 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 30 Jun 2026 23:35:45 +0800 Subject: [PATCH] fix(recovery): require guarded 110 drain startup in slo --- docs/LOGBOOK.md | 4 +++- ...oi-reboot-auto-recovery-slo-scorecard.snapshot.json | 1 + .../reboot-auto-recovery-slo-scorecard.py | 10 ++++++++++ .../tests/test_reboot_auto_recovery_slo_scorecard.py | 3 +++ 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index c4547b02..bc12b62e 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -4,12 +4,14 @@ - 23:27 live queue 已證明 Harbor repair `#4115 Waiting` 卡在 `awoooi-host` no matching runner;`awoooi-startup-110.sh` 雖已有 controlled drain lane guardrails,但預設 `AWOOOI_START_CONTROLLED_CD_LANE=0`,重啟後即使 config / binary / labels / root restore-source 全部合格,也不會自動恢復 `awoooi-host` repair lane。 - 將 startup 預設改成 guarded auto-start:`START_CONTROLLED_CD_LANE="${AWOOOI_START_CONTROLLED_CD_LANE:-1}"`。legacy host / generic runner 仍維持 fail-closed;drain lane 只有在 `capacity=1`、labels 僅 `awoooi-ubuntu` / `awoooi-host`、無泛用重型 label、binary / config 可用、root restore-source left `0` 時才會 `enable --now awoooi-cd-lane-drain.service`。 - 仍保留明確關閉開關:`AWOOOI_START_CONTROLLED_CD_LANE=0` 可讓 startup 不拉起 drain lane;不得用此變更恢復 legacy runner、generic label、`ubuntu-latest`、StockPlatform/headless/Playwright 重型 label。 +- `reboot-auto-recovery-slo-scorecard.py` 新增 source contract:`host_110_startup_controlled_drain_guarded_autostart_source_present`。未來若 startup 又退回只存在腳本、不自動 guard-on 啟動 controlled drain lane,10 分鐘自動恢復 SLO 必須 fail-closed。 **驗證**: - 更新 `scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py`,斷言 controlled drain lane 預設 guard-on,且關閉/guard fail 會保持 closed。 +- 更新 `scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py` 與 committed scorecard snapshot,讓 API readback 的 source controls 覆蓋 guarded auto-start。 - 更新 `ops/runner/README.md`,把「startup 不自動重開 runner」改成「legacy 不啟動;受控 drain lane guard 後自動拉起」。 -**邊界**:只改 110 startup source / test / runner README / LOGBOOK;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未使用 GitHub / `gh` / GitHub API;未 workflow_dispatch;未 SSH 寫入、未重啟主機、未 restart Docker / Nginx / K3s / DB / firewall。 +**邊界**:只改 110 startup source / SLO scorecard / tests / runner README / LOGBOOK / snapshot;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未使用 GitHub / `gh` / GitHub API;未 workflow_dispatch;未 SSH 寫入、未重啟主機、未 restart Docker / Nginx / K3s / DB / firewall。 ## 2026-06-30 — 23:27 Post-push cold-start / Harbor / Stock readback diff --git a/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json b/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json index 29fa884b..0f216abd 100644 --- a/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json +++ b/docs/operations/awoooi-reboot-auto-recovery-slo-scorecard.snapshot.json @@ -185,6 +185,7 @@ }, "source_controls": { "cold_start_textfile_exporter_source_present": true, + "host_110_startup_controlled_drain_guarded_autostart_source_present": true, "host_110_startup_unit_source_present": true, "host_188_startup_unit_source_present": true, "host_boot_probe_source_present": true, diff --git a/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py b/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py index 950664a2..56b1e578 100755 --- a/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py +++ b/scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py @@ -139,6 +139,16 @@ def source_controls() -> dict[str, bool]: "WantedBy=multi-user.target", ) and source_file("scripts/reboot-recovery/awoooi-startup-110.sh").exists(), + "host_110_startup_controlled_drain_guarded_autostart_source_present": ( + file_contains( + source_file("scripts/reboot-recovery/awoooi-startup-110.sh"), + 'START_CONTROLLED_CD_LANE="${AWOOOI_START_CONTROLLED_CD_LANE:-1}"', + "cd_lane_root_restore_sources_left()", + "install_controlled_cd_lane_drain_unit", + 'systemctl enable --now "$CD_LANE_DRAIN_SERVICE"', + "controlled cd-lane remains closed because guardrails failed or AWOOOI_START_CONTROLLED_CD_LANE=0", + ) + ), "host_188_startup_unit_source_present": file_contains( source_file("scripts/reboot-recovery/awoooi-startup.service"), "ExecStart=/usr/local/bin/awoooi-startup.sh", diff --git a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py index ff491e75..2b2a4aea 100644 --- a/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py +++ b/scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py @@ -134,6 +134,9 @@ def test_green_summary_and_recent_all_host_probe_can_claim_slo(tmp_path: Path) - assert payload["schema_version"] == "awoooi_reboot_auto_recovery_slo_scorecard_v1" assert payload["status"] == "slo_ready" assert payload["can_claim_all_services_recovered_within_target"] is True + assert payload["source_controls"][ + "host_110_startup_controlled_drain_guarded_autostart_source_present" + ] is True assert payload["host_boot_detection"]["max_observed_uptime_seconds"] == 150 assert payload["active_blockers"] == []