From 3397f5a9aa6650e57b57f39e159f3a91bbefe797 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Tue, 30 Jun 2026 21:26:47 +0800
Subject: [PATCH] fix(recovery): open controlled drain lane after guardrails

---
Review notes (this region is dropped by `git am`; the commit message is unchanged):

What the patch does
* Adds cd_lane_root_restore_sources_left(), which prints the number of
  directories directly under /root whose names match
  awoooi-cd-lane-disabled-* or awoooi-cd-lane-drain-disabled-*.
* Sets START_CD_LANE_ALLOWED=1 only when START_CONTROLLED_CD_LANE is "1",
  cd_lane_drain_is_controlled_available succeeds, and that count is "0".
  Otherwise a refusal is logged and the flag is left untouched.
* ensure_host_runner_fail_closed skips guard_runner_binary_fail_closed for
  $CD_LANE_DRAIN_BINARY when START_CD_LANE_ALLOWED is "1"; every other
  binary in RUNNER_FAIL_CLOSED_BINARY_PATHS is still guarded.
* Replaces the unconditional "startup override active" log line with a
  branch: when allowed, install the drain unit, daemon-reload, unmask,
  enable --now, then call ensure_controlled_cd_lane_open; otherwise log
  that the lane remains closed.
* Adds a test asserting that the corresponding strings are present in the
  startup script text.

Points to confirm before applying
* Line structure was rebuilt from a whitespace-collapsed copy. Hunk line
  counts match the @@ headers, but shell indentation is inferred. If the
  context does not match the target file, try
  `git apply --ignore-whitespace`.
* The From: header carries no address in this copy. NOTE(review): restore
  the author address if `git am` rejects the patch.
* The initial assignment of START_CD_LANE_ALLOWED is not part of this diff.
  NOTE(review): confirm it is initialised before
  ensure_host_runner_fail_closed and the final branch read it.
* cd_lane_root_restore_sources_left discards find's stderr, so a failing
  find yields a count of 0, which passes the guardrail.
  NOTE(review): confirm that is acceptable for a fail-closed check.
* The systemctl calls end in `|| true`; presumably
  ensure_controlled_cd_lane_open verifies the result. TODO confirm.

 scripts/reboot-recovery/awoooi-startup-110.sh | 32 ++++++++++++++++++-
 .../test_cold_start_monitor_bounded_probes.py | 17 ++++++++++
 2 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/scripts/reboot-recovery/awoooi-startup-110.sh b/scripts/reboot-recovery/awoooi-startup-110.sh
index bc6a9a78..01c97f1d 100644
--- a/scripts/reboot-recovery/awoooi-startup-110.sh
+++ b/scripts/reboot-recovery/awoooi-startup-110.sh
@@ -509,6 +509,13 @@ cd_lane_drain_is_controlled_available() {
     return 0
 }
 
+cd_lane_root_restore_sources_left() {
+    find /root -maxdepth 1 -type d \( \
+        -name 'awoooi-cd-lane-disabled-*' -o \
+        -name 'awoooi-cd-lane-drain-disabled-*' \
+    \) -print 2>/dev/null | wc -l | tr -d " "
+}
+
 quarantine_cd_lane_registration_fail_closed() {
     local quarantine_dir
     local lane_dir
@@ -625,10 +632,25 @@ ensure_host_runner_fail_closed() {
     pkill -KILL -f "^${RUNNER_DIR}/act_runner(\\.real-[^ ]*)? daemon" >/dev/null 2>&1 || true
     quarantine_cd_lane_root_restore_sources_fail_closed
     for binary in "${RUNNER_FAIL_CLOSED_BINARY_PATHS[@]}"; do
+        if [ "$START_CD_LANE_ALLOWED" = "1" ] && [ "$binary" = "$CD_LANE_DRAIN_BINARY" ]; then
+            continue
+        fi
         guard_runner_binary_fail_closed "$binary"
     done
 }
 
+CD_LANE_ROOT_RESTORE_LEFT="$(cd_lane_root_restore_sources_left)"
+if [ "$START_CONTROLLED_CD_LANE" = "1" ]; then
+    if ! cd_lane_drain_is_controlled_available; then
+        log "⛔ AWOOOI_START_CONTROLLED_CD_LANE=1 但 controlled drain lane config/binary 未通過;維持 fail-closed"
+    elif [ "$CD_LANE_ROOT_RESTORE_LEFT" != "0" ]; then
+        log "⛔ AWOOOI_START_CONTROLLED_CD_LANE=1 但 root restore-source left=${CD_LANE_ROOT_RESTORE_LEFT};維持 fail-closed"
+    else
+        START_CD_LANE_ALLOWED=1
+        log "✅ controlled cd-lane drain preflight passed; legacy runner 仍維持 fail-closed"
+    fi
+fi
+
 if [ -x "$RUNNER_DIR/act_runner" ] && [ -f "$RUNNER_DIR/config.yaml" ]; then
     # 若舊的 .runner 配置指向過期 hostname,只有在明確允許啟動 runner
     # 時才清除重新註冊;預設降壓模式不得碰 registration 狀態。
@@ -729,7 +751,15 @@ else
     log "⚠️ 找不到 act-runner binary/config: $RUNNER_DIR"
 fi
 
-log "✅ controlled cd-lane startup override active; startup will not enforce drain fail-closed"
+if [ "$START_CD_LANE_ALLOWED" = "1" ]; then
+    install_controlled_cd_lane_drain_unit
+    systemctl daemon-reload >/dev/null 2>&1 || true
+    systemctl unmask "$CD_LANE_DRAIN_SERVICE" >/dev/null 2>&1 || true
+    systemctl enable --now "$CD_LANE_DRAIN_SERVICE" >/dev/null 2>&1 || true
+    ensure_controlled_cd_lane_open
+else
+    log "✅ controlled cd-lane remains closed unless AWOOOI_START_CONTROLLED_CD_LANE=1 passes guardrails"
+fi
 
 # ──────────────────────────────────────────────
 # STEP 7: Sentry(Error Tracking)
diff --git a/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py
index e09c2286..0c44b1eb 100644
--- a/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py
+++ b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py
@@ -76,6 +76,23 @@ def test_startup_110_quarantines_corrupt_docker_container_metadata() -> None:
     assert 'run_bounded "$DOCKER_START_TIMEOUT_SECONDS" systemctl start docker.socket docker.service' in text
 
 
+def test_startup_110_opens_only_controlled_cd_lane_after_guardrails() -> None:
+    text = STARTUP_110.read_text(encoding="utf-8")
+
+    assert 'START_CONTROLLED_CD_LANE="${AWOOOI_START_CONTROLLED_CD_LANE:-0}"' in text
+    assert "cd_lane_root_restore_sources_left()" in text
+    assert 'CD_LANE_ROOT_RESTORE_LEFT="$(cd_lane_root_restore_sources_left)"' in text
+    assert 'START_CD_LANE_ALLOWED=1' in text
+    assert 'install_controlled_cd_lane_drain_unit' in text
+    assert 'systemctl unmask "$CD_LANE_DRAIN_SERVICE"' in text
+    assert 'systemctl enable --now "$CD_LANE_DRAIN_SERVICE"' in text
+    assert 'ensure_controlled_cd_lane_open' in text
+    assert 'if [ "$START_CD_LANE_ALLOWED" = "1" ] && [ "$binary" = "$CD_LANE_DRAIN_BINARY" ]; then' in text
+    assert 'systemctl enable --now "$RUNNER_SERVICE"' in text
+    assert "legacy runner 仍維持 fail-closed" in text
+    assert "controlled cd-lane remains closed unless AWOOOI_START_CONTROLLED_CD_LANE=1 passes guardrails" in text
+
+
 def test_cold_start_deploy_parity_verifier_bounds_ssh_readback() -> None:
     text = VERIFY_DEPLOY.read_text(encoding="utf-8")
 