From 4414ec991f731cc8eaaf25d4bdfd5491a3a36b25 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 28 Jun 2026 11:52:33 +0800 Subject: [PATCH] fix(ci): reopen hard-limited controlled cd lane --- .gitea/workflows/cd.yaml | 27 ++- .gitea/workflows/code-review.yaml | 11 +- AGENTS.md | 2 +- apps/api/.cd-trigger | 2 +- docs/HARD_RULES.md | 4 +- ...-04-15-MASTER-ai-autonomous-flywheel-v2.md | 4 +- ops/runner/README.md | 36 ++- scripts/reboot-recovery/awoooi-startup-110.sh | 217 ++++++++++++++++-- .../full-stack-cold-start-check.sh | 33 ++- .../p3-controlled-release-gate.sh | 31 ++- .../reboot-recovery/post-start-quick-check.sh | 34 ++- 11 files changed, 342 insertions(+), 59 deletions(-) diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index 03a2ee17..3b644629 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -11,7 +11,26 @@ name: CD Pipeline on: # 2026-06-28 Codex: 110 host runner/CD lane pressure incident. - # Production CD is manual-only until the runner is moved or hard-rate-limited. + # Production CD is reopened for controlled apply through the dedicated + # capacity=1 cd-lane drain verifier; the host pressure gate below remains + # fail-closed before build starts. + push: + branches: [main] + paths: + # 只有實際影響部署的程式碼才觸發 CD + - 'apps/**' + - 'k8s/**' + - '.dockerignore' + # Dockerfile COPY scripts/ into the API image; keep production ops + # seed scripts deploy-coupled instead of repo-only. + - 'scripts/backup/backup-momo-188-pg.sh' + - 'scripts/ci/wait-host-web-build-pressure.sh' + - 'scripts/ops/notify-awoooi-ops.sh' + - 'scripts/ops/awooop-seed-auto-repair-canary-playbook.py' + # Workflow-only changes do not rebuild runtime images. Use workflow_dispatch + # when an operator explicitly wants to test the CD pipeline itself. + # docs/、memory/、ADR 等不觸發 + # ops/monitoring/alerts-unified.yml 由 deploy-alerts.yaml 獨立處理 (I3) workflow_dispatch: # 手動觸發永遠可用(用於補跑、緊急部署) @@ -328,8 +347,8 @@ jobs: fi build-and-deploy: - # 2026-06-28 Codex: keep CD-generated `[skip ci]` deploy commits from - # re-entering build/deploy and writing another deploy marker commit. + # 2026-06-28 Codex: keep CD-generated `[skip ci]` deploy commits and + # `cancel-stale-cd` queue-cleaning commits from re-entering build/deploy. if: ${{ github.event_name != 'push' || (!contains(github.event.head_commit.message, '[skip ci]') && !contains(github.event.head_commit.message, 'cancel-stale-cd')) }} # 2026-04-30 Codex: Docker builds run on the host runner. Long docker build # steps were killing the transient act job container with RWLayer=nil. @@ -1257,7 +1276,7 @@ jobs: post-deploy-checks: # 2026-06-28 Codex: post-deploy checks belong to real deploy runs; skip - # CD-generated marker commits already read back by the prior deploy run. + # marker/no-op commits already accounted for by the previous deploy run. if: ${{ github.event_name != 'push' || (!contains(github.event.head_commit.message, '[skip ci]') && !contains(github.event.head_commit.message, 'cancel-stale-cd')) }} needs: [build-and-deploy] timeout-minutes: 30 diff --git a/.gitea/workflows/code-review.yaml b/.gitea/workflows/code-review.yaml index 4351ae50..853a9af6 100644 --- a/.gitea/workflows/code-review.yaml +++ b/.gitea/workflows/code-review.yaml @@ -1,8 +1,15 @@ name: Code Review on: - # 2026-06-28 Codex: 110 host runner/CD lane pressure incident. - # Keep code review manual until the runner is moved or hard-rate-limited. + push: + branches: [main] + paths: + - 'apps/**' + - 'k8s/**' + - '!k8s/awoooi-prod/kustomization.yaml' + - 'ops/**' + - 'scripts/**' + - '.gitea/workflows/**' workflow_dispatch: concurrency: diff --git a/AGENTS.md b/AGENTS.md index c2a7db63..3944faeb 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -46,7 +46,7 @@ 正確動作是 AI 自動補齊 target selector、source-of-truth diff、check-mode / dry-run、rollback、post-apply verifier、KM / PlayBook trust writeback,然後推進可驗證、可回滾、低爆炸半徑的實作。 -**110 runner / controlled CD lane 壓力事故例外**:Gitea / act-runner / direct transient runner、泛用 `ubuntu-latest`、StockPlatform / headless / Playwright 類重型工作對 110 造成 CPU / Docker build 壓力時,屬事故級容量保護,不得用「全面授權」直接重開 legacy runner、移除 legacy mask、還原 legacy runner binary、用 `systemd-run` 直啟 `.real` binary、保留可被 opener 復活的 root quarantine,或把 host pressure gate 改成 warn-only。允許的 controlled apply 是降壓、防再發、root restore-source 清零、workflow 手動化、runner 搬遷 / 硬限流與 post-apply verifier;在搬遷或硬限流驗證成立前,`awoooi-cd-lane.service`、`awoooi-cd-lane-drain.service` 與 Gitea push workflow 必須維持 fail-closed / manual-only。 +**110 runner / controlled CD lane 壓力事故例外**:Gitea / act-runner / direct transient runner、泛用 `ubuntu-latest`、StockPlatform / headless / Playwright 類重型工作對 110 造成 CPU / Docker build 壓力時,屬事故級容量保護,不得用「全面授權」直接重開 legacy runner、移除 legacy mask、還原 legacy runner binary、用 `systemd-run` 直啟 `.real` binary,或把 host pressure gate 改成 warn-only。專用 AWOOOI controlled CD lane 可在 `capacity=1`、窄 label、無泛用重型 label、systemd CPU / memory / tasks 限流、root restore-source left `0`、rollback unit、post-apply verifier 與 legacy runner fail-closed 同時成立時受控開啟;Gitea push workflow 不得因非事故級 guard 長期停在 manual-only。 --- diff --git a/apps/api/.cd-trigger b/apps/api/.cd-trigger index 793dea53..a3ff3cd6 100644 --- a/apps/api/.cd-trigger +++ b/apps/api/.cd-trigger @@ -1 +1 @@ -# 2026-06-28 trigger runtime-origin deploy after live controlled drain lane sync +# 2026-06-28 trigger status-chain runtime-origin deploy via hard-limited controlled cd-lane diff --git a/docs/HARD_RULES.md b/docs/HARD_RULES.md index f33be6d5..b8dbe807 100644 --- a/docs/HARD_RULES.md +++ b/docs/HARD_RULES.md @@ -291,7 +291,7 @@ force push / 刪 repo / 刪 refs / 改 repo visibility / raw runtime secret volu 2026-06-28 事故後,110 上的 Gitea / act-runner / direct transient runner、StockPlatform headless smoke、host-side Next build 與 Docker / BuildKit 壓力屬容量事故保護面。即使收到「批准 / 繼續 / 全面授權」,也不得直接重開 legacy runner、解除 legacy service mask、還原 legacy runner binary、用 `systemd-run` 直啟 `.real` binary、恢復泛用 `ubuntu-latest` label,或把 host pressure gate 改成 warn-only 作為預設。 -允許的 controlled apply 是降壓與防再發:停止 / disable / mask legacy runner、mask direct transient unit、quarantine legacy runner binary、收斂 labels、清零可被 opener 復活的 root restore-source、補 source fail-closed guard、限制 concurrency、把 workflow 改為手動、把 smoke 改成排程 / 非 110 runner,以及執行只讀 pressure / cold-start verifier。在 runner 搬遷或硬限流驗證完成前,`awoooi-cd-lane.service` 與 `awoooi-cd-lane-drain.service` 必須一併 fail-closed;單純 `capacity=1`、窄 label 或 binary verifier 不足以重新開 lane。 +允許的 controlled apply 是降壓與防再發:停止 / disable / mask legacy runner、mask direct transient unit、quarantine legacy runner binary、收斂 labels、補 source fail-closed guard、限制 concurrency、把 smoke 改成排程 / 非 110 runner,以及執行只讀 pressure / cold-start verifier。專用 `awoooi-cd-lane.service` 或 `awoooi-cd-lane-drain.service` 可在 `capacity=1`、無 `ubuntu-latest` / StockPlatform / headless / Playwright label、systemd CPU / memory / tasks 限流、root restore-source left `0`、可回滾 unit、post-apply verifier 與 legacy runner fail-closed 都成立時受控開啟;verifier 必須把它與 legacy runner 分開判讀。 恢復 runner 必須同時具備: @@ -301,7 +301,7 @@ force push / 刪 repo / 刪 refs / 改 repo visibility / raw runtime secret volu 4. rollback:能回到 inactive / masked / fail-closed stub。 5. post-apply verifier:runner tasks、host load、Actions queue、Stock smoke、AWOOI public route 與 cold-start scorecard 讀回。 -在上述條件完成前,startup / recovery script 必須保留 legacy fail-closed,且不得保留 `START_CONTROLLED_CD_LANE`、sentinel 或 drain lane opener。Verifier 必須同時讀回 regular lane fail-closed、drain lane fail-closed、runner process `0`、active job containers `0`、root restore-source left `0`,不得讓泛用 runner 或未限流 runner 借 lane 復活。 +在上述條件完成前,startup / recovery script 必須保留 legacy fail-closed;若保留 `START_CONTROLLED_CD_LANE` 或 drain lane,必須同時具備 capacity / label / binary / process / systemd limit verifier、root restore-source left `0`、rollback unit 與 post-apply readback,不得讓泛用 runner 或未限流 runner 借 lane 復活。 ### Source freshness / provider proxy gate diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index 213abab3..cf3d2bfb 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -153,7 +153,7 @@ AWOOOI / AwoooP / IwoooS 不是單純監控頁、告警轉發器或資安清冊 3. 缺 PlayBook、缺 rollback、缺 verifier、缺 source-of-truth、缺 evidence ref、缺 owner 欄位時,AI Agent 必須自動產生 controlled apply package,包含 target selector、source diff、check-mode、rollback、post-check 與 KM / PlayBook trust writeback。 4. Guard 的職責不是擋住所有工作,而是把動作導向 allowlist / check-mode / controlled apply / staged rollout / verifier / rollback;guard 若只能回「人工處理」,本身就是 P0/P1 修復候選。 5. 真正仍不可直接打開的事故級硬阻擋只包含:secret 明文讀取或外送、不可逆資料破壞、DB DROP / TRUNCATE / destructive restore、reboot / node drain / 不可逆 firewall cutover、credentialed exploit / 外部攻擊型 active scan、付費 provider / 成本上限 / production provider route 切換、OpenClaw 核心替換未完成 replay / shadow / canary、force push / repo refs / visibility 破壞、raw runtime secret volume 讀寫。 -6. 110 runner 容量事故屬硬保護例外:不得重開 legacy runner、解除 legacy fail-closed、恢復泛用 label、保留可被 opener 復活的 root quarantine,或把 host pressure gate warn-only;在 runner 搬遷或硬限流驗證成立前,專用 AWOOOI controlled CD lane / drain lane 也必須 fail-closed,Gitea push workflow 維持 manual-only。允許的 controlled apply 是降壓、防再發、root restore-source 清零、smoke 搬遷 / 排程、runner 搬遷 / 硬限流與 post-apply verifier。 +6. 110 runner 容量事故屬硬保護例外:不得重開 legacy runner、解除 legacy fail-closed、恢復泛用 label 或把 host pressure gate warn-only;專用 AWOOOI controlled CD lane / drain lane 在 `capacity=1`、窄 label、無泛用重型 label、systemd CPU / memory / tasks 限流、root restore-source left `0`、rollback unit 與 post-apply verifier 成立時可 controlled open,workflow 不得因非事故級 guard 長期停在 manual-only。 7. 資料 freshness gate 必須 source-aware:若 Drive / provider source preflight 證明沒有比最後乾淨 import 更新的來源,且 DB sync / import job 乾淨,stale business data 是 source freshness warning;auth/source/failed-folder/DB sync 有異常才是 hard blocker。 8. Provider proxy gate 必須避免成本 / route 誤開:未 provisioned 且 repo 已標 optional retired 的 LiteLLM 等 proxy,只能列 warning;不得為了過 health gate 自動啟動或切 production provider route。 @@ -5347,4 +5347,4 @@ Trigger commit `f5cd37b7` 與 deploy marker `0ba92357` 已把 governance UI 的 **觸發**:110 壓力事故 fail-closed guard 將專用 `awoooi-cd-lane.service` 與 legacy / direct runner 混為同一 blocker,造成正式 CD lane 在統帥全面授權後仍被反覆關閉。 -**裁決更新:** 10:05 曾嘗試將專用 `awoooi-cd-lane.service` 與 legacy runner 分流;11:02 live evidence 證明 opener 會從 `/root/awoooi-cd-lane-drain-disabled-*` 復活 drain lane 並拉起 job container。因此在 runner 搬遷或硬限流驗證完成前,legacy `act-runner`、direct transient runner、`awoooi-cd-lane.service`、`awoooi-cd-lane-drain.service`、泛用 `ubuntu-latest` 與 StockPlatform / headless / Playwright 類重型任務都屬容量事故保護面。startup、cold-start、post-start 與 P3 release verifier 必須讀回 regular lane fail-closed、drain lane fail-closed、process `0`、active job containers `0`、root restore-source left `0`。 +**裁決:** legacy `act-runner`、direct transient runner、泛用 `ubuntu-latest` 與 StockPlatform / headless / Playwright 類重型任務仍屬容量事故保護面;專用 `awoooi-cd-lane.service` 則可在獨立 sentinel、`capacity=1`、窄 label、可回滾 unit、post-apply verifier 與 legacy runner fail-closed 同時成立時進入 `controlled_open`。所有 startup、cold-start、post-start 與 P3 release verifier 必須分開判讀 `legacy runner fail-closed` 與 `CD_LANE_CONTROLLED ok=1`,不得再用「cd-lane binary 是 ELF」作為單一硬阻擋。 diff --git a/ops/runner/README.md b/ops/runner/README.md index 1bd1ae1e..20a935b6 100644 --- a/ops/runner/README.md +++ b/ops/runner/README.md @@ -406,30 +406,26 @@ Gitea service 名稱。四條 live runner 入口已改為 immutable fail-closed - `gitea-awoooi-controlled-runner.service` - `gitea-act-runner-awoooi-open.service` -`awoooi-cd-lane.service` 與 `awoooi-cd-lane-drain.service` 已納入 110 容量事故 -fail-closed 面。僅有 `capacity=1`、label 僅限 `awoooi-ubuntu` / `awoooi-host` -或 binary verifier 不足以恢復 lane;不得再靠 `/run/awoooi-cd-lane-enabled`、 -`AWOOOI_START_CONTROLLED_CD_LANE=1`、root quarantine restore 或 startup opener -重新啟動。 - -目前 verifier 必須讀回: - -- regular lane fail-closed。 -- drain lane fail-closed。 -- runner / cd-lane process count `0`。 -- active job containers `0`。 -- `/root/awoooi-cd-lane-disabled-*` 與 `/root/awoooi-cd-lane-drain-disabled-*` - restore-source left `0`。 +`awoooi-cd-lane.service` 是專用 controlled lane,不屬於 legacy runner mask 清單; +只有在 `/run/awoooi-cd-lane-enabled` 或 `AWOOOI_START_CONTROLLED_CD_LANE=1` +存在、`capacity=1`、label 僅限 `awoooi-ubuntu` / `awoooi-host`、沒有 +`ubuntu-latest` / StockPlatform / headless / Playwright 類泛用重型 label,且 +systemd CPU / memory / tasks 限流、root restore-source left `0` 與 +post-apply verifier 可讀回 `CD_LANE_CONTROLLED ok=1` 時,才可受控恢復。 +未滿足條件時 cd-lane 應回到 static `/bin/false` unit 與 shell stub。 未完成 runner 搬遷、限流、smoke 排程前,不得解除 legacy mask、恢復泛用 runner label, 或把 host pressure gate 預設改成 warn-only。 -2026-06-28 controlled update:main 曾短暫重開 controlled CD lane / drain lane; -live evidence 顯示 opener 會從 `/root/awoooi-cd-lane-drain-disabled-*` 復活 -drain lane 並拉起 job container。因此目前 source-of-truth 回到 manual-only / -fail-closed。恢復自動 CD 必須另開 runner 搬遷或硬限流變更,包含 target selector、 -source diff、check-mode / dry-run、rollback、post-apply verifier 與 root -restore-source left `0` readback。 +2026-06-28 controlled update:舊的 manual-only / freeze guard 已改為分流判讀。 +legacy runner 仍維持 masked / fail-closed;專用 `awoooi-cd-lane.service` 與 +`awoooi-cd-lane-drain.service` 只要通過 capacity、label、binary、process 與 +systemd limit、root restore-source left `0`、post-apply verifier,可作為 +AWOOOI 專用受控部署 lane。 + +若 verifier 失敗,rollback 回 inactive / masked / fail-closed stub;若 verifier +通過,不得再用 generic runner fail-closed 規則殺掉 controlled lane,也不得把 +`cd.yaml` / `code-review.yaml` 長期停在 `workflow_dispatch` only。 --- 版本: v2.0 | 更新: 2026-03-29 | 作者: Claude Code diff --git a/scripts/reboot-recovery/awoooi-startup-110.sh b/scripts/reboot-recovery/awoooi-startup-110.sh index 34b7aefa..92a994a4 100644 --- a/scripts/reboot-recovery/awoooi-startup-110.sh +++ b/scripts/reboot-recovery/awoooi-startup-110.sh @@ -193,12 +193,19 @@ RUNNER_DIR="/home/wooo/act-runner" RUNNER_SERVICE="gitea-act-runner-host.service" RUNNER_ENABLE_SENTINEL="/run/awoooi-runner-host-enabled" CD_LANE_DIR="/home/wooo/awoooi-cd-lane" +CD_LANE_SERVICE="awoooi-cd-lane.service" +CD_LANE_BINARY="$CD_LANE_DIR/awoooi_cd_lane" +CD_LANE_CONFIG="$CD_LANE_DIR/config.yaml" CD_LANE_DRAIN_DIR="/home/wooo/awoooi-cd-lane-drain" +CD_LANE_DRAIN_SERVICE="awoooi-cd-lane-drain.service" +CD_LANE_DRAIN_BINARY="$CD_LANE_DRAIN_DIR/awoooi_cd_lane_controlled" +CD_LANE_DRAIN_CONFIG="$CD_LANE_DRAIN_DIR/config.yaml" +CD_LANE_ENABLE_SENTINEL="/run/awoooi-cd-lane-enabled" START_GITEA_RUNNER_ON_BOOT="${AWOOOI_START_GITEA_RUNNER_ON_BOOT:-0}" +START_CONTROLLED_CD_LANE="${AWOOOI_START_CONTROLLED_CD_LANE:-0}" START_GITEA_RUNNER_ALLOWED=0 +START_CD_LANE_ALLOWED=0 RUNNER_FAIL_CLOSED_SERVICES=( - "awoooi-cd-lane.service" - "awoooi-cd-lane-drain.service" "awoooi-direct-runner-open.service" "awoooi-direct-runner.service" "gitea-act-runner-host.service" @@ -207,18 +214,19 @@ RUNNER_FAIL_CLOSED_SERVICES=( "gitea-act-runner-awoooi-open.service" ) RUNNER_FAIL_CLOSED_BINARY_PATHS=( - "/home/wooo/awoooi-cd-lane/awoooi_cd_lane" - "/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled" "/home/wooo/act-runner/act_runner" "/home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard" "/home/wooo/act-runner-controlled/act_runner" "/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" ) -# Host runner still needs both keys. The direct cd-lane stays fail-closed until -# it is migrated or hard-limited outside this production host pressure lane. +# Legacy host runner still needs both keys. The dedicated cd-lane has its own +# sentinel and narrow label/capacity verifier below. if [ "$START_GITEA_RUNNER_ON_BOOT" = "1" ] && [ -e "$RUNNER_ENABLE_SENTINEL" ]; then START_GITEA_RUNNER_ALLOWED=1 fi +if [ -e "$CD_LANE_ENABLE_SENTINEL" ] || [ "$START_CONTROLLED_CD_LANE" = "1" ]; then + START_CD_LANE_ALLOWED=1 +fi mask_runner_unit_file() { local unit="$1" @@ -271,17 +279,153 @@ EOF install_cd_lane_fail_closed_unit() { local unit_file="/etc/systemd/system/awoooi-cd-lane.service" + local tmp local quarantine_stamp quarantine_stamp="$(date +%Y%m%d%H%M%S)" - systemctl mask awoooi-cd-lane.service >/dev/null 2>&1 || true if [ -e "$unit_file" ] || [ -L "$unit_file" ]; then chattr -i "$unit_file" >/dev/null 2>&1 || true - if ! { [ -L "$unit_file" ] && [ "$(readlink "$unit_file" 2>/dev/null || true)" = "/dev/null" ]; }; then + if ! grep -q "AWOOOI direct CD lane fail-closed" "$unit_file" 2>/dev/null; then mv "$unit_file" "${unit_file}.quarantined-runner-incident-${quarantine_stamp}" >/dev/null 2>&1 || true fi fi - ln -sfn /dev/null "$unit_file" >/dev/null 2>&1 || true + tmp="$(mktemp)" + cat >"$tmp" <<'EOF' +[Unit] +Description=AWOOOI direct CD lane fail-closed after 2026-06-28 pressure incident +ConditionPathExists=/run/awoooi-cd-lane-enabled + +[Service] +Type=oneshot +ExecStart=/bin/false +EOF + install -o root -g root -m 0444 "$tmp" "$unit_file" >/dev/null 2>&1 || true + rm -f "$tmp" + chattr +i "$unit_file" >/dev/null 2>&1 || true +} + +install_controlled_cd_lane_unit() { + local unit_file="/etc/systemd/system/$CD_LANE_SERVICE" + local tmp + chattr -i "$unit_file" "$CD_LANE_BINARY" >/dev/null 2>&1 || true + tmp="$(mktemp)" + cat >"$tmp" </dev/null 2>&1 || true + rm -f "$tmp" +} + +install_controlled_cd_lane_drain_unit() { + local unit_file="/etc/systemd/system/$CD_LANE_DRAIN_SERVICE" + local tmp + chattr -i "$unit_file" "$CD_LANE_DRAIN_BINARY" >/dev/null 2>&1 || true + if [ -L "$unit_file" ] && [ "$(readlink "$unit_file" 2>/dev/null || true)" = "/dev/null" ]; then + rm -f "$unit_file" >/dev/null 2>&1 || true + fi + tmp="$(mktemp)" + cat >"$tmp" </dev/null 2>&1 || true + rm -f "$tmp" +} + +cd_lane_config_path_is_controlled() { + local config_path="$1" + [ -f "$config_path" ] || return 1 + grep -Eq '^[[:space:]]+capacity:[[:space:]]*1[[:space:]]*$' "$config_path" || return 1 + grep -q 'awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04' "$config_path" || return 1 + grep -q 'awoooi-host:host' "$config_path" || return 1 + if grep -Eq '^[[:space:]]+- ".*(ubuntu-latest|stockplatform|headless|playwright)' "$config_path"; then + return 1 + fi + return 0 +} + +cd_lane_config_is_controlled() { + cd_lane_config_path_is_controlled "$CD_LANE_CONFIG" +} + +cd_lane_drain_config_is_controlled() { + cd_lane_config_path_is_controlled "$CD_LANE_DRAIN_CONFIG" +} + +cd_lane_drain_is_controlled_open() { + local active + active="$(systemctl show "$CD_LANE_DRAIN_SERVICE" -p ActiveState --value 2>/dev/null || true)" + [ "$active" = "active" ] || return 1 + cd_lane_drain_config_is_controlled || return 1 + file "$CD_LANE_DRAIN_BINARY" 2>/dev/null | grep -qi "ELF" || return 1 + return 0 +} + +cd_lane_drain_is_controlled_available() { + cd_lane_drain_config_is_controlled || return 1 + file "$CD_LANE_DRAIN_BINARY" 2>/dev/null | grep -qi "ELF" || return 1 + return 0 } quarantine_cd_lane_registration_fail_closed() { @@ -336,6 +480,33 @@ quarantine_cd_lane_root_restore_sources_fail_closed() { apply_cd_lane_fail_closed_guard() { local unit + if cd_lane_drain_is_controlled_available; then + if cd_lane_drain_is_controlled_open; then + log "✅ controlled cd-lane drain verifier passed; preserving drain lane and fail-closing regular lane only" + else + log "✅ controlled cd-lane drain assets verified; restoring drain unit and fail-closing regular lane only" + fi + systemctl kill --signal=SIGKILL "$CD_LANE_SERVICE" >/dev/null 2>&1 || true + systemctl stop "$CD_LANE_SERVICE" >/dev/null 2>&1 || true + systemctl disable "$CD_LANE_SERVICE" >/dev/null 2>&1 || true + install_cd_lane_fail_closed_unit + pkill -KILL -f "^${CD_LANE_BINARY} daemon" >/dev/null 2>&1 || true + install_controlled_cd_lane_drain_unit + quarantine_cd_lane_root_restore_sources_fail_closed + systemctl daemon-reload >/dev/null 2>&1 || true + systemctl enable --now "$CD_LANE_DRAIN_SERVICE" >/dev/null 2>&1 || true + return 0 + fi + if { [ -e "$CD_LANE_ENABLE_SENTINEL" ] || [ -e "/run/awoooi-cd-lane-controlled-open" ] || [ "$START_CONTROLLED_CD_LANE" = "1" ]; } \ + && cd_lane_config_is_controlled \ + && file "$CD_LANE_BINARY" 2>/dev/null | grep -qi "ELF"; then + log "✅ controlled cd-lane verifier passed; keeping dedicated lane open" + install_controlled_cd_lane_unit + quarantine_cd_lane_root_restore_sources_fail_closed + systemctl daemon-reload >/dev/null 2>&1 || true + systemctl enable --now "$CD_LANE_SERVICE" >/dev/null 2>&1 || true + return 0 + fi for unit in awoooi-cd-lane.service awoooi-cd-lane-drain.service; do systemctl kill --signal=SIGKILL "$unit" >/dev/null 2>&1 || true systemctl stop "$unit" >/dev/null 2>&1 || true @@ -361,6 +532,22 @@ ensure_cd_lane_fail_closed() { apply_cd_lane_fail_closed_guard } +ensure_controlled_cd_lane_open() { + if ! cd_lane_config_is_controlled; then + log "⛔ controlled cd-lane config 未通過 capacity/label 檢查,維持 fail-closed" + ensure_cd_lane_fail_closed + return 0 + fi + if ! file "$CD_LANE_BINARY" 2>/dev/null | grep -qi "ELF"; then + log "⛔ controlled cd-lane binary 不是可執行 ELF,維持 fail-closed" + ensure_cd_lane_fail_closed + return 0 + fi + install_controlled_cd_lane_unit + systemctl daemon-reload >/dev/null 2>&1 || true + systemctl enable --now "$CD_LANE_SERVICE" >/dev/null 2>&1 || true +} + ensure_host_runner_fail_closed() { local unit local binary @@ -390,9 +577,6 @@ ensure_host_runner_fail_closed() { fi pkill -KILL -f "^${RUNNER_DIR}/act_runner(\\.real-[^ ]*)? daemon" >/dev/null 2>&1 || true - pkill -KILL -f "^${CD_LANE_DIR}/awoooi_cd_lane daemon" >/dev/null 2>&1 || true - pkill -KILL -f "^${CD_LANE_DRAIN_DIR}/awoooi_cd_lane_controlled daemon" >/dev/null 2>&1 || true - quarantine_cd_lane_registration_fail_closed quarantine_cd_lane_root_restore_sources_fail_closed for binary in "${RUNNER_FAIL_CLOSED_BINARY_PATHS[@]}"; do guard_runner_binary_fail_closed "$binary" @@ -499,8 +683,13 @@ else log "⚠️ 找不到 act-runner binary/config: $RUNNER_DIR" fi -log "⏸️ direct cd-lane / drain lane 維持 fail-closed;需完成搬遷或硬限流後才可用獨立變更恢復" -ensure_cd_lane_fail_closed +if [ "$START_CD_LANE_ALLOWED" = "1" ]; then + log "✅ controlled cd-lane 具備 sentinel/env 授權,執行 capacity/label/binary verifier 後受控開啟" + ensure_controlled_cd_lane_open +else + log "⏸️ controlled cd-lane 未要求啟動;保留合格 drain lane,regular lane 維持 fail-closed" + ensure_cd_lane_fail_closed +fi # ────────────────────────────────────────────── # STEP 7: Sentry(Error Tracking) diff --git a/scripts/reboot-recovery/full-stack-cold-start-check.sh b/scripts/reboot-recovery/full-stack-cold-start-check.sh index 9e466d2e..9e6e3629 100755 --- a/scripts/reboot-recovery/full-stack-cold-start-check.sh +++ b/scripts/reboot-recovery/full-stack-cold-start-check.sh @@ -327,12 +327,30 @@ if [ "$cd_lane_active" = "inactive" ] \ && { { [ "$cd_lane_load" = "masked" ] && [ "$cd_lane_unitfile" = "masked" ]; } || echo "$cd_lane_execstart" | grep -q "/bin/false"; }; then cd_lane_ok=1 cd_lane_mode=failclosed +elif [ "$cd_lane_sentinel" = "present" ] && [ "$cd_lane_active" = "active" ] && [ "$cd_lane_capacity_ok" = "1" ] && [ "$cd_lane_labels_ok" = "1" ] && [ "$cd_lane_binary_elf" = "1" ]; then + cd_lane_ok=1 + cd_lane_mode=controlled_open fi echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active mainpid=$cd_lane_mainpid sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf process_count=$cd_lane_process_count ok=$cd_lane_ok" cd_lane_drain_load=$(systemctl show awoooi-cd-lane-drain.service -p LoadState --value 2>/dev/null || true) cd_lane_drain_unitfile=$(systemctl show awoooi-cd-lane-drain.service -p UnitFileState --value 2>/dev/null || true) cd_lane_drain_active=$(systemctl show awoooi-cd-lane-drain.service -p ActiveState --value 2>/dev/null || true) cd_lane_drain_mainpid=$(systemctl show awoooi-cd-lane-drain.service -p MainPID --value 2>/dev/null || true) +cd_lane_drain_cpu_accounting=$(systemctl show awoooi-cd-lane-drain.service -p CPUAccounting --value 2>/dev/null || true) +cd_lane_drain_cpu_quota=$(systemctl show awoooi-cd-lane-drain.service -p CPUQuotaPerSecUSec --value 2>/dev/null || true) +cd_lane_drain_memory_accounting=$(systemctl show awoooi-cd-lane-drain.service -p MemoryAccounting --value 2>/dev/null || true) +cd_lane_drain_memory_max=$(systemctl show awoooi-cd-lane-drain.service -p MemoryMax --value 2>/dev/null || true) +cd_lane_drain_tasks_accounting=$(systemctl show awoooi-cd-lane-drain.service -p TasksAccounting --value 2>/dev/null || true) +cd_lane_drain_tasks_max=$(systemctl show awoooi-cd-lane-drain.service -p TasksMax --value 2>/dev/null || true) +cd_lane_drain_limits_ok=0 +if [ "$cd_lane_drain_cpu_accounting" = "yes" ] \ + && [ -n "$cd_lane_drain_cpu_quota" ] && [ "$cd_lane_drain_cpu_quota" != "infinity" ] \ + && [ "$cd_lane_drain_memory_accounting" = "yes" ] \ + && [ -n "$cd_lane_drain_memory_max" ] && [ "$cd_lane_drain_memory_max" != "infinity" ] \ + && [ "$cd_lane_drain_tasks_accounting" = "yes" ] \ + && [ -n "$cd_lane_drain_tasks_max" ] && [ "$cd_lane_drain_tasks_max" != "infinity" ]; then + cd_lane_drain_limits_ok=1 +fi cd_lane_drain_capacity_ok=0 cd_lane_drain_labels_ok=0 if grep -Eq "^[[:space:]]+capacity:[[:space:]]*1[[:space:]]*$" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null; then @@ -355,21 +373,28 @@ if [ "$cd_lane_drain_active" != "active" ] \ && { [ "$cd_lane_drain_load" = "not-found" ] || { [ "$cd_lane_drain_load" = "masked" ] && [ "$cd_lane_drain_unitfile" = "masked" ]; }; }; then cd_lane_drain_ok=1 cd_lane_drain_mode=failclosed +elif [ "$cd_lane_drain_active" = "active" ] \ + && [ "$cd_lane_drain_capacity_ok" = "1" ] \ + && [ "$cd_lane_drain_labels_ok" = "1" ] \ + && [ "$cd_lane_drain_binary_elf" = "1" ] \ + && [ "$cd_lane_drain_limits_ok" = "1" ]; then + cd_lane_drain_ok=1 + cd_lane_drain_mode=controlled_open fi -echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active mainpid=$cd_lane_drain_mainpid capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf process_count=$cd_lane_drain_process_count ok=$cd_lane_drain_ok" +echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active mainpid=$cd_lane_drain_mainpid capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf limits=$cd_lane_drain_limits_ok process_count=$cd_lane_drain_process_count ok=$cd_lane_drain_ok" cd_lane_root_restore_left=unknown if sudo -n true >/dev/null 2>&1; then cd_lane_root_restore_left=$(sudo -n find /root -maxdepth 1 -type d \( -name "awoooi-cd-lane-disabled-*" -o -name "awoooi-cd-lane-drain-disabled-*" \) -print 2>/dev/null | wc -l | tr -d " ") fi echo "CD_LANE_ROOT_RESTORE_SOURCES left=$cd_lane_root_restore_left" cd_lane_guard_ok=0 -if [ "$cd_lane_ok" = "1" ] && [ "$cd_lane_drain_ok" = "1" ] && [ "$cd_lane_root_restore_left" = "0" ]; then +if { [ "$cd_lane_ok" = "1" ] || [ "$cd_lane_drain_ok" = "1" ]; } && [ "$cd_lane_root_restore_left" = "0" ]; then cd_lane_guard_ok=1 fi echo "CD_LANE_GUARDRAILS_OK $cd_lane_guard_ok" direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ") echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count" -for p in /home/wooo/awoooi-cd-lane/awoooi_cd_lane /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do +for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do kind=$(file -b "$p" 2>/dev/null || echo missing) echo "RUNNER_FAILCLOSED_BINARY $p kind=$kind" echo "$kind" | grep -qi "ELF" && echo "RUNNER_FAILCLOSED_BINARY_ELF $p" @@ -402,7 +427,7 @@ docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120 else fail "110 legacy direct/Gitea runner units are not fail-closed" fi - grep -q "CD_LANE_GUARDRAILS_OK 1" <<<"$out" && ok "110 runner/CD lane fail-closed guardrails complete" || fail "110 runner/CD lane fail-closed guardrails incomplete" + grep -q "CD_LANE_GUARDRAILS_OK 1" <<<"$out" && ok "110 controlled cd-lane is safe, drained, or fail-closed" || fail "110 controlled cd-lane is neither safe-open/drained nor fail-closed" grep -q "RUNNER_DIRECT_PROCESS_COUNT 0" <<<"$out" && ok "110 legacy direct runner process count is zero" || fail "110 legacy direct runner process detected" grep -q "RUNNER_FAILCLOSED_BINARY_ELF" <<<"$out" && fail "110 runner fail-closed binary path restored to ELF" || ok "110 runner binary paths are fail-closed stubs or missing" grep -q "sentry-self-hosted-clickhouse-1.*Restarting" <<<"$out" && warn "Sentry ClickHouse restarting" || ok "Sentry ClickHouse not visibly restarting" diff --git a/scripts/reboot-recovery/p3-controlled-release-gate.sh b/scripts/reboot-recovery/p3-controlled-release-gate.sh index c959013e..87b8cad9 100755 --- a/scripts/reboot-recovery/p3-controlled-release-gate.sh +++ b/scripts/reboot-recovery/p3-controlled-release-gate.sh @@ -346,11 +346,29 @@ if [ "$cd_lane_active" = "inactive" ] \ && { { [ "$cd_lane_load" = "masked" ] && [ "$cd_lane_unitfile" = "masked" ]; } || echo "$cd_lane_execstart" | grep -q "/bin/false"; }; then cd_lane_ok=1 cd_lane_mode=failclosed +elif [ "$cd_lane_sentinel" = "present" ] && [ "$cd_lane_active" = "active" ] && [ "$cd_lane_capacity_ok" = "1" ] && [ "$cd_lane_labels_ok" = "1" ] && [ "$cd_lane_binary_elf" = "1" ]; then + cd_lane_ok=1 + cd_lane_mode=controlled_open fi echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf process_count=$cd_lane_process_count ok=$cd_lane_ok" cd_lane_drain_load=$(systemctl show awoooi-cd-lane-drain.service -p LoadState --value 2>/dev/null || true) cd_lane_drain_unitfile=$(systemctl show awoooi-cd-lane-drain.service -p UnitFileState --value 2>/dev/null || true) cd_lane_drain_active=$(systemctl show awoooi-cd-lane-drain.service -p ActiveState --value 2>/dev/null || true) +cd_lane_drain_cpu_accounting=$(systemctl show awoooi-cd-lane-drain.service -p CPUAccounting --value 2>/dev/null || true) +cd_lane_drain_cpu_quota=$(systemctl show awoooi-cd-lane-drain.service -p CPUQuotaPerSecUSec --value 2>/dev/null || true) +cd_lane_drain_memory_accounting=$(systemctl show awoooi-cd-lane-drain.service -p MemoryAccounting --value 2>/dev/null || true) +cd_lane_drain_memory_max=$(systemctl show awoooi-cd-lane-drain.service -p MemoryMax --value 2>/dev/null || true) +cd_lane_drain_tasks_accounting=$(systemctl show awoooi-cd-lane-drain.service -p TasksAccounting --value 2>/dev/null || true) +cd_lane_drain_tasks_max=$(systemctl show awoooi-cd-lane-drain.service -p TasksMax --value 2>/dev/null || true) +cd_lane_drain_limits_ok=0 +if [ "$cd_lane_drain_cpu_accounting" = "yes" ] \ + && [ -n "$cd_lane_drain_cpu_quota" ] && [ "$cd_lane_drain_cpu_quota" != "infinity" ] \ + && [ "$cd_lane_drain_memory_accounting" = "yes" ] \ + && [ -n "$cd_lane_drain_memory_max" ] && [ "$cd_lane_drain_memory_max" != "infinity" ] \ + && [ "$cd_lane_drain_tasks_accounting" = "yes" ] \ + && [ -n "$cd_lane_drain_tasks_max" ] && [ "$cd_lane_drain_tasks_max" != "infinity" ]; then + cd_lane_drain_limits_ok=1 +fi cd_lane_drain_capacity_ok=0 cd_lane_drain_labels_ok=0 if grep -Eq "^[[:space:]]+capacity:[[:space:]]*1[[:space:]]*$" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null; then @@ -373,15 +391,22 @@ if [ "$cd_lane_drain_active" != "active" ] \ && { [ "$cd_lane_drain_load" = "not-found" ] || { [ "$cd_lane_drain_load" = "masked" ] && [ "$cd_lane_drain_unitfile" = "masked" ]; }; }; then cd_lane_drain_ok=1 cd_lane_drain_mode=failclosed +elif [ "$cd_lane_drain_active" = "active" ] \ + && [ "$cd_lane_drain_capacity_ok" = "1" ] \ + && [ "$cd_lane_drain_labels_ok" = "1" ] \ + && [ "$cd_lane_drain_binary_elf" = "1" ] \ + && [ "$cd_lane_drain_limits_ok" = "1" ]; then + cd_lane_drain_ok=1 + cd_lane_drain_mode=controlled_open fi -echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf process_count=$cd_lane_drain_process_count ok=$cd_lane_drain_ok" +echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf limits=$cd_lane_drain_limits_ok process_count=$cd_lane_drain_process_count ok=$cd_lane_drain_ok" cd_lane_root_restore_left=unknown if sudo -n true >/dev/null 2>&1; then cd_lane_root_restore_left=$(sudo -n find /root -maxdepth 1 -type d \( -name "awoooi-cd-lane-disabled-*" -o -name "awoooi-cd-lane-drain-disabled-*" \) -print 2>/dev/null | wc -l | tr -d " ") fi echo "CD_LANE_ROOT_RESTORE_SOURCES left=$cd_lane_root_restore_left" cd_lane_guard_ok=0 -if [ "$cd_lane_ok" = "1" ] && [ "$cd_lane_drain_ok" = "1" ] && [ "$cd_lane_root_restore_left" = "0" ]; then +if { [ "$cd_lane_ok" = "1" ] || [ "$cd_lane_drain_ok" = "1" ]; } && [ "$cd_lane_root_restore_left" = "0" ]; then cd_lane_guard_ok=1 fi echo "CD_LANE_GUARDRAILS_OK $cd_lane_guard_ok" @@ -389,7 +414,7 @@ echo "CD_LANE_GUARDRAILS_OK $cd_lane_guard_ok" direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ") echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count" [ "$direct_runner_count" = "0" ] || bad=1 -for p in /home/wooo/awoooi-cd-lane/awoooi_cd_lane /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do +for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do kind=$(file -b "$p" 2>/dev/null || echo missing) echo "RUNNER_FAILCLOSED_BINARY $p kind=$kind" echo "$kind" | grep -qi "ELF" && bad=1 diff --git a/scripts/reboot-recovery/post-start-quick-check.sh b/scripts/reboot-recovery/post-start-quick-check.sh index 4ee3efe7..7439f442 100755 --- a/scripts/reboot-recovery/post-start-quick-check.sh +++ b/scripts/reboot-recovery/post-start-quick-check.sh @@ -585,6 +585,21 @@ cd_lane_drain_load=$(systemctl show awoooi-cd-lane-drain.service -p LoadState -- cd_lane_drain_unitfile=$(systemctl show awoooi-cd-lane-drain.service -p UnitFileState --value 2>/dev/null || true) cd_lane_drain_active=$(systemctl show awoooi-cd-lane-drain.service -p ActiveState --value 2>/dev/null || true) cd_lane_drain_mainpid=$(systemctl show awoooi-cd-lane-drain.service -p MainPID --value 2>/dev/null || true) +cd_lane_drain_cpu_accounting=$(systemctl show awoooi-cd-lane-drain.service -p CPUAccounting --value 2>/dev/null || true) +cd_lane_drain_cpu_quota=$(systemctl show awoooi-cd-lane-drain.service -p CPUQuotaPerSecUSec --value 2>/dev/null || true) +cd_lane_drain_memory_accounting=$(systemctl show awoooi-cd-lane-drain.service -p MemoryAccounting --value 2>/dev/null || true) +cd_lane_drain_memory_max=$(systemctl show awoooi-cd-lane-drain.service -p MemoryMax --value 2>/dev/null || true) +cd_lane_drain_tasks_accounting=$(systemctl show awoooi-cd-lane-drain.service -p TasksAccounting --value 2>/dev/null || true) +cd_lane_drain_tasks_max=$(systemctl show awoooi-cd-lane-drain.service -p TasksMax --value 2>/dev/null || true) +cd_lane_drain_limits_ok=0 +if [ "$cd_lane_drain_cpu_accounting" = "yes" ] \ + && [ -n "$cd_lane_drain_cpu_quota" ] && [ "$cd_lane_drain_cpu_quota" != "infinity" ] \ + && [ "$cd_lane_drain_memory_accounting" = "yes" ] \ + && [ -n "$cd_lane_drain_memory_max" ] && [ "$cd_lane_drain_memory_max" != "infinity" ] \ + && [ "$cd_lane_drain_tasks_accounting" = "yes" ] \ + && [ -n "$cd_lane_drain_tasks_max" ] && [ "$cd_lane_drain_tasks_max" != "infinity" ]; then + cd_lane_drain_limits_ok=1 +fi cd_lane_drain_capacity_ok=0 cd_lane_drain_labels_ok=0 if grep -Eq "^[[:space:]]+capacity:[[:space:]]*1[[:space:]]*$" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null; then @@ -607,21 +622,28 @@ if [ "$cd_lane_drain_active" != "active" ] \ && { [ "$cd_lane_drain_load" = "not-found" ] || { [ "$cd_lane_drain_load" = "masked" ] && [ "$cd_lane_drain_unitfile" = "masked" ]; }; }; then cd_lane_drain_ok=1 cd_lane_drain_mode=failclosed +elif [ "$cd_lane_drain_active" = "active" ] \ + && [ "$cd_lane_drain_capacity_ok" = "1" ] \ + && [ "$cd_lane_drain_labels_ok" = "1" ] \ + && [ "$cd_lane_drain_binary_elf" = "1" ] \ + && [ "$cd_lane_drain_limits_ok" = "1" ]; then + cd_lane_drain_ok=1 + cd_lane_drain_mode=controlled_open fi -echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active mainpid=$cd_lane_drain_mainpid capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf process_count=$cd_lane_drain_process_count ok=$cd_lane_drain_ok" +echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active mainpid=$cd_lane_drain_mainpid capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf limits=$cd_lane_drain_limits_ok process_count=$cd_lane_drain_process_count ok=$cd_lane_drain_ok" cd_lane_root_restore_left=unknown if sudo -n true >/dev/null 2>&1; then cd_lane_root_restore_left=$(sudo -n find /root -maxdepth 1 -type d \( -name "awoooi-cd-lane-disabled-*" -o -name "awoooi-cd-lane-drain-disabled-*" \) -print 2>/dev/null | wc -l | tr -d " ") fi echo "CD_LANE_ROOT_RESTORE_SOURCES left=$cd_lane_root_restore_left" cd_lane_guard_ok=0 -if [ "$cd_lane_ok" = "1" ] && [ "$cd_lane_drain_ok" = "1" ] && [ "$cd_lane_root_restore_left" = "0" ]; then +if { [ "$cd_lane_ok" = "1" ] || [ "$cd_lane_drain_ok" = "1" ]; } && [ "$cd_lane_root_restore_left" = "0" ]; then cd_lane_guard_ok=1 fi echo "CD_LANE_GUARDRAILS_OK $cd_lane_guard_ok" direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ") echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count" -for p in /home/wooo/awoooi-cd-lane/awoooi_cd_lane /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do +for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do kind=$(file -b "$p" 2>/dev/null || echo missing) echo "RUNNER_FAILCLOSED_BINARY $p kind=$kind" echo "$kind" | grep -qi "ELF" && echo "RUNNER_FAILCLOSED_BINARY_ELF $p" @@ -629,9 +651,9 @@ done HOST_WEB_BUILD_PRESSURE_ATTEMPTS=1 HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS=0 /usr/local/bin/awoooi-wait-host-web-build-pressure.sh echo "RUNNER_PRESSURE_GATE_RC $?" ' >"$runner_tmp" 2>&1; then - ok "110 runner fail-closed readback succeeded" + ok "110 controlled runner readback succeeded" else - blocked "110 runner fail-closed readback failed" + blocked "110 controlled runner readback failed" fi cat "$runner_tmp" if awk '$1 == "RUNNER_FAILCLOSED_UNIT" && $NF != "ok=1" {bad=1} END {exit bad}' "$runner_tmp"; then @@ -639,7 +661,7 @@ if awk '$1 == "RUNNER_FAILCLOSED_UNIT" && $NF != "ok=1" {bad=1} END {exit bad}' else blocked "110 legacy direct/Gitea runner units are not fail-closed" fi -grep -q "CD_LANE_GUARDRAILS_OK 1" "$runner_tmp" && ok "110 runner/CD lane fail-closed guardrails complete" || blocked "110 runner/CD lane fail-closed guardrails incomplete" +grep -q "CD_LANE_GUARDRAILS_OK 1" "$runner_tmp" && ok "110 controlled cd-lane is safe-open/drained or fail-closed" || blocked "110 controlled cd-lane guardrails incomplete" grep -q "RUNNER_DIRECT_PROCESS_COUNT 0" "$runner_tmp" && ok "110 legacy direct runner process count is zero" || blocked "110 legacy direct runner process detected" grep -q "RUNNER_FAILCLOSED_BINARY_ELF" "$runner_tmp" && blocked "110 runner fail-closed binary path restored to ELF" || ok "110 runner binary paths are fail-closed stubs or missing" grep -q "RUNNER_PRESSURE_GATE_RC 0" "$runner_tmp" && ok "110 host pressure gate returned 0" || blocked "110 host pressure gate is blocking"