From 8f402983eecbc398658e484c87caa8964cb85a77 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 28 Jun 2026 09:16:43 +0800 Subject: [PATCH] fix(reboot): enforce direct runner fail-closed guard [skip ci] --- docs/LOGBOOK.md | 19 ++++ scripts/reboot-recovery/awoooi-startup-110.sh | 98 ++++++++++++++++++- .../full-stack-cold-start-check.sh | 21 ++++ .../p3-controlled-release-gate.sh | 17 +++- .../reboot-recovery/post-start-quick-check.sh | 35 +++++++ 5 files changed, 186 insertions(+), 4 deletions(-) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 2cacfa07..e3f4b286 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -20,6 +20,25 @@ - OpenClaw 仍維持 production decision core;替換前必須 replay / shadow / canary / ADR。 - SDK install、API shadow / canary、production route、paid provider / cost route、external active security scan、secret value / credential URL / raw env、DB destructive / backup restore、force push / repo refs deletion 仍不得被本段 controlled queue 直接打開。 +## 2026-06-28 — 09:16 direct runner source guard 實作收斂 + +**背景**:09:00 前的 live hotfix 已把 110 上 direct / Gitea runner 全部 mask,但 `awoooi-startup-110.sh`、cold-start 與 P3 release gate 還沒有把 `awoooi-direct-runner-open.service` 這條 transient direct runner 路徑納入 source-level guard。 + +**完成內容**: +- `scripts/reboot-recovery/awoooi-startup-110.sh` 新增 `RUNNER_FAIL_CLOSED_SERVICES` 與 `RUNNER_FAIL_CLOSED_BINARY_PATHS`,預設未同時具備 `AWOOOI_START_GITEA_RUNNER_ON_BOOT=1` 與 `/run/awoooi-runner-host-enabled` 時,會強制 kill / disable / mask direct runner 與 Gitea runner units,並把 live runner ELF quarantine 成 163-byte fail-closed stub。 +- `scripts/reboot-recovery/full-stack-cold-start-check.sh` 新增 110 runner fail-closed readback:direct / Gitea units 必須 `load=masked unitfile=masked active=inactive`,direct runner process count 必須 `0`,runner binary 不得是 ELF。 +- `scripts/reboot-recovery/post-start-quick-check.sh` 新增 `110 runner fail-closed guard` section,並以 `HOST_WEB_BUILD_PRESSURE_ATTEMPTS=1` 讀回 pressure gate。 +- 
`scripts/reboot-recovery/p3-controlled-release-gate.sh` 將 direct runner fail-closed 狀態納入 `BAD_RUNNER_GUARDRAILS`,避免 P3 release gate 只看 `actions.runner.*` 而漏掉 transient direct runner。 +- Live `/usr/local/bin/awoooi-startup-110.sh` 已更新並加 immutable;讀回 `LIVE_STARTUP_DIRECT_UNIT=1`、`LIVE_STARTUP_GUARD_FUNC=2`、`LIVE_STARTUP_DEFAULT=failclosed`。 + +**驗證結果**: +- 本地:`bash -n` 通過 `awoooi-startup-110.sh`、`full-stack-cold-start-check.sh`、`post-start-quick-check.sh`、`p3-controlled-release-gate.sh`;`git diff --check` 通過;direct runner source invariant 通過。 +- quick-check runner-only:`POST_START_QUICK_CHECK PASS=13 WARN=0 BLOCKED=0`、`RESULT=GREEN`;六個 runner/direct units 全部 masked / inactive、runner process `0`、四條 binary path 皆為 shell stub、pressure gate `RUNNER_PRESSURE_GATE_RC 0`。 +- cold-start 單次讀回:runner guard OK;整體仍 `PASS=90 WARN=1 BLOCKED=1`、`Result: BLOCKED`,blocker 是 `188 momo daily sales data stale beyond 3 days`,不是 runner。 +- P3 release gate:runner/CD guardrails 顯示 `BAD_RUNNER_GUARDRAILS 0`;整體仍 `HOLD_P3_RELEASE`,blockers 包含 cold-start、188 backup stale、188 litellm not running。 + +**邊界**:本段沒有重啟 Docker / Nginx / firewall / K3s / DB,沒有讀 raw sessions / SQLite / auth / `.env` / runner token,也沒有恢復 110 runner。 + ## 2026-06-28 — 08:45 110 runner 壓力事故 source / live fail-closed 收斂 **背景**:統帥全面授權打開非事故級 gate,但 110 Gitea runner 反覆拉起 StockPlatform headless Chrome smoke,已造成 production host CPU / CI 壓力事故;runner 未搬遷 / 限流前不得直接重開。 diff --git a/scripts/reboot-recovery/awoooi-startup-110.sh b/scripts/reboot-recovery/awoooi-startup-110.sh index aaca6054..f2609877 100644 --- a/scripts/reboot-recovery/awoooi-startup-110.sh +++ b/scripts/reboot-recovery/awoooi-startup-110.sh @@ -194,11 +194,105 @@ RUNNER_SERVICE="gitea-act-runner-host.service" RUNNER_ENABLE_SENTINEL="/run/awoooi-runner-host-enabled" START_GITEA_RUNNER_ON_BOOT="${AWOOOI_START_GITEA_RUNNER_ON_BOOT:-0}" START_GITEA_RUNNER_ALLOWED=0 +RUNNER_FAIL_CLOSED_SERVICES=( + "awoooi-direct-runner-open.service" + "awoooi-direct-runner.service" + 
"gitea-act-runner-host.service"
+  "gitea-act-runner-awoooi-controlled.service"
+  "gitea-awoooi-controlled-runner.service"
+  "gitea-act-runner-awoooi-open.service"
+)
+RUNNER_FAIL_CLOSED_BINARY_PATHS=(
+  "/home/wooo/act-runner/act_runner"
+  "/home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard"
+  "/home/wooo/act-runner-controlled/act_runner"
+  "/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner"
+)
 # The runtime operator sentinel is the second key for an authorized deployment
 # window. A single env var or a stale sentinel alone must not reopen host CI.
 if [ "$START_GITEA_RUNNER_ON_BOOT" = "1" ] && [ -e "$RUNNER_ENABLE_SENTINEL" ]; then
   START_GITEA_RUNNER_ALLOWED=1
 fi
+
+# Mask unit $1 inside unit directory $2 by pointing it at /dev/null; an existing
+# unit file is first renamed to *.quarantined-runner-incident-<timestamp>. An
+# optional $3 names the user that should own the symlink. Best effort throughout.
+mask_runner_unit_file() {
+  local unit="$1" unit_dir="$2" owner_user="${3:-}"
+  local unit_file="$unit_dir/$unit" quarantine_stamp
+  quarantine_stamp="$(date +%Y%m%d%H%M%S)"
+
+  mkdir -p "$unit_dir" >/dev/null 2>&1 || true
+  if [ -L "$unit_file" ] && [ "$(readlink "$unit_file" 2>/dev/null || true)" = "/dev/null" ]; then
+    return 0
+  fi
+  if [ -e "$unit_file" ] || [ -L "$unit_file" ]; then
+    chattr -i "$unit_file" >/dev/null 2>&1 || true
+    mv "$unit_file" "${unit_file}.quarantined-runner-incident-${quarantine_stamp}" >/dev/null 2>&1 || true
+  fi
+  ln -s /dev/null "$unit_file" >/dev/null 2>&1 || true
+  if [ -n "$owner_user" ]; then
+    chown -h "$owner_user:$owner_user" "$unit_file" >/dev/null 2>&1 || true
+  fi
+}
+
+# Turn runner binary path $1 into a fail-closed shell stub (exit 75). A live ELF
+# is first moved to a read-only, immutable *.quarantined-runner-incident-* copy.
+guard_runner_binary_fail_closed() {
+  local path="$1" tmp quarantined
+  quarantined="${path}.quarantined-runner-incident-$(date +%Y%m%d%H%M%S)"
+
+  if [ -e "$path" ]; then
+    chattr -i "$path" >/dev/null 2>&1 || true
+    if file -b "$path" 2>/dev/null | grep -qi "ELF"; then  # -b: match the type only, never the file name
+      mv "$path" "$quarantined" >/dev/null 2>&1 || true
+      chmod 0400 "$quarantined" >/dev/null 2>&1 || true
+      chattr +i "$quarantined" >/dev/null 2>&1 || true
+    fi
+  fi
+
+  tmp="$(mktemp)"
+  cat >"$tmp" <<'EOF'
+#!/usr/bin/env bash
+set -eu
+echo "AWOOOI host runner is fail-closed on 110 after 2026-06-28 pressure incident; migrate or rate-limit before enabling." >&2
+exit 75
+EOF
+  install -o root -g root -m 0755 "$tmp" "$path" >/dev/null 2>&1 || true
+  rm -f "$tmp"
+  chattr +i "$path" >/dev/null 2>&1 || true
+}
+
+# Fail-closed default: SIGKILL/disable/mask every RUNNER_FAIL_CLOSED_SERVICES unit
+# (system and user "wooo" scope), kill leftover runner processes, stub the binaries.
+ensure_host_runner_fail_closed() {
+  local unit binary wooo_uid
+  for unit in "${RUNNER_FAIL_CLOSED_SERVICES[@]}"; do
+    systemctl kill --signal=SIGKILL "$unit" >/dev/null 2>&1 || true
+    systemctl reset-failed "$unit" >/dev/null 2>&1 || true
+    systemctl disable "$unit" >/dev/null 2>&1 || true
+    systemctl mask "$unit" >/dev/null 2>&1 || true
+    mask_runner_unit_file "$unit" "/etc/systemd/system"  # no-op when systemctl mask already linked /dev/null
+  done
+  systemctl daemon-reload >/dev/null 2>&1 || true
+
+  if wooo_uid="$(id -u wooo 2>/dev/null)"; then
+    mkdir -p /home/wooo/.config/systemd/user >/dev/null 2>&1 || true
+    for unit in "${RUNNER_FAIL_CLOSED_SERVICES[@]}"; do
+      if [ -d "/run/user/$wooo_uid" ] && command -v runuser >/dev/null 2>&1; then
+        runuser -u wooo -- env XDG_RUNTIME_DIR="/run/user/$wooo_uid" systemctl --user kill --signal=SIGKILL "$unit" >/dev/null 2>&1 || true
+      fi
+      mask_runner_unit_file "$unit" "/home/wooo/.config/systemd/user" "wooo"
+    done
+  fi
+
+  pkill -KILL -f "^${RUNNER_DIR}/act_runner(\\.real-[^ ]*)? daemon" >/dev/null 2>&1 || true
+  for binary in "${RUNNER_FAIL_CLOSED_BINARY_PATHS[@]}"; do
+    pkill -KILL -f "^${binary}" >/dev/null 2>&1 || true  # same prefix match the readback pgrep counts
+    guard_runner_binary_fail_closed "$binary"
+  done
+}
+
 if [ -x "$RUNNER_DIR/act_runner" ] && [ -f "$RUNNER_DIR/config.yaml" ]; then
   # 若舊的 .runner 配置指向過期 hostname,只有在明確允許啟動 runner
   # 時才清除重新註冊;預設降壓模式不得碰 registration 狀態。
@@ -271,9 +365,7 @@ PY
   else
     log "⏸️ Gitea host runner 維持停用;需同時設定 AWOOOI_START_GITEA_RUNNER_ON_BOOT=1 與建立 $RUNNER_ENABLE_SENTINEL 才允許 startup 啟動"
   fi
-  systemctl disable --now "$RUNNER_SERVICE" >/dev/null 2>&1 || true
-  systemctl kill -s SIGKILL "$RUNNER_SERVICE" >/dev/null 2>&1 || true
-  pkill -KILL -f "$RUNNER_DIR/act_runner daemon" >/dev/null 2>&1 || true
+  ensure_host_runner_fail_closed
 fi
 
 # 已停用 Docker-wrapped runner;避免它搶走 host label job。
diff --git a/scripts/reboot-recovery/full-stack-cold-start-check.sh b/scripts/reboot-recovery/full-stack-cold-start-check.sh
index 037ecc3c..b7d64bd2 100755
--- a/scripts/reboot-recovery/full-stack-cold-start-check.sh
+++ b/scripts/reboot-recovery/full-stack-cold-start-check.sh
@@ -286,6 +286,20 @@ echo "ACTION_RUNNER_ENABLED_COUNT $(systemctl list-unit-files "actions.runner.*"
 for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
   systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /"
 done
+for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
+  load=$(systemctl show "$u" -p LoadState --value 2>/dev/null || true)
+  unitfile=$(systemctl show "$u" -p UnitFileState --value 2>/dev/null || true)
+  active=$(systemctl show "$u" -p ActiveState --value 2>/dev/null || true)
+  mainpid=$(systemctl show "$u" -p MainPID --value 2>/dev/null || true)
+  echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active 
mainpid=$mainpid"
+done
+direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ")
+echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count"
+for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
+  kind=$(file -b "$p" 2>/dev/null || echo missing)
+  echo "RUNNER_FAILCLOSED_BINARY $p kind=$kind"
+  echo "$kind" | grep -qi "ELF" && echo "RUNNER_FAILCLOSED_BINARY_ELF $p"
+done
 docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120
 ' 2>&1); then
     fail "ssh 110 read-only check"
@@ -309,6 +323,13 @@ docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120
   else
     warn "runner watchdog state not confirmed"
   fi
+  if awk '$1 == "RUNNER_FAILCLOSED_UNIT" {seen = 1; if ($3 != "load=masked" || $4 != "unitfile=masked" || $5 != "active=inactive") bad = 1} END {exit (bad || !seen)}' <<<"$out"; then
+    ok "110 direct/Gitea runner fail-closed units are masked"
+  else
+    fail "110 direct/Gitea runner fail-closed units are not all masked"  # also hit by a masked-but-active unit or a readback without unit rows
+  fi
+  if grep -q "RUNNER_DIRECT_PROCESS_COUNT 0" <<<"$out"; then ok "110 direct runner process count is zero"; else fail "110 direct runner process detected"; fi
+  if grep -q "RUNNER_FAILCLOSED_BINARY_ELF" <<<"$out"; then fail "110 runner fail-closed binary path restored to ELF"; else ok "110 runner binary paths are fail-closed stubs or missing"; fi
   grep -q "sentry-self-hosted-clickhouse-1.*Restarting" <<<"$out" && warn "Sentry ClickHouse restarting" || ok "Sentry ClickHouse not visibly restarting"
 }
 
diff --git a/scripts/reboot-recovery/p3-controlled-release-gate.sh b/scripts/reboot-recovery/p3-controlled-release-gate.sh
index 0c852ab5..1b42e4f6 100755
--- a/scripts/reboot-recovery/p3-controlled-release-gate.sh
+++ b/scripts/reboot-recovery/p3-controlled-release-gate.sh
@@ -304,8 +304,23 @@ awk '
check_runner_guardrails() {
   section "runner/CD guardrails"
   local out bad
   if ! out=$(ssh_cmd "wooo@192.168.0.110" '
 bad=0
+for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
+  load=$(systemctl show "$u" -p LoadState --value 2>/dev/null || true)
+  unitfile=$(systemctl show "$u" -p UnitFileState --value 2>/dev/null || true)
+  active=$(systemctl show "$u" -p ActiveState --value 2>/dev/null || true)
+  echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active"
+  [ "$load" = "masked" ] && [ "$unitfile" = "masked" ] && [ "$active" = "inactive" ] || bad=1
+done
+direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ")
+echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count"
+[ "$direct_runner_count" = "0" ] || bad=1
+for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
+  kind=$(file -b "$p" 2>/dev/null || echo missing)
+  echo "RUNNER_FAILCLOSED_BINARY $p kind=$kind"
+  echo "$kind" | grep -qi "ELF" && bad=1
+done
 for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
   watchdog=$(systemctl show "$u" -p WatchdogUSec --value)
   quota=$(systemctl show "$u" -p CPUQuotaPerSecUSec --value)
diff --git a/scripts/reboot-recovery/post-start-quick-check.sh b/scripts/reboot-recovery/post-start-quick-check.sh
index 2a783c94..24dad37b 100755
--- a/scripts/reboot-recovery/post-start-quick-check.sh
+++ b/scripts/reboot-recovery/post-start-quick-check.sh
@@ -535,6 +535,41 @@ if [[ "$RUN_CPU" 
-eq 1 ]]; then
   rm -f "$cpu_tmp"
 fi
 
+section "110 runner fail-closed guard"
+runner_tmp="$(mktemp -t post-start-runner.XXXXXX)"
+if ssh_read "wooo@192.168.0.110" '
+for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
+  load=$(systemctl show "$u" -p LoadState --value 2>/dev/null || true)
+  unitfile=$(systemctl show "$u" -p UnitFileState --value 2>/dev/null || true)
+  active=$(systemctl show "$u" -p ActiveState --value 2>/dev/null || true)
+  mainpid=$(systemctl show "$u" -p MainPID --value 2>/dev/null || true)
+  echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active mainpid=$mainpid"
+done
+direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ")
+echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count"
+for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
+  kind=$(file -b "$p" 2>/dev/null || echo missing)
+  echo "RUNNER_FAILCLOSED_BINARY $p kind=$kind"
+  echo "$kind" | grep -qi "ELF" && echo "RUNNER_FAILCLOSED_BINARY_ELF $p"
+done
+HOST_WEB_BUILD_PRESSURE_ATTEMPTS=1 HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS=0 /usr/local/bin/awoooi-wait-host-web-build-pressure.sh
+echo "RUNNER_PRESSURE_GATE_RC $?"
+' >"$runner_tmp" 2>&1; then
+  ok "110 runner fail-closed readback succeeded"
+else
+  blocked "110 runner fail-closed readback failed"
+fi
+cat "$runner_tmp"
+if awk '$1 == "RUNNER_FAILCLOSED_UNIT" {seen = 1; if ($3 != "load=masked" || $4 != "unitfile=masked" || $5 != "active=inactive") bad = 1} END {exit (bad || !seen)}' "$runner_tmp"; then
+  ok "110 direct/Gitea runner fail-closed units are masked"
+else
+  blocked "110 direct/Gitea runner fail-closed units are not all masked"  # also hit by a masked-but-active unit or a readback without unit rows
+fi
+if grep -q "RUNNER_DIRECT_PROCESS_COUNT 0" "$runner_tmp"; then ok "110 direct runner process count is zero"; else blocked "110 direct runner process detected"; fi
+if grep -q "RUNNER_FAILCLOSED_BINARY_ELF" "$runner_tmp"; then blocked "110 runner fail-closed binary path restored to ELF"; else ok "110 runner binary paths are fail-closed stubs or missing"; fi
+if grep -q "RUNNER_PRESSURE_GATE_RC 0" "$runner_tmp"; then ok "110 host pressure gate returned 0"; else blocked "110 host pressure gate is blocking"; fi
+rm -f "$runner_tmp"
+
 section "總結"
 printf 'POST_START_QUICK_CHECK PASS=%s WARN=%s BLOCKED=%s\n' "$PASS_COUNT" "$WARN_COUNT" "$BLOCKED_COUNT"
 printf 'POST_START_QUICK_CHECK_WARNINGS SERVICE=%s BOUNDARY=%s EVIDENCE=%s\n' "$SERVICE_WARN_COUNT" "$BOUNDARY_WARN_COUNT" "$EVIDENCE_WARN_COUNT"