fix(reboot): enforce direct runner fail-closed guard [skip ci]

This commit is contained in:
Your Name
2026-06-28 09:16:43 +08:00
parent f8e2b39ab3
commit 8f402983ee
5 changed files with 186 additions and 4 deletions

View File

@@ -194,11 +194,105 @@ RUNNER_SERVICE="gitea-act-runner-host.service"
# Runtime sentinel file an operator creates to authorize a runner start.
RUNNER_ENABLE_SENTINEL="/run/awoooi-runner-host-enabled"
# Environment opt-in; anything other than "1" keeps the runner disabled.
START_GITEA_RUNNER_ON_BOOT="${AWOOOI_START_GITEA_RUNNER_ON_BOOT:-0}"
# Final decision flag; stays 0 unless both keys below are present.
START_GITEA_RUNNER_ALLOWED=0

# systemd units that are killed, disabled and masked while fail-closed.
RUNNER_FAIL_CLOSED_SERVICES=(
  "awoooi-direct-runner-open.service"
  "awoooi-direct-runner.service"
  "gitea-act-runner-host.service"
  "gitea-act-runner-awoooi-controlled.service"
  "gitea-awoooi-controlled-runner.service"
  "gitea-act-runner-awoooi-open.service"
)

# Runner executables that are replaced by a refusing stub while fail-closed.
RUNNER_FAIL_CLOSED_BINARY_PATHS=(
  "/home/wooo/act-runner/act_runner"
  "/home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard"
  "/home/wooo/act-runner-controlled/act_runner"
  "/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner"
)

# The runtime operator sentinel is the second key for an authorized deployment
# window. A single env var or a stale sentinel alone must not reopen host CI.
if [[ "$START_GITEA_RUNNER_ON_BOOT" == "1" && -e "$RUNNER_ENABLE_SENTINEL" ]]; then
  START_GITEA_RUNNER_ALLOWED=1
fi
#######################################
# Mask a systemd unit by hand: make <unit_dir>/<unit> a symlink to /dev/null.
# Anything already at that path (other than an existing /dev/null symlink) is
# renamed aside with a timestamped ".quarantined-runner-incident-" suffix.
# Every step is best-effort; failures are silenced and never propagate.
# Arguments:
#   $1 - unit name (e.g. foo.service)
#   $2 - directory that holds the unit file
#   $3 - optional user; when given, the symlink itself is chowned to user:user
# Returns: 0
#######################################
mask_runner_unit_file() {
  local unit="$1"
  local unit_dir="$2"
  local owner_user="${3:-}"
  local target="${unit_dir}/${unit}"
  local stamp
  stamp="$(date +%Y%m%d%H%M%S)"

  mkdir -p "$unit_dir" >/dev/null 2>&1 || true

  # Nothing to do when the unit is already a /dev/null symlink.
  if [[ -L "$target" ]]; then
    if [[ "$(readlink "$target" 2>/dev/null || true)" == "/dev/null" ]]; then
      return 0
    fi
  fi

  # Move a real unit file (or a symlink pointing elsewhere, dangling or not)
  # out of the way; the immutable flag is cleared first so the rename can work.
  if [[ -e "$target" || -L "$target" ]]; then
    chattr -i "$target" >/dev/null 2>&1 || true
    mv "$target" "${target}.quarantined-runner-incident-${stamp}" >/dev/null 2>&1 || true
  fi

  ln -s /dev/null "$target" >/dev/null 2>&1 || true

  # -h changes the owner of the link itself rather than /dev/null.
  [[ -z "$owner_user" ]] || chown -h "$owner_user:$owner_user" "$target" >/dev/null 2>&1 || true
}
#######################################
# Replace a runner executable with a stub script that refuses to run.
# If the path currently holds an ELF binary it is first renamed aside with a
# timestamped ".quarantined-runner-incident-" suffix, made read-only (0400)
# and flagged immutable. A stub that prints a notice and exits 75 is then
# installed at the original path (root:root, 0755) and flagged immutable.
# Every step is best-effort; failures are silenced and never propagate.
# Arguments:
#   $1 - path of the runner executable to neutralize
# Returns: 0
#######################################
guard_runner_binary_fail_closed() {
  local path="$1"
  local tmp
  local quarantine_stamp
  local quarantined
  local stub
  quarantine_stamp="$(date +%Y%m%d%H%M%S)"
  quarantined="${path}.quarantined-runner-incident-${quarantine_stamp}"
  stub='#!/usr/bin/env bash
set -eu
echo "AWOOOI host runner is fail-closed on 110 after 2026-06-28 pressure incident; migrate or rate-limit before enabling." >&2
exit 75'

  if [ -e "$path" ]; then
    # Drop the immutable flag so the file can be renamed or replaced.
    chattr -i "$path" >/dev/null 2>&1 || true
    # -b omits the file name from the output, so a path that merely contains
    # "elf" is not mistaken for an ELF binary.
    if file -b "$path" 2>/dev/null | grep -qi "ELF"; then
      mv "$path" "$quarantined" >/dev/null 2>&1 || true
      chmod 0400 "$quarantined" >/dev/null 2>&1 || true
      chattr +i "$quarantined" >/dev/null 2>&1 || true
    fi
  fi

  if tmp="$(mktemp 2>/dev/null)" && [ -n "$tmp" ]; then
    printf '%s\n' "$stub" >"$tmp"
    install -o root -g root -m 0755 "$tmp" "$path" >/dev/null 2>&1 || true
    rm -f "$tmp"
  else
    # No temp file available: write the stub in place rather than leaving the
    # path without a fail-closed stub.
    rm -f "$path" >/dev/null 2>&1 || true
    if { printf '%s\n' "$stub" >"$path"; } 2>/dev/null; then
      chmod 0755 "$path" >/dev/null 2>&1 || true
      chown root:root "$path" >/dev/null 2>&1 || true
    fi
  fi
  chattr +i "$path" >/dev/null 2>&1 || true
}
#######################################
# Force every known host runner into a fail-closed state.
# For each unit in RUNNER_FAIL_CLOSED_SERVICES: SIGKILL it, reset its failed
# state, disable it and mask it (via systemctl, then via a hand-made /dev/null
# symlink), at system level and in the wooo user's unit directory. Afterwards
# any process started from a fail-closed binary path is killed and each binary
# is replaced by a refusing stub.
# Every step is best-effort; failures are silenced and never propagate.
# Globals:
#   RUNNER_FAIL_CLOSED_SERVICES (read)
#   RUNNER_FAIL_CLOSED_BINARY_PATHS (read)
#   RUNNER_DIR (read)
# Arguments: none
#######################################
ensure_host_runner_fail_closed() {
  local unit
  local binary
  local binary_pattern
  local wooo_uid

  for unit in "${RUNNER_FAIL_CLOSED_SERVICES[@]}"; do
    systemctl kill --signal=SIGKILL "$unit" >/dev/null 2>&1 || true
    systemctl reset-failed "$unit" >/dev/null 2>&1 || true
    systemctl disable "$unit" >/dev/null 2>&1 || true
    systemctl mask "$unit" >/dev/null 2>&1 || true
    # Always run the manual mask as well: it returns immediately when the
    # /dev/null symlink already exists and covers the case where
    # "systemctl mask" could not create it.
    mask_runner_unit_file "$unit" "/etc/systemd/system"
  done
  systemctl daemon-reload >/dev/null 2>&1 || true

  if wooo_uid="$(id -u wooo 2>/dev/null)"; then
    mkdir -p /home/wooo/.config/systemd/user >/dev/null 2>&1 || true
    for unit in "${RUNNER_FAIL_CLOSED_SERVICES[@]}"; do
      # The user manager is only reachable when its runtime dir exists.
      if [ -d "/run/user/$wooo_uid" ] && command -v runuser >/dev/null 2>&1; then
        runuser -u wooo -- env XDG_RUNTIME_DIR="/run/user/$wooo_uid" systemctl --user kill --signal=SIGKILL "$unit" >/dev/null 2>&1 || true
      fi
      mask_runner_unit_file "$unit" "/home/wooo/.config/systemd/user" "wooo"
    done
    # Mirror the system-level daemon-reload so the user manager picks up the
    # newly masked unit files.
    if [ -d "/run/user/$wooo_uid" ] && command -v runuser >/dev/null 2>&1; then
      runuser -u wooo -- env XDG_RUNTIME_DIR="/run/user/$wooo_uid" systemctl --user daemon-reload >/dev/null 2>&1 || true
    fi
  fi

  pkill -KILL -f "^${RUNNER_DIR}/act_runner(\\.real-[^ ]*)? daemon" >/dev/null 2>&1 || true
  for binary in "${RUNNER_FAIL_CLOSED_BINARY_PATHS[@]}"; do
    # Kill any process whose command line starts with this exact binary path,
    # so every guarded runner (not only the one under RUNNER_DIR) is stopped
    # before its binary is swapped. Regex metacharacters in the path are
    # escaped because pkill -f takes an extended regular expression.
    binary_pattern="$(printf '%s' "$binary" | sed 's/[][(){}.*+?^$|\\]/\\&/g')"
    pkill -KILL -f "^${binary_pattern}( |\$)" >/dev/null 2>&1 || true
    guard_runner_binary_fail_closed "$binary"
  done
}
if [ -x "$RUNNER_DIR/act_runner" ] && [ -f "$RUNNER_DIR/config.yaml" ]; then
# 若舊的 .runner 配置指向過期 hostname只有在明確允許啟動 runner
# 時才清除重新註冊;預設降壓模式不得碰 registration 狀態。
@@ -271,9 +365,7 @@ PY
else
log "⏸️ Gitea host runner 維持停用;需同時設定 AWOOOI_START_GITEA_RUNNER_ON_BOOT=1 與建立 $RUNNER_ENABLE_SENTINEL 才允許 startup 啟動"
fi
systemctl disable --now "$RUNNER_SERVICE" >/dev/null 2>&1 || true
systemctl kill -s SIGKILL "$RUNNER_SERVICE" >/dev/null 2>&1 || true
pkill -KILL -f "$RUNNER_DIR/act_runner daemon" >/dev/null 2>&1 || true
ensure_host_runner_fail_closed
fi
# 已停用 Docker-wrapped runner避免它搶走 host label job。

View File

@@ -286,6 +286,20 @@ echo "ACTION_RUNNER_ENABLED_COUNT $(systemctl list-unit-files "actions.runner.*"
for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /"
done
for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
load=$(systemctl show "$u" -p LoadState --value 2>/dev/null || true)
unitfile=$(systemctl show "$u" -p UnitFileState --value 2>/dev/null || true)
active=$(systemctl show "$u" -p ActiveState --value 2>/dev/null || true)
mainpid=$(systemctl show "$u" -p MainPID --value 2>/dev/null || true)
echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active mainpid=$mainpid"
done
direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ")
echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count"
for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
kind=$(file -b "$p" 2>/dev/null || echo missing)
echo "RUNNER_FAILCLOSED_BINARY $p kind=$kind"
echo "$kind" | grep -qi "ELF" && echo "RUNNER_FAILCLOSED_BINARY_ELF $p"
done
docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120
' 2>&1); then
fail "ssh 110 read-only check"
@@ -309,6 +323,13 @@ docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120
else
warn "runner watchdog state not confirmed"
fi
if awk '$1 == "RUNNER_FAILCLOSED_UNIT" && ($3 != "load=masked" || $4 != "unitfile=masked") {bad=1} END {exit bad}' <<<"$out"; then
ok "110 direct/Gitea runner fail-closed units are masked"
else
fail "110 direct/Gitea runner fail-closed units are not all masked"
fi
grep -q "RUNNER_DIRECT_PROCESS_COUNT 0" <<<"$out" && ok "110 direct runner process count is zero" || fail "110 direct runner process detected"
grep -q "RUNNER_FAILCLOSED_BINARY_ELF" <<<"$out" && fail "110 runner fail-closed binary path restored to ELF" || ok "110 runner binary paths are fail-closed stubs or missing"
grep -q "sentry-self-hosted-clickhouse-1.*Restarting" <<<"$out" && warn "Sentry ClickHouse restarting" || ok "Sentry ClickHouse not visibly restarting"
}

View File

@@ -304,8 +304,23 @@ awk '
check_runner_guardrails() {
section "runner/CD guardrails"
local out bad
if ! out=$(ssh_cmd "wooo@192.168.0.110" '
if ! out=$(ssh_cmd "wooo@192.168.0.110" '
bad=0
for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
load=$(systemctl show "$u" -p LoadState --value 2>/dev/null || true)
unitfile=$(systemctl show "$u" -p UnitFileState --value 2>/dev/null || true)
active=$(systemctl show "$u" -p ActiveState --value 2>/dev/null || true)
echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active"
[ "$load" = "masked" ] && [ "$unitfile" = "masked" ] && [ "$active" = "inactive" ] || bad=1
done
direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ")
echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count"
[ "$direct_runner_count" = "0" ] || bad=1
for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
kind=$(file -b "$p" 2>/dev/null || echo missing)
echo "RUNNER_FAILCLOSED_BINARY $p kind=$kind"
echo "$kind" | grep -qi "ELF" && bad=1
done
for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
watchdog=$(systemctl show "$u" -p WatchdogUSec --value)
quota=$(systemctl show "$u" -p CPUQuotaPerSecUSec --value)

View File

@@ -535,6 +535,41 @@ if [[ "$RUN_CPU" -eq 1 ]]; then
rm -f "$cpu_tmp"
fi
# Read back the runner fail-closed state from host 110 and grade it.
# The remote script prints one RUNNER_FAILCLOSED_UNIT line per unit, the count
# of direct runner processes, one RUNNER_FAILCLOSED_BINARY line per binary path
# (plus a RUNNER_FAILCLOSED_BINARY_ELF marker when a path holds an ELF file)
# and the exit status of the host pressure gate.
section "110 runner fail-closed guard"
runner_tmp="$(mktemp -t post-start-runner.XXXXXX)"
if ssh_read "wooo@192.168.0.110" '
for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
load=$(systemctl show "$u" -p LoadState --value 2>/dev/null || true)
unitfile=$(systemctl show "$u" -p UnitFileState --value 2>/dev/null || true)
active=$(systemctl show "$u" -p ActiveState --value 2>/dev/null || true)
mainpid=$(systemctl show "$u" -p MainPID --value 2>/dev/null || true)
echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active mainpid=$mainpid"
done
direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ")
echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count"
for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
kind=$(file -b "$p" 2>/dev/null || echo missing)
echo "RUNNER_FAILCLOSED_BINARY $p kind=$kind"
echo "$kind" | grep -qi "ELF" && echo "RUNNER_FAILCLOSED_BINARY_ELF $p"
done
HOST_WEB_BUILD_PRESSURE_ATTEMPTS=1 HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS=0 /usr/local/bin/awoooi-wait-host-web-build-pressure.sh
echo "RUNNER_PRESSURE_GATE_RC $?"
' >"$runner_tmp" 2>&1; then
  ok "110 runner fail-closed readback succeeded"
else
  blocked "110 runner fail-closed readback failed"
fi
cat "$runner_tmp"

# Units: every reported unit must be masked, and at least one unit line must
# be present -- an empty readback must not be graded as "all masked".
if awk '
  $1 == "RUNNER_FAILCLOSED_UNIT" {
    seen++
    if ($3 != "load=masked" || $4 != "unitfile=masked") bad = 1
  }
  END { if (bad || seen == 0) exit 1; exit 0 }
' "$runner_tmp"; then
  ok "110 direct/Gitea runner fail-closed units are masked"
else
  blocked "110 direct/Gitea runner fail-closed units are not all masked"
fi

# if/else instead of "A && ok || blocked", so a non-zero return from ok can
# never trigger the blocked branch as well.
if grep -q "RUNNER_DIRECT_PROCESS_COUNT 0" "$runner_tmp"; then
  ok "110 direct runner process count is zero"
else
  blocked "110 direct runner process detected"
fi

# Binaries: an ELF marker blocks; a pass additionally requires that the
# per-binary lines were actually read back.
if grep -q "RUNNER_FAILCLOSED_BINARY_ELF" "$runner_tmp"; then
  blocked "110 runner fail-closed binary path restored to ELF"
elif grep -q "RUNNER_FAILCLOSED_BINARY " "$runner_tmp"; then
  ok "110 runner binary paths are fail-closed stubs or missing"
else
  blocked "110 runner fail-closed binary readback missing"
fi

if grep -q "RUNNER_PRESSURE_GATE_RC 0" "$runner_tmp"; then
  ok "110 host pressure gate returned 0"
else
  blocked "110 host pressure gate is blocking"
fi
rm -f "$runner_tmp"
section "總結"
printf 'POST_START_QUICK_CHECK PASS=%s WARN=%s BLOCKED=%s\n' "$PASS_COUNT" "$WARN_COUNT" "$BLOCKED_COUNT"
printf 'POST_START_QUICK_CHECK_WARNINGS SERVICE=%s BOUNDARY=%s EVIDENCE=%s\n' "$SERVICE_WARN_COUNT" "$BOUNDARY_WARN_COUNT" "$EVIDENCE_WARN_COUNT"