#!/usr/bin/env bash
set -euo pipefail

# Host pressure gate for CD image builds.
#
# Polls the host up to ATTEMPTS times, SLEEP_SECONDS apart, and exits 0 as soon
# as neither a foreign frontend build nor host pressure is observed. When
# pressure is still present after the last attempt it exits 1, unless
# HOST_WEB_BUILD_PRESSURE_WARN_ONLY=1, in which case it warns and exits 0.
# The gate only reads process listings and Prometheus textfile metrics.
#
# 2026-05-21 Codex: protect the shared 110 host runner from overlapping
# host-side frontend production builds launched by other repositories.
# 2026-06-27 Codex: make the gate enforce real host pressure too. 110 is both a
# production host and a CI host, so CD must not start a new Docker/Next build
# while load, BuildKit, Gitea Actions, or headless smoke pressure is already high.
# This gate never kills, renices, or rewrites another repo's process tree.
# 2026-06-28 Codex: 110 runner pressure is incident-grade; CD remains fail-hard
# on host pressure until runner work is moved or hard-limited.
# 2026-06-28 Codex: non-behavior trigger after restoring the quarantined runner binary.
# 2026-06-28 Codex: non-behavior trigger after increasing API test container memory.
# 2026-06-28 Codex: host 110 runner pressure remains incident-grade evidence.
# Controlled CD keeps the readback and blocks until non-110 readiness is proved.
# 2026-06-28 Codex: cancel-stale-cd trigger for the pre-guard CD run queue.
# 2026-06-28 Codex: controlled-runtime CD trigger after API test OOM 137.
# 2026-06-28 Codex: old fail-closed pressure guard remains fail-hard in CD.
# 2026-06-28 Codex: controlled-runtime diff detection now uses event payload.
# 2026-06-28 Codex: controlled CD retry after opening 110 systemd guard.
# 2026-06-28 Codex: retry after disabling canonical failclosed enforcer.
# 2026-06-28 Codex: retry after disabling failclosed authority timer.
# 2026-06-28 Codex: retry after restoring controlled drain readback on 110.
# 2026-06-28 Codex: retry after disabling failclosed authority cron source.

# ---------------------------------------------------------------------------
# Configuration (every knob can be overridden from the runner environment).
# ---------------------------------------------------------------------------
ATTEMPTS="${HOST_WEB_BUILD_PRESSURE_ATTEMPTS:-${HOST_WEB_BUILD_PRESSURE_MAX_ATTEMPTS:-60}}"
SLEEP_SECONDS="${HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS:-${HOST_WEB_BUILD_PRESSURE_INTERVAL:-10}}"
WARN_ONLY="${HOST_WEB_BUILD_PRESSURE_WARN_ONLY:-0}"
MAX_LOAD5_PER_CORE="${HOST_WEB_BUILD_PRESSURE_MAX_LOAD5_PER_CORE:-0.85}"
MAX_CI_CPU_PERCENT="${HOST_WEB_BUILD_PRESSURE_MAX_CI_CPU_PERCENT:-250}"
# One Gitea Actions task container/process group is the current job itself.
# Block only when there is additional CI/BuildKit pressure unless explicitly
# tightened by the runner environment.
MAX_ACTIVE_CI_PROCESS_GROUPS="${HOST_WEB_BUILD_PRESSURE_MAX_ACTIVE_CI_PROCESS_GROUPS:-1}"
MAX_ACTIVE_CI_CONTAINERS="${HOST_WEB_BUILD_PRESSURE_MAX_ACTIVE_CI_CONTAINERS:-1}"
MAX_ORPHAN_BROWSER_GROUPS="${HOST_WEB_BUILD_PRESSURE_MAX_ORPHAN_BROWSER_GROUPS:-0}"
MAX_POSTGRES_CPU_CORES="${HOST_WEB_BUILD_PRESSURE_MAX_POSTGRES_CPU_CORES:-2.0}"
POSTGRES_CONTAINER_NAME="${HOST_WEB_BUILD_PRESSURE_POSTGRES_CONTAINER:-k3s-postgres-recovery}"
METRICS_FILE="${HOST_RUNAWAY_PROCESS_METRICS_FILE:-${HOST_WEB_BUILD_PRESSURE_METRICS_FILE:-/home/wooo/node_exporter_textfiles/host_runaway_process.prom}}"
DEFAULT_DOCKER_METRICS_FILE="/home/$(id -un)/node_exporter_textfiles/docker_stats.prom"
DOCKER_METRICS_FILE="${HOST_WEB_BUILD_PRESSURE_DOCKER_METRICS_FILE:-$DEFAULT_DOCKER_METRICS_FILE}"
EXPORTER="${HOST_RUNAWAY_PROCESS_EXPORTER:-/home/wooo/scripts/host-runaway-process-exporter.py}"

# A malformed attempt count would make the polling loop run zero times and a
# malformed sleep interval would abort the script mid-loop under `set -e`.
# Fall back to the documented defaults instead, and say so on stderr.
if [[ ! "$ATTEMPTS" =~ ^[0-9]+$ ]]; then
  printf 'warning: invalid attempt count %s; using 60\n' "$ATTEMPTS" >&2
  ATTEMPTS=60
fi
# Force base 10 so a value such as "08" is not read as an invalid octal number.
ATTEMPTS=$((10#$ATTEMPTS))
# Accept what sleep(1) commonly accepts: integer or decimal seconds, with an
# optional s/m/h/d unit suffix.
if [[ ! "$SLEEP_SECONDS" =~ ^([0-9]+([.][0-9]*)?|[.][0-9]+)[smhd]?$ ]]; then
  printf 'warning: invalid sleep interval %s; using 10\n' "$SLEEP_SECONDS" >&2
  SLEEP_SECONDS=10
fi

# awk helper shared by both process scanners. It recognises listing lines that
# belong to inspection tooling (grep, rg, awk, sed, ...) so that the scanners do
# not report the commands that are looking for builds as builds themselves.
AWK_IS_DIAGNOSTIC_COMMAND='
  function is_diagnostic_command(line) {
    return line ~ /(^|[[:space:]])(grep|rg|awk|sed|perl|find|pgrep|pkill)([[:space:]]|$)/ \
      || line ~ /bash -c .*grep/ \
      || line ~ /bash -c .*rg/ \
      || line ~ /bash -c .*awk/
  }
'

#######################################
# Print the ps invocation to use for process listings.
# Prefers the form with --sort=-pcpu and falls back to `ps -axo ...` when the
# local ps rejects that option set.
# Outputs: the command line, as a single line on stdout.
#######################################
default_ps_command() {
  if ps -eo pid=,ppid=,pcpu=,pmem=,args= --sort=-pcpu >/dev/null 2>&1; then
    printf '%s\n' "ps -eo pid=,ppid=,pcpu=,pmem=,args= --sort=-pcpu"
    return
  fi
  printf '%s\n' "ps -axo pid=,ppid=,pcpu=,pmem=,command="
}

# Executed through `bash -c`, so an override must come from a trusted
# environment only.
PS_COMMAND="${HOST_WEB_BUILD_PRESSURE_PS_COMMAND:-$(default_ps_command)}"

#######################################
# List running next/turbo/vite "build" processes that do not belong to this
# repository's own checkouts or to this script.
# Globals:  PS_COMMAND, AWK_IS_DIAGNOSTIC_COMMAND (read)
# Outputs:  matching process listing lines on stdout (possibly none).
#######################################
list_foreign_web_builds() {
  bash -c "$PS_COMMAND" | awk "${AWK_IS_DIAGNOSTIC_COMMAND}"'
    BEGIN { IGNORECASE = 1 }
    /[n]ext[\/[:alnum:]._-]*[[:space:]]+build|[t]urbo[[:space:]]+build|[v]ite[[:space:]]+build/ {
      if (is_diagnostic_command($0)) next
      if ($0 ~ /\/workspace\/wooo\/awoooi/) next
      if ($0 ~ /\/Users\/ogt\/awoooi/) next
      if ($0 ~ /\/private\/tmp\/awoooi/) next
      if ($0 ~ /\/app\/apps\/web/) next
      if ($0 ~ /scripts\/ci\/wait-host-web-build-pressure\.sh/) next
      print
    }
  '
}

#######################################
# Run the metrics exporter, when it is executable, so the textfile metrics are
# regenerated before they are read.
# Globals:  EXPORTER (read); AIOPS_* and NODE_EXPORTER_TEXTFILE_DIR are passed
#           to the exporter with defaults when unset.
# Returns:  always 0. Exporter output and failures are deliberately discarded:
#           the gate falls back to whatever metrics are already on disk.
#######################################
refresh_metrics() {
  if [[ -x "$EXPORTER" ]]; then
    AIOPS_HOST_LABEL="${AIOPS_HOST_LABEL:-110}" \
      NODE_EXPORTER_TEXTFILE_DIR="${NODE_EXPORTER_TEXTFILE_DIR:-/home/wooo/node_exporter_textfiles}" \
      AIOPS_RUNAWAY_PROCESS_MIN_AGE_SECONDS="${AIOPS_RUNAWAY_PROCESS_MIN_AGE_SECONDS:-1800}" \
      AIOPS_RUNAWAY_PROCESS_MIN_CPU_PERCENT="${AIOPS_RUNAWAY_PROCESS_MIN_CPU_PERCENT:-50}" \
      "$EXPORTER" >/dev/null 2>&1 || true
  fi
}

#######################################
# Print the value of the last sample of a metric in METRICS_FILE.
# Arguments: $1 - metric name (matched with or without a label set)
# Outputs:   the last field of the last matching line.
# Returns:   1 when the file is unreadable or no sample matches.
#######################################
metric_value() {
  local name="$1"
  if [[ ! -r "$METRICS_FILE" ]]; then
    return 1
  fi
  awk -v metric="$name" '
    $1 ~ ("^" metric "(\\{|$)") { value = $NF }
    END {
      if (value == "") exit 1
      print value
    }
  ' "$METRICS_FILE"
}

#######################################
# Print the sum over every sample of a metric in METRICS_FILE.
# Arguments: $1 - metric name (matched with or without a label set)
# Outputs:   the sum of the last field of all matching lines.
# Returns:   1 when the file is unreadable or no sample matches.
#######################################
metric_sum() {
  local name="$1"
  if [[ ! -r "$METRICS_FILE" ]]; then
    return 1
  fi
  awk -v metric="$name" '
    $1 ~ ("^" metric "(\\{|$)") {
      sum += $NF
      found = 1
    }
    END {
      if (!found) exit 1
      print sum
    }
  ' "$METRICS_FILE"
}

#######################################
# Print the value of a labelled sample in DOCKER_METRICS_FILE.
# The key="value" pair is matched literally (no regex metacharacters) and must
# start right after "{", "," or a space, so a key cannot match as the tail of a
# longer label name.
# Arguments: $1 - metric name, $2 - label key, $3 - label value
# Outputs:   the last field of the last matching line.
# Returns:   1 when the file is unreadable or no sample matches.
#######################################
docker_metric_labeled_value() {
  local name="$1"
  local label_key="$2"
  local label_value="$3"
  if [[ ! -r "$DOCKER_METRICS_FILE" ]]; then
    return 1
  fi
  awk -v metric="$name" -v key="$label_key" -v val="$label_value" '
    # True when "wanted" occurs in "line" directly after a label delimiter.
    function has_label(line, wanted,    pos, consumed, prev) {
      consumed = 0
      while ((pos = index(substr(line, consumed + 1), wanted)) > 0) {
        pos += consumed
        prev = substr(line, pos - 1, 1)
        if (prev == "{" || prev == "," || prev == " ") return 1
        consumed = pos
      }
      return 0
    }
    BEGIN { needle = key "=\"" val "\"" }
    index($1, metric "{") == 1 && has_label($0, needle) { value = $NF }
    END {
      if (value == "") exit 1
      print value
    }
  ' "$DOCKER_METRICS_FILE"
}

#######################################
# Print the 5-minute load average divided by the number of cores.
# Uses the exported metric when available; otherwise computes the ratio from
# /proc/cpuinfo and /proc/loadavg.
# Returns: non-zero when neither source can be read.
#######################################
load5_per_core() {
  metric_value "awoooi_host_load5_per_core" 2>/dev/null || awk '
    BEGIN {
      cores = 0
      while ((getline line < "/proc/cpuinfo") > 0) {
        if (line ~ /^processor[[:space:]]*:/) cores += 1
      }
      close("/proc/cpuinfo")
      if (cores < 1) cores = 1
      if ((getline loadline < "/proc/loadavg") <= 0) exit 1
      split(loadline, parts, " ")
      printf "%.6f\n", parts[2] / cores
    }
  '
}

#######################################
# Compare two values with awk.
# Arguments: $1 - left operand, $2 - right operand
# Returns:   0 when left > right, 1 otherwise.
#######################################
greater_than() {
  awk -v left="$1" -v right="$2" 'BEGIN { exit !(left > right) }'
}

#######################################
# List running headless-browser / smoke-test processes.
# Globals:  PS_COMMAND, AWK_IS_DIAGNOSTIC_COMMAND (read)
# Outputs:  matching process listing lines on stdout (possibly none).
#######################################
list_headless_smoke_pressure() {
  bash -c "$PS_COMMAND" | awk "${AWK_IS_DIAGNOSTIC_COMMAND}"'
    BEGIN { IGNORECASE = 1 }
    /[c]hrome.*\/tmp\/stockplatform|[s]tockplatform-[[:alnum:]_-]*smoke|[h]eadless=new/ {
      if (is_diagnostic_command($0)) next
      if ($0 ~ /scripts\/ci\/wait-host-web-build-pressure\.sh/) next
      print
    }
  '
}

#######################################
# Build a human-readable report of every pressure signal over its threshold.
# A signal that cannot be read counts as 0, i.e. as "no pressure".
# Globals:  the MAX_* thresholds and POSTGRES_CONTAINER_NAME (read)
# Outputs:  one line per exceeded threshold, plus up to six process lines when
#           headless smoke processes are running; nothing when the host is calm.
#######################################
pressure_report() {
  local report=""
  local load_ratio active_ci_cpu active_ci_groups active_ci_containers orphan_groups postgres_cpu_cores

  load_ratio="$(load5_per_core 2>/dev/null || echo 0)"
  active_ci_cpu="$(metric_value "awoooi_host_gitea_actions_active_process_cpu_percent" 2>/dev/null || echo 0)"
  active_ci_groups="$(metric_value "awoooi_host_gitea_actions_active_process_group_count" 2>/dev/null || echo 0)"
  active_ci_containers="$(metric_value "awoooi_host_gitea_actions_active_container_count" 2>/dev/null || echo 0)"
  orphan_groups="$(
    metric_sum "awoooi_host_runaway_browser_orphan_group_count" 2>/dev/null \
      || metric_sum "awoooi_host_orphan_browser_group_count" 2>/dev/null \
      || echo 0
  )"
  postgres_cpu_cores="$(
    docker_metric_labeled_value "docker_container_cpu_cores" "container_name" "$POSTGRES_CONTAINER_NAME" 2>/dev/null \
      || echo 0
  )"

  if greater_than "$load_ratio" "$MAX_LOAD5_PER_CORE"; then
    report="${report}host load5/core ${load_ratio} > ${MAX_LOAD5_PER_CORE}"$'\n'
  fi
  if greater_than "$active_ci_cpu" "$MAX_CI_CPU_PERCENT"; then
    report="${report}active CI/BuildKit CPU ${active_ci_cpu}% > ${MAX_CI_CPU_PERCENT}%"$'\n'
  fi
  if greater_than "$active_ci_groups" "$MAX_ACTIVE_CI_PROCESS_GROUPS"; then
    report="${report}active CI/BuildKit process groups ${active_ci_groups} > ${MAX_ACTIVE_CI_PROCESS_GROUPS}"$'\n'
  fi
  if greater_than "$active_ci_containers" "$MAX_ACTIVE_CI_CONTAINERS"; then
    report="${report}active Gitea Actions containers ${active_ci_containers} > ${MAX_ACTIVE_CI_CONTAINERS}"$'\n'
  fi
  if greater_than "$orphan_groups" "$MAX_ORPHAN_BROWSER_GROUPS"; then
    report="${report}orphan browser/smoke groups ${orphan_groups} > ${MAX_ORPHAN_BROWSER_GROUPS}"$'\n'
  fi
  if greater_than "$postgres_cpu_cores" "$MAX_POSTGRES_CPU_CORES"; then
    report="${report}${POSTGRES_CONTAINER_NAME} CPU cores ${postgres_cpu_cores} > ${MAX_POSTGRES_CPU_CORES}"$'\n'
  fi

  local smoke_pressure
  smoke_pressure="$(list_headless_smoke_pressure || true)"
  if [[ -n "$smoke_pressure" ]]; then
    # sed reads its whole input, so the writer cannot be killed by SIGPIPE the
    # way it can when `head` closes the pipe early under pipefail.
    report="${report}active headless smoke pressure detected"$'\n'"$(printf '%s\n' "$smoke_pressure" | sed -n '1,6p')"$'\n'
  fi

  printf '%s' "$report"
}

#######################################
# Poll until the host is calm or the attempt budget is spent.
# Globals:  ATTEMPTS, SLEEP_SECONDS, WARN_ONLY (read)
# Outputs:  progress and verdict lines on stdout.
# Exits:    0 when no pressure is seen, or when WARN_ONLY is "1";
#           1 when pressure is still present after the last attempt.
#######################################
main() {
  local attempt active_builds host_pressure

  for ((attempt = 1; attempt <= ATTEMPTS; attempt++)); do
    refresh_metrics
    active_builds="$(list_foreign_web_builds || true)"
    host_pressure="$(pressure_report || true)"

    if [[ -z "$active_builds" && -z "$host_pressure" ]]; then
      echo "✅ no host web/build/smoke pressure detected"
      exit 0
    fi

    echo "⏳ host web/build/smoke pressure detected (attempt ${attempt}/${ATTEMPTS}); waiting ${SLEEP_SECONDS}s"
    if [[ -n "$host_pressure" ]]; then
      printf '%s\n' "$host_pressure" | sed -n '1,12p'
    fi
    if [[ -n "$active_builds" ]]; then
      # Same SIGPIPE-safe truncation as the pressure report above: a pipeline
      # failing here would abort the gate under `set -euo pipefail`.
      printf '%s\n' "$active_builds" | sed -n '1,8p'
    fi

    # No point sleeping after the final check.
    if ((attempt < ATTEMPTS)); then
      sleep "$SLEEP_SECONDS"
    fi
  done

  echo "⚠️ host web/build/smoke pressure still active after ${ATTEMPTS} checks"
  if [[ "$WARN_ONLY" == "1" ]]; then
    echo "⚠️ continuing to avoid a stuck deploy; see ops/runner/README.md for the runner isolation plan"
    exit 0
  fi

  echo "❌ refusing to start AWOOI image build while host web/build/smoke pressure is still active"
  exit 1
}

main "$@"