Files
awoooi/scripts/ci/wait-host-web-build-pressure.sh
Your Name bdccd29d2d
Some checks failed
CD Pipeline / tests (push) Failing after 42s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Successful in 14s
Ansible / Reboot Recovery Contract / validate (push) Has been cancelled
fix(ci): allow baseline host pressure gate action
2026-06-27 20:07:47 +08:00

187 lines
6.8 KiB
Bash
Executable File

#!/usr/bin/env bash
set -o errexit -o nounset -o pipefail

# Host pressure gate for the shared 110 runner.
#
# 2026-05-21 Codex: keep overlapping host-side frontend production builds that
# other repositories launch from piling onto the shared runner.
# 2026-06-27 Codex: also enforce real host pressure. 110 serves as production
# host and CI host at once, so CD must hold off on a new Docker/Next build
# while load, BuildKit, Gitea Actions, or headless smoke pressure is high.
#
# The gate only observes: it never kills, renices, or rewrites the process
# tree of another repository.

# Polling budget. The *_MAX_ATTEMPTS / *_INTERVAL names are fallbacks that are
# consulted only when the primary names are unset or empty.
ATTEMPTS="${HOST_WEB_BUILD_PRESSURE_MAX_ATTEMPTS:-60}"
ATTEMPTS="${HOST_WEB_BUILD_PRESSURE_ATTEMPTS:-$ATTEMPTS}"
SLEEP_SECONDS="${HOST_WEB_BUILD_PRESSURE_INTERVAL:-10}"
SLEEP_SECONDS="${HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS:-$SLEEP_SECONDS}"

# "1" turns an exhausted budget into a warning instead of a failure.
WARN_ONLY="${HOST_WEB_BUILD_PRESSURE_WARN_ONLY:-0}"

# Thresholds; a reading strictly above its limit counts as pressure.
MAX_LOAD5_PER_CORE="${HOST_WEB_BUILD_PRESSURE_MAX_LOAD5_PER_CORE:-0.85}"
MAX_CI_CPU_PERCENT="${HOST_WEB_BUILD_PRESSURE_MAX_CI_CPU_PERCENT:-250}"

# The current job itself accounts for one Gitea Actions task container/process
# group, so the defaults of 1 only block on *additional* CI/BuildKit pressure.
# The runner environment may tighten these explicitly.
MAX_ACTIVE_CI_PROCESS_GROUPS="${HOST_WEB_BUILD_PRESSURE_MAX_ACTIVE_CI_PROCESS_GROUPS:-1}"
MAX_ACTIVE_CI_CONTAINERS="${HOST_WEB_BUILD_PRESSURE_MAX_ACTIVE_CI_CONTAINERS:-1}"
MAX_ORPHAN_BROWSER_GROUPS="${HOST_WEB_BUILD_PRESSURE_MAX_ORPHAN_BROWSER_GROUPS:-0}"

# Metrics text file to read (HOST_RUNAWAY_PROCESS_METRICS_FILE takes precedence
# over HOST_WEB_BUILD_PRESSURE_METRICS_FILE) and the exporter that refreshes it.
METRICS_FILE="${HOST_WEB_BUILD_PRESSURE_METRICS_FILE:-/home/wooo/node_exporter_textfiles/host_runaway_process.prom}"
METRICS_FILE="${HOST_RUNAWAY_PROCESS_METRICS_FILE:-$METRICS_FILE}"
EXPORTER="${HOST_RUNAWAY_PROCESS_EXPORTER:-/home/wooo/scripts/host-runaway-process-exporter.py}"
# Print the process-listing command line to use on this host.
# Outputs: the CPU-sorted `ps -eo … --sort=-pcpu` form when the local ps
#          accepts it, otherwise the unsorted `ps -axo …` form.
default_ps_command() {
  local sorted_form="ps -eo pid=,ppid=,pcpu=,pmem=,args= --sort=-pcpu"
  local plain_form="ps -axo pid=,ppid=,pcpu=,pmem=,command="

  # Probe by actually running the sorted form; only its exit status matters.
  if ! ps -eo pid=,ppid=,pcpu=,pmem=,args= --sort=-pcpu >/dev/null 2>&1; then
    printf '%s\n' "$plain_form"
    return
  fi
  printf '%s\n' "$sorted_form"
}
# Command line the process scanners run via `bash -c`. An explicit
# HOST_WEB_BUILD_PRESSURE_PS_COMMAND wins; otherwise the probed default is used.
PS_COMMAND="${HOST_WEB_BUILD_PRESSURE_PS_COMMAND:-$(default_ps_command)}"
# Print process-list lines that look like a frontend production build
# (Next / Turbo / Vite) and do not belong to this project.
# Globals: PS_COMMAND (read) - executed through `bash -c`.
# Outputs: matching lines from the process listing, unchanged.
#
# Lines mentioning one of the known AWOOI checkout/app paths, or this gate
# script itself, are dropped. The awk program is kept free of comments on
# purpose: its text is part of the awk process's own command line.
list_foreign_web_builds() {
  local build_filter='
    BEGIN { IGNORECASE = 1 }
    !/[n]ext[\/[:alnum:]._-]*[[:space:]]+build|[t]urbo[[:space:]]+build|[v]ite[[:space:]]+build/ { next }
    /\/workspace\/wooo\/awoooi/ { next }
    /\/Users\/ogt\/awoooi/ { next }
    /\/private\/tmp\/awoooi/ { next }
    /\/app\/apps\/web/ { next }
    /scripts\/ci\/wait-host-web-build-pressure\.sh/ { next }
    { print }
  '
  bash -c "$PS_COMMAND" | awk "$build_filter"
}
# Best-effort refresh of the metrics text file by running the exporter.
# Globals: EXPORTER (read); AIOPS_HOST_LABEL, NODE_EXPORTER_TEXTFILE_DIR,
#          AIOPS_RUNAWAY_PROCESS_MIN_AGE_SECONDS and
#          AIOPS_RUNAWAY_PROCESS_MIN_CPU_PERCENT (read; defaults applied for
#          the exporter invocation only).
# Returns: always 0 - a missing, non-executable, or failing exporter is ignored
#          and its output is discarded.
refresh_metrics() {
  [ -x "$EXPORTER" ] || return 0

  local host_label="${AIOPS_HOST_LABEL:-110}"
  local textfile_dir="${NODE_EXPORTER_TEXTFILE_DIR:-/home/wooo/node_exporter_textfiles}"
  local min_age="${AIOPS_RUNAWAY_PROCESS_MIN_AGE_SECONDS:-1800}"
  local min_cpu="${AIOPS_RUNAWAY_PROCESS_MIN_CPU_PERCENT:-50}"

  AIOPS_HOST_LABEL="$host_label" \
    NODE_EXPORTER_TEXTFILE_DIR="$textfile_dir" \
    AIOPS_RUNAWAY_PROCESS_MIN_AGE_SECONDS="$min_age" \
    AIOPS_RUNAWAY_PROCESS_MIN_CPU_PERCENT="$min_cpu" \
    "$EXPORTER" >/dev/null 2>&1 || true
}
# Print the value of a metric from the metrics text file.
# Globals:   METRICS_FILE (read)
# Arguments: $1 - exact metric name (a label set in braces may follow it)
# Outputs:   the last field of the LAST matching sample line
# Returns:   non-zero when the file is unreadable or no sample matches
metric_value() {
  local name="$1"
  [ -r "$METRICS_FILE" ] || return 1

  awk -v metric="$name" '
    $1 ~ ("^" metric "(\\{|$)") { latest = $NF; seen = 1 }
    END {
      if (!seen) exit 1
      print latest
    }
  ' "$METRICS_FILE"
}
# Print the sum of a metric across every matching sample line.
# Globals:   METRICS_FILE (read)
# Arguments: $1 - exact metric name (a label set in braces may follow it)
# Outputs:   the total of the last field over all matching lines
# Returns:   non-zero when the file is unreadable or no sample matches
metric_sum() {
  local name="$1"
  [ -r "$METRICS_FILE" ] || return 1

  awk -v metric="$name" '
    $1 ~ ("^" metric "(\\{|$)") { total += $NF; hits++ }
    END {
      if (!hits) exit 1
      print total
    }
  ' "$METRICS_FILE"
}
# Print the 5-minute load average divided by the number of CPU cores.
# Outputs: the exported awoooi_host_load5_per_core value when metric_value
#          can supply it; otherwise a figure computed from /proc, printed
#          with six decimals.
# Returns: non-zero when neither source is available.
load5_per_core() {
  if metric_value "awoooi_host_load5_per_core" 2>/dev/null; then
    return 0
  fi

  # Fallback: count "processor" entries in /proc/cpuinfo (at least 1) and
  # divide the second field of /proc/loadavg by that count.
  awk '
    BEGIN {
      cpuinfo = "/proc/cpuinfo"
      loadavg = "/proc/loadavg"
      ncpu = 0
      while ((getline row < cpuinfo) > 0)
        if (row ~ /^processor[[:space:]]*:/) ncpu++
      close(cpuinfo)
      if (ncpu < 1) ncpu = 1
      if ((getline averages < loadavg) <= 0) exit 1
      split(averages, field, " ")
      printf "%.6f\n", field[2] / ncpu
    }
  '
}
# Numeric "strictly greater than" test that also works for decimals.
# Arguments: $1 - left operand, $2 - right operand
# Returns:   0 when $1 > $2 numerically, 1 otherwise
#
# Both operands are coerced with "+ 0". Without that, awk compares
# numerically only when BOTH values look like numbers; a single stray token
# (a threshold written as "250%", a "NaN" sample, an empty value) would
# silently switch the test to lexicographic string comparison, where e.g.
# "9" sorts above "10%".
greater_than() {
  awk -v left="$1" -v right="$2" 'BEGIN { exit !((left + 0) > (right + 0)) }'
}
# Print process-list lines that indicate a headless browser / smoke run.
# Globals: PS_COMMAND (read) - executed through `bash -c`.
# Outputs: matching lines from the process listing, unchanged; lines that
#          mention this gate script are skipped.
#
# The awk program is kept free of comments on purpose: its text is part of
# the awk process's own command line.
list_headless_smoke_pressure() {
  local smoke_filter='
    BEGIN { IGNORECASE = 1 }
    /scripts\/ci\/wait-host-web-build-pressure\.sh/ { next }
    /[c]hrome.*\/tmp\/stockplatform|[s]tockplatform-[[:alnum:]_-]*smoke|[h]eadless=new/ { print }
  '
  bash -c "$PS_COMMAND" | awk "$smoke_filter"
}
# Describe every host-pressure signal that is currently over its limit.
# Globals: MAX_LOAD5_PER_CORE, MAX_CI_CPU_PERCENT,
#          MAX_ACTIVE_CI_PROCESS_GROUPS, MAX_ACTIVE_CI_CONTAINERS,
#          MAX_ORPHAN_BROWSER_GROUPS (read)
# Outputs: one line per exceeded threshold, plus a heading and up to six
#          process lines when headless smoke activity is seen; nothing at
#          all when the host looks quiet.
#
# A reading that cannot be obtained is treated as 0, i.e. "no pressure".
pressure_report() {
  local load5 ci_cpu ci_groups ci_containers orphans smoke
  local findings=()

  load5="$(load5_per_core 2>/dev/null || echo 0)"
  ci_cpu="$(metric_value "awoooi_host_gitea_actions_active_process_cpu_percent" 2>/dev/null || echo 0)"
  ci_groups="$(metric_value "awoooi_host_gitea_actions_active_process_group_count" 2>/dev/null || echo 0)"
  ci_containers="$(metric_value "awoooi_host_gitea_actions_active_container_count" 2>/dev/null || echo 0)"
  # Two metric names are tried for the orphan count, in this order.
  orphans="$(
    if ! metric_sum "awoooi_host_runaway_browser_orphan_group_count" 2>/dev/null; then
      metric_sum "awoooi_host_orphan_browser_group_count" 2>/dev/null || echo 0
    fi
  )"

  if greater_than "$load5" "$MAX_LOAD5_PER_CORE"; then
    findings+=("host load5/core ${load5} > ${MAX_LOAD5_PER_CORE}")
  fi
  if greater_than "$ci_cpu" "$MAX_CI_CPU_PERCENT"; then
    findings+=("active CI/BuildKit CPU ${ci_cpu}% > ${MAX_CI_CPU_PERCENT}%")
  fi
  if greater_than "$ci_groups" "$MAX_ACTIVE_CI_PROCESS_GROUPS"; then
    findings+=("active CI/BuildKit process groups ${ci_groups} > ${MAX_ACTIVE_CI_PROCESS_GROUPS}")
  fi
  if greater_than "$ci_containers" "$MAX_ACTIVE_CI_CONTAINERS"; then
    findings+=("active Gitea Actions containers ${ci_containers} > ${MAX_ACTIVE_CI_CONTAINERS}")
  fi
  if greater_than "$orphans" "$MAX_ORPHAN_BROWSER_GROUPS"; then
    findings+=("orphan browser/smoke groups ${orphans} > ${MAX_ORPHAN_BROWSER_GROUPS}")
  fi

  smoke="$(list_headless_smoke_pressure || true)"
  if [ -n "$smoke" ]; then
    findings+=("active headless smoke pressure detected")
    findings+=("$(printf '%s\n' "$smoke" | head -n 6)")
  fi

  # Guard the expansion: an empty array must produce no output at all.
  if [ "${#findings[@]}" -gt 0 ]; then
    printf '%s\n' "${findings[@]}"
  fi
}
# Main gate loop: poll until the host is quiet or the attempt budget is spent.
#
# Exit status: 0 when no pressure is seen, or when the budget runs out and
# WARN_ONLY is "1"; 1 when pressure is still present after the last check.

# Sanitise the attempt budget before counting with shell arithmetic. A value
# that is not a plain non-negative integer falls back to 60 so the gate still
# performs real checks instead of failing without ever looking at the host.
# "10#" keeps a zero-padded value such as "010" from being read as octal.
if [[ "$ATTEMPTS" =~ ^[0-9]+$ ]]; then
  ATTEMPTS="$((10#$ATTEMPTS))"
else
  echo "⚠️ invalid attempt count '${ATTEMPTS}'; falling back to 60" >&2
  ATTEMPTS=60
fi

# A C-style loop replaces `seq`: no external process, and ATTEMPTS=0 yields
# zero iterations everywhere (BSD seq counts DOWN for `seq 1 0`).
for ((attempt = 1; attempt <= ATTEMPTS; attempt++)); do
  refresh_metrics
  # Scanner failures are tolerated on purpose: an empty result means "quiet".
  active_builds="$(list_foreign_web_builds || true)"
  host_pressure="$(pressure_report || true)"
  if [ -z "$active_builds" ] && [ -z "$host_pressure" ]; then
    echo "✅ no host web/build/smoke pressure detected"
    exit 0
  fi
  echo "⏳ host web/build/smoke pressure detected (attempt ${attempt}/${ATTEMPTS}); waiting ${SLEEP_SECONDS}s"
  if [ -n "$host_pressure" ]; then
    printf '%s\n' "$host_pressure" | sed -n '1,12p'
  fi
  if [ -n "$active_builds" ]; then
    # sed drains its whole input, unlike `head`, so a long listing cannot
    # SIGPIPE the writer and abort the script under `set -e -o pipefail`.
    printf '%s\n' "$active_builds" | sed -n '1,8p'
  fi
  sleep "$SLEEP_SECONDS"
done

echo "⚠️ host web/build/smoke pressure still active after ${ATTEMPTS} checks"
if [ "$WARN_ONLY" = "1" ]; then
  echo "⚠️ continuing to avoid a stuck deploy; see ops/runner/README.md for the runner isolation plan"
  exit 0
fi
echo "❌ refusing to start AWOOI image build while host web/build/smoke pressure is still active"
exit 1