Files
awoooi/scripts/reboot-recovery/p3-controlled-release-gate.sh
Your Name cfb866d055
Some checks failed
Ansible Lint / lint (push) Successful in 35s
CD Pipeline / tests (push) Failing after 13s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Failing after 11s
feat(governance): add agent market automation surfaces
2026-06-04 21:50:55 +08:00

425 lines
16 KiB
Bash
Executable File

#!/usr/bin/env bash
# AWOOOI P3 controlled release gate.
# Read-only: this script never starts, stops, restarts, deletes, or modifies services.
#
# Note: `-e` is not enabled; the checks below inspect exit codes themselves and
# record failures through blocked()/warn() so that every gate gets evaluated.
set -uo pipefail

# Repository root: two directories above this script. Later steps use paths
# relative to it (scripts/reboot-recovery/...), so abort when it cannot be
# resolved or entered rather than continuing from an unrelated directory.
if ! ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" || [ -z "$ROOT_DIR" ]; then
  echo "Cannot resolve repository root from ${BASH_SOURCE[0]}" >&2
  exit 1
fi
cd "$ROOT_DIR" || {
  echo "Cannot enter repository root: $ROOT_DIR" >&2
  exit 1
}

# ssh options shared by all remote probes. SSH_BATCH_MODE can be overridden
# from the environment; the connect timeout is fixed at 6 seconds.
SSH_BATCH_MODE=${SSH_BATCH_MODE:-yes}
SSH_OPTS=(-o BatchMode="$SSH_BATCH_MODE" -o ConnectTimeout=6)

# Flags toggled by the command-line parser further down.
NO_COLOR=0
SKIP_COLD_START_GATE=0

# Gate thresholds; each one may be overridden from the environment.
LOAD5_PER_CORE_LIMIT="${LOAD5_PER_CORE_LIMIT:-1.0}"
LOAD15_PER_CORE_LIMIT="${LOAD15_PER_CORE_LIMIT:-1.0}"
JOB_CONTAINER_CPU_LIMIT="${JOB_CONTAINER_CPU_LIMIT:-1.0}"
TEXTFILE_MAX_AGE_SECONDS="${TEXTFILE_MAX_AGE_SECONDS:-300}"
# Print the command-line help (options and environment overrides) to stdout.
usage() {
  printf '%s\n' \
    'Usage: bash scripts/reboot-recovery/p3-controlled-release-gate.sh [options]' \
    'Options:' \
    '--skip-cold-start-gate Do not run the full P0/P1/P2 read-only gate first.' \
    '--no-color Disable ANSI colors.' \
    '-h, --help Show this help.' \
    'Environment overrides:' \
    'LOAD5_PER_CORE_LIMIT=1.0' \
    'LOAD15_PER_CORE_LIMIT=1.0' \
    'JOB_CONTAINER_CPU_LIMIT=1.0' \
    'TEXTFILE_MAX_AGE_SECONDS=300'
}
# Command-line parsing: consume one argument per iteration until none remain.
# Help exits 0; any unrecognised argument prints usage to stderr and exits 64.
until [ "$#" -eq 0 ]; do
  case "$1" in
    -h | --help)
      usage
      exit 0
      ;;
    --no-color)
      NO_COLOR=1
      ;;
    --skip-cold-start-gate)
      SKIP_COLD_START_GATE=1
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 64
      ;;
  esac
  shift
done
# ANSI palette used by the reporting helpers; every entry is empty when
# NO_COLOR is "1" so output stays plain text.
case "$NO_COLOR" in
  1)
    RED=""
    GREEN=""
    YELLOW=""
    BLUE=""
    NC=""
    ;;
  *)
    RED=$'\033[0;31m'
    GREEN=$'\033[0;32m'
    YELLOW=$'\033[1;33m'
    BLUE=$'\033[0;34m'
    NC=$'\033[0m'
    ;;
esac

# Result counters incremented by ok(), warn() and blocked().
PASS=0
WARN=0
FAIL=0
# Print a section banner: a blank line followed by "=== <title> ===" in BLUE.
# Arguments: $1 - section title
section() {
  local title="$1"
  printf '\n%s\n' "${BLUE}=== ${title} ===${NC}"
}
# Record a passing check: bump PASS and print "OK <message>" in GREEN.
# Arguments: $* - message words, joined into one string
ok() {
  local message="$*"
  PASS=$((PASS + 1))
  printf '%s\n' "${GREEN}OK${NC} ${message}"
}
# Record a non-blocking finding: bump WARN and print "WARN <message>" in YELLOW.
# Arguments: $* - message words, joined into one string
warn() {
  local message="$*"
  WARN=$((WARN + 1))
  printf '%s\n' "${YELLOW}WARN${NC} ${message}"
}
# Record a blocking finding: bump FAIL and print "BLOCKED <message>" in RED.
# Arguments: $* - message words, joined into one string
blocked() {
  local message="$*"
  FAIL=$((FAIL + 1))
  printf '%s\n' "${RED}BLOCKED${NC} ${message}"
}
# Run a command string on a remote host through ssh with the shared SSH_OPTS.
# Arguments: $1 - ssh target (user@host), $2 - command string to run remotely
# Returns:   the exit status of ssh
ssh_cmd() {
  local -r remote_host="$1" remote_script="$2"
  ssh "${SSH_OPTS[@]}" "$remote_host" "$remote_script"
}
# Numeric "less than or equal" for decimal values.
# Arguments: $1 - left operand, $2 - right operand
# Returns:   0 when $1 <= $2; 1 when $1 > $2; 2 when either operand is not a
#            plain decimal number.
# An empty or non-numeric operand used to make awk fall back to string
# comparison, where "" <= "1.0" is true, so an unparsed measurement passed the
# gate. Operands are validated first so such input fails closed.
float_le() {
  awk -v a="${1:-}" -v b="${2:-}" '
    function is_number(v) {
      return v ~ /^[+-]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][+-]?[0-9]+)?$/
    }
    BEGIN {
      if (!is_number(a) || !is_number(b)) exit 2
      exit !((a + 0) <= (b + 0))
    }'
}
# Run the P0/P1/P2 cold-start check script once and classify its result.
# Globals:  SKIP_COLD_START_GATE (read); counters via ok/warn/blocked
# Outputs:  one OK/WARN/BLOCKED line; the child script's combined output is
#           written to /tmp/awoooi-p3-cold-start-gate.log
# Classification:
#   - skipped by option                          -> warn
#   - child exit status 0                        -> ok
#   - non-zero, last summary line has BLOCKED=0  -> warn (degraded)
#   - anything else, including no summary line   -> blocked
check_cold_start_gate() {
  # NOTE(review): fixed, predictable path under /tmp; kept because the BLOCKED
  # message points operators at this exact file.
  local log_file=/tmp/awoooi-p3-cold-start-gate.log
  local rc summary blocked_count=""
  section "P0/P1/P2 cold-start gate"
  if [ "$SKIP_COLD_START_GATE" -eq 1 ]; then
    warn "cold-start gate skipped by operator option"
    return
  fi
  SSH_BATCH_MODE=yes bash scripts/reboot-recovery/full-stack-cold-start-check.sh --monitor-read-only --no-color --watch --interval 1 --max-attempts 1 >"$log_file" 2>&1
  rc=$?
  # Only the last summary line counts.
  summary=$(grep -E '^PASS=[0-9]+ WARN=[0-9]+ BLOCKED=[0-9]+' "$log_file" | tail -1 || true)
  # Take just the digits after BLOCKED=. The summary matcher is not anchored at
  # the end of the line, so trailing text must not leak into the count.
  if [[ "$summary" =~ BLOCKED=([0-9]+) ]]; then
    blocked_count="${BASH_REMATCH[1]}"
  fi
  if [ "$rc" -eq 0 ]; then
    ok "cold-start gate is GREEN"
  elif [ "$blocked_count" = "0" ]; then
    warn "cold-start gate is DEGRADED but not blocked: ${summary:-summary unavailable}"
  else
    # Also reached when no summary line was found (blocked_count stays empty).
    blocked "cold-start gate has blocked items: ${summary:-summary unavailable}; see $log_file"
  fi
}
# Compare a host's 5- and 15-minute load average per core with the limits.
# Arguments: $1 - label used in messages, $2 - ssh target (user@host)
# Globals:   LOAD5_PER_CORE_LIMIT, LOAD15_PER_CORE_LIMIT (read)
# Outputs:   the remote measurement line, then OK/BLOCKED lines
# The remote command reads /proc/loadavg and nproc and prints a single line
# containing "LOAD5_PER_CORE <n>" and "LOAD15_PER_CORE <n>".
check_host_load() {
  local label="$1"
  local target="$2"
  local out load5_per_core load15_per_core
  local number_re='^[0-9]+([.][0-9]+)?$'
  section "$label load gate"
  if ! out=$(ssh_cmd "$target" 'read _ load5 load15 _ < /proc/loadavg; cores=$(nproc); awk -v l5="$load5" -v l15="$load15" -v c="$cores" "BEGIN {printf \"LOAD5 %.4f LOAD15 %.4f CORES %d LOAD5_PER_CORE %.6f LOAD15_PER_CORE %.6f\\n\", l5, l15, c, l5/c, l15/c}"' 2>&1); then
    blocked "$label load check unavailable"
    echo "$out"
    return
  fi
  echo "$out"
  load5_per_core=$(awk '/LOAD5_PER_CORE/ {for (i=1;i<=NF;i++) if ($i=="LOAD5_PER_CORE") print $(i+1)}' <<<"$out")
  load15_per_core=$(awk '/LOAD15_PER_CORE/ {for (i=1;i<=NF;i++) if ($i=="LOAD15_PER_CORE") print $(i+1)}' <<<"$out")
  # Fail closed when the measurement line is missing or garbled. An empty value
  # would otherwise be compared as a string and count as "below the limit".
  if ! [[ "$load5_per_core" =~ $number_re && "$load15_per_core" =~ $number_re ]]; then
    blocked "$label load check output unparsable"
    return
  fi
  if float_le "$load5_per_core" "$LOAD5_PER_CORE_LIMIT"; then
    ok "$label load5/core <= $LOAD5_PER_CORE_LIMIT"
  else
    blocked "$label load5/core too high for P3 release"
  fi
  if float_le "$load15_per_core" "$LOAD15_PER_CORE_LIMIT"; then
    ok "$label load15/core <= $LOAD15_PER_CORE_LIMIT"
  else
    blocked "$label load15/core too high for P3 release"
  fi
}
# Evaluate one host's "<file> <age-or-missing>" listing against age limits.
# Arguments: $1 - host label, $2 - listing text (one "<name> <age>" per line),
#            $3.. - optional "<name>=<max_age>" overrides of the default limit
# Globals:   TEXTFILE_MAX_AGE_SECONDS (read)
_p3_report_textfile_ages() {
  local label="$1" listing="$2"
  shift 2
  local file age max_age override
  local integer_re='^-?[0-9]+$'
  while read -r file age; do
    [ -n "${file:-}" ] || continue
    max_age="$TEXTFILE_MAX_AGE_SECONDS"
    for override in "$@"; do
      if [ "$file" = "${override%%=*}" ]; then
        max_age="${override#*=}"
      fi
    done
    if [ "$age" = "missing" ]; then
      blocked "$label $file missing"
    elif ! [[ "$age" =~ $integer_re ]]; then
      # Listing lines that do not carry an integer age (e.g. stderr noise
      # captured with the output) are reported instead of fed to `[ -le ]`.
      blocked "$label $file age unreadable: ${age:-<empty>}"
    elif [ "$age" -le "$max_age" ]; then
      ok "$label $file fresh age=${age}s"
    else
      blocked "$label $file stale age=${age}s"
    fi
  done <<<"$listing"
}

# Check that the node_exporter textfiles on hosts 110 and 188 exist and were
# modified recently enough.
# Each remote script prints "<basename> <seconds since mtime>" or
# "<basename> missing" for a fixed list of .prom files. The default limit is
# TEXTFILE_MAX_AGE_SECONDS; a few files have their own fixed limit, passed to
# the helper as overrides.
# The remote scripts are kept verbatim inside the quoted strings.
check_textfiles() {
  section "textfile freshness"
  local out
  if out=$(ssh_cmd "wooo@192.168.0.110" '
now=$(date +%s)
for f in /home/wooo/node_exporter_textfiles/docker_stats.prom /home/wooo/node_exporter_textfiles/systemd_units.prom /home/wooo/node_exporter_textfiles/storage_health.prom /home/wooo/node_exporter_textfiles/backup_health.prom /home/wooo/node_exporter_textfiles/cold_start_recovery.prom; do
if [ -f "$f" ]; then
echo "$(basename "$f") $((now - $(stat -c %Y "$f")))"
else
echo "$(basename "$f") missing"
fi
done
' 2>&1); then
    _p3_report_textfile_ages "110" "$out" \
      "cold_start_recovery.prom=900" "backup_health.prom=900"
  else
    blocked "110 textfile freshness check unavailable"
    echo "$out"
  fi
  if out=$(ssh_cmd "ollama@192.168.0.188" '
now=$(date +%s)
for f in /home/ollama/node_exporter_textfiles/docker_stats.prom /home/ollama/node_exporter_textfiles/docker_restart_count.prom /home/ollama/node_exporter_textfiles/storage_health.prom /home/ollama/node_exporter_textfiles/backup.prom /home/ollama/node_exporter_textfiles/backup_health.prom; do
if [ -f "$f" ]; then
echo "$(basename "$f") $((now - $(stat -c %Y "$f")))"
else
echo "$(basename "$f") missing"
fi
done
' 2>&1); then
    _p3_report_textfile_ages "188" "$out" \
      "backup.prom=90000" "backup_health.prom=900"
  else
    blocked "188 textfile freshness check unavailable"
    echo "$out"
  fi
}
# Evaluate backup_health.prom on hosts 110 and 188.
# For each host the remote awk script aggregates the awoooi_backup_* metric
# lines of the textfile into one "BACKUP_HEALTH key=value ..." line, or prints
# "BACKUP_HEALTH missing" when the file does not exist.
# Outputs: "<label> <remote output>", then OK/WARN/BLOCKED lines.
# Stale jobs, missing cron config and missing scripts block on both hosts.
# Integrity staleness (blocked) and failed components (warn) are only
# evaluated for host 110.
# The remote script is kept verbatim inside the quoted string.
check_backup_health() {
  section "backup health gate"
  local spec label target file out report
  local stale missing_cron missing_script failed_count integrity_stale
  for spec in \
    "110|wooo@192.168.0.110|/home/wooo/node_exporter_textfiles/backup_health.prom" \
    "188|ollama@192.168.0.188|/home/ollama/node_exporter_textfiles/backup_health.prom"; do
    # spec layout: label|ssh-target|textfile-path
    label=${spec%%|*}
    target=${spec#*|}
    target=${target%%|*}
    file=${spec##*|}
    if ! out=$(ssh_cmd "$target" "
if [ ! -f '$file' ]; then
echo 'BACKUP_HEALTH missing'
exit 0
fi
awk '
/^awoooi_backup_job_fresh/ {total += 1; stale += (\$2 == 0)}
/^awoooi_backup_job_configured/ {missing_cron += (\$2 == 0)}
/^awoooi_backup_script_present/ {missing_script += (\$2 == 0)}
/^awoooi_backup_last_run_failed_count/ {failed += \$2}
/^awoooi_backup_integrity_fresh/ {integrity_total += 1; integrity_stale += (\$2 == 0)}
END {printf \"BACKUP_HEALTH total=%d stale=%d missing_cron=%d missing_script=%d failed_count=%d integrity_total=%d integrity_stale=%d\\n\", total, stale, missing_cron, missing_script, failed, integrity_total, integrity_stale}
' '$file'
" 2>&1); then
      blocked "$label backup health check unavailable"
      echo "$out"
      continue
    fi
    echo "$label $out"
    if grep -q "BACKUP_HEALTH missing" <<<"$out"; then
      blocked "$label backup_health.prom missing"
      continue
    fi
    # Parse only the aggregate line. Without it every field would read as 0
    # and the gate would pass on no data, so treat that case as blocked.
    report=$(grep -m1 '^BACKUP_HEALTH ' <<<"$out" || true)
    if [ -z "$report" ]; then
      blocked "$label backup health output unparsable"
      continue
    fi
    # NOTE(review): the remote total= and integrity_total= counts are printed
    # but not evaluated here; a file with no matching metrics still passes.
    stale=$(awk -F'stale=' '{split($2,a," "); print a[1]+0}' <<<"$report")
    missing_cron=$(awk -F'missing_cron=' '{split($2,a," "); print a[1]+0}' <<<"$report")
    missing_script=$(awk -F'missing_script=' '{split($2,a," "); print a[1]+0}' <<<"$report")
    failed_count=$(awk -F'failed_count=' '{split($2,a," "); print a[1]+0}' <<<"$report")
    integrity_stale=$(awk -F'integrity_stale=' '{split($2,a," "); print a[1]+0}' <<<"$report")
    # Explicit if/else so exactly one outcome is recorded per check.
    if [ "$stale" -eq 0 ]; then
      ok "$label expected backups are fresh"
    else
      blocked "$label expected backup jobs are stale"
    fi
    if [ "$missing_cron" -eq 0 ]; then
      ok "$label expected backup crons are configured"
    else
      blocked "$label expected backup cron config missing"
    fi
    if [ "$missing_script" -eq 0 ]; then
      ok "$label expected backup scripts are present"
    else
      blocked "$label expected backup scripts missing"
    fi
    if [ "$label" = "110" ]; then
      if [ "$integrity_stale" -eq 0 ]; then
        ok "110 backup integrity and restore drill are fresh"
      else
        blocked "110 backup integrity or restore drill stale"
      fi
      if [ "$failed_count" -eq 0 ]; then
        ok "110 latest aggregate backup had no failed components"
      else
        warn "110 latest aggregate backup still records failed components; rerun backup-all after fixes"
      fi
    fi
  done
}
# Evaluate storage_health.prom on hosts 110 and 188.
# For each host the remote awk script sums the
# awoooi_host_root_filesystem_readonly and awoooi_host_storage_error_count
# metrics (split by boot="current", "previous" and "last-fsck-log") into one
# "STORAGE_HEALTH key=value ..." line, or prints "STORAGE_HEALTH missing" when
# the file does not exist.
# Outputs: "<label> <remote output>", then OK/WARN/BLOCKED lines.
# A read-only root or current-boot errors block; previous-boot and fsck-log
# errors only warn.
# The remote script is kept verbatim inside the quoted string.
check_storage_health() {
  section "storage health gate"
  local spec label target file out report
  local root_readonly current_errors previous_errors fsck_errors
  for spec in \
    "110|wooo@192.168.0.110|/home/wooo/node_exporter_textfiles/storage_health.prom" \
    "188|ollama@192.168.0.188|/home/ollama/node_exporter_textfiles/storage_health.prom"; do
    # spec layout: label|ssh-target|textfile-path
    label=${spec%%|*}
    target=${spec#*|}
    target=${target%%|*}
    file=${spec##*|}
    if ! out=$(ssh_cmd "$target" "
if [ ! -f '$file' ]; then
echo 'STORAGE_HEALTH missing'
exit 0
fi
awk '
/^awoooi_host_root_filesystem_readonly/ {root += \$2}
/^awoooi_host_storage_error_count/ && /boot=\"current\"/ {current += \$2}
/^awoooi_host_storage_error_count/ && /boot=\"previous\"/ {previous += \$2}
/^awoooi_host_storage_error_count/ && /boot=\"last-fsck-log\"/ {fsck += \$2}
END {printf \"STORAGE_HEALTH root_readonly=%d current=%d previous=%d fsck=%d\\n\", root, current, previous, fsck}
' '$file'
" 2>&1); then
      blocked "$label storage health check unavailable"
      echo "$out"
      continue
    fi
    echo "$label $out"
    if grep -q "STORAGE_HEALTH missing" <<<"$out"; then
      blocked "$label storage_health.prom missing"
      continue
    fi
    # Parse only the aggregate line. Without it every field would read as 0
    # and the gate would pass on no data, so treat that case as blocked.
    report=$(grep -m1 '^STORAGE_HEALTH ' <<<"$out" || true)
    if [ -z "$report" ]; then
      blocked "$label storage health output unparsable"
      continue
    fi
    root_readonly=$(awk -F'root_readonly=' '{split($2,a," "); print a[1]+0}' <<<"$report")
    current_errors=$(awk -F'current=' '{split($2,a," "); print a[1]+0}' <<<"$report")
    previous_errors=$(awk -F'previous=' '{split($2,a," "); print a[1]+0}' <<<"$report")
    fsck_errors=$(awk -F'fsck=' '{split($2,a," "); print a[1]+0}' <<<"$report")
    # Explicit if/else so exactly one outcome is recorded per check.
    if [ "$root_readonly" -eq 0 ]; then
      ok "$label root filesystem is writable"
    else
      blocked "$label root filesystem is read-only"
    fi
    if [ "$current_errors" -eq 0 ]; then
      ok "$label current boot has no storage error evidence"
    else
      blocked "$label current boot has storage error evidence"
    fi
    if [ "$previous_errors" -eq 0 ]; then
      ok "$label previous boot has no storage error evidence"
    else
      warn "$label previous boot has storage error evidence; keep fsck/backup follow-up open"
    fi
    if [ "$fsck_errors" -eq 0 ]; then
      ok "$label fsck logs have no retained error evidence"
    else
      warn "$label fsck logs retain error evidence; verify offline fsck/backup status"
    fi
  done
}
# Verify resource guardrails on the actions.runner.* systemd units of host 110.
# The remote script lists the matching units, prints each unit's WatchdogUSec,
# CPUQuotaPerSecUSec, MemoryMax and ActiveState, and ends with
# "BAD_RUNNER_GUARDRAILS <0|1>". The flag becomes 1 when any unit has a
# watchdog value other than "0", or a CPU quota / memory limit of "infinity"
# or "0".
# Outputs: the remote report, then one OK or BLOCKED line.
# NOTE(review): when no unit matches, the remote flag stays 0 and the check
# passes; confirm that this is the intended behaviour.
# The remote script is kept verbatim inside the quoted string.
check_runner_guardrails() {
  section "runner/CD guardrails"
  local out
  if ! out=$(ssh_cmd "wooo@192.168.0.110" '
bad=0
for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
watchdog=$(systemctl show "$u" -p WatchdogUSec --value)
quota=$(systemctl show "$u" -p CPUQuotaPerSecUSec --value)
memory=$(systemctl show "$u" -p MemoryMax --value)
state=$(systemctl show "$u" -p ActiveState --value)
echo "$u watchdog=$watchdog quota=$quota memory=$memory state=$state"
[ "$watchdog" = "0" ] || bad=1
[ "$quota" != "infinity" ] && [ "$quota" != "0" ] || bad=1
[ "$memory" != "infinity" ] && [ "$memory" != "0" ] || bad=1
done
echo "BAD_RUNNER_GUARDRAILS $bad"
' 2>&1); then
    blocked "runner guardrail check unavailable"
    echo "$out"
    return
  fi
  echo "$out"
  # Explicit if/else so exactly one outcome is recorded.
  if grep -q "BAD_RUNNER_GUARDRAILS 0" <<<"$out"; then
    ok "all discovered runner units have watchdog disabled and CPU/memory limits"
  else
    blocked "runner guardrails incomplete"
  fi
}
# Check CPU use of running Gitea Actions / CD job containers on host 110.
# The remote script prints "NO_ACTIVE_JOB_CONTAINERS" when no container name
# starts with GITEA-ACTIONS- or awoooi-cd-; otherwise it prints one
# "JOB_CONTAINER <name> cpu_cores=<n>" line per container, where <n> is the
# docker stats CPU percentage divided by 100.
# Globals: JOB_CONTAINER_CPU_LIMIT (read)
# Outputs: the remote report, then one OK or BLOCKED line.
# The remote script is kept verbatim inside the quoted string.
check_job_containers() {
  local report over_limit
  section "active job container CPU"
  if ! report=$(ssh_cmd "wooo@192.168.0.110" '
names=$(docker ps --format "{{.Names}}" | grep -E "^(GITEA-ACTIONS-|awoooi-cd-)" || true)
if [ -z "$names" ]; then
echo "NO_ACTIVE_JOB_CONTAINERS"
exit 0
fi
for name in $names; do
cpu=$(docker stats "$name" --no-stream --format "{{.CPUPerc}}" | tr -d "%" | awk "{printf \"%.6f\", \$1 / 100}")
echo "JOB_CONTAINER $name cpu_cores=$cpu"
done
' 2>&1); then
    blocked "job container CPU check unavailable"
    echo "$report"
    return
  fi
  echo "$report"
  case "$report" in
    *NO_ACTIVE_JOB_CONTAINERS*)
      ok "no active Gitea/CD job containers"
      return
      ;;
  esac
  # Count the containers whose reported value is above the limit.
  over_limit=$(awk -v limit="$JOB_CONTAINER_CPU_LIMIT" -F'cpu_cores=' \
    '/^JOB_CONTAINER / && ($2 + 0) > limit {n++} END {print n + 0}' <<<"$report")
  if [ "$over_limit" -ne 0 ]; then
    blocked "$over_limit active job container(s) exceed ${JOB_CONTAINER_CPU_LIMIT} CPU cores"
  else
    ok "active job containers are below ${JOB_CONTAINER_CPU_LIMIT} CPU cores"
  fi
}
# Record ok when the captured output contains a match, otherwise record the
# given miss outcome.
# Arguments: $1 - captured output, $2 - extended regex to search for,
#            $3 - message for ok, $4 - reporter to call on a miss
#            (warn or blocked), $5 - message for the miss
_p3_expect_service() {
  local haystack="$1" pattern="$2" ok_msg="$3" miss_fn="$4" miss_msg="$5"
  if grep -Eq -- "$pattern" <<<"$haystack"; then
    ok "$ok_msg"
  else
    "$miss_fn" "$miss_msg"
  fi
}

# Check the state of the high-load services on hosts 188 and 110.
# Host 188: ollama systemd state, the Ollama API HTTP status on
# 127.0.0.1:11434, and docker state/health of momo-scheduler, litellm and
# signoz-clickhouse.
# Host 110: docker state/health of the Sentry ClickHouse and Kafka containers,
# plus up to 20 "docker ps" lines for Sentry consumer containers.
# Outputs: each host's remote report, then OK/WARN/BLOCKED lines.
# Every check goes through if/else so exactly one outcome is recorded.
# The remote scripts are kept verbatim inside the quoted strings.
check_high_load_services() {
  section "high-load service health"
  local out
  if out=$(ssh_cmd "ollama@192.168.0.188" '
echo "ollama-systemd $(systemctl is-active ollama 2>/dev/null || true)"
echo "ollama-api $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:11434/api/tags || true)"
docker inspect -f "momo-scheduler {{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{end}}" momo-scheduler 2>/dev/null || true
docker inspect -f "litellm {{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{end}}" litellm 2>/dev/null || true
docker inspect -f "signoz-clickhouse {{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{end}}" signoz-clickhouse 2>/dev/null || true
' 2>&1); then
    echo "$out"
    _p3_expect_service "$out" "ollama-systemd active" \
      "188 Ollama systemd active" blocked "188 Ollama systemd inactive"
    _p3_expect_service "$out" "ollama-api 200" \
      "188 Ollama API reachable" blocked "188 Ollama API not reachable"
    _p3_expect_service "$out" "momo-scheduler running healthy" \
      "188 momo-scheduler healthy" blocked "188 momo-scheduler not healthy"
    # litellm only has to be running; its health status is not required.
    _p3_expect_service "$out" "litellm running( |$)" \
      "188 litellm running" blocked "188 litellm not running"
    # SignOz ClickHouse health is advisory: a miss warns rather than blocks.
    _p3_expect_service "$out" "signoz-clickhouse running healthy" \
      "188 SignOz ClickHouse healthy" warn "188 SignOz ClickHouse health not confirmed"
  else
    blocked "188 high-load service check unavailable"
    echo "$out"
  fi
  if out=$(ssh_cmd "wooo@192.168.0.110" '
docker inspect -f "sentry-clickhouse {{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{end}}" sentry-self-hosted-clickhouse-1 2>/dev/null || true
docker inspect -f "sentry-kafka {{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{end}}" sentry-self-hosted-kafka-1 2>/dev/null || true
docker ps --format "{{.Names}} {{.Status}}" | grep -E "sentry-self-hosted-(snuba|events|transactions|generic|metrics|subscription).*consumer" | head -20 || true
' 2>&1); then
    echo "$out"
    _p3_expect_service "$out" "sentry-clickhouse running healthy" \
      "110 Sentry ClickHouse healthy" blocked "110 Sentry ClickHouse not healthy"
    _p3_expect_service "$out" "sentry-kafka running healthy" \
      "110 Sentry Kafka healthy" blocked "110 Sentry Kafka not healthy"
    # Inverted check: a "Restarting" status anywhere in the report blocks.
    if grep -q "Restarting" <<<"$out"; then
      blocked "110 Sentry consumers include restarting containers"
    else
      ok "110 sampled Sentry consumers are not restarting"
    fi
  else
    blocked "110 high-load service check unavailable"
    echo "$out"
  fi
}
# Print the counter totals and the final verdict.
# Globals: PASS, WARN, FAIL (read)
# Returns: 1 when at least one check was blocked, otherwise 0.
summary() {
  section "summary"
  printf 'PASS=%s WARN=%s BLOCKED=%s\n' "$PASS" "$WARN" "$FAIL"
  if [ "$FAIL" -gt 0 ]; then
    printf '%s\n' "Result: HOLD_P3_RELEASE. Do not release runner/CD/crawlers/consumers further."
    return 1
  elif [ "$WARN" -gt 0 ]; then
    printf '%s\n' "Result: P3_RELEASE_WITH_CAUTION. Proceed only with operator review."
    return 0
  fi
  printf '%s\n' "Result: P3_RELEASE_READY. Controlled high-load work release is allowed."
}
# Entry sequence: print a header with the active limits, run every gate in
# order, then print the summary. summary runs last, so its return status
# becomes the script's exit status.
printf '%s\n' "AWOOOI P3 controlled release gate"
date '+%Y-%m-%d %H:%M:%S %Z'
printf 'Limits: load5/core<=%s load15/core<=%s job_container_cpu<=%s\n' \
  "$LOAD5_PER_CORE_LIMIT" "$LOAD15_PER_CORE_LIMIT" "$JOB_CONTAINER_CPU_LIMIT"

check_cold_start_gate
check_host_load "110" "wooo@192.168.0.110"
check_host_load "188" "ollama@192.168.0.188"
check_textfiles
check_storage_health
check_backup_health
check_runner_guardrails
check_job_containers
check_high_load_services

summary