425 lines
16 KiB
Bash
Executable File
425 lines
16 KiB
Bash
Executable File
#!/usr/bin/env bash
# AWOOOI P3 controlled release gate.
# Read-only: this script never starts, stops, restarts, deletes, or modifies services.

# Note: `-e` is intentionally not part of the option set below; the checks in
# this file inspect command exit statuses themselves.
set -uo pipefail

# Repository root, resolved as two directories above this script. Abort when it
# cannot be resolved or entered: later steps invoke repo-relative paths.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" || {
  echo "Unable to resolve repository root from ${BASH_SOURCE[0]}" >&2
  exit 1
}
cd "$ROOT_DIR" || {
  echo "Unable to enter repository root: $ROOT_DIR" >&2
  exit 1
}

# SSH options shared by every remote probe (see ssh_cmd).
SSH_BATCH_MODE=${SSH_BATCH_MODE:-yes}
SSH_OPTS=(-o BatchMode="$SSH_BATCH_MODE" -o ConnectTimeout=6)

# Command-line switches (set by the argument loop).
NO_COLOR=0
SKIP_COLD_START_GATE=0

# Thresholds, overridable through the environment.
LOAD5_PER_CORE_LIMIT="${LOAD5_PER_CORE_LIMIT:-1.0}"
LOAD15_PER_CORE_LIMIT="${LOAD15_PER_CORE_LIMIT:-1.0}"
JOB_CONTAINER_CPU_LIMIT="${JOB_CONTAINER_CPU_LIMIT:-1.0}"
TEXTFILE_MAX_AGE_SECONDS="${TEXTFILE_MAX_AGE_SECONDS:-300}"
|
|
|
# Print the command-line help text to stdout.
usage() {
  printf '%s\n' \
    'Usage: bash scripts/reboot-recovery/p3-controlled-release-gate.sh [options]' \
    '' \
    'Options:' \
    '--skip-cold-start-gate Do not run the full P0/P1/P2 read-only gate first.' \
    '--no-color Disable ANSI colors.' \
    '-h, --help Show this help.' \
    '' \
    'Environment overrides:' \
    'LOAD5_PER_CORE_LIMIT=1.0' \
    'LOAD15_PER_CORE_LIMIT=1.0' \
    'JOB_CONTAINER_CPU_LIMIT=1.0' \
    'TEXTFILE_MAX_AGE_SECONDS=300'
}
|
|
|
|
# Consume command-line switches one at a time. An unrecognised argument prints
# the help text to stderr and exits with status 64.
until [ "$#" -eq 0 ]; do
  case "$1" in
    -h | --help)
      usage
      exit 0
      ;;
    --no-color)
      NO_COLOR=1
      ;;
    --skip-cold-start-gate)
      SKIP_COLD_START_GATE=1
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 64
      ;;
  esac
  shift
done
|
|
|
|
# ANSI colour escapes used by the reporting helpers; all empty when colours are
# disabled (NO_COLOR equal to "1").
RED=""
GREEN=""
YELLOW=""
BLUE=""
NC=""
if [ "$NO_COLOR" != "1" ]; then
  RED=$'\033[0;31m'
  GREEN=$'\033[0;32m'
  YELLOW=$'\033[1;33m'
  BLUE=$'\033[0;34m'
  NC=$'\033[0m'
fi

# Running tallies updated by ok / warn / blocked and printed by summary.
PASS=0
WARN=0
FAIL=0
|
|
|
|
# Print a section banner: a blank line, then "=== <title> ===" wrapped in the
# BLUE / NC colour codes.
# Arguments: $1 - section title
section() {
  local title="$1"
  printf '\n%s=== %s ===%s\n' "${BLUE}" "${title}" "${NC}"
}
|
|
|
|
# Record a passing check: bump PASS and print "OK <message>".
# Arguments: all arguments are joined into the message.
ok() {
  local message="$*"
  PASS=$((1 + PASS))
  printf '%sOK%s %s\n' "${GREEN}" "${NC}" "${message}"
}
|
|
|
|
# Record a non-blocking finding: bump WARN and print "WARN <message>".
# Arguments: all arguments are joined into the message.
warn() {
  local message="$*"
  WARN=$((1 + WARN))
  printf '%sWARN%s %s\n' "${YELLOW}" "${NC}" "${message}"
}
|
|
|
|
# Record a blocking finding: bump FAIL and print "BLOCKED <message>".
# Arguments: all arguments are joined into the message.
blocked() {
  local message="$*"
  FAIL=$((1 + FAIL))
  printf '%sBLOCKED%s %s\n' "${RED}" "${NC}" "${message}"
}
|
|
|
|
# Run one command string on a remote host using the shared SSH_OPTS.
# Arguments: $1 - ssh destination (user@host)
#            $2 - command line to execute remotely
# Returns:   the exit status of ssh.
ssh_cmd() {
  local destination="$1" remote_command="$2"
  ssh "${SSH_OPTS[@]}" "${destination}" "${remote_command}"
}
|
|
|
|
# Numeric "less than or equal" for decimal values.
# Arguments: $1 - left operand, $2 - right operand
# Returns:   0 when $1 <= $2; 1 when $1 > $2; 2 when either operand is empty
#            or not a plain decimal/scientific number.
# Why the validation: awk compares non-numeric operands as strings, so an
# empty measurement would otherwise compare "<=" any limit and pass the gate.
float_le() {
  awk -v a="$1" -v b="$2" '
    function is_number(s) {
      return s ~ /^[+-]?([0-9]+[.]?[0-9]*|[.][0-9]+)([eE][+-]?[0-9]+)?$/
    }
    BEGIN {
      if (!is_number(a) || !is_number(b)) exit 2
      exit !((a + 0) <= (b + 0))
    }'
}
|
|
|
|
# Run the full-stack cold-start check once and translate its result into a
# gate verdict.
# Globals:  SKIP_COLD_START_GATE (read); TMPDIR (read, optional)
# Outputs:  one ok / warn / blocked line.
# Verdict:  exit status 0 from the check -> ok; non-zero with a summary line
#           reporting BLOCKED=0 -> warn; anything else -> blocked.
# The captured output goes to a fresh mktemp file instead of a fixed /tmp
# name, so the redirect cannot be pointed elsewhere through a pre-created
# file or symlink and concurrent runs do not overwrite each other. The log is
# removed on a green result and its path is reported otherwise.
check_cold_start_gate() {
  local gate_script="scripts/reboot-recovery/full-stack-cold-start-check.sh"
  local log rc summary blocked_count

  section "P0/P1/P2 cold-start gate"
  if [ "$SKIP_COLD_START_GATE" -eq 1 ]; then
    warn "cold-start gate skipped by operator option"
    return
  fi

  if ! log=$(mktemp "${TMPDIR:-/tmp}/awoooi-p3-cold-start-gate.XXXXXX"); then
    blocked "cold-start gate could not create a log file under ${TMPDIR:-/tmp}"
    return
  fi

  SSH_BATCH_MODE=yes bash "$gate_script" --monitor-read-only --no-color --watch --interval 1 --max-attempts 1 >"$log" 2>&1
  rc=$?

  # Last "PASS=n WARN=n BLOCKED=n" line printed by the check, if any.
  summary=$(grep -E '^PASS=[0-9]+ WARN=[0-9]+ BLOCKED=[0-9]+' "$log" | tail -n 1 || true)
  blocked_count=""
  if [[ "$summary" =~ BLOCKED=([0-9]+) ]]; then
    blocked_count="${BASH_REMATCH[1]}"
  fi

  if [ "$rc" -eq 0 ]; then
    ok "cold-start gate is GREEN"
    rm -f -- "$log"
  elif [ "${blocked_count:-1}" = "0" ]; then
    warn "cold-start gate is DEGRADED but not blocked: ${summary:-summary unavailable}; see $log"
  else
    blocked "cold-start gate has blocked items: ${summary:-summary unavailable}; see $log"
  fi
}
|
|
|
|
# Compare a host's 5- and 15-minute load averages per CPU core against
# LOAD5_PER_CORE_LIMIT / LOAD15_PER_CORE_LIMIT.
# Arguments: $1 - short host label used in messages
#            $2 - ssh destination
# Outputs:   the raw remote report, then one ok / blocked line per window.
# A per-core value that is missing or non-numeric in the remote report is
# reported as blocked instead of being passed to float_le, so an unreadable
# report cannot pass the gate.
check_host_load() {
  local label="$1"
  local target="$2"
  local out window key limit value
  local number_re='^[0-9]+([.][0-9]+)?$'

  section "$label load gate"
  # Remote side: read load5/load15 from /proc/loadavg, divide by nproc, and
  # print everything on one KEY VALUE ... line.
  if ! out=$(ssh_cmd "$target" 'read _ load5 load15 _ < /proc/loadavg; cores=$(nproc); awk -v l5="$load5" -v l15="$load15" -v c="$cores" "BEGIN {printf \"LOAD5 %.4f LOAD15 %.4f CORES %d LOAD5_PER_CORE %.6f LOAD15_PER_CORE %.6f\\n\", l5, l15, c, l5/c, l15/c}"' 2>&1); then
    blocked "$label load check unavailable"
    echo "$out"
    return
  fi
  echo "$out"

  for window in 5 15; do
    key="LOAD${window}_PER_CORE"
    if [ "$window" = "5" ]; then
      limit="$LOAD5_PER_CORE_LIMIT"
    else
      limit="$LOAD15_PER_CORE_LIMIT"
    fi
    # First value following the key token anywhere in the output.
    value=$(awk -v key="$key" '{for (i = 1; i < NF; i++) if ($i == key) {print $(i + 1); exit}}' <<<"$out")

    if ! [[ "$value" =~ $number_re ]]; then
      blocked "$label load${window}/core could not be read from remote output"
    elif float_le "$value" "$limit"; then
      ok "$label load${window}/core <= $limit"
    else
      blocked "$label load${window}/core too high for P3 release"
    fi
  done
}
|
|
|
|
# Helper for check_textfiles: report the age of each listed file on one host.
# Arguments: $1 - host label used in messages
#            $2 - ssh destination
#            $3 - space-separated "basename=max_age_seconds" overrides
#            $4... - absolute paths of the files to inspect
# Globals:   TEXTFILE_MAX_AGE_SECONDS (read) - default maximum age
# Outputs:   one ok / blocked line per reported file.
_check_textfiles_on_host() {
  local label="$1" target="$2" overrides="$3"
  shift 3
  local remote_script listing file age max_age override
  local -a override_list=()
  local integer_re='^-?[0-9]+$'

  read -r -a override_list <<<"$overrides"

  # Remote side prints "<basename> <age-in-seconds>" or "<basename> missing".
  remote_script="set -- $(printf '%q ' "$@")"'
now=$(date +%s)
for f in "$@"; do
  if [ -f "$f" ]; then
    echo "$(basename "$f") $((now - $(stat -c %Y "$f")))"
  else
    echo "$(basename "$f") missing"
  fi
done
'

  if ! listing=$(ssh_cmd "$target" "$remote_script" 2>&1); then
    blocked "$label textfile freshness check unavailable"
    echo "$listing"
    return
  fi

  while read -r file age; do
    [ -n "${file:-}" ] || continue
    max_age="$TEXTFILE_MAX_AGE_SECONDS"
    for override in "${override_list[@]}"; do
      if [ "$file" = "${override%%=*}" ]; then
        max_age="${override#*=}"
      fi
    done

    if [ "$age" = "missing" ]; then
      blocked "$label $file missing"
    elif ! [[ "$age" =~ $integer_re ]]; then
      # Anything that is not an integer age (e.g. stray ssh diagnostics merged
      # in through 2>&1) blocks with an explicit reason.
      blocked "$label $file age unreadable: ${age:-<empty>}"
    elif [ "$age" -le "$max_age" ]; then
      ok "$label $file fresh age=${age}s"
    else
      blocked "$label $file stale age=${age}s"
    fi
  done <<<"$listing"
}

# Verify that the node_exporter textfiles on hosts 110 and 188 exist and were
# modified recently enough. Files with a per-file limit below use it instead
# of TEXTFILE_MAX_AGE_SECONDS.
check_textfiles() {
  section "textfile freshness"

  _check_textfiles_on_host "110" "wooo@192.168.0.110" \
    "cold_start_recovery.prom=900 backup_health.prom=900" \
    /home/wooo/node_exporter_textfiles/docker_stats.prom \
    /home/wooo/node_exporter_textfiles/systemd_units.prom \
    /home/wooo/node_exporter_textfiles/storage_health.prom \
    /home/wooo/node_exporter_textfiles/backup_health.prom \
    /home/wooo/node_exporter_textfiles/cold_start_recovery.prom

  _check_textfiles_on_host "188" "ollama@192.168.0.188" \
    "backup.prom=90000 backup_health.prom=900" \
    /home/ollama/node_exporter_textfiles/docker_stats.prom \
    /home/ollama/node_exporter_textfiles/docker_restart_count.prom \
    /home/ollama/node_exporter_textfiles/storage_health.prom \
    /home/ollama/node_exporter_textfiles/backup.prom \
    /home/ollama/node_exporter_textfiles/backup_health.prom
}
|
|
|
|
# Evaluate backup_health.prom on hosts 110 and 188.
# Outputs: the raw remote report per host, then ok / warn / blocked lines.
# The remote awk tallies the awoooi_backup_* samples into one
# "BACKUP_HEALTH key=value ..." line, which is parsed locally.
# Fixes relative to the previous version:
#   - total / integrity_total were computed but ignored, so a file with no
#     freshness samples reported "fresh"; zero samples now block.
#   - a report that does not match the expected line format blocks explicitly
#     instead of feeding malformed values to integer tests.
#   - the loop variable is local and verdicts use if/else rather than
#     `A && ok || blocked` chains.
check_backup_health() {
  section "backup health gate"
  local spec label target file out report
  local total stale missing_cron missing_script failed_count integrity_total integrity_stale
  local report_re='^BACKUP_HEALTH total=(-?[0-9]+) stale=(-?[0-9]+) missing_cron=(-?[0-9]+) missing_script=(-?[0-9]+) failed_count=(-?[0-9]+) integrity_total=(-?[0-9]+) integrity_stale=(-?[0-9]+)$'

  for spec in \
    "110|wooo@192.168.0.110|/home/wooo/node_exporter_textfiles/backup_health.prom" \
    "188|ollama@192.168.0.188|/home/ollama/node_exporter_textfiles/backup_health.prom"; do
    # spec is "label|ssh-destination|remote-file".
    label=${spec%%|*}
    target=${spec#*|}
    target=${target%%|*}
    file=${spec##*|}

    if ! out=$(ssh_cmd "$target" "
      if [ ! -f '$file' ]; then
        echo 'BACKUP_HEALTH missing'
        exit 0
      fi
      awk '
        /^awoooi_backup_job_fresh/ {total += 1; stale += (\$2 == 0)}
        /^awoooi_backup_job_configured/ {missing_cron += (\$2 == 0)}
        /^awoooi_backup_script_present/ {missing_script += (\$2 == 0)}
        /^awoooi_backup_last_run_failed_count/ {failed += \$2}
        /^awoooi_backup_integrity_fresh/ {integrity_total += 1; integrity_stale += (\$2 == 0)}
        END {printf \"BACKUP_HEALTH total=%d stale=%d missing_cron=%d missing_script=%d failed_count=%d integrity_total=%d integrity_stale=%d\\n\", total, stale, missing_cron, missing_script, failed, integrity_total, integrity_stale}
      ' '$file'
    " 2>&1); then
      blocked "$label backup health check unavailable"
      echo "$out"
      continue
    fi
    echo "$label $out"

    if grep -q "BACKUP_HEALTH missing" <<<"$out"; then
      blocked "$label backup_health.prom missing"
      continue
    fi

    # Use the last report line so stray diagnostics captured via 2>&1 are ignored.
    report=$(grep -E '^BACKUP_HEALTH total=' <<<"$out" | tail -n 1 || true)
    if ! [[ "$report" =~ $report_re ]]; then
      blocked "$label backup health report unparseable"
      continue
    fi
    total=${BASH_REMATCH[1]}
    stale=${BASH_REMATCH[2]}
    missing_cron=${BASH_REMATCH[3]}
    missing_script=${BASH_REMATCH[4]}
    failed_count=${BASH_REMATCH[5]}
    integrity_total=${BASH_REMATCH[6]}
    integrity_stale=${BASH_REMATCH[7]}

    if [ "$total" -eq 0 ]; then
      blocked "$label backup_health.prom has no awoooi_backup_job_fresh samples; freshness unverified"
    elif [ "$stale" -eq 0 ]; then
      ok "$label expected backups are fresh"
    else
      blocked "$label expected backup jobs are stale"
    fi

    if [ "$missing_cron" -eq 0 ]; then
      ok "$label expected backup crons are configured"
    else
      blocked "$label expected backup cron config missing"
    fi

    if [ "$missing_script" -eq 0 ]; then
      ok "$label expected backup scripts are present"
    else
      blocked "$label expected backup scripts missing"
    fi

    # Integrity and aggregate-failure verdicts are only issued for host 110.
    if [ "$label" = "110" ]; then
      if [ "$integrity_total" -eq 0 ]; then
        blocked "110 backup_health.prom has no awoooi_backup_integrity_fresh samples; integrity unverified"
      elif [ "$integrity_stale" -eq 0 ]; then
        ok "110 backup integrity and restore drill are fresh"
      else
        blocked "110 backup integrity or restore drill stale"
      fi

      if [ "$failed_count" -eq 0 ]; then
        ok "110 latest aggregate backup had no failed components"
      else
        warn "110 latest aggregate backup still records failed components; rerun backup-all after fixes"
      fi
    fi
  done
}
|
|
|
|
# Evaluate storage_health.prom on hosts 110 and 188.
# Outputs: the raw remote report per host, then ok / warn / blocked lines.
# The remote awk sums the read-only flag and the per-boot storage error
# counters into one "STORAGE_HEALTH key=value ..." line, which is parsed
# locally. A read-only root or current-boot errors block; previous-boot and
# fsck-log errors only warn.
# Fixes relative to the previous version: a report that does not match the
# expected line format blocks explicitly instead of feeding malformed values
# to integer tests, the loop variable is local, and verdicts use if/else
# rather than `A && ok || blocked` chains.
check_storage_health() {
  section "storage health gate"
  local spec label target file out report
  local root_readonly current_errors previous_errors fsck_errors
  local report_re='^STORAGE_HEALTH root_readonly=(-?[0-9]+) current=(-?[0-9]+) previous=(-?[0-9]+) fsck=(-?[0-9]+)$'

  for spec in \
    "110|wooo@192.168.0.110|/home/wooo/node_exporter_textfiles/storage_health.prom" \
    "188|ollama@192.168.0.188|/home/ollama/node_exporter_textfiles/storage_health.prom"; do
    # spec is "label|ssh-destination|remote-file".
    label=${spec%%|*}
    target=${spec#*|}
    target=${target%%|*}
    file=${spec##*|}

    if ! out=$(ssh_cmd "$target" "
      if [ ! -f '$file' ]; then
        echo 'STORAGE_HEALTH missing'
        exit 0
      fi
      awk '
        /^awoooi_host_root_filesystem_readonly/ {root += \$2}
        /^awoooi_host_storage_error_count/ && /boot=\"current\"/ {current += \$2}
        /^awoooi_host_storage_error_count/ && /boot=\"previous\"/ {previous += \$2}
        /^awoooi_host_storage_error_count/ && /boot=\"last-fsck-log\"/ {fsck += \$2}
        END {printf \"STORAGE_HEALTH root_readonly=%d current=%d previous=%d fsck=%d\\n\", root, current, previous, fsck}
      ' '$file'
    " 2>&1); then
      blocked "$label storage health check unavailable"
      echo "$out"
      continue
    fi
    echo "$label $out"

    if grep -q "STORAGE_HEALTH missing" <<<"$out"; then
      blocked "$label storage_health.prom missing"
      continue
    fi

    # Use the last report line so stray diagnostics captured via 2>&1 are ignored.
    report=$(grep -E '^STORAGE_HEALTH root_readonly=' <<<"$out" | tail -n 1 || true)
    if ! [[ "$report" =~ $report_re ]]; then
      blocked "$label storage health report unparseable"
      continue
    fi
    root_readonly=${BASH_REMATCH[1]}
    current_errors=${BASH_REMATCH[2]}
    previous_errors=${BASH_REMATCH[3]}
    fsck_errors=${BASH_REMATCH[4]}

    if [ "$root_readonly" -eq 0 ]; then
      ok "$label root filesystem is writable"
    else
      blocked "$label root filesystem is read-only"
    fi

    if [ "$current_errors" -eq 0 ]; then
      ok "$label current boot has no storage error evidence"
    else
      blocked "$label current boot has storage error evidence"
    fi

    if [ "$previous_errors" -eq 0 ]; then
      ok "$label previous boot has no storage error evidence"
    else
      warn "$label previous boot has storage error evidence; keep fsck/backup follow-up open"
    fi

    if [ "$fsck_errors" -eq 0 ]; then
      ok "$label fsck logs have no retained error evidence"
    else
      warn "$label fsck logs retain error evidence; verify offline fsck/backup status"
    fi
  done
}
|
|
|
|
# Verify that every actions.runner.* systemd unit on host 110 has its watchdog
# disabled (WatchdogUSec=0) and has both a CPU quota and a memory ceiling set.
# Outputs: the per-unit property listing, then one ok / blocked line.
# Fixes relative to the previous version:
#   - an empty CPUQuotaPerSecUSec / MemoryMax value (property not reported)
#     was accepted as "limit set"; it now marks the unit as bad.
#   - the verdict marker is matched as a whole line.
#   - the unused local `bad` is gone (the remote script keeps its own).
check_runner_guardrails() {
  section "runner/CD guardrails"
  local out

  if ! out=$(ssh_cmd "wooo@192.168.0.110" '
    bad=0
    for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
      watchdog=$(systemctl show "$u" -p WatchdogUSec --value)
      quota=$(systemctl show "$u" -p CPUQuotaPerSecUSec --value)
      memory=$(systemctl show "$u" -p MemoryMax --value)
      state=$(systemctl show "$u" -p ActiveState --value)
      echo "$u watchdog=$watchdog quota=$quota memory=$memory state=$state"
      [ "$watchdog" = "0" ] || bad=1
      case "$quota" in ""|infinity|0) bad=1 ;; esac
      case "$memory" in ""|infinity|0) bad=1 ;; esac
    done
    echo "BAD_RUNNER_GUARDRAILS $bad"
  ' 2>&1); then
    blocked "runner guardrail check unavailable"
    echo "$out"
    return
  fi
  echo "$out"

  if grep -qx "BAD_RUNNER_GUARDRAILS 0" <<<"$out"; then
    ok "all discovered runner units have watchdog disabled and CPU/memory limits"
  else
    blocked "runner guardrails incomplete"
  fi
}
|
|
|
|
# Check that no running Gitea Actions / CD job container on host 110 uses more
# than JOB_CONTAINER_CPU_LIMIT CPU cores.
# Outputs: the raw remote listing, then one ok / blocked line.
# Fix relative to the previous version: `docker ps ... | grep ... || true`
# swallowed a failing `docker ps`, so an unreachable Docker daemon was reported
# as "no active job containers". The listing is now captured first and a
# failure makes the remote script exit non-zero, which blocks the gate.
check_job_containers() {
  section "active job container CPU"
  local out bad_count

  if ! out=$(ssh_cmd "wooo@192.168.0.110" '
    running=$(docker ps --format "{{.Names}}") || { echo "docker ps failed"; exit 1; }
    # grep exits 1 when nothing matches; that is the normal idle case.
    names=$(printf "%s\n" "$running" | grep -E "^(GITEA-ACTIONS-|awoooi-cd-)" || true)
    if [ -z "$names" ]; then
      echo "NO_ACTIVE_JOB_CONTAINERS"
      exit 0
    fi
    for name in $names; do
      cpu=$(docker stats "$name" --no-stream --format "{{.CPUPerc}}" | tr -d "%" | awk "{printf \"%.6f\", \$1 / 100}")
      echo "JOB_CONTAINER $name cpu_cores=$cpu"
    done
  ' 2>&1); then
    blocked "job container CPU check unavailable"
    echo "$out"
    return
  fi
  echo "$out"

  if grep -q "NO_ACTIVE_JOB_CONTAINERS" <<<"$out"; then
    ok "no active Gitea/CD job containers"
    return
  fi

  # Count containers whose cpu_cores value exceeds the limit; an empty value
  # evaluates to 0 here.
  bad_count=$(awk -v limit="$JOB_CONTAINER_CPU_LIMIT" -F'cpu_cores=' '/^JOB_CONTAINER / {if (($2 + 0) > limit) bad++} END {print bad+0}' <<<"$out")
  if [ "$bad_count" -eq 0 ]; then
    ok "active job containers are below ${JOB_CONTAINER_CPU_LIMIT} CPU cores"
  else
    blocked "$bad_count active job container(s) exceed ${JOB_CONTAINER_CPU_LIMIT} CPU cores"
  fi
}
|
|
|
|
# Probe the heavy services on hosts 188 and 110 and report each one.
# Outputs: the raw remote report per host, then one ok / warn / blocked line
# per service. An ssh failure for a host yields a single blocked line.
check_high_load_services() {
  local report

  section "high-load service health"

  # Host 188: Ollama (systemd unit + HTTP API), momo-scheduler, litellm,
  # SignOz ClickHouse.
  if ! report=$(ssh_cmd "ollama@192.168.0.188" '
echo "ollama-systemd $(systemctl is-active ollama 2>/dev/null || true)"
echo "ollama-api $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:11434/api/tags || true)"
docker inspect -f "momo-scheduler {{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{end}}" momo-scheduler 2>/dev/null || true
docker inspect -f "litellm {{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{end}}" litellm 2>/dev/null || true
docker inspect -f "signoz-clickhouse {{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{end}}" signoz-clickhouse 2>/dev/null || true
' 2>&1); then
    blocked "188 high-load service check unavailable"
    echo "$report"
  else
    echo "$report"

    if grep -q "ollama-systemd active" <<<"$report"; then
      ok "188 Ollama systemd active"
    else
      blocked "188 Ollama systemd inactive"
    fi

    if grep -q "ollama-api 200" <<<"$report"; then
      ok "188 Ollama API reachable"
    else
      blocked "188 Ollama API not reachable"
    fi

    if grep -q "momo-scheduler running healthy" <<<"$report"; then
      ok "188 momo-scheduler healthy"
    else
      blocked "188 momo-scheduler not healthy"
    fi

    if grep -Eq "litellm running( |$)" <<<"$report"; then
      ok "188 litellm running"
    else
      blocked "188 litellm not running"
    fi

    # Unconfirmed SignOz ClickHouse health only warns.
    if grep -q "signoz-clickhouse running healthy" <<<"$report"; then
      ok "188 SignOz ClickHouse healthy"
    else
      warn "188 SignOz ClickHouse health not confirmed"
    fi
  fi

  # Host 110: Sentry ClickHouse, Sentry Kafka, and a sample (first 20 matches)
  # of Sentry consumer containers.
  if ! report=$(ssh_cmd "wooo@192.168.0.110" '
docker inspect -f "sentry-clickhouse {{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{end}}" sentry-self-hosted-clickhouse-1 2>/dev/null || true
docker inspect -f "sentry-kafka {{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{end}}" sentry-self-hosted-kafka-1 2>/dev/null || true
docker ps --format "{{.Names}} {{.Status}}" | grep -E "sentry-self-hosted-(snuba|events|transactions|generic|metrics|subscription).*consumer" | head -20 || true
' 2>&1); then
    blocked "110 high-load service check unavailable"
    echo "$report"
  else
    echo "$report"

    if grep -q "sentry-clickhouse running healthy" <<<"$report"; then
      ok "110 Sentry ClickHouse healthy"
    else
      blocked "110 Sentry ClickHouse not healthy"
    fi

    if grep -q "sentry-kafka running healthy" <<<"$report"; then
      ok "110 Sentry Kafka healthy"
    else
      blocked "110 Sentry Kafka not healthy"
    fi

    if grep -q "Restarting" <<<"$report"; then
      blocked "110 Sentry consumers include restarting containers"
    else
      ok "110 sampled Sentry consumers are not restarting"
    fi
  fi
}
|
|
|
|
# Print the PASS / WARN / BLOCKED tallies and the overall release verdict.
# Globals: PASS, WARN, FAIL (read)
# Returns: 1 when at least one check was blocked, 0 otherwise.
summary() {
  local verdict status=0

  section "summary"
  echo "PASS=$PASS WARN=$WARN BLOCKED=$FAIL"

  if [ "$FAIL" -gt 0 ]; then
    verdict="Result: HOLD_P3_RELEASE. Do not release runner/CD/crawlers/consumers further."
    status=1
  elif [ "$WARN" -gt 0 ]; then
    verdict="Result: P3_RELEASE_WITH_CAUTION. Proceed only with operator review."
  else
    verdict="Result: P3_RELEASE_READY. Controlled high-load work release is allowed."
  fi

  echo "$verdict"
  return "$status"
}
|
|
|
|
# Entry sequence: print the banner, timestamp, and active limits, then run
# every gate in order. `summary` runs last, so its return status is the last
# command status of this sequence.
printf '%s\n' "AWOOOI P3 controlled release gate"
date '+%Y-%m-%d %H:%M:%S %Z'
printf 'Limits: load5/core<=%s load15/core<=%s job_container_cpu<=%s\n' \
  "$LOAD5_PER_CORE_LIMIT" "$LOAD15_PER_CORE_LIMIT" "$JOB_CONTAINER_CPU_LIMIT"

check_cold_start_gate

check_host_load "110" "wooo@192.168.0.110"
check_host_load "188" "ollama@192.168.0.188"

check_textfiles
check_storage_health
check_backup_health
check_runner_guardrails
check_job_containers
check_high_load_services

summary
|