fix(recovery): bound 110 pressure readback
Some checks failed
CD Pipeline / workflow-shape (push) Has been cancelled
CD Pipeline / cancel-stale-cd (push) Has been cancelled
CD Pipeline / tests (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled

This commit is contained in:
Your Name
2026-07-01 13:37:22 +08:00
parent 12b78453cf
commit bc46edc01c
7 changed files with 95 additions and 27 deletions

View File

@@ -35,6 +35,7 @@ MAX_ACTIVE_CI_PROCESS_GROUPS="${HOST_WEB_BUILD_PRESSURE_MAX_ACTIVE_CI_PROCESS_GR
# Threshold defaults; each one can be overridden through the matching
# HOST_WEB_BUILD_PRESSURE_* environment variable named in its expansion.
MAX_ACTIVE_CI_CONTAINERS="${HOST_WEB_BUILD_PRESSURE_MAX_ACTIVE_CI_CONTAINERS:-1}"
MAX_ORPHAN_BROWSER_GROUPS="${HOST_WEB_BUILD_PRESSURE_MAX_ORPHAN_BROWSER_GROUPS:-0}"
MAX_POSTGRES_CPU_CORES="${HOST_WEB_BUILD_PRESSURE_MAX_POSTGRES_CPU_CORES:-2.0}"
# Maximum age in seconds (default 300) of the docker metrics file; it is
# compared against the file mtime before labeled values are read from it.
MAX_DOCKER_METRICS_AGE_SECONDS="${HOST_WEB_BUILD_PRESSURE_MAX_DOCKER_METRICS_AGE_SECONDS:-300}"
# Postgres container name, overridable via HOST_WEB_BUILD_PRESSURE_POSTGRES_CONTAINER.
POSTGRES_CONTAINER_NAME="${HOST_WEB_BUILD_PRESSURE_POSTGRES_CONTAINER:-k3s-postgres-recovery}"
# Metrics textfile path. Precedence: HOST_RUNAWAY_PROCESS_METRICS_FILE, then
# HOST_WEB_BUILD_PRESSURE_METRICS_FILE, then the hard-coded default path.
# NOTE(review): this default hard-codes /home/wooo while the docker stats
# default below derives the user name from `id -un` - confirm this is intended.
METRICS_FILE="${HOST_RUNAWAY_PROCESS_METRICS_FILE:-${HOST_WEB_BUILD_PRESSURE_METRICS_FILE:-/home/wooo/node_exporter_textfiles/host_runaway_process.prom}}"
# Default docker stats textfile, located under /home/<name of the current user>.
DEFAULT_DOCKER_METRICS_FILE="/home/$(id -un)/node_exporter_textfiles/docker_stats.prom"
@@ -122,6 +123,13 @@ docker_metric_labeled_value() {
if [ ! -r "$DOCKER_METRICS_FILE" ]; then
return 1
fi
local now mtime age
now="$(date +%s)"
mtime="$(stat -c %Y "$DOCKER_METRICS_FILE" 2>/dev/null || stat -f %m "$DOCKER_METRICS_FILE" 2>/dev/null || echo 0)"
age=$((now - mtime))
if [ "$age" -gt "$MAX_DOCKER_METRICS_AGE_SECONDS" ]; then
return 1
fi
awk -v metric="$name" -v key="$label_key" -v val="$label_value" '
$1 ~ ("^" metric "\\{") && $0 ~ (key "=\"" val "\"") {
value = $NF

View File

@@ -67,13 +67,26 @@ probe_node_exporter() {
echo "NODE_EXPORTER=ok"
awk '
$1 == "node_boot_time_seconds" {print "NODE_BOOT_TIME_SECONDS="$2}
$1 == "node_time_seconds" {print "NODE_TIME_SECONDS="$2}
$1 == "node_time_seconds" {node_time=$2; print "NODE_TIME_SECONDS="$2}
$1 == "node_load1" {print "NODE_LOAD1="$2}
$1 == "node_load5" {print "NODE_LOAD5="$2}
$1 == "node_load15" {print "NODE_LOAD15="$2}
$1 == "node_procs_blocked" {print "NODE_PROCS_BLOCKED="$2}
$1 == "node_memory_MemAvailable_bytes" {print "NODE_MEM_AVAILABLE_BYTES="$2}
$1 == "node_memory_MemTotal_bytes" {print "NODE_MEM_TOTAL_BYTES="$2}
/^node_textfile_mtime_seconds/ && /docker_stats\.prom/ {docker_stats_mtime=$NF}
END {
if (docker_stats_mtime != "") {
print "DOCKER_STATS_TEXTFILE_MTIME_SECONDS=" docker_stats_mtime
if (node_time != "") {
age = node_time - docker_stats_mtime
printf "DOCKER_STATS_TEXTFILE_AGE_SECONDS=%.0f\n", age
print "DOCKER_STATS_TEXTFILE_FRESHNESS=" (age <= 300 ? "fresh" : "stale")
}
} else {
print "DOCKER_STATS_TEXTFILE_FRESHNESS=missing"
}
}
' <<<"$metrics"
cpu_count="$(awk -F'cpu="' '/^node_cpu_seconds_total/ {split($2, a, "\""); seen[a[1]]=1} END {for (cpu in seen) n++; print n+0}' <<<"$metrics")"
load1="$(awk '$1 == "node_load1" {print $2; exit}' <<<"$metrics")"
@@ -96,8 +109,10 @@ probe_node_exporter() {
sub(/^.*sub_state="/, "", substate)
sub(/".*$/, "", substate)
classifier=active
if (active == "scrape_error" && substate ~ /timed out/) {
if (active == "scrape_error" && substate ~ /(timed out|timeout)/) {
classifier="systemctl_show_timeout"
} else if (active == "scrape_skipped" && substate ~ /systemctl_timeout_budget_exhausted/) {
classifier="systemctl_timeout_budget_exhausted"
}
printf "SYSTEMD_UNIT unit=%s active_state=%s classifier=%s\n", unit, active, classifier
}

View File

@@ -312,36 +312,43 @@ check_110() {
fi
if ! out=$(host_cmd "wooo@192.168.0.110" '
sc() {
  # Best-effort systemctl wrapper: forwards every argument to systemctl,
  # prefixed with a 3 second timeout when the timeout utility is available.
  # stderr is discarded and the function always returns 0.
  # This code sits inside a single-quoted string, so keep it free of
  # single-quote characters.
  if command -v timeout >/dev/null 2>&1; then
    set -- timeout 3 systemctl "$@"
  else
    set -- systemctl "$@"
  fi
  "$@" 2>/dev/null || true
}
echo "HOST $(hostname) $(uptime)"
echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
echo "DOCKER_SYSTEMD $(systemctl is-active docker 2>/dev/null || true)"
echo "DOCKER_SYSTEMD $(sc is-active docker)"
echo "HARBOR_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5000/v2/ || true)"
echo "GITEA_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3001/ || true)"
echo "PROM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9090/-/ready || true)"
echo "AM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9093/-/healthy || true)"
echo "SENTRY_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 8 http://127.0.0.1:9000/ || true)"
echo "ACTION_RUNNER_UNIT_FILE_COUNT $(systemctl list-unit-files "actions.runner.*" --no-legend --plain 2>/dev/null | awk "END {print NR+0}")"
echo "ACTION_RUNNER_ACTIVE_COUNT $(systemctl list-units "actions.runner.*" --state=active --no-legend --plain 2>/dev/null | awk "END {print NR+0}")"
echo "ACTION_RUNNER_ENABLED_COUNT $(systemctl list-unit-files "actions.runner.*" --no-legend --plain 2>/dev/null | awk "\$2 == \"enabled\" {c++} END {print c+0}")"
for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /"
echo "ACTION_RUNNER_UNIT_FILE_COUNT $(sc list-unit-files "actions.runner.*" --no-legend --plain | awk "END {print NR+0}")"
echo "ACTION_RUNNER_ACTIVE_COUNT $(sc list-units "actions.runner.*" --state=active --no-legend --plain | awk "END {print NR+0}")"
echo "ACTION_RUNNER_ENABLED_COUNT $(sc list-unit-files "actions.runner.*" --no-legend --plain | awk "\$2 == \"enabled\" {c++} END {print c+0}")"
for u in $(sc list-units "actions.runner.*" --all --no-legend --plain | awk "{print \$1}"); do
sc show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /"
done
for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
load=$(systemctl show "$u" -p LoadState --value 2>/dev/null || true)
unitfile=$(systemctl show "$u" -p UnitFileState --value 2>/dev/null || true)
active=$(systemctl show "$u" -p ActiveState --value 2>/dev/null || true)
mainpid=$(systemctl show "$u" -p MainPID --value 2>/dev/null || true)
load=$(sc show "$u" -p LoadState --value)
unitfile=$(sc show "$u" -p UnitFileState --value)
active=$(sc show "$u" -p ActiveState --value)
mainpid=$(sc show "$u" -p MainPID --value)
unit_ok=0
if [ "$load" = "masked" ] && [ "$unitfile" = "masked" ] && [ "$active" = "inactive" ]; then
unit_ok=1
fi
echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active mainpid=$mainpid ok=$unit_ok"
done
cd_lane_load=$(systemctl show awoooi-cd-lane.service -p LoadState --value 2>/dev/null || true)
cd_lane_unitfile=$(systemctl show awoooi-cd-lane.service -p UnitFileState --value 2>/dev/null || true)
cd_lane_active=$(systemctl show awoooi-cd-lane.service -p ActiveState --value 2>/dev/null || true)
cd_lane_mainpid=$(systemctl show awoooi-cd-lane.service -p MainPID --value 2>/dev/null || true)
cd_lane_execstart=$(systemctl show awoooi-cd-lane.service -p ExecStart --value 2>/dev/null || true)
cd_lane_load=$(sc show awoooi-cd-lane.service -p LoadState --value)
cd_lane_unitfile=$(sc show awoooi-cd-lane.service -p UnitFileState --value)
cd_lane_active=$(sc show awoooi-cd-lane.service -p ActiveState --value)
cd_lane_mainpid=$(sc show awoooi-cd-lane.service -p MainPID --value)
cd_lane_execstart=$(sc show awoooi-cd-lane.service -p ExecStart --value)
cd_lane_sentinel=missing
[ -e /run/awoooi-cd-lane-enabled ] && cd_lane_sentinel=present
cd_lane_capacity_ok=0
@@ -372,16 +379,16 @@ elif [ "$cd_lane_sentinel" = "present" ] && [ "$cd_lane_active" = "active" ] &&
cd_lane_mode=controlled_open
fi
echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active mainpid=$cd_lane_mainpid sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf process_count=$cd_lane_process_count ok=$cd_lane_ok"
cd_lane_drain_load=$(systemctl show awoooi-cd-lane-drain.service -p LoadState --value 2>/dev/null || true)
cd_lane_drain_unitfile=$(systemctl show awoooi-cd-lane-drain.service -p UnitFileState --value 2>/dev/null || true)
cd_lane_drain_active=$(systemctl show awoooi-cd-lane-drain.service -p ActiveState --value 2>/dev/null || true)
cd_lane_drain_mainpid=$(systemctl show awoooi-cd-lane-drain.service -p MainPID --value 2>/dev/null || true)
cd_lane_drain_cpu_accounting=$(systemctl show awoooi-cd-lane-drain.service -p CPUAccounting --value 2>/dev/null || true)
cd_lane_drain_cpu_quota=$(systemctl show awoooi-cd-lane-drain.service -p CPUQuotaPerSecUSec --value 2>/dev/null || true)
cd_lane_drain_memory_accounting=$(systemctl show awoooi-cd-lane-drain.service -p MemoryAccounting --value 2>/dev/null || true)
cd_lane_drain_memory_max=$(systemctl show awoooi-cd-lane-drain.service -p MemoryMax --value 2>/dev/null || true)
cd_lane_drain_tasks_accounting=$(systemctl show awoooi-cd-lane-drain.service -p TasksAccounting --value 2>/dev/null || true)
cd_lane_drain_tasks_max=$(systemctl show awoooi-cd-lane-drain.service -p TasksMax --value 2>/dev/null || true)
cd_lane_drain_load=$(sc show awoooi-cd-lane-drain.service -p LoadState --value)
cd_lane_drain_unitfile=$(sc show awoooi-cd-lane-drain.service -p UnitFileState --value)
cd_lane_drain_active=$(sc show awoooi-cd-lane-drain.service -p ActiveState --value)
cd_lane_drain_mainpid=$(sc show awoooi-cd-lane-drain.service -p MainPID --value)
cd_lane_drain_cpu_accounting=$(sc show awoooi-cd-lane-drain.service -p CPUAccounting --value)
cd_lane_drain_cpu_quota=$(sc show awoooi-cd-lane-drain.service -p CPUQuotaPerSecUSec --value)
cd_lane_drain_memory_accounting=$(sc show awoooi-cd-lane-drain.service -p MemoryAccounting --value)
cd_lane_drain_memory_max=$(sc show awoooi-cd-lane-drain.service -p MemoryMax --value)
cd_lane_drain_tasks_accounting=$(sc show awoooi-cd-lane-drain.service -p TasksAccounting --value)
cd_lane_drain_tasks_max=$(sc show awoooi-cd-lane-drain.service -p TasksMax --value)
cd_lane_drain_limits_ok=0
if [ "$cd_lane_drain_cpu_accounting" = "yes" ] \
&& [ -n "$cd_lane_drain_cpu_quota" ] && [ "$cd_lane_drain_cpu_quota" != "infinity" ] \
@@ -442,6 +449,8 @@ done
docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120
' 2>&1); then
fail "ssh 110 read-only check"
echo "SSH_110_BLOCKER remote_control_channel_unavailable"
echo "SSH_110_NEXT_ACTION local_console_run_recover_110_control_path_and_harbor_local_check"
echo "$out"
return
fi

View File

@@ -27,6 +27,9 @@ def test_full_stack_cold_start_check_bounds_ssh_probes() -> None:
assert "-o ServerAliveCountMax=1" in text
assert "timeout ${SSH_COMMAND_TIMEOUT_SECONDS}s bash -lc" in text
assert "printf -v quoted_cmd '%q' \"$cmd\"" in text
assert 'timeout 3 systemctl "$@"' in text
assert "SSH_110_BLOCKER remote_control_channel_unavailable" in text
assert "SSH_110_NEXT_ACTION local_console_run_recover_110_control_path_and_harbor_local_check" in text
def test_recovery_scorecard_bounds_offsite_evidence_ssh() -> None:
@@ -118,8 +121,11 @@ def test_110_ssh_publickey_auth_diagnosis_is_bounded_and_read_only() -> None:
assert "NODE_EXPORTER=ok" in text
assert "NODE_LOAD1_PER_CPU" in text
assert "NODE_LOAD_CLASSIFIER" in text
assert "DOCKER_STATS_TEXTFILE_AGE_SECONDS" in text
assert "DOCKER_STATS_TEXTFILE_FRESHNESS" in text
assert "SYSTEMD_UNIT unit=%s active_state=%s classifier=%s" in text
assert "systemctl_show_timeout" in text
assert "systemctl_timeout_budget_exhausted" in text
assert "cat /home" not in text
assert "cat ~/.ssh/authorized_keys" not in text
assert "cat \"$home_dir/.ssh/authorized_keys\"" not in text