fix(recovery): bound 110 pressure readback
Some checks failed
CD Pipeline / workflow-shape (push) Has been cancelled
CD Pipeline / cancel-stale-cd (push) Has been cancelled
CD Pipeline / tests (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
Some checks failed
CD Pipeline / workflow-shape (push) Has been cancelled
CD Pipeline / cancel-stale-cd (push) Has been cancelled
CD Pipeline / tests (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
This commit is contained in:
@@ -35,6 +35,7 @@ MAX_ACTIVE_CI_PROCESS_GROUPS="${HOST_WEB_BUILD_PRESSURE_MAX_ACTIVE_CI_PROCESS_GR
|
||||
MAX_ACTIVE_CI_CONTAINERS="${HOST_WEB_BUILD_PRESSURE_MAX_ACTIVE_CI_CONTAINERS:-1}"
|
||||
MAX_ORPHAN_BROWSER_GROUPS="${HOST_WEB_BUILD_PRESSURE_MAX_ORPHAN_BROWSER_GROUPS:-0}"
|
||||
MAX_POSTGRES_CPU_CORES="${HOST_WEB_BUILD_PRESSURE_MAX_POSTGRES_CPU_CORES:-2.0}"
|
||||
MAX_DOCKER_METRICS_AGE_SECONDS="${HOST_WEB_BUILD_PRESSURE_MAX_DOCKER_METRICS_AGE_SECONDS:-300}"
|
||||
POSTGRES_CONTAINER_NAME="${HOST_WEB_BUILD_PRESSURE_POSTGRES_CONTAINER:-k3s-postgres-recovery}"
|
||||
METRICS_FILE="${HOST_RUNAWAY_PROCESS_METRICS_FILE:-${HOST_WEB_BUILD_PRESSURE_METRICS_FILE:-/home/wooo/node_exporter_textfiles/host_runaway_process.prom}}"
|
||||
DEFAULT_DOCKER_METRICS_FILE="/home/$(id -un)/node_exporter_textfiles/docker_stats.prom"
|
||||
@@ -122,6 +123,13 @@ docker_metric_labeled_value() {
|
||||
if [ ! -r "$DOCKER_METRICS_FILE" ]; then
|
||||
return 1
|
||||
fi
|
||||
local now mtime age
|
||||
now="$(date +%s)"
|
||||
mtime="$(stat -c %Y "$DOCKER_METRICS_FILE" 2>/dev/null || stat -f %m "$DOCKER_METRICS_FILE" 2>/dev/null || echo 0)"
|
||||
age=$((now - mtime))
|
||||
if [ "$age" -gt "$MAX_DOCKER_METRICS_AGE_SECONDS" ]; then
|
||||
return 1
|
||||
fi
|
||||
awk -v metric="$name" -v key="$label_key" -v val="$label_value" '
|
||||
$1 ~ ("^" metric "\\{") && $0 ~ (key "=\"" val "\"") {
|
||||
value = $NF
|
||||
|
||||
@@ -67,13 +67,26 @@ probe_node_exporter() {
|
||||
echo "NODE_EXPORTER=ok"
|
||||
awk '
|
||||
$1 == "node_boot_time_seconds" {print "NODE_BOOT_TIME_SECONDS="$2}
|
||||
$1 == "node_time_seconds" {print "NODE_TIME_SECONDS="$2}
|
||||
$1 == "node_time_seconds" {node_time=$2; print "NODE_TIME_SECONDS="$2}
|
||||
$1 == "node_load1" {print "NODE_LOAD1="$2}
|
||||
$1 == "node_load5" {print "NODE_LOAD5="$2}
|
||||
$1 == "node_load15" {print "NODE_LOAD15="$2}
|
||||
$1 == "node_procs_blocked" {print "NODE_PROCS_BLOCKED="$2}
|
||||
$1 == "node_memory_MemAvailable_bytes" {print "NODE_MEM_AVAILABLE_BYTES="$2}
|
||||
$1 == "node_memory_MemTotal_bytes" {print "NODE_MEM_TOTAL_BYTES="$2}
|
||||
/^node_textfile_mtime_seconds/ && /docker_stats\.prom/ {docker_stats_mtime=$NF}
|
||||
END {
|
||||
if (docker_stats_mtime != "") {
|
||||
print "DOCKER_STATS_TEXTFILE_MTIME_SECONDS=" docker_stats_mtime
|
||||
if (node_time != "") {
|
||||
age = node_time - docker_stats_mtime
|
||||
printf "DOCKER_STATS_TEXTFILE_AGE_SECONDS=%.0f\n", age
|
||||
print "DOCKER_STATS_TEXTFILE_FRESHNESS=" (age <= 300 ? "fresh" : "stale")
|
||||
}
|
||||
} else {
|
||||
print "DOCKER_STATS_TEXTFILE_FRESHNESS=missing"
|
||||
}
|
||||
}
|
||||
' <<<"$metrics"
|
||||
cpu_count="$(awk -F'cpu="' '/^node_cpu_seconds_total/ {split($2, a, "\""); seen[a[1]]=1} END {for (cpu in seen) n++; print n+0}' <<<"$metrics")"
|
||||
load1="$(awk '$1 == "node_load1" {print $2; exit}' <<<"$metrics")"
|
||||
@@ -96,8 +109,10 @@ probe_node_exporter() {
|
||||
sub(/^.*sub_state="/, "", substate)
|
||||
sub(/".*$/, "", substate)
|
||||
classifier=active
|
||||
if (active == "scrape_error" && substate ~ /timed out/) {
|
||||
if (active == "scrape_error" && substate ~ /(timed out|timeout)/) {
|
||||
classifier="systemctl_show_timeout"
|
||||
} else if (active == "scrape_skipped" && substate ~ /systemctl_timeout_budget_exhausted/) {
|
||||
classifier="systemctl_timeout_budget_exhausted"
|
||||
}
|
||||
printf "SYSTEMD_UNIT unit=%s active_state=%s classifier=%s\n", unit, active, classifier
|
||||
}
|
||||
|
||||
@@ -312,36 +312,43 @@ check_110() {
|
||||
fi
|
||||
|
||||
if ! out=$(host_cmd "wooo@192.168.0.110" '
|
||||
sc() {
|
||||
if command -v timeout >/dev/null 2>&1; then
|
||||
timeout 3 systemctl "$@" 2>/dev/null || true
|
||||
else
|
||||
systemctl "$@" 2>/dev/null || true
|
||||
fi
|
||||
}
|
||||
echo "HOST $(hostname) $(uptime)"
|
||||
echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")"
|
||||
echo "DOCKER_SYSTEMD $(systemctl is-active docker 2>/dev/null || true)"
|
||||
echo "DOCKER_SYSTEMD $(sc is-active docker)"
|
||||
echo "HARBOR_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:5000/v2/ || true)"
|
||||
echo "GITEA_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:3001/ || true)"
|
||||
echo "PROM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9090/-/ready || true)"
|
||||
echo "AM_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:9093/-/healthy || true)"
|
||||
echo "SENTRY_CODE $(curl -s -o /dev/null -w "%{http_code}" --max-time 8 http://127.0.0.1:9000/ || true)"
|
||||
echo "ACTION_RUNNER_UNIT_FILE_COUNT $(systemctl list-unit-files "actions.runner.*" --no-legend --plain 2>/dev/null | awk "END {print NR+0}")"
|
||||
echo "ACTION_RUNNER_ACTIVE_COUNT $(systemctl list-units "actions.runner.*" --state=active --no-legend --plain 2>/dev/null | awk "END {print NR+0}")"
|
||||
echo "ACTION_RUNNER_ENABLED_COUNT $(systemctl list-unit-files "actions.runner.*" --no-legend --plain 2>/dev/null | awk "\$2 == \"enabled\" {c++} END {print c+0}")"
|
||||
for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
|
||||
systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /"
|
||||
echo "ACTION_RUNNER_UNIT_FILE_COUNT $(sc list-unit-files "actions.runner.*" --no-legend --plain | awk "END {print NR+0}")"
|
||||
echo "ACTION_RUNNER_ACTIVE_COUNT $(sc list-units "actions.runner.*" --state=active --no-legend --plain | awk "END {print NR+0}")"
|
||||
echo "ACTION_RUNNER_ENABLED_COUNT $(sc list-unit-files "actions.runner.*" --no-legend --plain | awk "\$2 == \"enabled\" {c++} END {print c+0}")"
|
||||
for u in $(sc list-units "actions.runner.*" --all --no-legend --plain | awk "{print \$1}"); do
|
||||
sc show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /"
|
||||
done
|
||||
for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
|
||||
load=$(systemctl show "$u" -p LoadState --value 2>/dev/null || true)
|
||||
unitfile=$(systemctl show "$u" -p UnitFileState --value 2>/dev/null || true)
|
||||
active=$(systemctl show "$u" -p ActiveState --value 2>/dev/null || true)
|
||||
mainpid=$(systemctl show "$u" -p MainPID --value 2>/dev/null || true)
|
||||
load=$(sc show "$u" -p LoadState --value)
|
||||
unitfile=$(sc show "$u" -p UnitFileState --value)
|
||||
active=$(sc show "$u" -p ActiveState --value)
|
||||
mainpid=$(sc show "$u" -p MainPID --value)
|
||||
unit_ok=0
|
||||
if [ "$load" = "masked" ] && [ "$unitfile" = "masked" ] && [ "$active" = "inactive" ]; then
|
||||
unit_ok=1
|
||||
fi
|
||||
echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active mainpid=$mainpid ok=$unit_ok"
|
||||
done
|
||||
cd_lane_load=$(systemctl show awoooi-cd-lane.service -p LoadState --value 2>/dev/null || true)
|
||||
cd_lane_unitfile=$(systemctl show awoooi-cd-lane.service -p UnitFileState --value 2>/dev/null || true)
|
||||
cd_lane_active=$(systemctl show awoooi-cd-lane.service -p ActiveState --value 2>/dev/null || true)
|
||||
cd_lane_mainpid=$(systemctl show awoooi-cd-lane.service -p MainPID --value 2>/dev/null || true)
|
||||
cd_lane_execstart=$(systemctl show awoooi-cd-lane.service -p ExecStart --value 2>/dev/null || true)
|
||||
cd_lane_load=$(sc show awoooi-cd-lane.service -p LoadState --value)
|
||||
cd_lane_unitfile=$(sc show awoooi-cd-lane.service -p UnitFileState --value)
|
||||
cd_lane_active=$(sc show awoooi-cd-lane.service -p ActiveState --value)
|
||||
cd_lane_mainpid=$(sc show awoooi-cd-lane.service -p MainPID --value)
|
||||
cd_lane_execstart=$(sc show awoooi-cd-lane.service -p ExecStart --value)
|
||||
cd_lane_sentinel=missing
|
||||
[ -e /run/awoooi-cd-lane-enabled ] && cd_lane_sentinel=present
|
||||
cd_lane_capacity_ok=0
|
||||
@@ -372,16 +379,16 @@ elif [ "$cd_lane_sentinel" = "present" ] && [ "$cd_lane_active" = "active" ] &&
|
||||
cd_lane_mode=controlled_open
|
||||
fi
|
||||
echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active mainpid=$cd_lane_mainpid sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf process_count=$cd_lane_process_count ok=$cd_lane_ok"
|
||||
cd_lane_drain_load=$(systemctl show awoooi-cd-lane-drain.service -p LoadState --value 2>/dev/null || true)
|
||||
cd_lane_drain_unitfile=$(systemctl show awoooi-cd-lane-drain.service -p UnitFileState --value 2>/dev/null || true)
|
||||
cd_lane_drain_active=$(systemctl show awoooi-cd-lane-drain.service -p ActiveState --value 2>/dev/null || true)
|
||||
cd_lane_drain_mainpid=$(systemctl show awoooi-cd-lane-drain.service -p MainPID --value 2>/dev/null || true)
|
||||
cd_lane_drain_cpu_accounting=$(systemctl show awoooi-cd-lane-drain.service -p CPUAccounting --value 2>/dev/null || true)
|
||||
cd_lane_drain_cpu_quota=$(systemctl show awoooi-cd-lane-drain.service -p CPUQuotaPerSecUSec --value 2>/dev/null || true)
|
||||
cd_lane_drain_memory_accounting=$(systemctl show awoooi-cd-lane-drain.service -p MemoryAccounting --value 2>/dev/null || true)
|
||||
cd_lane_drain_memory_max=$(systemctl show awoooi-cd-lane-drain.service -p MemoryMax --value 2>/dev/null || true)
|
||||
cd_lane_drain_tasks_accounting=$(systemctl show awoooi-cd-lane-drain.service -p TasksAccounting --value 2>/dev/null || true)
|
||||
cd_lane_drain_tasks_max=$(systemctl show awoooi-cd-lane-drain.service -p TasksMax --value 2>/dev/null || true)
|
||||
cd_lane_drain_load=$(sc show awoooi-cd-lane-drain.service -p LoadState --value)
|
||||
cd_lane_drain_unitfile=$(sc show awoooi-cd-lane-drain.service -p UnitFileState --value)
|
||||
cd_lane_drain_active=$(sc show awoooi-cd-lane-drain.service -p ActiveState --value)
|
||||
cd_lane_drain_mainpid=$(sc show awoooi-cd-lane-drain.service -p MainPID --value)
|
||||
cd_lane_drain_cpu_accounting=$(sc show awoooi-cd-lane-drain.service -p CPUAccounting --value)
|
||||
cd_lane_drain_cpu_quota=$(sc show awoooi-cd-lane-drain.service -p CPUQuotaPerSecUSec --value)
|
||||
cd_lane_drain_memory_accounting=$(sc show awoooi-cd-lane-drain.service -p MemoryAccounting --value)
|
||||
cd_lane_drain_memory_max=$(sc show awoooi-cd-lane-drain.service -p MemoryMax --value)
|
||||
cd_lane_drain_tasks_accounting=$(sc show awoooi-cd-lane-drain.service -p TasksAccounting --value)
|
||||
cd_lane_drain_tasks_max=$(sc show awoooi-cd-lane-drain.service -p TasksMax --value)
|
||||
cd_lane_drain_limits_ok=0
|
||||
if [ "$cd_lane_drain_cpu_accounting" = "yes" ] \
|
||||
&& [ -n "$cd_lane_drain_cpu_quota" ] && [ "$cd_lane_drain_cpu_quota" != "infinity" ] \
|
||||
@@ -442,6 +449,8 @@ done
|
||||
docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120
|
||||
' 2>&1); then
|
||||
fail "ssh 110 read-only check"
|
||||
echo "SSH_110_BLOCKER remote_control_channel_unavailable"
|
||||
echo "SSH_110_NEXT_ACTION local_console_run_recover_110_control_path_and_harbor_local_check"
|
||||
echo "$out"
|
||||
return
|
||||
fi
|
||||
|
||||
@@ -27,6 +27,9 @@ def test_full_stack_cold_start_check_bounds_ssh_probes() -> None:
|
||||
assert "-o ServerAliveCountMax=1" in text
|
||||
assert "timeout ${SSH_COMMAND_TIMEOUT_SECONDS}s bash -lc" in text
|
||||
assert "printf -v quoted_cmd '%q' \"$cmd\"" in text
|
||||
assert 'timeout 3 systemctl "$@"' in text
|
||||
assert "SSH_110_BLOCKER remote_control_channel_unavailable" in text
|
||||
assert "SSH_110_NEXT_ACTION local_console_run_recover_110_control_path_and_harbor_local_check" in text
|
||||
|
||||
|
||||
def test_recovery_scorecard_bounds_offsite_evidence_ssh() -> None:
|
||||
@@ -118,8 +121,11 @@ def test_110_ssh_publickey_auth_diagnosis_is_bounded_and_read_only() -> None:
|
||||
assert "NODE_EXPORTER=ok" in text
|
||||
assert "NODE_LOAD1_PER_CPU" in text
|
||||
assert "NODE_LOAD_CLASSIFIER" in text
|
||||
assert "DOCKER_STATS_TEXTFILE_AGE_SECONDS" in text
|
||||
assert "DOCKER_STATS_TEXTFILE_FRESHNESS" in text
|
||||
assert "SYSTEMD_UNIT unit=%s active_state=%s classifier=%s" in text
|
||||
assert "systemctl_show_timeout" in text
|
||||
assert "systemctl_timeout_budget_exhausted" in text
|
||||
assert "cat /home" not in text
|
||||
assert "cat ~/.ssh/authorized_keys" not in text
|
||||
assert "cat \"$home_dir/.ssh/authorized_keys\"" not in text
|
||||
|
||||
Reference in New Issue
Block a user