feat(agent): automate sustained host load response
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 28s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped

This commit is contained in:
Your Name
2026-07-01 08:43:40 +08:00
parent 5e629efa44
commit a6dc806d38
10 changed files with 1285 additions and 40 deletions

View File

@@ -0,0 +1,340 @@
#!/usr/bin/env python3
"""Classify sustained host load and emit a controlled automation packet.
The controller is intentionally read-only by default. It turns
HostLoadAverageSustainedHigh from a generic "SSH and look around" alert into a
deterministic AI Agent control packet:
* orphan browser/smoke load -> gated SIGTERM helper dry-run, then controlled
apply with evidence and post-apply verifier
* active Gitea Actions/BuildKit load -> runner pressure stays fail-closed;
drain/cancel decisions must use runner/CD verifiers, not process kills
* unknown or critical pressure -> source-specific playbook or break-glass
It never reads secrets, raw runner registrations, sessions, or environment
files, and it never mutates host state.
"""
from __future__ import annotations
import argparse
import json
import re
from pathlib import Path
from typing import Any
DEFAULT_METRICS_FILE = Path("/home/wooo/node_exporter_textfiles/host_runaway_process.prom")
SCHEMA_VERSION = "host_sustained_load_controlled_automation_v1"
LABEL_RE = re.compile(r"(?P<key>[A-Za-z_][A-Za-z0-9_]*)=\"(?P<value>(?:[^\"\\\\]|\\\\.)*)\"")
METRIC_RE = re.compile(
r"^(?P<name>[A-Za-z_:][A-Za-z0-9_:]*)(?:\{(?P<labels>[^}]*)\})?\s+"
r"(?P<value>[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)$"
)
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Build a controlled AI Agent packet for sustained host load."
)
parser.add_argument("--host", default="110")
parser.add_argument("--metrics-file", type=Path, default=DEFAULT_METRICS_FILE)
parser.add_argument("--load5-per-core-threshold", type=float, default=1.5)
parser.add_argument("--ci-stale-age-seconds", type=int, default=1800)
parser.add_argument("--json", action="store_true", help="Print JSON only.")
return parser.parse_args()
def _unescape_label(value: str) -> str:
return value.replace(r"\"", '"').replace(r"\\", "\\").replace(r"\n", "\n")
def parse_prometheus_text(text: str) -> list[dict[str, Any]]:
samples: list[dict[str, Any]] = []
for raw_line in text.splitlines():
line = raw_line.strip()
if not line or line.startswith("#"):
continue
match = METRIC_RE.match(line)
if not match:
continue
labels = {
item.group("key"): _unescape_label(item.group("value"))
for item in LABEL_RE.finditer(match.group("labels") or "")
}
samples.append(
{
"name": match.group("name"),
"labels": labels,
"value": float(match.group("value")),
}
)
return samples
def _sample_value(
samples: list[dict[str, Any]],
name: str,
*,
host: str,
labels: dict[str, str] | None = None,
default: float = 0.0,
) -> float:
expected = {"host": host, **(labels or {})}
for sample in samples:
if sample["name"] != name:
continue
sample_labels = sample["labels"]
if all(sample_labels.get(key) == value for key, value in expected.items()):
return float(sample["value"])
return default
def _rule_values(samples: list[dict[str, Any]], name: str, *, host: str) -> list[dict[str, Any]]:
values = []
for sample in samples:
if sample["name"] != name:
continue
labels = sample["labels"]
if labels.get("host") != host:
continue
rule = labels.get("rule")
if not rule:
continue
values.append({"rule": rule, "value": float(sample["value"])})
return values
def _top_orphan_rule(samples: list[dict[str, Any]], *, host: str) -> dict[str, Any] | None:
counts = _rule_values(
samples,
"awoooi_host_runaway_browser_orphan_group_count",
host=host,
)
cpu_by_rule = {
item["rule"]: item["value"]
for item in _rule_values(
samples,
"awoooi_host_runaway_browser_orphan_cpu_percent",
host=host,
)
}
candidates = [
{
"rule": item["rule"],
"group_count": int(item["value"]),
"cpu_percent": round(cpu_by_rule.get(item["rule"], 0.0), 3),
}
for item in counts
if item["value"] > 0
]
if not candidates:
return None
return sorted(candidates, key=lambda item: (-item["cpu_percent"], item["rule"]))[0]
def build_packet(
*,
host: str,
samples: list[dict[str, Any]],
load5_per_core_threshold: float,
ci_stale_age_seconds: int,
) -> dict[str, Any]:
monitor_up = int(
_sample_value(
samples,
"awoooi_host_runaway_process_monitor_up",
host=host,
labels={"mode": "read_only"},
default=0,
)
)
load5_per_core = _sample_value(samples, "awoooi_host_load5_per_core", host=host)
swap_used_ratio = _sample_value(samples, "awoooi_host_swap_used_ratio", host=host)
remediation_authorized = int(
_sample_value(
samples,
"awoooi_host_runaway_process_remediation_authorized",
host=host,
)
)
active_ci_containers = int(
_sample_value(
samples,
"awoooi_host_gitea_actions_active_container_count",
host=host,
default=0,
)
)
active_ci_groups = int(
_sample_value(
samples,
"awoooi_host_gitea_actions_active_process_group_count",
host=host,
default=0,
)
)
active_ci_cpu = _sample_value(
samples,
"awoooi_host_gitea_actions_active_process_cpu_percent",
host=host,
)
active_ci_oldest_age = int(
_sample_value(
samples,
"awoooi_host_gitea_actions_active_process_oldest_age_seconds",
host=host,
)
)
top_orphan = _top_orphan_rule(samples, host=host)
classification = "observing_load_within_threshold"
severity = "info"
controlled_apply_allowed = False
next_action = "keep_read_only_monitoring"
dry_run_command = ""
controlled_apply_command = ""
verifier_command = (
"scripts/ops/host-sustained-load-controller.py "
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE}"
)
if monitor_up != 1:
classification = "blocked_monitor_unavailable"
severity = "warning"
next_action = "restore_host_runaway_process_exporter_textfile_before_apply"
elif remediation_authorized > 0:
classification = "blocked_monitor_authority_violation"
severity = "critical"
next_action = "rollback_monitor_to_read_only_exporter"
elif load5_per_core > load5_per_core_threshold and top_orphan:
classification = "controlled_orphan_browser_remediation_ready"
severity = "critical"
controlled_apply_allowed = True
rule = top_orphan["rule"]
dry_run_command = f"scripts/ops/host-runaway-process-remediation.py --rule {rule}"
controlled_apply_command = (
"scripts/ops/host-runaway-process-remediation.py "
f"--rule {rule} --apply --confirm-apply "
"--controlled-apply-id ${CONTROLLED_APPLY_ID} "
"--evidence-ref ${EVIDENCE_REF} "
"--post-apply-verifier "
"'scripts/ops/host-sustained-load-controller.py --host "
f"{host} --metrics-file {DEFAULT_METRICS_FILE}' "
"--wait-seconds 10"
)
next_action = "run_orphan_browser_remediation_dry_run_then_controlled_sigterm"
elif (
load5_per_core > load5_per_core_threshold
and (active_ci_containers > 0 or active_ci_groups > 0)
):
classification = "controlled_ci_runner_saturation_guarded"
severity = "critical" if active_ci_oldest_age >= ci_stale_age_seconds else "warning"
controlled_apply_allowed = active_ci_oldest_age >= ci_stale_age_seconds
dry_run_command = (
"ops/runner/read-public-gitea-actions-queue.py --json "
"&& ops/runner/check-awoooi-non110-runner-readiness.sh"
)
controlled_apply_command = (
"keep_110_runner_pressure_gate_fail_closed; "
"only cancel/drain stale Gitea Actions through runner verifier packet"
)
next_action = (
"prepare_runner_drain_or_cancel_packet_without_process_kill"
if controlled_apply_allowed
else "keep_pressure_gate_fail_closed_until_ci_load_clears"
)
elif load5_per_core > load5_per_core_threshold and swap_used_ratio >= 0.85:
classification = "blocked_memory_or_swap_pressure_requires_service_playbook"
severity = "critical"
next_action = "route_to_service_specific_memory_pressure_playbook"
elif load5_per_core > load5_per_core_threshold:
classification = "blocked_unknown_sustained_load_requires_source_specific_playbook"
severity = "critical"
dry_run_command = (
"scripts/ops/host-sustained-load-evidence.py "
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
"--docker-stats-file /home/wooo/node_exporter_textfiles/docker_stats.prom "
"--json"
)
next_action = "collect_sanitized_top_process_and_container_stats_then_select_playbook"
return {
"schema_version": SCHEMA_VERSION,
"host": host,
"mode": "read_only_control_packet",
"classification": classification,
"severity": severity,
"controlled_apply_allowed": controlled_apply_allowed,
"next_action": next_action,
"readback": {
"monitor_up": monitor_up,
"load5_per_core": round(load5_per_core, 6),
"load5_per_core_threshold": load5_per_core_threshold,
"swap_used_ratio": round(swap_used_ratio, 6),
"remediation_authorized": remediation_authorized,
"active_ci_container_count": active_ci_containers,
"active_ci_process_group_count": active_ci_groups,
"active_ci_process_cpu_percent": round(active_ci_cpu, 3),
"active_ci_oldest_age_seconds": active_ci_oldest_age,
"top_orphan_rule": top_orphan,
},
"commands": {
"dry_run": dry_run_command,
"controlled_apply": controlled_apply_command,
"post_apply_verifier": verifier_command,
"rollback": "send SIGTERM only; no persistent host mutation. Re-run workload if needed.",
},
"operation_boundaries": {
"secret_value_read": False,
"raw_session_read": False,
"raw_runner_registration_read": False,
"host_write_performed": False,
"process_signal_performed": False,
"docker_restart_allowed": False,
"systemd_restart_allowed": False,
"firewall_change_allowed": False,
"critical_break_glass_required": True,
},
"forbidden_actions": [
"SIGKILL",
"docker_restart",
"systemctl_restart",
"nginx_reload",
"firewall_change",
"kubectl_action",
"secret_read",
"legacy_or_generic_runner_restore",
],
}
def main() -> int:
args = parse_args()
try:
text = args.metrics_file.read_text(encoding="utf-8")
samples = parse_prometheus_text(text)
except FileNotFoundError:
samples = []
packet = build_packet(
host=args.host,
samples=samples,
load5_per_core_threshold=args.load5_per_core_threshold,
ci_stale_age_seconds=args.ci_stale_age_seconds,
)
if args.json:
print(json.dumps(packet, ensure_ascii=False, indent=2, sort_keys=True))
else:
print(f"status={packet['classification']}")
print(f"controlled_apply_allowed={str(packet['controlled_apply_allowed']).lower()}")
print(f"next_action={packet['next_action']}")
if packet["commands"]["dry_run"]:
print(f"dry_run_command={packet['commands']['dry_run']}")
if packet["commands"]["controlled_apply"]:
print(f"controlled_apply_command={packet['commands']['controlled_apply']}")
print(f"post_apply_verifier={packet['commands']['post_apply_verifier']}")
return 0 if not packet["classification"].startswith("blocked_") else 75
if __name__ == "__main__":
raise SystemExit(main())