fix(ops): route container pressure alerts to controller
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 1s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / tests (push) Has been cancelled

This commit is contained in:
Your Name
2026-07-01 23:38:34 +08:00
parent d658f03ac5
commit fffebf9597
6 changed files with 85 additions and 7 deletions

View File

@@ -49,6 +49,7 @@ def parse_args() -> argparse.Namespace:
default=DEFAULT_DOCKER_STATS_MAX_AGE_SECONDS,
)
parser.add_argument("--load5-per-core-threshold", type=float, default=1.5)
parser.add_argument("--container-cpu-threshold", type=float, default=2.0)
parser.add_argument("--ci-stale-age-seconds", type=int, default=1800)
parser.add_argument("--json", action="store_true", help="Print JSON only.")
return parser.parse_args()
@@ -217,6 +218,7 @@ def build_packet(
docker_samples: list[dict[str, Any]],
docker_stats_status: dict[str, Any],
load5_per_core_threshold: float,
container_cpu_threshold: float,
ci_stale_age_seconds: int,
) -> dict[str, Any]:
monitor_up = int(
@@ -327,19 +329,27 @@ def build_packet(
if controlled_apply_allowed
else "keep_pressure_gate_fail_closed_until_ci_load_clears"
)
elif (
load5_per_core > load5_per_core_threshold
and top_container_name == "gitea"
and top_container_cpu >= 2.0
):
elif top_container_name == "gitea" and top_container_cpu >= container_cpu_threshold:
classification = "blocked_gitea_queue_or_hook_backlog_requires_playbook"
severity = "critical"
severity = "critical" if load5_per_core > load5_per_core_threshold else "warning"
dry_run_command = (
"scripts/ops/host-sustained-load-evidence.py "
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
)
next_action = "run_gitea_queue_or_hook_backlog_playbook_check_mode"
elif (
top_container_name in {"stockplatform-v2-postgres-1", "stockplatform-v2-api-1"}
and top_container_cpu >= container_cpu_threshold
):
classification = "blocked_stockplatform_hot_query_or_api_pressure_requires_playbook"
severity = "critical" if load5_per_core > load5_per_core_threshold else "warning"
dry_run_command = (
"scripts/ops/host-sustained-load-evidence.py "
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
)
next_action = "run_stockplatform_hot_query_or_api_pressure_playbook_check_mode"
elif load5_per_core > load5_per_core_threshold and swap_used_ratio >= 0.85:
classification = "blocked_memory_or_swap_pressure_requires_service_playbook"
severity = "critical"
@@ -367,6 +377,7 @@ def build_packet(
"monitor_up": monitor_up,
"load5_per_core": round(load5_per_core, 6),
"load5_per_core_threshold": load5_per_core_threshold,
"container_cpu_threshold": container_cpu_threshold,
"swap_used_ratio": round(swap_used_ratio, 6),
"remediation_authorized": remediation_authorized,
"active_ci_container_count": active_ci_containers,
@@ -430,6 +441,7 @@ def main() -> int:
max_age_seconds=args.docker_stats_max_age_seconds,
),
load5_per_core_threshold=args.load5_per_core_threshold,
container_cpu_threshold=args.container_cpu_threshold,
ci_stale_age_seconds=args.ci_stale_age_seconds,
)
if args.json:

View File

@@ -36,6 +36,7 @@ def test_110_moderate_pressure_alert_routes_to_live_controller() -> None:
assert rule["labels"]["auto_repair"] == "true"
assert "/home/wooo/scripts/host-sustained-load-controller.py" in action
assert "--load5-per-core-threshold 0.75" in action
assert "--container-cpu-threshold 2.0" in action
assert "不讀 secret" in annotations["runbook"]
assert "禁止 Docker / systemd / Nginx / DB restart" in annotations["runbook"]

View File

@@ -425,6 +425,62 @@ def test_sustained_load_controller_routes_gitea_backlog_from_docker_metrics(tmp_
assert "host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
def test_sustained_load_controller_routes_gitea_quota_pressure_even_when_load_is_moderate(
    tmp_path: Path,
) -> None:
    """Check gitea CPU pressure is routed when host load is below its threshold.

    The host fixture reports a load5-per-core of 0.55, under the 0.75
    threshold passed on the command line, while the docker fixture reports
    the gitea container at 2.08 CPU cores. No ``--container-cpu-threshold``
    flag is passed; the readback is expected to report 2.0 for it.

    The controller run must exit with status 75 and emit a JSON packet that
    carries the gitea backlog classification at "warning" severity.
    """
    # Host-level samples: monitor is up, load and swap are low, no CI activity.
    host_metric_lines = (
        'awoooi_host_runaway_process_monitor_up{host="110",mode="read_only"} 1',
        'awoooi_host_load5_per_core{host="110"} 0.55',
        'awoooi_host_swap_used_ratio{host="110"} 0.1',
        'awoooi_host_runaway_process_remediation_authorized{host="110"} 0',
        'awoooi_host_gitea_actions_active_container_count{host="110"} 0',
        'awoooi_host_gitea_actions_active_process_group_count{host="110"} 0',
        'awoooi_host_runaway_browser_orphan_group_count{host="110",rule="stockplatform_headless_smoke",min_age_seconds="1800",min_cpu_percent="50"} 0',
    )
    metrics_file = tmp_path / "host.prom"
    metrics_file.write_text("\n".join(host_metric_lines), encoding="utf-8")

    # Per-container samples: gitea is the top CPU consumer by a wide margin.
    container_metric_lines = (
        'docker_container_cpu_cores{host="110",container_name="gitea"} 2.08',
        'docker_container_cpu_cores{host="110",container_name="redis"} 0.2',
    )
    docker_file = tmp_path / "docker.prom"
    docker_file.write_text("\n".join(container_metric_lines), encoding="utf-8")

    controller_command = [
        sys.executable,
        str(CONTROLLER_PATH),
        "--host",
        "110",
        "--load5-per-core-threshold",
        "0.75",
        "--metrics-file",
        str(metrics_file),
        "--docker-stats-file",
        str(docker_file),
        "--json",
    ]
    result = subprocess.run(controller_command, capture_output=True, text=True)

    assert result.returncode == 75
    payload = json.loads(result.stdout)
    assert payload["classification"] == "blocked_gitea_queue_or_hook_backlog_requires_playbook"
    assert payload["severity"] == "warning"
    readback = payload["readback"]
    assert readback["container_cpu_threshold"] == 2.0
    assert readback["top_container_cpu"]["cpu_cores"] == 2.08
    assert "host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
def test_sustained_load_controller_ignores_stale_docker_stats_attribution(tmp_path: Path) -> None:
metrics_file = tmp_path / "host.prom"
metrics_file.write_text(