feat(agent): automate sustained host load response
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 28s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped

This commit is contained in:
Your Name
2026-07-01 08:43:40 +08:00
parent 5e629efa44
commit a6dc806d38
10 changed files with 1285 additions and 40 deletions

View File

@@ -1459,6 +1459,160 @@ def _build_alert_noise_reduction_readback(
}
def _build_host_sustained_load_controlled_automation_readback() -> dict[str, Any]:
"""Expose the sustained CPU/load automation contract as a first-class lane."""
action_classes = [
{
"class_id": "orphan_browser_smoke_runaway_process",
"alertnames": [
"HostLoadAverageSustainedHigh",
"HostOrphanBrowserSmokeHighCpu",
],
"classifier": "host-sustained-load-controller.py:controlled_orphan_browser_remediation_ready",
"controlled_action": "host-runaway-process-remediation.py dry-run then gated SIGTERM",
"controlled_apply_allowed": True,
"post_apply_verifier": "host-sustained-load-controller.py --json",
"rollback": "no persistent host mutation; workload can be re-run",
"forbidden_actions": [
"SIGKILL",
"docker_restart",
"systemctl_restart",
"nginx_reload",
"firewall_change",
"reboot",
],
},
{
"class_id": "ci_runner_load_saturation",
"alertnames": [
"HostLoadAverageSustainedHigh",
"HostCiRunnerLoadSaturation",
],
"classifier": "host-sustained-load-controller.py:controlled_ci_runner_saturation_guarded",
"controlled_action": "keep runner pressure gate fail-closed; prepare stale-run drain/cancel packet only after queue verifier",
"controlled_apply_allowed": True,
"post_apply_verifier": "read-public-gitea-actions-queue.py + non110/110 runner readiness verifier",
"rollback": "do not restore legacy or generic runner labels; re-run CD after pressure clears",
"forbidden_actions": [
"legacy_runner_restore",
"generic_runner_label_restore",
"process_kill_for_legitimate_ci",
"warn_only_pressure_gate",
],
},
{
"class_id": "memory_or_swap_pressure",
"alertnames": ["HostLoadAverageSustainedHigh", "HostOutOfMemory"],
"classifier": "host-sustained-load-controller.py:blocked_memory_or_swap_pressure_requires_service_playbook",
"controlled_action": "route to service-specific memory/cgroup playbook with check-mode diff",
"controlled_apply_allowed": False,
"post_apply_verifier": "service-specific health and load readback",
"rollback": "service-specific resource rollback",
"forbidden_actions": [
"blind_limit_reduction",
"docker_restart_without_service_playbook",
"destructive_prune",
],
},
{
"class_id": "unknown_sustained_load",
"alertnames": ["HostLoadAverageSustainedHigh"],
"classifier": "host-sustained-load-controller.py:blocked_unknown_sustained_load_requires_source_specific_playbook",
"controlled_action": "run host-sustained-load-evidence.py then select or generate a source-specific PlayBook",
"controlled_apply_allowed": False,
"post_apply_verifier": "host-sustained-load-evidence.py readback plus source-specific verifier before closure",
"rollback": "source-specific rollback required before apply",
"forbidden_actions": [
"generic_kill",
"generic_docker_restart",
"generic_systemd_restart",
"secret_collection",
],
},
]
required_assets = [
{
"asset_id": "host_sustained_load_controller",
"path": "scripts/ops/host-sustained-load-controller.py",
"purpose": "classify sustained load and emit the controlled automation packet",
"ready": True,
},
{
"asset_id": "host_sustained_load_sanitized_evidence",
"path": "scripts/ops/host-sustained-load-evidence.py",
"purpose": "collect sanitized process-family and container evidence for source-specific PlayBooks",
"ready": True,
},
{
"asset_id": "host_runaway_process_exporter",
"path": "scripts/ops/host-runaway-process-exporter.py",
"purpose": "publish read-only load/root-cause metrics",
"ready": True,
},
{
"asset_id": "orphan_browser_remediation_helper",
"path": "scripts/ops/host-runaway-process-remediation.py",
"purpose": "dry-run and controlled SIGTERM for allowlisted orphan browser process groups",
"ready": True,
},
{
"asset_id": "prometheus_alert_route",
"path": "ops/monitoring/alerts-unified.yml:HostLoadAverageSustainedHigh",
"purpose": "route sustained load alerts to the controller instead of generic SSH top",
"ready": True,
},
{
"asset_id": "ai_agent_work_item_readback",
"path": "/api/v1/agents/agent-autonomous-runtime-control",
"purpose": "make this lane visible in work_item_progress and rollups",
"ready": True,
},
]
return {
"schema_version": "host_sustained_load_controlled_automation_readback_v1",
"status": "completed",
"current_work_item_id": "P1-D2-host-sustained-load-controlled-automation",
"problem_statement": (
"HostLoadAverageSustainedHigh must not stop at alerting; it must "
"classify root cause, produce a controlled action packet, run a "
"post-apply verifier, and write back learning evidence."
),
"action_classes": action_classes,
"required_assets": required_assets,
"control_flow": [
"alert_received",
"read_textfile_metrics",
"classify_root_cause",
"emit_controlled_packet",
"dry_run_or_check_mode",
"controlled_apply_when_allowlisted",
"post_apply_verifier",
"km_playbook_telegram_receipt_writeback",
],
"operation_boundaries": {
"executes_on_read": False,
"secret_value_read": False,
"raw_session_read": False,
"raw_runner_registration_read": False,
"critical_break_glass_still_required": True,
"legacy_runner_restore_allowed": False,
"generic_runner_label_restore_allowed": False,
},
"rollups": {
"action_class_count": len(action_classes),
"controlled_apply_class_count": sum(
1 for item in action_classes if item["controlled_apply_allowed"] is True
),
"required_asset_count": len(required_assets),
"ready_asset_count": sum(1 for item in required_assets if item["ready"] is True),
"forbidden_action_count": sum(
len(item["forbidden_actions"]) for item in action_classes
),
},
}
def _build_ui_productization_readback() -> dict[str, Any]:
"""Expose the concrete AwoooP product UI surfaces used to track this work."""
@@ -1739,6 +1893,7 @@ def _build_work_item_progress(
agent_decision_wiring: Mapping[str, Any],
learning_loop: Mapping[str, Any],
alert_noise_reduction: Mapping[str, Any],
host_sustained_load_automation: Mapping[str, Any],
ui_productization: Mapping[str, Any],
multi_product_taxonomy: Mapping[str, Any],
db_read_status: str,
@@ -1784,6 +1939,17 @@ def _build_work_item_progress(
and alert_noise_reduction.get("status") == "completed"
and alert_noise_missing == 0
)
host_load_rollups = host_sustained_load_automation.get("rollups")
if not isinstance(host_load_rollups, Mapping):
host_load_rollups = {}
host_load_ready = (
host_sustained_load_automation.get("schema_version")
== "host_sustained_load_controlled_automation_readback_v1"
and host_sustained_load_automation.get("status") == "completed"
and _int_value(host_load_rollups.get("required_asset_count"))
== _int_value(host_load_rollups.get("ready_asset_count"))
and _int_value(host_load_rollups.get("controlled_apply_class_count")) >= 1
)
log_executor_rollups = log_controlled_writeback_executor.get("rollups")
if not isinstance(log_executor_rollups, Mapping):
log_executor_rollups = {}
@@ -1911,11 +2077,23 @@ def _build_work_item_progress(
"exit_criteria": "repeated alerts are clustered, deduped, routed to controlled automation, and no longer default to manual handling",
"remaining_alert_noise_stage_count": alert_noise_missing,
},
{
"work_item_id": "P1-D2-host-sustained-load-controlled-automation",
"priority": "P1-D2",
"title": "CPU sustained-load alerts classify and run AI controlled remediation",
"status": "completed" if host_load_ready else "in_progress" if p1d_completed else "pending",
"exit_criteria": "HostLoadAverageSustainedHigh routes to classifier, dry-run/check-mode, controlled apply packet, verifier, and KM/PlayBook writeback",
"controlled_action_class_count": _int_value(
host_load_rollups.get("controlled_apply_class_count")
),
"ready_asset_count": _int_value(host_load_rollups.get("ready_asset_count")),
"required_asset_count": _int_value(host_load_rollups.get("required_asset_count")),
},
{
"work_item_id": "P1-E-log-controlled-writeback-executor",
"priority": "P1-E",
"title": "LOG feedback executor queue for KM / RAG / MCP / PlayBook",
"status": "completed" if log_executor_ready else "in_progress" if p1d_completed else "pending",
"status": "completed" if log_executor_ready else "in_progress" if host_load_ready else "pending",
"exit_criteria": "executor readback exposes ready batches, target selectors, source diffs, rollback, verifier, and next-action queue",
"remaining_executor_batch_count": max(
0,
@@ -2845,6 +3023,9 @@ def build_runtime_receipt_readback_from_rows(
agent_decision_wiring=agent_decision_wiring,
learning_loop=learning_loop,
)
host_sustained_load_automation = (
_build_host_sustained_load_controlled_automation_readback()
)
ui_productization = _build_ui_productization_readback()
multi_product_taxonomy = _build_multi_product_taxonomy_contract(log_integration_taxonomy)
log_controlled_writeback_executor = _load_log_controlled_writeback_executor_readback()
@@ -2860,6 +3041,7 @@ def build_runtime_receipt_readback_from_rows(
agent_decision_wiring=agent_decision_wiring,
learning_loop=learning_loop,
alert_noise_reduction=alert_noise_reduction,
host_sustained_load_automation=host_sustained_load_automation,
ui_productization=ui_productization,
multi_product_taxonomy=multi_product_taxonomy,
db_read_status=db_read_status,
@@ -2988,6 +3170,7 @@ def build_runtime_receipt_readback_from_rows(
"agent_decision_wiring": agent_decision_wiring,
"learning_loop": learning_loop,
"alert_noise_reduction": alert_noise_reduction,
"host_sustained_load_automation": host_sustained_load_automation,
"ui_productization": ui_productization,
"multi_product_taxonomy": multi_product_taxonomy,
"work_item_progress": work_item_progress,
@@ -3281,6 +3464,27 @@ def _attach_runtime_receipt_readback(
"controlled_route_total"
)
),
"live_host_sustained_load_action_class_count": _int_value(
((readback.get("host_sustained_load_automation") or {}).get("rollups") or {}).get(
"action_class_count"
)
),
"live_host_sustained_load_controlled_apply_class_count": _int_value(
((readback.get("host_sustained_load_automation") or {}).get("rollups") or {}).get(
"controlled_apply_class_count"
)
),
"live_host_sustained_load_ready_asset_count": _int_value(
((readback.get("host_sustained_load_automation") or {}).get("rollups") or {}).get(
"ready_asset_count"
)
),
"live_host_sustained_load_complete_count": (
1
if (readback.get("host_sustained_load_automation") or {}).get("status")
== "completed"
else 0
),
"live_ui_productization_surface_count": _int_value(
((readback.get("ui_productization") or {}).get("rollups") or {}).get(
"surface_count"