feat(agent): automate sustained host load response

2026-07-01 08:43:40 +08:00
parent 5e629efa44
commit a6dc806d38
10 changed files with 1285 additions and 40 deletions
--- a/apps/api/src/services/ai_agent_autonomous_runtime_control.py
+++ b/apps/api/src/services/ai_agent_autonomous_runtime_control.py
@@ -1459,6 +1459,160 @@ def _build_alert_noise_reduction_readback(
    }


+def _build_host_sustained_load_controlled_automation_readback() -> dict[str, Any]:
+    """Return the static readback dict (action classes, required assets, control flow, operation boundaries, rollups) for the sustained CPU/load automation lane; takes no arguments and builds everything from literals."""
+
+    action_classes = [  # one dict per root-cause class; every entry carries the same eight keys
+        {
+            "class_id": "orphan_browser_smoke_runaway_process",
+            "alertnames": [
+                "HostLoadAverageSustainedHigh",
+                "HostOrphanBrowserSmokeHighCpu",
+            ],
+            "classifier": "host-sustained-load-controller.py:controlled_orphan_browser_remediation_ready",
+            "controlled_action": "host-runaway-process-remediation.py dry-run then gated SIGTERM",
+            "controlled_apply_allowed": True,  # counted in rollups["controlled_apply_class_count"]
+            "post_apply_verifier": "host-sustained-load-controller.py --json",
+            "rollback": "no persistent host mutation; workload can be re-run",
+            "forbidden_actions": [
+                "SIGKILL",
+                "docker_restart",
+                "systemctl_restart",
+                "nginx_reload",
+                "firewall_change",
+                "reboot",
+            ],
+        },
+        {
+            "class_id": "ci_runner_load_saturation",
+            "alertnames": [
+                "HostLoadAverageSustainedHigh",
+                "HostCiRunnerLoadSaturation",
+            ],
+            "classifier": "host-sustained-load-controller.py:controlled_ci_runner_saturation_guarded",
+            "controlled_action": "keep runner pressure gate fail-closed; prepare stale-run drain/cancel packet only after queue verifier",
+            "controlled_apply_allowed": True,  # counted in rollups["controlled_apply_class_count"]
+            "post_apply_verifier": "read-public-gitea-actions-queue.py + non110/110 runner readiness verifier",
+            "rollback": "do not restore legacy or generic runner labels; re-run CD after pressure clears",
+            "forbidden_actions": [
+                "legacy_runner_restore",
+                "generic_runner_label_restore",
+                "process_kill_for_legitimate_ci",
+                "warn_only_pressure_gate",
+            ],
+        },
+        {
+            "class_id": "memory_or_swap_pressure",
+            "alertnames": ["HostLoadAverageSustainedHigh", "HostOutOfMemory"],
+            "classifier": "host-sustained-load-controller.py:blocked_memory_or_swap_pressure_requires_service_playbook",
+            "controlled_action": "route to service-specific memory/cgroup playbook with check-mode diff",
+            "controlled_apply_allowed": False,  # not counted in rollups["controlled_apply_class_count"]
+            "post_apply_verifier": "service-specific health and load readback",
+            "rollback": "service-specific resource rollback",
+            "forbidden_actions": [
+                "blind_limit_reduction",
+                "docker_restart_without_service_playbook",
+                "destructive_prune",
+            ],
+        },
+        {
+            "class_id": "unknown_sustained_load",
+            "alertnames": ["HostLoadAverageSustainedHigh"],
+            "classifier": "host-sustained-load-controller.py:blocked_unknown_sustained_load_requires_source_specific_playbook",
+            "controlled_action": "run host-sustained-load-evidence.py then select or generate a source-specific PlayBook",
+            "controlled_apply_allowed": False,  # not counted in rollups["controlled_apply_class_count"]
+            "post_apply_verifier": "host-sustained-load-evidence.py readback plus source-specific verifier before closure",
+            "rollback": "source-specific rollback required before apply",
+            "forbidden_actions": [
+                "generic_kill",
+                "generic_docker_restart",
+                "generic_systemd_restart",
+                "secret_collection",
+            ],
+        },
+    ]
+    required_assets = [  # NOTE(review): every "ready" flag is a literal True; nothing here probes the paths at call time
+        {
+            "asset_id": "host_sustained_load_controller",
+            "path": "scripts/ops/host-sustained-load-controller.py",
+            "purpose": "classify sustained load and emit the controlled automation packet",
+            "ready": True,
+        },
+        {
+            "asset_id": "host_sustained_load_sanitized_evidence",
+            "path": "scripts/ops/host-sustained-load-evidence.py",
+            "purpose": "collect sanitized process-family and container evidence for source-specific PlayBooks",
+            "ready": True,
+        },
+        {
+            "asset_id": "host_runaway_process_exporter",
+            "path": "scripts/ops/host-runaway-process-exporter.py",
+            "purpose": "publish read-only load/root-cause metrics",
+            "ready": True,
+        },
+        {
+            "asset_id": "orphan_browser_remediation_helper",
+            "path": "scripts/ops/host-runaway-process-remediation.py",
+            "purpose": "dry-run and controlled SIGTERM for allowlisted orphan browser process groups",
+            "ready": True,
+        },
+        {
+            "asset_id": "prometheus_alert_route",
+            "path": "ops/monitoring/alerts-unified.yml:HostLoadAverageSustainedHigh",
+            "purpose": "route sustained load alerts to the controller instead of generic SSH top",
+            "ready": True,
+        },
+        {
+            "asset_id": "ai_agent_work_item_readback",
+            "path": "/api/v1/agents/agent-autonomous-runtime-control",
+            "purpose": "make this lane visible in work_item_progress and rollups",
+            "ready": True,
+        },
+    ]
+    return {
+        "schema_version": "host_sustained_load_controlled_automation_readback_v1",  # compared verbatim in _build_work_item_progress
+        "status": "completed",  # constant; not derived from the asset "ready" flags above
+        "current_work_item_id": "P1-D2-host-sustained-load-controlled-automation",  # same id as this lane's work_item_progress entry
+        "problem_statement": (
+            "HostLoadAverageSustainedHigh must not stop at alerting; it must "
+            "classify root cause, produce a controlled action packet, run a "
+            "post-apply verifier, and write back learning evidence."
+        ),
+        "action_classes": action_classes,
+        "required_assets": required_assets,
+        "control_flow": [  # stage names listed in order, from alert intake to writeback
+            "alert_received",
+            "read_textfile_metrics",
+            "classify_root_cause",
+            "emit_controlled_packet",
+            "dry_run_or_check_mode",
+            "controlled_apply_when_allowlisted",
+            "post_apply_verifier",
+            "km_playbook_telegram_receipt_writeback",
+        ],
+        "operation_boundaries": {  # declarative flags only; this function does not enforce them
+            "executes_on_read": False,
+            "secret_value_read": False,
+            "raw_session_read": False,
+            "raw_runner_registration_read": False,
+            "critical_break_glass_still_required": True,
+            "legacy_runner_restore_allowed": False,
+            "generic_runner_label_restore_allowed": False,
+        },
+        "rollups": {  # computed from the two lists above rather than hard-coded
+            "action_class_count": len(action_classes),
+            "controlled_apply_class_count": sum(
+                1 for item in action_classes if item["controlled_apply_allowed"] is True  # identity check: only the bool True counts
+            ),
+            "required_asset_count": len(required_assets),
+            "ready_asset_count": sum(1 for item in required_assets if item["ready"] is True),
+            "forbidden_action_count": sum(  # summed per class, so a name repeated across classes would count once per class
+                len(item["forbidden_actions"]) for item in action_classes
+            ),
+        },
+    }
+
+
 def _build_ui_productization_readback() -> dict[str, Any]:
    """Expose the concrete AwoooP product UI surfaces used to track this work."""

@@ -1739,6 +1893,7 @@ def _build_work_item_progress(
    agent_decision_wiring: Mapping[str, Any],
    learning_loop: Mapping[str, Any],
    alert_noise_reduction: Mapping[str, Any],
+    host_sustained_load_automation: Mapping[str, Any],
    ui_productization: Mapping[str, Any],
    multi_product_taxonomy: Mapping[str, Any],
    db_read_status: str,
@@ -1784,6 +1939,17 @@ def _build_work_item_progress(
        and alert_noise_reduction.get("status") == "completed"
        and alert_noise_missing == 0
    )
+    host_load_rollups = host_sustained_load_automation.get("rollups")
+    if not isinstance(host_load_rollups, Mapping):
+        host_load_rollups = {}
+    host_load_ready = (
+        host_sustained_load_automation.get("schema_version")
+        == "host_sustained_load_controlled_automation_readback_v1"
+        and host_sustained_load_automation.get("status") == "completed"
+        and _int_value(host_load_rollups.get("required_asset_count"))
+        == _int_value(host_load_rollups.get("ready_asset_count"))
+        and _int_value(host_load_rollups.get("controlled_apply_class_count")) >= 1
+    )
    log_executor_rollups = log_controlled_writeback_executor.get("rollups")
    if not isinstance(log_executor_rollups, Mapping):
        log_executor_rollups = {}
@@ -1911,11 +2077,23 @@ def _build_work_item_progress(
            "exit_criteria": "repeated alerts are clustered, deduped, routed to controlled automation, and no longer default to manual handling",
            "remaining_alert_noise_stage_count": alert_noise_missing,
        },
+        {
+            "work_item_id": "P1-D2-host-sustained-load-controlled-automation",
+            "priority": "P1-D2",
+            "title": "CPU sustained-load alerts classify and run AI controlled remediation",
+            "status": "completed" if host_load_ready else "in_progress" if p1d_completed else "pending",
+            "exit_criteria": "HostLoadAverageSustainedHigh routes to classifier, dry-run/check-mode, controlled apply packet, verifier, and KM/PlayBook writeback",
+            "controlled_action_class_count": _int_value(
+                host_load_rollups.get("controlled_apply_class_count")
+            ),
+            "ready_asset_count": _int_value(host_load_rollups.get("ready_asset_count")),
+            "required_asset_count": _int_value(host_load_rollups.get("required_asset_count")),
+        },
        {
            "work_item_id": "P1-E-log-controlled-writeback-executor",
            "priority": "P1-E",
            "title": "LOG feedback executor queue for KM / RAG / MCP / PlayBook",
-            "status": "completed" if log_executor_ready else "in_progress" if p1d_completed else "pending",
+            "status": "completed" if log_executor_ready else "in_progress" if host_load_ready else "pending",
            "exit_criteria": "executor readback exposes ready batches, target selectors, source diffs, rollback, verifier, and next-action queue",
            "remaining_executor_batch_count": max(
                0,
@@ -2845,6 +3023,9 @@ def build_runtime_receipt_readback_from_rows(
        agent_decision_wiring=agent_decision_wiring,
        learning_loop=learning_loop,
    )
+    host_sustained_load_automation = (
+        _build_host_sustained_load_controlled_automation_readback()
+    )
    ui_productization = _build_ui_productization_readback()
    multi_product_taxonomy = _build_multi_product_taxonomy_contract(log_integration_taxonomy)
    log_controlled_writeback_executor = _load_log_controlled_writeback_executor_readback()
@@ -2860,6 +3041,7 @@ def build_runtime_receipt_readback_from_rows(
        agent_decision_wiring=agent_decision_wiring,
        learning_loop=learning_loop,
        alert_noise_reduction=alert_noise_reduction,
+        host_sustained_load_automation=host_sustained_load_automation,
        ui_productization=ui_productization,
        multi_product_taxonomy=multi_product_taxonomy,
        db_read_status=db_read_status,
@@ -2988,6 +3170,7 @@ def build_runtime_receipt_readback_from_rows(
        "agent_decision_wiring": agent_decision_wiring,
        "learning_loop": learning_loop,
        "alert_noise_reduction": alert_noise_reduction,
+        "host_sustained_load_automation": host_sustained_load_automation,
        "ui_productization": ui_productization,
        "multi_product_taxonomy": multi_product_taxonomy,
        "work_item_progress": work_item_progress,
@@ -3281,6 +3464,27 @@ def _attach_runtime_receipt_readback(
                "controlled_route_total"
            )
        ),
+        "live_host_sustained_load_action_class_count": _int_value(
+            ((readback.get("host_sustained_load_automation") or {}).get("rollups") or {}).get(
+                "action_class_count"
+            )
+        ),
+        "live_host_sustained_load_controlled_apply_class_count": _int_value(
+            ((readback.get("host_sustained_load_automation") or {}).get("rollups") or {}).get(
+                "controlled_apply_class_count"
+            )
+        ),
+        "live_host_sustained_load_ready_asset_count": _int_value(
+            ((readback.get("host_sustained_load_automation") or {}).get("rollups") or {}).get(
+                "ready_asset_count"
+            )
+        ),
+        "live_host_sustained_load_complete_count": (
+            1
+            if (readback.get("host_sustained_load_automation") or {}).get("status")
+            == "completed"
+            else 0
+        ),
        "live_ui_productization_surface_count": _int_value(
            ((readback.get("ui_productization") or {}).get("rollups") or {}).get(
                "surface_count"