feat(agent): automate sustained host load response
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 28s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 28s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
This commit is contained in:
@@ -1459,6 +1459,160 @@ def _build_alert_noise_reduction_readback(
|
||||
}
|
||||
|
||||
|
||||
def _build_host_sustained_load_controlled_automation_readback() -> dict[str, Any]:
|
||||
"""Expose the sustained CPU/load automation contract as a first-class lane."""
|
||||
|
||||
action_classes = [
|
||||
{
|
||||
"class_id": "orphan_browser_smoke_runaway_process",
|
||||
"alertnames": [
|
||||
"HostLoadAverageSustainedHigh",
|
||||
"HostOrphanBrowserSmokeHighCpu",
|
||||
],
|
||||
"classifier": "host-sustained-load-controller.py:controlled_orphan_browser_remediation_ready",
|
||||
"controlled_action": "host-runaway-process-remediation.py dry-run then gated SIGTERM",
|
||||
"controlled_apply_allowed": True,
|
||||
"post_apply_verifier": "host-sustained-load-controller.py --json",
|
||||
"rollback": "no persistent host mutation; workload can be re-run",
|
||||
"forbidden_actions": [
|
||||
"SIGKILL",
|
||||
"docker_restart",
|
||||
"systemctl_restart",
|
||||
"nginx_reload",
|
||||
"firewall_change",
|
||||
"reboot",
|
||||
],
|
||||
},
|
||||
{
|
||||
"class_id": "ci_runner_load_saturation",
|
||||
"alertnames": [
|
||||
"HostLoadAverageSustainedHigh",
|
||||
"HostCiRunnerLoadSaturation",
|
||||
],
|
||||
"classifier": "host-sustained-load-controller.py:controlled_ci_runner_saturation_guarded",
|
||||
"controlled_action": "keep runner pressure gate fail-closed; prepare stale-run drain/cancel packet only after queue verifier",
|
||||
"controlled_apply_allowed": True,
|
||||
"post_apply_verifier": "read-public-gitea-actions-queue.py + non110/110 runner readiness verifier",
|
||||
"rollback": "do not restore legacy or generic runner labels; re-run CD after pressure clears",
|
||||
"forbidden_actions": [
|
||||
"legacy_runner_restore",
|
||||
"generic_runner_label_restore",
|
||||
"process_kill_for_legitimate_ci",
|
||||
"warn_only_pressure_gate",
|
||||
],
|
||||
},
|
||||
{
|
||||
"class_id": "memory_or_swap_pressure",
|
||||
"alertnames": ["HostLoadAverageSustainedHigh", "HostOutOfMemory"],
|
||||
"classifier": "host-sustained-load-controller.py:blocked_memory_or_swap_pressure_requires_service_playbook",
|
||||
"controlled_action": "route to service-specific memory/cgroup playbook with check-mode diff",
|
||||
"controlled_apply_allowed": False,
|
||||
"post_apply_verifier": "service-specific health and load readback",
|
||||
"rollback": "service-specific resource rollback",
|
||||
"forbidden_actions": [
|
||||
"blind_limit_reduction",
|
||||
"docker_restart_without_service_playbook",
|
||||
"destructive_prune",
|
||||
],
|
||||
},
|
||||
{
|
||||
"class_id": "unknown_sustained_load",
|
||||
"alertnames": ["HostLoadAverageSustainedHigh"],
|
||||
"classifier": "host-sustained-load-controller.py:blocked_unknown_sustained_load_requires_source_specific_playbook",
|
||||
"controlled_action": "run host-sustained-load-evidence.py then select or generate a source-specific PlayBook",
|
||||
"controlled_apply_allowed": False,
|
||||
"post_apply_verifier": "host-sustained-load-evidence.py readback plus source-specific verifier before closure",
|
||||
"rollback": "source-specific rollback required before apply",
|
||||
"forbidden_actions": [
|
||||
"generic_kill",
|
||||
"generic_docker_restart",
|
||||
"generic_systemd_restart",
|
||||
"secret_collection",
|
||||
],
|
||||
},
|
||||
]
|
||||
required_assets = [
|
||||
{
|
||||
"asset_id": "host_sustained_load_controller",
|
||||
"path": "scripts/ops/host-sustained-load-controller.py",
|
||||
"purpose": "classify sustained load and emit the controlled automation packet",
|
||||
"ready": True,
|
||||
},
|
||||
{
|
||||
"asset_id": "host_sustained_load_sanitized_evidence",
|
||||
"path": "scripts/ops/host-sustained-load-evidence.py",
|
||||
"purpose": "collect sanitized process-family and container evidence for source-specific PlayBooks",
|
||||
"ready": True,
|
||||
},
|
||||
{
|
||||
"asset_id": "host_runaway_process_exporter",
|
||||
"path": "scripts/ops/host-runaway-process-exporter.py",
|
||||
"purpose": "publish read-only load/root-cause metrics",
|
||||
"ready": True,
|
||||
},
|
||||
{
|
||||
"asset_id": "orphan_browser_remediation_helper",
|
||||
"path": "scripts/ops/host-runaway-process-remediation.py",
|
||||
"purpose": "dry-run and controlled SIGTERM for allowlisted orphan browser process groups",
|
||||
"ready": True,
|
||||
},
|
||||
{
|
||||
"asset_id": "prometheus_alert_route",
|
||||
"path": "ops/monitoring/alerts-unified.yml:HostLoadAverageSustainedHigh",
|
||||
"purpose": "route sustained load alerts to the controller instead of generic SSH top",
|
||||
"ready": True,
|
||||
},
|
||||
{
|
||||
"asset_id": "ai_agent_work_item_readback",
|
||||
"path": "/api/v1/agents/agent-autonomous-runtime-control",
|
||||
"purpose": "make this lane visible in work_item_progress and rollups",
|
||||
"ready": True,
|
||||
},
|
||||
]
|
||||
return {
|
||||
"schema_version": "host_sustained_load_controlled_automation_readback_v1",
|
||||
"status": "completed",
|
||||
"current_work_item_id": "P1-D2-host-sustained-load-controlled-automation",
|
||||
"problem_statement": (
|
||||
"HostLoadAverageSustainedHigh must not stop at alerting; it must "
|
||||
"classify root cause, produce a controlled action packet, run a "
|
||||
"post-apply verifier, and write back learning evidence."
|
||||
),
|
||||
"action_classes": action_classes,
|
||||
"required_assets": required_assets,
|
||||
"control_flow": [
|
||||
"alert_received",
|
||||
"read_textfile_metrics",
|
||||
"classify_root_cause",
|
||||
"emit_controlled_packet",
|
||||
"dry_run_or_check_mode",
|
||||
"controlled_apply_when_allowlisted",
|
||||
"post_apply_verifier",
|
||||
"km_playbook_telegram_receipt_writeback",
|
||||
],
|
||||
"operation_boundaries": {
|
||||
"executes_on_read": False,
|
||||
"secret_value_read": False,
|
||||
"raw_session_read": False,
|
||||
"raw_runner_registration_read": False,
|
||||
"critical_break_glass_still_required": True,
|
||||
"legacy_runner_restore_allowed": False,
|
||||
"generic_runner_label_restore_allowed": False,
|
||||
},
|
||||
"rollups": {
|
||||
"action_class_count": len(action_classes),
|
||||
"controlled_apply_class_count": sum(
|
||||
1 for item in action_classes if item["controlled_apply_allowed"] is True
|
||||
),
|
||||
"required_asset_count": len(required_assets),
|
||||
"ready_asset_count": sum(1 for item in required_assets if item["ready"] is True),
|
||||
"forbidden_action_count": sum(
|
||||
len(item["forbidden_actions"]) for item in action_classes
|
||||
),
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _build_ui_productization_readback() -> dict[str, Any]:
|
||||
"""Expose the concrete AwoooP product UI surfaces used to track this work."""
|
||||
|
||||
@@ -1739,6 +1893,7 @@ def _build_work_item_progress(
|
||||
agent_decision_wiring: Mapping[str, Any],
|
||||
learning_loop: Mapping[str, Any],
|
||||
alert_noise_reduction: Mapping[str, Any],
|
||||
host_sustained_load_automation: Mapping[str, Any],
|
||||
ui_productization: Mapping[str, Any],
|
||||
multi_product_taxonomy: Mapping[str, Any],
|
||||
db_read_status: str,
|
||||
@@ -1784,6 +1939,17 @@ def _build_work_item_progress(
|
||||
and alert_noise_reduction.get("status") == "completed"
|
||||
and alert_noise_missing == 0
|
||||
)
|
||||
host_load_rollups = host_sustained_load_automation.get("rollups")
|
||||
if not isinstance(host_load_rollups, Mapping):
|
||||
host_load_rollups = {}
|
||||
host_load_ready = (
|
||||
host_sustained_load_automation.get("schema_version")
|
||||
== "host_sustained_load_controlled_automation_readback_v1"
|
||||
and host_sustained_load_automation.get("status") == "completed"
|
||||
and _int_value(host_load_rollups.get("required_asset_count"))
|
||||
== _int_value(host_load_rollups.get("ready_asset_count"))
|
||||
and _int_value(host_load_rollups.get("controlled_apply_class_count")) >= 1
|
||||
)
|
||||
log_executor_rollups = log_controlled_writeback_executor.get("rollups")
|
||||
if not isinstance(log_executor_rollups, Mapping):
|
||||
log_executor_rollups = {}
|
||||
@@ -1911,11 +2077,23 @@ def _build_work_item_progress(
|
||||
"exit_criteria": "repeated alerts are clustered, deduped, routed to controlled automation, and no longer default to manual handling",
|
||||
"remaining_alert_noise_stage_count": alert_noise_missing,
|
||||
},
|
||||
{
|
||||
"work_item_id": "P1-D2-host-sustained-load-controlled-automation",
|
||||
"priority": "P1-D2",
|
||||
"title": "CPU sustained-load alerts classify and run AI controlled remediation",
|
||||
"status": "completed" if host_load_ready else "in_progress" if p1d_completed else "pending",
|
||||
"exit_criteria": "HostLoadAverageSustainedHigh routes to classifier, dry-run/check-mode, controlled apply packet, verifier, and KM/PlayBook writeback",
|
||||
"controlled_action_class_count": _int_value(
|
||||
host_load_rollups.get("controlled_apply_class_count")
|
||||
),
|
||||
"ready_asset_count": _int_value(host_load_rollups.get("ready_asset_count")),
|
||||
"required_asset_count": _int_value(host_load_rollups.get("required_asset_count")),
|
||||
},
|
||||
{
|
||||
"work_item_id": "P1-E-log-controlled-writeback-executor",
|
||||
"priority": "P1-E",
|
||||
"title": "LOG feedback executor queue for KM / RAG / MCP / PlayBook",
|
||||
"status": "completed" if log_executor_ready else "in_progress" if p1d_completed else "pending",
|
||||
"status": "completed" if log_executor_ready else "in_progress" if host_load_ready else "pending",
|
||||
"exit_criteria": "executor readback exposes ready batches, target selectors, source diffs, rollback, verifier, and next-action queue",
|
||||
"remaining_executor_batch_count": max(
|
||||
0,
|
||||
@@ -2845,6 +3023,9 @@ def build_runtime_receipt_readback_from_rows(
|
||||
agent_decision_wiring=agent_decision_wiring,
|
||||
learning_loop=learning_loop,
|
||||
)
|
||||
host_sustained_load_automation = (
|
||||
_build_host_sustained_load_controlled_automation_readback()
|
||||
)
|
||||
ui_productization = _build_ui_productization_readback()
|
||||
multi_product_taxonomy = _build_multi_product_taxonomy_contract(log_integration_taxonomy)
|
||||
log_controlled_writeback_executor = _load_log_controlled_writeback_executor_readback()
|
||||
@@ -2860,6 +3041,7 @@ def build_runtime_receipt_readback_from_rows(
|
||||
agent_decision_wiring=agent_decision_wiring,
|
||||
learning_loop=learning_loop,
|
||||
alert_noise_reduction=alert_noise_reduction,
|
||||
host_sustained_load_automation=host_sustained_load_automation,
|
||||
ui_productization=ui_productization,
|
||||
multi_product_taxonomy=multi_product_taxonomy,
|
||||
db_read_status=db_read_status,
|
||||
@@ -2988,6 +3170,7 @@ def build_runtime_receipt_readback_from_rows(
|
||||
"agent_decision_wiring": agent_decision_wiring,
|
||||
"learning_loop": learning_loop,
|
||||
"alert_noise_reduction": alert_noise_reduction,
|
||||
"host_sustained_load_automation": host_sustained_load_automation,
|
||||
"ui_productization": ui_productization,
|
||||
"multi_product_taxonomy": multi_product_taxonomy,
|
||||
"work_item_progress": work_item_progress,
|
||||
@@ -3281,6 +3464,27 @@ def _attach_runtime_receipt_readback(
|
||||
"controlled_route_total"
|
||||
)
|
||||
),
|
||||
"live_host_sustained_load_action_class_count": _int_value(
|
||||
((readback.get("host_sustained_load_automation") or {}).get("rollups") or {}).get(
|
||||
"action_class_count"
|
||||
)
|
||||
),
|
||||
"live_host_sustained_load_controlled_apply_class_count": _int_value(
|
||||
((readback.get("host_sustained_load_automation") or {}).get("rollups") or {}).get(
|
||||
"controlled_apply_class_count"
|
||||
)
|
||||
),
|
||||
"live_host_sustained_load_ready_asset_count": _int_value(
|
||||
((readback.get("host_sustained_load_automation") or {}).get("rollups") or {}).get(
|
||||
"ready_asset_count"
|
||||
)
|
||||
),
|
||||
"live_host_sustained_load_complete_count": (
|
||||
1
|
||||
if (readback.get("host_sustained_load_automation") or {}).get("status")
|
||||
== "completed"
|
||||
else 0
|
||||
),
|
||||
"live_ui_productization_surface_count": _int_value(
|
||||
((readback.get("ui_productization") or {}).get("rollups") or {}).get(
|
||||
"surface_count"
|
||||
|
||||
Reference in New Issue
Block a user