feat(agent): automate sustained host load response

2026-07-01 08:43:40 +08:00
parent 5e629efa44
commit a6dc806d38
10 changed files with 1285 additions and 40 deletions
--- a/apps/api/src/services/ai_agent_autonomous_runtime_control.py
+++ b/apps/api/src/services/ai_agent_autonomous_runtime_control.py
@@ -1459,6 +1459,160 @@ def _build_alert_noise_reduction_readback(
    }


+def _build_host_sustained_load_controlled_automation_readback() -> dict[str, Any]:  # takes no input and performs no I/O: every value returned is a literal or a count of the literals below
+    """Expose the sustained CPU/load automation contract as a first-class lane."""
+
+    action_classes = [  # four classes; only the first two set controlled_apply_allowed=True
+        {
+            "class_id": "orphan_browser_smoke_runaway_process",
+            "alertnames": [
+                "HostLoadAverageSustainedHigh",
+                "HostOrphanBrowserSmokeHighCpu",
+            ],
+            "classifier": "host-sustained-load-controller.py:controlled_orphan_browser_remediation_ready",
+            "controlled_action": "host-runaway-process-remediation.py dry-run then gated SIGTERM",
+            "controlled_apply_allowed": True,
+            "post_apply_verifier": "host-sustained-load-controller.py --json",
+            "rollback": "no persistent host mutation; workload can be re-run",
+            "forbidden_actions": [
+                "SIGKILL",
+                "docker_restart",
+                "systemctl_restart",
+                "nginx_reload",
+                "firewall_change",
+                "reboot",
+            ],
+        },
+        {
+            "class_id": "ci_runner_load_saturation",
+            "alertnames": [
+                "HostLoadAverageSustainedHigh",
+                "HostCiRunnerLoadSaturation",
+            ],
+            "classifier": "host-sustained-load-controller.py:controlled_ci_runner_saturation_guarded",
+            "controlled_action": "keep runner pressure gate fail-closed; prepare stale-run drain/cancel packet only after queue verifier",
+            "controlled_apply_allowed": True,
+            "post_apply_verifier": "read-public-gitea-actions-queue.py + non110/110 runner readiness verifier",
+            "rollback": "do not restore legacy or generic runner labels; re-run CD after pressure clears",
+            "forbidden_actions": [
+                "legacy_runner_restore",
+                "generic_runner_label_restore",
+                "process_kill_for_legitimate_ci",
+                "warn_only_pressure_gate",
+            ],
+        },
+        {
+            "class_id": "memory_or_swap_pressure",
+            "alertnames": ["HostLoadAverageSustainedHigh", "HostOutOfMemory"],
+            "classifier": "host-sustained-load-controller.py:blocked_memory_or_swap_pressure_requires_service_playbook",
+            "controlled_action": "route to service-specific memory/cgroup playbook with check-mode diff",
+            "controlled_apply_allowed": False,
+            "post_apply_verifier": "service-specific health and load readback",
+            "rollback": "service-specific resource rollback",
+            "forbidden_actions": [
+                "blind_limit_reduction",
+                "docker_restart_without_service_playbook",
+                "destructive_prune",
+            ],
+        },
+        {
+            "class_id": "unknown_sustained_load",
+            "alertnames": ["HostLoadAverageSustainedHigh"],
+            "classifier": "host-sustained-load-controller.py:blocked_unknown_sustained_load_requires_source_specific_playbook",
+            "controlled_action": "run host-sustained-load-evidence.py then select or generate a source-specific PlayBook",
+            "controlled_apply_allowed": False,
+            "post_apply_verifier": "host-sustained-load-evidence.py readback plus source-specific verifier before closure",
+            "rollback": "source-specific rollback required before apply",
+            "forbidden_actions": [
+                "generic_kill",
+                "generic_docker_restart",
+                "generic_systemd_restart",
+                "secret_collection",
+            ],
+        },
+    ]
+    required_assets = [  # NOTE(review): each "ready" flag is a hard-coded True; nothing here probes the listed paths
+        {
+            "asset_id": "host_sustained_load_controller",
+            "path": "scripts/ops/host-sustained-load-controller.py",
+            "purpose": "classify sustained load and emit the controlled automation packet",
+            "ready": True,
+        },
+        {
+            "asset_id": "host_sustained_load_sanitized_evidence",
+            "path": "scripts/ops/host-sustained-load-evidence.py",
+            "purpose": "collect sanitized process-family and container evidence for source-specific PlayBooks",
+            "ready": True,
+        },
+        {
+            "asset_id": "host_runaway_process_exporter",
+            "path": "scripts/ops/host-runaway-process-exporter.py",
+            "purpose": "publish read-only load/root-cause metrics",
+            "ready": True,
+        },
+        {
+            "asset_id": "orphan_browser_remediation_helper",
+            "path": "scripts/ops/host-runaway-process-remediation.py",
+            "purpose": "dry-run and controlled SIGTERM for allowlisted orphan browser process groups",
+            "ready": True,
+        },
+        {
+            "asset_id": "prometheus_alert_route",
+            "path": "ops/monitoring/alerts-unified.yml:HostLoadAverageSustainedHigh",
+            "purpose": "route sustained load alerts to the controller instead of generic SSH top",
+            "ready": True,
+        },
+        {
+            "asset_id": "ai_agent_work_item_readback",
+            "path": "/api/v1/agents/agent-autonomous-runtime-control",
+            "purpose": "make this lane visible in work_item_progress and rollups",
+            "ready": True,
+        },
+    ]
+    return {
+        "schema_version": "host_sustained_load_controlled_automation_readback_v1",
+        "status": "completed",  # NOTE(review): hard-coded literal, not computed from the two lists above
+        "current_work_item_id": "P1-D2-host-sustained-load-controlled-automation",
+        "problem_statement": (
+            "HostLoadAverageSustainedHigh must not stop at alerting; it must "
+            "classify root cause, produce a controlled action packet, run a "
+            "post-apply verifier, and write back learning evidence."
+        ),
+        "action_classes": action_classes,
+        "required_assets": required_assets,
+        "control_flow": [  # ordered stage names; descriptive only, nothing in this function executes them
+            "alert_received",
+            "read_textfile_metrics",
+            "classify_root_cause",
+            "emit_controlled_packet",
+            "dry_run_or_check_mode",
+            "controlled_apply_when_allowlisted",
+            "post_apply_verifier",
+            "km_playbook_telegram_receipt_writeback",
+        ],
+        "operation_boundaries": {
+            "executes_on_read": False,
+            "secret_value_read": False,
+            "raw_session_read": False,
+            "raw_runner_registration_read": False,
+            "critical_break_glass_still_required": True,
+            "legacy_runner_restore_allowed": False,
+            "generic_runner_label_restore_allowed": False,
+        },
+        "rollups": {  # counts derived from the two lists above
+            "action_class_count": len(action_classes),
+            "controlled_apply_class_count": sum(
+                1 for item in action_classes if item["controlled_apply_allowed"] is True  # identity check: only the bool True is counted
+            ),
+            "required_asset_count": len(required_assets),
+            "ready_asset_count": sum(1 for item in required_assets if item["ready"] is True),  # same strict check as above
+            "forbidden_action_count": sum(
+                len(item["forbidden_actions"]) for item in action_classes  # total across classes; duplicates are not collapsed
+            ),
+        },
+    }
+
+
 def _build_ui_productization_readback() -> dict[str, Any]:
    """Expose the concrete AwoooP product UI surfaces used to track this work."""

@@ -1739,6 +1893,7 @@ def _build_work_item_progress(
    agent_decision_wiring: Mapping[str, Any],
    learning_loop: Mapping[str, Any],
    alert_noise_reduction: Mapping[str, Any],
+    host_sustained_load_automation: Mapping[str, Any],
    ui_productization: Mapping[str, Any],
    multi_product_taxonomy: Mapping[str, Any],
    db_read_status: str,
@@ -1784,6 +1939,17 @@ def _build_work_item_progress(
        and alert_noise_reduction.get("status") == "completed"
        and alert_noise_missing == 0
    )
+    host_load_rollups = host_sustained_load_automation.get("rollups")
+    if not isinstance(host_load_rollups, Mapping):
+        host_load_rollups = {}
+    host_load_ready = (
+        host_sustained_load_automation.get("schema_version")
+        == "host_sustained_load_controlled_automation_readback_v1"
+        and host_sustained_load_automation.get("status") == "completed"
+        and _int_value(host_load_rollups.get("required_asset_count"))
+        == _int_value(host_load_rollups.get("ready_asset_count"))
+        and _int_value(host_load_rollups.get("controlled_apply_class_count")) >= 1
+    )
    log_executor_rollups = log_controlled_writeback_executor.get("rollups")
    if not isinstance(log_executor_rollups, Mapping):
        log_executor_rollups = {}
@@ -1911,11 +2077,23 @@ def _build_work_item_progress(
            "exit_criteria": "repeated alerts are clustered, deduped, routed to controlled automation, and no longer default to manual handling",
            "remaining_alert_noise_stage_count": alert_noise_missing,
        },
+        {
+            "work_item_id": "P1-D2-host-sustained-load-controlled-automation",
+            "priority": "P1-D2",
+            "title": "CPU sustained-load alerts classify and run AI controlled remediation",
+            "status": "completed" if host_load_ready else "in_progress" if p1d_completed else "pending",
+            "exit_criteria": "HostLoadAverageSustainedHigh routes to classifier, dry-run/check-mode, controlled apply packet, verifier, and KM/PlayBook writeback",
+            "controlled_action_class_count": _int_value(
+                host_load_rollups.get("controlled_apply_class_count")
+            ),
+            "ready_asset_count": _int_value(host_load_rollups.get("ready_asset_count")),
+            "required_asset_count": _int_value(host_load_rollups.get("required_asset_count")),
+        },
        {
            "work_item_id": "P1-E-log-controlled-writeback-executor",
            "priority": "P1-E",
            "title": "LOG feedback executor queue for KM / RAG / MCP / PlayBook",
-            "status": "completed" if log_executor_ready else "in_progress" if p1d_completed else "pending",
+            "status": "completed" if log_executor_ready else "in_progress" if host_load_ready else "pending",
            "exit_criteria": "executor readback exposes ready batches, target selectors, source diffs, rollback, verifier, and next-action queue",
            "remaining_executor_batch_count": max(
                0,
@@ -2845,6 +3023,9 @@ def build_runtime_receipt_readback_from_rows(
        agent_decision_wiring=agent_decision_wiring,
        learning_loop=learning_loop,
    )
+    host_sustained_load_automation = (
+        _build_host_sustained_load_controlled_automation_readback()
+    )
    ui_productization = _build_ui_productization_readback()
    multi_product_taxonomy = _build_multi_product_taxonomy_contract(log_integration_taxonomy)
    log_controlled_writeback_executor = _load_log_controlled_writeback_executor_readback()
@@ -2860,6 +3041,7 @@ def build_runtime_receipt_readback_from_rows(
        agent_decision_wiring=agent_decision_wiring,
        learning_loop=learning_loop,
        alert_noise_reduction=alert_noise_reduction,
+        host_sustained_load_automation=host_sustained_load_automation,
        ui_productization=ui_productization,
        multi_product_taxonomy=multi_product_taxonomy,
        db_read_status=db_read_status,
@@ -2988,6 +3170,7 @@ def build_runtime_receipt_readback_from_rows(
        "agent_decision_wiring": agent_decision_wiring,
        "learning_loop": learning_loop,
        "alert_noise_reduction": alert_noise_reduction,
+        "host_sustained_load_automation": host_sustained_load_automation,
        "ui_productization": ui_productization,
        "multi_product_taxonomy": multi_product_taxonomy,
        "work_item_progress": work_item_progress,
@@ -3281,6 +3464,27 @@ def _attach_runtime_receipt_readback(
                "controlled_route_total"
            )
        ),
+        "live_host_sustained_load_action_class_count": _int_value(
+            ((readback.get("host_sustained_load_automation") or {}).get("rollups") or {}).get(
+                "action_class_count"
+            )
+        ),
+        "live_host_sustained_load_controlled_apply_class_count": _int_value(
+            ((readback.get("host_sustained_load_automation") or {}).get("rollups") or {}).get(
+                "controlled_apply_class_count"
+            )
+        ),
+        "live_host_sustained_load_ready_asset_count": _int_value(
+            ((readback.get("host_sustained_load_automation") or {}).get("rollups") or {}).get(
+                "ready_asset_count"
+            )
+        ),
+        "live_host_sustained_load_complete_count": (
+            1
+            if (readback.get("host_sustained_load_automation") or {}).get("status")
+            == "completed"
+            else 0
+        ),
        "live_ui_productization_surface_count": _int_value(
            ((readback.get("ui_productization") or {}).get("rollups") or {}).get(
                "surface_count"
--- a/apps/api/tests/test_ai_agent_autonomous_runtime_control.py
+++ b/apps/api/tests/test_ai_agent_autonomous_runtime_control.py
@@ -766,6 +766,24 @@ def test_runtime_receipt_readback_summarizes_live_executor_closure_rows():
    assert alert_noise["routing_policy"]["manual_default_route_allowed"] is False
    assert alert_noise["routing_policy"]["low_medium_high_alerts_route_to_ai_controlled_queue"] is True
    assert alert_noise["public_safety"]["stores_raw_alert_payload"] is False
+    host_load = readback["host_sustained_load_automation"]
+    assert host_load["schema_version"] == "host_sustained_load_controlled_automation_readback_v1"
+    assert host_load["status"] == "completed"
+    assert host_load["current_work_item_id"] == (
+        "P1-D2-host-sustained-load-controlled-automation"
+    )
+    assert host_load["rollups"]["action_class_count"] == 4
+    assert host_load["rollups"]["controlled_apply_class_count"] == 2
+    assert host_load["rollups"]["required_asset_count"] == 6
+    assert host_load["rollups"]["ready_asset_count"] == 6
+    assert {
+        "orphan_browser_smoke_runaway_process",
+        "ci_runner_load_saturation",
+        "memory_or_swap_pressure",
+        "unknown_sustained_load",
+    } == {item["class_id"] for item in host_load["action_classes"]}
+    assert host_load["operation_boundaries"]["executes_on_read"] is False
+    assert host_load["operation_boundaries"]["legacy_runner_restore_allowed"] is False
    ui_productization = readback["ui_productization"]
    assert ui_productization["schema_version"] == "ai_agent_ui_productization_readback_v1"
    assert ui_productization["status"] == "completed"
@@ -805,6 +823,7 @@ def test_runtime_receipt_readback_summarizes_live_executor_closure_rows():
        "P1-B-agent-decision-wiring",
        "P1-C-learning-loop",
        "P1-D-alert-noise-reduction",
+        "P1-D2-host-sustained-load-controlled-automation",
        "P1-E-log-controlled-writeback-executor",
        "P1-F-log-controlled-writeback-consumer",
        "P2-A-ui-ux-productization",
@@ -818,19 +837,22 @@ def test_runtime_receipt_readback_summarizes_live_executor_closure_rows():
    assert progress["ordered_items"][8]["status"] == "completed"
    assert progress["ordered_items"][8]["remaining_alert_noise_stage_count"] == 0
    assert progress["ordered_items"][9]["status"] == "completed"
-    assert progress["ordered_items"][9]["remaining_executor_batch_count"] == 0
-    assert progress["ordered_items"][9]["active_blocker_count"] == 0
+    assert progress["ordered_items"][9]["controlled_action_class_count"] == 2
+    assert progress["ordered_items"][9]["ready_asset_count"] == 6
    assert progress["ordered_items"][10]["status"] == "completed"
-    assert progress["ordered_items"][10]["remaining_consumer_binding_count"] == 0
+    assert progress["ordered_items"][10]["remaining_executor_batch_count"] == 0
    assert progress["ordered_items"][10]["active_blocker_count"] == 0
    assert progress["ordered_items"][11]["status"] == "completed"
-    assert progress["ordered_items"][11]["remaining_ui_surface_count"] == 0
+    assert progress["ordered_items"][11]["remaining_consumer_binding_count"] == 0
+    assert progress["ordered_items"][11]["active_blocker_count"] == 0
    assert progress["ordered_items"][12]["status"] == "completed"
-    assert progress["ordered_items"][12]["remaining_product_scope_count"] == 0
+    assert progress["ordered_items"][12]["remaining_ui_surface_count"] == 0
+    assert progress["ordered_items"][13]["status"] == "completed"
+    assert progress["ordered_items"][13]["remaining_product_scope_count"] == 0
    assert progress["source_family_items"]
    assert {item["status"] for item in progress["source_family_items"]} == {"completed"}
    assert progress["rollups"]["source_family_work_item_count"] == 10
-    assert progress["rollups"]["completed_count"] == 23
+    assert progress["rollups"]["completed_count"] == 24
    assert progress["rollups"]["pending_count"] == 0


@@ -1016,12 +1038,13 @@ def test_runtime_receipt_work_items_use_learning_receipts_without_latest_telegra
    }
    assert statuses["P1-C-learning-loop"] == "completed"
    assert statuses["P1-D-alert-noise-reduction"] == "completed"
+    assert statuses["P1-D2-host-sustained-load-controlled-automation"] == "completed"
    assert statuses["P1-E-log-controlled-writeback-executor"] == "completed"
    assert statuses["P1-F-log-controlled-writeback-consumer"] == "completed"
    assert statuses["P2-A-ui-ux-productization"] == "completed"
    assert statuses["P2-B-multi-product-expansion"] == "completed"
    assert {item["status"] for item in progress["source_family_items"]} == {"completed"}
-    assert progress["rollups"]["completed_count"] == 23
+    assert progress["rollups"]["completed_count"] == 24
    assert progress["rollups"]["pending_count"] == 0


--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -1,3 +1,21 @@
+## 2026-07-01 — 08:37 Host sustained-load AI controlled automation
+
+**照主線修正的問題**：
+- 新增 `scripts/ops/host-sustained-load-controller.py`，把 `HostLoadAverageSustainedHigh` 從「SSH 看 top / 人工判斷」改成可機器讀取的 AI controlled packet：orphan browser / smoke load、合法 Gitea Actions / BuildKit saturation、memory / swap pressure、unknown sustained load 四類分流。
+- 新增 `scripts/ops/host-sustained-load-evidence.py`，unknown sustained load 不再回到 raw SSH top，而是產生脫敏 process-family / container evidence 給 source-specific PlayBook、KM、RAG 與後續 controller decision 使用；不輸出 raw command line、workspace path、URL 或 secret value。
+- `host-runaway-process-remediation.py` 的 apply gate 從 owner / maintenance-window 必填改成 controlled apply receipt 必填：`--controlled-apply-id`、`--evidence-ref`、`--post-apply-verifier`、`--confirm-apply`；owner / maintenance-window 只保留為可選 evidence。若目標 process group 已消失，回報 `already_exited` / `missing_process_group_count`，不再 traceback。
+- `ops/monitoring/alerts.yml` 與 `ops/monitoring/alerts-unified.yml` 將 sustained load 的 `auto_repair_action` 指向 controller，runbook 改為 AI controlled packet / dry-run / controlled SIGTERM / verifier；清掉 orphan browser 與 remediation authorization 告警中的人工批准語意。
+- `/api/v1/agents/agent-autonomous-runtime-control` readback 新增 `host_sustained_load_automation` 與 work item `P1-D2-host-sustained-load-controlled-automation`，6 個 required assets 全部 ready，讓告警降噪後的 CPU sustained-load automation 進入 work_item_progress / live counters，而不是只留在 Prometheus 文字。
+
+**驗證**：
+- `PYTHONDONTWRITEBYTECODE=1 python3.11 -m py_compile scripts/ops/host-sustained-load-controller.py scripts/ops/host-sustained-load-evidence.py scripts/ops/host-runaway-process-remediation.py apps/api/src/services/ai_agent_autonomous_runtime_control.py` 通過。
+- `PYTHONDONTWRITEBYTECODE=1 python3.11 -m pytest scripts/ops/tests/test_host_runaway_process_exporter.py -q --tb=short -p no:cacheprovider` 通過（16 passed）。
+- `DATABASE_URL=sqlite:///test.db PYTHONDONTWRITEBYTECODE=1 python3.11 -m pytest apps/api/tests/test_ai_agent_autonomous_runtime_control.py -q --tb=short -p no:cacheprovider` 通過（10 passed）。
+- `git diff --check` 通過。
+- 110 live readback：從 `http://192.168.0.110:9100/metrics` 只篩 `awoooi_host_*` 指標到 `/tmp/awoooi-host110-load.prom`；controller 回 `classification=blocked_unknown_sustained_load_requires_source_specific_playbook`、`load5_per_core=1.56`、`monitor_up=1`、active CI `0`、orphan rule `null`，並給出 `host-sustained-load-evidence.py --json` 只讀脫敏證據指令；本次未執行 host write / signal / restart。
+
+**邊界**：本段只做 source / test / Prometheus rule / API readback 實作；未對 live host 送 SIGTERM；未讀 secret / token / `.env` / raw sessions / SQLite / auth；未使用 GitHub / `gh` / GitHub API；未 workflow_dispatch；未重啟主機，未 restart Docker / Nginx / K3s / DB / firewall，未恢復 legacy / generic runner label。
+
 ## 2026-07-01 — 08:19 188 non-110 CD lane controlled apply / 110 Harbor blocker readback

 **照主線修正的問題**：
--- a/docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md
+++ b/docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md
@@ -17,7 +17,7 @@ read-only exporter -> Prometheus alert -> AI triage packet -> KM / PlayBook evid

 | 類型 | 判定 | 處理 |
 |------|------|------|
-| orphan browser smoke | headless Chrome / Chromium / Playwright process group 存活過久、PPID=1 或 group leader 消失、CPU 合計過高 | 走 dry-run 修復包；人工批准後可送 `SIGTERM` |
+| orphan browser smoke | headless Chrome / Chromium / Playwright process group 存活過久、PPID=1 或 group leader 消失、CPU 合計過高 | 走 dry-run 修復包；controlled apply receipt + evidence + verifier 成立後可送 `SIGTERM` |
 | 合法 CI load | Gitea Actions task container 正在跑，沒有 orphan browser 指標 | 觀察 queue / timeout；不要誤殺 |
 | Docker / Sentry / Harbor 事故 | container restart、port down、journal error、cold-start gate blocked | 走各服務自己的 SOP，不使用本 PlayBook 殺 process |
 | swap 已滿但未 thrash | swap ratio 高但 `vmstat` / load 分類未顯示即時 thrash | 不手動清 swap；先降高 CPU 來源 |
@@ -50,7 +50,7 @@ read-only exporter -> Prometheus alert -> AI triage packet -> KM / PlayBook evid

 | Alert | 條件 | 行動 |
 |-------|------|------|
-| `HostOrphanBrowserSmokeHighCpu` | orphan browser group `> 0` 且 CPU `>= 100%` 持續 10 分鐘 | 產生 dry-run 修復包，確認 owner / 維護窗口 / evidence |
+| `HostOrphanBrowserSmokeHighCpu` | orphan browser group `> 0` 且 CPU `>= 100%` 持續 10 分鐘 | 產生 dry-run 修復包，補 controlled apply id / evidence / post verifier |
 | `HostCiRunnerLoadSaturation` | load5/core `> 1.0` 且 active Gitea Actions `> 0` | 標為短期 CI 負載，檢查 runner queue，不直接 kill |
 | `HostRunawayProcessMonitorMissing` / `Stale` | exporter 缺失或超過 10 分鐘未更新 | 修 exporter / cron / textfile collector |
 | `HostRunawayProcessRemediationUnexpectedlyAuthorized` | `remediation_authorized > 0` | 立即回滾；禁止把監控器改成執行器 |
@@ -59,11 +59,11 @@ Telegram / AI event packet contract:

 | Alert / input | Telegram lane | 必須顯示 |
 |---------------|---------------|----------|
-| `HostOrphanBrowserSmokeHighCpu` | `orphan_browser_smoke_runaway_process` | alertname、host、rule、runaway dry-run、owner / maintenance / evidence gate、KM / PlayBook / Verifier 回寫 |
+| `HostOrphanBrowserSmokeHighCpu` | `orphan_browser_smoke_runaway_process` | alertname、host、rule、runaway dry-run、controlled apply id、evidence ref、post verifier、KM / PlayBook / Verifier 回寫 |
 | `HostCiRunnerLoadSaturation` | `ci_runner_load_saturation` | Gitea Actions run、runner queue、load/core、swap trend、capacity / queue 判定、不做 process remediation |
-| raw `CPU 警告` / `ps aux` dump | `runner_build_resource_pressure`、`runner_prisma_generate_resource_pressure` 或 `host_resource_pressure_triage` | sanitized top process evidence，不顯示 raw workspace path、hosted toolcache path、`node_modules` path、外部 URL、JSON payload 或完整 process dump |
+| raw `CPU 警告` / `ps aux` dump | `runner_build_resource_pressure`、`runner_prisma_generate_resource_pressure` 或 `host_resource_pressure_triage` | `host-sustained-load-evidence.py` 產生 sanitized top process / container evidence，不顯示 raw workspace path、hosted toolcache path、`node_modules` path、外部 URL、JSON payload 或完整 process dump |

-所有 Telegram 卡片都必須保留 `runtime_write_gate=0`，並不得把 alert/card 轉成直接 kill / restart / reload 指令。
+所有 Telegram 卡片都必須明確顯示 `runtime_write_gate=controlled/0`、`controlled_apply_allowed`、post verifier 與 forbidden actions，並不得把 alert/card 轉成直接 kill / restart / reload 指令。

 Host / runner raw dump 進入 Telegram 前必須先被 `TelegramGateway` 壓成 `P1/P2/P3 主機資源壓力` 卡片。第一屏只允許顯示 CPU、load、root process count、AI lane、candidate gate、Top evidence 與禁止事項；完整命令列、套件 JSON、外部檢查 endpoint、內部 workspace path 與 raw `ps aux` 必須留在內部 evidence / timeline，不得外送。

@@ -89,24 +89,36 @@ dry-run 必須檢查：
 3. `oldest_age_seconds` 超過 PlayBook 門檻。
 4. `active Gitea Actions` 與候選 process group 不是同一個仍在跑的合法 job。
 5. 不是 Docker daemon、Sentry、Harbor、PostgreSQL、ClickHouse、K3s 或 backup 服務本體。
-6. 已有 owner / 維護窗口 / evidence ref。
+6. 已有 controlled apply id、evidence ref、post verifier；owner / 維護窗口只作額外 evidence，不作 low-blast orphan cleanup 的預設阻擋。

-如果只看到 `HostCiRunnerLoadSaturation`，且 orphan group count 為 `0`，預設判定是「合法 CI 短期負載」，不得自動修復。
+如果只看到 `HostCiRunnerLoadSaturation`，且 orphan group count 為 `0`，預設判定是「合法 CI 短期負載」，不得自動殺 process；只能走 runner queue verifier、stale-run drain/cancel packet 與 host pressure fail-closed。
+
+如果只看到 `HostLoadAverageSustainedHigh`，且 orphan / active CI / swap 都無明確命中，AI 必須先跑只讀脫敏 evidence collector：
+
+```bash
+python3 scripts/ops/host-sustained-load-evidence.py \
+  --host 110 \
+  --metrics-file /home/wooo/node_exporter_textfiles/host_runaway_process.prom \
+  --docker-stats-file /home/wooo/node_exporter_textfiles/docker_stats.prom \
+  --json
+```
+
+collector 只輸出 process family、container CPU 與 PlayBook recommendation，不輸出 raw command line、workspace path、URL、JSON payload 或 secret。

 ---

 ## 4. Gated Remediation

-真正送 `SIGTERM` 時必須帶齊三個 gate：
+真正送 `SIGTERM` 時必須帶齊 controlled apply gate：

 ```bash
 python3 scripts/ops/host-runaway-process-remediation.py \
  --apply \
  --confirm-apply \
  --rule stockplatform_headless_smoke \
-  --owner-approval-id OWNER-APPROVAL-REDACTED \
-  --maintenance-window-id MW-REDACTED \
+  --controlled-apply-id CAP-REDACTED \
  --evidence-ref INC-REDACTED \
+  --post-apply-verifier "scripts/ops/host-sustained-load-controller.py --host 110 --json" \
  --wait-seconds 5
 ```

@@ -138,10 +150,10 @@ active Gitea Actions 若仍存在，告警降級為 CI load，而非 orphan smok
 | 資產 | 必填欄位 |
 |------|----------|
 | Incident evidence | alert name、host、rule、pgid count、cpu percent、oldest age、active CI count、swap ratio |
-| PlayBook run | dry-run payload、owner approval id、maintenance window id、evidence ref、actual signal summary |
+| PlayBook run | dry-run payload、controlled apply id、optional owner / maintenance evidence、evidence ref、post verifier、actual signal summary |
 | KM entry | 根因分類、誤判防護、修復結果、recurrence guard |
 | Verifier | post-check 指標、load trend、orphan group count、runner queue state |
-| Work item | 如果缺 owner / evidence / maintenance window，建立補件項，不假性拉高 runtime gate |
+| Work item | 如果缺 controlled apply id / evidence ref / post verifier，建立補件項；owner / maintenance 只作 optional evidence，不假性拉高 runtime gate |

 產品上的結論必須分開呈現：

@@ -150,7 +162,7 @@ monitoring_ready=true
 alert_ready=true
 playbook_ready=true
 km_writeback_required=true
-runtime_remediation_authorized=false unless gated apply is executed
+runtime_write_gate=controlled for allowlisted orphan browser cleanup; 0/false is evidence only unless critical break-glass applies
 ```

 ---
--- a/ops/monitoring/alerts-unified.yml
+++ b/ops/monitoring/alerts-unified.yml
@@ -101,8 +101,8 @@ groups:
        annotations:
          summary: "主機 {{ $labels.host }} load5/core 長時間過高"
          description: "load5 / CPU core > 1.5 持續 15 分鐘；這通常代表 runnable queue 已長期塞車，不是短暫尖峰。"
-          auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'echo \"=== LOAD ===\"; uptime; echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -25; echo \"=== DOCKER STATS ===\"; docker stats --no-stream | head -40'"
-          runbook: "先判斷高 load 來源：ClickHouse merge / Kafka backlog / Chrome scraper / Ollama runner / exporter；只允許 read-only 診斷，自動修復需走服務專屬 playbook。"
+          auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'scripts/ops/host-sustained-load-controller.py --host {{ $labels.host }} --metrics-file /home/wooo/node_exporter_textfiles/host_runaway_process.prom --json'"
+          runbook: "交給 host-sustained-load-controller 產生 AI controlled packet：orphan browser 走 host-runaway-process-remediation.py dry-run → controlled SIGTERM → verifier；合法 CI/BuildKit 走 runner pressure fail-closed 與 drain/cancel packet；unknown 先跑 host-sustained-load-evidence.py 只讀脫敏證據再選服務專屬 PlayBook；swap 走服務專屬記憶體 PlayBook。禁止直接 docker/systemd/nginx/firewall/reboot。"

      - alert: HostOutOfMemory
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
@@ -285,7 +285,7 @@ groups:
        annotations:
          summary: "110 orphan browser smoke process group CPU 過高"
          description: "偵測到 {{ $labels.rule }} orphan process group，CPU 合計 >= 100% 持續 10 分鐘。這通常是跨專案 headless Chrome / Playwright smoke 遺留，不是 Docker/Sentry/Harbor 事故。"
-          runbook: "先執行 `scripts/ops/host-runaway-process-remediation.py --rule {{ $labels.rule }}` 產生 dry-run；確認 active Gitea Actions、owner、維護窗口與 evidence ref 後才可用 --apply --confirm-apply 送 SIGTERM。禁止預設 SIGKILL、Docker restart、systemctl restart 或 firewall 變更。"
+          runbook: "先執行 `scripts/ops/host-sustained-load-controller.py --host 110 --metrics-file /home/wooo/node_exporter_textfiles/host_runaway_process.prom --json` 產生 AI controlled packet；orphan browser 只允許 `host-runaway-process-remediation.py --rule {{ $labels.rule }}` dry-run 後，帶 controlled-apply-id、evidence-ref、post-apply-verifier 與 --confirm-apply 送 SIGTERM。禁止預設 SIGKILL、Docker restart、systemctl restart 或 firewall 變更。"

      - alert: HostRunawayProcessRemediationUnexpectedlyAuthorized
        expr: awoooi_host_runaway_process_remediation_authorized{host="110"} > 0
@@ -302,7 +302,7 @@ groups:
        annotations:
          summary: "110 runaway process monitor exposed runtime remediation authorization"
          description: "host-runaway-process exporter 應永遠保持 read-only；若 remediation_authorized > 0，代表有人把監控器改成執行器或把 runtime gate 誤接上。"
-          runbook: "立即回滾 exporter，檢查 Git diff、cron、Ansible role 與 /home/wooo/scripts/host-runaway-process-exporter.py。實際修復只能由 gated remediation helper 在人工批准後執行。"
+          runbook: "立即回滾 exporter，檢查 Git diff、cron、Ansible role 與 /home/wooo/scripts/host-runaway-process-exporter.py。實際修復只能由 AI controlled packet 呼叫 gated remediation helper；監控 exporter 不得持有 runtime apply 權限。"

      - alert: HostCiRunnerLoadSaturation
        expr: |
@@ -324,7 +324,7 @@ groups:
        annotations:
          summary: "110 high load is currently explained by active Gitea Actions"
          description: "load5/core > 1.0 且存在 Gitea Actions task container；若 orphan browser 指標為 0，先視為短期 CI build/test 負載，不要誤判成 Docker/Sentry/Harbor 事故。"
-          runbook: "檢查 Gitea runs、runner queue 與 `docker ps --filter name=GITEA-ACTIONS-TASK-`; 僅在 job 卡死、超過 workflow timeout 或 owner 取消後才走 runner drain / cleanup PlayBook。"
+          runbook: "執行 `scripts/ops/host-sustained-load-controller.py --host 110 --metrics-file /home/wooo/node_exporter_textfiles/host_runaway_process.prom --json`。若分類為 controlled_ci_runner_saturation_guarded，保持 runner pressure fail-closed；只有 stale workflow timeout / queue verifier 指向同一 run 時才產生 drain/cancel controlled packet，不做 process kill。"

  # =========================================================================
  # K8s 叢集告警 (kubernetes_alerts)
--- a/ops/monitoring/alerts.yml
+++ b/ops/monitoring/alerts.yml
@@ -69,8 +69,8 @@ groups:
        annotations:
          summary: "主機 {{ $labels.host }} load5/core 長時間過高"
          description: "load5 / CPU core > 1.5 持續 15 分鐘；這通常代表 runnable queue 已長期塞車，不是短暫尖峰。"
-          auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'echo \"=== LOAD ===\"; uptime; echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -25; echo \"=== DOCKER STATS ===\"; docker stats --no-stream | head -40'"
-          runbook: "先判斷高 load 來源：ClickHouse merge / Kafka backlog / Chrome scraper / Ollama runner / exporter；只允許 read-only 診斷，自動修復需走服務專屬 playbook。"
+          auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'scripts/ops/host-sustained-load-controller.py --host {{ $labels.host }} --metrics-file /home/wooo/node_exporter_textfiles/host_runaway_process.prom --json'"
+          runbook: "交給 host-sustained-load-controller 產生 AI controlled packet：orphan browser 走 host-runaway-process-remediation.py dry-run → controlled SIGTERM → verifier；合法 CI/BuildKit 走 runner pressure fail-closed 與 drain/cancel packet；unknown 先跑 host-sustained-load-evidence.py 只讀脫敏證據再選服務專屬 PlayBook；swap 走服務專屬記憶體 PlayBook。禁止直接 docker/systemd/nginx/firewall/reboot。"

      - alert: HostOutOfMemory
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
--- a/scripts/ops/host-runaway-process-remediation.py
+++ b/scripts/ops/host-runaway-process-remediation.py
@@ -2,9 +2,11 @@
 """
 Gated remediation helper for AWOOOI host runaway process groups.

-Default mode is dry-run. Applying SIGTERM requires explicit owner approval,
-maintenance window, evidence reference, and --confirm-apply. This script is a
-PlayBook primitive, not a background auto-kill daemon.
+Default mode is dry-run. Applying SIGTERM requires an explicit controlled apply
+receipt, evidence reference, post-apply verifier, and --confirm-apply. Owner and
+maintenance-window identifiers are accepted as optional evidence, but they are
+not the default gate for allowlisted low-blast-radius orphan browser cleanup.
+This script is a PlayBook primitive, not a background auto-kill daemon.
 """

 from __future__ import annotations
@@ -18,6 +20,7 @@ import sys
 import time
 from pathlib import Path
 from types import ModuleType
+from typing import Any


 EXPORTER_PATH = Path(__file__).with_name("host-runaway-process-exporter.py")
@@ -42,9 +45,11 @@ def parse_args() -> argparse.Namespace:
    parser.add_argument("--min-cpu-percent", type=float, default=50)
    parser.add_argument("--apply", action="store_true", help="Send SIGTERM to matching process groups.")
    parser.add_argument("--confirm-apply", action="store_true", help="Required together with --apply.")
+    parser.add_argument("--controlled-apply-id", default="")
    parser.add_argument("--owner-approval-id", default="")
    parser.add_argument("--maintenance-window-id", default="")
    parser.add_argument("--evidence-ref", default="")
+    parser.add_argument("--post-apply-verifier", default="")
    parser.add_argument("--wait-seconds", type=int, default=0, help="Optional wait after SIGTERM before re-reading ps.")
    return parser.parse_args()

@@ -57,17 +62,17 @@ def validate_apply_args(args: argparse.Namespace) -> None:
        missing.append("--confirm-apply")
    if not args.rule:
        missing.append("--rule")
-    if not args.owner_approval_id:
-        missing.append("--owner-approval-id")
-    if not args.maintenance_window_id:
-        missing.append("--maintenance-window-id")
+    if not args.controlled_apply_id:
+        missing.append("--controlled-apply-id")
    if not args.evidence_ref:
        missing.append("--evidence-ref")
+    if not args.post_apply_verifier:
+        missing.append("--post-apply-verifier")
    if missing:
        raise SystemExit(
            "Refusing apply; missing required gates: "
            + ", ".join(missing)
-            + ". Use dry-run output for the PlayBook packet first."
+            + ". Use dry-run output for the controlled PlayBook packet first."
        )


@@ -114,12 +119,31 @@ def main() -> None:
        )

    signaled: list[int] = []
+    missing_process_groups: list[int] = []
+    signal_errors: list[dict[str, Any]] = []
    if args.apply:
        for candidate in candidates:
            if candidate["blocked_reason"]:
                continue
-            os.killpg(int(candidate["pgid"]), signal.SIGTERM)
-            signaled.append(int(candidate["pgid"]))
+            pgid = int(candidate["pgid"])
+            try:
+                os.killpg(pgid, signal.SIGTERM)
+            except ProcessLookupError:
+                candidate["action"] = "already_exited"
+                candidate["blocked_reason"] = "process_group_missing_at_apply"
+                missing_process_groups.append(pgid)
+                continue
+            except PermissionError as exc:
+                candidate["action"] = "signal_failed"
+                candidate["blocked_reason"] = "permission_denied"
+                signal_errors.append(
+                    {
+                        "pgid": pgid,
+                        "error": exc.__class__.__name__,
+                    }
+                )
+                continue
+            signaled.append(pgid)

    remaining_after_wait = None
    if args.apply and args.wait_seconds > 0:
@@ -139,14 +163,20 @@ def main() -> None:
        "host": args.host,
        "mode": "apply_sigterm" if args.apply else "dry_run",
        "runtime_gate": 1 if args.apply else 0,
+        "controlled_apply_id": args.controlled_apply_id if args.apply else None,
        "owner_approval_id": args.owner_approval_id if args.apply else None,
        "maintenance_window_id": args.maintenance_window_id if args.apply else None,
        "evidence_ref": args.evidence_ref if args.apply else None,
+        "post_apply_verifier": args.post_apply_verifier if args.apply else None,
        "min_age_seconds": args.min_age_seconds,
        "min_cpu_percent": args.min_cpu_percent,
        "candidate_count": len(candidates),
        "signaled_process_group_count": len(signaled),
        "signaled_process_groups": signaled,
+        "missing_process_group_count": len(missing_process_groups),
+        "missing_process_groups": missing_process_groups,
+        "signal_error_count": len(signal_errors),
+        "signal_errors": signal_errors,
        "remaining_after_wait": remaining_after_wait,
        "candidates": candidates,
        "forbidden_without_gates": [
@@ -159,6 +189,8 @@ def main() -> None:
        ],
    }
    print(json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True))
+    if signal_errors:
+        raise SystemExit(75)


 if __name__ == "__main__":
--- a/scripts/ops/host-sustained-load-controller.py
+++ b/scripts/ops/host-sustained-load-controller.py
@@ -0,0 +1,340 @@
+#!/usr/bin/env python3
+"""Classify sustained host load and emit a controlled automation packet.
+
+The controller is intentionally read-only by default. It turns
+HostLoadAverageSustainedHigh from a generic "SSH and look around" alert into a
+deterministic AI Agent control packet:
+
+* orphan browser/smoke load -> gated SIGTERM helper dry-run, then controlled
+  apply with evidence and post-apply verifier
+* active Gitea Actions/BuildKit load -> runner pressure stays fail-closed;
+  drain/cancel decisions must use runner/CD verifiers, not process kills
+* unknown or critical pressure -> source-specific playbook or break-glass
+
+It never reads secrets, raw runner registrations, sessions, or environment
+files, and it never mutates host state.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+from pathlib import Path
+from typing import Any
+
+
+# Metrics textfile read when --metrics-file is not given.
+DEFAULT_METRICS_FILE = Path("/home/wooo/node_exporter_textfiles/host_runaway_process.prom")
+SCHEMA_VERSION = "host_sustained_load_controlled_automation_v1"
+# One key="value" label pair. Inside the value a single backslash escapes
+# exactly the next character, so \" and \\ neither end nor derail the match.
+LABEL_RE = re.compile(r'(?P<key>[A-Za-z_][A-Za-z0-9_]*)="(?P<value>(?:[^"\\]|\\.)*)"')
+# One sample line: metric name, optional {labels}, then a plain decimal value.
+# Lines with a trailing timestamp or NaN/Inf values do not match.
+METRIC_RE = re.compile(
+    r"^(?P<name>[A-Za-z_:][A-Za-z0-9_:]*)(?:\{(?P<labels>[^}]*)\})?\s+"
+    r"(?P<value>[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)$"
+)
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the controller's command-line options from ``sys.argv``."""
+    cli = argparse.ArgumentParser(
+        description="Build a controlled AI Agent packet for sustained host load."
+    )
+    # Declared as data so every option and its default can be read at a glance.
+    option_specs: list[tuple[str, dict[str, Any]]] = [
+        ("--host", {"default": "110"}),
+        ("--metrics-file", {"type": Path, "default": DEFAULT_METRICS_FILE}),
+        ("--load5-per-core-threshold", {"type": float, "default": 1.5}),
+        ("--ci-stale-age-seconds", {"type": int, "default": 1800}),
+        ("--json", {"action": "store_true", "help": "Print JSON only."}),
+    ]
+    for flag, settings in option_specs:
+        cli.add_argument(flag, **settings)
+    return cli.parse_args()
+
+
+def _unescape_label(value: str) -> str:
+    """Decode the escapes of a Prometheus label value in a single pass.
+
+    A backslash followed by ``n`` becomes a newline; a backslash followed by a
+    double quote or another backslash becomes that character. Any other
+    backslash sequence is left untouched. Scanning left to right keeps an
+    escaped backslash that is followed by a literal ``n`` from being misread as
+    a newline, which chained ``str.replace`` calls would do.
+    """
+    return re.sub(r'\\([\\"n])', lambda m: "\n" if m.group(1) == "n" else m.group(1), value)
+
+
+def parse_prometheus_text(text: str) -> list[dict[str, Any]]:
+    """Parse Prometheus text exposition into ``name``/``labels``/``value`` dicts.
+
+    Blank lines, ``#`` comment lines, and lines that METRIC_RE does not match
+    are skipped without raising.
+    """
+    parsed: list[dict[str, Any]] = []
+    for entry in (raw.strip() for raw in text.splitlines()):
+        if entry == "" or entry[0] == "#":
+            continue
+        hit = METRIC_RE.match(entry)
+        if hit is None:
+            continue
+        label_map: dict[str, str] = {}
+        for pair in LABEL_RE.finditer(hit.group("labels") or ""):
+            label_map[pair.group("key")] = _unescape_label(pair.group("value"))
+        parsed.append(
+            {
+                "name": hit.group("name"),
+                "labels": label_map,
+                "value": float(hit.group("value")),
+            }
+        )
+    return parsed
+
+
+def _sample_value(
+    samples: list[dict[str, Any]],
+    name: str,
+    *,
+    host: str,
+    labels: dict[str, str] | None = None,
+    default: float = 0.0,
+) -> float:
+    """Return the first sample of ``name`` whose labels match ``host`` and ``labels``.
+
+    Labels on the sample beyond the requested ones are ignored. ``default`` is
+    returned when no sample matches.
+    """
+    wanted = {"host": host}
+    if labels:
+        wanted.update(labels)
+    matching = (
+        float(candidate["value"])
+        for candidate in samples
+        if candidate["name"] == name
+        and all(candidate["labels"].get(key) == value for key, value in wanted.items())
+    )
+    return next(matching, default)
+
+
+def _rule_values(samples: list[dict[str, Any]], name: str, *, host: str) -> list[dict[str, Any]]:
+    """Collect ``{"rule", "value"}`` entries for the ``name`` samples of ``host``.
+
+    Samples that carry no non-empty ``rule`` label are left out.
+    """
+    return [
+        {"rule": entry["labels"]["rule"], "value": float(entry["value"])}
+        for entry in samples
+        if entry["name"] == name
+        and entry["labels"].get("host") == host
+        and entry["labels"].get("rule")
+    ]
+
+
+def _top_orphan_rule(samples: list[dict[str, Any]], *, host: str) -> dict[str, Any] | None:
+    """Pick the orphan-browser rule with the highest CPU among rules that have groups.
+
+    Only rules whose group count is positive are considered. Ties on CPU are
+    broken by rule name in ascending order. Returns ``None`` when no rule
+    qualifies.
+    """
+    cpu_lookup: dict[str, float] = {}
+    for entry in _rule_values(
+        samples,
+        "awoooi_host_runaway_browser_orphan_cpu_percent",
+        host=host,
+    ):
+        cpu_lookup[entry["rule"]] = entry["value"]
+
+    ranked: list[dict[str, Any]] = []
+    for entry in _rule_values(
+        samples,
+        "awoooi_host_runaway_browser_orphan_group_count",
+        host=host,
+    ):
+        if not entry["value"] > 0:
+            continue
+        ranked.append(
+            {
+                "rule": entry["rule"],
+                "group_count": int(entry["value"]),
+                "cpu_percent": round(cpu_lookup.get(entry["rule"], 0.0), 3),
+            }
+        )
+
+    if ranked:
+        # Highest CPU first; the rule name makes the choice deterministic.
+        return min(ranked, key=lambda item: (-item["cpu_percent"], item["rule"]))
+    return None
+
+
+def build_packet(
+    *,
+    host: str,
+    samples: list[dict[str, Any]],
+    load5_per_core_threshold: float,
+    ci_stale_age_seconds: int,
+) -> dict[str, Any]:
+    """Classify the host's load from parsed samples and build the control packet.
+
+    The function only reads ``samples`` and assembles a dict; it runs no
+    command and sends no signal. Branches are evaluated in priority order and
+    the first match wins:
+
+    1. monitor sample missing or not 1 -> ``blocked_monitor_unavailable``
+    2. exporter reports remediation authorization -> ``blocked_monitor_authority_violation``
+    3. load above threshold with an orphan browser rule -> ``controlled_orphan_browser_remediation_ready``
+    4. load above threshold with active CI -> ``controlled_ci_runner_saturation_guarded``
+    5. load above threshold with swap ratio >= 0.85 -> ``blocked_memory_or_swap_pressure_requires_service_playbook``
+    6. load above threshold otherwise -> ``blocked_unknown_sustained_load_requires_source_specific_playbook``
+    7. anything else -> ``observing_load_within_threshold``
+
+    Args:
+        host: Value the ``host`` label of a sample must equal to be used.
+        samples: Output of ``parse_prometheus_text``.
+        load5_per_core_threshold: Load above this value (strictly) counts as sustained.
+        ci_stale_age_seconds: Oldest-CI-process age at or above which the CI
+            branch allows a controlled apply.
+
+    Returns:
+        The packet dict with classification, severity, readback values,
+        command strings, operation boundaries, and forbidden actions.
+    """
+    # The monitor only counts as up when its sample carries mode="read_only".
+    monitor_up = int(
+        _sample_value(
+            samples,
+            "awoooi_host_runaway_process_monitor_up",
+            host=host,
+            labels={"mode": "read_only"},
+            default=0,
+        )
+    )
+    load5_per_core = _sample_value(samples, "awoooi_host_load5_per_core", host=host)
+    swap_used_ratio = _sample_value(samples, "awoooi_host_swap_used_ratio", host=host)
+    remediation_authorized = int(
+        _sample_value(
+            samples,
+            "awoooi_host_runaway_process_remediation_authorized",
+            host=host,
+        )
+    )
+    active_ci_containers = int(
+        _sample_value(
+            samples,
+            "awoooi_host_gitea_actions_active_container_count",
+            host=host,
+            default=0,
+        )
+    )
+    active_ci_groups = int(
+        _sample_value(
+            samples,
+            "awoooi_host_gitea_actions_active_process_group_count",
+            host=host,
+            default=0,
+        )
+    )
+    active_ci_cpu = _sample_value(
+        samples,
+        "awoooi_host_gitea_actions_active_process_cpu_percent",
+        host=host,
+    )
+    active_ci_oldest_age = int(
+        _sample_value(
+            samples,
+            "awoooi_host_gitea_actions_active_process_oldest_age_seconds",
+            host=host,
+        )
+    )
+    top_orphan = _top_orphan_rule(samples, host=host)
+
+    # Defaults describe the "nothing to do" outcome; branches below override them.
+    classification = "observing_load_within_threshold"
+    severity = "info"
+    controlled_apply_allowed = False
+    next_action = "keep_read_only_monitoring"
+    dry_run_command = ""
+    controlled_apply_command = ""
+    # NOTE(review): the verifier always names DEFAULT_METRICS_FILE; this function
+    # is not given the metrics path the caller actually read.
+    verifier_command = (
+        "scripts/ops/host-sustained-load-controller.py "
+        f"--host {host} --metrics-file {DEFAULT_METRICS_FILE}"
+    )
+
+    if monitor_up != 1:
+        # Without a trustworthy monitor sample nothing else is evaluated.
+        classification = "blocked_monitor_unavailable"
+        severity = "warning"
+        next_action = "restore_host_runaway_process_exporter_textfile_before_apply"
+    elif remediation_authorized > 0:
+        classification = "blocked_monitor_authority_violation"
+        severity = "critical"
+        next_action = "rollback_monitor_to_read_only_exporter"
+    elif load5_per_core > load5_per_core_threshold and top_orphan:
+        classification = "controlled_orphan_browser_remediation_ready"
+        severity = "critical"
+        controlled_apply_allowed = True
+        # NOTE(review): rule comes from a metrics label and is interpolated
+        # into the command strings unquoted.
+        rule = top_orphan["rule"]
+        dry_run_command = f"scripts/ops/host-runaway-process-remediation.py --rule {rule}"
+        # ${CONTROLLED_APPLY_ID} and ${EVIDENCE_REF} are emitted as literal
+        # placeholders; they are not filled in here.
+        controlled_apply_command = (
+            "scripts/ops/host-runaway-process-remediation.py "
+            f"--rule {rule} --apply --confirm-apply "
+            "--controlled-apply-id ${CONTROLLED_APPLY_ID} "
+            "--evidence-ref ${EVIDENCE_REF} "
+            "--post-apply-verifier "
+            "'scripts/ops/host-sustained-load-controller.py --host "
+            f"{host} --metrics-file {DEFAULT_METRICS_FILE}' "
+            "--wait-seconds 10"
+        )
+        next_action = "run_orphan_browser_remediation_dry_run_then_controlled_sigterm"
+    elif (
+        load5_per_core > load5_per_core_threshold
+        and (active_ci_containers > 0 or active_ci_groups > 0)
+    ):
+        classification = "controlled_ci_runner_saturation_guarded"
+        # Both severity and apply permission hinge on the oldest CI process age.
+        severity = "critical" if active_ci_oldest_age >= ci_stale_age_seconds else "warning"
+        controlled_apply_allowed = active_ci_oldest_age >= ci_stale_age_seconds
+        dry_run_command = (
+            "ops/runner/read-public-gitea-actions-queue.py --json "
+            "&& ops/runner/check-awoooi-non110-runner-readiness.sh"
+        )
+        # NOTE(review): this value is instruction text rather than a script
+        # invocation, it is set even when apply is not allowed, and it names
+        # 110 regardless of ``host``.
+        controlled_apply_command = (
+            "keep_110_runner_pressure_gate_fail_closed; "
+            "only cancel/drain stale Gitea Actions through runner verifier packet"
+        )
+        next_action = (
+            "prepare_runner_drain_or_cancel_packet_without_process_kill"
+            if controlled_apply_allowed
+            else "keep_pressure_gate_fail_closed_until_ci_load_clears"
+        )
+    elif load5_per_core > load5_per_core_threshold and swap_used_ratio >= 0.85:
+        # Reached only when there is no orphan rule and no active CI.
+        classification = "blocked_memory_or_swap_pressure_requires_service_playbook"
+        severity = "critical"
+        next_action = "route_to_service_specific_memory_pressure_playbook"
+    elif load5_per_core > load5_per_core_threshold:
+        classification = "blocked_unknown_sustained_load_requires_source_specific_playbook"
+        severity = "critical"
+        dry_run_command = (
+            "scripts/ops/host-sustained-load-evidence.py "
+            f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
+            "--docker-stats-file /home/wooo/node_exporter_textfiles/docker_stats.prom "
+            "--json"
+        )
+        next_action = "collect_sanitized_top_process_and_container_stats_then_select_playbook"
+
+    return {
+        "schema_version": SCHEMA_VERSION,
+        "host": host,
+        "mode": "read_only_control_packet",
+        "classification": classification,
+        "severity": severity,
+        "controlled_apply_allowed": controlled_apply_allowed,
+        "next_action": next_action,
+        # Raw inputs behind the decision, echoed back for the reader.
+        "readback": {
+            "monitor_up": monitor_up,
+            "load5_per_core": round(load5_per_core, 6),
+            "load5_per_core_threshold": load5_per_core_threshold,
+            "swap_used_ratio": round(swap_used_ratio, 6),
+            "remediation_authorized": remediation_authorized,
+            "active_ci_container_count": active_ci_containers,
+            "active_ci_process_group_count": active_ci_groups,
+            "active_ci_process_cpu_percent": round(active_ci_cpu, 3),
+            "active_ci_oldest_age_seconds": active_ci_oldest_age,
+            "top_orphan_rule": top_orphan,
+        },
+        "commands": {
+            "dry_run": dry_run_command,
+            "controlled_apply": controlled_apply_command,
+            "post_apply_verifier": verifier_command,
+            "rollback": "send SIGTERM only; no persistent host mutation. Re-run workload if needed.",
+        },
+        # Constant flags: the same values are emitted for every classification.
+        "operation_boundaries": {
+            "secret_value_read": False,
+            "raw_session_read": False,
+            "raw_runner_registration_read": False,
+            "host_write_performed": False,
+            "process_signal_performed": False,
+            "docker_restart_allowed": False,
+            "systemd_restart_allowed": False,
+            "firewall_change_allowed": False,
+            "critical_break_glass_required": True,
+        },
+        "forbidden_actions": [
+            "SIGKILL",
+            "docker_restart",
+            "systemctl_restart",
+            "nginx_reload",
+            "firewall_change",
+            "kubectl_action",
+            "secret_read",
+            "legacy_or_generic_runner_restore",
+        ],
+    }
+
+
+def main() -> int:
+    """Run the controller CLI and return its process exit status.
+
+    Reads the metrics textfile, builds the control packet, and prints it as
+    JSON (``--json``) or as ``key=value`` lines.
+
+    Returns:
+        75 when the classification starts with ``blocked_``, otherwise 0.
+    """
+    args = parse_args()
+    samples: list[dict[str, Any]] = []
+    try:
+        text = args.metrics_file.read_text(encoding="utf-8")
+    except (OSError, UnicodeDecodeError):
+        # A missing, unreadable, or non-UTF-8 textfile yields no samples, so
+        # the packet fails closed as blocked_monitor_unavailable rather than
+        # the controller dying with a traceback.
+        pass
+    else:
+        samples = parse_prometheus_text(text)
+    packet = build_packet(
+        host=args.host,
+        samples=samples,
+        load5_per_core_threshold=args.load5_per_core_threshold,
+        ci_stale_age_seconds=args.ci_stale_age_seconds,
+    )
+    if args.json:
+        print(json.dumps(packet, ensure_ascii=False, indent=2, sort_keys=True))
+    else:
+        print(f"status={packet['classification']}")
+        print(f"controlled_apply_allowed={str(packet['controlled_apply_allowed']).lower()}")
+        print(f"next_action={packet['next_action']}")
+        # Command lines are printed only when the classification produced one.
+        if packet["commands"]["dry_run"]:
+            print(f"dry_run_command={packet['commands']['dry_run']}")
+        if packet["commands"]["controlled_apply"]:
+            print(f"controlled_apply_command={packet['commands']['controlled_apply']}")
+        print(f"post_apply_verifier={packet['commands']['post_apply_verifier']}")
+    return 0 if not packet["classification"].startswith("blocked_") else 75
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/scripts/ops/host-sustained-load-evidence.py
+++ b/scripts/ops/host-sustained-load-evidence.py
@@ -0,0 +1,287 @@
+#!/usr/bin/env python3
+"""Build sanitized evidence for unknown sustained host load.
+
+This collector is read-only. It intentionally emits process families and
+container names instead of raw command lines so CPU-pressure alerts can proceed
+to a source-specific PlayBook without leaking workspace paths, URLs, JSON
+payloads, or secrets.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import re
+import subprocess
+from pathlib import Path
+from typing import Any
+
+
+# Textfiles read when --metrics-file / --docker-stats-file are not given.
+DEFAULT_HOST_METRICS_FILE = Path("/home/wooo/node_exporter_textfiles/host_runaway_process.prom")
+DEFAULT_DOCKER_STATS_FILE = Path("/home/wooo/node_exporter_textfiles/docker_stats.prom")
+SCHEMA_VERSION = "host_sustained_load_sanitized_evidence_v1"
+# One key="value" label pair. Inside the value a single backslash escapes
+# exactly the next character, so \" and \\ neither end nor derail the match.
+LABEL_RE = re.compile(r'(?P<key>[A-Za-z_][A-Za-z0-9_]*)="(?P<value>(?:[^"\\]|\\.)*)"')
+# One sample line: metric name, optional {labels}, then a plain decimal value.
+# Lines with a trailing timestamp or NaN/Inf values do not match.
+METRIC_RE = re.compile(
+    r"^(?P<name>[A-Za-z_:][A-Za-z0-9_:]*)(?:\{(?P<labels>[^}]*)\})?\s+"
+    r"(?P<value>[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)$"
+)
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the evidence collector's command-line options from ``sys.argv``.
+
+    ``--host`` defaults to the ``AIOPS_HOST_LABEL`` environment variable, or to
+    ``"110"`` when that variable is unset.
+    """
+    cli = argparse.ArgumentParser(description="Collect sanitized sustained-load evidence.")
+    fallback_host = os.environ.get("AIOPS_HOST_LABEL", "110")
+    # Declared as data so every option and its default can be read at a glance.
+    option_specs: list[tuple[str, dict[str, Any]]] = [
+        ("--host", {"default": fallback_host}),
+        ("--metrics-file", {"type": Path, "default": DEFAULT_HOST_METRICS_FILE}),
+        ("--docker-stats-file", {"type": Path, "default": DEFAULT_DOCKER_STATS_FILE}),
+        ("--ps-file", {"type": Path}),
+        ("--top-n", {"type": int, "default": 8}),
+        ("--json", {"action": "store_true"}),
+    ]
+    for flag, settings in option_specs:
+        cli.add_argument(flag, **settings)
+    return cli.parse_args()
+
+
+def _unescape_label(value: str) -> str:
+    """Decode the escapes of a Prometheus label value in a single pass.
+
+    A backslash followed by ``n`` becomes a newline; a backslash followed by a
+    double quote or another backslash becomes that character. Any other
+    backslash sequence is left untouched. Scanning left to right keeps an
+    escaped backslash that is followed by a literal ``n`` from being misread as
+    a newline, which chained ``str.replace`` calls would do.
+    """
+    return re.sub(r'\\([\\"n])', lambda m: "\n" if m.group(1) == "n" else m.group(1), value)
+
+
+def parse_prometheus_text(text: str) -> list[dict[str, Any]]:
+    """Parse Prometheus text exposition into ``name``/``labels``/``value`` dicts.
+
+    Blank lines, ``#`` comment lines, and lines that METRIC_RE does not match
+    are skipped without raising.
+    """
+    parsed: list[dict[str, Any]] = []
+    for entry in (raw.strip() for raw in text.splitlines()):
+        if entry == "" or entry[0] == "#":
+            continue
+        hit = METRIC_RE.match(entry)
+        if hit is None:
+            continue
+        label_map: dict[str, str] = {}
+        for pair in LABEL_RE.finditer(hit.group("labels") or ""):
+            label_map[pair.group("key")] = _unescape_label(pair.group("value"))
+        parsed.append(
+            {
+                "name": hit.group("name"),
+                "labels": label_map,
+                "value": float(hit.group("value")),
+            }
+        )
+    return parsed
+
+
+def read_text(path: Path | None) -> str:
+    """Return the UTF-8 contents of ``path``, or ``""`` if it is ``None`` or missing."""
+    if path is not None:
+        try:
+            return path.read_text(encoding="utf-8")
+        except FileNotFoundError:
+            # A missing file is reported as empty content.
+            pass
+    return ""
+
+
+def collect_ps_text(ps_file: Path | None) -> str:
+    """Return process-table text from ``ps_file`` if given, else from a live ``ps`` run.
+
+    The live path runs ``ps`` with a 10 second timeout and ``check=True``, so a
+    non-zero exit or a timeout raises from ``subprocess.run``.
+    """
+    if ps_file is None:
+        ps_command = ["ps", "-eo", "pid=,ppid=,pgid=,etimes=,pcpu=,pmem=,comm=,args="]
+        completed = subprocess.run(
+            ps_command,
+            check=True,
+            capture_output=True,
+            text=True,
+            timeout=10,
+        )
+        return completed.stdout
+    return read_text(ps_file)
+
+
+def parse_ps_text(text: str) -> list[dict[str, Any]]:
+    """Turn ``ps`` output into sanitized process rows.
+
+    Each line is split into seven leading columns plus the remaining argument
+    string. Lines with fewer than seven columns, or with a numeric column that
+    does not parse, are skipped. The argument string is used only to classify
+    the process family and is not stored in the row.
+    """
+    parsed: list[dict[str, Any]] = []
+    for entry in map(str.strip, text.splitlines()):
+        fields = entry.split(None, 7)
+        if len(fields) < 7:
+            continue
+        comm = fields[6]
+        # With no args column the command name stands in for it.
+        args = fields[7] if len(fields) > 7 else comm
+        try:
+            pid, ppid, pgid = (int(token) for token in fields[:3])
+            etimes = int(float(fields[3]))
+            cpu_percent = float(fields[4])
+            mem_percent = float(fields[5])
+        except ValueError:
+            continue
+        parsed.append(
+            {
+                "pid": pid,
+                "ppid": ppid,
+                "pgid": pgid,
+                "etimes": etimes,
+                "cpu_percent": cpu_percent,
+                "mem_percent": mem_percent,
+                "comm": Path(comm).name[:48],
+                "family": classify_process_family(comm, args),
+            }
+        )
+    return parsed
+
+
+def classify_process_family(comm: str, args: str) -> str:
+    """Map a process to a coarse family label by case-insensitive substring match.
+
+    ``comm`` and ``args`` are joined and lower-cased, then checked against the
+    rules in a fixed priority order; the first hit wins. ``"unknown"`` is
+    returned when nothing matches.
+    """
+    haystack = f"{comm} {args}".lower()
+
+    def mentions(*needles: str) -> bool:
+        return any(needle in haystack for needle in needles)
+
+    if mentions("act_runner", "gitea-actions-task", "/.cache/act/"):
+        return "gitea_actions_runner"
+    if mentions("docker build", "buildx", "buildkit"):
+        return "docker_build"
+    # pnpm only counts as a web build when " build" appears as well.
+    if mentions("next build", "turbo build") or (mentions("pnpm") and mentions(" build")):
+        return "web_build"
+
+    keyword_families = (
+        (("chrome", "chromium", "playwright"), "headless_browser"),
+        (("gitea",), "gitea_service"),
+        (("postgres", "postmaster"), "postgres"),
+        (("clickhouse",), "clickhouse"),
+        (("kafka",), "kafka"),
+        (("sentry",), "sentry"),
+        (("systemctl", "systemd", "dbus"), "systemd_control_plane"),
+        (("sshd",), "ssh_control_plane"),
+        (("python",), "python_job"),
+        (("node",), "node_service"),
+    )
+    for needles, family in keyword_families:
+        if mentions(*needles):
+            return family
+    return "unknown"
+
+
+def summarize_processes(rows: list[dict[str, Any]], *, top_n: int) -> dict[str, Any]:
+    """Summarize process rows into the top CPU consumers and per-family totals.
+
+    Returns a dict with ``top_processes`` (at most ``top_n`` rows ordered by
+    CPU descending, then command name, then pid) and ``families`` (at most
+    ``top_n`` aggregates ordered by summed CPU descending, then family name).
+    Each family's ``sample_comm`` is the command name of its highest-CPU row.
+    """
+    by_family: dict[str, dict[str, Any]] = {}
+    # CPU of the row currently supplying each family's sample_comm.
+    hottest_cpu: dict[str, float] = {}
+    for proc in rows:
+        name = proc["family"]
+        bucket = by_family.get(name)
+        if bucket is None:
+            bucket = by_family[name] = {
+                "family": name,
+                "process_count": 0,
+                "cpu_percent": 0.0,
+                "max_age_seconds": 0,
+                "sample_comm": "",
+            }
+        bucket["process_count"] += 1
+        bucket["cpu_percent"] += proc["cpu_percent"]
+        if proc["etimes"] > bucket["max_age_seconds"]:
+            bucket["max_age_seconds"] = proc["etimes"]
+        if not bucket["sample_comm"] or proc["cpu_percent"] > hottest_cpu.get(name, -1):
+            bucket["sample_comm"] = proc["comm"]
+            hottest_cpu[name] = proc["cpu_percent"]
+
+    for bucket in by_family.values():
+        bucket["cpu_percent"] = round(float(bucket["cpu_percent"]), 3)
+    ranked_families = sorted(
+        by_family.values(),
+        key=lambda item: (-item["cpu_percent"], item["family"]),
+    )
+
+    busiest = sorted(rows, key=lambda item: (-item["cpu_percent"], item["comm"], item["pid"]))
+    top_processes = []
+    for proc in busiest[:top_n]:
+        top_processes.append(
+            {
+                "pid": proc["pid"],
+                "ppid": proc["ppid"],
+                "pgid": proc["pgid"],
+                "cpu_percent": round(proc["cpu_percent"], 3),
+                "mem_percent": round(proc["mem_percent"], 3),
+                "age_seconds": proc["etimes"],
+                "comm": proc["comm"],
+                "family": proc["family"],
+            }
+        )
+
+    return {"top_processes": top_processes, "families": ranked_families[:top_n]}
+
+
+def top_docker_containers(samples: list[dict[str, Any]], *, host: str, top_n: int) -> list[dict[str, Any]]:
+    """Return up to ``top_n`` containers ranked by ``docker_container_cpu_cores``.
+
+    Samples with a ``host`` label different from ``host`` are dropped; samples
+    with no ``host`` label are kept. The name comes from the ``container_name``
+    label, then ``name``, then ``"unknown"``. Ties on CPU sort by name.
+    """
+    ranked = [
+        {
+            "container_name": (
+                entry["labels"].get("container_name") or entry["labels"].get("name") or "unknown"
+            ),
+            "cpu_cores": round(float(entry["value"]), 6),
+        }
+        for entry in samples
+        if entry["name"] == "docker_container_cpu_cores"
+        and entry["labels"].get("host", host) == host
+    ]
+    ranked.sort(key=lambda item: (-item["cpu_cores"], item["container_name"]))
+    return ranked[:top_n]
+
+
+def recommend_playbook(process_families: list[dict[str, Any]], containers: list[dict[str, Any]]) -> str:
+    """Choose a playbook name from the first container and the first process family.
+
+    Only the first element of each list is inspected. Container-based rules
+    (gitea at 2.0 cores or more, then postgres) are checked before the
+    family-based ones; ``"source_specific_playbook_required"`` is the fallback.
+    """
+    lead_container = containers[0] if containers else {}
+    lead_family = process_families[0] if process_families else {}
+    container_label = str(lead_container.get("container_name") or "").lower()
+    container_cpu = float(lead_container.get("cpu_cores") or 0.0)
+    family = str(lead_family.get("family") or "")
+
+    if container_cpu >= 2.0 and "gitea" in container_label:
+        return "gitea_queue_or_hook_backlog_playbook"
+    if "postgres" in container_label or "postgres" in family:
+        return "postgres_hot_query_or_backup_export_playbook"
+
+    family_playbooks = {
+        "docker_build": "build_or_runner_pressure_playbook",
+        "web_build": "build_or_runner_pressure_playbook",
+        "gitea_actions_runner": "build_or_runner_pressure_playbook",
+        "systemd_control_plane": "control_plane_saturation_playbook",
+        "ssh_control_plane": "control_plane_saturation_playbook",
+        "headless_browser": "orphan_browser_classification_refresh_playbook",
+    }
+    return family_playbooks.get(family, "source_specific_playbook_required")
+
+
+def build_payload(args: argparse.Namespace) -> dict[str, Any]:
+    """Gather metrics, container stats, and the process table into the evidence payload.
+
+    Reads ``args.metrics_file``, ``args.docker_stats_file``, ``args.ps_file``,
+    ``args.top_n`` and ``args.host``. The ``redaction`` and
+    ``operation_boundaries`` sections are constant flags emitted on every run.
+    """
+    host_samples = parse_prometheus_text(read_text(args.metrics_file))
+    docker_samples = parse_prometheus_text(read_text(args.docker_stats_file))
+
+    ps_rows = parse_ps_text(collect_ps_text(args.ps_file))
+    process_summary = summarize_processes(ps_rows, top_n=args.top_n)
+    family_rows = process_summary["families"]
+    containers = top_docker_containers(docker_samples, host=args.host, top_n=args.top_n)
+
+    readback = {
+        "host_metric_sample_count": len(host_samples),
+        "docker_metric_sample_count": len(docker_samples),
+        "top_container_count": len(containers),
+        "top_process_family_count": len(family_rows),
+    }
+    redaction = {
+        "raw_command_lines_emitted": False,
+        "workspace_paths_emitted": False,
+        "urls_emitted": False,
+        "secret_values_read": False,
+    }
+    operation_boundaries = {
+        "host_write_performed": False,
+        "process_signal_performed": False,
+        "docker_restart_performed": False,
+        "systemd_restart_performed": False,
+        "raw_session_read": False,
+        "raw_runner_registration_read": False,
+    }
+
+    return {
+        "schema_version": SCHEMA_VERSION,
+        "host": args.host,
+        "mode": "read_only_sanitized_evidence",
+        "recommendation": recommend_playbook(family_rows, containers),
+        "controlled_apply_allowed": False,
+        "next_action": "select_or_generate_source_specific_playbook_then_run_check_mode",
+        "readback": readback,
+        "top_containers": containers,
+        "top_process_families": family_rows,
+        "top_processes_sanitized": process_summary["top_processes"],
+        "redaction": redaction,
+        "operation_boundaries": operation_boundaries,
+    }
+
+
+def main() -> int:
+    """Run the evidence collector CLI; always returns exit status 0.
+
+    Prints the full payload as JSON with ``--json``, otherwise three
+    ``key=value`` summary lines.
+    """
+    options = parse_args()
+    report = build_payload(options)
+    if not options.json:
+        allowed_text = str(report["controlled_apply_allowed"]).lower()
+        print(f"recommendation={report['recommendation']}")
+        print(f"controlled_apply_allowed={allowed_text}")
+        print(f"next_action={report['next_action']}")
+        return 0
+    print(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/scripts/ops/tests/test_host_runaway_process_exporter.py
+++ b/scripts/ops/tests/test_host_runaway_process_exporter.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import importlib.util
+import json
 import subprocess
 import sys
 from pathlib import Path
@@ -9,6 +10,8 @@ from pathlib import Path
 SCRIPT_ROOT = Path(__file__).resolve().parents[1]
 EXPORTER_PATH = SCRIPT_ROOT / "host-runaway-process-exporter.py"
 REMEDIATION_PATH = SCRIPT_ROOT / "host-runaway-process-remediation.py"
+CONTROLLER_PATH = SCRIPT_ROOT / "host-sustained-load-controller.py"
+EVIDENCE_PATH = SCRIPT_ROOT / "host-sustained-load-evidence.py"


 def load_exporter():
@@ -167,7 +170,7 @@ def test_ignores_the_host_pressure_gate_process_group() -> None:
 def test_remediation_defaults_to_dry_run(tmp_path: Path) -> None:
    ps_file = tmp_path / "ps.txt"
    ps_file.write_text(
-        "100 1 100 100 7200 65.0 S chrome /opt/chrome/chrome --headless --user-data-dir=/tmp/stockplatform-review-bulk-ux-aa\n",
+        "999999 1 999999 999999 7200 65.0 S chrome /opt/chrome/chrome --headless --user-data-dir=/tmp/stockplatform-review-bulk-ux-aa\n",
        encoding="utf-8",
    )

@@ -193,7 +196,7 @@ def test_remediation_defaults_to_dry_run(tmp_path: Path) -> None:
 def test_remediation_refuses_apply_without_gates(tmp_path: Path) -> None:
    ps_file = tmp_path / "ps.txt"
    ps_file.write_text(
-        "100 1 100 100 7200 65.0 S chrome /opt/chrome/chrome --headless --user-data-dir=/tmp/stockplatform-review-bulk-ux-aa\n",
+        "999999 1 999999 999999 7200 65.0 S chrome /opt/chrome/chrome --headless --user-data-dir=/tmp/stockplatform-review-bulk-ux-aa\n",
        encoding="utf-8",
    )

@@ -213,3 +216,329 @@ def test_remediation_refuses_apply_without_gates(tmp_path: Path) -> None:

    assert result.returncode != 0
    assert "Refusing apply" in result.stderr
+    assert "--controlled-apply-id" in result.stderr
+    assert "--confirm-apply" in result.stderr
+    assert "--post-apply-verifier" in result.stderr
+
+
+def test_remediation_accepts_controlled_apply_gate_without_owner_gate(tmp_path: Path) -> None:
+    ps_file = tmp_path / "ps.txt"
+    ps_file.write_text(
+        "100 1 1 100 7200 65.0 S chrome /opt/chrome/chrome --headless --user-data-dir=/tmp/stockplatform-review-bulk-ux-aa\n",
+        encoding="utf-8",
+    )
+
+    result = subprocess.run(
+        [
+            sys.executable,
+            str(REMEDIATION_PATH),
+            "--ps-file",
+            str(ps_file),
+            "--apply",
+            "--confirm-apply",
+            "--rule",
+            "stockplatform_headless_smoke",
+            "--controlled-apply-id",
+            "CAP-20260701-HOSTLOAD",
+            "--evidence-ref",
+            "HostLoadAverageSustainedHigh:110",
+            "--post-apply-verifier",
+            "scripts/ops/host-sustained-load-controller.py --host 110 --json",
+        ],
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+
+    assert '"mode": "apply_sigterm"' in result.stdout
+    assert '"runtime_gate": 1' in result.stdout
+    assert '"controlled_apply_id": "CAP-20260701-HOSTLOAD"' in result.stdout
+    assert '"owner_approval_id": ""' in result.stdout
+    assert '"blocked_reason": "unsafe_pgid"' in result.stdout
+    assert '"missing_process_group_count": 0' in result.stdout
+    assert '"signal_error_count": 0' in result.stdout
+    assert '"signaled_process_group_count": 0' in result.stdout
+
+
+def test_sustained_load_controller_routes_orphan_browser_to_controlled_remediation(tmp_path: Path) -> None:
+    metrics_file = tmp_path / "host.prom"
+    metrics_file.write_text(
+        "\n".join(
+            [
+                'awoooi_host_runaway_process_monitor_up{host="110",mode="read_only"} 1',
+                'awoooi_host_load5_per_core{host="110"} 2.2',
+                'awoooi_host_swap_used_ratio{host="110"} 0.1',
+                'awoooi_host_runaway_process_remediation_authorized{host="110"} 0',
+                'awoooi_host_gitea_actions_active_container_count{host="110"} 0',
+                'awoooi_host_gitea_actions_active_process_group_count{host="110"} 0',
+                'awoooi_host_runaway_browser_orphan_group_count{host="110",rule="stockplatform_headless_smoke",min_age_seconds="1800",min_cpu_percent="50"} 1',
+                'awoooi_host_runaway_browser_orphan_cpu_percent{host="110",rule="stockplatform_headless_smoke",min_age_seconds="1800",min_cpu_percent="50"} 155.5',
+            ]
+        ),
+        encoding="utf-8",
+    )
+
+    result = subprocess.run(
+        [
+            sys.executable,
+            str(CONTROLLER_PATH),
+            "--host",
+            "110",
+            "--metrics-file",
+            str(metrics_file),
+            "--json",
+        ],
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+
+    payload = json.loads(result.stdout)
+    assert payload["classification"] == "controlled_orphan_browser_remediation_ready"
+    assert payload["controlled_apply_allowed"] is True
+    assert "host-runaway-process-remediation.py --rule stockplatform_headless_smoke" in payload["commands"]["dry_run"]
+    assert "--controlled-apply-id" in payload["commands"]["controlled_apply"]
+    assert payload["operation_boundaries"]["process_signal_performed"] is False
+
+
+def test_sustained_load_controller_keeps_ci_saturation_on_runner_path(tmp_path: Path) -> None:
+    metrics_file = tmp_path / "host.prom"
+    metrics_file.write_text(
+        "\n".join(
+            [
+                'awoooi_host_runaway_process_monitor_up{host="110",mode="read_only"} 1',
+                'awoooi_host_load5_per_core{host="110"} 2.0',
+                'awoooi_host_swap_used_ratio{host="110"} 0.1',
+                'awoooi_host_runaway_process_remediation_authorized{host="110"} 0',
+                'awoooi_host_gitea_actions_active_container_count{host="110"} 2',
+                'awoooi_host_gitea_actions_active_process_group_count{host="110"} 1',
+                'awoooi_host_gitea_actions_active_process_cpu_percent{host="110"} 180.0',
+                'awoooi_host_gitea_actions_active_process_oldest_age_seconds{host="110"} 1900',
+            ]
+        ),
+        encoding="utf-8",
+    )
+
+    result = subprocess.run(
+        [
+            sys.executable,
+            str(CONTROLLER_PATH),
+            "--host",
+            "110",
+            "--metrics-file",
+            str(metrics_file),
+            "--json",
+        ],
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+
+    payload = json.loads(result.stdout)
+    assert payload["classification"] == "controlled_ci_runner_saturation_guarded"
+    assert payload["controlled_apply_allowed"] is True
+    assert "fail_closed" in payload["commands"]["controlled_apply"]
+    assert "process_kill" not in payload["commands"]["controlled_apply"]
+
+
+def test_sustained_load_controller_blocks_monitor_authority_violation(tmp_path: Path) -> None:
+    metrics_file = tmp_path / "host.prom"
+    metrics_file.write_text(
+        "\n".join(
+            [
+                'awoooi_host_runaway_process_monitor_up{host="110",mode="read_only"} 1',
+                'awoooi_host_load5_per_core{host="110"} 2.0',
+                'awoooi_host_runaway_process_remediation_authorized{host="110"} 1',
+            ]
+        ),
+        encoding="utf-8",
+    )
+
+    result = subprocess.run(
+        [
+            sys.executable,
+            str(CONTROLLER_PATH),
+            "--host",
+            "110",
+            "--metrics-file",
+            str(metrics_file),
+            "--json",
+        ],
+        capture_output=True,
+        text=True,
+    )
+
+    assert result.returncode == 75
+    payload = json.loads(result.stdout)
+    assert payload["classification"] == "blocked_monitor_authority_violation"
+    assert payload["controlled_apply_allowed"] is False
+
+
+def test_sustained_load_controller_routes_unknown_load_to_sanitized_evidence(tmp_path: Path) -> None:
+    metrics_file = tmp_path / "host.prom"
+    metrics_file.write_text(
+        "\n".join(
+            [
+                'awoooi_host_runaway_process_monitor_up{host="110",mode="read_only"} 1',
+                'awoooi_host_load5_per_core{host="110"} 2.0',
+                'awoooi_host_swap_used_ratio{host="110"} 0.1',
+                'awoooi_host_runaway_process_remediation_authorized{host="110"} 0',
+                'awoooi_host_gitea_actions_active_container_count{host="110"} 0',
+                'awoooi_host_gitea_actions_active_process_group_count{host="110"} 0',
+                'awoooi_host_runaway_browser_orphan_group_count{host="110",rule="stockplatform_headless_smoke",min_age_seconds="1800",min_cpu_percent="50"} 0',
+            ]
+        ),
+        encoding="utf-8",
+    )
+
+    result = subprocess.run(
+        [
+            sys.executable,
+            str(CONTROLLER_PATH),
+            "--host",
+            "110",
+            "--metrics-file",
+            str(metrics_file),
+            "--json",
+        ],
+        capture_output=True,
+        text=True,
+    )
+
+    assert result.returncode == 75
+    payload = json.loads(result.stdout)
+    assert payload["classification"] == "blocked_unknown_sustained_load_requires_source_specific_playbook"
+    assert payload["controlled_apply_allowed"] is False
+    assert "host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
+    assert payload["operation_boundaries"]["process_signal_performed"] is False
+
+
+def test_sustained_load_evidence_emits_sanitized_gitea_recommendation(tmp_path: Path) -> None:
+    ps_file = tmp_path / "ps.txt"
+    ps_file.write_text(
+        "\n".join(
+            [
+                "100 1 100 7200 280.0 1.0 gitea /usr/local/bin/gitea web --config /home/wooo/gitea/app.ini",
+                "200 1 200 180 15.0 0.5 systemd systemctl show gitea-act-runner-host.service",
+            ]
+        ),
+        encoding="utf-8",
+    )
+    docker_file = tmp_path / "docker.prom"
+    docker_file.write_text(
+        'docker_container_cpu_cores{host="110",container_name="gitea"} 3.4\n',
+        encoding="utf-8",
+    )
+
+    result = subprocess.run(
+        [
+            sys.executable,
+            str(EVIDENCE_PATH),
+            "--host",
+            "110",
+            "--ps-file",
+            str(ps_file),
+            "--docker-stats-file",
+            str(docker_file),
+            "--json",
+        ],
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+
+    payload = json.loads(result.stdout)
+    assert payload["schema_version"] == "host_sustained_load_sanitized_evidence_v1"
+    assert payload["recommendation"] == "gitea_queue_or_hook_backlog_playbook"
+    assert payload["redaction"]["raw_command_lines_emitted"] is False
+    assert payload["operation_boundaries"]["host_write_performed"] is False
+    assert "/home/wooo" not in result.stdout
+
+
+def test_sustained_load_controller_routes_unknown_load_to_sanitized_evidence(tmp_path: Path) -> None:
+    metrics_file = tmp_path / "host.prom"
+    metrics_file.write_text(
+        "\n".join(
+            [
+                'awoooi_host_runaway_process_monitor_up{host="110",mode="read_only"} 1',
+                'awoooi_host_load5_per_core{host="110"} 2.4',
+                'awoooi_host_swap_used_ratio{host="110"} 0.1',
+                'awoooi_host_runaway_process_remediation_authorized{host="110"} 0',
+            ]
+        ),
+        encoding="utf-8",
+    )
+
+    result = subprocess.run(
+        [
+            sys.executable,
+            str(CONTROLLER_PATH),
+            "--host",
+            "110",
+            "--metrics-file",
+            str(metrics_file),
+            "--json",
+        ],
+        capture_output=True,
+        text=True,
+    )
+
+    assert result.returncode == 75
+    payload = json.loads(result.stdout)
+    assert (
+        payload["classification"]
+        == "blocked_unknown_sustained_load_requires_source_specific_playbook"
+    )
+    assert payload["controlled_apply_allowed"] is False
+    assert "host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
+    assert payload["operation_boundaries"]["host_write_performed"] is False
+
+
+def test_sustained_load_evidence_sanitizes_process_details(tmp_path: Path) -> None:
+    ps_file = tmp_path / "ps.txt"
+    ps_file.write_text(
+        "\n".join(
+            [
+                "101 1 101 7200 65.0 2.5 chrome /opt/chrome/chrome --headless --user-data-dir=/tmp/stockplatform-review-bulk-ux-aa --url=https://example.invalid/token",
+                "102 1 102 3600 20.0 1.0 node node /srv/private/app/server.js --api-key=SECRET",
+            ]
+        ),
+        encoding="utf-8",
+    )
+    docker_stats_file = tmp_path / "docker.prom"
+    docker_stats_file.write_text(
+        'docker_container_cpu_cores{host="110",container_name="gitea"} 3.2\n',
+        encoding="utf-8",
+    )
+
+    result = subprocess.run(
+        [
+            sys.executable,
+            str(EVIDENCE_PATH),
+            "--host",
+            "110",
+            "--ps-file",
+            str(ps_file),
+            "--docker-stats-file",
+            str(docker_stats_file),
+            "--json",
+        ],
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+
+    payload = json.loads(result.stdout)
+    assert payload["schema_version"] == "host_sustained_load_sanitized_evidence_v1"
+    assert payload["recommendation"] == "gitea_queue_or_hook_backlog_playbook"
+    assert payload["redaction"]["raw_command_lines_emitted"] is False
+    assert payload["redaction"]["workspace_paths_emitted"] is False
+    assert payload["redaction"]["urls_emitted"] is False
+    assert payload["operation_boundaries"]["host_write_performed"] is False
+    assert "https://example.invalid/token" not in result.stdout
+    assert "/tmp/stockplatform-review-bulk-ux-aa" not in result.stdout
+    assert "SECRET" not in result.stdout
+    assert {item["family"] for item in payload["top_process_families"]} >= {
+        "headless_browser",
+        "node_service",
+    }