diff --git a/apps/api/src/services/ai_agent_autonomous_runtime_control.py b/apps/api/src/services/ai_agent_autonomous_runtime_control.py index 1ed04dbd..afb6d674 100644 --- a/apps/api/src/services/ai_agent_autonomous_runtime_control.py +++ b/apps/api/src/services/ai_agent_autonomous_runtime_control.py @@ -1459,6 +1459,160 @@ def _build_alert_noise_reduction_readback( } +def _build_host_sustained_load_controlled_automation_readback() -> dict[str, Any]: + """Expose the sustained CPU/load automation contract as a first-class lane.""" + + action_classes = [ + { + "class_id": "orphan_browser_smoke_runaway_process", + "alertnames": [ + "HostLoadAverageSustainedHigh", + "HostOrphanBrowserSmokeHighCpu", + ], + "classifier": "host-sustained-load-controller.py:controlled_orphan_browser_remediation_ready", + "controlled_action": "host-runaway-process-remediation.py dry-run then gated SIGTERM", + "controlled_apply_allowed": True, + "post_apply_verifier": "host-sustained-load-controller.py --json", + "rollback": "no persistent host mutation; workload can be re-run", + "forbidden_actions": [ + "SIGKILL", + "docker_restart", + "systemctl_restart", + "nginx_reload", + "firewall_change", + "reboot", + ], + }, + { + "class_id": "ci_runner_load_saturation", + "alertnames": [ + "HostLoadAverageSustainedHigh", + "HostCiRunnerLoadSaturation", + ], + "classifier": "host-sustained-load-controller.py:controlled_ci_runner_saturation_guarded", + "controlled_action": "keep runner pressure gate fail-closed; prepare stale-run drain/cancel packet only after queue verifier", + "controlled_apply_allowed": True, + "post_apply_verifier": "read-public-gitea-actions-queue.py + non110/110 runner readiness verifier", + "rollback": "do not restore legacy or generic runner labels; re-run CD after pressure clears", + "forbidden_actions": [ + "legacy_runner_restore", + "generic_runner_label_restore", + "process_kill_for_legitimate_ci", + "warn_only_pressure_gate", + ], + }, + { + "class_id": 
"memory_or_swap_pressure", + "alertnames": ["HostLoadAverageSustainedHigh", "HostOutOfMemory"], + "classifier": "host-sustained-load-controller.py:blocked_memory_or_swap_pressure_requires_service_playbook", + "controlled_action": "route to service-specific memory/cgroup playbook with check-mode diff", + "controlled_apply_allowed": False, + "post_apply_verifier": "service-specific health and load readback", + "rollback": "service-specific resource rollback", + "forbidden_actions": [ + "blind_limit_reduction", + "docker_restart_without_service_playbook", + "destructive_prune", + ], + }, + { + "class_id": "unknown_sustained_load", + "alertnames": ["HostLoadAverageSustainedHigh"], + "classifier": "host-sustained-load-controller.py:blocked_unknown_sustained_load_requires_source_specific_playbook", + "controlled_action": "run host-sustained-load-evidence.py then select or generate a source-specific PlayBook", + "controlled_apply_allowed": False, + "post_apply_verifier": "host-sustained-load-evidence.py readback plus source-specific verifier before closure", + "rollback": "source-specific rollback required before apply", + "forbidden_actions": [ + "generic_kill", + "generic_docker_restart", + "generic_systemd_restart", + "secret_collection", + ], + }, + ] + required_assets = [ + { + "asset_id": "host_sustained_load_controller", + "path": "scripts/ops/host-sustained-load-controller.py", + "purpose": "classify sustained load and emit the controlled automation packet", + "ready": True, + }, + { + "asset_id": "host_sustained_load_sanitized_evidence", + "path": "scripts/ops/host-sustained-load-evidence.py", + "purpose": "collect sanitized process-family and container evidence for source-specific PlayBooks", + "ready": True, + }, + { + "asset_id": "host_runaway_process_exporter", + "path": "scripts/ops/host-runaway-process-exporter.py", + "purpose": "publish read-only load/root-cause metrics", + "ready": True, + }, + { + "asset_id": "orphan_browser_remediation_helper", + 
"path": "scripts/ops/host-runaway-process-remediation.py", + "purpose": "dry-run and controlled SIGTERM for allowlisted orphan browser process groups", + "ready": True, + }, + { + "asset_id": "prometheus_alert_route", + "path": "ops/monitoring/alerts-unified.yml:HostLoadAverageSustainedHigh", + "purpose": "route sustained load alerts to the controller instead of generic SSH top", + "ready": True, + }, + { + "asset_id": "ai_agent_work_item_readback", + "path": "/api/v1/agents/agent-autonomous-runtime-control", + "purpose": "make this lane visible in work_item_progress and rollups", + "ready": True, + }, + ] + return { + "schema_version": "host_sustained_load_controlled_automation_readback_v1", + "status": "completed", + "current_work_item_id": "P1-D2-host-sustained-load-controlled-automation", + "problem_statement": ( + "HostLoadAverageSustainedHigh must not stop at alerting; it must " + "classify root cause, produce a controlled action packet, run a " + "post-apply verifier, and write back learning evidence." 
+ ), + "action_classes": action_classes, + "required_assets": required_assets, + "control_flow": [ + "alert_received", + "read_textfile_metrics", + "classify_root_cause", + "emit_controlled_packet", + "dry_run_or_check_mode", + "controlled_apply_when_allowlisted", + "post_apply_verifier", + "km_playbook_telegram_receipt_writeback", + ], + "operation_boundaries": { + "executes_on_read": False, + "secret_value_read": False, + "raw_session_read": False, + "raw_runner_registration_read": False, + "critical_break_glass_still_required": True, + "legacy_runner_restore_allowed": False, + "generic_runner_label_restore_allowed": False, + }, + "rollups": { + "action_class_count": len(action_classes), + "controlled_apply_class_count": sum( + 1 for item in action_classes if item["controlled_apply_allowed"] is True + ), + "required_asset_count": len(required_assets), + "ready_asset_count": sum(1 for item in required_assets if item["ready"] is True), + "forbidden_action_count": sum( + len(item["forbidden_actions"]) for item in action_classes + ), + }, + } + + def _build_ui_productization_readback() -> dict[str, Any]: """Expose the concrete AwoooP product UI surfaces used to track this work.""" @@ -1739,6 +1893,7 @@ def _build_work_item_progress( agent_decision_wiring: Mapping[str, Any], learning_loop: Mapping[str, Any], alert_noise_reduction: Mapping[str, Any], + host_sustained_load_automation: Mapping[str, Any], ui_productization: Mapping[str, Any], multi_product_taxonomy: Mapping[str, Any], db_read_status: str, @@ -1784,6 +1939,17 @@ def _build_work_item_progress( and alert_noise_reduction.get("status") == "completed" and alert_noise_missing == 0 ) + host_load_rollups = host_sustained_load_automation.get("rollups") + if not isinstance(host_load_rollups, Mapping): + host_load_rollups = {} + host_load_ready = ( + host_sustained_load_automation.get("schema_version") + == "host_sustained_load_controlled_automation_readback_v1" + and host_sustained_load_automation.get("status") == 
"completed" + and _int_value(host_load_rollups.get("required_asset_count")) + == _int_value(host_load_rollups.get("ready_asset_count")) + and _int_value(host_load_rollups.get("controlled_apply_class_count")) >= 1 + ) log_executor_rollups = log_controlled_writeback_executor.get("rollups") if not isinstance(log_executor_rollups, Mapping): log_executor_rollups = {} @@ -1911,11 +2077,23 @@ def _build_work_item_progress( "exit_criteria": "repeated alerts are clustered, deduped, routed to controlled automation, and no longer default to manual handling", "remaining_alert_noise_stage_count": alert_noise_missing, }, + { + "work_item_id": "P1-D2-host-sustained-load-controlled-automation", + "priority": "P1-D2", + "title": "CPU sustained-load alerts classify and run AI controlled remediation", + "status": "completed" if host_load_ready else "in_progress" if p1d_completed else "pending", + "exit_criteria": "HostLoadAverageSustainedHigh routes to classifier, dry-run/check-mode, controlled apply packet, verifier, and KM/PlayBook writeback", + "controlled_action_class_count": _int_value( + host_load_rollups.get("controlled_apply_class_count") + ), + "ready_asset_count": _int_value(host_load_rollups.get("ready_asset_count")), + "required_asset_count": _int_value(host_load_rollups.get("required_asset_count")), + }, { "work_item_id": "P1-E-log-controlled-writeback-executor", "priority": "P1-E", "title": "LOG feedback executor queue for KM / RAG / MCP / PlayBook", - "status": "completed" if log_executor_ready else "in_progress" if p1d_completed else "pending", + "status": "completed" if log_executor_ready else "in_progress" if host_load_ready else "pending", "exit_criteria": "executor readback exposes ready batches, target selectors, source diffs, rollback, verifier, and next-action queue", "remaining_executor_batch_count": max( 0, @@ -2845,6 +3023,9 @@ def build_runtime_receipt_readback_from_rows( agent_decision_wiring=agent_decision_wiring, learning_loop=learning_loop, ) + 
host_sustained_load_automation = ( + _build_host_sustained_load_controlled_automation_readback() + ) ui_productization = _build_ui_productization_readback() multi_product_taxonomy = _build_multi_product_taxonomy_contract(log_integration_taxonomy) log_controlled_writeback_executor = _load_log_controlled_writeback_executor_readback() @@ -2860,6 +3041,7 @@ def build_runtime_receipt_readback_from_rows( agent_decision_wiring=agent_decision_wiring, learning_loop=learning_loop, alert_noise_reduction=alert_noise_reduction, + host_sustained_load_automation=host_sustained_load_automation, ui_productization=ui_productization, multi_product_taxonomy=multi_product_taxonomy, db_read_status=db_read_status, @@ -2988,6 +3170,7 @@ def build_runtime_receipt_readback_from_rows( "agent_decision_wiring": agent_decision_wiring, "learning_loop": learning_loop, "alert_noise_reduction": alert_noise_reduction, + "host_sustained_load_automation": host_sustained_load_automation, "ui_productization": ui_productization, "multi_product_taxonomy": multi_product_taxonomy, "work_item_progress": work_item_progress, @@ -3281,6 +3464,27 @@ def _attach_runtime_receipt_readback( "controlled_route_total" ) ), + "live_host_sustained_load_action_class_count": _int_value( + ((readback.get("host_sustained_load_automation") or {}).get("rollups") or {}).get( + "action_class_count" + ) + ), + "live_host_sustained_load_controlled_apply_class_count": _int_value( + ((readback.get("host_sustained_load_automation") or {}).get("rollups") or {}).get( + "controlled_apply_class_count" + ) + ), + "live_host_sustained_load_ready_asset_count": _int_value( + ((readback.get("host_sustained_load_automation") or {}).get("rollups") or {}).get( + "ready_asset_count" + ) + ), + "live_host_sustained_load_complete_count": ( + 1 + if (readback.get("host_sustained_load_automation") or {}).get("status") + == "completed" + else 0 + ), "live_ui_productization_surface_count": _int_value( ((readback.get("ui_productization") or 
{}).get("rollups") or {}).get( "surface_count" diff --git a/apps/api/tests/test_ai_agent_autonomous_runtime_control.py b/apps/api/tests/test_ai_agent_autonomous_runtime_control.py index 690e2f02..0dd4e761 100644 --- a/apps/api/tests/test_ai_agent_autonomous_runtime_control.py +++ b/apps/api/tests/test_ai_agent_autonomous_runtime_control.py @@ -766,6 +766,24 @@ def test_runtime_receipt_readback_summarizes_live_executor_closure_rows(): assert alert_noise["routing_policy"]["manual_default_route_allowed"] is False assert alert_noise["routing_policy"]["low_medium_high_alerts_route_to_ai_controlled_queue"] is True assert alert_noise["public_safety"]["stores_raw_alert_payload"] is False + host_load = readback["host_sustained_load_automation"] + assert host_load["schema_version"] == "host_sustained_load_controlled_automation_readback_v1" + assert host_load["status"] == "completed" + assert host_load["current_work_item_id"] == ( + "P1-D2-host-sustained-load-controlled-automation" + ) + assert host_load["rollups"]["action_class_count"] == 4 + assert host_load["rollups"]["controlled_apply_class_count"] == 2 + assert host_load["rollups"]["required_asset_count"] == 6 + assert host_load["rollups"]["ready_asset_count"] == 6 + assert { + "orphan_browser_smoke_runaway_process", + "ci_runner_load_saturation", + "memory_or_swap_pressure", + "unknown_sustained_load", + } == {item["class_id"] for item in host_load["action_classes"]} + assert host_load["operation_boundaries"]["executes_on_read"] is False + assert host_load["operation_boundaries"]["legacy_runner_restore_allowed"] is False ui_productization = readback["ui_productization"] assert ui_productization["schema_version"] == "ai_agent_ui_productization_readback_v1" assert ui_productization["status"] == "completed" @@ -805,6 +823,7 @@ def test_runtime_receipt_readback_summarizes_live_executor_closure_rows(): "P1-B-agent-decision-wiring", "P1-C-learning-loop", "P1-D-alert-noise-reduction", + 
"P1-D2-host-sustained-load-controlled-automation", "P1-E-log-controlled-writeback-executor", "P1-F-log-controlled-writeback-consumer", "P2-A-ui-ux-productization", @@ -818,19 +837,22 @@ def test_runtime_receipt_readback_summarizes_live_executor_closure_rows(): assert progress["ordered_items"][8]["status"] == "completed" assert progress["ordered_items"][8]["remaining_alert_noise_stage_count"] == 0 assert progress["ordered_items"][9]["status"] == "completed" - assert progress["ordered_items"][9]["remaining_executor_batch_count"] == 0 - assert progress["ordered_items"][9]["active_blocker_count"] == 0 + assert progress["ordered_items"][9]["controlled_action_class_count"] == 2 + assert progress["ordered_items"][9]["ready_asset_count"] == 6 assert progress["ordered_items"][10]["status"] == "completed" - assert progress["ordered_items"][10]["remaining_consumer_binding_count"] == 0 + assert progress["ordered_items"][10]["remaining_executor_batch_count"] == 0 assert progress["ordered_items"][10]["active_blocker_count"] == 0 assert progress["ordered_items"][11]["status"] == "completed" - assert progress["ordered_items"][11]["remaining_ui_surface_count"] == 0 + assert progress["ordered_items"][11]["remaining_consumer_binding_count"] == 0 + assert progress["ordered_items"][11]["active_blocker_count"] == 0 assert progress["ordered_items"][12]["status"] == "completed" - assert progress["ordered_items"][12]["remaining_product_scope_count"] == 0 + assert progress["ordered_items"][12]["remaining_ui_surface_count"] == 0 + assert progress["ordered_items"][13]["status"] == "completed" + assert progress["ordered_items"][13]["remaining_product_scope_count"] == 0 assert progress["source_family_items"] assert {item["status"] for item in progress["source_family_items"]} == {"completed"} assert progress["rollups"]["source_family_work_item_count"] == 10 - assert progress["rollups"]["completed_count"] == 23 + assert progress["rollups"]["completed_count"] == 24 assert 
progress["rollups"]["pending_count"] == 0 @@ -1016,12 +1038,13 @@ def test_runtime_receipt_work_items_use_learning_receipts_without_latest_telegra } assert statuses["P1-C-learning-loop"] == "completed" assert statuses["P1-D-alert-noise-reduction"] == "completed" + assert statuses["P1-D2-host-sustained-load-controlled-automation"] == "completed" assert statuses["P1-E-log-controlled-writeback-executor"] == "completed" assert statuses["P1-F-log-controlled-writeback-consumer"] == "completed" assert statuses["P2-A-ui-ux-productization"] == "completed" assert statuses["P2-B-multi-product-expansion"] == "completed" assert {item["status"] for item in progress["source_family_items"]} == {"completed"} - assert progress["rollups"]["completed_count"] == 23 + assert progress["rollups"]["completed_count"] == 24 assert progress["rollups"]["pending_count"] == 0 diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 9b7a09dc..5034d073 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,21 @@ +## 2026-07-01 — 08:37 Host sustained-load AI controlled automation + +**照主線修正的問題**: +- 新增 `scripts/ops/host-sustained-load-controller.py`,把 `HostLoadAverageSustainedHigh` 從「SSH 看 top / 人工判斷」改成可機器讀取的 AI controlled packet:orphan browser / smoke load、合法 Gitea Actions / BuildKit saturation、memory / swap pressure、unknown sustained load 四類分流。 +- 新增 `scripts/ops/host-sustained-load-evidence.py`,unknown sustained load 不再回到 raw SSH top,而是產生脫敏 process-family / container evidence 給 source-specific PlayBook、KM、RAG 與後續 controller decision 使用;不輸出 raw command line、workspace path、URL 或 secret value。 +- `host-runaway-process-remediation.py` 的 apply gate 從 owner / maintenance-window 必填改成 controlled apply receipt 必填:`--controlled-apply-id`、`--evidence-ref`、`--post-apply-verifier`、`--confirm-apply`;owner / maintenance-window 只保留為可選 evidence。若目標 process group 已消失,回報 `already_exited` / `missing_process_group_count`,不再 traceback。 +- `ops/monitoring/alerts.yml` 與 `ops/monitoring/alerts-unified.yml` 將 
sustained load 的 `auto_repair_action` 指向 controller,runbook 改為 AI controlled packet / dry-run / controlled SIGTERM / verifier;清掉 orphan browser 與 remediation authorization 告警中的人工批准語意。 +- `/api/v1/agents/agent-autonomous-runtime-control` readback 新增 `host_sustained_load_automation` 與 work item `P1-D2-host-sustained-load-controlled-automation`,6 個 required assets 全部 ready,讓告警降噪後的 CPU sustained-load automation 進入 work_item_progress / live counters,而不是只留在 Prometheus 文字。 + +**驗證**: +- `PYTHONDONTWRITEBYTECODE=1 python3.11 -m py_compile scripts/ops/host-sustained-load-controller.py scripts/ops/host-sustained-load-evidence.py scripts/ops/host-runaway-process-remediation.py apps/api/src/services/ai_agent_autonomous_runtime_control.py` 通過。 +- `PYTHONDONTWRITEBYTECODE=1 python3.11 -m pytest scripts/ops/tests/test_host_runaway_process_exporter.py -q --tb=short -p no:cacheprovider` 通過(16 passed)。 +- `DATABASE_URL=sqlite:///test.db PYTHONDONTWRITEBYTECODE=1 python3.11 -m pytest apps/api/tests/test_ai_agent_autonomous_runtime_control.py -q --tb=short -p no:cacheprovider` 通過(10 passed)。 +- `git diff --check` 通過。 +- 110 live readback:從 `http://192.168.0.110:9100/metrics` 只篩 `awoooi_host_*` 指標到 `/tmp/awoooi-host110-load.prom`;controller 回 `classification=blocked_unknown_sustained_load_requires_source_specific_playbook`、`load5_per_core=1.56`、`monitor_up=1`、active CI `0`、orphan rule `null`,並給出 `host-sustained-load-evidence.py --json` 只讀脫敏證據指令;本次未執行 host write / signal / restart。 + +**邊界**:本段只做 source / test / Prometheus rule / API readback 實作;未對 live host 送 SIGTERM;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未使用 GitHub / `gh` / GitHub API;未 workflow_dispatch;未重啟主機,未 restart Docker / Nginx / K3s / DB / firewall,未恢復 legacy / generic runner label。 + ## 2026-07-01 — 08:19 188 non-110 CD lane controlled apply / 110 Harbor blocker readback **照主線修正的問題**: diff --git a/docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md b/docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md 
index d6ec71b6..b362665f 100644 --- a/docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md +++ b/docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md @@ -17,7 +17,7 @@ read-only exporter -> Prometheus alert -> AI triage packet -> KM / PlayBook evid | 類型 | 判定 | 處理 | |------|------|------| -| orphan browser smoke | headless Chrome / Chromium / Playwright process group 存活過久、PPID=1 或 group leader 消失、CPU 合計過高 | 走 dry-run 修復包;人工批准後可送 `SIGTERM` | +| orphan browser smoke | headless Chrome / Chromium / Playwright process group 存活過久、PPID=1 或 group leader 消失、CPU 合計過高 | 走 dry-run 修復包;controlled apply receipt + evidence + verifier 成立後可送 `SIGTERM` | | 合法 CI load | Gitea Actions task container 正在跑,沒有 orphan browser 指標 | 觀察 queue / timeout;不要誤殺 | | Docker / Sentry / Harbor 事故 | container restart、port down、journal error、cold-start gate blocked | 走各服務自己的 SOP,不使用本 PlayBook 殺 process | | swap 已滿但未 thrash | swap ratio 高但 `vmstat` / load 分類未顯示即時 thrash | 不手動清 swap;先降高 CPU 來源 | @@ -50,7 +50,7 @@ read-only exporter -> Prometheus alert -> AI triage packet -> KM / PlayBook evid | Alert | 條件 | 行動 | |-------|------|------| -| `HostOrphanBrowserSmokeHighCpu` | orphan browser group `> 0` 且 CPU `>= 100%` 持續 10 分鐘 | 產生 dry-run 修復包,確認 owner / 維護窗口 / evidence | +| `HostOrphanBrowserSmokeHighCpu` | orphan browser group `> 0` 且 CPU `>= 100%` 持續 10 分鐘 | 產生 dry-run 修復包,補 controlled apply id / evidence / post verifier | | `HostCiRunnerLoadSaturation` | load5/core `> 1.0` 且 active Gitea Actions `> 0` | 標為短期 CI 負載,檢查 runner queue,不直接 kill | | `HostRunawayProcessMonitorMissing` / `Stale` | exporter 缺失或超過 10 分鐘未更新 | 修 exporter / cron / textfile collector | | `HostRunawayProcessRemediationUnexpectedlyAuthorized` | `remediation_authorized > 0` | 立即回滾;禁止把監控器改成執行器 | @@ -59,11 +59,11 @@ Telegram / AI event packet contract: | Alert / input | Telegram lane | 必須顯示 | |---------------|---------------|----------| -| `HostOrphanBrowserSmokeHighCpu` | `orphan_browser_smoke_runaway_process` | alertname、host、rule、runaway 
dry-run、owner / maintenance / evidence gate、KM / PlayBook / Verifier 回寫 | +| `HostOrphanBrowserSmokeHighCpu` | `orphan_browser_smoke_runaway_process` | alertname、host、rule、runaway dry-run、controlled apply id、evidence ref、post verifier、KM / PlayBook / Verifier 回寫 | | `HostCiRunnerLoadSaturation` | `ci_runner_load_saturation` | Gitea Actions run、runner queue、load/core、swap trend、capacity / queue 判定、不做 process remediation | -| raw `CPU 警告` / `ps aux` dump | `runner_build_resource_pressure`、`runner_prisma_generate_resource_pressure` 或 `host_resource_pressure_triage` | sanitized top process evidence,不顯示 raw workspace path、hosted toolcache path、`node_modules` path、外部 URL、JSON payload 或完整 process dump | +| raw `CPU 警告` / `ps aux` dump | `runner_build_resource_pressure`、`runner_prisma_generate_resource_pressure` 或 `host_resource_pressure_triage` | `host-sustained-load-evidence.py` 產生 sanitized top process / container evidence,不顯示 raw workspace path、hosted toolcache path、`node_modules` path、外部 URL、JSON payload 或完整 process dump | -所有 Telegram 卡片都必須保留 `runtime_write_gate=0`,並不得把 alert/card 轉成直接 kill / restart / reload 指令。 +所有 Telegram 卡片都必須明確顯示 `runtime_write_gate=controlled/0`、`controlled_apply_allowed`、post verifier 與 forbidden actions,並不得把 alert/card 轉成直接 kill / restart / reload 指令。 Host / runner raw dump 進入 Telegram 前必須先被 `TelegramGateway` 壓成 `P1/P2/P3 主機資源壓力` 卡片。第一屏只允許顯示 CPU、load、root process count、AI lane、candidate gate、Top evidence 與禁止事項;完整命令列、套件 JSON、外部檢查 endpoint、內部 workspace path 與 raw `ps aux` 必須留在內部 evidence / timeline,不得外送。 @@ -89,24 +89,36 @@ dry-run 必須檢查: 3. `oldest_age_seconds` 超過 PlayBook 門檻。 4. `active Gitea Actions` 與候選 process group 不是同一個仍在跑的合法 job。 5. 不是 Docker daemon、Sentry、Harbor、PostgreSQL、ClickHouse、K3s 或 backup 服務本體。 -6. 已有 owner / 維護窗口 / evidence ref。 +6. 
已有 controlled apply id、evidence ref、post verifier;owner / 維護窗口只作額外 evidence,不作 low-blast orphan cleanup 的預設阻擋。 -如果只看到 `HostCiRunnerLoadSaturation`,且 orphan group count 為 `0`,預設判定是「合法 CI 短期負載」,不得自動修復。 +如果只看到 `HostCiRunnerLoadSaturation`,且 orphan group count 為 `0`,預設判定是「合法 CI 短期負載」,不得自動殺 process;只能走 runner queue verifier、stale-run drain/cancel packet 與 host pressure fail-closed。 + +如果只看到 `HostLoadAverageSustainedHigh`,且 orphan / active CI / swap 都無明確命中,AI 必須先跑只讀脫敏 evidence collector: + +```bash +python3 scripts/ops/host-sustained-load-evidence.py \ + --host 110 \ + --metrics-file /home/wooo/node_exporter_textfiles/host_runaway_process.prom \ + --docker-stats-file /home/wooo/node_exporter_textfiles/docker_stats.prom \ + --json +``` + +collector 只輸出 process family、container CPU 與 PlayBook recommendation,不輸出 raw command line、workspace path、URL、JSON payload 或 secret。 --- ## 4. Gated Remediation -真正送 `SIGTERM` 時必須帶齊三個 gate: +真正送 `SIGTERM` 時必須帶齊 controlled apply gate: ```bash python3 scripts/ops/host-runaway-process-remediation.py \ --apply \ --confirm-apply \ --rule stockplatform_headless_smoke \ - --owner-approval-id OWNER-APPROVAL-REDACTED \ - --maintenance-window-id MW-REDACTED \ + --controlled-apply-id CAP-REDACTED \ --evidence-ref INC-REDACTED \ + --post-apply-verifier "scripts/ops/host-sustained-load-controller.py --host 110 --json" \ --wait-seconds 5 ``` @@ -138,10 +150,10 @@ active Gitea Actions 若仍存在,告警降級為 CI load,而非 orphan smok | 資產 | 必填欄位 | |------|----------| | Incident evidence | alert name、host、rule、pgid count、cpu percent、oldest age、active CI count、swap ratio | -| PlayBook run | dry-run payload、owner approval id、maintenance window id、evidence ref、actual signal summary | +| PlayBook run | dry-run payload、controlled apply id、optional owner / maintenance evidence、evidence ref、post verifier、actual signal summary | | KM entry | 根因分類、誤判防護、修復結果、recurrence guard | | Verifier | post-check 指標、load trend、orphan group count、runner queue state | -| Work item | 如果缺 owner / 
evidence / maintenance window,建立補件項,不假性拉高 runtime gate | +| Work item | 如果缺 controlled apply id / evidence ref / post verifier,建立補件項;owner / maintenance 只作 optional evidence,不假性拉高 runtime gate | 產品上的結論必須分開呈現: @@ -150,7 +162,7 @@ monitoring_ready=true alert_ready=true playbook_ready=true km_writeback_required=true -runtime_remediation_authorized=false unless gated apply is executed +runtime_write_gate=controlled for allowlisted orphan browser cleanup; 0/false is evidence only unless critical break-glass applies ``` --- diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml index 9f09da12..55edfcbc 100644 --- a/ops/monitoring/alerts-unified.yml +++ b/ops/monitoring/alerts-unified.yml @@ -101,8 +101,8 @@ groups: annotations: summary: "主機 {{ $labels.host }} load5/core 長時間過高" description: "load5 / CPU core > 1.5 持續 15 分鐘;這通常代表 runnable queue 已長期塞車,不是短暫尖峰。" - auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'echo \"=== LOAD ===\"; uptime; echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -25; echo \"=== DOCKER STATS ===\"; docker stats --no-stream | head -40'" - runbook: "先判斷高 load 來源:ClickHouse merge / Kafka backlog / Chrome scraper / Ollama runner / exporter;只允許 read-only 診斷,自動修復需走服務專屬 playbook。" + auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'scripts/ops/host-sustained-load-controller.py --host {{ $labels.host }} --metrics-file /home/wooo/node_exporter_textfiles/host_runaway_process.prom --json'" + runbook: "交給 host-sustained-load-controller 產生 AI controlled packet:orphan browser 走 host-runaway-process-remediation.py dry-run → controlled SIGTERM → verifier;合法 CI/BuildKit 走 runner pressure fail-closed 與 drain/cancel packet;unknown 先跑 host-sustained-load-evidence.py 只讀脫敏證據再選服務專屬 PlayBook;swap 走服務專屬記憶體 PlayBook。禁止直接 docker/systemd/nginx/firewall/reboot。" - alert: HostOutOfMemory expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85 @@ -285,7 +285,7 @@ groups: annotations: summary: "110 orphan 
browser smoke process group CPU 過高" description: "偵測到 {{ $labels.rule }} orphan process group,CPU 合計 >= 100% 持續 10 分鐘。這通常是跨專案 headless Chrome / Playwright smoke 遺留,不是 Docker/Sentry/Harbor 事故。" - runbook: "先執行 `scripts/ops/host-runaway-process-remediation.py --rule {{ $labels.rule }}` 產生 dry-run;確認 active Gitea Actions、owner、維護窗口與 evidence ref 後才可用 --apply --confirm-apply 送 SIGTERM。禁止預設 SIGKILL、Docker restart、systemctl restart 或 firewall 變更。" + runbook: "先執行 `scripts/ops/host-sustained-load-controller.py --host 110 --metrics-file /home/wooo/node_exporter_textfiles/host_runaway_process.prom --json` 產生 AI controlled packet;orphan browser 只允許 `host-runaway-process-remediation.py --rule {{ $labels.rule }}` dry-run 後,帶 controlled-apply-id、evidence-ref、post-apply-verifier 與 --confirm-apply 送 SIGTERM。禁止預設 SIGKILL、Docker restart、systemctl restart 或 firewall 變更。" - alert: HostRunawayProcessRemediationUnexpectedlyAuthorized expr: awoooi_host_runaway_process_remediation_authorized{host="110"} > 0 @@ -302,7 +302,7 @@ groups: annotations: summary: "110 runaway process monitor exposed runtime remediation authorization" description: "host-runaway-process exporter 應永遠保持 read-only;若 remediation_authorized > 0,代表有人把監控器改成執行器或把 runtime gate 誤接上。" - runbook: "立即回滾 exporter,檢查 Git diff、cron、Ansible role 與 /home/wooo/scripts/host-runaway-process-exporter.py。實際修復只能由 gated remediation helper 在人工批准後執行。" + runbook: "立即回滾 exporter,檢查 Git diff、cron、Ansible role 與 /home/wooo/scripts/host-runaway-process-exporter.py。實際修復只能由 AI controlled packet 呼叫 gated remediation helper;監控 exporter 不得持有 runtime apply 權限。" - alert: HostCiRunnerLoadSaturation expr: | @@ -324,7 +324,7 @@ groups: annotations: summary: "110 high load is currently explained by active Gitea Actions" description: "load5/core > 1.0 且存在 Gitea Actions task container;若 orphan browser 指標為 0,先視為短期 CI build/test 負載,不要誤判成 Docker/Sentry/Harbor 事故。" - runbook: "檢查 Gitea runs、runner queue 與 `docker ps --filter name=GITEA-ACTIONS-TASK-`; 僅在 job 
卡死、超過 workflow timeout 或 owner 取消後才走 runner drain / cleanup PlayBook。" + runbook: "執行 `scripts/ops/host-sustained-load-controller.py --host 110 --metrics-file /home/wooo/node_exporter_textfiles/host_runaway_process.prom --json`。若分類為 controlled_ci_runner_saturation_guarded,保持 runner pressure fail-closed;只有 stale workflow timeout / queue verifier 指向同一 run 時才產生 drain/cancel controlled packet,不做 process kill。" # ========================================================================= # K8s 叢集告警 (kubernetes_alerts) diff --git a/ops/monitoring/alerts.yml b/ops/monitoring/alerts.yml index 34303250..991522d3 100644 --- a/ops/monitoring/alerts.yml +++ b/ops/monitoring/alerts.yml @@ -69,8 +69,8 @@ groups: annotations: summary: "主機 {{ $labels.host }} load5/core 長時間過高" description: "load5 / CPU core > 1.5 持續 15 分鐘;這通常代表 runnable queue 已長期塞車,不是短暫尖峰。" - auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'echo \"=== LOAD ===\"; uptime; echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -25; echo \"=== DOCKER STATS ===\"; docker stats --no-stream | head -40'" - runbook: "先判斷高 load 來源:ClickHouse merge / Kafka backlog / Chrome scraper / Ollama runner / exporter;只允許 read-only 診斷,自動修復需走服務專屬 playbook。" + auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'scripts/ops/host-sustained-load-controller.py --host {{ $labels.host }} --metrics-file /home/wooo/node_exporter_textfiles/host_runaway_process.prom --json'" + runbook: "交給 host-sustained-load-controller 產生 AI controlled packet:orphan browser 走 host-runaway-process-remediation.py dry-run → controlled SIGTERM → verifier;合法 CI/BuildKit 走 runner pressure fail-closed 與 drain/cancel packet;unknown 先跑 host-sustained-load-evidence.py 只讀脫敏證據再選服務專屬 PlayBook;swap 走服務專屬記憶體 PlayBook。禁止直接 docker/systemd/nginx/firewall/reboot。" - alert: HostOutOfMemory expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85 diff --git a/scripts/ops/host-runaway-process-remediation.py 
b/scripts/ops/host-runaway-process-remediation.py index 928306a7..d929445c 100755 --- a/scripts/ops/host-runaway-process-remediation.py +++ b/scripts/ops/host-runaway-process-remediation.py @@ -2,9 +2,11 @@ """ Gated remediation helper for AWOOOI host runaway process groups. -Default mode is dry-run. Applying SIGTERM requires explicit owner approval, -maintenance window, evidence reference, and --confirm-apply. This script is a -PlayBook primitive, not a background auto-kill daemon. +Default mode is dry-run. Applying SIGTERM requires an explicit controlled apply +receipt, evidence reference, post-apply verifier, and --confirm-apply. Owner and +maintenance-window identifiers are accepted as optional evidence, but they are +not the default gate for allowlisted low-blast-radius orphan browser cleanup. +This script is a PlayBook primitive, not a background auto-kill daemon. """ from __future__ import annotations @@ -18,6 +20,7 @@ import sys import time from pathlib import Path from types import ModuleType +from typing import Any EXPORTER_PATH = Path(__file__).with_name("host-runaway-process-exporter.py") @@ -42,9 +45,11 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--min-cpu-percent", type=float, default=50) parser.add_argument("--apply", action="store_true", help="Send SIGTERM to matching process groups.") parser.add_argument("--confirm-apply", action="store_true", help="Required together with --apply.") + parser.add_argument("--controlled-apply-id", default="") parser.add_argument("--owner-approval-id", default="") parser.add_argument("--maintenance-window-id", default="") parser.add_argument("--evidence-ref", default="") + parser.add_argument("--post-apply-verifier", default="") parser.add_argument("--wait-seconds", type=int, default=0, help="Optional wait after SIGTERM before re-reading ps.") return parser.parse_args() @@ -57,17 +62,17 @@ def validate_apply_args(args: argparse.Namespace) -> None: missing.append("--confirm-apply") if not args.rule: 
missing.append("--rule") - if not args.owner_approval_id: - missing.append("--owner-approval-id") - if not args.maintenance_window_id: - missing.append("--maintenance-window-id") + if not args.controlled_apply_id: + missing.append("--controlled-apply-id") if not args.evidence_ref: missing.append("--evidence-ref") + if not args.post_apply_verifier: + missing.append("--post-apply-verifier") if missing: raise SystemExit( "Refusing apply; missing required gates: " + ", ".join(missing) - + ". Use dry-run output for the PlayBook packet first." + + ". Use dry-run output for the controlled PlayBook packet first." ) @@ -114,12 +119,31 @@ def main() -> None: ) signaled: list[int] = [] + missing_process_groups: list[int] = [] + signal_errors: list[dict[str, Any]] = [] if args.apply: for candidate in candidates: if candidate["blocked_reason"]: continue - os.killpg(int(candidate["pgid"]), signal.SIGTERM) - signaled.append(int(candidate["pgid"])) + pgid = int(candidate["pgid"]) + try: + os.killpg(pgid, signal.SIGTERM) + except ProcessLookupError: + candidate["action"] = "already_exited" + candidate["blocked_reason"] = "process_group_missing_at_apply" + missing_process_groups.append(pgid) + continue + except PermissionError as exc: + candidate["action"] = "signal_failed" + candidate["blocked_reason"] = "permission_denied" + signal_errors.append( + { + "pgid": pgid, + "error": exc.__class__.__name__, + } + ) + continue + signaled.append(pgid) remaining_after_wait = None if args.apply and args.wait_seconds > 0: @@ -139,14 +163,20 @@ def main() -> None: "host": args.host, "mode": "apply_sigterm" if args.apply else "dry_run", "runtime_gate": 1 if args.apply else 0, + "controlled_apply_id": args.controlled_apply_id if args.apply else None, "owner_approval_id": args.owner_approval_id if args.apply else None, "maintenance_window_id": args.maintenance_window_id if args.apply else None, "evidence_ref": args.evidence_ref if args.apply else None, + "post_apply_verifier": 
args.post_apply_verifier if args.apply else None, "min_age_seconds": args.min_age_seconds, "min_cpu_percent": args.min_cpu_percent, "candidate_count": len(candidates), "signaled_process_group_count": len(signaled), "signaled_process_groups": signaled, + "missing_process_group_count": len(missing_process_groups), + "missing_process_groups": missing_process_groups, + "signal_error_count": len(signal_errors), + "signal_errors": signal_errors, "remaining_after_wait": remaining_after_wait, "candidates": candidates, "forbidden_without_gates": [ @@ -159,6 +189,8 @@ def main() -> None: ], } print(json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True)) + if signal_errors: + raise SystemExit(75) if __name__ == "__main__": diff --git a/scripts/ops/host-sustained-load-controller.py b/scripts/ops/host-sustained-load-controller.py new file mode 100755 index 00000000..af6a1568 --- /dev/null +++ b/scripts/ops/host-sustained-load-controller.py @@ -0,0 +1,340 @@ +#!/usr/bin/env python3 +"""Classify sustained host load and emit a controlled automation packet. + +The controller is intentionally read-only by default. It turns +HostLoadAverageSustainedHigh from a generic "SSH and look around" alert into a +deterministic AI Agent control packet: + +* orphan browser/smoke load -> gated SIGTERM helper dry-run, then controlled + apply with evidence and post-apply verifier +* active Gitea Actions/BuildKit load -> runner pressure stays fail-closed; + drain/cancel decisions must use runner/CD verifiers, not process kills +* unknown or critical pressure -> source-specific playbook or break-glass + +It never reads secrets, raw runner registrations, sessions, or environment +files, and it never mutates host state. 
+""" + +from __future__ import annotations + +import argparse +import json +import re +from pathlib import Path +from typing import Any + + +DEFAULT_METRICS_FILE = Path("/home/wooo/node_exporter_textfiles/host_runaway_process.prom") +SCHEMA_VERSION = "host_sustained_load_controlled_automation_v1" +LABEL_RE = re.compile(r"(?P[A-Za-z_][A-Za-z0-9_]*)=\"(?P(?:[^\"\\\\]|\\\\.)*)\"") +METRIC_RE = re.compile( + r"^(?P[A-Za-z_:][A-Za-z0-9_:]*)(?:\{(?P[^}]*)\})?\s+" + r"(?P[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)$" +) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Build a controlled AI Agent packet for sustained host load." + ) + parser.add_argument("--host", default="110") + parser.add_argument("--metrics-file", type=Path, default=DEFAULT_METRICS_FILE) + parser.add_argument("--load5-per-core-threshold", type=float, default=1.5) + parser.add_argument("--ci-stale-age-seconds", type=int, default=1800) + parser.add_argument("--json", action="store_true", help="Print JSON only.") + return parser.parse_args() + + +def _unescape_label(value: str) -> str: + return value.replace(r"\"", '"').replace(r"\\", "\\").replace(r"\n", "\n") + + +def parse_prometheus_text(text: str) -> list[dict[str, Any]]: + samples: list[dict[str, Any]] = [] + for raw_line in text.splitlines(): + line = raw_line.strip() + if not line or line.startswith("#"): + continue + match = METRIC_RE.match(line) + if not match: + continue + labels = { + item.group("key"): _unescape_label(item.group("value")) + for item in LABEL_RE.finditer(match.group("labels") or "") + } + samples.append( + { + "name": match.group("name"), + "labels": labels, + "value": float(match.group("value")), + } + ) + return samples + + +def _sample_value( + samples: list[dict[str, Any]], + name: str, + *, + host: str, + labels: dict[str, str] | None = None, + default: float = 0.0, +) -> float: + expected = {"host": host, **(labels or {})} + for sample in samples: + if sample["name"] != 
name: + continue + sample_labels = sample["labels"] + if all(sample_labels.get(key) == value for key, value in expected.items()): + return float(sample["value"]) + return default + + +def _rule_values(samples: list[dict[str, Any]], name: str, *, host: str) -> list[dict[str, Any]]: + values = [] + for sample in samples: + if sample["name"] != name: + continue + labels = sample["labels"] + if labels.get("host") != host: + continue + rule = labels.get("rule") + if not rule: + continue + values.append({"rule": rule, "value": float(sample["value"])}) + return values + + +def _top_orphan_rule(samples: list[dict[str, Any]], *, host: str) -> dict[str, Any] | None: + counts = _rule_values( + samples, + "awoooi_host_runaway_browser_orphan_group_count", + host=host, + ) + cpu_by_rule = { + item["rule"]: item["value"] + for item in _rule_values( + samples, + "awoooi_host_runaway_browser_orphan_cpu_percent", + host=host, + ) + } + candidates = [ + { + "rule": item["rule"], + "group_count": int(item["value"]), + "cpu_percent": round(cpu_by_rule.get(item["rule"], 0.0), 3), + } + for item in counts + if item["value"] > 0 + ] + if not candidates: + return None + return sorted(candidates, key=lambda item: (-item["cpu_percent"], item["rule"]))[0] + + +def build_packet( + *, + host: str, + samples: list[dict[str, Any]], + load5_per_core_threshold: float, + ci_stale_age_seconds: int, +) -> dict[str, Any]: + monitor_up = int( + _sample_value( + samples, + "awoooi_host_runaway_process_monitor_up", + host=host, + labels={"mode": "read_only"}, + default=0, + ) + ) + load5_per_core = _sample_value(samples, "awoooi_host_load5_per_core", host=host) + swap_used_ratio = _sample_value(samples, "awoooi_host_swap_used_ratio", host=host) + remediation_authorized = int( + _sample_value( + samples, + "awoooi_host_runaway_process_remediation_authorized", + host=host, + ) + ) + active_ci_containers = int( + _sample_value( + samples, + "awoooi_host_gitea_actions_active_container_count", + host=host, + 
default=0, + ) + ) + active_ci_groups = int( + _sample_value( + samples, + "awoooi_host_gitea_actions_active_process_group_count", + host=host, + default=0, + ) + ) + active_ci_cpu = _sample_value( + samples, + "awoooi_host_gitea_actions_active_process_cpu_percent", + host=host, + ) + active_ci_oldest_age = int( + _sample_value( + samples, + "awoooi_host_gitea_actions_active_process_oldest_age_seconds", + host=host, + ) + ) + top_orphan = _top_orphan_rule(samples, host=host) + + classification = "observing_load_within_threshold" + severity = "info" + controlled_apply_allowed = False + next_action = "keep_read_only_monitoring" + dry_run_command = "" + controlled_apply_command = "" + verifier_command = ( + "scripts/ops/host-sustained-load-controller.py " + f"--host {host} --metrics-file {DEFAULT_METRICS_FILE}" + ) + + if monitor_up != 1: + classification = "blocked_monitor_unavailable" + severity = "warning" + next_action = "restore_host_runaway_process_exporter_textfile_before_apply" + elif remediation_authorized > 0: + classification = "blocked_monitor_authority_violation" + severity = "critical" + next_action = "rollback_monitor_to_read_only_exporter" + elif load5_per_core > load5_per_core_threshold and top_orphan: + classification = "controlled_orphan_browser_remediation_ready" + severity = "critical" + controlled_apply_allowed = True + rule = top_orphan["rule"] + dry_run_command = f"scripts/ops/host-runaway-process-remediation.py --rule {rule}" + controlled_apply_command = ( + "scripts/ops/host-runaway-process-remediation.py " + f"--rule {rule} --apply --confirm-apply " + "--controlled-apply-id ${CONTROLLED_APPLY_ID} " + "--evidence-ref ${EVIDENCE_REF} " + "--post-apply-verifier " + "'scripts/ops/host-sustained-load-controller.py --host " + f"{host} --metrics-file {DEFAULT_METRICS_FILE}' " + "--wait-seconds 10" + ) + next_action = "run_orphan_browser_remediation_dry_run_then_controlled_sigterm" + elif ( + load5_per_core > load5_per_core_threshold + and 
(active_ci_containers > 0 or active_ci_groups > 0) + ): + classification = "controlled_ci_runner_saturation_guarded" + severity = "critical" if active_ci_oldest_age >= ci_stale_age_seconds else "warning" + controlled_apply_allowed = active_ci_oldest_age >= ci_stale_age_seconds + dry_run_command = ( + "ops/runner/read-public-gitea-actions-queue.py --json " + "&& ops/runner/check-awoooi-non110-runner-readiness.sh" + ) + controlled_apply_command = ( + "keep_110_runner_pressure_gate_fail_closed; " + "only cancel/drain stale Gitea Actions through runner verifier packet" + ) + next_action = ( + "prepare_runner_drain_or_cancel_packet_without_process_kill" + if controlled_apply_allowed + else "keep_pressure_gate_fail_closed_until_ci_load_clears" + ) + elif load5_per_core > load5_per_core_threshold and swap_used_ratio >= 0.85: + classification = "blocked_memory_or_swap_pressure_requires_service_playbook" + severity = "critical" + next_action = "route_to_service_specific_memory_pressure_playbook" + elif load5_per_core > load5_per_core_threshold: + classification = "blocked_unknown_sustained_load_requires_source_specific_playbook" + severity = "critical" + dry_run_command = ( + "scripts/ops/host-sustained-load-evidence.py " + f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} " + "--docker-stats-file /home/wooo/node_exporter_textfiles/docker_stats.prom " + "--json" + ) + next_action = "collect_sanitized_top_process_and_container_stats_then_select_playbook" + + return { + "schema_version": SCHEMA_VERSION, + "host": host, + "mode": "read_only_control_packet", + "classification": classification, + "severity": severity, + "controlled_apply_allowed": controlled_apply_allowed, + "next_action": next_action, + "readback": { + "monitor_up": monitor_up, + "load5_per_core": round(load5_per_core, 6), + "load5_per_core_threshold": load5_per_core_threshold, + "swap_used_ratio": round(swap_used_ratio, 6), + "remediation_authorized": remediation_authorized, + "active_ci_container_count": 
active_ci_containers, + "active_ci_process_group_count": active_ci_groups, + "active_ci_process_cpu_percent": round(active_ci_cpu, 3), + "active_ci_oldest_age_seconds": active_ci_oldest_age, + "top_orphan_rule": top_orphan, + }, + "commands": { + "dry_run": dry_run_command, + "controlled_apply": controlled_apply_command, + "post_apply_verifier": verifier_command, + "rollback": "send SIGTERM only; no persistent host mutation. Re-run workload if needed.", + }, + "operation_boundaries": { + "secret_value_read": False, + "raw_session_read": False, + "raw_runner_registration_read": False, + "host_write_performed": False, + "process_signal_performed": False, + "docker_restart_allowed": False, + "systemd_restart_allowed": False, + "firewall_change_allowed": False, + "critical_break_glass_required": True, + }, + "forbidden_actions": [ + "SIGKILL", + "docker_restart", + "systemctl_restart", + "nginx_reload", + "firewall_change", + "kubectl_action", + "secret_read", + "legacy_or_generic_runner_restore", + ], + } + + +def main() -> int: + args = parse_args() + try: + text = args.metrics_file.read_text(encoding="utf-8") + samples = parse_prometheus_text(text) + except FileNotFoundError: + samples = [] + packet = build_packet( + host=args.host, + samples=samples, + load5_per_core_threshold=args.load5_per_core_threshold, + ci_stale_age_seconds=args.ci_stale_age_seconds, + ) + if args.json: + print(json.dumps(packet, ensure_ascii=False, indent=2, sort_keys=True)) + else: + print(f"status={packet['classification']}") + print(f"controlled_apply_allowed={str(packet['controlled_apply_allowed']).lower()}") + print(f"next_action={packet['next_action']}") + if packet["commands"]["dry_run"]: + print(f"dry_run_command={packet['commands']['dry_run']}") + if packet["commands"]["controlled_apply"]: + print(f"controlled_apply_command={packet['commands']['controlled_apply']}") + print(f"post_apply_verifier={packet['commands']['post_apply_verifier']}") + return 0 if not 
packet["classification"].startswith("blocked_") else 75 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/ops/host-sustained-load-evidence.py b/scripts/ops/host-sustained-load-evidence.py new file mode 100755 index 00000000..0cbf71c6 --- /dev/null +++ b/scripts/ops/host-sustained-load-evidence.py @@ -0,0 +1,287 @@ +#!/usr/bin/env python3 +"""Build sanitized evidence for unknown sustained host load. + +This collector is read-only. It intentionally emits process families and +container names instead of raw command lines so CPU-pressure alerts can proceed +to a source-specific PlayBook without leaking workspace paths, URLs, JSON +payloads, or secrets. +""" + +from __future__ import annotations + +import argparse +import json +import os +import re +import subprocess +from pathlib import Path +from typing import Any + + +DEFAULT_HOST_METRICS_FILE = Path("/home/wooo/node_exporter_textfiles/host_runaway_process.prom") +DEFAULT_DOCKER_STATS_FILE = Path("/home/wooo/node_exporter_textfiles/docker_stats.prom") +SCHEMA_VERSION = "host_sustained_load_sanitized_evidence_v1" +LABEL_RE = re.compile(r"(?P[A-Za-z_][A-Za-z0-9_]*)=\"(?P(?:[^\"\\\\]|\\\\.)*)\"") +METRIC_RE = re.compile( + r"^(?P[A-Za-z_:][A-Za-z0-9_:]*)(?:\{(?P[^}]*)\})?\s+" + r"(?P[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)$" +) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Collect sanitized sustained-load evidence.") + parser.add_argument("--host", default=os.environ.get("AIOPS_HOST_LABEL", "110")) + parser.add_argument("--metrics-file", type=Path, default=DEFAULT_HOST_METRICS_FILE) + parser.add_argument("--docker-stats-file", type=Path, default=DEFAULT_DOCKER_STATS_FILE) + parser.add_argument("--ps-file", type=Path) + parser.add_argument("--top-n", type=int, default=8) + parser.add_argument("--json", action="store_true") + return parser.parse_args() + + +def _unescape_label(value: str) -> str: + return value.replace(r"\"", 
'"').replace(r"\\", "\\").replace(r"\n", "\n") + + +def parse_prometheus_text(text: str) -> list[dict[str, Any]]: + samples: list[dict[str, Any]] = [] + for raw_line in text.splitlines(): + line = raw_line.strip() + if not line or line.startswith("#"): + continue + match = METRIC_RE.match(line) + if not match: + continue + labels = { + item.group("key"): _unescape_label(item.group("value")) + for item in LABEL_RE.finditer(match.group("labels") or "") + } + samples.append( + { + "name": match.group("name"), + "labels": labels, + "value": float(match.group("value")), + } + ) + return samples + + +def read_text(path: Path | None) -> str: + if path is None: + return "" + try: + return path.read_text(encoding="utf-8") + except FileNotFoundError: + return "" + + +def collect_ps_text(ps_file: Path | None) -> str: + if ps_file is not None: + return read_text(ps_file) + result = subprocess.run( + ["ps", "-eo", "pid=,ppid=,pgid=,etimes=,pcpu=,pmem=,comm=,args="], + check=True, + capture_output=True, + text=True, + timeout=10, + ) + return result.stdout + + +def parse_ps_text(text: str) -> list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + for raw_line in text.splitlines(): + line = raw_line.strip() + if not line: + continue + parts = line.split(None, 7) + if len(parts) < 7: + continue + pid, ppid, pgid, etimes, pcpu, pmem, comm = parts[:7] + args = parts[7] if len(parts) > 7 else comm + try: + rows.append( + { + "pid": int(pid), + "ppid": int(ppid), + "pgid": int(pgid), + "etimes": int(float(etimes)), + "cpu_percent": float(pcpu), + "mem_percent": float(pmem), + "comm": Path(comm).name[:48], + "family": classify_process_family(comm, args), + } + ) + except ValueError: + continue + return rows + + +def classify_process_family(comm: str, args: str) -> str: + text = f"{comm} {args}".lower() + if "act_runner" in text or "gitea-actions-task" in text or "/.cache/act/" in text: + return "gitea_actions_runner" + if "docker build" in text or "buildx" in text or "buildkit" in 
text: + return "docker_build" + if "next build" in text or "turbo build" in text or "pnpm" in text and " build" in text: + return "web_build" + if "chrome" in text or "chromium" in text or "playwright" in text: + return "headless_browser" + if "gitea" in text: + return "gitea_service" + if "postgres" in text or "postmaster" in text: + return "postgres" + if "clickhouse" in text: + return "clickhouse" + if "kafka" in text: + return "kafka" + if "sentry" in text: + return "sentry" + if "systemctl" in text or "systemd" in text or "dbus" in text: + return "systemd_control_plane" + if "sshd" in text: + return "ssh_control_plane" + if "python" in text: + return "python_job" + if "node" in text: + return "node_service" + return "unknown" + + +def summarize_processes(rows: list[dict[str, Any]], *, top_n: int) -> dict[str, Any]: + top_rows = sorted(rows, key=lambda item: (-item["cpu_percent"], item["comm"], item["pid"]))[:top_n] + families: dict[str, dict[str, Any]] = {} + for row in rows: + family = row["family"] + current = families.setdefault( + family, + { + "family": family, + "process_count": 0, + "cpu_percent": 0.0, + "max_age_seconds": 0, + "sample_comm": "", + }, + ) + current["process_count"] += 1 + current["cpu_percent"] += row["cpu_percent"] + current["max_age_seconds"] = max(current["max_age_seconds"], row["etimes"]) + if not current["sample_comm"] or row["cpu_percent"] > current.get("_sample_cpu", -1): + current["sample_comm"] = row["comm"] + current["_sample_cpu"] = row["cpu_percent"] + + family_rows = [] + for item in families.values(): + item.pop("_sample_cpu", None) + item["cpu_percent"] = round(float(item["cpu_percent"]), 3) + family_rows.append(item) + + return { + "top_processes": [ + { + "pid": row["pid"], + "ppid": row["ppid"], + "pgid": row["pgid"], + "cpu_percent": round(row["cpu_percent"], 3), + "mem_percent": round(row["mem_percent"], 3), + "age_seconds": row["etimes"], + "comm": row["comm"], + "family": row["family"], + } + for row in top_rows + 
], + "families": sorted(family_rows, key=lambda item: (-item["cpu_percent"], item["family"]))[:top_n], + } + + +def top_docker_containers(samples: list[dict[str, Any]], *, host: str, top_n: int) -> list[dict[str, Any]]: + rows = [] + for sample in samples: + if sample["name"] != "docker_container_cpu_cores": + continue + labels = sample["labels"] + if labels.get("host", host) != host: + continue + rows.append( + { + "container_name": labels.get("container_name") or labels.get("name") or "unknown", + "cpu_cores": round(float(sample["value"]), 6), + } + ) + return sorted(rows, key=lambda item: (-item["cpu_cores"], item["container_name"]))[:top_n] + + +def recommend_playbook(process_families: list[dict[str, Any]], containers: list[dict[str, Any]]) -> str: + top_container = containers[0] if containers else {} + top_container_name = str(top_container.get("container_name") or "").lower() + top_container_cpu = float(top_container.get("cpu_cores") or 0.0) + top_family = process_families[0] if process_families else {} + family = str(top_family.get("family") or "") + + if "gitea" in top_container_name and top_container_cpu >= 2.0: + return "gitea_queue_or_hook_backlog_playbook" + if "postgres" in top_container_name or "postgres" in family: + return "postgres_hot_query_or_backup_export_playbook" + if family in {"docker_build", "web_build", "gitea_actions_runner"}: + return "build_or_runner_pressure_playbook" + if family in {"systemd_control_plane", "ssh_control_plane"}: + return "control_plane_saturation_playbook" + if family == "headless_browser": + return "orphan_browser_classification_refresh_playbook" + return "source_specific_playbook_required" + + +def build_payload(args: argparse.Namespace) -> dict[str, Any]: + host_samples = parse_prometheus_text(read_text(args.metrics_file)) + docker_samples = parse_prometheus_text(read_text(args.docker_stats_file)) + process_summary = summarize_processes(parse_ps_text(collect_ps_text(args.ps_file)), top_n=args.top_n) + containers = 
top_docker_containers(docker_samples, host=args.host, top_n=args.top_n) + recommendation = recommend_playbook(process_summary["families"], containers) + + return { + "schema_version": SCHEMA_VERSION, + "host": args.host, + "mode": "read_only_sanitized_evidence", + "recommendation": recommendation, + "controlled_apply_allowed": False, + "next_action": "select_or_generate_source_specific_playbook_then_run_check_mode", + "readback": { + "host_metric_sample_count": len(host_samples), + "docker_metric_sample_count": len(docker_samples), + "top_container_count": len(containers), + "top_process_family_count": len(process_summary["families"]), + }, + "top_containers": containers, + "top_process_families": process_summary["families"], + "top_processes_sanitized": process_summary["top_processes"], + "redaction": { + "raw_command_lines_emitted": False, + "workspace_paths_emitted": False, + "urls_emitted": False, + "secret_values_read": False, + }, + "operation_boundaries": { + "host_write_performed": False, + "process_signal_performed": False, + "docker_restart_performed": False, + "systemd_restart_performed": False, + "raw_session_read": False, + "raw_runner_registration_read": False, + }, + } + + +def main() -> int: + args = parse_args() + payload = build_payload(args) + if args.json: + print(json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True)) + else: + print(f"recommendation={payload['recommendation']}") + print(f"controlled_apply_allowed={str(payload['controlled_apply_allowed']).lower()}") + print(f"next_action={payload['next_action']}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/ops/tests/test_host_runaway_process_exporter.py b/scripts/ops/tests/test_host_runaway_process_exporter.py index 0e81f1ba..a97da0ea 100644 --- a/scripts/ops/tests/test_host_runaway_process_exporter.py +++ b/scripts/ops/tests/test_host_runaway_process_exporter.py @@ -1,6 +1,7 @@ from __future__ import annotations import importlib.util 
+import json import subprocess import sys from pathlib import Path @@ -9,6 +10,8 @@ from pathlib import Path SCRIPT_ROOT = Path(__file__).resolve().parents[1] EXPORTER_PATH = SCRIPT_ROOT / "host-runaway-process-exporter.py" REMEDIATION_PATH = SCRIPT_ROOT / "host-runaway-process-remediation.py" +CONTROLLER_PATH = SCRIPT_ROOT / "host-sustained-load-controller.py" +EVIDENCE_PATH = SCRIPT_ROOT / "host-sustained-load-evidence.py" def load_exporter(): @@ -167,7 +170,7 @@ def test_ignores_the_host_pressure_gate_process_group() -> None: def test_remediation_defaults_to_dry_run(tmp_path: Path) -> None: ps_file = tmp_path / "ps.txt" ps_file.write_text( - "100 1 100 100 7200 65.0 S chrome /opt/chrome/chrome --headless --user-data-dir=/tmp/stockplatform-review-bulk-ux-aa\n", + "999999 1 999999 999999 7200 65.0 S chrome /opt/chrome/chrome --headless --user-data-dir=/tmp/stockplatform-review-bulk-ux-aa\n", encoding="utf-8", ) @@ -193,7 +196,7 @@ def test_remediation_defaults_to_dry_run(tmp_path: Path) -> None: def test_remediation_refuses_apply_without_gates(tmp_path: Path) -> None: ps_file = tmp_path / "ps.txt" ps_file.write_text( - "100 1 100 100 7200 65.0 S chrome /opt/chrome/chrome --headless --user-data-dir=/tmp/stockplatform-review-bulk-ux-aa\n", + "999999 1 999999 999999 7200 65.0 S chrome /opt/chrome/chrome --headless --user-data-dir=/tmp/stockplatform-review-bulk-ux-aa\n", encoding="utf-8", ) @@ -213,3 +216,329 @@ def test_remediation_refuses_apply_without_gates(tmp_path: Path) -> None: assert result.returncode != 0 assert "Refusing apply" in result.stderr + assert "--controlled-apply-id" in result.stderr + assert "--confirm-apply" in result.stderr + assert "--post-apply-verifier" in result.stderr + + +def test_remediation_accepts_controlled_apply_gate_without_owner_gate(tmp_path: Path) -> None: + ps_file = tmp_path / "ps.txt" + ps_file.write_text( + "100 1 1 100 7200 65.0 S chrome /opt/chrome/chrome --headless --user-data-dir=/tmp/stockplatform-review-bulk-ux-aa\n", + 
encoding="utf-8", + ) + + result = subprocess.run( + [ + sys.executable, + str(REMEDIATION_PATH), + "--ps-file", + str(ps_file), + "--apply", + "--confirm-apply", + "--rule", + "stockplatform_headless_smoke", + "--controlled-apply-id", + "CAP-20260701-HOSTLOAD", + "--evidence-ref", + "HostLoadAverageSustainedHigh:110", + "--post-apply-verifier", + "scripts/ops/host-sustained-load-controller.py --host 110 --json", + ], + check=True, + capture_output=True, + text=True, + ) + + assert '"mode": "apply_sigterm"' in result.stdout + assert '"runtime_gate": 1' in result.stdout + assert '"controlled_apply_id": "CAP-20260701-HOSTLOAD"' in result.stdout + assert '"owner_approval_id": ""' in result.stdout + assert '"blocked_reason": "unsafe_pgid"' in result.stdout + assert '"missing_process_group_count": 0' in result.stdout + assert '"signal_error_count": 0' in result.stdout + assert '"signaled_process_group_count": 0' in result.stdout + + +def test_sustained_load_controller_routes_orphan_browser_to_controlled_remediation(tmp_path: Path) -> None: + metrics_file = tmp_path / "host.prom" + metrics_file.write_text( + "\n".join( + [ + 'awoooi_host_runaway_process_monitor_up{host="110",mode="read_only"} 1', + 'awoooi_host_load5_per_core{host="110"} 2.2', + 'awoooi_host_swap_used_ratio{host="110"} 0.1', + 'awoooi_host_runaway_process_remediation_authorized{host="110"} 0', + 'awoooi_host_gitea_actions_active_container_count{host="110"} 0', + 'awoooi_host_gitea_actions_active_process_group_count{host="110"} 0', + 'awoooi_host_runaway_browser_orphan_group_count{host="110",rule="stockplatform_headless_smoke",min_age_seconds="1800",min_cpu_percent="50"} 1', + 'awoooi_host_runaway_browser_orphan_cpu_percent{host="110",rule="stockplatform_headless_smoke",min_age_seconds="1800",min_cpu_percent="50"} 155.5', + ] + ), + encoding="utf-8", + ) + + result = subprocess.run( + [ + sys.executable, + str(CONTROLLER_PATH), + "--host", + "110", + "--metrics-file", + str(metrics_file), + "--json", + ], 
+ check=True, + capture_output=True, + text=True, + ) + + payload = json.loads(result.stdout) + assert payload["classification"] == "controlled_orphan_browser_remediation_ready" + assert payload["controlled_apply_allowed"] is True + assert "host-runaway-process-remediation.py --rule stockplatform_headless_smoke" in payload["commands"]["dry_run"] + assert "--controlled-apply-id" in payload["commands"]["controlled_apply"] + assert payload["operation_boundaries"]["process_signal_performed"] is False + + +def test_sustained_load_controller_keeps_ci_saturation_on_runner_path(tmp_path: Path) -> None: + metrics_file = tmp_path / "host.prom" + metrics_file.write_text( + "\n".join( + [ + 'awoooi_host_runaway_process_monitor_up{host="110",mode="read_only"} 1', + 'awoooi_host_load5_per_core{host="110"} 2.0', + 'awoooi_host_swap_used_ratio{host="110"} 0.1', + 'awoooi_host_runaway_process_remediation_authorized{host="110"} 0', + 'awoooi_host_gitea_actions_active_container_count{host="110"} 2', + 'awoooi_host_gitea_actions_active_process_group_count{host="110"} 1', + 'awoooi_host_gitea_actions_active_process_cpu_percent{host="110"} 180.0', + 'awoooi_host_gitea_actions_active_process_oldest_age_seconds{host="110"} 1900', + ] + ), + encoding="utf-8", + ) + + result = subprocess.run( + [ + sys.executable, + str(CONTROLLER_PATH), + "--host", + "110", + "--metrics-file", + str(metrics_file), + "--json", + ], + check=True, + capture_output=True, + text=True, + ) + + payload = json.loads(result.stdout) + assert payload["classification"] == "controlled_ci_runner_saturation_guarded" + assert payload["controlled_apply_allowed"] is True + assert "fail_closed" in payload["commands"]["controlled_apply"] + assert "process_kill" not in payload["commands"]["controlled_apply"] + + +def test_sustained_load_controller_blocks_monitor_authority_violation(tmp_path: Path) -> None: + metrics_file = tmp_path / "host.prom" + metrics_file.write_text( + "\n".join( + [ + 
'awoooi_host_runaway_process_monitor_up{host="110",mode="read_only"} 1', + 'awoooi_host_load5_per_core{host="110"} 2.0', + 'awoooi_host_runaway_process_remediation_authorized{host="110"} 1', + ] + ), + encoding="utf-8", + ) + + result = subprocess.run( + [ + sys.executable, + str(CONTROLLER_PATH), + "--host", + "110", + "--metrics-file", + str(metrics_file), + "--json", + ], + capture_output=True, + text=True, + ) + + assert result.returncode == 75 + payload = json.loads(result.stdout) + assert payload["classification"] == "blocked_monitor_authority_violation" + assert payload["controlled_apply_allowed"] is False + + +def test_sustained_load_controller_routes_unknown_load_to_sanitized_evidence(tmp_path: Path) -> None: + metrics_file = tmp_path / "host.prom" + metrics_file.write_text( + "\n".join( + [ + 'awoooi_host_runaway_process_monitor_up{host="110",mode="read_only"} 1', + 'awoooi_host_load5_per_core{host="110"} 2.0', + 'awoooi_host_swap_used_ratio{host="110"} 0.1', + 'awoooi_host_runaway_process_remediation_authorized{host="110"} 0', + 'awoooi_host_gitea_actions_active_container_count{host="110"} 0', + 'awoooi_host_gitea_actions_active_process_group_count{host="110"} 0', + 'awoooi_host_runaway_browser_orphan_group_count{host="110",rule="stockplatform_headless_smoke",min_age_seconds="1800",min_cpu_percent="50"} 0', + ] + ), + encoding="utf-8", + ) + + result = subprocess.run( + [ + sys.executable, + str(CONTROLLER_PATH), + "--host", + "110", + "--metrics-file", + str(metrics_file), + "--json", + ], + capture_output=True, + text=True, + ) + + assert result.returncode == 75 + payload = json.loads(result.stdout) + assert payload["classification"] == "blocked_unknown_sustained_load_requires_source_specific_playbook" + assert payload["controlled_apply_allowed"] is False + assert "host-sustained-load-evidence.py" in payload["commands"]["dry_run"] + assert payload["operation_boundaries"]["process_signal_performed"] is False + + +def 
def test_sustained_load_evidence_emits_sanitized_gitea_recommendation(tmp_path: Path) -> None:
    """A hot gitea container yields the gitea playbook and no raw paths."""
    ps_file = tmp_path / "ps.txt"
    ps_file.write_text(
        "\n".join(
            [
                "100 1 100 7200 280.0 1.0 gitea /usr/local/bin/gitea web --config /home/wooo/gitea/app.ini",
                "200 1 200 180 15.0 0.5 systemd systemctl show gitea-act-runner-host.service",
            ]
        ),
        encoding="utf-8",
    )
    docker_file = tmp_path / "docker.prom"
    docker_file.write_text(
        'docker_container_cpu_cores{host="110",container_name="gitea"} 3.4\n',
        encoding="utf-8",
    )

    result = subprocess.run(
        [
            sys.executable,
            str(EVIDENCE_PATH),
            "--host",
            "110",
            # Point at a path inside tmp_path so the test never reads the
            # host-global default metrics textfile.
            "--metrics-file",
            str(tmp_path / "host.prom"),
            "--ps-file",
            str(ps_file),
            "--docker-stats-file",
            str(docker_file),
            "--json",
        ],
        check=True,
        capture_output=True,
        text=True,
    )

    payload = json.loads(result.stdout)
    assert payload["schema_version"] == "host_sustained_load_sanitized_evidence_v1"
    assert payload["recommendation"] == "gitea_queue_or_hook_backlog_playbook"
    assert payload["redaction"]["raw_command_lines_emitted"] is False
    assert payload["operation_boundaries"]["host_write_performed"] is False
    assert "/home/wooo" not in result.stdout


# Renamed: an earlier test in this file already uses the name
# test_sustained_load_controller_routes_unknown_load_to_sanitized_evidence, and
# a second definition with that name would replace it at import time so pytest
# would only ever run one of the two.
def test_sustained_load_controller_routes_unknown_load_without_ci_or_orphan_metrics(tmp_path: Path) -> None:
    """High load with no CI or orphan metrics at all is routed to evidence."""
    metrics_file = tmp_path / "host.prom"
    metrics_file.write_text(
        "\n".join(
            [
                'awoooi_host_runaway_process_monitor_up{host="110",mode="read_only"} 1',
                'awoooi_host_load5_per_core{host="110"} 2.4',
                'awoooi_host_swap_used_ratio{host="110"} 0.1',
                'awoooi_host_runaway_process_remediation_authorized{host="110"} 0',
            ]
        ),
        encoding="utf-8",
    )

    # A non-zero exit is the expected outcome here, so no check=True.
    result = subprocess.run(
        [
            sys.executable,
            str(CONTROLLER_PATH),
            "--host",
            "110",
            "--metrics-file",
            str(metrics_file),
            "--json",
        ],
        check=False,
        capture_output=True,
        text=True,
    )

    assert result.returncode == 75
    payload = json.loads(result.stdout)
    assert (
        payload["classification"]
        == "blocked_unknown_sustained_load_requires_source_specific_playbook"
    )
    assert payload["controlled_apply_allowed"] is False
    assert "host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
    assert payload["operation_boundaries"]["host_write_performed"] is False


def test_sustained_load_evidence_sanitizes_process_details(tmp_path: Path) -> None:
    """Command-line paths, URLs and secret-looking arguments never reach stdout."""
    ps_file = tmp_path / "ps.txt"
    ps_file.write_text(
        "\n".join(
            [
                "101 1 101 7200 65.0 2.5 chrome /opt/chrome/chrome --headless --user-data-dir=/tmp/stockplatform-review-bulk-ux-aa --url=https://example.invalid/token",
                "102 1 102 3600 20.0 1.0 node node /srv/private/app/server.js --api-key=SECRET",
            ]
        ),
        encoding="utf-8",
    )
    docker_stats_file = tmp_path / "docker.prom"
    docker_stats_file.write_text(
        'docker_container_cpu_cores{host="110",container_name="gitea"} 3.2\n',
        encoding="utf-8",
    )

    result = subprocess.run(
        [
            sys.executable,
            str(EVIDENCE_PATH),
            "--host",
            "110",
            # Point at a path inside tmp_path so the test never reads the
            # host-global default metrics textfile.
            "--metrics-file",
            str(tmp_path / "host.prom"),
            "--ps-file",
            str(ps_file),
            "--docker-stats-file",
            str(docker_stats_file),
            "--json",
        ],
        check=True,
        capture_output=True,
        text=True,
    )

    payload = json.loads(result.stdout)
    assert payload["schema_version"] == "host_sustained_load_sanitized_evidence_v1"
    assert payload["recommendation"] == "gitea_queue_or_hook_backlog_playbook"
    assert payload["redaction"]["raw_command_lines_emitted"] is False
    assert payload["redaction"]["workspace_paths_emitted"] is False
    assert payload["redaction"]["urls_emitted"] is False
    assert payload["operation_boundaries"]["host_write_performed"] is False
    assert "https://example.invalid/token" not in result.stdout
    assert "/tmp/stockplatform-review-bulk-ux-aa" not in result.stdout
    assert "SECRET" not in result.stdout
    assert {item["family"] for item in payload["top_process_families"]} >= {
        "headless_browser",
        "node_service",
    }