From f358a0f6c3e614e407dedb6eee89bf10b2bc8173 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 18 Jun 2026 14:39:31 +0800 Subject: [PATCH] fix(api): route runaway host alerts to ai event packets --- apps/api/src/services/telegram_gateway.py | 120 +++++++++++++++--- .../tests/test_telegram_message_templates.py | 43 +++++++ docs/LOGBOOK.md | 13 ++ .../HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md | 10 ++ ...-04-15-MASTER-ai-autonomous-flywheel-v2.md | 13 ++ ...oot-cold-start-backup-recovery-workplan.md | 12 +- 6 files changed, 188 insertions(+), 23 deletions(-) diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index 390d2e19..00d3065c 100644 --- a/apps/api/src/services/telegram_gateway.py +++ b/apps/api/src/services/telegram_gateway.py @@ -97,6 +97,9 @@ _HOST_RESOURCE_ALERT_HEADER_RE = re.compile( _HOST_RESOURCE_TARGET_RE = re.compile( r"\b(?:WARN|CRIT|INFO)\s+(?P<target>[A-Za-z0-9_.-]+)\b" ) +_HOST_RESOURCE_ALERTNAME_RE = re.compile(r"\balertname\s*=\s*\"?(?P<alertname>[A-Za-z0-9_.:-]+)\"?") +_HOST_RESOURCE_HOST_LABEL_RE = re.compile(r"\bhost\s*=\s*\"?(?P<host>[A-Za-z0-9_.:-]+)\"?") +_HOST_RESOURCE_RULE_LABEL_RE = re.compile(r"\brule\s*=\s*\"?(?P<rule>[A-Za-z0-9_.:-]+)\"?") _HOST_PROCESS_LINE_RE = re.compile( r"^\s*(?P\S+)\s+" r"(?P\d+)\s+" @@ -112,6 +115,10 @@ def _is_host_resource_alert_text(text: str) -> bool: "CPU 警告" in text or "容器內 root Node.js 進程" in text or ("ps aux" in text and ("next build" in text or "npm run build" in text)) + or "HostOrphanBrowserSmokeHighCpu" in text + or "HostCiRunnerLoadSaturation" in text + or "awoooi_host_runaway_browser_orphan" in text + or "awoooi_host_gitea_actions_active_container_count" in text ) @@ -165,10 +172,15 @@ def _parse_host_process_lines(text: str) -> list[dict[str, str | float]]: def _host_resource_alert_impact( + text: str, cpu_text: str, load_text: str, processes: list[dict[str, str | float]], ) -> str: + if "HostOrphanBrowserSmokeHighCpu" in text or "awoooi_host_runaway_browser_orphan" in text: + 
return "orphan Chrome / Playwright smoke 疑似吃滿 CPU;先驗 pgid、age、cmdline 與 active CI 分流" + if "HostCiRunnerLoadSaturation" in text or "awoooi_host_gitea_actions_active_container_count" in text: + return "CI runner 正在造成主機負載;先確認 Actions run、queue、timeout 與服務 SLO" try: load = float(load_text) except (TypeError, ValueError): @@ -186,7 +198,20 @@ def _host_resource_alert_impact( return "資源升高但尚未確認根因;先聚合觀察並補足 owner 判讀" -def _host_resource_automation_lane(processes: list[dict[str, str | float]]) -> tuple[str, str]: +def _host_resource_automation_lane( + text: str, + processes: list[dict[str, str | float]], +) -> tuple[str, str]: + if "HostOrphanBrowserSmokeHighCpu" in text or "awoooi_host_runaway_browser_orphan" in text: + return ( + "orphan_browser_smoke_runaway_process", + "建立 runaway process triage packet;先跑 remediation dry-run,待 owner / window / evidence 後才可 SIGTERM", + ) + if "HostCiRunnerLoadSaturation" in text or "awoooi_host_gitea_actions_active_container_count" in text: + return ( + "ci_runner_load_saturation", + "建立 CI load evidence packet,彙整 Gitea Actions run、runner queue、load/core 與 swap;不 kill process", + ) commands = " ".join(str(item.get("command", "")).lower() for item in processes) if "build" in commands: return ( @@ -204,6 +229,69 @@ def _host_resource_automation_lane(processes: list[dict[str, str | float]]) -> t ) +def _host_resource_alert_label(name: str, text: str) -> str: + patterns = { + "alertname": _HOST_RESOURCE_ALERTNAME_RE, + "host": _HOST_RESOURCE_HOST_LABEL_RE, + "rule": _HOST_RESOURCE_RULE_LABEL_RE, + } + match = patterns[name].search(text) + return match.group(name) if match else "" + + +def _host_resource_alert_evidence_lines( + text: str, + processes: list[dict[str, str | float]], +) -> list[str]: + if processes: + lines: list[str] = [] + for item in processes[:3]: + process_cpu = f"{float(item['cpu']):g}%" + lines.append( + "├ " + f"PID {html.escape(str(item['pid']))} " + f"CPU {html.escape(process_cpu)}:" + 
f"{html.escape(str(item['command']))}" + ) + lines[-1] = "└" + lines[-1][1:] + return lines + + alertname = _host_resource_alert_label("alertname", text) + rule = _host_resource_alert_label("rule", text) + if alertname: + lines = [ + f"├ Alert:{html.escape(alertname)}", + "├ Metric:awoooi_host_runaway_process_*", + ] + if rule: + lines.append(f"└ Rule:{html.escape(rule)}") + else: + lines[-1] = "└" + lines[-1][1:] + return lines + + return ["└ 尚未收到可解析的 top process,請補只讀 evidence。"] + + +def _host_resource_recommendation_lines(text: str) -> list[str]: + if "HostOrphanBrowserSmokeHighCpu" in text or "awoooi_host_runaway_browser_orphan" in text: + return [ + "├ 先讀 Prometheus orphan group / CPU / age / cmdline 指標與 textfile timestamp", + "├ 執行 `host-runaway-process-remediation.py` dry-run 產生候選,不直接 apply", + "└ 若 owner approval、maintenance window、evidence ref 齊全,才可 gated SIGTERM 並回寫 KM / PlayBook / Verifier", + ] + if "HostCiRunnerLoadSaturation" in text or "awoooi_host_gitea_actions_active_container_count" in text: + return [ + "├ 確認 Gitea Actions run、runner queue、build timeout、load/core 與 swap trend", + "├ 若是合法 CI,標記為 capacity / queue 事件,不做 process remediation", + "└ 若 CI 卡死,產出 owner packet 與 runner cleanup dry-run,再進維護窗口", + ] + return [ + "├ 確認是否為 CI/CD / Actions / runner 正常建置窗口", + "├ 若持續超過門檻,先查 runner queue、build job、容器資源限制與服務 SLO", + "└ 同一 host/service 5 分鐘聚合一次,避免洗版", + ] + + def format_host_resource_alert_card(text: str) -> str: """把 host CPU/load raw dump 轉成值班者可讀的 Telegram HTML 卡。""" if not _is_host_resource_alert_text(text): @@ -214,29 +302,21 @@ def format_host_resource_alert_card(text: str) -> str: target = ( header.group("target") if header - else (target_match.group("target") if target_match else "unknown-host") + else ( + target_match.group("target") + if target_match + else (_host_resource_alert_label("host", text) or "unknown-host") + ) ) cpu = header.group("cpu") if header else "-" load = header.group("load") if header else "-" processes = 
_parse_host_process_lines(text) - impact = _host_resource_alert_impact(cpu, load, processes) - automation_lane, automation_next_step = _host_resource_automation_lane(processes) + impact = _host_resource_alert_impact(text, cpu, load, processes) + automation_lane, automation_next_step = _host_resource_automation_lane(text, processes) load_bar = _resource_load_bar(load) severity = "🔴" if load != "-" and load_bar.count("■") >= 7 else "⚠️" - - evidence_lines: list[str] = [] - for item in processes[:3]: - process_cpu = f"{float(item['cpu']):g}%" - evidence_lines.append( - "├ " - f"PID {html.escape(str(item['pid']))} " - f"CPU {html.escape(process_cpu)}:" - f"{html.escape(str(item['command']))}" - ) - if evidence_lines: - evidence_lines[-1] = "└" + evidence_lines[-1][1:] - else: - evidence_lines.append("└ 尚未收到可解析的 top process,請補只讀 evidence。") + evidence_lines = _host_resource_alert_evidence_lines(text, processes) + recommendation_lines = _host_resource_recommendation_lines(text) return "\n".join( [ @@ -257,9 +337,7 @@ def format_host_resource_alert_card(text: str) -> str: *evidence_lines, "", "建議下一步", - "├ 確認是否為 CI/CD / Actions / runner 正常建置窗口", - "├ 若持續超過門檻,先查 runner queue、build job、容器資源限制與服務 SLO", - "└ 同一 host/service 5 分鐘聚合一次,避免洗版", + *recommendation_lines, "", "禁止事項", "└ 不 kill process、不 restart Docker / Gitea、不 reload Nginx、不改 firewall;除非已有維護窗口與 owner 批准。", diff --git a/apps/api/tests/test_telegram_message_templates.py b/apps/api/tests/test_telegram_message_templates.py index ba4a46be..2db5b500 100644 --- a/apps/api/tests/test_telegram_message_templates.py +++ b/apps/api/tests/test_telegram_message_templates.py @@ -91,6 +91,49 @@ root 364 181 0.7 3491396 494608 ? 
Rl 05:56 0:18 /opt/hostedto assert "processChild.js" not in result +def test_orphan_browser_alert_becomes_runaway_process_event_packet() -> None: + """HostOrphanBrowserSmokeHighCpu 必須變成 runaway process 專屬事件包。""" + raw_alert = ( + 'alertname="HostOrphanBrowserSmokeHighCpu" host="110" ' + 'rule="stockplatform_headless_smoke" ' + "description=\"orphan Chrome smoke group detected\"" + ) + + result = format_host_resource_alert_card(raw_alert) + + assert "主機資源告警|110" in result + assert "ai_automation_alert_card_v1" in result + assert "orphan_browser_smoke_runaway_process" in result + assert "HostOrphanBrowserSmokeHighCpu" in result + assert "stockplatform_headless_smoke" in result + assert "host-runaway-process-remediation.py" in result + assert "dry-run" in result + assert "gated SIGTERM" in result + assert "KM / PlayBook / Verifier" in result + assert "runtime_write_gate=0" in result + assert "不 kill process" in result + assert "Docker" in result + + +def test_ci_runner_load_alert_becomes_capacity_event_packet() -> None: + """HostCiRunnerLoadSaturation 不可被誤導成可 kill 的 runaway process。""" + raw_alert = ( + 'alertname="HostCiRunnerLoadSaturation" host="110" ' + "awoooi_host_gitea_actions_active_container_count 2" + ) + + result = format_host_resource_alert_card(raw_alert) + + assert "主機資源告警|110" in result + assert "ci_runner_load_saturation" in result + assert "CI load evidence packet" in result + assert "Gitea Actions run" in result + assert "合法 CI" in result + assert "不做 process remediation" in result + assert "runtime_write_gate=0" in result + assert "不 kill process" in result + + @pytest.mark.asyncio async def test_send_alert_notification_normalizes_host_resource_raw_dump(monkeypatch) -> None: """send_alert_notification 是最後出口,必須自動套用 AI 自動化事件包。""" diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 63c78c03..5e1246b2 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -59,6 +59,19 @@ - Runtime auto-remediation:仍 `0%`,這是安全設計;若未來要由 AI 進入修復,必須先產生 triage packet、dry-run 
evidence、owner approval、maintenance window、evidence ref、post-check 與 KM 回寫,不得由 exporter 自行 kill。 - 目前 110 高 CPU 判讀:orphan headless browser 已歸零;剩餘負載應歸因於 active CI 或其他一般 workload,不能再被誤判為前一輪 stockPlatform orphan Chrome 事故。 +### 2026-06-18 14:38 台北|Host runaway alert -> AI event packet 補強 + +**修補**:`TelegramGateway.send_alert_notification()` 的最後出口已能把新 Prometheus alert text 轉成專屬 AI automation card,而不是只靠泛用 CPU raw dump parser。`HostOrphanBrowserSmokeHighCpu` 會進 `orphan_browser_smoke_runaway_process` lane,顯示 alertname / host / rule、runaway dry-run、owner / maintenance / evidence gate、KM / PlayBook / Verifier 回寫;`HostCiRunnerLoadSaturation` 會進 `ci_runner_load_saturation` lane,要求彙整 Gitea Actions run、runner queue、load/core 與 swap trend,並明確標示合法 CI 不做 process remediation。 + +**驗證**: +- `DATABASE_URL=postgresql+asyncpg://ci:ci@localhost/ci PYTHONPATH=apps/api /Users/ogt/.pyenv/shims/python3.11 -m pytest apps/api/tests/test_telegram_message_templates.py -q -p no:cacheprovider`:`59 passed`。 +- `PYTHONPATH=apps/api /Users/ogt/.pyenv/shims/python3.11 -m py_compile apps/api/src/services/telegram_gateway.py`:通過。 + +**完成度同步**: +- Host runaway alert -> AI event packet:`0% -> 100%`。 +- Monitoring / alert / PlayBook / Telegram event packet / live scrape:`100%`。 +- Runtime remediation / Telegram 實發 / Bot API call / host write:仍 `0 / false`;本段未發 Telegram、未讀 secret、未 kill process、未重啟服務、未改 firewall/K8s。 + ## 2026-06-18|P2-406B Receipt Readback Owner Review 本地完成 **背景**:P2-004 已把依賴 / 供應鏈漂移收斂成只讀監控讀回;統帥要求每次推進都不能忘記目標與方向,因此本段把日報 / 週報 / 月報、Telegram receipt owner review、P2-004 drift monitor 與 P2-403J 報表真相串成同一個 owner review surface,讓治理頁可以直接看到 AI Agent 分工、互審與仍被關閉的 runtime 邊界。 diff --git a/docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md b/docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md index d46be540..937b5fbc 100644 --- a/docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md +++ b/docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md @@ -55,6 +55,16 @@ read-only exporter -> Prometheus alert -> 
AI triage packet -> KM / PlayBook evid | `HostRunawayProcessMonitorMissing` / `Stale` | exporter 缺失或超過 10 分鐘未更新 | 修 exporter / cron / textfile collector | | `HostRunawayProcessRemediationUnexpectedlyAuthorized` | `remediation_authorized > 0` | 立即回滾;禁止把監控器改成執行器 | +Telegram / AI event packet contract: + +| Alert / input | Telegram lane | 必須顯示 | +|---------------|---------------|----------| +| `HostOrphanBrowserSmokeHighCpu` | `orphan_browser_smoke_runaway_process` | alertname、host、rule、runaway dry-run、owner / maintenance / evidence gate、KM / PlayBook / Verifier 回寫 | +| `HostCiRunnerLoadSaturation` | `ci_runner_load_saturation` | Gitea Actions run、runner queue、load/core、swap trend、capacity / queue 判定、不做 process remediation | +| raw `CPU 警告` / `ps aux` dump | `runner_build_resource_pressure` 或 `host_resource_pressure_triage` | sanitized top process evidence,不顯示 raw workspace path 或完整 process dump | + +所有 Telegram 卡片都必須保留 `runtime_write_gate=0`,並不得把 alert/card 轉成直接 kill / restart / reload 指令。 + --- ## 3. 
AI Triager 必做判讀 diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index 0f15290e..61d47eea 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -5088,3 +5088,16 @@ Trigger commit `f5cd37b7` 與 deploy marker `0ba92357` 已把 governance UI 的 - 新增 pytest,鎖住 orphan 分類、Linux / BSD `ps` 解析、合法 / 年輕 process 忽略、CI/swap 指標、dry-run 與 apply gate 拒絕行為;readiness audit 以 pyenv Python 重跑後 `BLOCKED=0`。 **裁決:** 這是 host CPU runaway 的 observe -> classify -> alert -> PlayBook -> KM contract -> gated remediation 閉環,不是 runtime 自動 kill 授權。AI 可以自動診斷、告警、產生 dry-run 修復包與 KM/PlayBook 回寫要求;真正 process termination 仍需 owner approval、maintenance window、evidence ref 與 post-check。Docker restart、systemd restart、Nginx reload、firewall change、secret read、host write 與 production write 仍全部禁止。 + +### 2026-06-18 14:38 (台北) — §8 / Host CPU AIOps — Host runaway alert 轉 AI event packet + +**觸發**:前段已把 110 runaway process 監控、告警、PlayBook 與 live scrape 補齊,但 alert 進 Telegram 最後出口時仍需確認 `HostOrphanBrowserSmokeHighCpu` 與 `HostCiRunnerLoadSaturation` 不會被壓成泛用 CPU 文本。 + +**已推進:** +- `TelegramGateway.send_alert_notification()` 仍是最後出口;`format_host_resource_alert_card()` 現在可解析 `alertname`、`host`、`rule` label。 +- `HostOrphanBrowserSmokeHighCpu` 會轉成 `orphan_browser_smoke_runaway_process` lane,顯示 runaway dry-run、owner / maintenance / evidence gate、KM / PlayBook / Verifier 回寫。 +- `HostCiRunnerLoadSaturation` 會轉成 `ci_runner_load_saturation` lane,要求彙整 Gitea Actions run、runner queue、load/core 與 swap,並標示合法 CI 不做 process remediation。 +- `docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md` 已補 Telegram / AI event packet contract。 +- 精準測試 `apps/api/tests/test_telegram_message_templates.py` 已新增兩條 regression,`59 passed`;`telegram_gateway.py` py_compile 通過。 + +**裁決:** 這是 alert -> AI event packet 的只讀與訊息模板閉環,不是 Telegram 實發、Bot API 
call、Gateway queue write、host write 或 process kill 授權。所有卡片仍固定 `runtime_write_gate=0`,真正修復仍必須走 dry-run、owner approval、maintenance window、evidence ref、post-check 與 KM / PlayBook 回寫。 diff --git a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md index bb65172d..fa7b6e74 100644 --- a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md +++ b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md @@ -15,7 +15,7 @@ | P0 host / K3s recovery | DONE | 100% | 120 booted after console fsck at `2026-06-12 15:13`; latest 2026-06-14 18:15 readback shows 120 is reachable, K3s is active, `mon` and `mon1` are both `Ready control-plane`, and cold-start P0/P1 checks are green. | | P1 backup / alert / escrow | BLOCKED_DR_ESCROW | 92% | 2026-06-15 03:11 `backup-status` shows 110 `13/13 fresh failed=0`, 188 `2/2 fresh failed=0`, `core_blockers=0`, `escrow_missing=5`, last aggregate `2026-06-15 02:40:13`. Offsite / escrow report shows `SCRIPT_MISSING_COUNT=0`, `OFFSITE_CONFIGURED=1`, `RCLONE_CONFIGURED=1`, `ESCROW_MISSING_COUNT=5`. Owner request package is ready; actual marker write remains blocked on real non-secret evidence IDs. | | P2 service / data truth | VERIFIED_FULL_STACK_GREEN_FOR_SERVICE | 100% | 2026-06-18 13:43 cold-start verifies public route/TLS, API/Web route, momo health and current-month parity `10936|10936|2026-06-01|2026-06-17|2026-06-01|2026-06-17`, backup exporters, schedules, K3s node readiness, VIP, and 110 / 188 runtime health. K8s active failed Job count is `0`, bad pods are `0`, and cold-start returns `PASS=84 WARN=0 BLOCKED=0`. 
| -| P3 docs / automation contracts | DONE_WITH_RUNAWAY_PROCESS_AIOPS_LIVE_SCRAPED | 100% | Workplan, SOP v1.26, BACKUP-STATUS, LOGBOOK, 120 console/fsck recovery, Gitea backup stale-dump hardening, reboot ledger/version-comparison SOP, escrow evidence audit, 188 nginx Ansible baseline, 110 cold-start detector script, startup judgment layers, GO/NO-GO tree, host recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline checks, host role / load-balancing assessment, CD `known_hosts` guardrail, `fwupd-refresh.timer` rollback note, K3s filesystem event blocker, AWOOOI backup no-direct-offsite-sync contract, 110/188 Ansible source-of-truth, Gitea self-hosted readiness validation workflow, post-CD no-regression readbacks, stale-vs-active K8s failed Job classification, 110 runaway browser / CI load AIOps exporter + alert + gated remediation PlayBook, and 2026-06-18 live readback are updated. 14:31-14:32 Prometheus scrape confirms 110 `monitor_up=1`, orphan browser group count `0`, active CI containers `2`, load5/core around `0.79-0.81`, swap ratio around `1.0`, `remediation_authorized=0`, and missing/orphan alerts not firing. Repo-side readiness audit also checks runaway process exporter / remediation helper / alert group; live cold-start remains `PASS=84 WARN=0 BLOCKED=0` from the latest service readiness readback. 
| +| P3 docs / automation contracts | DONE_WITH_RUNAWAY_PROCESS_AIOPS_LIVE_SCRAPED | 100% | Workplan, SOP v1.26, BACKUP-STATUS, LOGBOOK, 120 console/fsck recovery, Gitea backup stale-dump hardening, reboot ledger/version-comparison SOP, escrow evidence audit, 188 nginx Ansible baseline, 110 cold-start detector script, startup judgment layers, GO/NO-GO tree, host recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline checks, host role / load-balancing assessment, CD `known_hosts` guardrail, `fwupd-refresh.timer` rollback note, K3s filesystem event blocker, AWOOOI backup no-direct-offsite-sync contract, 110/188 Ansible source-of-truth, Gitea self-hosted readiness validation workflow, post-CD no-regression readbacks, stale-vs-active K8s failed Job classification, 110 runaway browser / CI load AIOps exporter + alert + gated remediation PlayBook, Telegram / AI event packet mapping, and 2026-06-18 live readback are updated. 14:31-14:32 Prometheus scrape confirms 110 `monitor_up=1`, orphan browser group count `0`, active CI containers `2`, load5/core around `0.79-0.81`, swap ratio around `1.0`, `remediation_authorized=0`, and missing/orphan alerts not firing. Repo-side readiness audit also checks runaway process exporter / remediation helper / alert group; live cold-start remains `PASS=84 WARN=0 BLOCKED=0` from the latest service readiness readback. | Full cold-start service readiness may be declared green for the latest verified evidence set. As of 2026-06-18 13:43, services are green with `WARN=0` and `BLOCKED=0`; the retained stale `km-vectorize` failed Job remains historical evidence only. Do not declare DR scorecard complete while credential escrow evidence remains blocked. @@ -214,7 +214,7 @@ Do not run `truncate`, whole DB restore, force-push, DROP, or online root filesy ## 9. 
Progress Updates ```text -2026-06-18 15:10 Asia/Taipei +2026-06-18 14:20 Asia/Taipei Phase: P3 AI Ops runaway process automation Before: 110 CPU 滿載只能靠人工 `ps/top` 判斷;泛用 `HostHighCpuLoad` 無法分辨跨專案 orphan Chrome smoke 與合法 Gitea Actions CI load。 After: 新增 read-only `host-runaway-process-exporter.py`、gated `host-runaway-process-remediation.py`、Prometheus `host_runaway_process_alerts`、Ansible textfile exporter source-of-truth、SOP v1.26 與 `HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md`。Exporter 暴露 orphan browser、active CI、load/core、swap ratio 與 `remediation_authorized=0`;修復器預設 dry-run,`SIGTERM` 必須帶 owner approval、maintenance window、evidence ref。 @@ -228,6 +228,14 @@ Blocked: No for live observability; yes for runtime remediation by design until Next: Keep cron scrape under normal monitoring; if orphan count becomes >0, create AI triage packet and remediation dry-run before any gated `SIGTERM`. Completion: monitoring / alert / PlayBook / KM contract 100%; runtime auto-remediation remains gated at 0 until a real owner-approved apply is executed. +2026-06-18 14:38 Asia/Taipei +Phase: P3 AI Ops alert-to-event packet +Before: 泛用 CPU raw dump 可被轉成 AI automation card,但 `HostOrphanBrowserSmokeHighCpu` / `HostCiRunnerLoadSaturation` alert text 尚未有專屬 lane。 +After: Telegram 最後出口可將 `HostOrphanBrowserSmokeHighCpu` 轉成 `orphan_browser_smoke_runaway_process`,將 `HostCiRunnerLoadSaturation` 轉成 `ci_runner_load_saturation`;兩者都保留 `runtime_write_gate=0`,並要求 dry-run / owner / maintenance / evidence / KM / PlayBook / Verifier。 +Evidence: `apps/api/src/services/telegram_gateway.py`、`apps/api/tests/test_telegram_message_templates.py`,精準 pytest `59 passed`。 +Blocked: No for alert-to-event packet; yes for Telegram live send / runtime remediation by design. 
+Next: 等 code-review / CD 後做 production readback;若未來 alert 實際 firing,確認 Telegram card 與 AwoooP Run truth-chain 都能呈現同一 lane。 + 2026-06-18 13:43 Asia/Taipei Phase: P1/P2/P3 live readback Before: live cold-start was `PASS=83 WARN=1 BLOCKED=0`, result `DEGRADED`, because retained stale `km-vectorize-29689620` failed Job evidence was still counted as a service warning.