fix(api): route runaway host alerts to ai event packets

2026-06-18 14:39:31 +08:00
parent e025cda641
commit f358a0f6c3
6 changed files with 188 additions and 23 deletions
--- a/apps/api/src/services/telegram_gateway.py
+++ b/apps/api/src/services/telegram_gateway.py
@@ -97,6 +97,9 @@ _HOST_RESOURCE_ALERT_HEADER_RE = re.compile(
 _HOST_RESOURCE_TARGET_RE = re.compile(
    r"\b(?:WARN|CRIT|INFO)\s+(?P<target>[A-Za-z0-9_.-]+)\b"
 )
+_HOST_RESOURCE_ALERTNAME_RE = re.compile(r"\balertname\s*=\s*\"?(?P<alertname>[A-Za-z0-9_.:-]+)\"?")
+_HOST_RESOURCE_HOST_LABEL_RE = re.compile(r"\bhost\s*=\s*\"?(?P<host>[A-Za-z0-9_.:-]+)\"?")
+_HOST_RESOURCE_RULE_LABEL_RE = re.compile(r"\brule\s*=\s*\"?(?P<rule>[A-Za-z0-9_.:-]+)\"?")
 _HOST_PROCESS_LINE_RE = re.compile(
    r"^\s*(?P<user>\S+)\s+"
    r"(?P<pid>\d+)\s+"
@@ -112,6 +115,10 @@ def _is_host_resource_alert_text(text: str) -> bool:
        "CPU 警告" in text
        or "容器內 root Node.js 進程" in text
        or ("ps aux" in text and ("next build" in text or "npm run build" in text))
+        or "HostOrphanBrowserSmokeHighCpu" in text
+        or "HostCiRunnerLoadSaturation" in text
+        or "awoooi_host_runaway_browser_orphan" in text
+        or "awoooi_host_gitea_actions_active_container_count" in text
    )


@@ -165,10 +172,15 @@ def _parse_host_process_lines(text: str) -> list[dict[str, str | float]]:


 def _host_resource_alert_impact(
+    text: str,
    cpu_text: str,
    load_text: str,
    processes: list[dict[str, str | float]],
 ) -> str:
+    if "HostOrphanBrowserSmokeHighCpu" in text or "awoooi_host_runaway_browser_orphan" in text:
+        return "orphan Chrome / Playwright smoke 疑似吃滿 CPU；先驗 pgid、age、cmdline 與 active CI 分流"
+    if "HostCiRunnerLoadSaturation" in text or "awoooi_host_gitea_actions_active_container_count" in text:
+        return "CI runner 正在造成主機負載；先確認 Actions run、queue、timeout 與服務 SLO"
    try:
        load = float(load_text)
    except (TypeError, ValueError):
@@ -186,7 +198,20 @@ def _host_resource_alert_impact(
    return "資源升高但尚未確認根因；先聚合觀察並補足 owner 判讀"


-def _host_resource_automation_lane(processes: list[dict[str, str | float]]) -> tuple[str, str]:
+def _host_resource_automation_lane(
+    text: str,
+    processes: list[dict[str, str | float]],
+) -> tuple[str, str]:
+    if "HostOrphanBrowserSmokeHighCpu" in text or "awoooi_host_runaway_browser_orphan" in text:
+        return (
+            "orphan_browser_smoke_runaway_process",
+            "建立 runaway process triage packet；先跑 remediation dry-run，待 owner / window / evidence 後才可 SIGTERM",
+        )
+    if "HostCiRunnerLoadSaturation" in text or "awoooi_host_gitea_actions_active_container_count" in text:
+        return (
+            "ci_runner_load_saturation",
+            "建立 CI load evidence packet，彙整 Gitea Actions run、runner queue、load/core 與 swap；不 kill process",
+        )
    commands = " ".join(str(item.get("command", "")).lower() for item in processes)
    if "build" in commands:
        return (
@@ -204,6 +229,69 @@ def _host_resource_automation_lane(processes: list[dict[str, str | float]]) -> t
    )


+def _host_resource_alert_label(name: str, text: str) -> str:
+    patterns = {
+        "alertname": _HOST_RESOURCE_ALERTNAME_RE,
+        "host": _HOST_RESOURCE_HOST_LABEL_RE,
+        "rule": _HOST_RESOURCE_RULE_LABEL_RE,
+    }
+    match = patterns[name].search(text)
+    return match.group(name) if match else ""
+
+
+def _host_resource_alert_evidence_lines(
+    text: str,
+    processes: list[dict[str, str | float]],
+) -> list[str]:
+    if processes:
+        lines: list[str] = []
+        for item in processes[:3]:
+            process_cpu = f"{float(item['cpu']):g}%"
+            lines.append(
+                "├ "
+                f"<code>PID {html.escape(str(item['pid']))}</code> "
+                f"CPU <code>{html.escape(process_cpu)}</code>："
+                f"<code>{html.escape(str(item['command']))}</code>"
+            )
+        lines[-1] = "└" + lines[-1][1:]
+        return lines
+
+    alertname = _host_resource_alert_label("alertname", text)
+    rule = _host_resource_alert_label("rule", text)
+    if alertname:
+        lines = [
+            f"├ Alert：<code>{html.escape(alertname)}</code>",
+            "├ Metric：<code>awoooi_host_runaway_process_*</code>",
+        ]
+        if rule:
+            lines.append(f"└ Rule：<code>{html.escape(rule)}</code>")
+        else:
+            lines[-1] = "└" + lines[-1][1:]
+        return lines
+
+    return ["└ 尚未收到可解析的 top process，請補只讀 evidence。"]
+
+
+def _host_resource_recommendation_lines(text: str) -> list[str]:
+    if "HostOrphanBrowserSmokeHighCpu" in text or "awoooi_host_runaway_browser_orphan" in text:
+        return [
+            "├ 先讀 Prometheus orphan group / CPU / age / cmdline 指標與 textfile timestamp",
+            "├ 執行 `host-runaway-process-remediation.py` dry-run 產生候選，不直接 apply",
+            "└ 若 owner approval、maintenance window、evidence ref 齊全，才可 gated SIGTERM 並回寫 KM / PlayBook / Verifier",
+        ]
+    if "HostCiRunnerLoadSaturation" in text or "awoooi_host_gitea_actions_active_container_count" in text:
+        return [
+            "├ 確認 Gitea Actions run、runner queue、build timeout、load/core 與 swap trend",
+            "├ 若是合法 CI，標記為 capacity / queue 事件，不做 process remediation",
+            "└ 若 CI 卡死，產出 owner packet 與 runner cleanup dry-run，再進維護窗口",
+        ]
+    return [
+        "├ 確認是否為 CI/CD / Actions / runner 正常建置窗口",
+        "├ 若持續超過門檻，先查 runner queue、build job、容器資源限制與服務 SLO",
+        "└ 同一 host/service 5 分鐘聚合一次，避免洗版",
+    ]
+
+
 def format_host_resource_alert_card(text: str) -> str:
    """把 host CPU/load raw dump 轉成值班者可讀的 Telegram HTML 卡。"""
    if not _is_host_resource_alert_text(text):
@@ -214,29 +302,21 @@ def format_host_resource_alert_card(text: str) -> str:
    target = (
        header.group("target")
        if header
-        else (target_match.group("target") if target_match else "unknown-host")
+        else (
+            target_match.group("target")
+            if target_match
+            else (_host_resource_alert_label("host", text) or "unknown-host")
+        )
    )
    cpu = header.group("cpu") if header else "-"
    load = header.group("load") if header else "-"
    processes = _parse_host_process_lines(text)
-    impact = _host_resource_alert_impact(cpu, load, processes)
-    automation_lane, automation_next_step = _host_resource_automation_lane(processes)
+    impact = _host_resource_alert_impact(text, cpu, load, processes)
+    automation_lane, automation_next_step = _host_resource_automation_lane(text, processes)
    load_bar = _resource_load_bar(load)
    severity = "🔴" if load != "-" and load_bar.count("■") >= 7 else "⚠️"
-
-    evidence_lines: list[str] = []
-    for item in processes[:3]:
-        process_cpu = f"{float(item['cpu']):g}%"
-        evidence_lines.append(
-            "├ "
-            f"<code>PID {html.escape(str(item['pid']))}</code> "
-            f"CPU <code>{html.escape(process_cpu)}</code>："
-            f"<code>{html.escape(str(item['command']))}</code>"
-        )
-    if evidence_lines:
-        evidence_lines[-1] = "└" + evidence_lines[-1][1:]
-    else:
-        evidence_lines.append("└ 尚未收到可解析的 top process，請補只讀 evidence。")
+    evidence_lines = _host_resource_alert_evidence_lines(text, processes)
+    recommendation_lines = _host_resource_recommendation_lines(text)

    return "\n".join(
        [
@@ -257,9 +337,7 @@ def format_host_resource_alert_card(text: str) -> str:
            *evidence_lines,
            "",
            "<b>建議下一步</b>",
-            "├ 確認是否為 CI/CD / Actions / runner 正常建置窗口",
-            "├ 若持續超過門檻，先查 runner queue、build job、容器資源限制與服務 SLO",
-            "└ 同一 host/service 5 分鐘聚合一次，避免洗版",
+            *recommendation_lines,
            "",
            "<b>禁止事項</b>",
            "└ 不 kill process、不 restart Docker / Gitea、不 reload Nginx、不改 firewall；除非已有維護窗口與 owner 批准。",
--- a/apps/api/tests/test_telegram_message_templates.py
+++ b/apps/api/tests/test_telegram_message_templates.py
@@ -91,6 +91,49 @@ root         364  181  0.7 3491396 494608 ?      Rl   05:56   0:18 /opt/hostedto
    assert "processChild.js" not in result


+def test_orphan_browser_alert_becomes_runaway_process_event_packet() -> None:
+    """HostOrphanBrowserSmokeHighCpu 必須變成 runaway process 專屬事件包。"""
+    raw_alert = (
+        'alertname="HostOrphanBrowserSmokeHighCpu" host="110" '
+        'rule="stockplatform_headless_smoke" '
+        "description=\"orphan Chrome smoke group detected\""
+    )
+
+    result = format_host_resource_alert_card(raw_alert)
+
+    assert "主機資源告警｜110" in result
+    assert "ai_automation_alert_card_v1" in result
+    assert "orphan_browser_smoke_runaway_process" in result
+    assert "HostOrphanBrowserSmokeHighCpu" in result
+    assert "stockplatform_headless_smoke" in result
+    assert "host-runaway-process-remediation.py" in result
+    assert "dry-run" in result
+    assert "gated SIGTERM" in result
+    assert "KM / PlayBook / Verifier" in result
+    assert "runtime_write_gate=0" in result
+    assert "不 kill process" in result
+    assert "Docker" in result
+
+
+def test_ci_runner_load_alert_becomes_capacity_event_packet() -> None:
+    """HostCiRunnerLoadSaturation 不可被誤導成可 kill 的 runaway process。"""
+    raw_alert = (
+        'alertname="HostCiRunnerLoadSaturation" host="110" '
+        "awoooi_host_gitea_actions_active_container_count 2"
+    )
+
+    result = format_host_resource_alert_card(raw_alert)
+
+    assert "主機資源告警｜110" in result
+    assert "ci_runner_load_saturation" in result
+    assert "CI load evidence packet" in result
+    assert "Gitea Actions run" in result
+    assert "合法 CI" in result
+    assert "不做 process remediation" in result
+    assert "runtime_write_gate=0" in result
+    assert "不 kill process" in result
+
+
 @pytest.mark.asyncio
 async def test_send_alert_notification_normalizes_host_resource_raw_dump(monkeypatch) -> None:
    """send_alert_notification 是最後出口，必須自動套用 AI 自動化事件包。"""
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -59,6 +59,19 @@
 - Runtime auto-remediation：仍 `0%`，這是安全設計；若未來要由 AI 進入修復，必須先產生 triage packet、dry-run evidence、owner approval、maintenance window、evidence ref、post-check 與 KM 回寫，不得由 exporter 自行 kill。
 - 目前 110 高 CPU 判讀：orphan headless browser 已歸零；剩餘負載應歸因於 active CI 或其他一般 workload，不能再被誤判為前一輪 stockPlatform orphan Chrome 事故。

+### 2026-06-18 14:38 台北｜Host runaway alert -> AI event packet 補強
+
+**修補**：`TelegramGateway.send_alert_notification()` 的最後出口已能把新 Prometheus alert text 轉成專屬 AI automation card，而不是只靠泛用 CPU raw dump parser。`HostOrphanBrowserSmokeHighCpu` 會進 `orphan_browser_smoke_runaway_process` lane，顯示 alertname / host / rule、runaway dry-run、owner / maintenance / evidence gate、KM / PlayBook / Verifier 回寫；`HostCiRunnerLoadSaturation` 會進 `ci_runner_load_saturation` lane，要求彙整 Gitea Actions run、runner queue、load/core 與 swap trend，並明確標示合法 CI 不做 process remediation。
+
+**驗證**：
+- `DATABASE_URL=postgresql+asyncpg://ci:ci@localhost/ci PYTHONPATH=apps/api /Users/ogt/.pyenv/shims/python3.11 -m pytest apps/api/tests/test_telegram_message_templates.py -q -p no:cacheprovider`：`59 passed`。
+- `PYTHONPATH=apps/api /Users/ogt/.pyenv/shims/python3.11 -m py_compile apps/api/src/services/telegram_gateway.py`：通過。
+
+**完成度同步**：
+- Host runaway alert -> AI event packet：`0% -> 100%`。
+- Monitoring / alert / PlayBook / Telegram event packet / live scrape：`100%`。
+- Runtime remediation / Telegram 實發 / Bot API call / host write：仍 `0 / false`；本段未發 Telegram、未讀 secret、未 kill process、未重啟服務、未改 firewall/K8s。
+
 ## 2026-06-18｜P2-406B Receipt Readback Owner Review 本地完成

 **背景**：P2-004 已把依賴 / 供應鏈漂移收斂成只讀監控讀回；統帥要求每次推進都不能忘記目標與方向，因此本段把日報 / 週報 / 月報、Telegram receipt owner review、P2-004 drift monitor 與 P2-403J 報表真相串成同一個 owner review surface，讓治理頁可以直接看到 AI Agent 分工、互審與仍被關閉的 runtime 邊界。
--- a/docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md
+++ b/docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md
@@ -55,6 +55,16 @@ read-only exporter -> Prometheus alert -> AI triage packet -> KM / PlayBook evid
 | `HostRunawayProcessMonitorMissing` / `Stale` | exporter 缺失或超過 10 分鐘未更新 | 修 exporter / cron / textfile collector |
 | `HostRunawayProcessRemediationUnexpectedlyAuthorized` | `remediation_authorized > 0` | 立即回滾；禁止把監控器改成執行器 |

+Telegram / AI event packet contract:
+
+| Alert / input | Telegram lane | 必須顯示 |
+|---------------|---------------|----------|
+| `HostOrphanBrowserSmokeHighCpu` | `orphan_browser_smoke_runaway_process` | alertname、host、rule、runaway dry-run、owner / maintenance / evidence gate、KM / PlayBook / Verifier 回寫 |
+| `HostCiRunnerLoadSaturation` | `ci_runner_load_saturation` | Gitea Actions run、runner queue、load/core、swap trend、capacity / queue 判定、不做 process remediation |
+| raw `CPU 警告` / `ps aux` dump | `runner_build_resource_pressure` 或 `host_resource_pressure_triage` | sanitized top process evidence，不顯示 raw workspace path 或完整 process dump |
+
+所有 Telegram 卡片都必須保留 `runtime_write_gate=0`，並不得把 alert/card 轉成直接 kill / restart / reload 指令。
+
 ---

 ## 3. AI Triager 必做判讀
--- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md
+++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md
@@ -5088,3 +5088,16 @@ Trigger commit `f5cd37b7` 與 deploy marker `0ba92357` 已把 governance UI 的
 - 新增 pytest，鎖住 orphan 分類、Linux / BSD `ps` 解析、合法 / 年輕 process 忽略、CI/swap 指標、dry-run 與 apply gate 拒絕行為；readiness audit 以 pyenv Python 重跑後 `BLOCKED=0`。

 **裁決：** 這是 host CPU runaway 的 observe -> classify -> alert -> PlayBook -> KM contract -> gated remediation 閉環，不是 runtime 自動 kill 授權。AI 可以自動診斷、告警、產生 dry-run 修復包與 KM/PlayBook 回寫要求；真正 process termination 仍需 owner approval、maintenance window、evidence ref 與 post-check。Docker restart、systemd restart、Nginx reload、firewall change、secret read、host write 與 production write 仍全部禁止。
+
+### 2026-06-18 14:38 (台北) — §8 / Host CPU AIOps — Host runaway alert 轉 AI event packet
+
+**觸發**：前段已把 110 runaway process 監控、告警、PlayBook 與 live scrape 補齊，但 alert 進 Telegram 最後出口時仍需確認 `HostOrphanBrowserSmokeHighCpu` 與 `HostCiRunnerLoadSaturation` 不會被壓成泛用 CPU 文本。
+
+**已推進：**
+- `TelegramGateway.send_alert_notification()` 仍是最後出口；`format_host_resource_alert_card()` 現在可解析 `alertname`、`host`、`rule` label。
+- `HostOrphanBrowserSmokeHighCpu` 會轉成 `orphan_browser_smoke_runaway_process` lane，顯示 runaway dry-run、owner / maintenance / evidence gate、KM / PlayBook / Verifier 回寫。
+- `HostCiRunnerLoadSaturation` 會轉成 `ci_runner_load_saturation` lane，要求彙整 Gitea Actions run、runner queue、load/core 與 swap，並標示合法 CI 不做 process remediation。
+- `docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md` 已補 Telegram / AI event packet contract。
+- 精準測試 `apps/api/tests/test_telegram_message_templates.py` 已新增兩條 regression，`59 passed`；`telegram_gateway.py` py_compile 通過。
+
+**裁決：** 這是 alert -> AI event packet 的只讀與訊息模板閉環，不是 Telegram 實發、Bot API call、Gateway queue write、host write 或 process kill 授權。所有卡片仍固定 `runtime_write_gate=0`，真正修復仍必須走 dry-run、owner approval、maintenance window、evidence ref、post-check 與 KM / PlayBook 回寫。
--- a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md
+++ b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md
@@ -15,7 +15,7 @@
 | P0 host / K3s recovery | DONE | 100% | 120 booted after console fsck at `2026-06-12 15:13`; latest 2026-06-14 18:15 readback shows 120 is reachable, K3s is active, `mon` and `mon1` are both `Ready control-plane`, and cold-start P0/P1 checks are green. |
 | P1 backup / alert / escrow | BLOCKED_DR_ESCROW | 92% | 2026-06-15 03:11 `backup-status` shows 110 `13/13 fresh failed=0`, 188 `2/2 fresh failed=0`, `core_blockers=0`, `escrow_missing=5`, last aggregate `2026-06-15 02:40:13`. Offsite / escrow report shows `SCRIPT_MISSING_COUNT=0`, `OFFSITE_CONFIGURED=1`, `RCLONE_CONFIGURED=1`, `ESCROW_MISSING_COUNT=5`. Owner request package is ready; actual marker write remains blocked on real non-secret evidence IDs. |
 | P2 service / data truth | VERIFIED_FULL_STACK_GREEN_FOR_SERVICE | 100% | 2026-06-18 13:43 cold-start verifies public route/TLS, API/Web route, momo health and current-month parity `10936|10936|2026-06-01|2026-06-17|2026-06-01|2026-06-17`, backup exporters, schedules, K3s node readiness, VIP, and 110 / 188 runtime health. K8s active failed Job count is `0`, bad pods are `0`, and cold-start returns `PASS=84 WARN=0 BLOCKED=0`. |
-| P3 docs / automation contracts | DONE_WITH_RUNAWAY_PROCESS_AIOPS_LIVE_SCRAPED | 100% | Workplan, SOP v1.26, BACKUP-STATUS, LOGBOOK, 120 console/fsck recovery, Gitea backup stale-dump hardening, reboot ledger/version-comparison SOP, escrow evidence audit, 188 nginx Ansible baseline, 110 cold-start detector script, startup judgment layers, GO/NO-GO tree, host recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline checks, host role / load-balancing assessment, CD `known_hosts` guardrail, `fwupd-refresh.timer` rollback note, K3s filesystem event blocker, AWOOOI backup no-direct-offsite-sync contract, 110/188 Ansible source-of-truth, Gitea self-hosted readiness validation workflow, post-CD no-regression readbacks, stale-vs-active K8s failed Job classification, 110 runaway browser / CI load AIOps exporter + alert + gated remediation PlayBook, and 2026-06-18 live readback are updated. 14:31-14:32 Prometheus scrape confirms 110 `monitor_up=1`, orphan browser group count `0`, active CI containers `2`, load5/core around `0.79-0.81`, swap ratio around `1.0`, `remediation_authorized=0`, and missing/orphan alerts not firing. Repo-side readiness audit also checks runaway process exporter / remediation helper / alert group; live cold-start remains `PASS=84 WARN=0 BLOCKED=0` from the latest service readiness readback. |
+| P3 docs / automation contracts | DONE_WITH_RUNAWAY_PROCESS_AIOPS_LIVE_SCRAPED | 100% | Workplan, SOP v1.26, BACKUP-STATUS, LOGBOOK, 120 console/fsck recovery, Gitea backup stale-dump hardening, reboot ledger/version-comparison SOP, escrow evidence audit, 188 nginx Ansible baseline, 110 cold-start detector script, startup judgment layers, GO/NO-GO tree, host recovery cards, explicit Plan B degraded-operation path, machine-readable `plan_b` baseline, readiness-audit Plan B guard, B0-B5 service levels, T+0/T+120 fallback timeline checks, host role / load-balancing assessment, CD `known_hosts` guardrail, `fwupd-refresh.timer` rollback note, K3s filesystem event blocker, AWOOOI backup no-direct-offsite-sync contract, 110/188 Ansible source-of-truth, Gitea self-hosted readiness validation workflow, post-CD no-regression readbacks, stale-vs-active K8s failed Job classification, 110 runaway browser / CI load AIOps exporter + alert + gated remediation PlayBook, Telegram / AI event packet mapping, and 2026-06-18 live readback are updated. 14:31-14:32 Prometheus scrape confirms 110 `monitor_up=1`, orphan browser group count `0`, active CI containers `2`, load5/core around `0.79-0.81`, swap ratio around `1.0`, `remediation_authorized=0`, and missing/orphan alerts not firing. Repo-side readiness audit also checks runaway process exporter / remediation helper / alert group; live cold-start remains `PASS=84 WARN=0 BLOCKED=0` from the latest service readiness readback. |

 Full cold-start service readiness may be declared green for the latest verified evidence set. As of 2026-06-18 13:43, services are green with `WARN=0` and `BLOCKED=0`; the retained stale `km-vectorize` failed Job remains historical evidence only. Do not declare DR scorecard complete while credential escrow evidence remains blocked.

@@ -214,7 +214,7 @@ Do not run `truncate`, whole DB restore, force-push, DROP, or online root filesy
 ## 9. Progress Updates

 ```text
-2026-06-18 15:10 Asia/Taipei
+2026-06-18 14:20 Asia/Taipei
 Phase: P3 AI Ops runaway process automation
 Before: 110 CPU 滿載只能靠人工 `ps/top` 判斷；泛用 `HostHighCpuLoad` 無法分辨跨專案 orphan Chrome smoke 與合法 Gitea Actions CI load。
 After: 新增 read-only `host-runaway-process-exporter.py`、gated `host-runaway-process-remediation.py`、Prometheus `host_runaway_process_alerts`、Ansible textfile exporter source-of-truth、SOP v1.26 與 `HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md`。Exporter 暴露 orphan browser、active CI、load/core、swap ratio 與 `remediation_authorized=0`；修復器預設 dry-run，`SIGTERM` 必須帶 owner approval、maintenance window、evidence ref。
@@ -228,6 +228,14 @@ Blocked: No for live observability; yes for runtime remediation by design until
 Next: Keep cron scrape under normal monitoring; if orphan count becomes >0, create AI triage packet and remediation dry-run before any gated `SIGTERM`.
 Completion: monitoring / alert / PlayBook / KM contract 100%; runtime auto-remediation remains gated at 0 until a real owner-approved apply is executed.

+2026-06-18 14:38 Asia/Taipei
+Phase: P3 AI Ops alert-to-event packet
+Before: 泛用 CPU raw dump 可被轉成 AI automation card，但 `HostOrphanBrowserSmokeHighCpu` / `HostCiRunnerLoadSaturation` alert text 尚未有專屬 lane。
+After: Telegram 最後出口可將 `HostOrphanBrowserSmokeHighCpu` 轉成 `orphan_browser_smoke_runaway_process`，將 `HostCiRunnerLoadSaturation` 轉成 `ci_runner_load_saturation`；兩者都保留 `runtime_write_gate=0`，並要求 dry-run / owner / maintenance / evidence / KM / PlayBook / Verifier。
+Evidence: `apps/api/src/services/telegram_gateway.py`、`apps/api/tests/test_telegram_message_templates.py`，精準 pytest `59 passed`。
+Blocked: No for alert-to-event packet; yes for Telegram live send / runtime remediation by design.
+Next: 等 code-review / CD 後做 production readback；若未來 alert 實際 firing，確認 Telegram card 與 AwoooP Run truth-chain 都能呈現同一 lane。
+
 2026-06-18 13:43 Asia/Taipei
 Phase: P1/P2/P3 live readback
 Before: live cold-start was `PASS=83 WARN=1 BLOCKED=0`, result `DEGRADED`, because retained stale `km-vectorize-29689620` failed Job evidence was still counted as a service warning.