fix(api): 將主機資源告警收斂成脫敏事件卡

2026-06-18 15:22:11 +08:00
parent dafe534259
commit 5d76ac1145
5 changed files with 216 additions and 29 deletions
--- a/apps/api/src/services/telegram_gateway.py
+++ b/apps/api/src/services/telegram_gateway.py
@@ -133,6 +133,10 @@ def _resource_load_bar(load_text: str) -> str:

 def _compact_host_process_command(command: str) -> str:
    lowered = command.lower()
+    if "prisma" in lowered and "generate" in lowered:
+        return "Prisma generate"
+    if "prisma" in lowered and ("build/index.js" in lowered or "prisma/build" in lowered):
+        return "Prisma CLI child"
    if "jest-worker" in lowered or "processchild.js" in lowered:
        return "Next.js build worker"
    if "next build" in lowered:
@@ -145,7 +149,11 @@ def _compact_host_process_command(command: str) -> str:
        return "npm run build"
    if "node " in lowered:
        return "Node.js process"
-    return command.rsplit("/", 1)[-1][:80]
+    summary = re.sub(r"https?://\S+", "[redacted-url]", command)
+    summary = re.sub(r"\{.*", "", summary)
+    summary = re.sub(r"/(?:[\w@.+-]+/)+", "", summary)
+    summary = re.sub(r"\s+", " ", summary).strip()
+    return (summary.rsplit("/", 1)[-1] or "process")[:80]


 def _parse_host_process_lines(text: str) -> list[dict[str, str | float]]:
@@ -164,6 +172,7 @@ def _parse_host_process_lines(text: str) -> list[dict[str, str | float]]:
        processes.append(
            {
                "pid": match.group("pid"),
+                "user": match.group("user"),
                "cpu": cpu,
                "command": _compact_host_process_command(command),
            }
@@ -191,6 +200,8 @@ def _host_resource_alert_impact(
        cpu = 0.0

    commands = " ".join(str(item.get("command", "")).lower() for item in processes)
+    if "prisma" in commands:
+        return "Prisma generate / package build 程序正在吃 CPU；先確認 CI/CD、runner job 與供應鏈來源，不可直接 kill"
    if "build" in commands:
        return "建置程序正在吃 CPU；先確認 CI/CD 或 runner 排程，不可直接重啟服務"
    if load >= 8 or cpu >= 90:
@@ -213,6 +224,11 @@ def _host_resource_automation_lane(
            "建立 CI load evidence packet，彙整 Gitea Actions run、runner queue、load/core 與 swap；不 kill process",
        )
    commands = " ".join(str(item.get("command", "")).lower() for item in processes)
+    if "prisma" in commands:
+        return (
+            "runner_prisma_generate_resource_pressure",
+            "建立 Prisma / package install 資源壓力候選，交由 AI 彙整 CI/CD run、套件來源、runner queue 與供應鏈 evidence",
+        )
    if "build" in commands:
        return (
            "runner_build_resource_pressure",
@@ -247,11 +263,16 @@ def _host_resource_alert_evidence_lines(
        lines: list[str] = []
        for item in processes[:3]:
            process_cpu = f"{float(item['cpu']):g}%"
+            user_suffix = (
+                " root"
+                if str(item.get("user", "")).lower() == "root"
+                else ""
+            )
            lines.append(
                "├ "
                f"<code>PID {html.escape(str(item['pid']))}</code> "
                f"CPU <code>{html.escape(process_cpu)}</code>："
-                f"<code>{html.escape(str(item['command']))}</code>"
+                f"<code>{html.escape(str(item['command']))}{html.escape(user_suffix)}</code>"
            )
        lines[-1] = "└" + lines[-1][1:]
        return lines
@@ -286,12 +307,28 @@ def _host_resource_recommendation_lines(text: str) -> list[str]:
            "└ 若 CI 卡死，產出 owner packet 與 runner cleanup dry-run，再進維護窗口",
        ]
    return [
-        "├ 確認是否為 CI/CD / Actions / runner 正常建置窗口",
-        "├ 若持續超過門檻，先查 runner queue、build job、容器資源限制與服務 SLO",
+        "├ 確認是否為 CI/CD / Actions / runner 正常建置或 package install 窗口",
+        "├ 若持續超過門檻，先查 runner queue、build job、套件來源、容器資源限制與服務 SLO",
        "└ 同一 host/service 5 分鐘聚合一次，避免洗版",
    ]


+def _host_resource_alert_severity(load_text: str, cpu_text: str) -> str:  # maps load/CPU text to a "P1"/"P2"/"P3" label
+    try:
+        load = float(load_text)
+    except (TypeError, ValueError):  # non-numeric or missing load text is treated as 0.0
+        load = 0.0
+    try:
+        cpu = float(cpu_text)
+    except (TypeError, ValueError):  # same 0.0 fallback for unparseable CPU text
+        cpu = 0.0
+    if load >= 8 or cpu >= 90:  # either metric reaching its upper threshold is enough for P1
+        return "P1"
+    if load >= 4 or cpu >= 70:  # mid band: either metric reaching its lower threshold
+        return "P2"
+    return "P3"  # both metrics below the P2 thresholds (including the 0.0 fallbacks)
+
+
 def format_host_resource_alert_card(text: str) -> str:
    """把 host CPU/load raw dump 轉成值班者可讀的 Telegram HTML 卡。"""
    if not _is_host_resource_alert_text(text):
@@ -314,19 +351,23 @@ def format_host_resource_alert_card(text: str) -> str:
    impact = _host_resource_alert_impact(text, cpu, load, processes)
    automation_lane, automation_next_step = _host_resource_automation_lane(text, processes)
    load_bar = _resource_load_bar(load)
-    severity = "🔴" if load != "-" and load_bar.count("■") >= 7 else "⚠️"
+    severity = _host_resource_alert_severity(load, cpu)
+    root_process_count = sum(
+        1 for item in processes if str(item.get("user", "")).lower() == "root"
+    )
    evidence_lines = _host_resource_alert_evidence_lines(text, processes)
    recommendation_lines = _host_resource_recommendation_lines(text)

    return "\n".join(
        [
-            f"{severity} <b>主機資源告警｜{html.escape(target)}</b>",
+            f"<b>{severity} 主機資源壓力｜{html.escape(target)}</b>",
            "<code>ai_automation_alert_card_v1</code>",
            "",
-            "<b>影響判讀</b>",
+            "<b>一眼摘要</b>",
            f"├ CPU 使用率：<code>{html.escape(cpu)}%</code>",
            f"├ Load：<code>{html.escape(load)}</code> <code>{load_bar}</code>",
-            f"└ 狀態：<b>{html.escape(impact)}</b>",
+            f"├ 容器 root 進程：<code>{root_process_count}</code>",
+            f"└ 影響：<b>{html.escape(impact)}</b>",
            "",
            "<b>AI 自動化判讀</b>",
            f"├ Lane：<code>{html.escape(automation_lane)}</code>",
@@ -345,6 +386,13 @@ def format_host_resource_alert_card(text: str) -> str:
    )


+def normalize_alert_notification_payload(text: str, parse_mode: str) -> tuple[str, str]:
+    """Sanitize monitoring alerts at the final egress; host-type raw dumps are always turned into an HTML event card."""
+    if _is_host_resource_alert_text(text):
+        return format_host_resource_alert_card(text), "HTML"  # the caller's parse_mode is replaced with "HTML" for the card
+    return text, parse_mode  # any other text and its parse_mode pass through unchanged
+
+
 def _top_gateway_bucket(
    buckets: list[dict[str, object]],
    field: str,
@@ -8222,15 +8270,14 @@ class TelegramGateway:
        reply_markup: dict | None = None,
    ) -> dict:
        """發送告警型純文字通知到 SRE 戰情室群組。"""
-        safe_text = (
-            format_host_resource_alert_card(text)
-            if parse_mode.upper() == "HTML"
-            else text
+        safe_text, effective_parse_mode = normalize_alert_notification_payload(
+            text,
+            parse_mode,
        )
        payload: dict = {
            "chat_id": self.alert_chat_id,
            "text": safe_text[:4096],
-            "parse_mode": parse_mode,
+            "parse_mode": effective_parse_mode,
        }
        if reply_markup:
            payload["reply_markup"] = reply_markup
@@ -8263,10 +8310,14 @@ class TelegramGateway:
        Returns:
            dict: Telegram API 回應
        """
+        safe_text, effective_parse_mode = normalize_alert_notification_payload(
+            text,
+            parse_mode,
+        )
        payload: dict = {
            "chat_id": chat_id or self.alert_chat_id,
-            "text": text[:4096],
-            "parse_mode": parse_mode,
+            "text": safe_text[:4096],
+            "parse_mode": effective_parse_mode,
            "disable_web_page_preview": disable_web_page_preview,
        }
        return await self._send_request("sendMessage", payload)
--- a/apps/api/tests/test_telegram_message_templates.py
+++ b/apps/api/tests/test_telegram_message_templates.py
@@ -69,11 +69,12 @@ root         364  181  0.7 3491396 494608 ?      Rl   05:56   0:18 /opt/hostedto

    result = format_host_resource_alert_card(raw_alert)

-    assert "主機資源告警｜h110-gitea" in result
+    assert "P1 主機資源壓力｜h110-gitea" in result
    assert "ai_automation_alert_card_v1" in result
-    assert "影響判讀" in result
+    assert "一眼摘要" in result
    assert "CPU 使用率" in result
    assert "Load" in result
+    assert "容器 root 進程" in result
    assert "AI 自動化判讀" in result
    assert "runner_build_resource_pressure" in result
    assert "candidate_only" in result
@@ -101,7 +102,7 @@ def test_orphan_browser_alert_becomes_runaway_process_event_packet() -> None:

    result = format_host_resource_alert_card(raw_alert)

-    assert "主機資源告警｜110" in result
+    assert "P3 主機資源壓力｜110" in result
    assert "ai_automation_alert_card_v1" in result
    assert "orphan_browser_smoke_runaway_process" in result
    assert "HostOrphanBrowserSmokeHighCpu" in result
@@ -124,7 +125,7 @@ def test_ci_runner_load_alert_becomes_capacity_event_packet() -> None:

    result = format_host_resource_alert_card(raw_alert)

-    assert "主機資源告警｜110" in result
+    assert "P3 主機資源壓力｜110" in result
    assert "ci_runner_load_saturation" in result
    assert "CI load evidence packet" in result
    assert "Gitea Actions run" in result
@@ -166,6 +167,89 @@ async def test_send_alert_notification_normalizes_host_resource_raw_dump(monkeyp
    assert "/workspace/wooo/" not in payload["text"]


+def test_prisma_generate_alert_redacts_raw_process_json_and_urls() -> None:
+    """Prisma generate alerts for root Node.js processes must not send paths, URLs, or JSON as-is."""
+    raw_alert = """WARN h110-gitea 🔴 CPU 警告: used=29.8% load=8.62
+
+WARN h110-gitea ⚠️  容器內 root Node.js 進程:
+root         365 27.5  0.1 1283324 108564 ?      Sl   06:27   0:00 node /opt/hostedtoolcache/node/20.20.2/x64/bin/pnpm prisma generate
+root         376 15.5  0.3 11756860 217220 ?     Rl   06:27   0:03 node ./node_modules/.bin/../prisma/build/index.js generate
+root         392  0.0  0.0 1096836 53400 ?       Ssl  06:27   0:00 /opt/hostedtoolcache/node/20.20.2/x64/bin/node /workspace/wooo/vibework/node_modules/.pnpm/prisma@7.8.0_types+react@18.3.30_react@18.3.30/node_modules/prisma/build/index.js {"product":"prisma","version":"7.8.0","endpoint":"https://checkpoint.prisma.io","command":"generate"}
+"""
+
+    result = format_host_resource_alert_card(raw_alert)
+
+    assert "P1 主機資源壓力｜h110-gitea" in result  # load=8.62 is at or above the P1 load threshold of 8
+    assert "runner_prisma_generate_resource_pressure" in result
+    assert "Prisma generate" in result
+    assert "容器 root 進程：<code>3</code>" in result  # all three sample process lines have user "root"
+    assert "套件來源" in result
+    assert "runtime_write_gate=0" in result
+    assert "root         365" not in result  # the raw process line must not survive into the card
+    assert "checkpoint.prisma.io" not in result  # external URL from the JSON argument
+    assert "node_modules" not in result
+    assert "/opt/hostedtoolcache" not in result
+    assert "/workspace/wooo" not in result
+    assert '"product":"prisma"' not in result  # JSON payload passed on the command line
+
+
+@pytest.mark.asyncio
+async def test_send_alert_notification_forces_html_card_for_markdown_host_alert(monkeypatch) -> None:
+    """Even when the caller uses Markdown, the final egress must still turn a host raw dump into an HTML card."""
+    sent_requests = []
+    gateway = TelegramGateway()
+
+    async def fake_send_request(method, payload):  # records the outgoing request instead of sending it
+        sent_requests.append((method, payload))
+        return {"ok": True}
+
+    monkeypatch.setattr(TelegramGateway, "alert_chat_id", property(lambda _self: "chat"))
+    monkeypatch.setattr(gateway, "_send_request", fake_send_request)
+
+    await gateway.send_alert_notification(
+        text=(
+            "WARN h110-gitea 🔴 CPU 警告: used=29.8% load=8.62\n"
+            "root         365 27.5  0.1 1283324 108564 ?      Sl   06:27   0:00 "
+            "node /opt/hostedtoolcache/node/20.20.2/x64/bin/pnpm prisma generate"
+        ),
+        parse_mode="MarkdownV2",  # deliberately not HTML; the test expects the gateway to override it
+    )
+
+    payload = sent_requests[0][1]  # payload dict of the first recorded request
+    assert payload["parse_mode"] == "HTML"
+    assert "ai_automation_alert_card_v1" in payload["text"]
+    assert "runner_prisma_generate_resource_pressure" in payload["text"]
+    assert "/opt/hostedtoolcache" not in payload["text"]
+
+
+@pytest.mark.asyncio
+async def test_send_text_normalizes_host_resource_alert(monkeypatch) -> None:
+    """The send_text bypass must not send a host resource raw dump as-is either."""
+    sent_requests = []
+    gateway = TelegramGateway()
+
+    async def fake_send_request(method, payload):  # records the outgoing request instead of sending it
+        sent_requests.append((method, payload))
+        return {"ok": True}
+
+    monkeypatch.setattr(TelegramGateway, "alert_chat_id", property(lambda _self: "chat"))
+    monkeypatch.setattr(gateway, "_send_request", fake_send_request)
+
+    await gateway.send_text(
+        text=(
+            "WARN h110-gitea 🔴 CPU 警告: used=29.8% load=8.62\n"
+            "root         365 27.5  0.1 1283324 108564 ?      Sl   06:27   0:00 "
+            "node /workspace/wooo/vibework/node_modules/.bin/prisma generate"
+        ),
+    )  # no parse_mode passed: the call relies on send_text's default
+
+    payload = sent_requests[0][1]  # payload dict of the first recorded request
+    assert payload["parse_mode"] == "HTML"
+    assert "P1 主機資源壓力" in payload["text"]  # load=8.62 is at or above the P1 load threshold of 8
+    assert "node_modules" not in payload["text"]
+    assert "/workspace/wooo" not in payload["text"]
+
+
 def test_weekly_report_marks_all_zero_as_low_trust_anomaly() -> None:
    report = WeeklyReportMessage(
        week_range="2026-W24",
--- a/docs/ARCHITECTURE_MEMORY.md
+++ b/docs/ARCHITECTURE_MEMORY.md
@@ -2,8 +2,27 @@

 > AI 模組地圖索引 - 每次新增積木後必須登記

-**最後更新**: 2026-03-23 (Phase 9 Agent Teams)
-**維護者**: Claude Code + C-Suite
+**最後更新**: 2026-06-18 (IwoooS AI 自動化產品契約)
+**維護者**: AWOOOI 工程團隊
+
+---
+
+## 產品定位記憶 (2026-06-18)
+
+AWOOOI / AwoooP / IwoooS 是 AI 自動化產品，不是單純的監控頁、告警轉發器、資安清冊或文件集合。任何告警、資安事件、主機訊號、CI/CD 訊號、Wazuh / Kali / SOC 證據、Nginx / gateway / runtime config drift、code review 候選與人工批准，都必須回到可驗證的 AI 自動化閉環。
+
+合格閉環必須能回答：
+
+1. Sensor / Evidence：訊號來源與只讀證據在哪裡。
+2. Normalizer：原始訊號如何被轉成可判讀事件包。
+3. AI 分流：由哪個 AI lane / 規則 / agent 判讀。
+4. 候選：產生哪些修復、隔離、回復、文件或人工送審候選。
+5. 閘門：需要哪些 owner / reviewer / maintenance window / rollback / secret boundary。
+6. 執行邊界：哪些行為仍是 `0 / false`，不得被 UI 可見或 CD 成功誤判成授權。
+7. 驗證器：完成後用什麼 readback、smoke、metric、receipt 或 timeline 驗證。
+8. 學習回寫：如何回寫 KM、PlayBook、LOGBOOK、候選規則與前台狀態。
+
+若一個工作項目無法回答 AI 判讀到哪、候選是什麼、閘門是否滿足、驗證器是否完成、學習是否回寫，就不得宣稱 AI 自動化完成，也不得上修 IwoooS headline 完成度。前台只能呈現脫敏後的產品資訊與證據摘要，不得顯示工作視窗對話、內部協作內容、個人 namespace、內網位址、secret 片段或 raw process dump。

 ---

--- a/docs/awooop/TELEGRAM-INCIDENT-NOTIFICATION-MODEL.md
+++ b/docs/awooop/TELEGRAM-INCIDENT-NOTIFICATION-MODEL.md
@@ -62,12 +62,43 @@ IwoooS / AwoooP 是 AI 自動化產品，Telegram 告警不是終點，而是自

 Host / runner 資源告警的第一版落地：

- `TelegramGateway.send_alert_notification()` 會在 HTML 告警出口自動套用 host resource formatter。
- `CPU 警告`、`容器內 root Node.js 進程`、含 `ps aux` 且指向 build 程序的文字，會被轉成 `ai_automation_alert_card_v1`。
- raw process dump 會被壓成 `PID + CPU + 精簡命令`，不保留 `/workspace/...`、完整 node module 路徑或整段 `ps aux`。
- build 壓力會分流到 `runner_build_resource_pressure`，預設 `candidate_only / runtime_write_gate=0`。
+- `TelegramGateway.send_alert_notification()` 與 `send_text()` 會在最後出口自動套用 host resource formatter；即使呼叫端傳入 Markdown，也必須強制改成脫敏 HTML 卡片。
+- `CPU 警告`、`容器內 root Node.js 進程`、含 `ps aux` 且指向 build / package install / Prisma generate 程序的文字，會被轉成 `ai_automation_alert_card_v1`。
+- raw process dump 會被壓成 `PID + CPU + 精簡命令`，不保留 `/workspace/...`、`/opt/hostedtoolcache/...`、完整 `node_modules` 路徑、外部檢查 URL、JSON payload 或整段 `ps aux`。
+- build 壓力會分流到 `runner_build_resource_pressure`；精簡命令含 `prisma` 的壓力（Prisma generate 及其帶出的 package install / build）會分流到 `runner_prisma_generate_resource_pressure`；預設都是 `candidate_only / runtime_write_gate=0`。
 - 這只建立 AI 候選與判讀入口，不代表允許 kill process、restart 服務、改 Nginx、改 firewall 或執行 Kali active scan。

+Host / runner 告警卡片必須採用下列第一屏版型：
+
+```text
+P1 主機資源壓力｜h110-gitea
+ai_automation_alert_card_v1
+
+一眼摘要
+├ CPU 使用率：29.8%
+├ Load：8.62 ■■■■■■■■
+├ 容器 root 進程：3
+└ 影響：Prisma generate / package build 程序正在吃 CPU；先確認 CI/CD、runner job 與供應鏈來源，不可直接 kill
+
+AI 自動化判讀
+├ Lane：runner_prisma_generate_resource_pressure
+├ Gate：candidate_only / runtime_write_gate=0
+└ 下一步：建立 Prisma / package install 資源壓力候選，交由 AI 彙整 CI/CD run、套件來源、runner queue 與供應鏈 evidence
+
+Top evidence
+├ PID 365 CPU 27.5%：Prisma generate root
+├ PID 376 CPU 15.5%：Prisma generate root
+└ PID 392 CPU 0%：Prisma generate root
+
+建議下一步
+├ 確認是否為 CI/CD / Actions / runner 正常建置或 package install 窗口
+├ 若持續超過門檻，先查 runner queue、build job、套件來源、容器資源限制與服務 SLO
+└ 同一 host/service 5 分鐘聚合一次，避免洗版
+
+禁止事項
+└ 不 kill process、不 restart Docker / Gitea、不 reload Nginx、不改 firewall；除非已有維護窗口與 owner 批准。
+```
+
 ## 與 AwoooP 的分工

 | 介面 | 承載內容 |
--- a/docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md
+++ b/docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md
@@ -1,7 +1,7 @@
-# Host Runaway Process AIOps PlayBook
+# 主機異常行程 AIOps PlayBook

-> Last updated: 2026-06-18 Asia/Taipei
-> Scope: 110 host CPU 滿載、orphan Chrome / Playwright smoke、Gitea Actions CI load 分流。
+> 最後更新：2026-06-18 Asia/Taipei
+> 範圍：110 host CPU 滿載、orphan Chrome / Playwright smoke、Gitea Actions CI load、Prisma / package install 資源壓力分流。

 ---

@@ -61,10 +61,12 @@ Telegram / AI event packet contract:
 |---------------|---------------|----------|
 | `HostOrphanBrowserSmokeHighCpu` | `orphan_browser_smoke_runaway_process` | alertname、host、rule、runaway dry-run、owner / maintenance / evidence gate、KM / PlayBook / Verifier 回寫 |
 | `HostCiRunnerLoadSaturation` | `ci_runner_load_saturation` | Gitea Actions run、runner queue、load/core、swap trend、capacity / queue 判定、不做 process remediation |
-| raw `CPU 警告` / `ps aux` dump | `runner_build_resource_pressure` 或 `host_resource_pressure_triage` | sanitized top process evidence，不顯示 raw workspace path 或完整 process dump |
+| raw `CPU 警告` / `ps aux` dump | `runner_build_resource_pressure`、`runner_prisma_generate_resource_pressure` 或 `host_resource_pressure_triage` | sanitized top process evidence，不顯示 raw workspace path、hosted toolcache path、`node_modules` path、外部 URL、JSON payload 或完整 process dump |

 所有 Telegram 卡片都必須保留 `runtime_write_gate=0`，並不得把 alert/card 轉成直接 kill / restart / reload 指令。

+Host / runner raw dump 進入 Telegram 前必須先被 `TelegramGateway` 壓成 `P1/P2/P3 主機資源壓力` 卡片。第一屏只允許顯示 CPU、load、root process count、影響摘要、AI lane、candidate gate、Top evidence、建議下一步與禁止事項；完整命令列、套件 JSON、外部檢查 endpoint、內部 workspace path 與 raw `ps aux` 必須留在內部 evidence / timeline，不得外送。
+
 ---

 ## 3. AI Triager 必做判讀