From 5d76ac114584229555359b09a26e4f13b440f6df Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 18 Jun 2026 15:22:11 +0800 Subject: [PATCH] =?UTF-8?q?fix(api):=20=E5=B0=87=E4=B8=BB=E6=A9=9F?= =?UTF-8?q?=E8=B3=87=E6=BA=90=E5=91=8A=E8=AD=A6=E6=94=B6=E6=96=82=E6=88=90?= =?UTF-8?q?=E8=84=AB=E6=95=8F=E4=BA=8B=E4=BB=B6=E5=8D=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/api/src/services/telegram_gateway.py | 81 +++++++++++++--- .../tests/test_telegram_message_templates.py | 92 ++++++++++++++++++- docs/ARCHITECTURE_MEMORY.md | 23 ++++- .../TELEGRAM-INCIDENT-NOTIFICATION-MODEL.md | 39 +++++++- .../HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md | 10 +- 5 files changed, 216 insertions(+), 29 deletions(-) diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index 00d3065c..077fd187 100644 --- a/apps/api/src/services/telegram_gateway.py +++ b/apps/api/src/services/telegram_gateway.py @@ -133,6 +133,10 @@ def _resource_load_bar(load_text: str) -> str: def _compact_host_process_command(command: str) -> str: lowered = command.lower() + if "prisma" in lowered and "generate" in lowered: + return "Prisma generate" + if "prisma" in lowered and ("build/index.js" in lowered or "prisma/build" in lowered): + return "Prisma CLI child" if "jest-worker" in lowered or "processchild.js" in lowered: return "Next.js build worker" if "next build" in lowered: @@ -145,7 +149,11 @@ def _compact_host_process_command(command: str) -> str: return "npm run build" if "node " in lowered: return "Node.js process" - return command.rsplit("/", 1)[-1][:80] + summary = re.sub(r"https?://\S+", "[redacted-url]", command) + summary = re.sub(r"\{.*", "", summary) + summary = re.sub(r"/(?:[\w@.+-]+/)+", "", summary) + summary = re.sub(r"\s+", " ", summary).strip() + return (summary.rsplit("/", 1)[-1] or "process")[:80] def _parse_host_process_lines(text: str) -> list[dict[str, str | float]]: @@ -164,6 +172,7 @@ def _parse_host_process_lines(text: str) -> list[dict[str, str | float]]: processes.append( { "pid": match.group("pid"), + "user": match.group("user"), "cpu": cpu, "command": _compact_host_process_command(command), } @@ -191,6 +200,8 @@ def _host_resource_alert_impact( cpu = 0.0 commands = " ".join(str(item.get("command", "")).lower() for item in processes) + if "prisma" in commands: + return "Prisma generate / package build 程序正在吃 CPU;先確認 CI/CD、runner job 與供應鏈來源,不可直接 kill" if "build" in commands: return "建置程序正在吃 CPU;先確認 CI/CD 或 runner 排程,不可直接重啟服務" if load >= 8 or cpu >= 90: @@ -213,6 +224,11 @@ def _host_resource_automation_lane( "建立 CI load evidence packet,彙整 Gitea Actions run、runner queue、load/core 與 swap;不 kill process", ) commands = " ".join(str(item.get("command", "")).lower() for item in processes) + if "prisma" in commands: + return ( + "runner_prisma_generate_resource_pressure", + "建立 Prisma / package install 資源壓力候選,交由 AI 彙整 CI/CD run、套件來源、runner queue 與供應鏈 evidence", + ) if "build" in commands: return ( "runner_build_resource_pressure", @@ -247,11 +263,16 @@ def _host_resource_alert_evidence_lines( lines: list[str] = [] for item in processes[:3]: process_cpu = f"{float(item['cpu']):g}%" + user_suffix = ( + " root" + if str(item.get("user", "")).lower() == "root" + else "" + ) lines.append( "├ " f"PID {html.escape(str(item['pid']))} " f"CPU {html.escape(process_cpu)}:" - f"{html.escape(str(item['command']))}" + f"{html.escape(str(item['command']))}{html.escape(user_suffix)}" ) lines[-1] = "└" + lines[-1][1:] return lines @@ -286,12 +307,28 @@ def _host_resource_recommendation_lines(text: str) -> list[str]: "└ 若 CI 卡死,產出 owner packet 與 runner cleanup dry-run,再進維護窗口", ] return [ - "├ 確認是否為 CI/CD / Actions / runner 正常建置窗口", - "├ 若持續超過門檻,先查 runner queue、build job、容器資源限制與服務 SLO", + "├ 確認是否為 CI/CD / Actions / runner 正常建置或 package install 窗口", + "├ 若持續超過門檻,先查 runner queue、build job、套件來源、容器資源限制與服務 SLO", "└ 同一 host/service 5 分鐘聚合一次,避免洗版", ] +def _host_resource_alert_severity(load_text: str, cpu_text: str) -> str: + try: + load = float(load_text) + except (TypeError, ValueError): + load = 0.0 + try: + cpu = float(cpu_text) + except (TypeError, ValueError): + cpu = 0.0 + if load >= 8 or cpu >= 90: + return "P1" + if load >= 4 or cpu >= 70: + return "P2" + return "P3" + + def format_host_resource_alert_card(text: str) -> str: """把 host CPU/load raw dump 轉成值班者可讀的 Telegram HTML 卡。""" if not _is_host_resource_alert_text(text): @@ -314,19 +351,23 @@ def format_host_resource_alert_card(text: str) -> str: impact = _host_resource_alert_impact(text, cpu, load, processes) automation_lane, automation_next_step = _host_resource_automation_lane(text, processes) load_bar = _resource_load_bar(load) - severity = "🔴" if load != "-" and load_bar.count("■") >= 7 else "⚠️" + severity = _host_resource_alert_severity(load, cpu) + root_process_count = sum( + 1 for item in processes if str(item.get("user", "")).lower() == "root" + ) evidence_lines = _host_resource_alert_evidence_lines(text, processes) recommendation_lines = _host_resource_recommendation_lines(text) return "\n".join( [ - f"{severity} 主機資源告警|{html.escape(target)}", + f"{severity} 主機資源壓力|{html.escape(target)}", "ai_automation_alert_card_v1", "", - "影響判讀", + "一眼摘要", f"├ CPU 使用率:{html.escape(cpu)}%", f"├ Load:{html.escape(load)} {load_bar}", - f"└ 狀態:{html.escape(impact)}", + f"├ 容器 root 進程:{root_process_count}", + f"└ 影響:{html.escape(impact)}", "", "AI 自動化判讀", f"├ Lane:{html.escape(automation_lane)}", @@ -345,6 +386,13 @@ def format_host_resource_alert_card(text: str) -> str: ) +def normalize_alert_notification_payload(text: str, parse_mode: str) -> tuple[str, str]: + """最後出口統一脫敏監控告警;host 類 raw dump 一律改成 HTML 事件卡。""" + if _is_host_resource_alert_text(text): + return format_host_resource_alert_card(text), "HTML" + return text, parse_mode + + def _top_gateway_bucket( buckets: list[dict[str, object]], field: str, @@ -8222,15 +8270,14 @@ class TelegramGateway: reply_markup: dict | None = None, ) -> dict: """發送告警型純文字通知到 SRE 戰情室群組。""" - safe_text = ( - format_host_resource_alert_card(text) - if parse_mode.upper() == "HTML" - else text + safe_text, effective_parse_mode = normalize_alert_notification_payload( + text, + parse_mode, ) payload: dict = { "chat_id": self.alert_chat_id, "text": safe_text[:4096], - "parse_mode": parse_mode, + "parse_mode": effective_parse_mode, } if reply_markup: payload["reply_markup"] = reply_markup @@ -8263,10 +8310,14 @@ class TelegramGateway: Returns: dict: Telegram API 回應 """ + safe_text, effective_parse_mode = normalize_alert_notification_payload( + text, + parse_mode, + ) payload: dict = { "chat_id": chat_id or self.alert_chat_id, - "text": text[:4096], - "parse_mode": parse_mode, + "text": safe_text[:4096], + "parse_mode": effective_parse_mode, "disable_web_page_preview": disable_web_page_preview, } return await self._send_request("sendMessage", payload) diff --git a/apps/api/tests/test_telegram_message_templates.py b/apps/api/tests/test_telegram_message_templates.py index 2db5b500..0ece7672 100644 --- a/apps/api/tests/test_telegram_message_templates.py +++ b/apps/api/tests/test_telegram_message_templates.py @@ -69,11 +69,12 @@ root 364 181 0.7 3491396 494608 ? Rl 05:56 0:18 /opt/hostedto result = format_host_resource_alert_card(raw_alert) - assert "主機資源告警|h110-gitea" in result + assert "P1 主機資源壓力|h110-gitea" in result assert "ai_automation_alert_card_v1" in result - assert "影響判讀" in result + assert "一眼摘要" in result assert "CPU 使用率" in result assert "Load" in result + assert "容器 root 進程" in result assert "AI 自動化判讀" in result assert "runner_build_resource_pressure" in result assert "candidate_only" in result @@ -101,7 +102,7 @@ def test_orphan_browser_alert_becomes_runaway_process_event_packet() -> None: result = format_host_resource_alert_card(raw_alert) - assert "主機資源告警|110" in result + assert "P3 主機資源壓力|110" in result assert "ai_automation_alert_card_v1" in result assert "orphan_browser_smoke_runaway_process" in result assert "HostOrphanBrowserSmokeHighCpu" in result @@ -124,7 +125,7 @@ def test_ci_runner_load_alert_becomes_capacity_event_packet() -> None: result = format_host_resource_alert_card(raw_alert) - assert "主機資源告警|110" in result + assert "P3 主機資源壓力|110" in result assert "ci_runner_load_saturation" in result assert "CI load evidence packet" in result assert "Gitea Actions run" in result @@ -166,6 +167,89 @@ async def test_send_alert_notification_normalizes_host_resource_raw_dump(monkeyp assert "/workspace/wooo/" not in payload["text"] +def test_prisma_generate_alert_redacts_raw_process_json_and_urls() -> None: + """Prisma generate 類 root Node.js 告警不得把路徑、URL 或 JSON 直接送出。""" + raw_alert = """WARN h110-gitea 🔴 CPU 警告: used=29.8% load=8.62 + +WARN h110-gitea ⚠️ 容器內 root Node.js 進程: +root 365 27.5 0.1 1283324 108564 ? Sl 06:27 0:00 node /opt/hostedtoolcache/node/20.20.2/x64/bin/pnpm prisma generate +root 376 15.5 0.3 11756860 217220 ? Rl 06:27 0:03 node ./node_modules/.bin/../prisma/build/index.js generate +root 392 0.0 0.0 1096836 53400 ? Ssl 06:27 0:00 /opt/hostedtoolcache/node/20.20.2/x64/bin/node /workspace/wooo/vibework/node_modules/.pnpm/prisma@7.8.0_types+react@18.3.30_react@18.3.30/node_modules/prisma/build/index.js {"product":"prisma","version":"7.8.0","endpoint":"https://checkpoint.prisma.io","command":"generate"} +""" + + result = format_host_resource_alert_card(raw_alert) + + assert "P1 主機資源壓力|h110-gitea" in result + assert "runner_prisma_generate_resource_pressure" in result + assert "Prisma generate" in result + assert "容器 root 進程:3" in result + assert "套件來源" in result + assert "runtime_write_gate=0" in result + assert "root 365" not in result + assert "checkpoint.prisma.io" not in result + assert "node_modules" not in result + assert "/opt/hostedtoolcache" not in result + assert "/workspace/wooo" not in result + assert '"product":"prisma"' not in result + + +@pytest.mark.asyncio +async def test_send_alert_notification_forces_html_card_for_markdown_host_alert(monkeypatch) -> None: + """即使呼叫端用 Markdown,host raw dump 仍必須被最後出口改成 HTML 卡。""" + sent_requests = [] + gateway = TelegramGateway() + + async def fake_send_request(method, payload): + sent_requests.append((method, payload)) + return {"ok": True} + + monkeypatch.setattr(TelegramGateway, "alert_chat_id", property(lambda _self: "chat")) + monkeypatch.setattr(gateway, "_send_request", fake_send_request) + + await gateway.send_alert_notification( + text=( + "WARN h110-gitea 🔴 CPU 警告: used=29.8% load=8.62\n" + "root 365 27.5 0.1 1283324 108564 ? Sl 06:27 0:00 " + "node /opt/hostedtoolcache/node/20.20.2/x64/bin/pnpm prisma generate" + ), + parse_mode="MarkdownV2", + ) + + payload = sent_requests[0][1] + assert payload["parse_mode"] == "HTML" + assert "ai_automation_alert_card_v1" in payload["text"] + assert "runner_prisma_generate_resource_pressure" in payload["text"] + assert "/opt/hostedtoolcache" not in payload["text"] + + +@pytest.mark.asyncio +async def test_send_text_normalizes_host_resource_alert(monkeypatch) -> None: + """send_text 旁路也不能把 host resource raw dump 直接送出。""" + sent_requests = [] + gateway = TelegramGateway() + + async def fake_send_request(method, payload): + sent_requests.append((method, payload)) + return {"ok": True} + + monkeypatch.setattr(TelegramGateway, "alert_chat_id", property(lambda _self: "chat")) + monkeypatch.setattr(gateway, "_send_request", fake_send_request) + + await gateway.send_text( + text=( + "WARN h110-gitea 🔴 CPU 警告: used=29.8% load=8.62\n" + "root 365 27.5 0.1 1283324 108564 ? Sl 06:27 0:00 " + "node /workspace/wooo/vibework/node_modules/.bin/prisma generate" + ), + ) + + payload = sent_requests[0][1] + assert payload["parse_mode"] == "HTML" + assert "P1 主機資源壓力" in payload["text"] + assert "node_modules" not in payload["text"] + assert "/workspace/wooo" not in payload["text"] + + def test_weekly_report_marks_all_zero_as_low_trust_anomaly() -> None: report = WeeklyReportMessage( week_range="2026-W24", diff --git a/docs/ARCHITECTURE_MEMORY.md b/docs/ARCHITECTURE_MEMORY.md index ec65f8a8..164d8acc 100644 --- a/docs/ARCHITECTURE_MEMORY.md +++ b/docs/ARCHITECTURE_MEMORY.md @@ -2,8 +2,27 @@ > AI 模組地圖索引 - 每次新增積木後必須登記 -**最後更新**: 2026-03-23 (Phase 9 Agent Teams) -**維護者**: Claude Code + C-Suite +**最後更新**: 2026-06-18 (IwoooS AI 自動化產品契約) +**維護者**: AWOOOI 工程團隊 + +--- + +## 產品定位記憶 (2026-06-18) + +AWOOOI / AwoooP / IwoooS 是 AI 自動化產品,不是單純的監控頁、告警轉發器、資安清冊或文件集合。任何告警、資安事件、主機訊號、CI/CD 訊號、Wazuh / Kali / SOC 證據、Nginx / gateway / runtime config drift、code review 候選與人工批准,都必須回到可驗證的 AI 自動化閉環。 + +合格閉環必須能回答: + +1. Sensor / Evidence:訊號來源與只讀證據在哪裡。 +2. Normalizer:原始訊號如何被轉成可判讀事件包。 +3. AI 分流:由哪個 AI lane / 規則 / agent 判讀。 +4. 候選:產生哪些修復、隔離、回復、文件或人工送審候選。 +5. 閘門:需要哪些 owner / reviewer / maintenance window / rollback / secret boundary。 +6. 執行邊界:哪些行為仍是 `0 / false`,不得被 UI 可見或 CD 成功誤判成授權。 +7. 驗證器:完成後用什麼 readback、smoke、metric、receipt 或 timeline 驗證。 +8. 學習回寫:如何回寫 KM、PlayBook、LOGBOOK、候選規則與前台狀態。 + +若一個工作項目無法回答 AI 判讀到哪、候選是什麼、閘門是否滿足、驗證器是否完成、學習是否回寫,就不得宣稱 AI 自動化完成,也不得上修 IwoooS headline 完成度。前台只能呈現脫敏後的產品資訊與證據摘要,不得顯示工作視窗對話、內部協作內容、個人 namespace、內網位址、secret 片段或 raw process dump。 --- diff --git a/docs/awooop/TELEGRAM-INCIDENT-NOTIFICATION-MODEL.md b/docs/awooop/TELEGRAM-INCIDENT-NOTIFICATION-MODEL.md index eb3ab205..72115ee9 100644 --- a/docs/awooop/TELEGRAM-INCIDENT-NOTIFICATION-MODEL.md +++ b/docs/awooop/TELEGRAM-INCIDENT-NOTIFICATION-MODEL.md @@ -62,12 +62,43 @@ IwoooS / AwoooP 是 AI 自動化產品,Telegram 告警不是終點,而是自 Host / runner 資源告警的第一版落地: -- `TelegramGateway.send_alert_notification()` 會在 HTML 告警出口自動套用 host resource formatter。 -- `CPU 警告`、`容器內 root Node.js 進程`、含 `ps aux` 且指向 build 程序的文字,會被轉成 `ai_automation_alert_card_v1`。 -- raw process dump 會被壓成 `PID + CPU + 精簡命令`,不保留 `/workspace/...`、完整 node module 路徑或整段 `ps aux`。 -- build 壓力會分流到 `runner_build_resource_pressure`,預設 `candidate_only / runtime_write_gate=0`。 +- `TelegramGateway.send_alert_notification()` 與 `send_text()` 會在最後出口自動套用 host resource formatter;即使呼叫端傳入 Markdown,也必須強制改成脫敏 HTML 卡片。 +- `CPU 警告`、`容器內 root Node.js 進程`、含 `ps aux` 且指向 build / package install / Prisma generate 程序的文字,會被轉成 `ai_automation_alert_card_v1`。 +- raw process dump 會被壓成 `PID + CPU + 精簡命令`,不保留 `/workspace/...`、`/opt/hostedtoolcache/...`、完整 `node_modules` 路徑、外部檢查 URL、JSON payload 或整段 `ps aux`。 +- build 壓力會分流到 `runner_build_resource_pressure`;Prisma / package install 壓力會分流到 `runner_prisma_generate_resource_pressure`;預設都是 `candidate_only / runtime_write_gate=0`。 - 這只建立 AI 候選與判讀入口,不代表允許 kill process、restart 服務、改 Nginx、改 firewall 或執行 Kali active scan。 +Host / runner 告警卡片必須採用下列第一屏版型: + +```text +P1 主機資源壓力|h110-gitea +ai_automation_alert_card_v1 + +一眼摘要 +├ CPU 使用率:29.8% +├ Load:8.62 ■■■■■■■■ +├ 容器 root 進程:3 +└ 影響:Prisma generate / package build 程序正在吃 CPU;先確認 CI/CD、runner job 與供應鏈來源,不可直接 kill + +AI 自動化判讀 +├ Lane:runner_prisma_generate_resource_pressure +├ Gate:candidate_only / runtime_write_gate=0 +└ 下一步:建立 Prisma / package install 資源壓力候選,交由 AI 彙整 CI/CD run、套件來源、runner queue 與供應鏈 evidence + +Top evidence +├ PID 365 CPU 27.5%:Prisma generate root +├ PID 376 CPU 15.5%:Prisma generate root +└ PID 392 CPU 0%:Prisma generate root + +建議下一步 +├ 確認是否為 CI/CD / Actions / runner 正常建置或 package install 窗口 +├ 若持續超過門檻,先查 runner queue、build job、套件來源、容器資源限制與服務 SLO +└ 同一 host/service 5 分鐘聚合一次,避免洗版 + +禁止事項 +└ 不 kill process、不 restart Docker / Gitea、不 reload Nginx、不改 firewall;除非已有維護窗口與 owner 批准。 +``` + ## 與 AwoooP 的分工 | 介面 | 承載內容 | diff --git a/docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md b/docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md index 937b5fbc..c606ca5a 100644 --- a/docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md +++ b/docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md @@ -1,7 +1,7 @@ -# Host Runaway Process AIOps PlayBook +# 主機異常行程 AIOps PlayBook -> Last updated: 2026-06-18 Asia/Taipei -> Scope: 110 host CPU 滿載、orphan Chrome / Playwright smoke、Gitea Actions CI load 分流。 +> 最後更新:2026-06-18 Asia/Taipei +> 範圍:110 host CPU 滿載、orphan Chrome / Playwright smoke、Gitea Actions CI load、Prisma / package install 資源壓力分流。 --- @@ -61,10 +61,12 @@ Telegram / AI event packet contract: |---------------|---------------|----------| | `HostOrphanBrowserSmokeHighCpu` | `orphan_browser_smoke_runaway_process` | alertname、host、rule、runaway dry-run、owner / maintenance / evidence gate、KM / PlayBook / Verifier 回寫 | | `HostCiRunnerLoadSaturation` | `ci_runner_load_saturation` | Gitea Actions run、runner queue、load/core、swap trend、capacity / queue 判定、不做 process remediation | -| raw `CPU 警告` / `ps aux` dump | `runner_build_resource_pressure` 或 `host_resource_pressure_triage` | sanitized top process evidence,不顯示 raw workspace path 或完整 process dump | +| raw `CPU 警告` / `ps aux` dump | `runner_build_resource_pressure`、`runner_prisma_generate_resource_pressure` 或 `host_resource_pressure_triage` | sanitized top process evidence,不顯示 raw workspace path、hosted toolcache path、`node_modules` path、外部 URL、JSON payload 或完整 process dump | 所有 Telegram 卡片都必須保留 `runtime_write_gate=0`,並不得把 alert/card 轉成直接 kill / restart / reload 指令。 +Host / runner raw dump 進入 Telegram 前必須先被 `TelegramGateway` 壓成 `P1/P2/P3 主機資源壓力` 卡片。第一屏只允許顯示 CPU、load、root process count、AI lane、candidate gate、Top evidence 與禁止事項;完整命令列、套件 JSON、外部檢查 endpoint、內部 workspace path 與 raw `ps aux` 必須留在內部 evidence / timeline,不得外送。 + --- ## 3. AI Triager 必做判讀