fix(api): 將主機資源告警收斂成脫敏事件卡
Some checks failed
Code Review / ai-code-review (push) Successful in 13s
CD Pipeline / tests (push) Successful in 1m48s
Ansible / Reboot Recovery Contract / validate (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled

This commit is contained in:
Your Name
2026-06-18 15:22:11 +08:00
parent dafe534259
commit 5d76ac1145
5 changed files with 216 additions and 29 deletions

View File

@@ -133,6 +133,10 @@ def _resource_load_bar(load_text: str) -> str:
def _compact_host_process_command(command: str) -> str:
lowered = command.lower()
if "prisma" in lowered and "generate" in lowered:
return "Prisma generate"
if "prisma" in lowered and ("build/index.js" in lowered or "prisma/build" in lowered):
return "Prisma CLI child"
if "jest-worker" in lowered or "processchild.js" in lowered:
return "Next.js build worker"
if "next build" in lowered:
@@ -145,7 +149,11 @@ def _compact_host_process_command(command: str) -> str:
return "npm run build"
if "node " in lowered:
return "Node.js process"
return command.rsplit("/", 1)[-1][:80]
summary = re.sub(r"https?://\S+", "[redacted-url]", command)
summary = re.sub(r"\{.*", "", summary)
summary = re.sub(r"/(?:[\w@.+-]+/)+", "", summary)
summary = re.sub(r"\s+", " ", summary).strip()
return (summary.rsplit("/", 1)[-1] or "process")[:80]
def _parse_host_process_lines(text: str) -> list[dict[str, str | float]]:
@@ -164,6 +172,7 @@ def _parse_host_process_lines(text: str) -> list[dict[str, str | float]]:
processes.append(
{
"pid": match.group("pid"),
"user": match.group("user"),
"cpu": cpu,
"command": _compact_host_process_command(command),
}
@@ -191,6 +200,8 @@ def _host_resource_alert_impact(
cpu = 0.0
commands = " ".join(str(item.get("command", "")).lower() for item in processes)
if "prisma" in commands:
return "Prisma generate / package build 程序正在吃 CPU先確認 CI/CD、runner job 與供應鏈來源,不可直接 kill"
if "build" in commands:
return "建置程序正在吃 CPU先確認 CI/CD 或 runner 排程,不可直接重啟服務"
if load >= 8 or cpu >= 90:
@@ -213,6 +224,11 @@ def _host_resource_automation_lane(
"建立 CI load evidence packet彙整 Gitea Actions run、runner queue、load/core 與 swap不 kill process",
)
commands = " ".join(str(item.get("command", "")).lower() for item in processes)
if "prisma" in commands:
return (
"runner_prisma_generate_resource_pressure",
"建立 Prisma / package install 資源壓力候選,交由 AI 彙整 CI/CD run、套件來源、runner queue 與供應鏈 evidence",
)
if "build" in commands:
return (
"runner_build_resource_pressure",
@@ -247,11 +263,16 @@ def _host_resource_alert_evidence_lines(
lines: list[str] = []
for item in processes[:3]:
process_cpu = f"{float(item['cpu']):g}%"
user_suffix = (
" root"
if str(item.get("user", "")).lower() == "root"
else ""
)
lines.append(
""
f"<code>PID {html.escape(str(item['pid']))}</code> "
f"CPU <code>{html.escape(process_cpu)}</code>"
f"<code>{html.escape(str(item['command']))}</code>"
f"<code>{html.escape(str(item['command']))}{html.escape(user_suffix)}</code>"
)
lines[-1] = "" + lines[-1][1:]
return lines
@@ -286,12 +307,28 @@ def _host_resource_recommendation_lines(text: str) -> list[str]:
"└ 若 CI 卡死,產出 owner packet 與 runner cleanup dry-run再進維護窗口",
]
return [
"├ 確認是否為 CI/CD / Actions / runner 正常建置窗口",
"├ 若持續超過門檻,先查 runner queue、build job、容器資源限制與服務 SLO",
"├ 確認是否為 CI/CD / Actions / runner 正常建置或 package install 窗口",
"├ 若持續超過門檻,先查 runner queue、build job、套件來源、容器資源限制與服務 SLO",
"└ 同一 host/service 5 分鐘聚合一次,避免洗版",
]
def _host_resource_alert_severity(load_text: str, cpu_text: str) -> str:
try:
load = float(load_text)
except (TypeError, ValueError):
load = 0.0
try:
cpu = float(cpu_text)
except (TypeError, ValueError):
cpu = 0.0
if load >= 8 or cpu >= 90:
return "P1"
if load >= 4 or cpu >= 70:
return "P2"
return "P3"
def format_host_resource_alert_card(text: str) -> str:
"""把 host CPU/load raw dump 轉成值班者可讀的 Telegram HTML 卡。"""
if not _is_host_resource_alert_text(text):
@@ -314,19 +351,23 @@ def format_host_resource_alert_card(text: str) -> str:
impact = _host_resource_alert_impact(text, cpu, load, processes)
automation_lane, automation_next_step = _host_resource_automation_lane(text, processes)
load_bar = _resource_load_bar(load)
severity = "🔴" if load != "-" and load_bar.count("") >= 7 else "⚠️"
severity = _host_resource_alert_severity(load, cpu)
root_process_count = sum(
1 for item in processes if str(item.get("user", "")).lower() == "root"
)
evidence_lines = _host_resource_alert_evidence_lines(text, processes)
recommendation_lines = _host_resource_recommendation_lines(text)
return "\n".join(
[
f"{severity} <b>主機資源告警{html.escape(target)}</b>",
f"<b>{severity} 主機資源壓力{html.escape(target)}</b>",
"<code>ai_automation_alert_card_v1</code>",
"",
"<b>影響判讀</b>",
"<b>一眼摘要</b>",
f"├ CPU 使用率:<code>{html.escape(cpu)}%</code>",
f"├ Load<code>{html.escape(load)}</code> <code>{load_bar}</code>",
f"└ 狀態:<b>{html.escape(impact)}</b>",
f"├ 容器 root 進程:<code>{root_process_count}</code>",
f"└ 影響:<b>{html.escape(impact)}</b>",
"",
"<b>AI 自動化判讀</b>",
f"├ Lane<code>{html.escape(automation_lane)}</code>",
@@ -345,6 +386,13 @@ def format_host_resource_alert_card(text: str) -> str:
)
def normalize_alert_notification_payload(text: str, parse_mode: str) -> tuple[str, str]:
    """Sanitize monitoring alerts at the final send exit.

    Text recognised by ``_is_host_resource_alert_text`` is rewritten by
    ``format_host_resource_alert_card`` and its parse mode is forced to
    ``"HTML"``. Any other text is returned unchanged together with the
    caller's ``parse_mode``.

    Returns:
        A ``(text, parse_mode)`` pair to place in the outgoing payload.
    """
    if not _is_host_resource_alert_text(text):
        return text, parse_mode
    return format_host_resource_alert_card(text), "HTML"
def _top_gateway_bucket(
buckets: list[dict[str, object]],
field: str,
@@ -8222,15 +8270,14 @@ class TelegramGateway:
reply_markup: dict | None = None,
) -> dict:
"""發送告警型純文字通知到 SRE 戰情室群組。"""
safe_text = (
format_host_resource_alert_card(text)
if parse_mode.upper() == "HTML"
else text
safe_text, effective_parse_mode = normalize_alert_notification_payload(
text,
parse_mode,
)
payload: dict = {
"chat_id": self.alert_chat_id,
"text": safe_text[:4096],
"parse_mode": parse_mode,
"parse_mode": effective_parse_mode,
}
if reply_markup:
payload["reply_markup"] = reply_markup
@@ -8263,10 +8310,14 @@ class TelegramGateway:
Returns:
dict: Telegram API 回應
"""
safe_text, effective_parse_mode = normalize_alert_notification_payload(
text,
parse_mode,
)
payload: dict = {
"chat_id": chat_id or self.alert_chat_id,
"text": text[:4096],
"parse_mode": parse_mode,
"text": safe_text[:4096],
"parse_mode": effective_parse_mode,
"disable_web_page_preview": disable_web_page_preview,
}
return await self._send_request("sendMessage", payload)

View File

@@ -69,11 +69,12 @@ root 364 181 0.7 3491396 494608 ? Rl 05:56 0:18 /opt/hostedto
result = format_host_resource_alert_card(raw_alert)
assert "主機資源告警h110-gitea" in result
assert "P1 主機資源壓力h110-gitea" in result
assert "ai_automation_alert_card_v1" in result
assert "影響判讀" in result
assert "一眼摘要" in result
assert "CPU 使用率" in result
assert "Load" in result
assert "容器 root 進程" in result
assert "AI 自動化判讀" in result
assert "runner_build_resource_pressure" in result
assert "candidate_only" in result
@@ -101,7 +102,7 @@ def test_orphan_browser_alert_becomes_runaway_process_event_packet() -> None:
result = format_host_resource_alert_card(raw_alert)
assert "主機資源告警110" in result
assert "P3 主機資源壓力110" in result
assert "ai_automation_alert_card_v1" in result
assert "orphan_browser_smoke_runaway_process" in result
assert "HostOrphanBrowserSmokeHighCpu" in result
@@ -124,7 +125,7 @@ def test_ci_runner_load_alert_becomes_capacity_event_packet() -> None:
result = format_host_resource_alert_card(raw_alert)
assert "主機資源告警110" in result
assert "P3 主機資源壓力110" in result
assert "ci_runner_load_saturation" in result
assert "CI load evidence packet" in result
assert "Gitea Actions run" in result
@@ -166,6 +167,89 @@ async def test_send_alert_notification_normalizes_host_resource_raw_dump(monkeyp
assert "/workspace/wooo/" not in payload["text"]
def test_prisma_generate_alert_redacts_raw_process_json_and_urls() -> None:
    """Prisma-generate style root Node.js alerts must not leak paths, URLs, or JSON."""
    raw_alert = """WARN h110-gitea 🔴 CPU 警告: used=29.8% load=8.62
WARN h110-gitea ⚠️ 容器內 root Node.js 進程:
root 365 27.5 0.1 1283324 108564 ? Sl 06:27 0:00 node /opt/hostedtoolcache/node/20.20.2/x64/bin/pnpm prisma generate
root 376 15.5 0.3 11756860 217220 ? Rl 06:27 0:03 node ./node_modules/.bin/../prisma/build/index.js generate
root 392 0.0 0.0 1096836 53400 ? Ssl 06:27 0:00 /opt/hostedtoolcache/node/20.20.2/x64/bin/node /workspace/wooo/vibework/node_modules/.pnpm/prisma@7.8.0_types+react@18.3.30_react@18.3.30/node_modules/prisma/build/index.js {"product":"prisma","version":"7.8.0","endpoint":"https://checkpoint.prisma.io","command":"generate"}
"""

    card = format_host_resource_alert_card(raw_alert)

    # Fragments the sanitized card is required to contain.
    required_fragments = (
        "P1 主機資源壓力h110-gitea",
        "runner_prisma_generate_resource_pressure",
        "Prisma generate",
        "容器 root 進程:<code>3</code>",
        "套件來源",
        "runtime_write_gate=0",
    )
    for fragment in required_fragments:
        assert fragment in card

    # Raw process-dump details that must never reach the outgoing card.
    forbidden_fragments = (
        "root 365",
        "checkpoint.prisma.io",
        "node_modules",
        "/opt/hostedtoolcache",
        "/workspace/wooo",
        '"product":"prisma"',
    )
    for fragment in forbidden_fragments:
        assert fragment not in card
@pytest.mark.asyncio
async def test_send_alert_notification_forces_html_card_for_markdown_host_alert(monkeypatch) -> None:
    """Even when the caller asks for Markdown, a host raw dump must leave the final exit as an HTML card."""
    captured = []
    gateway = TelegramGateway()

    async def record_request(method, payload):
        # Stand-in for the real sender: remember the call and report success.
        captured.append((method, payload))
        return {"ok": True}

    monkeypatch.setattr(TelegramGateway, "alert_chat_id", property(lambda _self: "chat"))
    monkeypatch.setattr(gateway, "_send_request", record_request)

    raw_dump = (
        "WARN h110-gitea 🔴 CPU 警告: used=29.8% load=8.62\n"
        "root 365 27.5 0.1 1283324 108564 ? Sl 06:27 0:00 "
        "node /opt/hostedtoolcache/node/20.20.2/x64/bin/pnpm prisma generate"
    )
    await gateway.send_alert_notification(text=raw_dump, parse_mode="MarkdownV2")

    _method, payload = captured[0]
    body = payload["text"]
    assert payload["parse_mode"] == "HTML"
    assert "ai_automation_alert_card_v1" in body
    assert "runner_prisma_generate_resource_pressure" in body
    assert "/opt/hostedtoolcache" not in body
@pytest.mark.asyncio
async def test_send_text_normalizes_host_resource_alert(monkeypatch) -> None:
    """The send_text side path must not forward a host-resource raw dump as-is either."""
    captured = []
    gateway = TelegramGateway()

    async def record_request(method, payload):
        # Stand-in for the real sender: remember the call and report success.
        captured.append((method, payload))
        return {"ok": True}

    monkeypatch.setattr(TelegramGateway, "alert_chat_id", property(lambda _self: "chat"))
    monkeypatch.setattr(gateway, "_send_request", record_request)

    raw_dump = (
        "WARN h110-gitea 🔴 CPU 警告: used=29.8% load=8.62\n"
        "root 365 27.5 0.1 1283324 108564 ? Sl 06:27 0:00 "
        "node /workspace/wooo/vibework/node_modules/.bin/prisma generate"
    )
    await gateway.send_text(text=raw_dump)

    _method, payload = captured[0]
    body = payload["text"]
    assert payload["parse_mode"] == "HTML"
    assert "P1 主機資源壓力" in body
    assert "node_modules" not in body
    assert "/workspace/wooo" not in body
def test_weekly_report_marks_all_zero_as_low_trust_anomaly() -> None:
report = WeeklyReportMessage(
week_range="2026-W24",

View File

@@ -2,8 +2,27 @@
> AI 模組地圖索引 - 每次新增積木後必須登記
**最後更新**: 2026-03-23 (Phase 9 Agent Teams)
**維護者**: Claude Code + C-Suite
**最後更新**: 2026-06-18 (IwoooS AI 自動化產品契約)
**維護者**: AWOOOI 工程團隊
---
## 產品定位記憶 (2026-06-18)
AWOOOI / AwoooP / IwoooS 是 AI 自動化產品,不是單純的監控頁、告警轉發器、資安清冊或文件集合。任何告警、資安事件、主機訊號、CI/CD 訊號、Wazuh / Kali / SOC 證據、Nginx / gateway / runtime config drift、code review 候選與人工批准,都必須回到可驗證的 AI 自動化閉環。
合格閉環必須能回答:
1. Sensor / Evidence:訊號來源與只讀證據在哪裡。
2. Normalizer:原始訊號如何被轉成可判讀事件包。
3. AI 分流:由哪個 AI lane / 規則 / agent 判讀。
4. 候選:產生哪些修復、隔離、回復、文件或人工送審候選。
5. 閘門:需要哪些 owner / reviewer / maintenance window / rollback / secret boundary。
6. 執行邊界:哪些行為仍是 `0 / false`,不得被 UI 可見或 CD 成功誤判成授權。
7. 驗證器:完成後用什麼 readback、smoke、metric、receipt 或 timeline 驗證。
8. 學習回寫:如何回寫 KM、PlayBook、LOGBOOK、候選規則與前台狀態。
若一個工作項目無法回答 AI 判讀到哪、候選是什麼、閘門是否滿足、驗證器是否完成、學習是否回寫,就不得宣稱 AI 自動化完成,也不得上修 IwoooS headline 完成度。前台只能呈現脫敏後的產品資訊與證據摘要,不得顯示工作視窗對話、內部協作內容、個人 namespace、內網位址、secret 片段或 raw process dump。
---

View File

@@ -62,12 +62,43 @@ IwoooS / AwoooP 是 AI 自動化產品Telegram 告警不是終點,而是自
Host / runner 資源告警的第一版落地:
- `TelegramGateway.send_alert_notification()` 會在 HTML 告警出口自動套用 host resource formatter。
- `CPU 警告``容器內 root Node.js 進程`、含 `ps aux` 且指向 build 程序的文字,會被轉成 `ai_automation_alert_card_v1`
- raw process dump 會被壓成 `PID + CPU + 精簡命令`,不保留 `/workspace/...`、完整 node module 路徑或整段 `ps aux`
- build 壓力會分流到 `runner_build_resource_pressure`,預設 `candidate_only / runtime_write_gate=0`
- `TelegramGateway.send_alert_notification()` 與 `send_text()` 會在最後出口自動套用 host resource formatter;即使呼叫端傳入 Markdown,也必須強制改成脫敏 HTML 卡片。
- `CPU 警告`、`容器內 root Node.js 進程`、含 `ps aux` 且指向 build / package install / Prisma generate 程序的文字,會被轉成 `ai_automation_alert_card_v1`。
- raw process dump 會被壓成 `PID + CPU + 精簡命令`,不保留 `/workspace/...`、`/opt/hostedtoolcache/...`、完整 `node_modules` 路徑、外部檢查 URL、JSON payload 或整段 `ps aux`。
- build 壓力會分流到 `runner_build_resource_pressure`,Prisma / package install 壓力會分流到 `runner_prisma_generate_resource_pressure`;預設都是 `candidate_only / runtime_write_gate=0`。
- 這只建立 AI 候選與判讀入口,不代表允許 kill process、restart 服務、改 Nginx、改 firewall 或執行 Kali active scan。
Host / runner 告警卡片必須採用下列第一屏版型:
```text
P1 主機資源壓力h110-gitea
ai_automation_alert_card_v1
一眼摘要
├ CPU 使用率29.8%
├ Load8.62 ■■■■■■■■
├ 容器 root 進程3
└ 影響Prisma generate / package build 程序正在吃 CPU先確認 CI/CD、runner job 與供應鏈來源,不可直接 kill
AI 自動化判讀
├ Lanerunner_prisma_generate_resource_pressure
├ Gatecandidate_only / runtime_write_gate=0
└ 下一步:建立 Prisma / package install 資源壓力候選,交由 AI 彙整 CI/CD run、套件來源、runner queue 與供應鏈 evidence
Top evidence
├ PID 365 CPU 27.5%Prisma generate root
├ PID 376 CPU 15.5%Prisma generate root
└ PID 392 CPU 0%Prisma generate root
建議下一步
├ 確認是否為 CI/CD / Actions / runner 正常建置或 package install 窗口
├ 若持續超過門檻,先查 runner queue、build job、套件來源、容器資源限制與服務 SLO
└ 同一 host/service 5 分鐘聚合一次,避免洗版
禁止事項
└ 不 kill process、不 restart Docker / Gitea、不 reload Nginx、不改 firewall除非已有維護窗口與 owner 批准。
```
## 與 AwoooP 的分工
| 介面 | 承載內容 |

View File

@@ -1,7 +1,7 @@
# Host Runaway Process AIOps PlayBook
# 主機異常行程 AIOps PlayBook
> Last updated: 2026-06-18 Asia/Taipei
> Scope: 110 host CPU 滿載、orphan Chrome / Playwright smoke、Gitea Actions CI load 分流。
> 最後更新:2026-06-18 Asia/Taipei
> 範圍:110 host CPU 滿載、orphan Chrome / Playwright smoke、Gitea Actions CI load、Prisma / package install 資源壓力分流。
---
@@ -61,10 +61,12 @@ Telegram / AI event packet contract:
|---------------|---------------|----------|
| `HostOrphanBrowserSmokeHighCpu` | `orphan_browser_smoke_runaway_process` | alertname、host、rule、runaway dry-run、owner / maintenance / evidence gate、KM / PlayBook / Verifier 回寫 |
| `HostCiRunnerLoadSaturation` | `ci_runner_load_saturation` | Gitea Actions run、runner queue、load/core、swap trend、capacity / queue 判定、不做 process remediation |
| raw `CPU 警告` / `ps aux` dump | `runner_build_resource_pressure``host_resource_pressure_triage` | sanitized top process evidence不顯示 raw workspace path 或完整 process dump |
| raw `CPU 警告` / `ps aux` dump | `runner_build_resource_pressure`、`runner_prisma_generate_resource_pressure`、`host_resource_pressure_triage` | sanitized top process evidence;不顯示 raw workspace path、hosted toolcache path、`node_modules` path、外部 URL、JSON payload 或完整 process dump |
所有 Telegram 卡片都必須保留 `runtime_write_gate=0`,並不得把 alert/card 轉成直接 kill / restart / reload 指令。
Host / runner raw dump 進入 Telegram 前必須先被 `TelegramGateway` 壓成 `P1/P2/P3 主機資源壓力` 卡片。第一屏只允許顯示 CPU、load、root process count、AI lane、candidate gate、Top evidence 與禁止事項;完整命令列、套件 JSON、外部檢查 endpoint、內部 workspace path 與 raw `ps aux` 必須留在內部 evidence / timeline,不得外送。
---
## 3. AI Triager 必做判讀