fix(telegram): format governance and runbook alerts

2026-05-07 00:58:20 +08:00
parent f046742a4f
commit 341c3b6523
4 changed files with 356 additions and 50 deletions
--- a/apps/api/src/services/failover_alerter.py
+++ b/apps/api/src/services/failover_alerter.py
@@ -134,46 +134,7 @@ class FailoverAlerter:
            logger.debug("governance_alert_dedup_skipped", event_type=event_type)
            return

-        status = _escape_md(str(payload.get("status", "warning")))
-        impact = _as_dict(payload.get("impact"))
-        remediation = _as_dict(payload.get("remediation"))
-        actionable = _as_dict(payload.get("actionable"))
-
-        impact_lines = _lines_from_dict(impact, max_items=12, compact=True)
-        remediation_lines = _lines_from_list(remediation.get("items"))
-        remediation_next_action = remediation.get("next_action")
-        remediation_hint = remediation.get("hint")
-        actionable_lines = _lines_from_list(actionable.get("items"))
-
-        next_action_line = ""
-        if remediation_next_action:
-            next_action_line = f"\n  下一步：{_escape_md(str(remediation_next_action))}"
-        if remediation_hint:
-            next_action_line += f"\n  提示：{_escape_md(str(remediation_hint))}"
-
-        sections: list[str] = [
-            "⚠️ *AI 治理警報*",
-            f"\n類型：{_escape_md(event_type)}",
-            f"狀態：{status}",
-        ]
-        if impact_lines:
-            sections.append(f"\n*影響*\n{impact_lines}")
-        if remediation_lines or next_action_line:
-            sections.append("\n*修復方向*")
-            if remediation_lines:
-                sections.append(remediation_lines)
-            if next_action_line:
-                sections.append(next_action_line)
-        if actionable_lines:
-            sections.append(f"\n*可直接自動化*\n{actionable_lines}")
-
-        fallback_items = _fallback_pairs(payload, keep={"status", "impact", "remediation", "actionable"})
-        if fallback_items:
-            sections.append(
-                "\n*欄位快覽（備援）*\n" + "\n".join(fallback_items)
-            )
-
-        msg = "\n".join(sections)
+        msg = format_governance_alert_card(event_type, payload)
        await self._send(msg)
        logger.info("governance_alert_sent", event_type=event_type)

@@ -336,6 +297,180 @@ def _as_dict(value: Any) -> dict[str, Any]:
    return value if isinstance(value, dict) else {}


+_EVENT_DISPLAY_NAMES = {  # event_type -> label shown in the alert card title
+    "trust_drift": "信任漂移",
+    "knowledge_degradation": "知識庫劣化",
+    "governance_slo_data_gap": "SLO 資料缺口",
+    "governance_self_failure": "治理自檢失敗",
+    "llm_hallucination": "LLM 驗證失敗",
+    "execution_blast_radius": "執行風險擴大",
+}
+
+_STATUS_BADGES = {  # lowercase status -> badge text; other statuses are shown unchanged
+    "critical": "🔴 critical",
+    "error": "🔴 error",
+    "violation": "🔴 violation",
+    "warning": "🟡 warning",
+    "degraded": "🟠 degraded",
+    "ok": "🟢 ok",
+}
+
+_IMPACT_PROFILES: dict[str, list[tuple[str, str]]] = {  # event_type -> ordered (impact key, row label)
+    "trust_drift": [
+        ("drifted_count", "漂移 Playbook"),
+        ("total_playbooks", "總 Playbook"),
+        ("drift_ratio", "漂移比例"),
+        ("threshold", "警戒門檻"),
+        ("auto_deprecated_count", "自動停用"),
+    ],
+    "knowledge_degradation": [
+        ("stale_count", "陳舊 KM"),
+        ("total_count", "總 KM"),
+        ("stale_ratio", "陳舊比例"),
+        ("threshold", "警戒門檻"),
+        ("stale_days", "陳舊天數"),
+    ],
+    "governance_slo_data_gap": [
+        ("reason", "缺口原因"),
+        ("skipped_count", "略過指標"),
+        ("all_slo_metrics_not_emitted", "SLO 指標缺失"),
+    ],
+    "governance_self_failure": [
+        ("failed_checks", "失敗檢查"),
+        ("total_checks", "總檢查"),
+        ("failure_rate", "失敗比例"),
+    ],
+    "execution_blast_radius": [
+        ("affected_services", "受影響服務"),
+        ("blast_radius", "爆炸半徑"),
+        ("threshold", "警戒門檻"),
+    ],
+    "llm_hallucination": [
+        ("failed", "驗證失敗"),
+        ("rate", "失敗比例"),
+        ("threshold", "警戒門檻"),
+    ],
+}
+
+
+def _event_display_name(event_type: str) -> str:
+    """Map an event type to its card-title label, falling back to a title-cased key."""
+    known = _EVENT_DISPLAY_NAMES.get(event_type)
+    if known is None and event_type.startswith("slo_"):
+        return "SLO 違反"  # every unlisted slo_* event shares one generic label
+    return known if known is not None else event_type.replace("_", " ").strip().title()
+
+
+def _status_badge(status: Any) -> str:
+    label = str(status or "warning")  # a missing or falsy status is reported as "warning"
+    return _STATUS_BADGES.get(label.lower(), label)  # unknown statuses pass through as-is
+
+
+def _format_metric_value(key: str, value: Any) -> str:
+    """Render one impact value: bools as 是/否, ratio-like keys as percent, lists abbreviated."""
+    if isinstance(value, bool):  # tested first: bool is a subclass of int
+        return "是" if value else "否"
+    if isinstance(value, (float, int)):
+        if key.endswith("_ratio") or key in {"threshold", "rate", "failure_rate"}:
+            return f"{float(value) * 100:.1f}%"
+    if not isinstance(value, list):
+        return str(value)
+    if not value:
+        return "0"
+    head = ", ".join(map(str, value[:3]))  # only the first three items are spelled out
+    if len(value) <= 3:
+        return head
+    return f"{head}…（共 {len(value)}）"
+
+
+def _profiled_rows(event_type: str, data: dict[str, Any], *, max_rows: int = 8) -> list[str]:
+    """Build at most ``max_rows`` "label：value" rows for the impact summary.
+
+    Keys listed in ``_IMPACT_PROFILES[event_type]`` come first, in profile order and
+    with their display label; remaining keys follow in sorted order under their raw
+    name. If not every field fits, the last row is an overflow notice, so the notice
+    is shown only when fields were really omitted and is never sliced off.
+    """
+    if not data or max_rows <= 0:
+        return []
+
+    profile = _IMPACT_PROFILES.get(event_type, [])
+    fields = [(key, label) for key, label in profile if key in data]
+    profiled = {key for key, _ in fields}
+    fields += [(key, key) for key in sorted(data) if key not in profiled]
+    # Reserve the final slot for the notice whenever something has to be dropped.
+    shown = fields if len(fields) <= max_rows else fields[: max_rows - 1]
+    rows = [f"{label}：{_format_metric_value(key, data[key])}" for key, label in shown]
+    if len(shown) < len(fields):
+        rows.append("更多欄位已收斂至 AwoooP 稽核資料")
+    return rows
+
+
+def _tree_lines(rows: list[str]) -> str:
+    """Render rows as a tree: "├" before each row, "└" before the last; "" when empty."""
+    if not rows:
+        return ""
+    last = len(rows) - 1
+    return "\n".join(
+        f"{'└' if pos == last else '├'} {_escape_md(str(row))}" for pos, row in enumerate(rows)
+    )
+
+
+def _governance_summary_lines(event_type: str, impact: dict[str, Any]) -> str:
+    # Impact dict -> profiled "label：value" rows -> escaped tree block ("" when no rows).
+    return _tree_lines(_profiled_rows(event_type, impact))
+
+
+def format_governance_alert_card(event_type: str, payload: dict[str, Any]) -> str:
+    """Format an AI-governance alert as a Telegram card.
+
+    2026-05-07 Codex — keeps the governance payload as-is and only converts the raw
+    key/value pairs into a scannable card at the Telegram boundary, so the chat is not
+    flooded with plain-text fields.
+    """
+    if not isinstance(payload, dict):
+        payload = {}
+    status = payload.get("status", "warning")
+    card: list[str] = [
+        f"⚠️ *AI 治理警報｜{_escape_md(_event_display_name(event_type))}*",
+        "──────────────────────",
+        f"類型：{_escape_md(event_type)}",
+        f"狀態：{_escape_md(_status_badge(status))}",
+    ]
+
+    # Impact summary: profiled key/value rows rendered as a tree.
+    summary = _governance_summary_lines(event_type, _as_dict(payload.get("impact")))
+    if summary:
+        card += ["", "🧭 *影響摘要*", summary]
+
+    # Remediation: item lines plus an optional next action and hint.
+    remediation = _as_dict(payload.get("remediation"))
+    fix_items = _lines_from_list(remediation.get("items"))
+    next_action = remediation.get("next_action")
+    hint = remediation.get("hint")
+    if fix_items or next_action or hint:
+        card += ["", "🛠️ *修復方向*"]
+        if fix_items:
+            card.append(fix_items)
+        if next_action:
+            card.append(f"▶️ 下一步：{_escape_md(str(next_action))}")
+        if hint:
+            card.append(f"💡 提示：{_escape_md(str(hint))}")
+
+    # Work that can be automated.
+    auto_items = _lines_from_list(_as_dict(payload.get("actionable")).get("items"))
+    if auto_items:
+        card += ["", "🤖 *可自動化工作*", auto_items]
+
+    # Remaining top-level fields, limited via max_items=4.
+    reserved = {"status", "impact", "remediation", "actionable"}
+    extras = _fallback_pairs(payload, keep=reserved, max_items=4)
+    if extras:
+        card += ["", "📎 *補充欄位*", "\n".join(extras)]
+
+    return "\n".join(card)
+
+
 def _lines_from_dict(data: dict[str, Any], max_items: int = 20, compact: bool = False) -> str:
    if not data:
        return ""
@@ -360,7 +495,12 @@ def _lines_from_list(value: Any) -> str:
    )


-def _fallback_pairs(payload: dict[str, Any], keep: set[str] | None = None) -> list[str]:
+def _fallback_pairs(
+    payload: dict[str, Any],
+    keep: set[str] | None = None,
+    *,
+    max_items: int | None = None,
+) -> list[str]:
    if not isinstance(payload, dict):
        return []
    keep = set(keep or set())
@@ -368,6 +508,9 @@ def _fallback_pairs(payload: dict[str, Any], keep: set[str] | None = None) -> li
    for key in sorted(payload.keys()):
        if key in keep:
            continue
+        if max_items is not None and len(rows) >= max_items:
+            rows.append(_escape_md("更多欄位已收斂至 AwoooP 稽核資料"))
+            break
        rows.append(f"{_escape_md(str(key))}：{_escape_md(str(payload.get(key)))}")
    return rows

--- a/apps/api/src/services/runbook_generator.py
+++ b/apps/api/src/services/runbook_generator.py
@@ -24,12 +24,19 @@ Runbook Generator - Phase 25 P1 Knowledge Auto-Harvesting
 from __future__ import annotations

 import asyncio
+import html
+import re
 import time
 from typing import TYPE_CHECKING

 import structlog

-from src.models.knowledge import EntrySource, EntryStatus, EntryType, KnowledgeEntryCreate
+from src.models.knowledge import (
+    EntrySource,
+    EntryStatus,
+    EntryType,
+    KnowledgeEntryCreate,
+)

 if TYPE_CHECKING:
    from src.models.incident import Incident
@@ -38,6 +45,88 @@ if TYPE_CHECKING:

 logger = structlog.get_logger(__name__)

+_CARD_MAX_LEN = 3600  # length cap applied to the finished review card text
+_SECTION_RE = re.compile(r"^#{1,6}\s+(?P<title>.+?)\s*$")  # Markdown heading, e.g. "## Title"
+_BULLET_RE = re.compile(r"^\s*(?:[-*]|\d+[.)])\s*")  # leading "-", "*", "1." or "1)" marker
+
+
+def _html(text: object) -> str:
+    return html.escape(str(text), quote=False)  # escapes &, < and >; quote characters are kept
+
+
+def _shorten(text: object, limit: int = 120) -> str:
+    """Collapse whitespace runs and trim to ``limit`` characters, marking a cut with "…"."""
+    squeezed = " ".join(str(text or "").split())
+    keep = max(0, limit - 1)  # leave one character of room for the ellipsis
+    return squeezed if len(squeezed) <= limit else f"{squeezed[:keep].rstrip()}…"
+
+
+def _clean_preview_line(line: str) -> str:
+    """Strip Markdown heading/bullet markup and backticks, then collapse whitespace."""
+    no_heading = _SECTION_RE.sub("", line.strip())  # a heading line is removed entirely
+    no_bullet = _BULLET_RE.sub("", no_heading).strip()
+    return " ".join(no_bullet.replace("`", "").split())
+
+
+def _section_preview(content: str, title_keyword: str, *, fallback: str) -> str:
+    """Extract a one-line readable summary from Markdown content.
+
+    Returns the first non-empty cleaned line under a heading whose title contains
+    ``title_keyword`` (shortened to 120 chars), or ``fallback`` if there is none,
+    so the whole raw Runbook text is not dumped into Telegram.
+    """
+    inside = False
+    # Blank lines are skipped; every heading re-decides whether we are in the section.
+    for stripped in filter(None, map(str.strip, str(content or "").splitlines())):
+        match = _SECTION_RE.match(stripped)
+        if match is not None:
+            inside = title_keyword in match.group("title")
+        elif inside:
+            cleaned = _clean_preview_line(stripped)
+            if cleaned:
+                return _shorten(cleaned, 120)
+    return fallback
+
+
+def _step_preview(content: str) -> str:
+    """Preview the execution section; steps with placeholders/errors become a warning."""
+    step = _section_preview(content, "執行", fallback="待審核 Runbook 執行步驟")
+    broken = any(t in step for t in ("{host}", "{target}", "Unsupported scheme", "Invalid component name"))
+    return "含 placeholder 或不支援的執行步驟，需人工修正後才能發布" if broken else _shorten(step, 120)
+
+
+def format_runbook_review_card(incident: object, entry_id: str, content: str) -> str:
+    """Format the Telegram HTML review card for a draft Runbook.
+
+    2026-05-07 Codex — replaces the plain Markdown preview with a governance card so
+    SREs can quickly judge knowledge state, affected services and review focus.
+    Free-form fields are shortened before HTML-escaping so that the closing length
+    cap is only a backstop and does not slice through a tag or an entity.
+    """
+    incident_id = _shorten(getattr(incident, "incident_id", None), 80) or "unknown"
+    names = ", ".join(map(str, getattr(incident, "affected_services", None) or []))
+    services = _shorten(names, 200) or "unknown"
+    entry_label = _shorten(entry_id, 80)
+    symptom = _section_preview(content, "症狀", fallback=f"Incident {incident_id} 的修復知識待審核")
+    step = _step_preview(content)
+
+    message = (
+        "📄 <b>RUNBOOK REVIEW｜待審核</b>\n"
+        "──────────────────────\n"
+        f"📋 Incident：<code>{_html(incident_id)}</code>\n"
+        f"🧩 受影響服務：<code>{_html(services)}</code>\n"
+        "🧠 知識狀態：<b>DRAFT｜需人工審核</b>\n"
+        f"🗂️ Entry ID：<code>{_html(entry_label)}</code>\n\n"
+        "🧾 <b>內容摘要</b>\n"
+        f"├ 症狀：{_html(symptom)}\n"
+        f"└ 執行：{_html(step)}\n\n"
+        "✅ <b>審核重點</b>\n"
+        "1. 確認步驟可重跑，且不含 placeholder / 不支援 scheme\n"
+        "2. 補齊適用條件、rollback 與驗證方式\n\n"
+        "🔎 AwoooP：知識庫 / Runbook Review"
+    )
+    return message[:_CARD_MAX_LEN]
+

 class NemotronRunbookGenerator:
    """
@@ -109,7 +198,7 @@ class NemotronRunbookGenerator:
                playbook_id=playbook.playbook_id,
            )

-            await self._push_runbook_review_card(incident, entry.id, content[:200])
+            await self._push_runbook_review_card(incident, entry.id, content)

        except Exception as e:
            logger.error(
@@ -300,13 +389,7 @@ class NemotronRunbookGenerator:
        try:
            from src.services.telegram_gateway import get_telegram_gateway
            tg = get_telegram_gateway()
-            await tg.send_text(
-                f"📄 <b>Auto Runbook 待審核</b>\n"
-                f"Incident: <code>{incident.incident_id}</code>\n"
-                f"Entry ID: <code>{entry_id}</code>\n\n"
-                f"<i>{content_preview}...</i>\n\n"
-                f"請至知識庫審核並發布。"
-            )
+            await tg.send_text(format_runbook_review_card(incident, entry_id, content_preview))
        except Exception as e:
            logger.warning("runbook_review_card_failed", error=str(e))

--- a/apps/api/tests/test_failover_alerter.py
+++ b/apps/api/tests/test_failover_alerter.py
@@ -22,6 +22,7 @@ from src.services.failover_alerter import (
    _lines_from_list,
    _sanitize_telegram_error,
    configure_alerter,
+    format_governance_alert_card,
    get_failover_alerter,
    reset_failover_alerter,
 )
@@ -249,3 +250,57 @@ def test_sanitize_telegram_error_redacts_bot_token_url() -> None:

    assert "SECRET" not in sanitized
    assert "bot<redacted>" in sanitized
+
+
+def test_governance_alert_card_formats_knowledge_degradation() -> None:
+    """A knowledge_degradation payload renders labelled, escaped rows and the next action."""
+    payload = {
+        "status": "warning",
+        "impact": {
+            "stale_count": 948,
+            "stale_days": 7,
+            "stale_ratio": 0.521,
+            "threshold": 0.2,
+            "total_count": 1819,
+        },
+        "remediation": {
+            "items": [
+                "啟動 KM 反查與自動補齊流程",
+                "關鍵服務告警自動同步到 KM 任務",
+            ],
+            "next_action": "run_kb_growth_healthcheck",
+        },
+        "actionable": {
+            "items": [
+                "每日檢查 ANTI_PATTERN 更新結果",
+                "安排 owner 對 stale 條目做快速人工審核",
+            ]
+        },
+    }
+    card = format_governance_alert_card("knowledge_degradation", payload)
+
+    # Title, section header, labelled and backslash-escaped values; no legacy fallback heading.
+    assert "*AI 治理警報｜知識庫劣化*" in card
+    assert "🧭 *影響摘要*" in card
+    assert "陳舊 KM：948" in card
+    assert "陳舊比例：52\\.1%" in card
+    assert "▶️ 下一步：run\\_kb\\_growth\\_healthcheck" in card
+    assert "欄位快覽" not in card
+
+
+def test_governance_alert_card_limits_fallback_fields() -> None:
+    """With five extra top-level fields the card shows the overflow notice, not field_e."""
+    payload = {
+        "status": "warning",
+        "field_a": "a",
+        "field_b": "b",
+        "field_c": "c",
+        "field_d": "d",
+        "field_e": "e",
+    }
+    card = format_governance_alert_card("custom_signal", payload)
+
+    # field_e sorts last, so it is the field replaced by the notice.
+    assert "📎 *補充欄位*" in card
+    assert "更多欄位已收斂至 AwoooP 稽核資料" in card
+    assert "field\\_e" not in card
--- a/apps/api/tests/test_phase25_auto_harvesting.py
+++ b/apps/api/tests/test_phase25_auto_harvesting.py
@@ -14,6 +14,7 @@ ADR-052 / Phase 25 P1: NemotronRunbookGenerator + AntiPattern Gate
 """

 from pathlib import Path
+from types import SimpleNamespace

 # Source file paths
 _BASE = Path(__file__).parent.parent / "src"
@@ -80,6 +81,30 @@ class TestRunbookGeneratorModule:
        source = _RUNBOOK_GEN.read_text()
        assert "fallback" in source

+    def test_runbook_review_card_is_structured_html(self):
+        """The Telegram Runbook review message must be a scannable governance card, not a raw Markdown dump."""
+        from src.services.runbook_generator import format_runbook_review_card
+
+        incident = SimpleNamespace(  # stand-in exposing only the attributes the formatter reads
+            incident_id="INC-20260506-E54736",
+            affected_services=["node-exporter-110"],
+        )
+        content = (
+            "## 症狀描述\n"
+            "Incident INC-20260506-E54736，受影響服務：node-exporter-110\n\n"
+            "## 執行步驟\n"
+            "- Step 1: ssh{host} echo '=== LOAD ===' -> FAILED: Unsupported scheme\n"
+        )
+
+        card = format_runbook_review_card(incident, "ff5eff01-7243-44bf", content)
+
+        assert "<b>RUNBOOK REVIEW｜待審核</b>" in card
+        assert "<code>INC-20260506-E54736</code>" in card
+        assert "🧾 <b>內容摘要</b>" in card
+        assert "placeholder 或不支援的執行步驟" in card  # broken step is replaced by the warning text
+        assert "## 症狀描述" not in card  # raw Markdown headings must not leak into the card
+        assert "ssh{host}" not in card  # nor the unresolved placeholder command
+

 # =============================================================================
 # TestAutoRepairService — fire-and-forget 與 GC 防洩漏