From 341c3b6523a9ba6f9dfbdd5321de5d1a5a353743 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 7 May 2026 00:58:20 +0800 Subject: [PATCH] fix(telegram): format governance and runbook alerts --- apps/api/src/services/failover_alerter.py | 225 ++++++++++++++---- apps/api/src/services/runbook_generator.py | 101 +++++++- apps/api/tests/test_failover_alerter.py | 55 +++++ .../api/tests/test_phase25_auto_harvesting.py | 25 ++ 4 files changed, 356 insertions(+), 50 deletions(-) diff --git a/apps/api/src/services/failover_alerter.py b/apps/api/src/services/failover_alerter.py index 8e033de6..3885a2a5 100644 --- a/apps/api/src/services/failover_alerter.py +++ b/apps/api/src/services/failover_alerter.py @@ -134,46 +134,7 @@ class FailoverAlerter: logger.debug("governance_alert_dedup_skipped", event_type=event_type) return - status = _escape_md(str(payload.get("status", "warning"))) - impact = _as_dict(payload.get("impact")) - remediation = _as_dict(payload.get("remediation")) - actionable = _as_dict(payload.get("actionable")) - - impact_lines = _lines_from_dict(impact, max_items=12, compact=True) - remediation_lines = _lines_from_list(remediation.get("items")) - remediation_next_action = remediation.get("next_action") - remediation_hint = remediation.get("hint") - actionable_lines = _lines_from_list(actionable.get("items")) - - next_action_line = "" - if remediation_next_action: - next_action_line = f"\n 下一步:{_escape_md(str(remediation_next_action))}" - if remediation_hint: - next_action_line += f"\n 提示:{_escape_md(str(remediation_hint))}" - - sections: list[str] = [ - "⚠️ *AI 治理警報*", - f"\n類型:{_escape_md(event_type)}", - f"狀態:{status}", - ] - if impact_lines: - sections.append(f"\n*影響*\n{impact_lines}") - if remediation_lines or next_action_line: - sections.append("\n*修復方向*") - if remediation_lines: - sections.append(remediation_lines) - if next_action_line: - sections.append(next_action_line) - if actionable_lines: - sections.append(f"\n*可直接自動化*\n{actionable_lines}") - 
- fallback_items = _fallback_pairs(payload, keep={"status", "impact", "remediation", "actionable"}) - if fallback_items: - sections.append( - "\n*欄位快覽(備援)*\n" + "\n".join(fallback_items) - ) - - msg = "\n".join(sections) + msg = format_governance_alert_card(event_type, payload) await self._send(msg) logger.info("governance_alert_sent", event_type=event_type) @@ -336,6 +297,180 @@ def _as_dict(value: Any) -> dict[str, Any]: return value if isinstance(value, dict) else {} +_EVENT_DISPLAY_NAMES = { + "trust_drift": "信任漂移", + "knowledge_degradation": "知識庫劣化", + "governance_slo_data_gap": "SLO 資料缺口", + "governance_self_failure": "治理自檢失敗", + "llm_hallucination": "LLM 驗證失敗", + "execution_blast_radius": "執行風險擴大", +} + +_STATUS_BADGES = { + "critical": "🔴 critical", + "error": "🔴 error", + "violation": "🔴 violation", + "warning": "🟡 warning", + "degraded": "🟠 degraded", + "ok": "🟢 ok", +} + +_IMPACT_PROFILES: dict[str, list[tuple[str, str]]] = { + "trust_drift": [ + ("drifted_count", "漂移 Playbook"), + ("total_playbooks", "總 Playbook"), + ("drift_ratio", "漂移比例"), + ("threshold", "警戒門檻"), + ("auto_deprecated_count", "自動停用"), + ], + "knowledge_degradation": [ + ("stale_count", "陳舊 KM"), + ("total_count", "總 KM"), + ("stale_ratio", "陳舊比例"), + ("threshold", "警戒門檻"), + ("stale_days", "陳舊天數"), + ], + "governance_slo_data_gap": [ + ("reason", "缺口原因"), + ("skipped_count", "略過指標"), + ("all_slo_metrics_not_emitted", "SLO 指標缺失"), + ], + "governance_self_failure": [ + ("failed_checks", "失敗檢查"), + ("total_checks", "總檢查"), + ("failure_rate", "失敗比例"), + ], + "execution_blast_radius": [ + ("affected_services", "受影響服務"), + ("blast_radius", "爆炸半徑"), + ("threshold", "警戒門檻"), + ], + "llm_hallucination": [ + ("failed", "驗證失敗"), + ("rate", "失敗比例"), + ("threshold", "警戒門檻"), + ], +} + + +def _event_display_name(event_type: str) -> str: + if event_type in _EVENT_DISPLAY_NAMES: + return _EVENT_DISPLAY_NAMES[event_type] + if event_type.startswith("slo_"): + return "SLO 違反" + return 
event_type.replace("_", " ").strip().title() + + +def _status_badge(status: Any) -> str: + status_text = str(status or "warning") + return _STATUS_BADGES.get(status_text.lower(), status_text) + + +def _format_metric_value(key: str, value: Any) -> str: + if isinstance(value, bool): + return "是" if value else "否" + if isinstance(value, (float, int)) and ( + key.endswith("_ratio") or key in {"threshold", "rate", "failure_rate"} + ): + return f"{float(value) * 100:.1f}%" + if isinstance(value, list): + if not value: + return "0" + shown = ", ".join(str(item) for item in value[:3]) + if len(value) > 3: + shown += f"…(共 {len(value)})" + return shown + return str(value) + + +def _profiled_rows(event_type: str, data: dict[str, Any], *, max_rows: int = 8) -> list[str]: + if not data: + return [] + + used: set[str] = set() + rows: list[str] = [] + for key, label in _IMPACT_PROFILES.get(event_type, []): + if key in data: + rows.append(f"{label}:{_format_metric_value(key, data[key])}") + used.add(key) + + for key in sorted(data.keys()): + if len(rows) >= max_rows: + break + if key in used: + continue + rows.append(f"{key}:{_format_metric_value(key, data[key])}") + + if len(data) > len(used) + max(0, max_rows - len(rows)): + rows.append("更多欄位已收斂至 AwoooP 稽核資料") + return rows[:max_rows] + + +def _tree_lines(rows: list[str]) -> str: + if not rows: + return "" + rendered: list[str] = [] + for idx, row in enumerate(rows): + branch = "└" if idx == len(rows) - 1 else "├" + rendered.append(f"{branch} {_escape_md(str(row))}") + return "\n".join(rendered) + + +def _governance_summary_lines(event_type: str, impact: dict[str, Any]) -> str: + rows = _profiled_rows(event_type, impact) + return _tree_lines(rows) + + +def format_governance_alert_card(event_type: str, payload: dict[str, Any]) -> str: + """格式化 AI 治理 Telegram 卡片。 + + 2026-05-07 Codex — 保留治理 payload,僅在 Telegram 邊界層把 raw key/value + 轉成可掃描卡片,避免大量純文字欄位洗版。 + """ + payload = payload if isinstance(payload, dict) else {} + impact = 
_as_dict(payload.get("impact")) + remediation = _as_dict(payload.get("remediation")) + actionable = _as_dict(payload.get("actionable")) + status = payload.get("status", "warning") + + sections: list[str] = [ + f"⚠️ *AI 治理警報|{_escape_md(_event_display_name(event_type))}*", + "──────────────────────", + f"類型:{_escape_md(event_type)}", + f"狀態:{_escape_md(_status_badge(status))}", + ] + + impact_lines = _governance_summary_lines(event_type, impact) + if impact_lines: + sections.extend(["", "🧭 *影響摘要*", impact_lines]) + + remediation_lines = _lines_from_list(remediation.get("items")) + remediation_next_action = remediation.get("next_action") + remediation_hint = remediation.get("hint") + if remediation_lines or remediation_next_action or remediation_hint: + sections.extend(["", "🛠️ *修復方向*"]) + if remediation_lines: + sections.append(remediation_lines) + if remediation_next_action: + sections.append(f"▶️ 下一步:{_escape_md(str(remediation_next_action))}") + if remediation_hint: + sections.append(f"💡 提示:{_escape_md(str(remediation_hint))}") + + actionable_lines = _lines_from_list(actionable.get("items")) + if actionable_lines: + sections.extend(["", "🤖 *可自動化工作*", actionable_lines]) + + fallback_items = _fallback_pairs( + payload, + keep={"status", "impact", "remediation", "actionable"}, + max_items=4, + ) + if fallback_items: + sections.extend(["", "📎 *補充欄位*", "\n".join(fallback_items)]) + + return "\n".join(sections) + + def _lines_from_dict(data: dict[str, Any], max_items: int = 20, compact: bool = False) -> str: if not data: return "" @@ -360,7 +495,12 @@ def _lines_from_list(value: Any) -> str: ) -def _fallback_pairs(payload: dict[str, Any], keep: set[str] | None = None) -> list[str]: +def _fallback_pairs( + payload: dict[str, Any], + keep: set[str] | None = None, + *, + max_items: int | None = None, +) -> list[str]: if not isinstance(payload, dict): return [] keep = set(keep or set()) @@ -368,6 +508,9 @@ def _fallback_pairs(payload: dict[str, Any], keep: set[str] | None 
= None) -> li for key in sorted(payload.keys()): if key in keep: continue + if max_items is not None and len(rows) >= max_items: + rows.append(_escape_md("更多欄位已收斂至 AwoooP 稽核資料")) + break rows.append(f"{_escape_md(str(key))}:{_escape_md(str(payload.get(key)))}") return rows diff --git a/apps/api/src/services/runbook_generator.py b/apps/api/src/services/runbook_generator.py index 6b06ecd7..5bca4f0e 100644 --- a/apps/api/src/services/runbook_generator.py +++ b/apps/api/src/services/runbook_generator.py @@ -24,12 +24,19 @@ Runbook Generator - Phase 25 P1 Knowledge Auto-Harvesting from __future__ import annotations import asyncio +import html +import re import time from typing import TYPE_CHECKING import structlog -from src.models.knowledge import EntrySource, EntryStatus, EntryType, KnowledgeEntryCreate +from src.models.knowledge import ( + EntrySource, + EntryStatus, + EntryType, + KnowledgeEntryCreate, +) if TYPE_CHECKING: from src.models.incident import Incident @@ -38,6 +45,88 @@ if TYPE_CHECKING: logger = structlog.get_logger(__name__) +_CARD_MAX_LEN = 3600 +_SECTION_RE = re.compile(r"^#{1,6}\s+(?P<title>.+?)\s*$") +_BULLET_RE = re.compile(r"^\s*(?:[-*]|\d+[.)])\s*") + + +def _html(text: object) -> str: + return html.escape(str(text), quote=False) + + +def _shorten(text: object, limit: int = 120) -> str: + compact = " ".join(str(text or "").split()) + if len(compact) <= limit: + return compact + return compact[: max(0, limit - 1)].rstrip() + "…" + + +def _clean_preview_line(line: str) -> str: + line = _SECTION_RE.sub("", line.strip()) + line = _BULLET_RE.sub("", line).strip() + line = line.replace("`", "") + return " ".join(line.split()) + + +def _section_preview(content: str, title_keyword: str, *, fallback: str) -> str: + """從 Markdown 內容抽一行可讀摘要,避免把整段 Runbook 原文丟進 Telegram。""" + lines = str(content or "").splitlines() + in_section = False + for raw_line in lines: + line = raw_line.strip() + if not line: + continue + heading = _SECTION_RE.match(line) + if heading: + 
in_section = title_keyword in heading.group("title") + continue + if not in_section: + continue + preview = _clean_preview_line(line) + if preview: + return _shorten(preview, 120) + return fallback + + +def _step_preview(content: str) -> str: + preview = _section_preview(content, "執行", fallback="待審核 Runbook 執行步驟") + if any(token in preview for token in ("{host}", "{target}", "Unsupported scheme", "Invalid component name")): + return "含 placeholder 或不支援的執行步驟,需人工修正後才能發布" + return _shorten(preview, 120) + + +def format_runbook_review_card( + incident: object, + entry_id: str, + content: str, +) -> str: + """格式化 Telegram Runbook 審核卡片。 + + 2026-05-07 Codex — 將純文字 Markdown preview 改成治理卡片,讓 SRE + 能快速判斷知識狀態、受影響服務與審核重點。 + """ + incident_id = getattr(incident, "incident_id", "unknown") + services = ", ".join(getattr(incident, "affected_services", None) or []) or "unknown" + symptom = _section_preview(content, "症狀", fallback=f"Incident {incident_id} 的修復知識待審核") + step = _step_preview(content) + + message = ( + "📄 <b>RUNBOOK REVIEW|待審核</b>\n" + "──────────────────────\n" + f"📋 Incident:<code>{_html(incident_id)}</code>\n" + f"🧩 受影響服務:<code>{_html(services)}</code>\n" + "🧠 知識狀態:<b>DRAFT|需人工審核</b>\n" + f"🗂️ Entry ID:<code>{_html(entry_id)}</code>\n\n" + "🧾 <b>內容摘要</b>\n" + f"├ 症狀:{_html(symptom)}\n" + f"└ 執行:{_html(step)}\n\n" + "✅ <b>審核重點</b>\n" + "1. 確認步驟可重跑,且不含 placeholder / 不支援 scheme\n" + "2. 
補齊適用條件、rollback 與驗證方式\n\n" + "🔎 AwoooP:知識庫 / Runbook Review" + ) + return message[:_CARD_MAX_LEN] + class NemotronRunbookGenerator: """ @@ -109,7 +198,7 @@ class NemotronRunbookGenerator: playbook_id=playbook.playbook_id, ) - await self._push_runbook_review_card(incident, entry.id, content[:200]) + await self._push_runbook_review_card(incident, entry.id, content) except Exception as e: logger.error( @@ -300,13 +389,7 @@ class NemotronRunbookGenerator: try: from src.services.telegram_gateway import get_telegram_gateway tg = get_telegram_gateway() - await tg.send_text( - f"📄 <b>Auto Runbook 待審核</b>\n" - f"Incident: <code>{incident.incident_id}</code>\n" - f"Entry ID: <code>{entry_id}</code>\n\n" - f"<i>{content_preview}...</i>\n\n" - f"請至知識庫審核並發布。" - ) + await tg.send_text(format_runbook_review_card(incident, entry_id, content_preview)) except Exception as e: logger.warning("runbook_review_card_failed", error=str(e)) diff --git a/apps/api/tests/test_failover_alerter.py b/apps/api/tests/test_failover_alerter.py index 51bbe407..bcfed64e 100644 --- a/apps/api/tests/test_failover_alerter.py +++ b/apps/api/tests/test_failover_alerter.py @@ -22,6 +22,7 @@ from src.services.failover_alerter import ( _lines_from_list, _sanitize_telegram_error, configure_alerter, + format_governance_alert_card, get_failover_alerter, reset_failover_alerter, ) @@ -249,3 +250,57 @@ def test_sanitize_telegram_error_redacts_bot_token_url() -> None: assert "SECRET" not in sanitized assert "bot<redacted>" in sanitized + + +def test_governance_alert_card_formats_knowledge_degradation() -> None: + card = format_governance_alert_card( + "knowledge_degradation", + { + "status": "warning", + "impact": { + "stale_count": 948, + "stale_days": 7, + "stale_ratio": 0.521, + "threshold": 0.2, + "total_count": 1819, + }, + "remediation": { + "items": [ + "啟動 KM 反查與自動補齊流程", + "關鍵服務告警自動同步到 KM 任務", + ], + "next_action": "run_kb_growth_healthcheck", + }, + "actionable": { + "items": [ + "每日檢查 ANTI_PATTERN 更新結果", + 
"安排 owner 對 stale 條目做快速人工審核", + ] + }, + }, + ) + + assert "*AI 治理警報|知識庫劣化*" in card + assert "🧭 *影響摘要*" in card + assert "陳舊 KM:948" in card + assert "陳舊比例:52\\.1%" in card + assert "▶️ 下一步:run\\_kb\\_growth\\_healthcheck" in card + assert "欄位快覽" not in card + + +def test_governance_alert_card_limits_fallback_fields() -> None: + card = format_governance_alert_card( + "custom_signal", + { + "status": "warning", + "field_a": "a", + "field_b": "b", + "field_c": "c", + "field_d": "d", + "field_e": "e", + }, + ) + + assert "📎 *補充欄位*" in card + assert "更多欄位已收斂至 AwoooP 稽核資料" in card + assert "field\\_e" not in card diff --git a/apps/api/tests/test_phase25_auto_harvesting.py b/apps/api/tests/test_phase25_auto_harvesting.py index e4c381e8..45b80973 100644 --- a/apps/api/tests/test_phase25_auto_harvesting.py +++ b/apps/api/tests/test_phase25_auto_harvesting.py @@ -14,6 +14,7 @@ ADR-052 / Phase 25 P1: NemotronRunbookGenerator + AntiPattern Gate """ from pathlib import Path +from types import SimpleNamespace # Source file paths _BASE = Path(__file__).parent.parent / "src" @@ -80,6 +81,30 @@ class TestRunbookGeneratorModule: source = _RUNBOOK_GEN.read_text() assert "fallback" in source + def test_runbook_review_card_is_structured_html(self): + """Telegram Runbook 審核訊息必須是可掃描治理卡片,不直接傾倒 Markdown 原文""" + from src.services.runbook_generator import format_runbook_review_card + + incident = SimpleNamespace( + incident_id="INC-20260506-E54736", + affected_services=["node-exporter-110"], + ) + content = ( + "## 症狀描述\n" + "Incident INC-20260506-E54736,受影響服務:node-exporter-110\n\n" + "## 執行步驟\n" + "- Step 1: ssh{host} echo '=== LOAD ===' -> FAILED: Unsupported scheme\n" + ) + + card = format_runbook_review_card(incident, "ff5eff01-7243-44bf", content) + + assert "<b>RUNBOOK REVIEW|待審核</b>" in card + assert "<code>INC-20260506-E54736</code>" in card + assert "🧾 <b>內容摘要</b>" in card + assert "placeholder 或不支援的執行步驟" in card + assert "## 症狀描述" not in card + assert "ssh{host}" not in card + # 
============================================================================= # TestAutoRepairService — fire-and-forget 與 GC 防洩漏