fix(telegram): format governance and runbook alerts
This commit is contained in:
@@ -134,46 +134,7 @@ class FailoverAlerter:
|
||||
logger.debug("governance_alert_dedup_skipped", event_type=event_type)
|
||||
return
|
||||
|
||||
status = _escape_md(str(payload.get("status", "warning")))
|
||||
impact = _as_dict(payload.get("impact"))
|
||||
remediation = _as_dict(payload.get("remediation"))
|
||||
actionable = _as_dict(payload.get("actionable"))
|
||||
|
||||
impact_lines = _lines_from_dict(impact, max_items=12, compact=True)
|
||||
remediation_lines = _lines_from_list(remediation.get("items"))
|
||||
remediation_next_action = remediation.get("next_action")
|
||||
remediation_hint = remediation.get("hint")
|
||||
actionable_lines = _lines_from_list(actionable.get("items"))
|
||||
|
||||
next_action_line = ""
|
||||
if remediation_next_action:
|
||||
next_action_line = f"\n 下一步:{_escape_md(str(remediation_next_action))}"
|
||||
if remediation_hint:
|
||||
next_action_line += f"\n 提示:{_escape_md(str(remediation_hint))}"
|
||||
|
||||
sections: list[str] = [
|
||||
"⚠️ *AI 治理警報*",
|
||||
f"\n類型:{_escape_md(event_type)}",
|
||||
f"狀態:{status}",
|
||||
]
|
||||
if impact_lines:
|
||||
sections.append(f"\n*影響*\n{impact_lines}")
|
||||
if remediation_lines or next_action_line:
|
||||
sections.append("\n*修復方向*")
|
||||
if remediation_lines:
|
||||
sections.append(remediation_lines)
|
||||
if next_action_line:
|
||||
sections.append(next_action_line)
|
||||
if actionable_lines:
|
||||
sections.append(f"\n*可直接自動化*\n{actionable_lines}")
|
||||
|
||||
fallback_items = _fallback_pairs(payload, keep={"status", "impact", "remediation", "actionable"})
|
||||
if fallback_items:
|
||||
sections.append(
|
||||
"\n*欄位快覽(備援)*\n" + "\n".join(fallback_items)
|
||||
)
|
||||
|
||||
msg = "\n".join(sections)
|
||||
msg = format_governance_alert_card(event_type, payload)
|
||||
await self._send(msg)
|
||||
logger.info("governance_alert_sent", event_type=event_type)
|
||||
|
||||
@@ -336,6 +297,180 @@ def _as_dict(value: Any) -> dict[str, Any]:
|
||||
return value if isinstance(value, dict) else {}
|
||||
|
||||
|
||||
_EVENT_DISPLAY_NAMES = {
|
||||
"trust_drift": "信任漂移",
|
||||
"knowledge_degradation": "知識庫劣化",
|
||||
"governance_slo_data_gap": "SLO 資料缺口",
|
||||
"governance_self_failure": "治理自檢失敗",
|
||||
"llm_hallucination": "LLM 驗證失敗",
|
||||
"execution_blast_radius": "執行風險擴大",
|
||||
}
|
||||
|
||||
_STATUS_BADGES = {
|
||||
"critical": "🔴 critical",
|
||||
"error": "🔴 error",
|
||||
"violation": "🔴 violation",
|
||||
"warning": "🟡 warning",
|
||||
"degraded": "🟠 degraded",
|
||||
"ok": "🟢 ok",
|
||||
}
|
||||
|
||||
_IMPACT_PROFILES: dict[str, list[tuple[str, str]]] = {
|
||||
"trust_drift": [
|
||||
("drifted_count", "漂移 Playbook"),
|
||||
("total_playbooks", "總 Playbook"),
|
||||
("drift_ratio", "漂移比例"),
|
||||
("threshold", "警戒門檻"),
|
||||
("auto_deprecated_count", "自動停用"),
|
||||
],
|
||||
"knowledge_degradation": [
|
||||
("stale_count", "陳舊 KM"),
|
||||
("total_count", "總 KM"),
|
||||
("stale_ratio", "陳舊比例"),
|
||||
("threshold", "警戒門檻"),
|
||||
("stale_days", "陳舊天數"),
|
||||
],
|
||||
"governance_slo_data_gap": [
|
||||
("reason", "缺口原因"),
|
||||
("skipped_count", "略過指標"),
|
||||
("all_slo_metrics_not_emitted", "SLO 指標缺失"),
|
||||
],
|
||||
"governance_self_failure": [
|
||||
("failed_checks", "失敗檢查"),
|
||||
("total_checks", "總檢查"),
|
||||
("failure_rate", "失敗比例"),
|
||||
],
|
||||
"execution_blast_radius": [
|
||||
("affected_services", "受影響服務"),
|
||||
("blast_radius", "爆炸半徑"),
|
||||
("threshold", "警戒門檻"),
|
||||
],
|
||||
"llm_hallucination": [
|
||||
("failed", "驗證失敗"),
|
||||
("rate", "失敗比例"),
|
||||
("threshold", "警戒門檻"),
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def _event_display_name(event_type: str) -> str:
    """Return the title used for *event_type* in the card header.

    Lookup order: an exact entry in ``_EVENT_DISPLAY_NAMES``, then the generic
    SLO-violation title for any type starting with ``slo_``, and finally the
    event type itself with underscores turned into spaces and title-cased.
    """
    if event_type in _EVENT_DISPLAY_NAMES:
        title = _EVENT_DISPLAY_NAMES[event_type]
    elif event_type.startswith("slo_"):
        title = "SLO 違反"
    else:
        spaced = event_type.replace("_", " ")
        title = spaced.strip().title()
    return title
|
||||
|
||||
|
||||
def _status_badge(status: Any) -> str:
    """Return the badge text for *status*.

    A falsy status is treated as ``"warning"``. The lookup in
    ``_STATUS_BADGES`` is case-insensitive; a status without a badge is
    returned as its plain string form, original casing kept.
    """
    label = str(status) if status else "warning"
    return _STATUS_BADGES.get(label.lower(), label)
|
||||
|
||||
|
||||
def _format_metric_value(key: str, value: Any) -> str:
|
||||
if isinstance(value, bool):
|
||||
return "是" if value else "否"
|
||||
if isinstance(value, (float, int)) and (
|
||||
key.endswith("_ratio") or key in {"threshold", "rate", "failure_rate"}
|
||||
):
|
||||
return f"{float(value) * 100:.1f}%"
|
||||
if isinstance(value, list):
|
||||
if not value:
|
||||
return "0"
|
||||
shown = ", ".join(str(item) for item in value[:3])
|
||||
if len(value) > 3:
|
||||
shown += f"…(共 {len(value)})"
|
||||
return shown
|
||||
return str(value)
|
||||
|
||||
|
||||
def _profiled_rows(event_type: str, data: dict[str, Any], *, max_rows: int = 8) -> list[str]:
    """Turn an impact dict into at most *max_rows* ``label:value`` rows.

    Keys listed in ``_IMPACT_PROFILES[event_type]`` come first, in profile
    order and under their profile label; the remaining keys follow in sorted
    order, labelled with the raw key. Values are rendered with
    ``_format_metric_value``.

    When there are more keys than *max_rows*, the last row is replaced by a
    notice that further fields were omitted, so the notice is shown exactly
    when something is hidden and the result never exceeds *max_rows* rows.

    Args:
        event_type: Governance event type used to pick the display profile.
        data: The impact mapping to render.
        max_rows: Upper bound on the number of returned rows.

    Returns:
        The rendered rows; empty when *data* is empty or *max_rows* is not
        positive.
    """
    if not data or max_rows <= 0:
        return []

    # (key, label) pairs in display order: profiled keys first.
    ordered: list[tuple[Any, Any]] = [
        (key, label)
        for key, label in _IMPACT_PROFILES.get(event_type, [])
        if key in data
    ]
    profiled = {key for key, _ in ordered}
    ordered.extend((key, key) for key in sorted(data) if key not in profiled)

    # Reserve the final row for the notice only when something is truncated.
    overflow = len(ordered) > max_rows
    visible = ordered[: max_rows - 1] if overflow else ordered

    rows = [f"{label}:{_format_metric_value(key, data[key])}" for key, label in visible]
    if overflow:
        rows.append("更多欄位已收斂至 AwoooP 稽核資料")
    return rows
|
||||
|
||||
|
||||
def _tree_lines(rows: list[str]) -> str:
    """Join *rows* into a tree-style block, one escaped row per line.

    Every row is prefixed with "├" except the last, which gets "└". Row text
    is passed through ``_escape_md``. An empty *rows* yields an empty string.
    """
    if not rows:
        return ""
    *leading, final = rows
    pieces = [f"├ {_escape_md(str(item))}" for item in leading]
    pieces.append(f"└ {_escape_md(str(final))}")
    return "\n".join(pieces)
|
||||
|
||||
|
||||
def _governance_summary_lines(event_type: str, impact: dict[str, Any]) -> str:
    """Render the impact dict as the tree-style text of the summary section."""
    return _tree_lines(_profiled_rows(event_type, impact))
|
||||
|
||||
|
||||
def format_governance_alert_card(event_type: str, payload: dict[str, Any]) -> str:
    """Format the AI-governance Telegram card.

    2026-05-07 Codex — the governance payload itself is left untouched; only at
    the Telegram boundary are the raw key/value pairs turned into a scannable
    card, so that large amounts of plain-text fields do not flood the chat.

    The card is a newline-joined sequence of: a header (display name, raw
    event type, status badge), an impact summary, remediation guidance, a list
    of automatable work items, and up to four remaining top-level payload
    fields. Each section after the header is emitted only when it has content.
    A non-dict *payload* is treated as empty.
    """
    if not isinstance(payload, dict):
        payload = {}

    impact = _as_dict(payload.get("impact"))
    remediation = _as_dict(payload.get("remediation"))
    actionable = _as_dict(payload.get("actionable"))

    title = _escape_md(_event_display_name(event_type))
    raw_type = _escape_md(event_type)
    badge = _escape_md(_status_badge(payload.get("status", "warning")))
    card: list[str] = [
        f"⚠️ *AI 治理警報|{title}*",
        "──────────────────────",
        f"類型:{raw_type}",
        f"狀態:{badge}",
    ]

    # Impact summary.
    summary = _governance_summary_lines(event_type, impact)
    if summary:
        card += ["", "🧭 *影響摘要*", summary]

    # Remediation: listed items, then the optional next action and hint.
    guidance: list[str] = []
    steps = _lines_from_list(remediation.get("items"))
    if steps:
        guidance.append(steps)
    next_action = remediation.get("next_action")
    if next_action:
        guidance.append(f"▶️ 下一步:{_escape_md(str(next_action))}")
    hint = remediation.get("hint")
    if hint:
        guidance.append(f"💡 提示:{_escape_md(str(hint))}")
    if guidance:
        card += ["", "🛠️ *修復方向*", *guidance]

    # Work items that can be automated.
    automatable = _lines_from_list(actionable.get("items"))
    if automatable:
        card += ["", "🤖 *可自動化工作*", automatable]

    # Any other top-level payload fields, capped at four rows.
    extras = _fallback_pairs(
        payload,
        keep={"status", "impact", "remediation", "actionable"},
        max_items=4,
    )
    if extras:
        card += ["", "📎 *補充欄位*", "\n".join(extras)]

    return "\n".join(card)
|
||||
|
||||
|
||||
def _lines_from_dict(data: dict[str, Any], max_items: int = 20, compact: bool = False) -> str:
|
||||
if not data:
|
||||
return ""
|
||||
@@ -360,7 +495,12 @@ def _lines_from_list(value: Any) -> str:
|
||||
)
|
||||
|
||||
|
||||
def _fallback_pairs(payload: dict[str, Any], keep: set[str] | None = None) -> list[str]:
|
||||
def _fallback_pairs(
|
||||
payload: dict[str, Any],
|
||||
keep: set[str] | None = None,
|
||||
*,
|
||||
max_items: int | None = None,
|
||||
) -> list[str]:
|
||||
if not isinstance(payload, dict):
|
||||
return []
|
||||
keep = set(keep or set())
|
||||
@@ -368,6 +508,9 @@ def _fallback_pairs(payload: dict[str, Any], keep: set[str] | None = None) -> li
|
||||
for key in sorted(payload.keys()):
|
||||
if key in keep:
|
||||
continue
|
||||
if max_items is not None and len(rows) >= max_items:
|
||||
rows.append(_escape_md("更多欄位已收斂至 AwoooP 稽核資料"))
|
||||
break
|
||||
rows.append(f"{_escape_md(str(key))}:{_escape_md(str(payload.get(key)))}")
|
||||
return rows
|
||||
|
||||
|
||||
@@ -24,12 +24,19 @@ Runbook Generator - Phase 25 P1 Knowledge Auto-Harvesting
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import html
|
||||
import re
|
||||
import time
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import structlog
|
||||
|
||||
from src.models.knowledge import EntrySource, EntryStatus, EntryType, KnowledgeEntryCreate
|
||||
from src.models.knowledge import (
|
||||
EntrySource,
|
||||
EntryStatus,
|
||||
EntryType,
|
||||
KnowledgeEntryCreate,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from src.models.incident import Incident
|
||||
@@ -38,6 +45,88 @@ if TYPE_CHECKING:
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
_CARD_MAX_LEN = 3600
|
||||
_SECTION_RE = re.compile(r"^#{1,6}\s+(?P<title>.+?)\s*$")
|
||||
_BULLET_RE = re.compile(r"^\s*(?:[-*]|\d+[.)])\s*")
|
||||
|
||||
|
||||
def _html(text: object) -> str:
|
||||
return html.escape(str(text), quote=False)
|
||||
|
||||
|
||||
def _shorten(text: object, limit: int = 120) -> str:
|
||||
compact = " ".join(str(text or "").split())
|
||||
if len(compact) <= limit:
|
||||
return compact
|
||||
return compact[: max(0, limit - 1)].rstrip() + "…"
|
||||
|
||||
|
||||
def _clean_preview_line(line: str) -> str:
    """Strip Markdown decoration from a single line of runbook text.

    A line that is a heading is blanked out, a leading list marker is removed,
    backticks are dropped, and whitespace is collapsed to single spaces.
    """
    stripped = line.strip()
    without_heading = _SECTION_RE.sub("", stripped)
    without_bullet = _BULLET_RE.sub("", without_heading).strip()
    without_ticks = without_bullet.replace("`", "")
    return " ".join(without_ticks.split())
|
||||
|
||||
|
||||
def _section_preview(content: str, title_keyword: str, *, fallback: str) -> str:
    """Extract one readable summary line from Markdown content.

    This avoids dumping the whole runbook text into Telegram. The content is
    scanned line by line; each heading switches "inside the wanted section" on
    or off depending on whether its title contains *title_keyword*. The first
    line inside a wanted section that is non-empty after cleaning is returned,
    shortened to 120 characters. *fallback* is returned, unmodified, when no
    such line exists.
    """
    inside = False
    for candidate in (raw.strip() for raw in str(content or "").splitlines()):
        if not candidate:
            continue
        heading = _SECTION_RE.match(candidate)
        if heading is not None:
            # Every heading re-evaluates whether we are in the wanted section.
            inside = title_keyword in heading.group("title")
        elif inside:
            text = _clean_preview_line(candidate)
            if text:
                return _shorten(text, 120)
    return fallback
|
||||
|
||||
|
||||
def _step_preview(content: str) -> str:
    """Return a one-line preview of the runbook's execution section.

    When the preview contains an unfilled placeholder or a known failure
    message, a fixed warning is returned in place of the raw step text.
    """
    suspicious = ("{host}", "{target}", "Unsupported scheme", "Invalid component name")
    preview = _section_preview(content, "執行", fallback="待審核 Runbook 執行步驟")
    for marker in suspicious:
        if marker in preview:
            return "含 placeholder 或不支援的執行步驟,需人工修正後才能發布"
    return _shorten(preview, 120)
|
||||
|
||||
|
||||
def format_runbook_review_card(
    incident: object,
    entry_id: str,
    content: str,
) -> str:
    """Format the Telegram runbook review card (HTML markup).

    2026-05-07 Codex — replaces the plain-text Markdown preview with a
    governance card so that SREs can quickly judge the knowledge status, the
    affected services and what to focus on during review.

    Every dynamic field is length-bounded *before* it is HTML-escaped and
    embedded. Slicing the finished markup instead could cut through a
    ``<code>`` tag or an ``&amp;``-style entity and leave malformed HTML, so
    the field bounds are what keep the card under ``_CARD_MAX_LEN``.

    Args:
        incident: Object read via ``getattr`` for ``incident_id`` and
            ``affected_services``; missing attributes fall back to "unknown".
        entry_id: Identifier of the knowledge entry awaiting review.
        content: Runbook Markdown from which the symptom and execution
            previews are extracted.

    Returns:
        The card text, at most ``_CARD_MAX_LEN`` characters long.
    """
    # Field bounds: with worst-case escaping (5x for "&") the dynamic parts
    # total 3000 characters, leaving room for the static text under 3600.
    incident_id = _shorten(str(getattr(incident, "incident_id", "unknown")), 80)
    service_names = getattr(incident, "affected_services", None) or []
    # str() each entry so a non-string service name cannot break the join.
    services = _shorten(", ".join(str(name) for name in service_names), 200) or "unknown"
    entry_ref = _shorten(str(entry_id), 80)

    symptom = _section_preview(content, "症狀", fallback=f"Incident {incident_id} 的修復知識待審核")
    step = _step_preview(content)

    message = (
        "📄 <b>RUNBOOK REVIEW|待審核</b>\n"
        "──────────────────────\n"
        f"📋 Incident:<code>{_html(incident_id)}</code>\n"
        f"🧩 受影響服務:<code>{_html(services)}</code>\n"
        "🧠 知識狀態:<b>DRAFT|需人工審核</b>\n"
        f"🗂️ Entry ID:<code>{_html(entry_ref)}</code>\n\n"
        "🧾 <b>內容摘要</b>\n"
        f"├ 症狀:{_html(symptom)}\n"
        f"└ 執行:{_html(step)}\n\n"
        "✅ <b>審核重點</b>\n"
        "1. 確認步驟可重跑,且不含 placeholder / 不支援 scheme\n"
        "2. 補齊適用條件、rollback 與驗證方式\n\n"
        "🔎 AwoooP:知識庫 / Runbook Review"
    )
    # Backstop only: with the field bounds above the card is already shorter
    # than the cap, so this slice does not cut into markup.
    return message[:_CARD_MAX_LEN]
|
||||
|
||||
|
||||
class NemotronRunbookGenerator:
|
||||
"""
|
||||
@@ -109,7 +198,7 @@ class NemotronRunbookGenerator:
|
||||
playbook_id=playbook.playbook_id,
|
||||
)
|
||||
|
||||
await self._push_runbook_review_card(incident, entry.id, content[:200])
|
||||
await self._push_runbook_review_card(incident, entry.id, content)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
@@ -300,13 +389,7 @@ class NemotronRunbookGenerator:
|
||||
try:
|
||||
from src.services.telegram_gateway import get_telegram_gateway
|
||||
tg = get_telegram_gateway()
|
||||
await tg.send_text(
|
||||
f"📄 <b>Auto Runbook 待審核</b>\n"
|
||||
f"Incident: <code>{incident.incident_id}</code>\n"
|
||||
f"Entry ID: <code>{entry_id}</code>\n\n"
|
||||
f"<i>{content_preview}...</i>\n\n"
|
||||
f"請至知識庫審核並發布。"
|
||||
)
|
||||
await tg.send_text(format_runbook_review_card(incident, entry_id, content_preview))
|
||||
except Exception as e:
|
||||
logger.warning("runbook_review_card_failed", error=str(e))
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@ from src.services.failover_alerter import (
|
||||
_lines_from_list,
|
||||
_sanitize_telegram_error,
|
||||
configure_alerter,
|
||||
format_governance_alert_card,
|
||||
get_failover_alerter,
|
||||
reset_failover_alerter,
|
||||
)
|
||||
@@ -249,3 +250,57 @@ def test_sanitize_telegram_error_redacts_bot_token_url() -> None:
|
||||
|
||||
assert "SECRET" not in sanitized
|
||||
assert "bot<redacted>" in sanitized
|
||||
|
||||
|
||||
def test_governance_alert_card_formats_knowledge_degradation() -> None:
    """A knowledge_degradation payload renders as a labelled card, not raw key/value text."""
    impact = {
        "stale_count": 948,
        "stale_days": 7,
        "stale_ratio": 0.521,
        "threshold": 0.2,
        "total_count": 1819,
    }
    remediation = {
        "items": [
            "啟動 KM 反查與自動補齊流程",
            "關鍵服務告警自動同步到 KM 任務",
        ],
        "next_action": "run_kb_growth_healthcheck",
    }
    actionable = {
        "items": [
            "每日檢查 ANTI_PATTERN 更新結果",
            "安排 owner 對 stale 條目做快速人工審核",
        ]
    }

    card = format_governance_alert_card(
        "knowledge_degradation",
        {
            "status": "warning",
            "impact": impact,
            "remediation": remediation,
            "actionable": actionable,
        },
    )

    expected_fragments = (
        "*AI 治理警報|知識庫劣化*",
        "🧭 *影響摘要*",
        "陳舊 KM:948",
        "陳舊比例:52\\.1%",
        "▶️ 下一步:run\\_kb\\_growth\\_healthcheck",
    )
    for fragment in expected_fragments:
        assert fragment in card
    # The old fallback section heading must be absent from the card.
    assert "欄位快覽" not in card
|
||||
|
||||
|
||||
def test_governance_alert_card_limits_fallback_fields() -> None:
    """With five extra top-level fields the card shows a "more fields" notice and omits the last one."""
    payload = {
        "status": "warning",
        "field_a": "a",
        "field_b": "b",
        "field_c": "c",
        "field_d": "d",
        "field_e": "e",
    }

    card = format_governance_alert_card("custom_signal", payload)

    for fragment in ("📎 *補充欄位*", "更多欄位已收斂至 AwoooP 稽核資料"):
        assert fragment in card
    assert "field\\_e" not in card
|
||||
|
||||
@@ -14,6 +14,7 @@ ADR-052 / Phase 25 P1: NemotronRunbookGenerator + AntiPattern Gate
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
|
||||
# Source file paths
|
||||
_BASE = Path(__file__).parent.parent / "src"
|
||||
@@ -80,6 +81,30 @@ class TestRunbookGeneratorModule:
|
||||
source = _RUNBOOK_GEN.read_text()
|
||||
assert "fallback" in source
|
||||
|
||||
def test_runbook_review_card_is_structured_html(self):
    """The Telegram runbook review message must be a scannable governance card, not a dump of the raw Markdown."""
    from src.services.runbook_generator import format_runbook_review_card

    markdown = (
        "## 症狀描述\n"
        "Incident INC-20260506-E54736,受影響服務:node-exporter-110\n\n"
        "## 執行步驟\n"
        "- Step 1: ssh{host} echo '=== LOAD ===' -> FAILED: Unsupported scheme\n"
    )
    incident = SimpleNamespace(
        incident_id="INC-20260506-E54736",
        affected_services=["node-exporter-110"],
    )

    card = format_runbook_review_card(incident, "ff5eff01-7243-44bf", markdown)

    must_contain = (
        "<b>RUNBOOK REVIEW|待審核</b>",
        "<code>INC-20260506-E54736</code>",
        "🧾 <b>內容摘要</b>",
        "placeholder 或不支援的執行步驟",
    )
    for fragment in must_contain:
        assert fragment in card
    # Raw Markdown headings and unfilled placeholders must not leak through.
    for fragment in ("## 症狀描述", "ssh{host}"):
        assert fragment not in card
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# TestAutoRepairService — fire-and-forget 與 GC 防洩漏
|
||||
|
||||
Reference in New Issue
Block a user