fix(telegram): format governance and runbook alerts
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
CD Pipeline / tests (push) Successful in 1m13s
CD Pipeline / build-and-deploy (push) Successful in 3m22s
CD Pipeline / post-deploy-checks (push) Successful in 1m28s

This commit is contained in:
Your Name
2026-05-07 00:58:20 +08:00
parent f046742a4f
commit 341c3b6523
4 changed files with 356 additions and 50 deletions

View File

@@ -134,46 +134,7 @@ class FailoverAlerter:
logger.debug("governance_alert_dedup_skipped", event_type=event_type)
return
status = _escape_md(str(payload.get("status", "warning")))
impact = _as_dict(payload.get("impact"))
remediation = _as_dict(payload.get("remediation"))
actionable = _as_dict(payload.get("actionable"))
impact_lines = _lines_from_dict(impact, max_items=12, compact=True)
remediation_lines = _lines_from_list(remediation.get("items"))
remediation_next_action = remediation.get("next_action")
remediation_hint = remediation.get("hint")
actionable_lines = _lines_from_list(actionable.get("items"))
next_action_line = ""
if remediation_next_action:
next_action_line = f"\n 下一步:{_escape_md(str(remediation_next_action))}"
if remediation_hint:
next_action_line += f"\n 提示:{_escape_md(str(remediation_hint))}"
sections: list[str] = [
"⚠️ *AI 治理警報*",
f"\n類型:{_escape_md(event_type)}",
f"狀態:{status}",
]
if impact_lines:
sections.append(f"\n*影響*\n{impact_lines}")
if remediation_lines or next_action_line:
sections.append("\n*修復方向*")
if remediation_lines:
sections.append(remediation_lines)
if next_action_line:
sections.append(next_action_line)
if actionable_lines:
sections.append(f"\n*可直接自動化*\n{actionable_lines}")
fallback_items = _fallback_pairs(payload, keep={"status", "impact", "remediation", "actionable"})
if fallback_items:
sections.append(
"\n*欄位快覽(備援)*\n" + "\n".join(fallback_items)
)
msg = "\n".join(sections)
msg = format_governance_alert_card(event_type, payload)
await self._send(msg)
logger.info("governance_alert_sent", event_type=event_type)
@@ -336,6 +297,180 @@ def _as_dict(value: Any) -> dict[str, Any]:
return value if isinstance(value, dict) else {}
# Display titles (Traditional Chinese) for the known governance event types.
# Consulted by _event_display_name(); an event type that is not listed here
# falls through to that function's "slo_" prefix rule or to a title-cased
# version of the raw type name.
_EVENT_DISPLAY_NAMES = {
"trust_drift": "信任漂移",
"knowledge_degradation": "知識庫劣化",
"governance_slo_data_gap": "SLO 資料缺口",
"governance_self_failure": "治理自檢失敗",
"llm_hallucination": "LLM 驗證失敗",
"execution_blast_radius": "執行風險擴大",
}
# Status word -> badge text (a colored dot followed by the status word).
# _status_badge() lower-cases the incoming status before looking it up here;
# a status with no entry is displayed as its plain text.
_STATUS_BADGES = {
"critical": "🔴 critical",
"error": "🔴 error",
"violation": "🔴 violation",
"warning": "🟡 warning",
"degraded": "🟠 degraded",
"ok": "🟢 ok",
}
# Per-event-type display profile for the "impact" dict of a governance payload.
# Each value is an ordered list of (impact key, display label) pairs.
# _profiled_rows() emits rows in this order for the keys that are present and
# then lists any remaining impact keys alphabetically under their raw key name.
_IMPACT_PROFILES: dict[str, list[tuple[str, str]]] = {
"trust_drift": [
("drifted_count", "漂移 Playbook"),
("total_playbooks", "總 Playbook"),
("drift_ratio", "漂移比例"),
("threshold", "警戒門檻"),
("auto_deprecated_count", "自動停用"),
],
"knowledge_degradation": [
("stale_count", "陳舊 KM"),
("total_count", "總 KM"),
("stale_ratio", "陳舊比例"),
("threshold", "警戒門檻"),
("stale_days", "陳舊天數"),
],
"governance_slo_data_gap": [
("reason", "缺口原因"),
("skipped_count", "略過指標"),
("all_slo_metrics_not_emitted", "SLO 指標缺失"),
],
"governance_self_failure": [
("failed_checks", "失敗檢查"),
("total_checks", "總檢查"),
("failure_rate", "失敗比例"),
],
"execution_blast_radius": [
("affected_services", "受影響服務"),
("blast_radius", "爆炸半徑"),
("threshold", "警戒門檻"),
],
"llm_hallucination": [
("failed", "驗證失敗"),
("rate", "失敗比例"),
("threshold", "警戒門檻"),
],
}
def _event_display_name(event_type: str) -> str:
    """Return the title shown in the card header for a governance event type.

    Resolution order:
      1. the entry in ``_EVENT_DISPLAY_NAMES`` when the type is listed there;
      2. the generic SLO-violation title for any other type starting with
         ``slo_``;
      3. otherwise the raw type with underscores replaced by spaces,
         stripped and title-cased.
    """
    try:
        return _EVENT_DISPLAY_NAMES[event_type]
    except KeyError:
        pass
    if not event_type.startswith("slo_"):
        return event_type.replace("_", " ").strip().title()
    return "SLO 違反"
def _status_badge(status: Any) -> str:
    """Return the badge text for *status*.

    A falsy status is treated as ``"warning"``. The lookup in
    ``_STATUS_BADGES`` is case-insensitive; a status with no badge is returned
    as its own text, original casing preserved.
    """
    label = str(status) if status else "warning"
    return _STATUS_BADGES.get(label.lower(), label)
def _format_metric_value(key: str, value: Any) -> str:
if isinstance(value, bool):
return "" if value else ""
if isinstance(value, (float, int)) and (
key.endswith("_ratio") or key in {"threshold", "rate", "failure_rate"}
):
return f"{float(value) * 100:.1f}%"
if isinstance(value, list):
if not value:
return "0"
shown = ", ".join(str(item) for item in value[:3])
if len(value) > 3:
shown += f"…(共 {len(value)}"
return shown
return str(value)
def _profiled_rows(event_type: str, data: dict[str, Any], *, max_rows: int = 8) -> list[str]:
if not data:
return []
used: set[str] = set()
rows: list[str] = []
for key, label in _IMPACT_PROFILES.get(event_type, []):
if key in data:
rows.append(f"{label}{_format_metric_value(key, data[key])}")
used.add(key)
for key in sorted(data.keys()):
if len(rows) >= max_rows:
break
if key in used:
continue
rows.append(f"{key}{_format_metric_value(key, data[key])}")
if len(data) > len(used) + max(0, max_rows - len(rows)):
rows.append("更多欄位已收斂至 AwoooP 稽核資料")
return rows[:max_rows]
def _tree_lines(rows: list[str]) -> str:
if not rows:
return ""
rendered: list[str] = []
for idx, row in enumerate(rows):
branch = "" if idx == len(rows) - 1 else ""
rendered.append(f"{branch} {_escape_md(str(row))}")
return "\n".join(rendered)
def _governance_summary_lines(event_type: str, impact: dict[str, Any]) -> str:
    """Format the impact dict of a governance event as tree-style summary text.

    This is ``_profiled_rows`` followed by ``_tree_lines``; the result is
    whatever ``_tree_lines`` returns for those rows.
    """
    return _tree_lines(_profiled_rows(event_type, impact))
def format_governance_alert_card(event_type: str, payload: dict[str, Any]) -> str:
    """Build the Telegram card text for an AI governance alert.

    2026-05-07 Codex — the governance payload itself is kept as-is; only at the
    Telegram boundary are the raw key/value pairs turned into a scannable
    card, so that large numbers of plain-text fields do not flood the chat.

    Layout, top to bottom (a section is omitted when it has nothing to show):
    header with display name, raw type and status badge; impact summary;
    remediation items / next action / hint; automatable work items; and up to
    four other top-level payload fields.

    Args:
        event_type: Governance event type.
        payload: Alert payload; a non-dict value is treated as empty.

    Returns:
        The card as one newline-joined string.
    """
    if not isinstance(payload, dict):
        payload = {}
    impact = _as_dict(payload.get("impact"))
    remediation = _as_dict(payload.get("remediation"))
    actionable = _as_dict(payload.get("actionable"))
    raw_status = payload.get("status", "warning")

    title = _escape_md(_event_display_name(event_type))
    card: list[str] = [
        f"⚠️ *AI 治理警報|{title}*",
        "──────────────────────",
        f"類型:{_escape_md(event_type)}",
        f"狀態:{_escape_md(_status_badge(raw_status))}",
    ]

    summary = _governance_summary_lines(event_type, impact)
    if summary:
        card += ["", "🧭 *影響摘要*", summary]

    # Collect the remediation lines first; the heading is added only when at
    # least one of them exists.
    fix_items = _lines_from_list(remediation.get("items"))
    next_step = remediation.get("next_action")
    hint = remediation.get("hint")
    remediation_block: list[str] = []
    if fix_items:
        remediation_block.append(fix_items)
    if next_step:
        remediation_block.append(f"▶️ 下一步:{_escape_md(str(next_step))}")
    if hint:
        remediation_block.append(f"💡 提示:{_escape_md(str(hint))}")
    if remediation_block:
        card += ["", "🛠️ *修復方向*", *remediation_block]

    auto_items = _lines_from_list(actionable.get("items"))
    if auto_items:
        card += ["", "🤖 *可自動化工作*", auto_items]

    extras = _fallback_pairs(
        payload,
        keep={"status", "impact", "remediation", "actionable"},
        max_items=4,
    )
    if extras:
        card += ["", "📎 *補充欄位*", "\n".join(extras)]

    return "\n".join(card)
def _lines_from_dict(data: dict[str, Any], max_items: int = 20, compact: bool = False) -> str:
if not data:
return ""
@@ -360,7 +495,12 @@ def _lines_from_list(value: Any) -> str:
)
def _fallback_pairs(payload: dict[str, Any], keep: set[str] | None = None) -> list[str]:
def _fallback_pairs(
payload: dict[str, Any],
keep: set[str] | None = None,
*,
max_items: int | None = None,
) -> list[str]:
if not isinstance(payload, dict):
return []
keep = set(keep or set())
@@ -368,6 +508,9 @@ def _fallback_pairs(payload: dict[str, Any], keep: set[str] | None = None) -> li
for key in sorted(payload.keys()):
if key in keep:
continue
if max_items is not None and len(rows) >= max_items:
rows.append(_escape_md("更多欄位已收斂至 AwoooP 稽核資料"))
break
rows.append(f"{_escape_md(str(key))}{_escape_md(str(payload.get(key)))}")
return rows

View File

@@ -24,12 +24,19 @@ Runbook Generator - Phase 25 P1 Knowledge Auto-Harvesting
from __future__ import annotations
import asyncio
import html
import re
import time
from typing import TYPE_CHECKING
import structlog
from src.models.knowledge import EntrySource, EntryStatus, EntryType, KnowledgeEntryCreate
from src.models.knowledge import (
EntrySource,
EntryStatus,
EntryType,
KnowledgeEntryCreate,
)
if TYPE_CHECKING:
from src.models.incident import Incident
@@ -38,6 +45,88 @@ if TYPE_CHECKING:
logger = structlog.get_logger(__name__)
# Length cap applied to the finished runbook review card
# (format_runbook_review_card slices its result to this many characters).
_CARD_MAX_LEN = 3600
# A Markdown ATX heading line: one to six "#", whitespace, then the heading
# text, which is captured as the named group "title".
_SECTION_RE = re.compile(r"^#{1,6}\s+(?P<title>.+?)\s*$")
# A leading list marker: "-", "*", or digits followed by "." or ")", together
# with any whitespace around it.
_BULLET_RE = re.compile(r"^\s*(?:[-*]|\d+[.)])\s*")
def _html(text: object) -> str:
return html.escape(str(text), quote=False)
def _shorten(text: object, limit: int = 120) -> str:
compact = " ".join(str(text or "").split())
if len(compact) <= limit:
return compact
return compact[: max(0, limit - 1)].rstrip() + ""
def _clean_preview_line(line: str) -> str:
    """Strip Markdown decoration from a single line for use in a preview.

    In order: a line that is a heading is blanked, a leading list marker is
    removed, backticks are dropped, and whitespace runs are collapsed to
    single spaces.
    """
    without_heading = _SECTION_RE.sub("", line.strip())
    without_bullet = _BULLET_RE.sub("", without_heading).strip()
    return " ".join(without_bullet.replace("`", "").split())
def _section_preview(content: str, title_keyword: str, *, fallback: str) -> str:
"""從 Markdown 內容抽一行可讀摘要,避免把整段 Runbook 原文丟進 Telegram。"""
lines = str(content or "").splitlines()
in_section = False
for raw_line in lines:
line = raw_line.strip()
if not line:
continue
heading = _SECTION_RE.match(line)
if heading:
in_section = title_keyword in heading.group("title")
continue
if not in_section:
continue
preview = _clean_preview_line(line)
if preview:
return _shorten(preview, 120)
return fallback
def _step_preview(content: str) -> str:
    """Return a one-line preview of the runbook's execution section.

    The preview is taken from the first section whose heading contains
    "執行". If it contains an unresolved placeholder or one of the known
    failure messages, a fixed warning is returned instead of the step text.
    """
    blocked_tokens = ("{host}", "{target}", "Unsupported scheme", "Invalid component name")
    preview = _section_preview(content, "執行", fallback="待審核 Runbook 執行步驟")
    for token in blocked_tokens:
        if token in preview:
            return "含 placeholder 或不支援的執行步驟,需人工修正後才能發布"
    return _shorten(preview, 120)
def format_runbook_review_card(
    incident: object,
    entry_id: str,
    content: str,
) -> str:
    """Format the Telegram review card (HTML) for an auto-generated runbook.

    2026-05-07 Codex — replaces the plain-text Markdown preview with a
    governance card so that an SRE can quickly judge the knowledge status,
    the affected services and the review focus.

    Every dynamic field is shortened *before* it is HTML-escaped. Cutting the
    finished markup instead could land inside a tag or an entity and leave
    malformed HTML.

    Args:
        incident: Object that may expose ``incident_id`` and
            ``affected_services``; missing or empty values show as "unknown".
        entry_id: Knowledge entry id of the draft runbook.
        content: Runbook Markdown; only a one-line symptom preview and a
            one-line step preview are taken from it.

    Returns:
        The HTML card text, at most ``_CARD_MAX_LEN`` characters long.
    """
    incident_id = _shorten(getattr(incident, "incident_id", None) or "unknown", 64)

    raw_services = getattr(incident, "affected_services", None) or []
    if isinstance(raw_services, str):
        # A bare string would otherwise be joined character by character.
        raw_services = [raw_services]
    # str() keeps join() from failing on non-string items.
    services = _shorten(", ".join(str(item) for item in raw_services), 200) or "unknown"

    entry_label = _shorten(str(entry_id), 64)
    symptom = _section_preview(content, "症狀", fallback=f"Incident {incident_id} 的修復知識待審核")
    step = _step_preview(content)
    message = (
        "📄 <b>RUNBOOK REVIEW待審核</b>\n"
        "──────────────────────\n"
        f"📋 Incident<code>{_html(incident_id)}</code>\n"
        f"🧩 受影響服務:<code>{_html(services)}</code>\n"
        "🧠 知識狀態:<b>DRAFT需人工審核</b>\n"
        f"🗂️ Entry ID<code>{_html(entry_label)}</code>\n\n"
        "🧾 <b>內容摘要</b>\n"
        f"├ 症狀:{_html(symptom)}\n"
        f"└ 執行:{_html(step)}\n\n"
        "✅ <b>審核重點</b>\n"
        "1. 確認步驟可重跑,且不含 placeholder / 不支援 scheme\n"
        "2. 補齊適用條件、rollback 與驗證方式\n\n"
        "🔎 AwoooP知識庫 / Runbook Review"
    )
    # Backstop only: with the field caps above the card stays well below the
    # limit, so this slice is not expected to cut into the markup.
    return message[:_CARD_MAX_LEN]
class NemotronRunbookGenerator:
"""
@@ -109,7 +198,7 @@ class NemotronRunbookGenerator:
playbook_id=playbook.playbook_id,
)
await self._push_runbook_review_card(incident, entry.id, content[:200])
await self._push_runbook_review_card(incident, entry.id, content)
except Exception as e:
logger.error(
@@ -300,13 +389,7 @@ class NemotronRunbookGenerator:
try:
from src.services.telegram_gateway import get_telegram_gateway
tg = get_telegram_gateway()
await tg.send_text(
f"📄 <b>Auto Runbook 待審核</b>\n"
f"Incident: <code>{incident.incident_id}</code>\n"
f"Entry ID: <code>{entry_id}</code>\n\n"
f"<i>{content_preview}...</i>\n\n"
f"請至知識庫審核並發布。"
)
await tg.send_text(format_runbook_review_card(incident, entry_id, content_preview))
except Exception as e:
logger.warning("runbook_review_card_failed", error=str(e))

View File

@@ -22,6 +22,7 @@ from src.services.failover_alerter import (
_lines_from_list,
_sanitize_telegram_error,
configure_alerter,
format_governance_alert_card,
get_failover_alerter,
reset_failover_alerter,
)
@@ -249,3 +250,57 @@ def test_sanitize_telegram_error_redacts_bot_token_url() -> None:
assert "SECRET" not in sanitized
assert "bot<redacted>" in sanitized
def test_governance_alert_card_formats_knowledge_degradation() -> None:
    """A knowledge_degradation payload renders as a labelled card.

    Checks the header title, the impact summary with profile labels and
    percentage formatting, the escaped next action, and that the legacy
    "欄位快覽" fallback section is gone.
    """
    card = format_governance_alert_card(
        "knowledge_degradation",
        {
            "status": "warning",
            "impact": {
                "stale_count": 948,
                "stale_days": 7,
                "stale_ratio": 0.521,
                "threshold": 0.2,
                "total_count": 1819,
            },
            "remediation": {
                "items": [
                    "啟動 KM 反查與自動補齊流程",
                    "關鍵服務告警自動同步到 KM 任務",
                ],
                "next_action": "run_kb_growth_healthcheck",
            },
            "actionable": {
                "items": [
                    "每日檢查 ANTI_PATTERN 更新結果",
                    "安排 owner 對 stale 條目做快速人工審核",
                ]
            },
        },
    )
    assert "*AI 治理警報|知識庫劣化*" in card
    assert "🧭 *影響摘要*" in card
    assert "陳舊 KM948" in card
    assert "陳舊比例52\\.1%" in card
    # The formatter emits "▶️ 下一步:" with a full-width colon before the
    # escaped action name, so the expectation has to include it.
    assert "▶️ 下一步:run\\_kb\\_growth\\_healthcheck" in card
    assert "欄位快覽" not in card
def test_governance_alert_card_limits_fallback_fields() -> None:
    """Extra top-level fields are capped and the overflow notice is shown.

    Five extra fields are supplied for an event type with no profile; the
    supplementary section must appear, carry the notice, and leave out the
    alphabetically last field.
    """
    payload = {
        "status": "warning",
        "field_a": "a",
        "field_b": "b",
        "field_c": "c",
        "field_d": "d",
        "field_e": "e",
    }
    card = format_governance_alert_card("custom_signal", payload)

    for expected in ("📎 *補充欄位*", "更多欄位已收斂至 AwoooP 稽核資料"):
        assert expected in card
    assert "field\\_e" not in card

View File

@@ -14,6 +14,7 @@ ADR-052 / Phase 25 P1: NemotronRunbookGenerator + AntiPattern Gate
"""
from pathlib import Path
from types import SimpleNamespace
# Source file paths
_BASE = Path(__file__).parent.parent / "src"
@@ -80,6 +81,30 @@ class TestRunbookGeneratorModule:
source = _RUNBOOK_GEN.read_text()
assert "fallback" in source
def test_runbook_review_card_is_structured_html(self):
    """The Telegram runbook review message must be a scannable governance card and must not dump the raw Markdown."""
    from src.services.runbook_generator import format_runbook_review_card

    markdown = (
        "## 症狀描述\n"
        "Incident INC-20260506-E54736受影響服務node-exporter-110\n\n"
        "## 執行步驟\n"
        "- Step 1: ssh{host} echo '=== LOAD ===' -> FAILED: Unsupported scheme\n"
    )
    incident = SimpleNamespace(
        incident_id="INC-20260506-E54736",
        affected_services=["node-exporter-110"],
    )

    card = format_runbook_review_card(incident, "ff5eff01-7243-44bf", markdown)

    # Structured HTML pieces that must be present.
    must_contain = (
        "<b>RUNBOOK REVIEW待審核</b>",
        "<code>INC-20260506-E54736</code>",
        "🧾 <b>內容摘要</b>",
        "placeholder 或不支援的執行步驟",
    )
    for fragment in must_contain:
        assert fragment in card
    # Raw Markdown and the unresolved placeholder must not leak through.
    for fragment in ("## 症狀描述", "ssh{host}"):
        assert fragment not in card
# =============================================================================
# TestAutoRepairService — fire-and-forget 與 GC 防洩漏