fix(governance): clarify knowledge degradation alerts

2026-05-19 15:14:40 +08:00
parent 038f1a0d6d
commit 795c9a4e93
3 changed files with 66 additions and 4 deletions
--- a/apps/api/src/services/failover_alerter.py
+++ b/apps/api/src/services/failover_alerter.py
@@ -299,7 +299,7 @@ def _as_dict(value: Any) -> dict[str, Any]:

 _EVENT_DISPLAY_NAMES = {
    "trust_drift": "信任漂移",
-    "knowledge_degradation": "知識庫劣化",
+    "knowledge_degradation": "KM 需要更新（影響 AI 判斷）",
    "governance_slo_data_gap": "SLO 資料缺口",
    "governance_self_failure": "治理自檢失敗",
    "llm_hallucination": "LLM 驗證失敗",
@@ -421,6 +421,59 @@ def _governance_summary_lines(event_type: str, impact: dict[str, Any]) -> str:
    return _tree_lines(rows)


+def _governance_operator_context(event_type: str, impact: dict[str, Any]) -> list[str]:
+    """Return operator-facing guidance lines for a governance alert card.
+
+    The governance loop stores machine-readable details in AwoooP; Telegram
+    needs a shorter "what this means / what to do now" layer. Only
+    ``knowledge_degradation`` yields lines; every other event type returns [].
+    """
+    if event_type != "knowledge_degradation":
+        return []
+
+    stale_count = impact.get("stale_count", "?")
+    total_count = impact.get("total_count", "?")
+    stale_days = impact.get("stale_days", "?")
+    threshold = _format_metric_value("threshold", impact.get("threshold", 0.2))
+    stale_ratio = _format_metric_value("stale_ratio", impact.get("stale_ratio", 0))
+
+    plain_summary = (
+        f"{stale_count} / {total_count} 筆 KM 超過 {stale_days} 天未更新，"
+        "AI 做告警分類、規則匹配、PlayBook 推薦時可能引用舊資訊。"
+    )
+    policy_summary = (
+        "這是治理品質警報，不是服務故障；目標是把 stale ratio "
+        f"{stale_ratio} 降到門檻 {threshold} 以下。"
+    )
+    # The close-out step below reuses `threshold` so it cannot contradict policy_summary.
+    return [
+        "",
+        "💬 *白話說明*",
+        _escape_md(plain_summary),
+        _escape_md(policy_summary),
+        "",
+        "🧩 *AI 流程狀態*",
+        _tree_lines(
+            [
+                "階段：detected → queued_kb_healthcheck → waiting_owner_review",
+                "AI 已做：統計 stale KM，產生補齊與審核方向",
+                "AI 可做：反查 Incident / Sentry / SigNoz / PlayBook，產生 KM 更新草稿與任務",
+                "需要人工：owner 審核高影響 KM 內容，避免 AI 自動寫入錯誤知識",
+            ]
+        ),
+        "",
+        "✅ *現在要做*",
+        _lines_from_list(
+            [
+                "確認 run_kb_growth_healthcheck 是否已排程或已執行",
+                "到 AwoooP Work Items / AI 治理篩選 knowledge_degradation",
+                "優先審核最近被告警、Sentry、SigNoz、PlayBook 引用的 KM",
+                f"不用重啟服務；等 stale_ratio 降到 {threshold} 以下再關閉治理警報",
+            ]
+        ),
+    ]
+
+
 def format_governance_alert_card(event_type: str, payload: dict[str, Any]) -> str:
    """格式化 AI 治理 Telegram 卡片。

@@ -440,6 +493,8 @@ def format_governance_alert_card(event_type: str, payload: dict[str, Any]) -> st
        f"狀態：{_escape_md(_status_badge(status))}",
    ]

+    sections.extend(_governance_operator_context(event_type, impact))
+
    impact_lines = _governance_summary_lines(event_type, impact)
    if impact_lines:
        sections.extend(["", "🧭 *影響摘要*", impact_lines])
@@ -460,9 +515,10 @@ def format_governance_alert_card(event_type: str, payload: dict[str, Any]) -> st
    if actionable_lines:
        sections.extend(["", "🤖 *可自動化工作*", actionable_lines])

+    profiled_keys = {key for key, _label in _IMPACT_PROFILES.get(event_type, [])}
    fallback_items = _fallback_pairs(
        payload,
-        keep={"status", "impact", "remediation", "actionable"},
+        keep={"status", "impact", "remediation", "actionable", *profiled_keys},
        max_items=4,
    )
    if fallback_items:
--- a/apps/api/src/services/governance_agent.py
+++ b/apps/api/src/services/governance_agent.py
@@ -227,7 +227,7 @@ class GovernanceAgent:
                    "actionable": {
                        "items": [
                            "每日檢查 ANTI_PATTERN 更新結果",
-                            "安排至少 2 位 owner 對 stale條目做快速人工審核",
+                            "安排至少 2 位 owner 對 stale 條目做快速人工審核",
                        ],
                    },
                    "stale_count": stale,
--- a/apps/api/tests/test_failover_alerter.py
+++ b/apps/api/tests/test_failover_alerter.py
@@ -280,11 +280,17 @@ def test_governance_alert_card_formats_knowledge_degradation() -> None:
        },
    )

-    assert "*AI 治理警報｜知識庫劣化*" in card
+    assert "*AI 治理警報｜KM 需要更新" in card
+    assert "💬 *白話說明*" in card
+    assert "🧩 *AI 流程狀態*" in card
+    assert "✅ *現在要做*" in card
+    assert "queued\\_kb\\_healthcheck" in card
+    assert "AwoooP Work Items" in card
    assert "🧭 *影響摘要*" in card
    assert "陳舊 KM：948" in card
    assert "陳舊比例：52\\.1%" in card
    assert "▶️ 下一步：run\\_kb\\_growth\\_healthcheck" in card
+    assert "📎 *補充欄位*" not in card
    assert "欄位快覽" not in card