feat(governance): normalize AI治理告警輸出與元告警解析度

2026-05-02 23:49:59 +08:00
parent a38d911213
commit b710f3f38f
7 changed files with 325 additions and 31 deletions
--- a/apps/api/src/jobs/ai_slo_watchdog_job.py
+++ b/apps/api/src/jobs/ai_slo_watchdog_job.py
@@ -143,20 +143,36 @@ async def _check_once() -> None:
        return
    await redis.setex(dedup_key, _DEDUP_TTL_SEC, "1")

+    violation_lines = [
+        f"{idx + 1}. {item}" for idx, item in enumerate(violations)
+    ]
+    diagnosis = "AI 自健診異常"
+    system_impact = "\n".join(
+        [
+            f"檢出 {len(violations)} 項 KPI 異常（W-1~W-6）",
+            f"關鍵影響：飛輪自動化能力可能降級",
+            *violation_lines,
+        ]
+    )
+    probable_cause = "治理異常與執行資料同時異常，建議先核對 AI SLO 指標與最近自修復任務執行紀錄"
+
    # 發送 TYPE-8M Meta-System 告警
-        diagnosis = " | ".join(violations)
-        incident_id = f"META-{now_taipei().strftime('%Y%m%d%H%M%S')}"
-        try:
-            from src.services.telegram_gateway import get_telegram_gateway
-            await get_telegram_gateway().send_meta_alert(
-                incident_id=incident_id,
+    # Severity triage: 2 or more violated KPIs escalate to critical; a single violation stays at warning.
+    severity = "critical" if len(violations) >= 2 else "warning"
+    incident_id = f"META-{now_taipei().strftime('%Y%m%d%H%M%S')}"
+    try:
+        from src.services.telegram_gateway import get_telegram_gateway
+
+        await get_telegram_gateway().send_meta_alert(
+            incident_id=incident_id,
            approval_id=str(uuid.uuid4()),
            alertname="AI 自健診異常",
-                alert_category="flywheel_health",
-                diagnosis=diagnosis,
-                severity_level="critical",
-                system_impact=f"{len(violations)} 項 KPI 異常（W-1~W-6），飛輪自動化能力可能降級",
-            )
+            alert_category="flywheel_health",
+            diagnosis=diagnosis,
+            severity_level=severity,
+            system_impact=system_impact,
+            probable_cause=probable_cause,
+        )
        logger.warning(
            "ai_slo_watchdog_alert_sent",
            incident_id=incident_id,
--- a/apps/api/src/services/failover_alerter.py
+++ b/apps/api/src/services/failover_alerter.py
@@ -102,16 +102,46 @@ class FailoverAlerter:
            logger.debug("governance_alert_dedup_skipped", event_type=event_type)
            return

-        # 格式化 payload 為可讀字串（key=value，換行分隔）
-        detail_lines = "\n".join(
-            f"{_escape_md(str(k))}：{_escape_md(str(v))}"
-            for k, v in payload.items()
-        )
-        msg = (
-            f"*AI 治理警報*\n\n"
-            f"類型：{_escape_md(event_type)}\n\n"
-            f"{detail_lines}"
-        )
+        status = _escape_md(str(payload.get("status", "warning")))
+        impact = _as_dict(payload.get("impact"))
+        remediation = _as_dict(payload.get("remediation"))
+        actionable = _as_dict(payload.get("actionable"))
+
+        impact_lines = _lines_from_dict(impact, max_items=12, compact=True)
+        remediation_lines = _lines_from_list(remediation.get("items"))
+        remediation_next_action = remediation.get("next_action")
+        remediation_hint = remediation.get("hint")
+        actionable_lines = _lines_from_list(actionable.get("items"))
+
+        next_action_line = ""
+        if remediation_next_action:
+            next_action_line = f"\n  下一步：{_escape_md(str(remediation_next_action))}"
+        if remediation_hint:
+            next_action_line += f"\n  提示：{_escape_md(str(remediation_hint))}"
+
+        sections: list[str] = [
+            "⚠️ *AI 治理警報*",
+            f"\n類型：{_escape_md(event_type)}",
+            f"狀態：{status}",
+        ]
+        if impact_lines:
+            sections.append(f"\n*影響*\n{impact_lines}")
+        if remediation_lines or next_action_line:
+            sections.append(f"\n*修復方向*")
+            if remediation_lines:
+                sections.append(remediation_lines)
+            if next_action_line:
+                sections.append(next_action_line)
+        if actionable_lines:
+            sections.append(f"\n*可直接自動化*\n{actionable_lines}")
+
+        fallback_items = _fallback_pairs(payload, keep={"status", "impact", "remediation", "actionable"})
+        if fallback_items:
+            sections.append(
+                "\n*欄位快覽（備援）*\n" + "\n".join(fallback_items)
+            )
+
+        msg = "\n".join(sections)
        await self._send(msg)
        logger.info("governance_alert_sent", event_type=event_type)

@@ -259,6 +289,46 @@ def _escape_md(text: str) -> str:
    return text


+def _as_dict(value: Any) -> dict[str, Any]:
+    """Return ``value`` unchanged when it is a dict, otherwise an empty dict.
+
+    Lets callers chain ``.get(...)`` on optional payload sections without
+    first checking their type.
+    """
+    if isinstance(value, dict):
+        return value
+    return {}
+
+
+def _lines_from_dict(data: dict[str, Any], max_items: int = 20, compact: bool = False) -> str:
+    """Render a dict as two-space-indented ``key：value`` lines, sorted by key.
+
+    Keys and values are stringified and passed through ``_escape_md``.
+
+    Args:
+        data: Mapping to render. Anything that is empty or not a dict yields "".
+        max_items: Maximum number of entries rendered.
+        compact: When true, append a "more fields omitted" marker row if
+            entries were actually dropped because of ``max_items``.
+
+    Returns:
+        The rendered lines joined with newlines, or "" when nothing is rendered.
+    """
+    if not data or not isinstance(data, dict):
+        return ""
+    keys = sorted(data)
+    # Clamp the limit so a negative value renders nothing instead of slicing
+    # from the end of the key list.
+    shown = keys[: max(max_items, 0)]
+    rows = [f"{_escape_md(str(k))}：{_escape_md(str(data.get(k)))}" for k in shown]
+    # Compare against the real key count: the former ``len(rows) >= max_items``
+    # check also fired when the dict held exactly ``max_items`` entries and
+    # announced omitted fields that did not exist.
+    if compact and len(keys) > len(shown):
+        rows.append("...（更多欄位略）")
+    return "\n".join(f"  {line}" for line in rows)
+
+
+def _lines_from_list(value: Any) -> str:
+    """Render a list as 1-based numbered lines, each indented by two spaces.
+
+    Every item is stringified and passed through ``_escape_md``. Any value
+    that is not a ``list`` produces an empty string.
+    """
+    if isinstance(value, list):
+        numbered = []
+        for position, entry in enumerate(value, start=1):
+            numbered.append(f"  {position}. {_escape_md(str(entry))}")
+        return "\n".join(numbered)
+    return ""
+
+
+def _fallback_pairs(payload: dict[str, Any], keep: set[str] | None = None) -> list[str]:
+    """Build ``key：value`` rows for the payload keys not listed in ``keep``.
+
+    Note that, despite its name, ``keep`` holds the keys to leave OUT of the
+    result. Rows are ordered by sorted key; keys and values are stringified
+    and passed through ``_escape_md``. A non-dict payload yields an empty list.
+    """
+    if not isinstance(payload, dict):
+        return []
+    skipped = set(keep) if keep else set()
+    return [
+        f"{_escape_md(str(name))}：{_escape_md(str(payload.get(name)))}"
+        for name in sorted(payload.keys())
+        if name not in skipped
+    ]
+
+
 # =============================================================================
 # Singleton
 # =============================================================================
--- a/apps/api/src/services/governance_agent.py
+++ b/apps/api/src/services/governance_agent.py
@@ -142,6 +142,13 @@ class GovernanceAgent:
                        ],
                        "sample_playbook_ids": kept_ids[:10],
                    },
+                    "drifted_count": len(drifted),
+                    "auto_deprecated_count": len(auto_deprecated_ids),
+                    "auto_deprecated_ids": auto_deprecated_ids[:10],
+                    "playbook_ids": kept_ids[:10],
+                    "total_playbooks": total,
+                    "threshold": TRUST_DRIFT_THRESHOLD,
+                    "auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS,
                },
            )

@@ -215,6 +222,11 @@ class GovernanceAgent:
                            "安排至少 2 位 owner 對 stale條目做快速人工審核",
                        ],
                    },
+                    "stale_count": stale,
+                    "total_count": total,
+                    "stale_ratio": round(ratio, 3),
+                    "threshold": KM_STALE_RATIO,
+                    "stale_days": KM_STALE_DAYS,
                },
            )

@@ -260,6 +272,27 @@ class GovernanceAgent:
            await self._alert(
                "llm_hallucination",
                {
+                    "status": "warning",
+                    "impact": {
+                        "failed_count": failed,
+                        "total_checked": total,
+                        "hallucination_rate": round(rate, 3),
+                        "threshold": HALLUCINATION_RATE_THRESHOLD,
+                    },
+                    "remediation": {
+                        "items": [
+                            "檢核 AI 建議來源與 evidence snapshot 一致性",
+                            "檢視最近 incident 的 verifier 輸入欄位是否缺失關鍵上下文",
+                        ],
+                        "next_action": "run_knowledge_gap_audit",
+                        "hint": "高失敗率通常表示 evidence 收斂流程退化或資料欄位解讀偏差",
+                    },
+                    "actionable": {
+                        "items": [
+                            "啟動 `playbook_evidence` 對齊補償流程",
+                            "調整 verify timeout 與降級策略，避免過度信任低品質證據",
+                        ],
+                    },
                    "failed_count": failed,
                    "total_checked": total,
                    "hallucination_rate": round(rate, 3),
@@ -304,6 +337,27 @@ class GovernanceAgent:
            await self._alert(
                "execution_blast_radius",
                {
+                    "status": "warning",
+                    "impact": {
+                        "failed_count": failed,
+                        "total_executions": total,
+                        "failure_rate": round(rate, 3),
+                        "threshold": EXECUTION_FAIL_RATE_THRESHOLD,
+                    },
+                    "remediation": {
+                        "items": [
+                            "鎖定失敗 playbook 清單，關閉高風險自動執行",
+                            "比對 incident evidence 與 post_execution_verification 失敗原因",
+                        ],
+                        "next_action": "pause_auto_repair_for_top_failing_playbooks",
+                        "hint": "可能是 auto_repair_playbook 與 runtime 版本/環境脫節",
+                    },
+                    "actionable": {
+                        "items": [
+                            "跑 `run_self_check` 快照與失敗 playbook 熱點報表",
+                            "必要時啟用 emergency fallback 路由進人工審核",
+                        ],
+                    },
                    "failed_count": failed,
                    "total_executions": total,
                    "failure_rate": round(rate, 3),
@@ -548,9 +602,25 @@ class GovernanceAgent:
                await self._alert(
                    "governance_self_failure",
                    {
-                        "failed_checks": failed_checks,
-                        "total_checks": 5,  # 2026-04-27 P3.4 by Claude — 加入 slo_compliance 後共 5 項
-                        "errors": {k: results[k].get("error") for k in failed_checks},
+                        "status": "critical",
+                        "impact": {
+                            "failed_checks": failed_checks,
+                            "total_checks": 5,  # 2026-04-27 P3.4 by Claude — 加入 slo_compliance 後共 5 項
+                            "errors": {k: results[k].get("error") for k in failed_checks},
+                        },
+                        "remediation": {
+                            "items": [
+                                "暫停非關鍵治理自動化接收鏈路",
+                                "聚焦治理執行路徑錯誤並補齊 fallback",
+                            ],
+                            "next_action": "investigate_governance_pipeline_health",
+                        },
+                        "actionable": {
+                            "items": [
+                                "檢查 GovernanceAgent run loop 是否完整執行 5 個項目",
+                                "確認 DB 寫入與 Prometheus fetch 未被上游干擾",
+                            ],
+                        },
                    },
                )
            except Exception:
--- a/apps/api/src/services/telegram_gateway.py
+++ b/apps/api/src/services/telegram_gateway.py
@@ -2603,12 +2603,12 @@ class TelegramGateway:
            f"━━━━━━━━━━━━━━━━━━━\n"
            f"📋 <code>{html.escape(incident_id)}</code>\n"
            f"🚨 異常元件：<b>{html.escape(alertname)}</b>\n"
-            f"🎯 診斷結果：{html.escape(diagnosis[:100])}\n"
+            f"🎯 診斷結果：{html.escape(_smart_truncate(diagnosis, 320))}\n"
        )
        if system_impact:
-            text += f"\n🧠 <b>系統影響</b>\n{html.escape(system_impact[:150])}\n"
+            text += f"\n🧠 <b>系統影響</b>\n{html.escape(_smart_truncate(system_impact, 320))}\n"
        if probable_cause:
-            text += f"└─ 可能根因：{html.escape(probable_cause[:100])}\n"
+            text += f"└─ 可能根因：{html.escape(_smart_truncate(probable_cause, 320))}\n"

        # 2026-04-16 ogt: 移除 flywheel_diag / flywheel_dashboard (3-part ghost button，無 handler)
        # 鐵律: 寧可沒按鈕，不可有死按鈕 (feedback_no_ghost_buttons.md)
--- a/docs/12-agent-game-rules.md
+++ b/docs/12-agent-game-rules.md
@@ -137,11 +137,12 @@ last_modified_by: Codex

 ## AI 治理告警事件規範（本輪新增）

- 目標：把治理告警輸出固定為「可解析」「可追蹤」「可行動」三層結構，支援 telegram + PG + AI 決策。
+- 目標：把治理告警輸出固定為「可解析」「可追蹤」「可行動」三層結構，並讓 Telegram 與 AI Agent 能直接接力執行。
 - 版本：`governance_event_v1`，適用模組：
  - `governance_agent.py`（`_alert()`）
  - `failover_alerter.py`（告警推送）
  - `ai_slo_watchdog_job.py`（META 告警）
+- JSON Schema：[docs/schemas/governance_event_v1.schema.json](schemas/governance_event_v1.schema.json)

 ### 1) 通用 Schema

@@ -177,8 +178,8 @@ last_modified_by: Codex
 | `trust_drift` | `governance_agent.check_trust_drift` | 風險警示時 `warning`；未超標可不推送 | `auto_deprecated_count/ids`, `playbook_ids` |
 | `knowledge_degradation` | `governance_agent.check_knowledge_degradation` | 過比例時 `warning` | `next_action=run_kb_growth_healthcheck` |
 | `governance_slo_data_gap` | `governance_agent.run_self_check` | 所有 SLO metric 無 emit 時 `warning` | `next_action=run_adr100_slo_emit_playbook` |
-| `llm_hallucination` | `governance_agent.check_llm_hallucination` | 舊有 payload（待重構） |
-| `execution_blast_radius` | `governance_agent.check_execution_blast_radius` | 舊有 payload（待重構） |
+| `llm_hallucination` | `governance_agent.check_llm_hallucination` | 過比例時 `warning` | `next_action=run_knowledge_gap_audit` |
+| `execution_blast_radius` | `governance_agent.check_execution_blast_radius` | 過比例時 `warning` | `next_action=pause_auto_repair_for_top_failing_playbooks` |
 | `governance_slo_<metric>_violation` | `governance_agent.check_slo_compliance` | `status=violation` | `next_action=trigger_flywheel_safeguard` |
 | `slo_*`/`governance_*` | 其他治理事件 | 按事件需求保留最小欄位但建議同 schema |

@@ -190,4 +191,56 @@ last_modified_by: Codex
  - `impact`
  - `remediation`
  - `actionable`
- W-1~W-6 自健診（`ai_slo_watchdog_job.py`）以 `system_impact` 明確標示異常 KPI 數量與檢查區間，避免 `W-6` 漏報文案誤解。
+
+- W-1~W-6 自健診（`ai_slo_watchdog_job.py`）以 `system_impact` 明確列出異常 KPI 與序號清單，避免 `W-6` 漏報文案誤解。
+
+### 4) 快速 Sample（供 AGENT/Parser 套件直接接力）
+
+```json
+{
+  "event_type": "trust_drift",
+  "status": "warning",
+  "impact": {
+    "drifted_count": 4,
+    "total_playbooks": 26,
+    "drift_ratio": 0.154,
+    "auto_deprecated_count": 0,
+    "auto_deprecated_ids": [],
+    "playbook_ids": [
+      "PB-20260501-27910D",
+      "PB-COLD-745C00B9",
+      "PB-20260405-1CF853",
+      "PB-20260409-B66B1A"
+    ]
+  },
+  "remediation": {
+    "next_action": "review_trust_drift_candidates",
+    "items": [
+      "確認各 playbook 最近 14 天執行結果是否含高失敗/高重試",
+      "必要時啟用 trial auto-deprecate"
+    ]
+  },
+  "actionable": {
+    "items": [
+      "可自動註記可降級清單",
+      "可自動生成 approval-free dry-run 回放報告"
+    ]
+  }
+}
+```
+
+### 5) 事件處理路徑（非人肉清單）
+
+- `trust_drift`：
+  - 立即：保留低信任但新近使用 Playbook 清單，輸出 `playbook_ids`
+  - 自動：截至當日已超過 30 天未更新的 Playbook 自動降級，並標記為 `AUTO_DEPRECATED`
+  - 人工：人工覆核 playbook 風險，決定是否 rollback
+- `knowledge_degradation`：
+  - 自動：觸發 `run_kb_growth_healthcheck`
+  - 續接：`playbook_evidence` / `kb_rot_cleaner` 補齊缺口
+- `governance_slo_data_gap`：
+  - 自動：`run_adr100_slo_emit_playbook`
+  - 檢查：所有 API Pod 是否已掛載 `PROMETHEUS_MULTIPROC_DIR`，Prometheus rules 已載入
+- `governance_slo_*_violation`：
+  - 自動：暫停高風險 auto-repair 路徑（`flywheel safeguard`）
+  - 人工：review 最近 1 小時 self-check 失敗樣本
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -22,6 +22,19 @@
 ### 驗證
 - 代碼改動已在上一輪 commit 寫入（含 `governance_agent.py`、`ai_slo_watchdog_job.py`、`webhooks.py`）並推送到 `gitea main`。

+## 2026-05-02 | AI治理報告可讀性與自動化收斂完成（本輪）
+
+### 完成
+- 將 `governance_agent.py` 告警 payload 升級為**雙軌輸出**：
+  - 保留現有扁平欄位（便於既有告警消費者）；
+  - 同步補齊 `status / impact / remediation / actionable` 結構。
+- 讓 `FailoverAlerter.alert_governance` 直接輸出「影響 / 修復方向 / 可直接自動化」三區塊，其餘扁平欄位收斂到文末的「欄位快覽（備援）」區塊，提升 Telegram 一眼可讀性。
+- `ai_slo_watchdog_job.py` 重組 `W-1~W-6` 異常文案，加入 `system_impact` 明細與嚴重度自動分流（warning/critical）。
+- 新增機讀 schema：`docs/schemas/governance_event_v1.schema.json`，並在 `docs/12-agent-game-rules.md` 補齊告警範例與事件對應自動化路徑。
+
+### 影響
+- `trust_drift` / `knowledge_degradation` / `governance_slo_data_gap` 的告警不再只像「字串摘要」，可直接交給 Agent 判斷下一步行動。
+
 ## 2026-05-02 | trust_drift 飛輪自治：低信任未使用 playbook 自動 deprecate

 承接統帥對 governance 類告警的全面授權。trust_drift 過去只發 Telegram 告警，4 個低信任 playbook 一直在告警表內噴噪音。
--- a/docs/schemas/governance_event_v1.schema.json
+++ b/docs/schemas/governance_event_v1.schema.json
@@ -0,0 +1,72 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "urn:awoooi:governance-event-v1",
+  "title": "AWOOOI Governance Event (v1)",
+  "type": "object",
+  "required": [
+    "status",
+    "impact",
+    "remediation",
+    "actionable"
+  ],
+  "properties": {
+    "status": {
+      "type": "string",
+      "enum": ["info", "warning", "critical", "violation"]
+    },
+    "impact": {
+      "type": "object",
+      "additionalProperties": true,
+      "minProperties": 1
+    },
+    "remediation": {
+      "type": "object",
+      "properties": {
+        "items": {
+          "type": "array",
+          "items": { "type": "string" }
+        },
+        "next_action": {
+          "type": "string",
+          "minLength": 1
+        },
+        "hint": {
+          "type": "string"
+        }
+      },
+      "additionalProperties": true
+    },
+    "actionable": {
+      "type": "object",
+      "properties": {
+        "items": {
+          "type": "array",
+          "items": { "type": "string" }
+        }
+      },
+      "additionalProperties": true
+    },
+    "drifted_count": { "type": "integer", "minimum": 0 },
+    "total_playbooks": { "type": "integer", "minimum": 0 },
+    "auto_deprecated_count": { "type": "integer", "minimum": 0 },
+    "auto_deprecated_ids": {
+      "type": "array",
+      "items": { "type": "string" }
+    },
+    "playbook_ids": {
+      "type": "array",
+      "items": { "type": "string" }
+    },
+    "stale_count": { "type": "integer", "minimum": 0 },
+    "total_count": { "type": "integer", "minimum": 0 },
+    "stale_ratio": { "type": "number", "minimum": 0 },
+    "stale_days": { "type": "integer", "minimum": 1 },
+    "threshold": { "type": "number", "minimum": 0 },
+    "failed_count": { "type": "integer", "minimum": 0 },
+    "total_checked": { "type": "integer", "minimum": 0 },
+    "total_executions": { "type": "integer", "minimum": 0 },
+    "failure_rate": { "type": "number", "minimum": 0, "maximum": 1 },
+    "hallucination_rate": { "type": "number", "minimum": 0, "maximum": 1 }
+  },
+  "additionalProperties": true
+}