diff --git a/apps/api/src/jobs/ai_slo_watchdog_job.py b/apps/api/src/jobs/ai_slo_watchdog_job.py
index 93dcdbd0..0461dca1 100644
--- a/apps/api/src/jobs/ai_slo_watchdog_job.py
+++ b/apps/api/src/jobs/ai_slo_watchdog_job.py
@@ -143,20 +143,36 @@ async def _check_once() -> None:
return
await redis.setex(dedup_key, _DEDUP_TTL_SEC, "1")
+ violation_lines = [
+ f"{idx + 1}. {item}" for idx, item in enumerate(violations)
+ ]
+ diagnosis = "AI 自健診異常"
+ system_impact = "\n".join(
+ [
+ f"檢出 {len(violations)} 項 KPI 異常(W-1~W-6)",
+ f"關鍵影響:飛輪自動化能力可能降級",
+ *violation_lines,
+ ]
+ )
+ probable_cause = "治理異常與執行資料同時異常,建議先核對 AI SLO 指標與最近自修復任務執行紀錄"
+
# 發送 TYPE-8M Meta-System 告警
- diagnosis = " | ".join(violations)
- incident_id = f"META-{now_taipei().strftime('%Y%m%d%H%M%S')}"
- try:
- from src.services.telegram_gateway import get_telegram_gateway
- await get_telegram_gateway().send_meta_alert(
- incident_id=incident_id,
+ # Severity triage: 2 or more violations escalate to critical for front-line routing; a single violation stays warning
+ severity = "critical" if len(violations) >= 2 else "warning"
+ incident_id = f"META-{now_taipei().strftime('%Y%m%d%H%M%S')}"
+ try:
+ from src.services.telegram_gateway import get_telegram_gateway
+
+ await get_telegram_gateway().send_meta_alert(
+ incident_id=incident_id,
approval_id=str(uuid.uuid4()),
alertname="AI 自健診異常",
- alert_category="flywheel_health",
- diagnosis=diagnosis,
- severity_level="critical",
- system_impact=f"{len(violations)} 項 KPI 異常(W-1~W-6),飛輪自動化能力可能降級",
- )
+ alert_category="flywheel_health",
+ diagnosis=diagnosis,
+ severity_level=severity,
+ system_impact=system_impact,
+ probable_cause=probable_cause,
+ )
logger.warning(
"ai_slo_watchdog_alert_sent",
incident_id=incident_id,
diff --git a/apps/api/src/services/failover_alerter.py b/apps/api/src/services/failover_alerter.py
index bac8e4d2..d552eb12 100644
--- a/apps/api/src/services/failover_alerter.py
+++ b/apps/api/src/services/failover_alerter.py
@@ -102,16 +102,46 @@ class FailoverAlerter:
logger.debug("governance_alert_dedup_skipped", event_type=event_type)
return
- # 格式化 payload 為可讀字串(key=value,換行分隔)
- detail_lines = "\n".join(
- f"{_escape_md(str(k))}:{_escape_md(str(v))}"
- for k, v in payload.items()
- )
- msg = (
- f"*AI 治理警報*\n\n"
- f"類型:{_escape_md(event_type)}\n\n"
- f"{detail_lines}"
- )
+ status = _escape_md(str(payload.get("status", "warning")))
+ impact = _as_dict(payload.get("impact"))
+ remediation = _as_dict(payload.get("remediation"))
+ actionable = _as_dict(payload.get("actionable"))
+
+ impact_lines = _lines_from_dict(impact, max_items=12, compact=True)
+ remediation_lines = _lines_from_list(remediation.get("items"))
+ remediation_next_action = remediation.get("next_action")
+ remediation_hint = remediation.get("hint")
+ actionable_lines = _lines_from_list(actionable.get("items"))
+
+ next_action_line = ""
+ if remediation_next_action:
+ next_action_line = f"\n 下一步:{_escape_md(str(remediation_next_action))}"
+ if remediation_hint:
+ next_action_line += f"\n 提示:{_escape_md(str(remediation_hint))}"
+
+ sections: list[str] = [
+ "⚠️ *AI 治理警報*",
+ f"\n類型:{_escape_md(event_type)}",
+ f"狀態:{status}",
+ ]
+ if impact_lines:
+ sections.append(f"\n*影響*\n{impact_lines}")
+ if remediation_lines or next_action_line:
+ sections.append(f"\n*修復方向*")
+ if remediation_lines:
+ sections.append(remediation_lines)
+ if next_action_line:
+ sections.append(next_action_line)
+ if actionable_lines:
+ sections.append(f"\n*可直接自動化*\n{actionable_lines}")
+
+ fallback_items = _fallback_pairs(payload, keep={"status", "impact", "remediation", "actionable"})
+ if fallback_items:
+ sections.append(
+ "\n*欄位快覽(備援)*\n" + "\n".join(fallback_items)
+ )
+
+ msg = "\n".join(sections)
await self._send(msg)
logger.info("governance_alert_sent", event_type=event_type)
@@ -259,6 +289,46 @@ def _escape_md(text: str) -> str:
return text
+def _as_dict(value: Any) -> dict[str, Any]:
+    return {} if not isinstance(value, dict) else value  # non-dict (missing or malformed) input collapses to {}
+
+
+def _lines_from_dict(data: dict[str, Any], max_items: int = 20, compact: bool = False) -> str:
+    """Render at most ``max_items`` key-sorted ``key:value`` rows (via ``_escape_md``), one row per line.
+
+    Returns ``""`` for empty or non-dict input; with ``compact=True`` an ellipsis row marks dropped keys.
+    """
+    if not isinstance(data, dict) or not data:
+        return ""
+    shown = sorted(data)[: max(max_items, 0)]
+    rows = [f"{_escape_md(str(k))}:{_escape_md(str(data[k]))}" for k in shown]
+    # Flag truncation only when keys were actually dropped; a dict holding exactly max_items keys is complete.
+    if compact and len(shown) < len(data):
+        rows.append("...(更多欄位略)")
+    return "\n".join(f" {line}" for line in rows)
+
+
+def _lines_from_list(value: Any) -> str:
+    """Number the items of a list as ``N. item`` rows (escaped via ``_escape_md``); non-lists yield ``""``."""
+    if not isinstance(value, list):
+        return ""
+    # start=1 produces the 1-based position shown in front of each item.
+    numbered = [f" {pos}. {_escape_md(str(entry))}" for pos, entry in enumerate(value, start=1)]
+    return "\n".join(numbered)
+
+
+def _fallback_pairs(payload: dict[str, Any], keep: set[str] | None = None) -> list[str]:
+    """Return key-sorted ``key:value`` rows for every payload key NOT listed in ``keep``.
+
+    Despite its name, ``keep`` is the set of keys to leave out; a non-dict payload yields ``[]``.
+    """
+    if not isinstance(payload, dict):
+        return []
+    skipped = set(keep) if keep else set()
+    remaining = [k for k in sorted(payload) if k not in skipped]
+    return [f"{_escape_md(str(k))}:{_escape_md(str(payload[k]))}" for k in remaining]
+
+
# =============================================================================
# Singleton
# =============================================================================
diff --git a/apps/api/src/services/governance_agent.py b/apps/api/src/services/governance_agent.py
index 267391f0..fa91e97e 100644
--- a/apps/api/src/services/governance_agent.py
+++ b/apps/api/src/services/governance_agent.py
@@ -142,6 +142,13 @@ class GovernanceAgent:
],
"sample_playbook_ids": kept_ids[:10],
},
+ "drifted_count": len(drifted),
+ "auto_deprecated_count": len(auto_deprecated_ids),
+ "auto_deprecated_ids": auto_deprecated_ids[:10],
+ "playbook_ids": kept_ids[:10],
+ "total_playbooks": total,
+ "threshold": TRUST_DRIFT_THRESHOLD,
+ "auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS,
},
)
@@ -215,6 +222,11 @@ class GovernanceAgent:
"安排至少 2 位 owner 對 stale條目做快速人工審核",
],
},
+ "stale_count": stale,
+ "total_count": total,
+ "stale_ratio": round(ratio, 3),
+ "threshold": KM_STALE_RATIO,
+ "stale_days": KM_STALE_DAYS,
},
)
@@ -260,6 +272,27 @@ class GovernanceAgent:
await self._alert(
"llm_hallucination",
{
+ "status": "warning",
+ "impact": {
+ "failed_count": failed,
+ "total_checked": total,
+ "hallucination_rate": round(rate, 3),
+ "threshold": HALLUCINATION_RATE_THRESHOLD,
+ },
+ "remediation": {
+ "items": [
+ "檢核 AI 建議來源與 evidence snapshot 一致性",
+ "檢視最近 incident 的 verifier 輸入欄位是否缺失關鍵上下文",
+ ],
+ "next_action": "run_knowledge_gap_audit",
+ "hint": "高失敗率通常表示 evidence 收斂流程退化或資料欄位解讀偏差",
+ },
+ "actionable": {
+ "items": [
+ "啟動 `playbook_evidence` 對齊補償流程",
+ "調整 verify timeout 與降級策略,避免過度信任低品質證據",
+ ],
+ },
"failed_count": failed,
"total_checked": total,
"hallucination_rate": round(rate, 3),
@@ -304,6 +337,27 @@ class GovernanceAgent:
await self._alert(
"execution_blast_radius",
{
+ "status": "warning",
+ "impact": {
+ "failed_count": failed,
+ "total_executions": total,
+ "failure_rate": round(rate, 3),
+ "threshold": EXECUTION_FAIL_RATE_THRESHOLD,
+ },
+ "remediation": {
+ "items": [
+ "鎖定失敗 playbook 清單,關閉高風險自動執行",
+ "比對 incident evidence 與 post_execution_verification 失敗原因",
+ ],
+ "next_action": "pause_auto_repair_for_top_failing_playbooks",
+ "hint": "可能是 auto_repair_playbook 與 runtime 版本/環境脫節",
+ },
+ "actionable": {
+ "items": [
+ "跑 `run_self_check` 快照與失敗 playbook 熱點報表",
+ "必要時啟用 emergency fallback 路由進人工審核",
+ ],
+ },
"failed_count": failed,
"total_executions": total,
"failure_rate": round(rate, 3),
@@ -548,9 +602,25 @@ class GovernanceAgent:
await self._alert(
"governance_self_failure",
{
- "failed_checks": failed_checks,
- "total_checks": 5, # 2026-04-27 P3.4 by Claude — 加入 slo_compliance 後共 5 項
- "errors": {k: results[k].get("error") for k in failed_checks},
+ "status": "critical",
+ "impact": {
+ "failed_checks": failed_checks,
+ "total_checks": 5, # 2026-04-27 P3.4 by Claude — 加入 slo_compliance 後共 5 項
+ "errors": {k: results[k].get("error") for k in failed_checks},
+ },
+ "remediation": {
+ "items": [
+ "暫停非關鍵治理自動化接收鏈路",
+ "聚焦治理執行路徑錯誤並補齊 fallback",
+ ],
+ "next_action": "investigate_governance_pipeline_health",
+ },
+ "actionable": {
+ "items": [
+ "檢查 GovernanceAgent run loop 是否完整執行 5 個項目",
+ "確認 DB 寫入與 Prometheus fetch 未被上游干擾",
+ ],
+ },
},
)
except Exception:
diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py
index 2d9c7cdd..6bb2ad8b 100644
--- a/apps/api/src/services/telegram_gateway.py
+++ b/apps/api/src/services/telegram_gateway.py
@@ -2603,12 +2603,12 @@ class TelegramGateway:
f"━━━━━━━━━━━━━━━━━━━\n"
f"📋 {html.escape(incident_id)}\n"
f"🚨 異常元件:{html.escape(alertname)}\n"
- f"🎯 診斷結果:{html.escape(diagnosis[:100])}\n"
+ f"🎯 診斷結果:{html.escape(_smart_truncate(diagnosis, 320))}\n"
)
if system_impact:
- text += f"\n🧠 系統影響\n{html.escape(system_impact[:150])}\n"
+ text += f"\n🧠 系統影響\n{html.escape(_smart_truncate(system_impact, 320))}\n"
if probable_cause:
- text += f"└─ 可能根因:{html.escape(probable_cause[:100])}\n"
+ text += f"└─ 可能根因:{html.escape(_smart_truncate(probable_cause, 320))}\n"
# 2026-04-16 ogt: 移除 flywheel_diag / flywheel_dashboard (3-part ghost button,無 handler)
# 鐵律: 寧可沒按鈕,不可有死按鈕 (feedback_no_ghost_buttons.md)
diff --git a/docs/12-agent-game-rules.md b/docs/12-agent-game-rules.md
index 69dfacbc..4e0b51c3 100644
--- a/docs/12-agent-game-rules.md
+++ b/docs/12-agent-game-rules.md
@@ -137,11 +137,12 @@ last_modified_by: Codex
## AI 治理告警事件規範(本輪新增)
-- 目標:把治理告警輸出固定為「可解析」「可追蹤」「可行動」三層結構,支援 telegram + PG + AI 決策。
+- 目標:把治理告警輸出固定為「可解析」「可追蹤」「可行動」三層結構,並讓 Telegram 與 AI Agent 能直接接力執行。
- 版本:`governance_event_v1`,適用模組:
- `governance_agent.py`(`_alert()`)
- `failover_alerter.py`(告警推送)
- `ai_slo_watchdog_job.py`(META 告警)
+- JSON Schema:[docs/schemas/governance_event_v1.schema.json](schemas/governance_event_v1.schema.json)
### 1) 通用 Schema
@@ -177,8 +178,8 @@ last_modified_by: Codex
| `trust_drift` | `governance_agent.check_trust_drift` | 風險警示時 `warning`;未超標可不推送 | `auto_deprecated_count/ids`, `playbook_ids` |
| `knowledge_degradation` | `governance_agent.check_knowledge_degradation` | 過比例時 `warning` | `next_action=run_kb_growth_healthcheck` |
| `governance_slo_data_gap` | `governance_agent.run_self_check` | 所有 SLO metric 無 emit 時 `warning` | `next_action=run_adr100_slo_emit_playbook` |
-| `llm_hallucination` | `governance_agent.check_llm_hallucination` | 舊有 payload(待重構) |
-| `execution_blast_radius` | `governance_agent.check_execution_blast_radius` | 舊有 payload(待重構) |
+| `llm_hallucination` | `governance_agent.check_llm_hallucination` | 過比例時 `warning` | `next_action=run_knowledge_gap_audit` |
+| `execution_blast_radius` | `governance_agent.check_execution_blast_radius` | 過比例時 `warning` | `next_action=pause_auto_repair_for_top_failing_playbooks` |
| `governance_slo__violation` | `governance_agent.check_slo_compliance` | `status=violation` | `next_action=trigger_flywheel_safeguard` |
| `slo_*`/`governance_*` | 其他治理事件 | 按事件需求保留最小欄位但建議同 schema |
@@ -190,4 +191,56 @@ last_modified_by: Codex
- `impact`
- `remediation`
- `actionable`
-- W-1~W-6 自健診(`ai_slo_watchdog_job.py`)以 `system_impact` 明確標示異常 KPI 數量與檢查區間,避免 `W-6` 漏報文案誤解。
+
+- W-1~W-6 自健診(`ai_slo_watchdog_job.py`)以 `system_impact` 明確列出異常 KPI 與序號清單,避免 `W-6` 漏報文案誤解。
+
+### 4) 快速 Sample(供 AGENT/Parser 套件直接接力)
+
+```json
+{
+ "event_type": "trust_drift",
+ "status": "warning",
+ "impact": {
+ "drifted_count": 4,
+ "total_playbooks": 26,
+ "drift_ratio": 0.153,
+ "auto_deprecated_count": 0,
+ "auto_deprecated_ids": [],
+ "playbook_ids": [
+ "PB-20260501-27910D",
+ "PB-COLD-745C00B9",
+ "PB-20260405-1CF853",
+ "PB-20260409-B66B1A"
+ ]
+ },
+ "remediation": {
+ "next_action": "review_trust_drift_candidates",
+ "items": [
+ "確認各 playbook 最近 14 天執行結果是否含高失敗/高重試",
+ "必要時啟用 trial auto-deprecate"
+ ]
+ },
+ "actionable": {
+ "items": [
+ "可自動註記可降級清單",
+ "可自動生成 approval-free dry-run 回放報告"
+ ]
+ }
+}
+```
+
+### 5) 事件處理路徑(非人肉清單)
+
+- `trust_drift`:
+ - 立即:保留低信任但新近使用 Playbook 清單,輸出 `playbook_ids`
+ - 自動:`AUTO_DEPRECATED` 當日 30 天內未更新的 Playbook 自動降級
+ - 人工:人工覆核 playbook 風險,決定是否 rollback
+- `knowledge_degradation`:
+ - 自動:觸發 `run_kb_growth_healthcheck`
+ - 續接:`playbook_evidence` / `kb_rot_cleaner` 補齊缺口
+- `governance_slo_data_gap`:
+ - 自動:`run_adr100_slo_emit_playbook`
+ - 檢查:所有 API Pod 是否已掛載 `PROMETHEUS_MULTIPROC_DIR`,Prometheus rules 已載入
+- `governance_slo_*_violation`:
+ - 自動:暫停高風險 auto-repair 路徑(`flywheel safeguard`)
+ - 人工:review 最近 1 小時 self-check 失敗樣本
diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md
index 97ff538d..5d89df84 100644
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -22,6 +22,19 @@
### 驗證
- 代碼改動已在上一輪 commit 寫入(含 `governance_agent.py`、`ai_slo_watchdog_job.py`、`webhooks.py`)並推送到 `gitea main`。
+## 2026-05-02 | AI治理報告可讀性與自動化收斂完成(本輪)
+
+### 完成
+- 將 `governance_agent.py` 告警 payload 升級為**雙軌輸出**:
+ - 保留現有扁平欄位(便於既有告警消費者);
+ - 同步補齊 `status / impact / remediation / actionable` 結構。
+- 讓 `FailoverAlerter.alert_governance` 直接輸出「影響 / 修復 / 可自動化」三區塊,並將其餘扁平欄位收斂到訊息末尾的「欄位快覽(備援)」區塊,提升 Telegram 一眼可讀性。
+- `ai_slo_watchdog_job.py` 重組 `W-1~W-6` 異常文案,加入 `system_impact` 明細與嚴重度自動分流(warning/critical)。
+- 新增機讀 schema:`docs/schemas/governance_event_v1.schema.json`,並在 `docs/12-agent-game-rules.md` 補齊告警範例與事件對應自動化路徑。
+
+### 影響
+- `trust_drift` / `knowledge_degradation` / `governance_slo_data_gap` 的告警不再只像「字串摘要」,可直接交給 Agent 判斷下一步行動。
+
## 2026-05-02 | trust_drift 飛輪自治:低信任未使用 playbook 自動 deprecate
承接統帥對 governance 類告警的全面授權。trust_drift 過去只發 Telegram 告警,4 個低信任 playbook 一直在告警表內噴噪音。
diff --git a/docs/schemas/governance_event_v1.schema.json b/docs/schemas/governance_event_v1.schema.json
new file mode 100644
index 00000000..15ad1c74
--- /dev/null
+++ b/docs/schemas/governance_event_v1.schema.json
@@ -0,0 +1,72 @@
+{
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$id": "urn:awoooi:governance-event-v1",
+ "title": "AWOOOI Governance Event (v1)",
+ "type": "object",
+ "required": [
+ "status",
+ "impact",
+ "remediation",
+ "actionable"
+ ],
+ "properties": {
+ "status": {
+ "type": "string",
+ "enum": ["info", "warning", "critical", "violation"]
+ },
+ "impact": {
+ "type": "object",
+ "additionalProperties": true,
+ "minProperties": 1
+ },
+ "remediation": {
+ "type": "object",
+ "properties": {
+ "items": {
+ "type": "array",
+ "items": { "type": "string" }
+ },
+ "next_action": {
+ "type": "string",
+ "minLength": 1
+ },
+ "hint": {
+ "type": "string"
+ }
+ },
+ "additionalProperties": true
+ },
+ "actionable": {
+ "type": "object",
+ "properties": {
+ "items": {
+ "type": "array",
+ "items": { "type": "string" }
+ }
+ },
+ "additionalProperties": true
+ },
+ "drifted_count": { "type": "integer", "minimum": 0 },
+ "total_playbooks": { "type": "integer", "minimum": 0 },
+ "auto_deprecated_count": { "type": "integer", "minimum": 0 },
+ "auto_deprecated_ids": {
+ "type": "array",
+ "items": { "type": "string" }
+ },
+ "playbook_ids": {
+ "type": "array",
+ "items": { "type": "string" }
+ },
+ "stale_count": { "type": "integer", "minimum": 0 },
+ "total_count": { "type": "integer", "minimum": 0 },
+ "stale_ratio": { "type": "number", "minimum": 0 },
+ "stale_days": { "type": "integer", "minimum": 1 },
+ "threshold": { "type": "number", "minimum": 0 },
+ "failed_count": { "type": "integer", "minimum": 0 },
+ "total_checked": { "type": "integer", "minimum": 0 },
+ "total_executions": { "type": "integer", "minimum": 0 },
+ "failure_rate": { "type": "number", "minimum": 0, "maximum": 1 },
+ "hallucination_rate": { "type": "number", "minimum": 0, "maximum": 1 }
+ },
+ "additionalProperties": true
+}