diff --git a/apps/api/src/jobs/ai_slo_watchdog_job.py b/apps/api/src/jobs/ai_slo_watchdog_job.py index 93dcdbd0..0461dca1 100644 --- a/apps/api/src/jobs/ai_slo_watchdog_job.py +++ b/apps/api/src/jobs/ai_slo_watchdog_job.py @@ -143,20 +143,36 @@ async def _check_once() -> None: return await redis.setex(dedup_key, _DEDUP_TTL_SEC, "1") + violation_lines = [ + f"{idx + 1}. {item}" for idx, item in enumerate(violations) + ] + diagnosis = "AI 自健診異常" + system_impact = "\n".join( + [ + f"檢出 {len(violations)} 項 KPI 異常(W-1~W-6)", + f"關鍵影響:飛輪自動化能力可能降級", + *violation_lines, + ] + ) + probable_cause = "治理異常與執行資料同時異常,建議先核對 AI SLO 指標與最近自修復任務執行紀錄" + # 發送 TYPE-8M Meta-System 告警 - diagnosis = " | ".join(violations) - incident_id = f"META-{now_taipei().strftime('%Y%m%d%H%M%S')}" - try: - from src.services.telegram_gateway import get_telegram_gateway - await get_telegram_gateway().send_meta_alert( - incident_id=incident_id, + # Severity triage: 2 or more violated KPIs escalate to critical, a single violation stays warning. NOTE(review): an earlier wording said "1-2 items -> warning"; confirm the intended threshold. + severity = "critical" if len(violations) >= 2 else "warning" + incident_id = f"META-{now_taipei().strftime('%Y%m%d%H%M%S')}" + try: + from src.services.telegram_gateway import get_telegram_gateway + + await get_telegram_gateway().send_meta_alert( + incident_id=incident_id, approval_id=str(uuid.uuid4()), alertname="AI 自健診異常", - alert_category="flywheel_health", - diagnosis=diagnosis, - severity_level="critical", - system_impact=f"{len(violations)} 項 KPI 異常(W-1~W-6),飛輪自動化能力可能降級", - ) + alert_category="flywheel_health", + diagnosis=diagnosis, + severity_level=severity, + system_impact=system_impact, + probable_cause=probable_cause, + ) logger.warning( "ai_slo_watchdog_alert_sent", incident_id=incident_id, diff --git a/apps/api/src/services/failover_alerter.py b/apps/api/src/services/failover_alerter.py index bac8e4d2..d552eb12 100644 --- a/apps/api/src/services/failover_alerter.py +++ b/apps/api/src/services/failover_alerter.py @@ -102,16 +102,46 @@ class FailoverAlerter: 
logger.debug("governance_alert_dedup_skipped", event_type=event_type) return - # 格式化 payload 為可讀字串(key=value,換行分隔) - detail_lines = "\n".join( - f"{_escape_md(str(k))}:{_escape_md(str(v))}" - for k, v in payload.items() - ) - msg = ( - f"*AI 治理警報*\n\n" - f"類型:{_escape_md(event_type)}\n\n" - f"{detail_lines}" - ) + status = _escape_md(str(payload.get("status", "warning"))) + impact = _as_dict(payload.get("impact")) + remediation = _as_dict(payload.get("remediation")) + actionable = _as_dict(payload.get("actionable")) + + impact_lines = _lines_from_dict(impact, max_items=12, compact=True) + remediation_lines = _lines_from_list(remediation.get("items")) + remediation_next_action = remediation.get("next_action") + remediation_hint = remediation.get("hint") + actionable_lines = _lines_from_list(actionable.get("items")) + + next_action_line = "" + if remediation_next_action: + next_action_line = f"\n 下一步:{_escape_md(str(remediation_next_action))}" + if remediation_hint: + next_action_line += f"\n 提示:{_escape_md(str(remediation_hint))}" + + sections: list[str] = [ + "⚠️ *AI 治理警報*", + f"\n類型:{_escape_md(event_type)}", + f"狀態:{status}", + ] + if impact_lines: + sections.append(f"\n*影響*\n{impact_lines}") + if remediation_lines or next_action_line: + sections.append(f"\n*修復方向*") + if remediation_lines: + sections.append(remediation_lines) + if next_action_line: + sections.append(next_action_line) + if actionable_lines: + sections.append(f"\n*可直接自動化*\n{actionable_lines}") + + fallback_items = _fallback_pairs(payload, keep={"status", "impact", "remediation", "actionable"}) + if fallback_items: + sections.append( + "\n*欄位快覽(備援)*\n" + "\n".join(fallback_items) + ) + + msg = "\n".join(sections) await self._send(msg) logger.info("governance_alert_sent", event_type=event_type) @@ -259,6 +289,46 @@ def _escape_md(text: str) -> str: return text +def _as_dict(value: Any) -> dict[str, Any]: + return value if isinstance(value, dict) else {} + + +def _lines_from_dict(data: dict[str, Any], 
max_items: int = 20, compact: bool = False) -> str: + if not data: + return "" + rows = [] + idx = 0 + for k in sorted(data.keys()) if isinstance(data, dict) else []: + if idx >= max_items: + break + rows.append(f"{_escape_md(str(k))}:{_escape_md(str(data.get(k)))}") + idx += 1 + if compact and rows and len(data) > len(rows): # mark truncation only when fields were actually dropped + rows.append("...(更多欄位略)") + return "\n".join(f" {line}" for line in rows) + + +def _lines_from_list(value: Any) -> str: + if not isinstance(value, list): + return "" + return "\n".join( + f" {idx + 1}. {_escape_md(str(item))}" + for idx, item in enumerate(value) + ) + + +def _fallback_pairs(payload: dict[str, Any], keep: set[str] | None = None) -> list[str]: + if not isinstance(payload, dict): + return [] + keep = set(keep or set()) + rows = [] + for key in sorted(payload.keys()): + if key in keep: + continue + rows.append(f"{_escape_md(str(key))}:{_escape_md(str(payload.get(key)))}") + return rows + + # ============================================================================= # Singleton # ============================================================================= diff --git a/apps/api/src/services/governance_agent.py b/apps/api/src/services/governance_agent.py index 267391f0..fa91e97e 100644 --- a/apps/api/src/services/governance_agent.py +++ b/apps/api/src/services/governance_agent.py @@ -142,6 +142,13 @@ class GovernanceAgent: ], "sample_playbook_ids": kept_ids[:10], }, + "drifted_count": len(drifted), + "auto_deprecated_count": len(auto_deprecated_ids), + "auto_deprecated_ids": auto_deprecated_ids[:10], + "playbook_ids": kept_ids[:10], + "total_playbooks": total, + "threshold": TRUST_DRIFT_THRESHOLD, + "auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS, }, ) @@ -215,6 +222,11 @@ class GovernanceAgent: "安排至少 2 位 owner 對 stale條目做快速人工審核", ], }, + "stale_count": stale, + "total_count": total, + "stale_ratio": round(ratio, 3), + "threshold": KM_STALE_RATIO, + "stale_days": KM_STALE_DAYS, }, ) @@ -260,6 +272,27 @@ class 
GovernanceAgent: await self._alert( "llm_hallucination", { + "status": "warning", + "impact": { + "failed_count": failed, + "total_checked": total, + "hallucination_rate": round(rate, 3), + "threshold": HALLUCINATION_RATE_THRESHOLD, + }, + "remediation": { + "items": [ + "檢核 AI 建議來源與 evidence snapshot 一致性", + "檢視最近 incident 的 verifier 輸入欄位是否缺失關鍵上下文", + ], + "next_action": "run_knowledge_gap_audit", + "hint": "高失敗率通常表示 evidence 收斂流程退化或資料欄位解讀偏差", + }, + "actionable": { + "items": [ + "啟動 `playbook_evidence` 對齊補償流程", + "調整 verify timeout 與降級策略,避免過度信任低品質證據", + ], + }, "failed_count": failed, "total_checked": total, "hallucination_rate": round(rate, 3), @@ -304,6 +337,27 @@ class GovernanceAgent: await self._alert( "execution_blast_radius", { + "status": "warning", + "impact": { + "failed_count": failed, + "total_executions": total, + "failure_rate": round(rate, 3), + "threshold": EXECUTION_FAIL_RATE_THRESHOLD, + }, + "remediation": { + "items": [ + "鎖定失敗 playbook 清單,關閉高風險自動執行", + "比對 incident evidence 與 post_execution_verification 失敗原因", + ], + "next_action": "pause_auto_repair_for_top_failing_playbooks", + "hint": "可能是 auto_repair_playbook 與 runtime 版本/環境脫節", + }, + "actionable": { + "items": [ + "跑 `run_self_check` 快照與失敗 playbook 熱點報表", + "必要時啟用 emergency fallback 路由進人工審核", + ], + }, "failed_count": failed, "total_executions": total, "failure_rate": round(rate, 3), @@ -548,9 +602,25 @@ class GovernanceAgent: await self._alert( "governance_self_failure", { - "failed_checks": failed_checks, - "total_checks": 5, # 2026-04-27 P3.4 by Claude — 加入 slo_compliance 後共 5 項 - "errors": {k: results[k].get("error") for k in failed_checks}, + "status": "critical", + "impact": { + "failed_checks": failed_checks, + "total_checks": 5, # 2026-04-27 P3.4 by Claude — 加入 slo_compliance 後共 5 項 + "errors": {k: results[k].get("error") for k in failed_checks}, + }, + "remediation": { + "items": [ + "暫停非關鍵治理自動化接收鏈路", + "聚焦治理執行路徑錯誤並補齊 fallback", + ], + "next_action": 
"investigate_governance_pipeline_health", + }, + "actionable": { + "items": [ + "檢查 GovernanceAgent run loop 是否完整執行 5 個項目", + "確認 DB 寫入與 Prometheus fetch 未被上游干擾", + ], + }, }, ) except Exception: diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index 2d9c7cdd..6bb2ad8b 100644 --- a/apps/api/src/services/telegram_gateway.py +++ b/apps/api/src/services/telegram_gateway.py @@ -2603,12 +2603,12 @@ class TelegramGateway: f"━━━━━━━━━━━━━━━━━━━\n" f"📋 {html.escape(incident_id)}\n" f"🚨 異常元件:{html.escape(alertname)}\n" - f"🎯 診斷結果:{html.escape(diagnosis[:100])}\n" + f"🎯 診斷結果:{html.escape(_smart_truncate(diagnosis, 320))}\n" ) if system_impact: - text += f"\n🧠 系統影響\n{html.escape(system_impact[:150])}\n" + text += f"\n🧠 系統影響\n{html.escape(_smart_truncate(system_impact, 320))}\n" if probable_cause: - text += f"└─ 可能根因:{html.escape(probable_cause[:100])}\n" + text += f"└─ 可能根因:{html.escape(_smart_truncate(probable_cause, 320))}\n" # 2026-04-16 ogt: 移除 flywheel_diag / flywheel_dashboard (3-part ghost button,無 handler) # 鐵律: 寧可沒按鈕,不可有死按鈕 (feedback_no_ghost_buttons.md) diff --git a/docs/12-agent-game-rules.md b/docs/12-agent-game-rules.md index 69dfacbc..4e0b51c3 100644 --- a/docs/12-agent-game-rules.md +++ b/docs/12-agent-game-rules.md @@ -137,11 +137,12 @@ last_modified_by: Codex ## AI 治理告警事件規範(本輪新增) -- 目標:把治理告警輸出固定為「可解析」「可追蹤」「可行動」三層結構,支援 telegram + PG + AI 決策。 +- 目標:把治理告警輸出固定為「可解析」「可追蹤」「可行動」三層結構,並讓 Telegram 與 AI Agent 能直接接力執行。 - 版本:`governance_event_v1`,適用模組: - `governance_agent.py`(`_alert()`) - `failover_alerter.py`(告警推送) - `ai_slo_watchdog_job.py`(META 告警) +- JSON Schema:[docs/schemas/governance_event_v1.schema.json](schemas/governance_event_v1.schema.json) ### 1) 通用 Schema @@ -177,8 +178,8 @@ last_modified_by: Codex | `trust_drift` | `governance_agent.check_trust_drift` | 風險警示時 `warning`;未超標可不推送 | `auto_deprecated_count/ids`, `playbook_ids` | | `knowledge_degradation` | 
`governance_agent.check_knowledge_degradation` | 過比例時 `warning` | `next_action=run_kb_growth_healthcheck` | | `governance_slo_data_gap` | `governance_agent.run_self_check` | 所有 SLO metric 無 emit 時 `warning` | `next_action=run_adr100_slo_emit_playbook` | -| `llm_hallucination` | `governance_agent.check_llm_hallucination` | 舊有 payload(待重構) | -| `execution_blast_radius` | `governance_agent.check_execution_blast_radius` | 舊有 payload(待重構) | +| `llm_hallucination` | `governance_agent.check_llm_hallucination` | 過比例時 `warning` | `next_action=run_knowledge_gap_audit` | +| `execution_blast_radius` | `governance_agent.check_execution_blast_radius` | 過比例時 `warning` | `next_action=pause_auto_repair_for_top_failing_playbooks` | | `governance_slo__violation` | `governance_agent.check_slo_compliance` | `status=violation` | `next_action=trigger_flywheel_safeguard` | | `slo_*`/`governance_*` | 其他治理事件 | 按事件需求保留最小欄位但建議同 schema | @@ -190,4 +191,56 @@ last_modified_by: Codex - `impact` - `remediation` - `actionable` -- W-1~W-6 自健診(`ai_slo_watchdog_job.py`)以 `system_impact` 明確標示異常 KPI 數量與檢查區間,避免 `W-6` 漏報文案誤解。 + +- W-1~W-6 自健診(`ai_slo_watchdog_job.py`)以 `system_impact` 明確列出異常 KPI 與序號清單,避免 `W-6` 漏報文案誤解。 + +### 4) 快速 Sample(供 AGENT/Parser 套件直接接力) + +```json +{ + "event_type": "trust_drift", + "status": "warning", + "impact": { + "drifted_count": 4, + "total_playbooks": 26, + "drift_ratio": 0.153, + "auto_deprecated_count": 0, + "auto_deprecated_ids": [], + "playbook_ids": [ + "PB-20260501-27910D", + "PB-COLD-745C00B9", + "PB-20260405-1CF853", + "PB-20260409-B66B1A" + ] + }, + "remediation": { + "next_action": "review_trust_drift_candidates", + "items": [ + "確認各 playbook 最近 14 天執行結果是否含高失敗/高重試", + "必要時啟用 trial auto-deprecate" + ] + }, + "actionable": { + "items": [ + "可自動註記可降級清單", + "可自動生成 approval-free dry-run 回放報告" + ] + } +} +``` + +### 5) 事件處理路徑(非人肉清單) + +- `trust_drift`: + - 立即:保留低信任但新近使用 Playbook 清單,輸出 `playbook_ids` + - 自動:`AUTO_DEPRECATED` 當日 30 天內未更新的 Playbook 自動降級 + - 人工:人工覆核 
playbook 風險,決定是否 rollback +- `knowledge_degradation`: + - 自動:觸發 `run_kb_growth_healthcheck` + - 續接:`playbook_evidence` / `kb_rot_cleaner` 補齊缺口 +- `governance_slo_data_gap`: + - 自動:`run_adr100_slo_emit_playbook` + - 檢查:所有 API Pod 是否已掛載 `PROMETHEUS_MULTIPROC_DIR`,Prometheus rules 已載入 +- `governance_slo_*_violation`: + - 自動:暫停高風險 auto-repair 路徑(`flywheel safeguard`) + - 人工:review 最近 1 小時 self-check 失敗樣本 diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 97ff538d..5d89df84 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -22,6 +22,19 @@ ### 驗證 - 代碼改動已在上一輪 commit 寫入(含 `governance_agent.py`、`ai_slo_watchdog_job.py`、`webhooks.py`)並推送到 `gitea main`。 +## 2026-05-02 | AI治理報告可讀性與自動化收斂完成(本輪) + +### 完成 +- 將 `governance_agent.py` 告警 payload 升級為**雙軌輸出**: + - 保留現有扁平欄位(便於既有告警消費者); + - 同步補齊 `status / impact / remediation / actionable` 結構。 +- 讓 `FailoverAlerter.alert_governance` 直接輸出「影響 / 修復 / 可自動化」三區塊,並將其餘未結構化的 Key=Value 欄位收斂至「欄位快覽(備援)」區塊,提升 Telegram 一眼可讀性。 +- `ai_slo_watchdog_job.py` 重組 `W-1~W-6` 異常文案,加入 `system_impact` 明細與嚴重度自動分流(warning/critical)。 +- 新增機讀 schema:`docs/schemas/governance_event_v1.schema.json`,並在 `docs/12-agent-game-rules.md` 補齊告警範例與事件對應自動化路徑。 + +### 影響 +- `trust_drift` / `knowledge_degradation` / `governance_slo_data_gap` 的告警不再只像「字串摘要」,可直接交給 Agent 判斷下一步行動。 + ## 2026-05-02 | trust_drift 飛輪自治:低信任未使用 playbook 自動 deprecate 承接統帥對 governance 類告警的全面授權。trust_drift 過去只發 Telegram 告警,4 個低信任 playbook 一直在告警表內噴噪音。 diff --git a/docs/schemas/governance_event_v1.schema.json b/docs/schemas/governance_event_v1.schema.json new file mode 100644 index 00000000..15ad1c74 --- /dev/null +++ b/docs/schemas/governance_event_v1.schema.json @@ -0,0 +1,72 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "urn:awoooi:governance-event-v1", + "title": "AWOOOI Governance Event (v1)", + "type": "object", + "required": [ + "status", + "impact", + "remediation", + "actionable" + ], + "properties": { + "status": { + "type": "string", + "enum": ["info", "warning", "critical", "violation"] 
+ }, + "impact": { + "type": "object", + "additionalProperties": true, + "minProperties": 1 + }, + "remediation": { + "type": "object", + "properties": { + "items": { + "type": "array", + "items": { "type": "string" } + }, + "next_action": { + "type": "string", + "minLength": 1 + }, + "hint": { + "type": "string" + } + }, + "additionalProperties": true + }, + "actionable": { + "type": "object", + "properties": { + "items": { + "type": "array", + "items": { "type": "string" } + } + }, + "additionalProperties": true + }, + "drifted_count": { "type": "integer", "minimum": 0 }, + "total_playbooks": { "type": "integer", "minimum": 0 }, + "auto_deprecated_count": { "type": "integer", "minimum": 0 }, + "auto_deprecated_ids": { + "type": "array", + "items": { "type": "string" } + }, + "playbook_ids": { + "type": "array", + "items": { "type": "string" } + }, + "stale_count": { "type": "integer", "minimum": 0 }, + "total_count": { "type": "integer", "minimum": 0 }, + "stale_ratio": { "type": "number", "minimum": 0 }, + "stale_days": { "type": "integer", "minimum": 1 }, + "threshold": { "type": "number", "minimum": 0 }, + "failed_count": { "type": "integer", "minimum": 0 }, + "total_checked": { "type": "integer", "minimum": 0 }, + "total_executions": { "type": "integer", "minimum": 0 }, + "failure_rate": { "type": "number", "minimum": 0, "maximum": 1 }, + "hallucination_rate": { "type": "number", "minimum": 0, "maximum": 1 } + }, + "additionalProperties": true +}