feat(governance): normalize AI治理告警輸出與元告警解析度
Some checks failed
CD Pipeline / tests (push) Failing after 25s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Successful in 46s

This commit is contained in:
Your Name
2026-05-02 23:49:59 +08:00
parent a38d911213
commit b710f3f38f
7 changed files with 325 additions and 31 deletions

View File

@@ -143,20 +143,36 @@ async def _check_once() -> None:
return
await redis.setex(dedup_key, _DEDUP_TTL_SEC, "1")
violation_lines = [
f"{idx + 1}. {item}" for idx, item in enumerate(violations)
]
diagnosis = "AI 自健診異常"
system_impact = "\n".join(
[
f"檢出 {len(violations)} 項 KPI 異常W-1~W-6",
f"關鍵影響:飛輪自動化能力可能降級",
*violation_lines,
]
)
probable_cause = "治理異常與執行資料同時異常,建議先核對 AI SLO 指標與最近自修復任務執行紀錄"
# 發送 TYPE-8M Meta-System 告警
diagnosis = " | ".join(violations)
incident_id = f"META-{now_taipei().strftime('%Y%m%d%H%M%S')}"
try:
from src.services.telegram_gateway import get_telegram_gateway
await get_telegram_gateway().send_meta_alert(
incident_id=incident_id,
# 重大異常:達 2 項(含)以上即升為 critical(便於前線分流),僅 1 項走 warning
severity = "critical" if len(violations) >= 2 else "warning"
incident_id = f"META-{now_taipei().strftime('%Y%m%d%H%M%S')}"
try:
from src.services.telegram_gateway import get_telegram_gateway
await get_telegram_gateway().send_meta_alert(
incident_id=incident_id,
approval_id=str(uuid.uuid4()),
alertname="AI 自健診異常",
alert_category="flywheel_health",
diagnosis=diagnosis,
severity_level="critical",
system_impact=f"{len(violations)} 項 KPI 異常W-1~W-6飛輪自動化能力可能降級",
)
alert_category="flywheel_health",
diagnosis=diagnosis,
severity_level=severity,
system_impact=system_impact,
probable_cause=probable_cause,
)
logger.warning(
"ai_slo_watchdog_alert_sent",
incident_id=incident_id,

View File

@@ -102,16 +102,46 @@ class FailoverAlerter:
logger.debug("governance_alert_dedup_skipped", event_type=event_type)
return
# 格式化 payload 為可讀字串key=value換行分隔
detail_lines = "\n".join(
f"{_escape_md(str(k))}{_escape_md(str(v))}"
for k, v in payload.items()
)
msg = (
f"*AI 治理警報*\n\n"
f"類型:{_escape_md(event_type)}\n\n"
f"{detail_lines}"
)
status = _escape_md(str(payload.get("status", "warning")))
impact = _as_dict(payload.get("impact"))
remediation = _as_dict(payload.get("remediation"))
actionable = _as_dict(payload.get("actionable"))
impact_lines = _lines_from_dict(impact, max_items=12, compact=True)
remediation_lines = _lines_from_list(remediation.get("items"))
remediation_next_action = remediation.get("next_action")
remediation_hint = remediation.get("hint")
actionable_lines = _lines_from_list(actionable.get("items"))
next_action_line = ""
if remediation_next_action:
next_action_line = f"\n 下一步:{_escape_md(str(remediation_next_action))}"
if remediation_hint:
next_action_line += f"\n 提示:{_escape_md(str(remediation_hint))}"
sections: list[str] = [
"⚠️ *AI 治理警報*",
f"\n類型:{_escape_md(event_type)}",
f"狀態:{status}",
]
if impact_lines:
sections.append(f"\n*影響*\n{impact_lines}")
if remediation_lines or next_action_line:
sections.append(f"\n*修復方向*")
if remediation_lines:
sections.append(remediation_lines)
if next_action_line:
sections.append(next_action_line)
if actionable_lines:
sections.append(f"\n*可直接自動化*\n{actionable_lines}")
fallback_items = _fallback_pairs(payload, keep={"status", "impact", "remediation", "actionable"})
if fallback_items:
sections.append(
"\n*欄位快覽(備援)*\n" + "\n".join(fallback_items)
)
msg = "\n".join(sections)
await self._send(msg)
logger.info("governance_alert_sent", event_type=event_type)
@@ -259,6 +289,46 @@ def _escape_md(text: str) -> str:
return text
def _as_dict(value: Any) -> dict[str, Any]:
return value if isinstance(value, dict) else {}
def _lines_from_dict(data: dict[str, Any], max_items: int = 20, compact: bool = False) -> str:
if not data:
return ""
rows = []
idx = 0
for k in sorted(data.keys()) if isinstance(data, dict) else []:
if idx >= max_items:
break
rows.append(f"{_escape_md(str(k))}{_escape_md(str(data.get(k)))}")
idx += 1
if compact and len(rows) >= max_items:
rows.append("...(更多欄位略)")
return "\n".join(f" {line}" for line in rows)
def _lines_from_list(value: Any) -> str:
if not isinstance(value, list):
return ""
return "\n".join(
f" {idx + 1}. {_escape_md(str(item))}"
for idx, item in enumerate(value)
)
def _fallback_pairs(payload: dict[str, Any], keep: set[str] | None = None) -> list[str]:
if not isinstance(payload, dict):
return []
keep = set(keep or set())
rows = []
for key in sorted(payload.keys()):
if key in keep:
continue
rows.append(f"{_escape_md(str(key))}{_escape_md(str(payload.get(key)))}")
return rows
# =============================================================================
# Singleton
# =============================================================================

View File

@@ -142,6 +142,13 @@ class GovernanceAgent:
],
"sample_playbook_ids": kept_ids[:10],
},
"drifted_count": len(drifted),
"auto_deprecated_count": len(auto_deprecated_ids),
"auto_deprecated_ids": auto_deprecated_ids[:10],
"playbook_ids": kept_ids[:10],
"total_playbooks": total,
"threshold": TRUST_DRIFT_THRESHOLD,
"auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS,
},
)
@@ -215,6 +222,11 @@ class GovernanceAgent:
"安排至少 2 位 owner 對 stale條目做快速人工審核",
],
},
"stale_count": stale,
"total_count": total,
"stale_ratio": round(ratio, 3),
"threshold": KM_STALE_RATIO,
"stale_days": KM_STALE_DAYS,
},
)
@@ -260,6 +272,27 @@ class GovernanceAgent:
await self._alert(
"llm_hallucination",
{
"status": "warning",
"impact": {
"failed_count": failed,
"total_checked": total,
"hallucination_rate": round(rate, 3),
"threshold": HALLUCINATION_RATE_THRESHOLD,
},
"remediation": {
"items": [
"檢核 AI 建議來源與 evidence snapshot 一致性",
"檢視最近 incident 的 verifier 輸入欄位是否缺失關鍵上下文",
],
"next_action": "run_knowledge_gap_audit",
"hint": "高失敗率通常表示 evidence 收斂流程退化或資料欄位解讀偏差",
},
"actionable": {
"items": [
"啟動 `playbook_evidence` 對齊補償流程",
"調整 verify timeout 與降級策略,避免過度信任低品質證據",
],
},
"failed_count": failed,
"total_checked": total,
"hallucination_rate": round(rate, 3),
@@ -304,6 +337,27 @@ class GovernanceAgent:
await self._alert(
"execution_blast_radius",
{
"status": "warning",
"impact": {
"failed_count": failed,
"total_executions": total,
"failure_rate": round(rate, 3),
"threshold": EXECUTION_FAIL_RATE_THRESHOLD,
},
"remediation": {
"items": [
"鎖定失敗 playbook 清單,關閉高風險自動執行",
"比對 incident evidence 與 post_execution_verification 失敗原因",
],
"next_action": "pause_auto_repair_for_top_failing_playbooks",
"hint": "可能是 auto_repair_playbook 與 runtime 版本/環境脫節",
},
"actionable": {
"items": [
"跑 `run_self_check` 快照與失敗 playbook 熱點報表",
"必要時啟用 emergency fallback 路由進人工審核",
],
},
"failed_count": failed,
"total_executions": total,
"failure_rate": round(rate, 3),
@@ -548,9 +602,25 @@ class GovernanceAgent:
await self._alert(
"governance_self_failure",
{
"failed_checks": failed_checks,
"total_checks": 5, # 2026-04-27 P3.4 by Claude — 加入 slo_compliance 後共 5 項
"errors": {k: results[k].get("error") for k in failed_checks},
"status": "critical",
"impact": {
"failed_checks": failed_checks,
"total_checks": 5, # 2026-04-27 P3.4 by Claude — 加入 slo_compliance 後共 5 項
"errors": {k: results[k].get("error") for k in failed_checks},
},
"remediation": {
"items": [
"暫停非關鍵治理自動化接收鏈路",
"聚焦治理執行路徑錯誤並補齊 fallback",
],
"next_action": "investigate_governance_pipeline_health",
},
"actionable": {
"items": [
"檢查 GovernanceAgent run loop 是否完整執行 5 個項目",
"確認 DB 寫入與 Prometheus fetch 未被上游干擾",
],
},
},
)
except Exception:

View File

@@ -2603,12 +2603,12 @@ class TelegramGateway:
f"━━━━━━━━━━━━━━━━━━━\n"
f"📋 <code>{html.escape(incident_id)}</code>\n"
f"🚨 異常元件:<b>{html.escape(alertname)}</b>\n"
f"🎯 診斷結果:{html.escape(diagnosis[:100])}\n"
f"🎯 診斷結果:{html.escape(_smart_truncate(diagnosis, 320))}\n"
)
if system_impact:
text += f"\n🧠 <b>系統影響</b>\n{html.escape(system_impact[:150])}\n"
text += f"\n🧠 <b>系統影響</b>\n{html.escape(_smart_truncate(system_impact, 320))}\n"
if probable_cause:
text += f"└─ 可能根因:{html.escape(probable_cause[:100])}\n"
text += f"└─ 可能根因:{html.escape(_smart_truncate(probable_cause, 320))}\n"
# 2026-04-16 ogt: 移除 flywheel_diag / flywheel_dashboard (3-part ghost button無 handler)
# 鐵律: 寧可沒按鈕,不可有死按鈕 (feedback_no_ghost_buttons.md)

View File

@@ -137,11 +137,12 @@ last_modified_by: Codex
## AI 治理告警事件規範(本輪新增)
- 目標:把治理告警輸出固定為「可解析」「可追蹤」「可行動」三層結構,支援 telegram + PG + AI 決策
- 目標:把治理告警輸出固定為「可解析」「可追蹤」「可行動」三層結構,並讓 Telegram 與 AI Agent 能直接接力執行
- 版本:`governance_event_v1`,適用模組:
- `governance_agent.py`(`_alert()`)
- `failover_alerter.py`(告警推送)
- `ai_slo_watchdog_job.py`(META 告警)
- JSON Schema:[docs/schemas/governance_event_v1.schema.json](docs/schemas/governance_event_v1.schema.json)
### 1) 通用 Schema
@@ -177,8 +178,8 @@ last_modified_by: Codex
| `trust_drift` | `governance_agent.check_trust_drift` | 風險警示時 `warning`;未超標可不推送 | `auto_deprecated_count/ids`, `playbook_ids` |
| `knowledge_degradation` | `governance_agent.check_knowledge_degradation` | 過比例時 `warning` | `next_action=run_kb_growth_healthcheck` |
| `governance_slo_data_gap` | `governance_agent.run_self_check` | 所有 SLO metric 無 emit 時 `warning` | `next_action=run_adr100_slo_emit_playbook` |
| `llm_hallucination` | `governance_agent.check_llm_hallucination` | 舊有 payload待重構 |
| `execution_blast_radius` | `governance_agent.check_execution_blast_radius` | 舊有 payload待重構 |
| `llm_hallucination` | `governance_agent.check_llm_hallucination` | 過比例時 `warning` | `next_action=run_knowledge_gap_audit` |
| `execution_blast_radius` | `governance_agent.check_execution_blast_radius` | 過比例時 `warning` | `next_action=pause_auto_repair_for_top_failing_playbooks` |
| `governance_slo_<metric>_violation` | `governance_agent.check_slo_compliance` | `status=violation` | `next_action=trigger_flywheel_safeguard` |
| `slo_*`/`governance_*` | 其他治理事件 | 按事件需求保留最小欄位但建議同 schema |
@@ -190,4 +191,56 @@ last_modified_by: Codex
- `impact`
- `remediation`
- `actionable`
- W-1~W-6 自健診(`ai_slo_watchdog_job.py`)以 `system_impact` 明確標示異常 KPI 數量與檢查區間,避免 `W-6` 漏報文案誤解。
- W-1~W-6 自健診(`ai_slo_watchdog_job.py`)以 `system_impact` 明確列出異常 KPI 與序號清單,避免 `W-6` 漏報文案誤解。
### 4) 快速 Sample供 AGENT/Parser 套件直接接力)
```json
{
"event_type": "trust_drift",
"status": "warning",
"impact": {
"drifted_count": 4,
"total_playbooks": 26,
"drift_ratio": 0.154,
"auto_deprecated_count": 0,
"auto_deprecated_ids": [],
"playbook_ids": [
"PB-20260501-27910D",
"PB-COLD-745C00B9",
"PB-20260405-1CF853",
"PB-20260409-B66B1A"
]
},
"remediation": {
"next_action": "review_trust_drift_candidates",
"items": [
"確認各 playbook 最近 14 天執行結果是否含高失敗/高重試",
"必要時啟用 trial auto-deprecate"
]
},
"actionable": {
"items": [
"可自動註記可降級清單",
"可自動生成 approval-free dry-run 回放報告"
]
}
}
```
### 5) 事件處理路徑(非人肉清單)
- `trust_drift`
- 立即:保留低信任但新近使用 Playbook 清單,輸出 `playbook_ids`
- 自動:`AUTO_DEPRECATED` 當日 30 天內未更新的 Playbook 自動降級
- 人工:人工覆核 playbook 風險,決定是否 rollback
- `knowledge_degradation`
- 自動:觸發 `run_kb_growth_healthcheck`
- 續接:`playbook_evidence` / `kb_rot_cleaner` 補齊缺口
- `governance_slo_data_gap`
- 自動:`run_adr100_slo_emit_playbook`
- 檢查:所有 API Pod 是否已掛載 `PROMETHEUS_MULTIPROC_DIR`、Prometheus rules 是否已載入
- `governance_slo_*_violation`
- 自動:暫停高風險 auto-repair 路徑(`flywheel safeguard`)
- 人工:review 最近 1 小時 self-check 失敗樣本

View File

@@ -22,6 +22,19 @@
### 驗證
- 代碼改動已在上一輪 commit 寫入(含 `governance_agent.py`、`ai_slo_watchdog_job.py`、`webhooks.py`)並推送到 `gitea main`。
## 2026-05-02 | AI治理報告可讀性與自動化收斂完成本輪
### 完成
- 將 `governance_agent.py` 告警 payload 升級為**雙軌輸出**:
- 保留現有扁平欄位(便於既有告警消費者);
- 同步補齊 `status / impact / remediation / actionable` 結構。
- 讓 `FailoverAlerter.alert_governance` 直接輸出「影響 / 修復 / 可自動化」三區塊,其餘扁平欄位收斂到「欄位快覽(備援)」區塊,提升 Telegram 一眼可讀性。
- `ai_slo_watchdog_job.py` 重組 `W-1~W-6` 異常文案,加入 `system_impact` 明細與嚴重度自動分流(warning/critical)。
- 新增機讀 schema:`docs/schemas/governance_event_v1.schema.json`,並在 `docs/12-agent-game-rules.md` 補齊告警範例與事件對應自動化路徑。
### 影響
- `trust_drift` / `knowledge_degradation` / `governance_slo_data_gap` 的告警不再只像「字串摘要」,可直接交給 Agent 判斷下一步行動。
## 2026-05-02 | trust_drift 飛輪自治:低信任未使用 playbook 自動 deprecate
承接統帥對 governance 類告警的全面授權。trust_drift 過去只發 Telegram 告警4 個低信任 playbook 一直在告警表內噴噪音。

View File

@@ -0,0 +1,72 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "urn:awoooi:governance-event-v1",
"title": "AWOOOI Governance Event (v1)",
"type": "object",
"required": [
"status",
"impact",
"remediation",
"actionable"
],
"properties": {
"status": {
"type": "string",
"enum": ["info", "warning", "critical", "violation"]
},
"impact": {
"type": "object",
"additionalProperties": true,
"minProperties": 1
},
"remediation": {
"type": "object",
"properties": {
"items": {
"type": "array",
"items": { "type": "string" }
},
"next_action": {
"type": "string",
"minLength": 1
},
"hint": {
"type": "string"
}
},
"additionalProperties": true
},
"actionable": {
"type": "object",
"properties": {
"items": {
"type": "array",
"items": { "type": "string" }
}
},
"additionalProperties": true
},
"drifted_count": { "type": "integer", "minimum": 0 },
"total_playbooks": { "type": "integer", "minimum": 0 },
"auto_deprecated_count": { "type": "integer", "minimum": 0 },
"auto_deprecated_ids": {
"type": "array",
"items": { "type": "string" }
},
"playbook_ids": {
"type": "array",
"items": { "type": "string" }
},
"stale_count": { "type": "integer", "minimum": 0 },
"total_count": { "type": "integer", "minimum": 0 },
"stale_ratio": { "type": "number", "minimum": 0 },
"stale_days": { "type": "integer", "minimum": 1 },
"threshold": { "type": "number", "minimum": 0 },
"failed_count": { "type": "integer", "minimum": 0 },
"total_checked": { "type": "integer", "minimum": 0 },
"total_executions": { "type": "integer", "minimum": 0 },
"failure_rate": { "type": "number", "minimum": 0, "maximum": 1 },
"hallucination_rate": { "type": "number", "minimum": 0, "maximum": 1 }
},
"additionalProperties": true
}