feat(governance): normalize AI治理告警輸出與元告警解析度
This commit is contained in:
@@ -143,20 +143,36 @@ async def _check_once() -> None:
|
||||
return
|
||||
await redis.setex(dedup_key, _DEDUP_TTL_SEC, "1")
|
||||
|
||||
violation_lines = [
|
||||
f"{idx + 1}. {item}" for idx, item in enumerate(violations)
|
||||
]
|
||||
diagnosis = "AI 自健診異常"
|
||||
system_impact = "\n".join(
|
||||
[
|
||||
f"檢出 {len(violations)} 項 KPI 異常(W-1~W-6)",
|
||||
f"關鍵影響:飛輪自動化能力可能降級",
|
||||
*violation_lines,
|
||||
]
|
||||
)
|
||||
probable_cause = "治理異常與執行資料同時異常,建議先核對 AI SLO 指標與最近自修復任務執行紀錄"
|
||||
|
||||
# 發送 TYPE-8M Meta-System 告警
|
||||
diagnosis = " | ".join(violations)
|
||||
incident_id = f"META-{now_taipei().strftime('%Y%m%d%H%M%S')}"
|
||||
try:
|
||||
from src.services.telegram_gateway import get_telegram_gateway
|
||||
await get_telegram_gateway().send_meta_alert(
|
||||
incident_id=incident_id,
|
||||
# Severity triage: 2 or more violations escalate to critical for front-line routing; a single violation stays at warning
|
||||
severity = "critical" if len(violations) >= 2 else "warning"
|
||||
incident_id = f"META-{now_taipei().strftime('%Y%m%d%H%M%S')}"
|
||||
try:
|
||||
from src.services.telegram_gateway import get_telegram_gateway
|
||||
|
||||
await get_telegram_gateway().send_meta_alert(
|
||||
incident_id=incident_id,
|
||||
approval_id=str(uuid.uuid4()),
|
||||
alertname="AI 自健診異常",
|
||||
alert_category="flywheel_health",
|
||||
diagnosis=diagnosis,
|
||||
severity_level="critical",
|
||||
system_impact=f"{len(violations)} 項 KPI 異常(W-1~W-6),飛輪自動化能力可能降級",
|
||||
)
|
||||
alert_category="flywheel_health",
|
||||
diagnosis=diagnosis,
|
||||
severity_level=severity,
|
||||
system_impact=system_impact,
|
||||
probable_cause=probable_cause,
|
||||
)
|
||||
logger.warning(
|
||||
"ai_slo_watchdog_alert_sent",
|
||||
incident_id=incident_id,
|
||||
|
||||
@@ -102,16 +102,46 @@ class FailoverAlerter:
|
||||
logger.debug("governance_alert_dedup_skipped", event_type=event_type)
|
||||
return
|
||||
|
||||
# 格式化 payload 為可讀字串(key=value,換行分隔)
|
||||
detail_lines = "\n".join(
|
||||
f"{_escape_md(str(k))}:{_escape_md(str(v))}"
|
||||
for k, v in payload.items()
|
||||
)
|
||||
msg = (
|
||||
f"*AI 治理警報*\n\n"
|
||||
f"類型:{_escape_md(event_type)}\n\n"
|
||||
f"{detail_lines}"
|
||||
)
|
||||
status = _escape_md(str(payload.get("status", "warning")))
|
||||
impact = _as_dict(payload.get("impact"))
|
||||
remediation = _as_dict(payload.get("remediation"))
|
||||
actionable = _as_dict(payload.get("actionable"))
|
||||
|
||||
impact_lines = _lines_from_dict(impact, max_items=12, compact=True)
|
||||
remediation_lines = _lines_from_list(remediation.get("items"))
|
||||
remediation_next_action = remediation.get("next_action")
|
||||
remediation_hint = remediation.get("hint")
|
||||
actionable_lines = _lines_from_list(actionable.get("items"))
|
||||
|
||||
next_action_line = ""
|
||||
if remediation_next_action:
|
||||
next_action_line = f"\n 下一步:{_escape_md(str(remediation_next_action))}"
|
||||
if remediation_hint:
|
||||
next_action_line += f"\n 提示:{_escape_md(str(remediation_hint))}"
|
||||
|
||||
sections: list[str] = [
|
||||
"⚠️ *AI 治理警報*",
|
||||
f"\n類型:{_escape_md(event_type)}",
|
||||
f"狀態:{status}",
|
||||
]
|
||||
if impact_lines:
|
||||
sections.append(f"\n*影響*\n{impact_lines}")
|
||||
if remediation_lines or next_action_line:
|
||||
sections.append(f"\n*修復方向*")
|
||||
if remediation_lines:
|
||||
sections.append(remediation_lines)
|
||||
if next_action_line:
|
||||
sections.append(next_action_line)
|
||||
if actionable_lines:
|
||||
sections.append(f"\n*可直接自動化*\n{actionable_lines}")
|
||||
|
||||
fallback_items = _fallback_pairs(payload, keep={"status", "impact", "remediation", "actionable"})
|
||||
if fallback_items:
|
||||
sections.append(
|
||||
"\n*欄位快覽(備援)*\n" + "\n".join(fallback_items)
|
||||
)
|
||||
|
||||
msg = "\n".join(sections)
|
||||
await self._send(msg)
|
||||
logger.info("governance_alert_sent", event_type=event_type)
|
||||
|
||||
@@ -259,6 +289,46 @@ def _escape_md(text: str) -> str:
|
||||
return text
|
||||
|
||||
|
||||
def _as_dict(value: Any) -> dict[str, Any]:
|
||||
return value if isinstance(value, dict) else {}
|
||||
|
||||
|
||||
def _lines_from_dict(data: dict[str, Any], max_items: int = 20, compact: bool = False) -> str:
|
||||
if not data:
|
||||
return ""
|
||||
rows = []
|
||||
idx = 0
|
||||
for k in sorted(data.keys()) if isinstance(data, dict) else []:
|
||||
if idx >= max_items:
|
||||
break
|
||||
rows.append(f"{_escape_md(str(k))}:{_escape_md(str(data.get(k)))}")
|
||||
idx += 1
|
||||
if compact and len(rows) >= max_items:
|
||||
rows.append("...(更多欄位略)")
|
||||
return "\n".join(f" {line}" for line in rows)
|
||||
|
||||
|
||||
def _lines_from_list(value: Any) -> str:
|
||||
if not isinstance(value, list):
|
||||
return ""
|
||||
return "\n".join(
|
||||
f" {idx + 1}. {_escape_md(str(item))}"
|
||||
for idx, item in enumerate(value)
|
||||
)
|
||||
|
||||
|
||||
def _fallback_pairs(payload: dict[str, Any], keep: set[str] | None = None) -> list[str]:
|
||||
if not isinstance(payload, dict):
|
||||
return []
|
||||
keep = set(keep or set())
|
||||
rows = []
|
||||
for key in sorted(payload.keys()):
|
||||
if key in keep:
|
||||
continue
|
||||
rows.append(f"{_escape_md(str(key))}:{_escape_md(str(payload.get(key)))}")
|
||||
return rows
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Singleton
|
||||
# =============================================================================
|
||||
|
||||
@@ -142,6 +142,13 @@ class GovernanceAgent:
|
||||
],
|
||||
"sample_playbook_ids": kept_ids[:10],
|
||||
},
|
||||
"drifted_count": len(drifted),
|
||||
"auto_deprecated_count": len(auto_deprecated_ids),
|
||||
"auto_deprecated_ids": auto_deprecated_ids[:10],
|
||||
"playbook_ids": kept_ids[:10],
|
||||
"total_playbooks": total,
|
||||
"threshold": TRUST_DRIFT_THRESHOLD,
|
||||
"auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS,
|
||||
},
|
||||
)
|
||||
|
||||
@@ -215,6 +222,11 @@ class GovernanceAgent:
|
||||
"安排至少 2 位 owner 對 stale條目做快速人工審核",
|
||||
],
|
||||
},
|
||||
"stale_count": stale,
|
||||
"total_count": total,
|
||||
"stale_ratio": round(ratio, 3),
|
||||
"threshold": KM_STALE_RATIO,
|
||||
"stale_days": KM_STALE_DAYS,
|
||||
},
|
||||
)
|
||||
|
||||
@@ -260,6 +272,27 @@ class GovernanceAgent:
|
||||
await self._alert(
|
||||
"llm_hallucination",
|
||||
{
|
||||
"status": "warning",
|
||||
"impact": {
|
||||
"failed_count": failed,
|
||||
"total_checked": total,
|
||||
"hallucination_rate": round(rate, 3),
|
||||
"threshold": HALLUCINATION_RATE_THRESHOLD,
|
||||
},
|
||||
"remediation": {
|
||||
"items": [
|
||||
"檢核 AI 建議來源與 evidence snapshot 一致性",
|
||||
"檢視最近 incident 的 verifier 輸入欄位是否缺失關鍵上下文",
|
||||
],
|
||||
"next_action": "run_knowledge_gap_audit",
|
||||
"hint": "高失敗率通常表示 evidence 收斂流程退化或資料欄位解讀偏差",
|
||||
},
|
||||
"actionable": {
|
||||
"items": [
|
||||
"啟動 `playbook_evidence` 對齊補償流程",
|
||||
"調整 verify timeout 與降級策略,避免過度信任低品質證據",
|
||||
],
|
||||
},
|
||||
"failed_count": failed,
|
||||
"total_checked": total,
|
||||
"hallucination_rate": round(rate, 3),
|
||||
@@ -304,6 +337,27 @@ class GovernanceAgent:
|
||||
await self._alert(
|
||||
"execution_blast_radius",
|
||||
{
|
||||
"status": "warning",
|
||||
"impact": {
|
||||
"failed_count": failed,
|
||||
"total_executions": total,
|
||||
"failure_rate": round(rate, 3),
|
||||
"threshold": EXECUTION_FAIL_RATE_THRESHOLD,
|
||||
},
|
||||
"remediation": {
|
||||
"items": [
|
||||
"鎖定失敗 playbook 清單,關閉高風險自動執行",
|
||||
"比對 incident evidence 與 post_execution_verification 失敗原因",
|
||||
],
|
||||
"next_action": "pause_auto_repair_for_top_failing_playbooks",
|
||||
"hint": "可能是 auto_repair_playbook 與 runtime 版本/環境脫節",
|
||||
},
|
||||
"actionable": {
|
||||
"items": [
|
||||
"跑 `run_self_check` 快照與失敗 playbook 熱點報表",
|
||||
"必要時啟用 emergency fallback 路由進人工審核",
|
||||
],
|
||||
},
|
||||
"failed_count": failed,
|
||||
"total_executions": total,
|
||||
"failure_rate": round(rate, 3),
|
||||
@@ -548,9 +602,25 @@ class GovernanceAgent:
|
||||
await self._alert(
|
||||
"governance_self_failure",
|
||||
{
|
||||
"failed_checks": failed_checks,
|
||||
"total_checks": 5, # 2026-04-27 P3.4 by Claude — 加入 slo_compliance 後共 5 項
|
||||
"errors": {k: results[k].get("error") for k in failed_checks},
|
||||
"status": "critical",
|
||||
"impact": {
|
||||
"failed_checks": failed_checks,
|
||||
"total_checks": 5, # 2026-04-27 P3.4 by Claude — 加入 slo_compliance 後共 5 項
|
||||
"errors": {k: results[k].get("error") for k in failed_checks},
|
||||
},
|
||||
"remediation": {
|
||||
"items": [
|
||||
"暫停非關鍵治理自動化接收鏈路",
|
||||
"聚焦治理執行路徑錯誤並補齊 fallback",
|
||||
],
|
||||
"next_action": "investigate_governance_pipeline_health",
|
||||
},
|
||||
"actionable": {
|
||||
"items": [
|
||||
"檢查 GovernanceAgent run loop 是否完整執行 5 個項目",
|
||||
"確認 DB 寫入與 Prometheus fetch 未被上游干擾",
|
||||
],
|
||||
},
|
||||
},
|
||||
)
|
||||
except Exception:
|
||||
|
||||
@@ -2603,12 +2603,12 @@ class TelegramGateway:
|
||||
f"━━━━━━━━━━━━━━━━━━━\n"
|
||||
f"📋 <code>{html.escape(incident_id)}</code>\n"
|
||||
f"🚨 異常元件:<b>{html.escape(alertname)}</b>\n"
|
||||
f"🎯 診斷結果:{html.escape(diagnosis[:100])}\n"
|
||||
f"🎯 診斷結果:{html.escape(_smart_truncate(diagnosis, 320))}\n"
|
||||
)
|
||||
if system_impact:
|
||||
text += f"\n🧠 <b>系統影響</b>\n{html.escape(system_impact[:150])}\n"
|
||||
text += f"\n🧠 <b>系統影響</b>\n{html.escape(_smart_truncate(system_impact, 320))}\n"
|
||||
if probable_cause:
|
||||
text += f"└─ 可能根因:{html.escape(probable_cause[:100])}\n"
|
||||
text += f"└─ 可能根因:{html.escape(_smart_truncate(probable_cause, 320))}\n"
|
||||
|
||||
# 2026-04-16 ogt: 移除 flywheel_diag / flywheel_dashboard (3-part ghost button,無 handler)
|
||||
# 鐵律: 寧可沒按鈕,不可有死按鈕 (feedback_no_ghost_buttons.md)
|
||||
|
||||
@@ -137,11 +137,12 @@ last_modified_by: Codex
|
||||
|
||||
## AI 治理告警事件規範(本輪新增)
|
||||
|
||||
- 目標:把治理告警輸出固定為「可解析」「可追蹤」「可行動」三層結構,支援 telegram + PG + AI 決策。
|
||||
- 目標:把治理告警輸出固定為「可解析」「可追蹤」「可行動」三層結構,並讓 Telegram 與 AI Agent 能直接接力執行。
|
||||
- 版本:`governance_event_v1`,適用模組:
|
||||
- `governance_agent.py`(`_alert()`)
|
||||
- `failover_alerter.py`(告警推送)
|
||||
- `ai_slo_watchdog_job.py`(META 告警)
|
||||
- JSON Schema:[docs/schemas/governance_event_v1.schema.json](docs/schemas/governance_event_v1.schema.json)
|
||||
|
||||
### 1) 通用 Schema
|
||||
|
||||
@@ -177,8 +178,8 @@ last_modified_by: Codex
|
||||
| `trust_drift` | `governance_agent.check_trust_drift` | 風險警示時 `warning`;未超標可不推送 | `auto_deprecated_count/ids`, `playbook_ids` |
|
||||
| `knowledge_degradation` | `governance_agent.check_knowledge_degradation` | 過比例時 `warning` | `next_action=run_kb_growth_healthcheck` |
|
||||
| `governance_slo_data_gap` | `governance_agent.run_self_check` | 所有 SLO metric 無 emit 時 `warning` | `next_action=run_adr100_slo_emit_playbook` |
|
||||
| `llm_hallucination` | `governance_agent.check_llm_hallucination` | 舊有 payload(待重構) |
|
||||
| `execution_blast_radius` | `governance_agent.check_execution_blast_radius` | 舊有 payload(待重構) |
|
||||
| `llm_hallucination` | `governance_agent.check_llm_hallucination` | 過比例時 `warning` | `next_action=run_knowledge_gap_audit` |
|
||||
| `execution_blast_radius` | `governance_agent.check_execution_blast_radius` | 過比例時 `warning` | `next_action=pause_auto_repair_for_top_failing_playbooks` |
|
||||
| `governance_slo_<metric>_violation` | `governance_agent.check_slo_compliance` | `status=violation` | `next_action=trigger_flywheel_safeguard` |
|
||||
| `slo_*`/`governance_*` | 其他治理事件 | 按事件需求保留最小欄位但建議同 schema | — |
|
||||
|
||||
@@ -190,4 +191,56 @@ last_modified_by: Codex
|
||||
- `impact`
|
||||
- `remediation`
|
||||
- `actionable`
|
||||
- W-1~W-6 自健診(`ai_slo_watchdog_job.py`)以 `system_impact` 明確標示異常 KPI 數量與檢查區間,避免 `W-6` 漏報文案誤解。
|
||||
|
||||
- W-1~W-6 自健診(`ai_slo_watchdog_job.py`)以 `system_impact` 明確列出異常 KPI 與序號清單,避免 `W-6` 漏報文案誤解。
|
||||
|
||||
### 4) 快速 Sample(供 AGENT/Parser 套件直接接力)
|
||||
|
||||
```json
|
||||
{
|
||||
"event_type": "trust_drift",
|
||||
"status": "warning",
|
||||
"impact": {
|
||||
"drifted_count": 4,
|
||||
"total_playbooks": 26,
|
||||
"drift_ratio": 0.153,
|
||||
"auto_deprecated_count": 0,
|
||||
"auto_deprecated_ids": [],
|
||||
"playbook_ids": [
|
||||
"PB-20260501-27910D",
|
||||
"PB-COLD-745C00B9",
|
||||
"PB-20260405-1CF853",
|
||||
"PB-20260409-B66B1A"
|
||||
]
|
||||
},
|
||||
"remediation": {
|
||||
"next_action": "review_trust_drift_candidates",
|
||||
"items": [
|
||||
"確認各 playbook 最近 14 天執行結果是否含高失敗/高重試",
|
||||
"必要時啟用 trial auto-deprecate"
|
||||
]
|
||||
},
|
||||
"actionable": {
|
||||
"items": [
|
||||
"可自動註記可降級清單",
|
||||
"可自動生成 approval-free dry-run 回放報告"
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 5) 事件處理路徑(非人肉清單)
|
||||
|
||||
- `trust_drift`:
|
||||
- 立即:保留低信任但新近使用 Playbook 清單,輸出 `playbook_ids`
|
||||
- 自動:`AUTO_DEPRECATED` 當日 30 天內未更新的 Playbook 自動降級
|
||||
- 人工:人工覆核 playbook 風險,決定是否 rollback
|
||||
- `knowledge_degradation`:
|
||||
- 自動:觸發 `run_kb_growth_healthcheck`
|
||||
- 續接:`playbook_evidence` / `kb_rot_cleaner` 補齊缺口
|
||||
- `governance_slo_data_gap`:
|
||||
- 自動:`run_adr100_slo_emit_playbook`
|
||||
- 檢查:所有 API Pod 是否已掛載 `PROMETHEUS_MULTIPROC_DIR`,Prometheus rules 已載入
|
||||
- `governance_slo_*_violation`:
|
||||
- 自動:暫停高風險 auto-repair 路徑(`flywheel safeguard`)
|
||||
- 人工:review 最近 1 小時 self-check 失敗樣本
|
||||
|
||||
@@ -22,6 +22,19 @@
|
||||
### 驗證
|
||||
- 代碼改動已在上一輪 commit 寫入(含 `governance_agent.py`、`ai_slo_watchdog_job.py`、`webhooks.py`)並推送到 `gitea main`。
|
||||
|
||||
## 2026-05-02 | AI治理報告可讀性與自動化收斂完成(本輪)
|
||||
|
||||
### 完成
|
||||
- 將 `governance_agent.py` 告警 payload 升級為**雙軌輸出**:
|
||||
- 保留現有扁平欄位(便於既有告警消費者);
|
||||
- 同步補齊 `status / impact / remediation / actionable` 結構。
|
||||
- 讓 `FailoverAlerter.alert_governance` 直接輸出「影響 / 修復方向 / 可直接自動化」三區塊,其餘未結構化欄位收斂至「欄位快覽(備援)」區塊,取代原本雜亂的 Key=Value 全量列,提升 Telegram 一眼可讀性。
|
||||
- `ai_slo_watchdog_job.py` 重組 `W-1~W-6` 異常文案,加入 `system_impact` 明細與嚴重度自動分流(warning/critical)。
|
||||
- 新增機讀 schema:`docs/schemas/governance_event_v1.schema.json`,並在 `docs/12-agent-game-rules.md` 補齊告警範例與事件對應自動化路徑。
|
||||
|
||||
### 影響
|
||||
- `trust_drift` / `knowledge_degradation` / `governance_slo_data_gap` 的告警不再只像「字串摘要」,可直接交給 Agent 判斷下一步行動。
|
||||
|
||||
## 2026-05-02 | trust_drift 飛輪自治:低信任未使用 playbook 自動 deprecate
|
||||
|
||||
承接統帥對 governance 類告警的全面授權。trust_drift 過去只發 Telegram 告警,4 個低信任 playbook 一直在告警表內噴噪音。
|
||||
|
||||
72
docs/schemas/governance_event_v1.schema.json
Normal file
72
docs/schemas/governance_event_v1.schema.json
Normal file
@@ -0,0 +1,72 @@
|
||||
{
|
||||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "urn:awoooi:governance-event-v1",
|
||||
"title": "AWOOOI Governance Event (v1)",
|
||||
"type": "object",
|
||||
"required": [
|
||||
"status",
|
||||
"impact",
|
||||
"remediation",
|
||||
"actionable"
|
||||
],
|
||||
"properties": {
|
||||
"status": {
|
||||
"type": "string",
|
||||
"enum": ["info", "warning", "critical", "violation"]
|
||||
},
|
||||
"impact": {
|
||||
"type": "object",
|
||||
"additionalProperties": true,
|
||||
"minProperties": 1
|
||||
},
|
||||
"remediation": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"items": {
|
||||
"type": "array",
|
||||
"items": { "type": "string" }
|
||||
},
|
||||
"next_action": {
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
},
|
||||
"hint": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"additionalProperties": true
|
||||
},
|
||||
"actionable": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"items": {
|
||||
"type": "array",
|
||||
"items": { "type": "string" }
|
||||
}
|
||||
},
|
||||
"additionalProperties": true
|
||||
},
|
||||
"drifted_count": { "type": "integer", "minimum": 0 },
|
||||
"total_playbooks": { "type": "integer", "minimum": 0 },
|
||||
"auto_deprecated_count": { "type": "integer", "minimum": 0 },
|
||||
"auto_deprecated_ids": {
|
||||
"type": "array",
|
||||
"items": { "type": "string" }
|
||||
},
|
||||
"playbook_ids": {
|
||||
"type": "array",
|
||||
"items": { "type": "string" }
|
||||
},
|
||||
"stale_count": { "type": "integer", "minimum": 0 },
|
||||
"total_count": { "type": "integer", "minimum": 0 },
|
||||
"stale_ratio": { "type": "number", "minimum": 0 },
|
||||
"stale_days": { "type": "integer", "minimum": 1 },
|
||||
"threshold": { "type": "number", "minimum": 0 },
|
||||
"failed_count": { "type": "integer", "minimum": 0 },
|
||||
"total_checked": { "type": "integer", "minimum": 0 },
|
||||
"total_executions": { "type": "integer", "minimum": 0 },
|
||||
"failure_rate": { "type": "number", "minimum": 0, "maximum": 1 },
|
||||
"hallucination_rate": { "type": "number", "minimum": 0, "maximum": 1 }
|
||||
},
|
||||
"additionalProperties": true
|
||||
}
|
||||
Reference in New Issue
Block a user