All checks were successful
Code Review / ai-code-review (push) Successful in 11s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 22s
CD Pipeline / tests (push) Successful in 1m6s
CD Pipeline / build-and-deploy (push) Successful in 5m17s
CD Pipeline / post-deploy-checks (push) Successful in 1m38s
780 lines
34 KiB
Python
780 lines
34 KiB
Python
"""AI 自我治理 Agent
|
||
|
||
五項自檢,每 1 小時執行一次:
|
||
1. trust_drift — Playbook trust_score < 0.2 → 告警建議廢棄
|
||
2. knowledge_degradation — KM 7 天未更新 > 20% 總量 → 告警知識衰退
|
||
3. llm_hallucination — 近 100 筆 evidence verification_result=failed 比例 > 10%
|
||
4. execution_blast_radius — 近 100 筆 auto_repair_executions.success=False 比例 > 15%
|
||
5. slo_compliance — 4 個 SLO 合規性檢查(ADR-100),違反時降級飛輪行為
|
||
|
||
所有 check 互相隔離(try/except),任一失敗不阻斷其他項目。
|
||
|
||
2026-04-26 P2.2 by Claude
|
||
2026-04-27 P3.4 by Claude — 新增 SLO 合規性自檢(ADR-100)
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
from datetime import timedelta
|
||
from typing import Any
|
||
|
||
import structlog
|
||
from sqlalchemy import func, select
|
||
|
||
from src.db.base import get_db_context
|
||
from src.db.models import (
|
||
AiGovernanceEvent,
|
||
AutoRepairExecution,
|
||
IncidentEvidence,
|
||
KnowledgeEntryRecord,
|
||
PlaybookRecord,
|
||
)
|
||
from src.models.knowledge import EntryStatus
|
||
from src.utils.timezone import now_taipei
|
||
|
||
logger = structlog.get_logger(__name__)

# =============================================================================
# Threshold constants
# =============================================================================
TRUST_DRIFT_THRESHOLD = 0.2  # alert when a playbook's trust_score is below this value
# 2026-05-02 ogt + Claude Sonnet 4.6: trust_drift auto-deprecate
# trust < 0.2 AND (last used more than N days ago, OR never used and created
# more than N days ago) -> automatically deprecate.
# N is 30 days so a playbook gets a generous trial period and a new proposal is
# not retired because of a handful of early failures.
TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS = 30
KM_STALE_DAYS = 7  # a knowledge entry not updated for this many days counts as stale
KM_STALE_RATIO = 0.20  # alert when the stale share exceeds this ratio
HALLUCINATION_RATE_THRESHOLD = 0.10  # alert when the share of failed LLM verifications exceeds this
EXECUTION_FAIL_RATE_THRESHOLD = 0.15  # alert when the share of failed executions exceeds this
RECENT_LIMIT = 100  # number of most recent rows used for the rate statistics
|
||
|
||
|
||
# =============================================================================
|
||
# GovernanceAgent
|
||
# =============================================================================
|
||
|
||
class GovernanceAgent:
|
||
"""AI 自我治理 Agent — 5 項自檢 + 1h 排程
|
||
|
||
1-4: trust_drift / knowledge_degradation / llm_hallucination / execution_blast_radius
|
||
5: slo_compliance(ADR-100 SLO 合規性)
|
||
|
||
2026-04-26 P2.2 by Claude
|
||
2026-04-27 P3.4 by Claude — 加入第 5 項 slo_compliance
|
||
"""
|
||
|
||
def __init__(self, alerter=None) -> None:
    """Create the agent, optionally with an injected alerter.

    Args:
        alerter: Object exposing an awaitable
            ``alert_governance(event_type, payload)``. When left as ``None``
            the alerter is looked up through ``get_failover_alerter()`` each
            time an alert is sent.
    """
    self._alerter = alerter
|
||
|
||
# =========================================================================
|
||
# 1. Playbook 信任度漂移
|
||
# =========================================================================
|
||
|
||
async def check_trust_drift(self, emit_alert: bool = True) -> dict[str, Any]:
    """Detect playbooks whose trust score has drifted below the threshold.

    Loads every playbook whose status is neither ``deprecated`` nor
    ``archived`` and treats those with ``trust_score`` below
    ``TRUST_DRIFT_THRESHOLD`` as drifted. A drifted playbook is switched to
    ``deprecated`` when its reference time (``last_used_at``, or
    ``created_at`` if it was never used) is older than
    ``TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS`` days; the others are kept and
    listed in the alert for manual review.

    Args:
        emit_alert: When False the ``trust_drift`` alert is skipped; the
            auto-deprecation and the returned statistics are unaffected.
            Added 2026-05-05 so the W-6 watchdog can read the statistics
            without producing a second Telegram message.

    Returns:
        Dict with ``checked``, ``drifted``, ``drift_ratio`` (unrounded),
        ``auto_deprecated`` and ``kept``.

    History: 2026-04-26 P2.2; 2026-05-02 auto-deprecate path added.
    """
    deprecated_ids: list[str] = []
    retained_ids: list[str] = []

    async with get_db_context() as session:
        active_query = select(PlaybookRecord).where(
            PlaybookRecord.status.not_in(["deprecated", "archived"])
        )
        playbooks = (await session.execute(active_query)).scalars().all()

        total = len(playbooks)
        low_trust = [pb for pb in playbooks if float(pb.trust_score) < TRUST_DRIFT_THRESHOLD]

        # Anything last touched before this instant is eligible for auto-deprecation.
        # NOTE(review): assumes last_used_at / created_at compare cleanly with
        # now_taipei() (same tz-awareness) — confirm against the model.
        stale_before = now_taipei() - timedelta(days=TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS)
        for pb in low_trust:
            # Never-used playbooks fall back to their creation time.
            reference = pb.created_at if pb.last_used_at is None else pb.last_used_at
            if reference is None or reference >= stale_before:
                retained_ids.append(pb.playbook_id)
                continue
            pb.status = "deprecated"
            deprecated_ids.append(pb.playbook_id)

        # 2026-05-02 bug fix (P0 silent failure): the commit must happen while
        # the session is still open. Committing after the context manager had
        # closed the session raised InvalidRequestError, which the caller's
        # try/except swallowed.
        if deprecated_ids:
            await session.commit()
            logger.info(
                "governance_trust_drift_auto_deprecated",
                count=len(deprecated_ids),
                ids=deprecated_ids[:10],
            )

    drifted_count = len(low_trust)
    deprecated_count = len(deprecated_ids)
    drift_ratio = drifted_count / total if total > 0 else 0.0

    if low_trust and emit_alert:
        impact = {
            "drifted_count": drifted_count,
            "total_playbooks": total,
            "drift_ratio": round(drift_ratio, 3),
            "threshold": TRUST_DRIFT_THRESHOLD,
            "auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS,
        }
        remediation = {
            "items": [
                "Auto-deprecate low-trust stale playbooks",
                "Review candidate playbooks by impact scope and rollback if needed",
            ],
            "auto_deprecated_count": deprecated_count,
            "auto_deprecated_ids": deprecated_ids[:10],
        }
        actionable = {
            "items": [
                "立即補齊 PLAYBOOK_SOURCE 與 playbook_metadata",
                "必要時人工覆核 kept_ids 中的高風險 Playbook",
            ],
            "sample_playbook_ids": retained_ids[:10],
        }
        await self._alert(
            "trust_drift",
            {
                "status": "warning",
                "impact": impact,
                "remediation": remediation,
                "actionable": actionable,
                # Flat copies of the headline numbers, kept alongside the
                # structured sections above.
                "drifted_count": drifted_count,
                "auto_deprecated_count": deprecated_count,
                "auto_deprecated_ids": deprecated_ids[:10],
                "playbook_ids": retained_ids[:10],
                "total_playbooks": total,
                "threshold": TRUST_DRIFT_THRESHOLD,
                "auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS,
            },
        )

    logger.info(
        "governance_trust_drift_checked",
        total=total,
        drifted=drifted_count,
        auto_deprecated=deprecated_count,
        kept=len(retained_ids),
    )
    return {
        "checked": total,
        "drifted": drifted_count,
        "drift_ratio": drift_ratio,
        "auto_deprecated": deprecated_count,
        "kept": len(retained_ids),
    }
|
||
|
||
# =========================================================================
|
||
# 2. 知識庫衰退
|
||
# =========================================================================
|
||
|
||
async def check_knowledge_degradation(self) -> dict[str, Any]:
    """Alert when too large a share of the knowledge base has gone stale.

    Counts non-archived knowledge entries, then those among them whose
    ``updated_at`` is older than ``KM_STALE_DAYS`` days. If the stale share
    exceeds ``KM_STALE_RATIO`` a ``knowledge_degradation`` alert is sent.

    Returns:
        Dict with ``total``, ``stale`` and ``ratio`` (rounded to 3 places).

    History: 2026-04-26 P2.2.
    """
    stale_cutoff = now_taipei() - timedelta(days=KM_STALE_DAYS)
    not_archived = KnowledgeEntryRecord.status != EntryStatus.ARCHIVED

    async with get_db_context() as session:
        # All entries that are not archived.
        total_query = (
            select(func.count())
            .select_from(KnowledgeEntryRecord)
            .where(not_archived)
        )
        total = (await session.execute(total_query)).scalar() or 0

        # Non-archived entries last updated before the cutoff.
        stale_query = (
            select(func.count())
            .select_from(KnowledgeEntryRecord)
            .where(
                KnowledgeEntryRecord.status != EntryStatus.ARCHIVED,
                KnowledgeEntryRecord.updated_at < stale_cutoff,
            )
        )
        stale = (await session.execute(stale_query)).scalar() or 0

    ratio = stale / total if total > 0 else 0.0
    ratio_3 = round(ratio, 3)

    if total > 0 and ratio > KM_STALE_RATIO:
        impact = {
            "stale_count": stale,
            "total_count": total,
            "stale_ratio": ratio_3,
            "threshold": KM_STALE_RATIO,
            "stale_days": KM_STALE_DAYS,
        }
        await self._alert(
            "knowledge_degradation",
            {
                "status": "warning",
                "impact": impact,
                "remediation": {
                    "items": [
                        "啟動 KM 反查與自動補齊流程",
                        "關鍵服務告警自動同步到 KM 任務,補齊缺失條目",
                    ],
                    "next_action": "run_kb_growth_healthcheck",
                },
                "actionable": {
                    "items": [
                        "每日檢查 ANTI_PATTERN 更新結果",
                        "安排至少 2 位 owner 對 stale條目做快速人工審核",
                    ],
                },
                # The same five figures are repeated at the top level.
                **impact,
            },
        )

    logger.info(
        "governance_knowledge_degradation_checked",
        total=total,
        stale=stale,
        ratio=ratio_3,
    )
    return {"total": total, "stale": stale, "ratio": ratio_3}
|
||
|
||
# =========================================================================
|
||
# 3. LLM 幻覺率
|
||
# =========================================================================
|
||
|
||
async def check_llm_hallucination(self) -> dict[str, Any]:
    """Alert when recent evidence verifications fail too often.

    Reads the ``verification_result`` of the latest ``RECENT_LIMIT``
    ``IncidentEvidence`` rows that have one (newest ``collected_at`` first).
    Per the original note the value may be success / degraded / failed /
    timeout; only ``"failed"`` is counted as a hallucination. A rate above
    ``HALLUCINATION_RATE_THRESHOLD`` sends an ``llm_hallucination`` alert.

    Returns:
        Dict with ``total``, ``failed`` and ``rate`` (rounded to 3 places);
        all zeros when there are no rows to evaluate.

    History: 2026-04-26 P2.2.
    """
    recent_query = (
        select(IncidentEvidence.verification_result)
        .where(IncidentEvidence.verification_result.is_not(None))
        .order_by(IncidentEvidence.collected_at.desc())
        .limit(RECENT_LIMIT)
    )
    async with get_db_context() as session:
        outcomes = (await session.execute(recent_query)).scalars().all()

    total = len(outcomes)
    if not total:
        logger.info("governance_hallucination_checked", total=0, rate=0.0)
        return {"total": 0, "failed": 0, "rate": 0.0}

    failed = sum(1 for outcome in outcomes if outcome == "failed")
    rate = failed / total
    rate_3 = round(rate, 3)

    if rate > HALLUCINATION_RATE_THRESHOLD:
        impact = {
            "failed_count": failed,
            "total_checked": total,
            "hallucination_rate": rate_3,
            "threshold": HALLUCINATION_RATE_THRESHOLD,
        }
        await self._alert(
            "llm_hallucination",
            {
                "status": "warning",
                "impact": impact,
                "remediation": {
                    "items": [
                        "檢核 AI 建議來源與 evidence snapshot 一致性",
                        "檢視最近 incident 的 verifier 輸入欄位是否缺失關鍵上下文",
                    ],
                    "next_action": "run_knowledge_gap_audit",
                    "hint": "高失敗率通常表示 evidence 收斂流程退化或資料欄位解讀偏差",
                },
                "actionable": {
                    "items": [
                        "啟動 `playbook_evidence` 對齊補償流程",
                        "調整 verify timeout 與降級策略,避免過度信任低品質證據",
                    ],
                },
                # The same four figures are repeated at the top level.
                **impact,
            },
        )

    logger.info(
        "governance_hallucination_checked",
        total=total,
        failed=failed,
        rate=rate_3,
    )
    return {"total": total, "failed": failed, "rate": rate_3}
|
||
|
||
# =========================================================================
|
||
# 4. 執行失敗率 (Blast Radius)
|
||
# =========================================================================
|
||
|
||
async def check_execution_blast_radius(self) -> dict[str, Any]:
    """Alert when recent auto-repair executions fail too often.

    Reads ``success`` for the latest ``RECENT_LIMIT`` ``AutoRepairExecution``
    rows (newest ``created_at`` first) and sends an
    ``execution_blast_radius`` alert when the failure rate exceeds
    ``EXECUTION_FAIL_RATE_THRESHOLD``.

    Returns:
        Dict with ``total``, ``failed`` and ``rate`` (rounded to 3 places);
        all zeros when there are no executions.

    History: 2026-04-26 P2.2.
    """
    recent_query = (
        select(AutoRepairExecution.success)
        .order_by(AutoRepairExecution.created_at.desc())
        .limit(RECENT_LIMIT)
    )
    async with get_db_context() as session:
        outcomes = (await session.execute(recent_query)).scalars().all()

    total = len(outcomes)
    if not total:
        logger.info("governance_blast_radius_checked", total=0, rate=0.0)
        return {"total": 0, "failed": 0, "rate": 0.0}

    # Every falsy value counts as a failure.
    # NOTE(review): that includes None — if `success` is nullable for
    # executions still in flight they are counted as failed; confirm intent.
    failed = sum(1 for succeeded in outcomes if not succeeded)
    rate = failed / total
    rate_3 = round(rate, 3)

    if rate > EXECUTION_FAIL_RATE_THRESHOLD:
        impact = {
            "failed_count": failed,
            "total_executions": total,
            "failure_rate": rate_3,
            "threshold": EXECUTION_FAIL_RATE_THRESHOLD,
        }
        await self._alert(
            "execution_blast_radius",
            {
                "status": "warning",
                "impact": impact,
                "remediation": {
                    "items": [
                        "鎖定失敗 playbook 清單,關閉高風險自動執行",
                        "比對 incident evidence 與 post_execution_verification 失敗原因",
                    ],
                    "next_action": "pause_auto_repair_for_top_failing_playbooks",
                    "hint": "可能是 auto_repair_playbook 與 runtime 版本/環境脫節",
                },
                "actionable": {
                    "items": [
                        "跑 `run_self_check` 快照與失敗 playbook 熱點報表",
                        "必要時啟用 emergency fallback 路由進人工審核",
                    ],
                },
                # The same four figures are repeated at the top level.
                **impact,
            },
        )

    logger.info(
        "governance_blast_radius_checked",
        total=total,
        failed=failed,
        rate=rate_3,
    )
    return {"total": total, "failed": failed, "rate": rate_3}
|
||
|
||
# =========================================================================
|
||
# 5. SLO 合規性(ADR-100)
|
||
# =========================================================================
|
||
|
||
async def check_slo_compliance(self) -> dict[str, Any]:
    """Compare the four ADR-100 SLIs in Prometheus with their hard red lines.

    Each SLI is fetched with an instant query against
    ``{PROMETHEUS_URL}/api/v1/query`` and classified as:

    * ``violated`` - value below the hard red line; an
      ``slo_<name>_violation`` alert is sent through ``_alert``.
    * ``ok`` - value at or above the hard red line.
    * ``skipped`` - empty result or a NaN/Inf sample. Missing data is not
      treated as 0, which would always raise a false violation.
    * ``error`` - Prometheus answered with a non-success status, or the
      request/parsing raised.

    Hard red lines: autonomy_rate < 0.70, decision_accuracy < 0.85,
    confidence_calibration < 0.70, km_growth_rate < 5.

    Returns:
        Dict keyed by SLO name with the per-SLO record, plus ``_meta`` holding
        the per-status counts, ``all_status``, ``all_skipped`` and an overall
        ``status``: ``violated`` if any SLO is violated; ``no_data`` if
        nothing was ok/violated and at least one was skipped; ``error`` if
        nothing was ok/violated/skipped and at least one query errored;
        otherwise ``ok``.

    History: 2026-04-27 P3.4 (ADR-100); 2026-04-28 empty-result false-alarm
    fix; 2026-04-29 ``_meta`` aggregate added.
    """
    import math
    from collections import Counter

    import httpx

    from src.core.config import settings

    prom_url = getattr(settings, "PROMETHEUS_URL", "http://prometheus.observability.svc:9090")

    queries: dict[str, str] = {
        "autonomy_rate": "sli:autonomy_rate:5m",
        "decision_accuracy": "sli:decision_accuracy:5m",
        "confidence_calibration": "sli:confidence_calibration:1h",
        "km_growth_rate": "max(knowledge_entries_created_24h) or max(sli:km_growth_rate:24h)",
    }
    # Hard red lines: a value below these must raise an alert (not a soft warning).
    hard_red_lines: dict[str, float] = {
        "autonomy_rate": 0.70,
        "decision_accuracy": 0.85,
        "confidence_calibration": 0.70,
        "km_growth_rate": 5.0,
    }
    # SLO targets, recorded in results and logs only.
    slo_targets: dict[str, float] = {
        "autonomy_rate": 0.80,
        "decision_accuracy": 0.90,
        "confidence_calibration": 0.80,
        "km_growth_rate": 20.0,
    }

    results: dict[str, Any] = {}

    async with httpx.AsyncClient(timeout=5.0) as client:
        for name, query in queries.items():
            # One try per SLO so a failing query cannot abort the others.
            try:
                resp = await client.get(
                    f"{prom_url}/api/v1/query",
                    params={"query": query},
                )
                data = resp.json()

                if data.get("status") != "success":
                    results[name] = {
                        "name": name,
                        "status": "error",
                        "error": "prometheus_query_failed",
                        "response_status": data.get("status"),
                    }
                    logger.warning(
                        "governance_slo_prometheus_error",
                        slo=name,
                        query=query,
                        response_status=data.get("status"),
                    )
                    continue

                result_list = data.get("data", {}).get("result", [])
                # 2026-04-28 P0-1 false-alarm fix: an empty result means
                # Prometheus has no data (metric not emitted / rule not
                # deployed), not that the SLO is 0. Falling back to 0.0 would
                # always report a violation.
                if not result_list:
                    results[name] = {
                        "name": name,
                        "status": "skipped",
                        "error": "no_data",
                        "reason": "prometheus_empty_result_metric_not_emitted",
                        "hint": "ADR-100 emitter 未輸出、Prometheus recording rule 未載入,或 multiprocess 目錄未掛載",
                    }
                    logger.warning(
                        "governance_slo_no_data",
                        slo=name,
                        query=query,
                        hint="ADR-100 metrics, recording rules, or multiprocess mount not ready",
                    )
                    continue

                # The sample value is read from index 1 of the "value" pair.
                raw_value = result_list[0]["value"][1]
                value = float(raw_value)
                if not math.isfinite(value):
                    results[name] = {
                        "name": name,
                        "status": "skipped",
                        "error": "non_finite_value",
                        "reason": "prometheus_nan_or_inf",
                        "hint": "SLO 分母目前沒有足夠事件,等待下一個有效樣本再評估",
                    }
                    logger.warning(
                        "governance_slo_non_finite",
                        slo=name,
                        query=query,
                        value=str(raw_value),
                    )
                    continue

                threshold = hard_red_lines[name]
                target = slo_targets[name]
                violated = value < threshold

                results[name] = {
                    "name": name,
                    "status": "violated" if violated else "ok",
                    "value": round(value, 4),
                    "slo_target": target,
                    "hard_red_line": threshold,
                    # Distance below the red line when violated, otherwise
                    # distance relative to the target.
                    "gap": round(threshold - value, 4) if violated else round(value - target, 4),
                    "violated": violated,
                }

                if violated:
                    await self._alert(
                        f"slo_{name}_violation",
                        {
                            "status": "violation",
                            "impact": {
                                "name": name,
                                "value": round(value, 4),
                                "target": target,
                                "threshold": threshold,
                                "gap": round(threshold - value, 4),
                            },
                            "remediation": {
                                "items": [
                                    "Pause auto-scaling or risky auto-fix tasks",
                                    "Review evidence/decision traces and adjust policy thresholds",
                                ],
                                "next_action": "trigger_flywheel_safeguard",
                            },
                            "actionable": {
                                "items": [
                                    "Check verifier lag and post-exec learning health",
                                    "Run emergency incident audit on failed approvals",
                                ],
                            },
                        },
                    )
                    logger.warning(
                        "governance_slo_violated",
                        slo=name,
                        value=round(value, 4),
                        hard_red_line=threshold,
                    )
                elif value == 0 and threshold <= 0:
                    # Only reachable if a red line is configured as <= 0.
                    logger.warning(
                        "governance_slo_unexpected_zero",
                        slo=name,
                        value=round(value, 4),
                    )
                else:
                    logger.info(
                        "governance_slo_ok",
                        slo=name,
                        value=round(value, 4),
                        target=target,
                    )
            except Exception as e:
                results[name] = {
                    "name": name,
                    "status": "error",
                    "error": str(e),
                }
                logger.warning("governance_slo_check_error", slo=name, error=str(e))

    # 2026-04-29 critic M6: aggregate into _meta so that "everything skipped"
    # (metrics not emitted) is distinguishable from "everything ok", and a
    # dashboard does not show missing data as a pass.
    status_counts = Counter(
        entry.get("status") for entry in results.values() if isinstance(entry, dict)
    )
    violated_count = status_counts["violated"]
    skipped_count = status_counts["skipped"]
    ok_count = status_counts["ok"]
    error_count = status_counts["error"]

    if violated_count > 0:
        overall_status = "violated"
    elif skipped_count > 0 and ok_count == 0:
        overall_status = "no_data"
    elif error_count > 0 and ok_count == 0:
        # Every query failed: nothing was evaluated, so this must not be
        # reported as "ok".
        overall_status = "error"
    else:
        overall_status = "ok"

    results["_meta"] = {
        "violated_count": violated_count,
        "skipped_count": skipped_count,
        "ok_count": ok_count,
        "error_count": error_count,
        "all_status": sorted(status_counts),
        "all_skipped": skipped_count > 0 and ok_count == 0 and violated_count == 0,
        "status": overall_status,
    }
    logger.info(
        "governance_slo_compliance_complete",
        results=results,
        violated=violated_count,
        skipped=skipped_count,
        ok=ok_count,
        status=results["_meta"]["status"],
    )
    return results
|
||
|
||
# =========================================================================
|
||
# 全跑(exception 隔離)
|
||
# =========================================================================
|
||
|
||
async def run_self_check(self) -> dict[str, Any]:
    """Run every governance check, isolating failures from one another.

    Each check runs inside its own try/except; a check that raises is logged
    and recorded as ``{"error": "<message>"}`` without stopping the rest.
    Afterwards two aggregate alerts may be sent:

    * ``governance_self_failure`` (critical) when three or more checks
      recorded an error, meaning the governance mechanism itself is failing.
    * ``governance_slo_data_gap`` (warning) when the SLO check reports
      ``_meta["all_skipped"]``, i.e. data is missing rather than the
      mechanism being broken. It is a separate alert so it does not count
      towards the self-failure total.

    Failures while sending either aggregate alert are logged and swallowed.

    Returns:
        Dict mapping check name to that check's result (or error record).

    History: 2026-04-26 P2.2; 2026-04-27 P3.4 added slo_compliance;
    2026-04-27 Wave8-X3 aggregate failure alert; 2026-04-29 SLO data-gap alert.
    """
    results: dict[str, Any] = {}
    checks = [
        ("trust_drift", self.check_trust_drift),
        ("knowledge_degradation", self.check_knowledge_degradation),
        ("llm_hallucination", self.check_llm_hallucination),
        ("execution_blast_radius", self.check_execution_blast_radius),
        ("slo_compliance", self.check_slo_compliance),
    ]

    for check_name, check_func in checks:
        try:
            results[check_name] = await check_func()
        except Exception as e:
            logger.exception(
                "governance_check_failed",
                check=check_name,
                error=str(e),
            )
            results[check_name] = {"error": str(e)}

    # A check failed if its result carries a top-level "error" key.
    failed_checks = [k for k, v in results.items() if isinstance(v, dict) and "error" in v]
    if len(failed_checks) >= 3:
        try:
            await self._alert(
                "governance_self_failure",
                {
                    "status": "critical",
                    "impact": {
                        "failed_checks": failed_checks,
                        # Derived from the list above so it cannot fall out
                        # of sync when a check is added or removed.
                        "total_checks": len(checks),
                        "errors": {k: results[k].get("error") for k in failed_checks},
                    },
                    "remediation": {
                        "items": [
                            "暫停非關鍵治理自動化接收鏈路",
                            "聚焦治理執行路徑錯誤並補齊 fallback",
                        ],
                        "next_action": "investigate_governance_pipeline_health",
                    },
                    "actionable": {
                        "items": [
                            "檢查 GovernanceAgent run loop 是否完整執行 5 個項目",
                            "確認 DB 寫入與 Prometheus fetch 未被上游干擾",
                        ],
                    },
                },
            )
        except Exception:
            logger.exception("governance_self_failure_alert_failed")

    # The SLO result only has _meta when the check itself completed.
    slo_result = results.get("slo_compliance")
    slo_meta = slo_result.get("_meta") if isinstance(slo_result, dict) else None
    if slo_meta and slo_meta.get("all_skipped"):
        try:
            await self._alert(
                "governance_slo_data_gap",
                {
                    "status": "warning",
                    "impact": {
                        "reason": "all_slo_metrics_not_emitted",
                        "skipped_count": slo_meta.get("skipped_count", 0),
                        "all_slo_metrics_not_emitted": True,
                    },
                    "remediation": {
                        "items": [
                            "補齊 ADR-100 SLO emitter(automation_operation_log_total / post_execution_verification_total / knowledge_entries_total)",
                            "確認 Prometheus recording rules 已載入,且 API Pod multiprocess 目錄可寫",
                        ],
                        "next_action": "run_adr100_slo_emit_playbook",
                        "hint": "ADR-100 emitter、Prometheus recording rules、或 multiprocess 目錄任一環節未就緒",
                    },
                    "actionable": {
                        "items": [
                            "先確認 /metrics 是否已輸出 ADR-100 底層指標",
                            "檢查 Prometheus rule 是否已載入 sli:autonomy_rate:5m 等 4 項告警規則",
                        ],
                    },
                },
            )
        except Exception:
            logger.exception("governance_slo_data_gap_alert_failed")

    logger.info("governance_self_check_complete", results=results)
    return results
|
||
|
||
# =========================================================================
|
||
# 告警輸出
|
||
# =========================================================================
|
||
|
||
async def _alert(self, event_type: str, payload: dict[str, Any]) -> None:
    """Persist a governance event, log it, and push it through the alerter.

    Three steps, in order:

    1. Insert an ``AiGovernanceEvent`` row (``event_type``, ``details``).
       Per ADR-085 governance findings must land in the database rather
       than a cache; a failed write is logged and does not stop the rest.
    2. Emit a ``governance_alert`` warning through structlog.
    3. Call ``alert_governance`` on the injected alerter, or on the one
       returned by ``get_failover_alerter()`` when none was injected.
       Lookup or delivery failures are logged and swallowed.

    Args:
        event_type: Identifier stored with the event and passed to the alerter.
        payload: Event details; stored as ``details`` and expanded into the
            log call's keyword arguments.

    History: 2026-04-26 P2.2; 2026-04-26 P2-DB-Fix added the database write.
    """
    # Step 1: database write (best effort).
    try:
        from sqlalchemy import insert as _sa_insert

        async with get_db_context() as session:
            statement = _sa_insert(AiGovernanceEvent).values(
                event_type=event_type,
                details=payload,
            )
            await session.execute(statement)
            await session.commit()
    except Exception as write_error:
        logger.warning("governance_pg_write_failed", error=str(write_error))

    # Step 2: structured log entry (pre-existing behaviour, kept as is).
    # NOTE(review): this call is not guarded, and **payload would raise
    # TypeError if a payload ever carried an "event" or "event_type" key —
    # no payload built in this file does; confirm for external callers.
    logger.warning("governance_alert", event_type=event_type, **payload)

    # Step 3: resolve the alerter lazily to avoid a circular import at startup.
    notifier = self._alerter
    if notifier is None:
        try:
            from src.services.failover_alerter import get_failover_alerter

            notifier = get_failover_alerter()
        except Exception as lookup_error:
            logger.warning("governance_alerter_get_failed", error=str(lookup_error))
            return

    try:
        await notifier.alert_governance(event_type, payload)
    except Exception as send_error:
        logger.warning("governance_telegram_alert_failed", error=str(send_error))
|
||
|
||
|
||
# =============================================================================
|
||
# Singleton + 排程迴圈
|
||
# =============================================================================
|
||
|
||
_agent: GovernanceAgent | None = None


def get_governance_agent() -> GovernanceAgent:
    """Return the module-level GovernanceAgent, creating it on first use.

    History: 2026-04-26 P2.2.
    """
    global _agent
    agent = _agent
    if agent is None:
        # First call (or first call after a reset): build and cache it.
        agent = _agent = GovernanceAgent()
    return agent
|
||
|
||
|
||
def reset_governance_agent() -> None:
    """Clear the cached GovernanceAgent singleton (intended for tests).

    After this call the module-level ``_agent`` is ``None`` again.

    History: 2026-04-26 P2.2.
    """
    global _agent
    _agent = None
|
||
|
||
|
||
async def run_governance_loop(interval_seconds: int = 3600) -> None:
    """Run ``GovernanceAgent.run_self_check()`` forever, sleeping between runs.

    Follows the ``asyncio.create_task`` + sleep loop pattern used in main.py
    (no APScheduler). Each pass runs to completion and then sleeps
    ``interval_seconds``, so runs never overlap or queue up.

    Args:
        interval_seconds: Pause after each pass; defaults to one hour.

    History: 2026-04-26 P2.2.
    """
    governance = get_governance_agent()

    async def _run_once() -> None:
        # An exception from one pass is logged and must not end the loop.
        try:
            await governance.run_self_check()
        except Exception as loop_error:
            logger.warning("governance_loop_error", error=str(loop_error))

    while True:
        await _run_once()
        await asyncio.sleep(interval_seconds)
|