1063 lines
45 KiB
Python
1063 lines
45 KiB
Python
"""AI 自我治理 Agent
|
||
|
||
Five self-checks, run once per hour:
1. trust_drift — Playbook trust_score < 0.2 → alert recommending deprecation
2. knowledge_degradation — KM entries not updated for 7 days > 20% of the total → knowledge-decay alert
3. llm_hallucination — share of verification_result=failed among the latest 100 evidence rows > 10%
4. execution_blast_radius — share of auto_repair_executions.success=False among the latest 100 rows > 15%
5. slo_compliance — compliance check of 5 SLOs (ADR-100); a violation degrades flywheel behaviour

All checks are isolated from each other (try/except); a failure in one does not block the others.

2026-04-26 P2.2 by Claude
2026-04-27 P3.4 by Claude — added the SLO compliance self-check (ADR-100)
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
from datetime import timedelta
|
||
from typing import Any
|
||
|
||
import structlog
|
||
from sqlalchemy import func, select, update
|
||
|
||
from src.db.base import get_db_context
|
||
from src.db.models import (
|
||
AiGovernanceEvent,
|
||
AutoRepairExecution,
|
||
GovernanceRemediationDispatch,
|
||
IncidentEvidence,
|
||
KnowledgeEntryRecord,
|
||
PlaybookRecord,
|
||
generate_uuid,
|
||
)
|
||
from src.models.knowledge import EntryStatus
|
||
from src.repositories.governance_remediation_dispatch_repo import (
|
||
DispatchAlreadyActive,
|
||
create_dispatch,
|
||
)
|
||
from src.utils.timezone import now_taipei
|
||
|
||
logger = structlog.get_logger(__name__)
|
||
|
||
# =============================================================================
# Threshold constants
# =============================================================================

# A playbook whose trust_score is below this value raises an alert.
TRUST_DRIFT_THRESHOLD = 0.2

# 2026-05-02 ogt + Claude Sonnet 4.6: trust_drift auto-deprecate
# trust < 0.2 + (last used more than N days ago OR never used and created more
# than N days ago) → auto-deprecate.
# N is 30 days so a playbook gets a fair trial period and a new proposal is not
# discarded because of a few early failures.
TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS = 30

# A knowledge entry not updated for this many days counts as stale.
KM_STALE_DAYS = 7

# A stale share above this ratio raises an alert.
KM_STALE_RATIO = 0.20

# A share of failed LLM verifications above this ratio raises an alert.
HALLUCINATION_RATE_THRESHOLD = 0.10

# A share of failed executions above this ratio raises an alert.
EXECUTION_FAIL_RATE_THRESHOLD = 0.15

# How many of the most recent rows the rate checks sample.
RECENT_LIMIT = 100

# Redis key used as the cross-replica self-check cycle lease.
GOVERNANCE_SELF_CHECK_LEASE_KEY = "governance:self_check:cycle_lease"
|
||
|
||
|
||
def _slo_remediation_items(name: str) -> list[str]:
|
||
if name == "truth_chain_quality_summary_latency":
|
||
return [
|
||
"Check truth-chain quality summary cache miss latency and DB query plan",
|
||
"Confirm operator summary cache is warm before treating homepage SLO as degraded",
|
||
]
|
||
return [
|
||
"Pause auto-scaling or risky auto-fix tasks",
|
||
"Review evidence/decision traces and adjust policy thresholds",
|
||
]
|
||
|
||
|
||
def _slo_actionable_items(name: str) -> list[str]:
|
||
if name == "truth_chain_quality_summary_latency":
|
||
return [
|
||
"Call /api/v1/platform/truth-chain/quality/summary?limit=8&refresh=true and compare duration",
|
||
"Inspect /metrics for awooop_truth_chain_quality_summary_last_duration_seconds",
|
||
]
|
||
return [
|
||
"Check verifier lag and post-exec learning health",
|
||
"Run emergency incident audit on failed approvals",
|
||
]
|
||
|
||
|
||
def _slo_next_action(name: str) -> str:
|
||
if name == "truth_chain_quality_summary_latency":
|
||
return "run_truth_chain_quality_summary_latency_probe"
|
||
return "trigger_flywheel_safeguard"
|
||
|
||
|
||
# =============================================================================
|
||
# GovernanceAgent
|
||
# =============================================================================
|
||
|
||
class GovernanceAgent:
    """AI self-governance agent — five self-checks, scheduled hourly.

    1-4: trust_drift / knowledge_degradation / llm_hallucination / execution_blast_radius
    5: slo_compliance (ADR-100 SLO compliance)

    2026-04-26 P2.2 by Claude
    2026-04-27 P3.4 by Claude — added the fifth check, slo_compliance
    """

    def __init__(self, alerter=None) -> None:
        # alerter: FailoverAlerter instance. Injectable; when left as None,
        # `_alert` fetches the singleton lazily on first use.
        self._alerter = alerter

    # =========================================================================
    # 1. Playbook trust drift
    # =========================================================================

    async def check_trust_drift(self, emit_alert: bool = True) -> dict[str, Any]:
        """Flag playbooks whose trust_score is below the threshold.

        Playbooks that are neither ``deprecated`` nor ``archived`` are loaded
        and those with ``trust_score < TRUST_DRIFT_THRESHOLD`` are "drifted".
        A drifted playbook is auto-deprecated when its reference time
        (``last_used_at``, or ``created_at`` when it was never used) is older
        than ``TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS``; otherwise it is kept
        and reported for review.

        Args:
            emit_alert: When False, no governance alert is emitted (added
                2026-05-05 so the W-6 watchdog can query the statistics
                without a duplicate Telegram message). Auto-deprecation is
                NOT gated by this flag and still happens.

        Returns:
            Dict with ``checked``, ``drifted``, ``drift_ratio`` (unrounded),
            ``auto_deprecated`` and ``kept`` counts.

        2026-04-26 P2.2 by Claude
        2026-05-02 ogt + Claude Sonnet 4.6: added the auto-deprecate path
        """
        auto_deprecated_ids: list[str] = []
        kept_ids: list[str] = []

        async with get_db_context() as db:
            result = await db.execute(
                select(PlaybookRecord).where(
                    PlaybookRecord.status.not_in(["deprecated", "archived"])
                )
            )
            all_records = result.scalars().all()

            total = len(all_records)
            drifted = [
                record
                for record in all_records
                if float(record.trust_score) < TRUST_DRIFT_THRESHOLD
            ]

            # Auto-deprecate eligibility.
            cutoff = now_taipei() - timedelta(days=TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS)
            for record in drifted:
                last_used = record.last_used_at
                created = record.created_at
                # Never used → created_at stands in as "time it entered the system".
                ref_time = created if last_used is None else last_used
                if ref_time is not None and ref_time < cutoff:
                    record.status = "deprecated"
                    auto_deprecated_ids.append(record.playbook_id)
                else:
                    kept_ids.append(record.playbook_id)

            # 2026-05-02 ogt + Claude Sonnet 4.6 — Bug 1 fix (P0 silent failure):
            # the commit used to run after the `async with` block had exited,
            # raised InvalidRequestError and was swallowed by the caller's
            # try/except. It must run while the session is still open.
            if auto_deprecated_ids:
                await db.commit()
                logger.info(
                    "governance_trust_drift_auto_deprecated",
                    count=len(auto_deprecated_ids),
                    ids=auto_deprecated_ids[:10],
                )

        drifted_count = len(drifted)
        auto_deprecated_count = len(auto_deprecated_ids)
        # Computed once; the alert uses the rounded value, the return value
        # keeps the raw ratio.
        drift_ratio = drifted_count / total if total > 0 else 0.0

        if drifted and emit_alert:
            await self._alert(
                "trust_drift",
                {
                    "status": "warning",
                    "impact": {
                        "drifted_count": drifted_count,
                        "total_playbooks": total,
                        "drift_ratio": round(drift_ratio, 3),
                        "threshold": TRUST_DRIFT_THRESHOLD,
                        "auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS,
                    },
                    "remediation": {
                        "items": [
                            "Auto-deprecate low-trust stale playbooks",
                            "Review candidate playbooks by impact scope and rollback if needed",
                        ],
                        "auto_deprecated_count": auto_deprecated_count,
                        "auto_deprecated_ids": auto_deprecated_ids[:10],
                    },
                    "actionable": {
                        "items": [
                            "立即補齊 PLAYBOOK_SOURCE 與 playbook_metadata",
                            "必要時人工覆核 kept_ids 中的高風險 Playbook",
                        ],
                        "sample_playbook_ids": kept_ids[:10],
                    },
                    "drifted_count": drifted_count,
                    "auto_deprecated_count": auto_deprecated_count,
                    "auto_deprecated_ids": auto_deprecated_ids[:10],
                    "playbook_ids": kept_ids[:10],
                    "total_playbooks": total,
                    "threshold": TRUST_DRIFT_THRESHOLD,
                    "auto_deprecate_after_days": TRUST_DRIFT_AUTO_DEPRECATE_AFTER_DAYS,
                },
            )

        logger.info(
            "governance_trust_drift_checked",
            total=total,
            drifted=drifted_count,
            auto_deprecated=auto_deprecated_count,
            kept=len(kept_ids),
        )
        return {
            "checked": total,
            "drifted": drifted_count,
            "drift_ratio": drift_ratio,
            "auto_deprecated": auto_deprecated_count,
            "kept": len(kept_ids),
        }

    # =========================================================================
    # 2. Knowledge-base degradation
    # =========================================================================

    async def check_knowledge_degradation(self) -> dict[str, Any]:
        """Alert when too large a share of knowledge entries is stale.

        Counts non-archived entries and those whose ``updated_at`` is older
        than ``KM_STALE_DAYS``. When the stale ratio exceeds
        ``KM_STALE_RATIO`` an alert is emitted, unless an owner-review
        dispatch is already open, in which case the alert is suppressed and
        the function returns early. When the ratio is within the threshold
        (and there is at least one entry) open knowledge_degradation events
        are resolved.

        Returns:
            Dict with ``total``, ``stale`` and ``ratio`` (rounded to 3
            places), plus either ``alert_suppressed``/``suppress_reason`` or
            ``resolved_open_events`` depending on the path taken.

        2026-04-26 P2.2 by Claude
        """
        stale_cutoff = now_taipei() - timedelta(days=KM_STALE_DAYS)

        async with get_db_context() as db:
            # Total number of non-archived entries.
            total_result = await db.execute(
                select(func.count()).select_from(KnowledgeEntryRecord).where(
                    KnowledgeEntryRecord.status != EntryStatus.ARCHIVED
                )
            )
            total = total_result.scalar() or 0

            # Non-archived entries whose updated_at is older than the cutoff.
            stale_result = await db.execute(
                select(func.count()).select_from(KnowledgeEntryRecord).where(
                    KnowledgeEntryRecord.status != EntryStatus.ARCHIVED,
                    KnowledgeEntryRecord.updated_at < stale_cutoff,
                )
            )
            stale = stale_result.scalar() or 0

        ratio = stale / total if total > 0 else 0.0
        rounded_ratio = round(ratio, 3)

        if total > 0 and ratio > KM_STALE_RATIO:
            if await _has_open_knowledge_degradation_review():
                logger.info(
                    "governance_knowledge_degradation_alert_suppressed",
                    reason="open_owner_review_exists",
                    total=total,
                    stale=stale,
                    ratio=rounded_ratio,
                )
                return {
                    "total": total,
                    "stale": stale,
                    "ratio": rounded_ratio,
                    "alert_suppressed": True,
                    "suppress_reason": "open_owner_review_exists",
                }
            await self._alert(
                "knowledge_degradation",
                {
                    "status": "warning",
                    "impact": {
                        "stale_count": stale,
                        "total_count": total,
                        "stale_ratio": rounded_ratio,
                        "threshold": KM_STALE_RATIO,
                        "stale_days": KM_STALE_DAYS,
                    },
                    "remediation": {
                        "items": [
                            "啟動 KM 反查與自動補齊流程",
                            "關鍵服務告警自動同步到 KM 任務,補齊缺失條目",
                        ],
                        "next_action": "run_kb_growth_healthcheck",
                    },
                    "ownership": {
                        "lead_agent": "Hermes",
                        "lead_reason": "E7 自動 KM 主責:反查 Incident / Sentry / SigNoz / PlayBook,產生 KM 更新草稿與任務。",
                        "support_agents": [
                            "OpenClaw:提供告警分類、規則匹配與 PlayBook 脈絡摘要,不直接批量改寫 KM。",
                            "ElephantAlpha:read-only 稽核高影響 KM 草稿與風險,不執行寫入或通知。",
                        ],
                        "human_owner": "KM owner / SRE owner",
                        "human_reason": "審核高影響 KM 後才允許寫入,避免 AI 自動固化錯誤知識。",
                    },
                    "actionable": {
                        "items": [
                            "每日檢查 ANTI_PATTERN 更新結果",
                            "安排至少 2 位 owner 對 stale 條目做快速人工審核",
                        ],
                    },
                    "stale_count": stale,
                    "total_count": total,
                    "stale_ratio": rounded_ratio,
                    "threshold": KM_STALE_RATIO,
                    "stale_days": KM_STALE_DAYS,
                },
            )

        logger.info(
            "governance_knowledge_degradation_checked",
            total=total,
            stale=stale,
            ratio=rounded_ratio,
        )
        result: dict[str, Any] = {"total": total, "stale": stale, "ratio": rounded_ratio}
        if total > 0 and ratio <= KM_STALE_RATIO:
            result["resolved_open_events"] = await _resolve_open_knowledge_degradation_events()
        return result

    # =========================================================================
    # 3. LLM hallucination rate
    # =========================================================================

    async def check_llm_hallucination(self) -> dict[str, Any]:
        """Alert when too many recent evidence verifications failed.

        Samples the latest ``RECENT_LIMIT`` IncidentEvidence rows that have a
        ``verification_result`` and alerts when the share equal to
        ``"failed"`` exceeds ``HALLUCINATION_RATE_THRESHOLD``.

        Per the original notes, verification_result may be
        success / degraded / failed / timeout, and only ``failed`` is treated
        as a hallucination.

        Returns:
            Dict with ``total``, ``failed`` and ``rate`` (rounded to 3 places).

        2026-04-26 P2.2 by Claude
        """
        async with get_db_context() as db:
            # Latest RECENT_LIMIT rows that carry a verification_result.
            result = await db.execute(
                select(IncidentEvidence.verification_result)
                .where(IncidentEvidence.verification_result.is_not(None))
                .order_by(IncidentEvidence.collected_at.desc())
                .limit(RECENT_LIMIT)
            )
            verdicts = result.scalars().all()

        total = len(verdicts)
        if total == 0:
            logger.info("governance_hallucination_checked", total=0, rate=0.0)
            return {"total": 0, "failed": 0, "rate": 0.0}

        failed = sum(1 for verdict in verdicts if verdict == "failed")
        rate = failed / total
        rounded_rate = round(rate, 3)

        if rate > HALLUCINATION_RATE_THRESHOLD:
            await self._alert(
                "llm_hallucination",
                {
                    "status": "warning",
                    "impact": {
                        "failed_count": failed,
                        "total_checked": total,
                        "hallucination_rate": rounded_rate,
                        "threshold": HALLUCINATION_RATE_THRESHOLD,
                    },
                    "remediation": {
                        "items": [
                            "檢核 AI 建議來源與 evidence snapshot 一致性",
                            "檢視最近 incident 的 verifier 輸入欄位是否缺失關鍵上下文",
                        ],
                        "next_action": "run_knowledge_gap_audit",
                        "hint": "高失敗率通常表示 evidence 收斂流程退化或資料欄位解讀偏差",
                    },
                    "actionable": {
                        "items": [
                            "啟動 `playbook_evidence` 對齊補償流程",
                            "調整 verify timeout 與降級策略,避免過度信任低品質證據",
                        ],
                    },
                    "failed_count": failed,
                    "total_checked": total,
                    "hallucination_rate": rounded_rate,
                    "threshold": HALLUCINATION_RATE_THRESHOLD,
                },
            )

        logger.info(
            "governance_hallucination_checked",
            total=total,
            failed=failed,
            rate=rounded_rate,
        )
        return {"total": total, "failed": failed, "rate": rounded_rate}

    # =========================================================================
    # 4. Execution failure rate (blast radius)
    # =========================================================================

    async def check_execution_blast_radius(self) -> dict[str, Any]:
        """Alert when too many recent auto-repair executions failed.

        Samples the latest ``RECENT_LIMIT`` ``AutoRepairExecution.success``
        values and alerts when the falsy share exceeds
        ``EXECUTION_FAIL_RATE_THRESHOLD``.

        Returns:
            Dict with ``total``, ``failed`` and ``rate`` (rounded to 3 places).

        2026-04-26 P2.2 by Claude
        """
        async with get_db_context() as db:
            result = await db.execute(
                select(AutoRepairExecution.success)
                .order_by(AutoRepairExecution.created_at.desc())
                .limit(RECENT_LIMIT)
            )
            outcomes = result.scalars().all()

        total = len(outcomes)
        if total == 0:
            logger.info("governance_blast_radius_checked", total=0, rate=0.0)
            return {"total": 0, "failed": 0, "rate": 0.0}

        # NOTE(review): any falsy value counts as a failure, so a NULL
        # `success` would be counted too — confirm that is intended.
        failed = sum(1 for succeeded in outcomes if not succeeded)
        rate = failed / total
        rounded_rate = round(rate, 3)

        if rate > EXECUTION_FAIL_RATE_THRESHOLD:
            await self._alert(
                "execution_blast_radius",
                {
                    "status": "warning",
                    "impact": {
                        "failed_count": failed,
                        "total_executions": total,
                        "failure_rate": rounded_rate,
                        "threshold": EXECUTION_FAIL_RATE_THRESHOLD,
                    },
                    "remediation": {
                        "items": [
                            "鎖定失敗 playbook 清單,關閉高風險自動執行",
                            "比對 incident evidence 與 post_execution_verification 失敗原因",
                        ],
                        "next_action": "pause_auto_repair_for_top_failing_playbooks",
                        "hint": "可能是 auto_repair_playbook 與 runtime 版本/環境脫節",
                    },
                    "actionable": {
                        "items": [
                            "跑 `run_self_check` 快照與失敗 playbook 熱點報表",
                            "必要時啟用 emergency fallback 路由進人工審核",
                        ],
                    },
                    "failed_count": failed,
                    "total_executions": total,
                    "failure_rate": rounded_rate,
                    "threshold": EXECUTION_FAIL_RATE_THRESHOLD,
                },
            )

        logger.info(
            "governance_blast_radius_checked",
            total=total,
            failed=failed,
            rate=rounded_rate,
        )
        return {"total": total, "failed": failed, "rate": rounded_rate}

    # =========================================================================
    # 5. SLO compliance (ADR-100)
    # =========================================================================

    @staticmethod
    def _slo_gap(
        value: float,
        threshold: float,
        target: float,
        direction: str,
        violated: bool,
    ) -> float:
        """Return the distance reported as ``gap`` for one SLO sample.

        When violated, the gap is how far the value is past the hard red line;
        otherwise it is the margin relative to the SLO target. ``direction``
        ``"below"`` means lower values are better; anything else is treated
        as "higher is better".
        """
        lower_is_better = direction == "below"
        if violated:
            return value - threshold if lower_is_better else threshold - value
        return target - value if lower_is_better else value - target

    @staticmethod
    def _summarize_slo_results(results: dict[str, Any]) -> dict[str, Any]:
        """Aggregate per-SLO results into the ``_meta`` summary.

        2026-04-29 ogt + Claude Opus 4.7 (critic M6): the summary separates
        "nothing had data" from "everything healthy" so a dashboard does not
        show no_data as a pass.

        ``status`` is, in priority order:
          * ``"no_data"``  — at least one SLO skipped and none ok/violated;
          * ``"violated"`` — at least one SLO violated;
          * ``"error"``    — no SLO could be evaluated and at least one query
                             errored (previously reported as ``"ok"``);
          * ``"ok"``       — otherwise.

        Must be called before ``_meta`` itself is stored in ``results``.
        """
        statuses = [
            entry.get("status") for entry in results.values() if isinstance(entry, dict)
        ]
        violated_count = statuses.count("violated")
        skipped_count = statuses.count("skipped")
        ok_count = statuses.count("ok")
        error_count = statuses.count("error")

        all_skipped = skipped_count > 0 and ok_count == 0 and violated_count == 0
        if all_skipped:
            status = "no_data"
        elif violated_count > 0:
            status = "violated"
        elif error_count > 0 and ok_count == 0:
            # Every query failed: there is no evidence the SLOs are healthy.
            status = "error"
        else:
            status = "ok"

        return {
            "violated_count": violated_count,
            "skipped_count": skipped_count,
            "ok_count": ok_count,
            "error_count": error_count,
            "all_status": sorted(set(statuses)),
            "all_skipped": all_skipped,
            "status": status,
        }

    async def check_slo_compliance(self) -> dict[str, Any]:
        """Check the five SLOs against their hard red lines.

        Each SLI is read with a Prometheus instant query and compared with
        its hard red line; a violation calls ``_alert()``.

        SLO 1 autonomy rate:          sli:autonomy_rate:5m            red line < 0.70
        SLO 2 decision accuracy:      sli:decision_accuracy:5m        red line < 0.85
        SLO 3 confidence calibration: sli:confidence_calibration:1h   red line < 0.70
        SLO 4 KM growth rate:         knowledge_entries_created_24h / sli:km_growth_rate:24h  red line < 5
        SLO 5 summary latency:        awooop_truth_chain_quality_summary_last_duration_seconds  red line > 8s

        Per-SLO outcomes are ``ok``, ``violated``, ``skipped`` (empty or
        non-finite sample) or ``error`` (query failed or raised).

        Returns:
            Dict keyed by SLO name plus a ``_meta`` aggregate (see
            ``_summarize_slo_results``).

        2026-04-27 P3.4 by Claude — AI SLO (ADR-100)
        """
        import math

        import httpx

        from src.core.config import settings

        prom_url = getattr(settings, "PROMETHEUS_URL", "http://prometheus.observability.svc:9090")

        queries: dict[str, str] = {
            "autonomy_rate": "sli:autonomy_rate:5m",
            "decision_accuracy": "sli:decision_accuracy:5m",
            "confidence_calibration": "sli:confidence_calibration:1h",
            "km_growth_rate": "max(knowledge_entries_created_24h) or max(sli:km_growth_rate:24h)",
            "truth_chain_quality_summary_latency": 'max(awooop_truth_chain_quality_summary_last_duration_seconds{project_id="awoooi",limit="8",success="true"})',
        }
        # Hard red lines: an "above" metric under this value, or a "below"
        # metric over it, must alert (this is not a soft warning).
        hard_red_lines: dict[str, float] = {
            "autonomy_rate": 0.70,
            "decision_accuracy": 0.85,
            "confidence_calibration": 0.70,
            "km_growth_rate": 5.0,
            "truth_chain_quality_summary_latency": 8.0,
        }
        # SLO targets (reported in results and logs).
        slo_targets: dict[str, float] = {
            "autonomy_rate": 0.80,
            "decision_accuracy": 0.90,
            "confidence_calibration": 0.80,
            "km_growth_rate": 20.0,
            "truth_chain_quality_summary_latency": 2.0,
        }
        slo_directions: dict[str, str] = {
            "autonomy_rate": "above",
            "decision_accuracy": "above",
            "confidence_calibration": "above",
            "km_growth_rate": "above",
            "truth_chain_quality_summary_latency": "below",
        }

        results: dict[str, Any] = {}

        async with httpx.AsyncClient(timeout=5.0) as client:
            for name, query in queries.items():
                # Each SLO is isolated: any failure is recorded as "error"
                # for that SLO and the loop moves on.
                try:
                    resp = await client.get(
                        f"{prom_url}/api/v1/query",
                        params={"query": query},
                    )
                    data = resp.json()

                    if data.get("status") != "success":
                        results[name] = {
                            "name": name,
                            "status": "error",
                            "error": "prometheus_query_failed",
                            "response_status": data.get("status"),
                        }
                        logger.warning(
                            "governance_slo_prometheus_error",
                            slo=name,
                            query=query,
                            response_status=data.get("status"),
                        )
                        continue

                    result_list = data.get("data", {}).get("result", [])
                    # 2026-04-28 ogt + Claude Opus 4.7: P0-1 false-alarm fix.
                    # An empty result means Prometheus has no data (metric not
                    # emitted / rule not deployed); it does not mean SLO = 0.
                    # Falling back to 0.0 would always trip violated=True.
                    if not result_list:
                        results[name] = {
                            "name": name,
                            "status": "skipped",
                            "error": "no_data",
                            "reason": "prometheus_empty_result_metric_not_emitted",
                            "hint": "ADR-100 emitter 未輸出、Prometheus recording rule 未載入,或 multiprocess 目錄未掛載",
                        }
                        logger.warning(
                            "governance_slo_no_data",
                            slo=name,
                            query=query,
                            hint="ADR-100 metrics, recording rules, or multiprocess mount not ready",
                        )
                        continue

                    raw_value = result_list[0]["value"][1]
                    value = float(raw_value)
                    if not math.isfinite(value):
                        # NaN / Inf cannot be compared with a red line.
                        results[name] = {
                            "name": name,
                            "status": "skipped",
                            "error": "non_finite_value",
                            "reason": "prometheus_nan_or_inf",
                            "hint": "SLO 分母目前沒有足夠事件,等待下一個有效樣本再評估",
                        }
                        logger.warning(
                            "governance_slo_non_finite",
                            slo=name,
                            query=query,
                            value=str(raw_value),
                        )
                        continue

                    threshold = hard_red_lines[name]
                    target = slo_targets[name]
                    direction = slo_directions.get(name, "above")
                    if direction == "below":
                        violated = value > threshold
                    else:
                        violated = value < threshold
                    gap = self._slo_gap(value, threshold, target, direction, violated)

                    results[name] = {
                        "name": name,
                        "status": "violated" if violated else "ok",
                        "value": round(value, 4),
                        "slo_target": target,
                        "hard_red_line": threshold,
                        "direction": direction,
                        "gap": round(gap, 4),
                        "violated": violated,
                    }

                    if violated:
                        await self._alert(
                            f"slo_{name}_violation",
                            {
                                "status": "violation",
                                "impact": {
                                    "name": name,
                                    "value": round(value, 4),
                                    "target": target,
                                    "threshold": threshold,
                                    "direction": direction,
                                    "gap": round(gap, 4),
                                },
                                "remediation": {
                                    "items": _slo_remediation_items(name),
                                    "next_action": _slo_next_action(name),
                                },
                                "actionable": {
                                    "items": _slo_actionable_items(name),
                                },
                            },
                        )
                        logger.warning(
                            "governance_slo_violated",
                            slo=name,
                            value=round(value, 4),
                            hard_red_line=threshold,
                        )
                    elif value == 0 and threshold <= 0:
                        # Cannot trigger with the positive red lines defined
                        # above; kept as a guard should a red line become <= 0.
                        logger.warning(
                            "governance_slo_unexpected_zero",
                            slo=name,
                            value=round(value, 4),
                        )
                    else:
                        logger.info(
                            "governance_slo_ok",
                            slo=name,
                            value=round(value, 4),
                            target=target,
                        )
                except Exception as e:
                    results[name] = {
                        "name": name,
                        "status": "error",
                        "error": str(e),
                    }
                    logger.warning("governance_slo_check_error", slo=name, error=str(e))

        meta = self._summarize_slo_results(results)
        results["_meta"] = meta
        logger.info(
            "governance_slo_compliance_complete",
            results=results,
            violated=meta["violated_count"],
            skipped=meta["skipped_count"],
            ok=meta["ok_count"],
            status=meta["status"],
        )
        return results

    # =========================================================================
    # Run everything (exceptions isolated)
    # =========================================================================

    async def run_self_check(self) -> dict[str, Any]:
        """Run all checks, each isolated in its own try/except.

        A failing check is logged and recorded as ``{"error": "<message>"}``
        without stopping the others. Two aggregate alerts may follow:
        ``governance_self_failure`` when three or more checks failed, and
        ``governance_slo_data_gap`` when the SLO summary reports
        ``all_skipped``.

        Returns:
            Dict mapping each check name to its result (or error dict).

        2026-04-26 P2.2 by Claude
        2026-04-27 P3.4 by Claude — added the fifth check, slo_compliance (ADR-100)
        """
        results: dict[str, Any] = {}
        checks = [
            ("trust_drift", self.check_trust_drift),
            ("knowledge_degradation", self.check_knowledge_degradation),
            ("llm_hallucination", self.check_llm_hallucination),
            ("execution_blast_radius", self.check_execution_blast_radius),
            ("slo_compliance", self.check_slo_compliance),
        ]

        for check_name, check_func in checks:
            try:
                results[check_name] = await check_func()
            except Exception as e:
                logger.exception(
                    "governance_check_failed",
                    check=check_name,
                    error=str(e),
                )
                results[check_name] = {"error": str(e)}

        # 2026-04-27 Wave8-X3 by Claude — B8 aggregate alert.
        # Three or more failed checks means the governance mechanism itself is
        # broken, so an urgent alert is sent.
        failed_checks = [
            check_name
            for check_name, outcome in results.items()
            if isinstance(outcome, dict) and "error" in outcome
        ]
        if len(failed_checks) >= 3:
            try:
                await self._alert(
                    "governance_self_failure",
                    {
                        "status": "critical",
                        "impact": {
                            "failed_checks": failed_checks,
                            # Derived from the check list so it cannot go
                            # stale when a check is added or removed.
                            "total_checks": len(checks),
                            "errors": {k: results[k].get("error") for k in failed_checks},
                        },
                        "remediation": {
                            "items": [
                                "暫停非關鍵治理自動化接收鏈路",
                                "聚焦治理執行路徑錯誤並補齊 fallback",
                            ],
                            "next_action": "investigate_governance_pipeline_health",
                        },
                        "actionable": {
                            "items": [
                                "檢查 GovernanceAgent run loop 是否完整執行 5 個項目",
                                "確認 DB 寫入與 Prometheus fetch 未被上游干擾",
                            ],
                        },
                    },
                )
            except Exception:
                logger.exception("governance_self_failure_alert_failed")

        # 2026-04-29 ogt + Claude Opus 4.7 (critic M6): "all SLOs skipped"
        # means the data was never produced, not that governance is broken.
        # It gets its own alert so it does not pollute the self_failure count.
        slo_result = results.get("slo_compliance")
        slo_meta = slo_result.get("_meta") if isinstance(slo_result, dict) else None
        if slo_meta and slo_meta.get("all_skipped"):
            try:
                await self._alert(
                    "governance_slo_data_gap",
                    {
                        "status": "warning",
                        "impact": {
                            "reason": "all_slo_metrics_not_emitted",
                            "skipped_count": slo_meta.get("skipped_count", 0),
                            "all_slo_metrics_not_emitted": True,
                        },
                        "remediation": {
                            "items": [
                                "補齊 ADR-100 SLO emitter(automation_operation_log_total / post_execution_verification_total / knowledge_entries_total)",
                                "確認 Prometheus recording rules 已載入,且 API Pod multiprocess 目錄可寫",
                            ],
                            "next_action": "run_adr100_slo_emit_playbook",
                            "hint": "ADR-100 emitter、Prometheus recording rules、或 multiprocess 目錄任一環節未就緒",
                        },
                        "actionable": {
                            "items": [
                                "先確認 /metrics 是否已輸出 ADR-100 底層指標",
                                "檢查 Prometheus rule 與 truth-chain quality summary runtime metric 是否可查詢",
                            ],
                        },
                    },
                )
            except Exception:
                logger.exception("governance_slo_data_gap_alert_failed")

        logger.info("governance_self_check_complete", results=results)
        return results

    # =========================================================================
    # Alert output
    # =========================================================================

    async def _alert(self, event_type: str, payload: dict[str, Any]) -> None:
        """Persist, log and push one governance alert.

        Steps, in order:
          1. insert an ``AiGovernanceEvent`` row (a failure is logged and does
             not stop the remaining steps);
          2. if the row was written, create the intake dispatch work item;
          3. emit a structlog warning carrying the payload;
          4. push through the alerter's ``alert_governance`` (failures are
             logged, never raised).

        Args:
            event_type: Governance event identifier, e.g. ``"trust_drift"``.
            payload: Alert details; stored as the event ``details`` and
                expanded into the log record as keyword fields.

        2026-04-26 P2.2 by Claude
        2026-04-26 P2-DB-Fix by Claude — db-expert P0 fix (P0.1): added the PG
        write to ai_governance_events. ADR-085 rule: AI learning results must
        not live only in cache; they must land in PG.
        """
        # 1. PG write (ADR-085) — a failure must not block the main flow.
        event_id = generate_uuid()
        pg_written = False
        try:
            from sqlalchemy import insert as _sa_insert

            async with get_db_context() as db:
                await db.execute(
                    _sa_insert(AiGovernanceEvent).values(
                        id=event_id,
                        event_type=event_type,
                        details=payload,
                    )
                )
                await db.commit()
                pg_written = True
        except Exception as _pg_err:
            logger.warning("governance_pg_write_failed", error=str(_pg_err))

        if pg_written:
            await _maybe_create_intake_dispatch(event_id, event_type, payload)

        # 2. structlog (existing behaviour preserved).
        logger.warning("governance_alert", event_type=event_type, **payload)

        # 3. Resolve the alerter lazily, only when actually alerting, to avoid
        #    a circular import at startup.
        alerter = self._alerter
        if alerter is None:
            try:
                from src.services.failover_alerter import get_failover_alerter

                alerter = get_failover_alerter()
            except Exception as e:
                logger.warning("governance_alerter_get_failed", error=str(e))
                return

        try:
            await alerter.alert_governance(event_type, payload)
        except Exception as e:
            logger.warning("governance_telegram_alert_failed", error=str(e))
|
||
|
||
|
||
async def _has_open_knowledge_degradation_review() -> bool:
    """Report whether a Hermes owner-review dispatch is already open.

    Several API pods start the governance loop at once; if the same stale
    ratio has already entered a Hermes review draft it is treated as the
    same unresolved governance work, so Telegram / Work Items do not get a
    new event and REVIEW draft every cycle.

    Returns:
        True when an unresolved ``knowledge_degradation`` event has a
        ``hermes_kb_growth_healthcheck`` dispatch in status pending,
        dispatched, executing or succeeded. False otherwise, and also when
        the lookup fails (fail-open: the alert is then not suppressed).
    """
    try:
        async with get_db_context() as db:
            stmt = (
                select(GovernanceRemediationDispatch.id)
                .join(
                    AiGovernanceEvent,
                    GovernanceRemediationDispatch.governance_event_id == AiGovernanceEvent.id,
                )
                .where(
                    AiGovernanceEvent.event_type == "knowledge_degradation",
                    AiGovernanceEvent.resolved.is_(False),
                    GovernanceRemediationDispatch.event_type == "knowledge_degradation",
                    GovernanceRemediationDispatch.executor_type == "hermes_kb_growth_healthcheck",
                    GovernanceRemediationDispatch.dispatch_status.in_(
                        ["pending", "dispatched", "executing", "succeeded"]
                    ),
                )
                .order_by(GovernanceRemediationDispatch.dispatched_at.desc())
                .limit(1)
            )
            found = (await db.execute(stmt)).scalar_one_or_none()
            return found is not None
    except Exception as exc:
        logger.warning(
            "governance_knowledge_degradation_review_lookup_failed_fail_open",
            error=str(exc),
        )
        return False
|
||
|
||
|
||
async def _resolve_open_knowledge_degradation_events() -> int:
    """Mark unresolved knowledge_degradation events resolved once the ratio recovers.

    Returns:
        The number of rows updated as reported by the driver, or 0 when the
        update fails (the failure is logged, not raised).
    """
    try:
        async with get_db_context() as db:
            stmt = (
                update(AiGovernanceEvent)
                .where(
                    AiGovernanceEvent.event_type == "knowledge_degradation",
                    AiGovernanceEvent.resolved.is_(False),
                )
                .values(resolved=True, resolved_at=now_taipei())
                .execution_options(synchronize_session=False)
            )
            outcome = await db.execute(stmt)
            resolved_count = int(outcome.rowcount or 0)
            if not resolved_count:
                # Nothing changed, so there is nothing to commit or log.
                return resolved_count
            await db.commit()
            logger.info(
                "governance_knowledge_degradation_resolved",
                resolved_count=resolved_count,
                reason="stale_ratio_recovered",
            )
            return resolved_count
    except Exception as exc:
        logger.warning(
            "governance_knowledge_degradation_resolve_failed",
            error=str(exc),
        )
        return 0
|
||
|
||
|
||
async def _maybe_create_intake_dispatch(
    event_id: str,
    event_type: str,
    payload: dict[str, Any],
) -> None:
    """Turn an actionable governance alert into a non-executing dispatch work item.

    This layer only creates a trackable dispatch; it does not run a repair,
    write KM, or send extra notifications. Only ``knowledge_degradation``
    events are handled; every other event type is a no-op. An already-active
    dispatch and any other failure are logged, never raised.
    """
    if event_type == "knowledge_degradation":
        try:
            context = _build_knowledge_degradation_dispatch_context(event_id, payload)
            await create_dispatch(
                event_id=event_id,
                event_type=event_type,
                executor_type="hermes_kb_growth_healthcheck",
                decision_context=context,
                max_attempts=1,
                created_by="governance_agent_intake",
            )
        except DispatchAlreadyActive:
            logger.info(
                "governance_intake_dispatch_already_active",
                event_id=event_id,
                event_type=event_type,
            )
        except Exception as exc:
            logger.warning(
                "governance_intake_dispatch_failed",
                event_id=event_id,
                event_type=event_type,
                error=str(exc),
            )
|
||
|
||
|
||
def _build_knowledge_degradation_dispatch_context(
|
||
event_id: str,
|
||
payload: dict[str, Any],
|
||
) -> dict[str, Any]:
|
||
impact = payload.get("impact") if isinstance(payload.get("impact"), dict) else {}
|
||
remediation = payload.get("remediation") if isinstance(payload.get("remediation"), dict) else {}
|
||
ownership = payload.get("ownership") if isinstance(payload.get("ownership"), dict) else {}
|
||
next_action = remediation.get("next_action")
|
||
if not isinstance(next_action, str) or not next_action:
|
||
next_action = "run_kb_growth_healthcheck"
|
||
|
||
return {
|
||
"version": "v1",
|
||
"trigger_source": "governance_agent",
|
||
"triggered_metric": "knowledge_degradation",
|
||
"metric_value": impact.get("stale_ratio"),
|
||
"threshold": impact.get("threshold"),
|
||
"suggested_action": next_action,
|
||
"next_action": next_action,
|
||
"decision_path": "pending_owner_review",
|
||
"ownership": ownership,
|
||
"affected_resources": ["knowledge_entries"],
|
||
"workflow": {
|
||
"work_item_id": f"governance:knowledge_degradation:{event_id}",
|
||
"work_kind": "kb_growth_healthcheck",
|
||
"current_stage": "queued_kb_healthcheck",
|
||
"steps": [
|
||
"detected",
|
||
"ai_analyzed",
|
||
"queued_kb_healthcheck",
|
||
"draft_km_updates",
|
||
"waiting_owner_review",
|
||
"km_writeback_after_approval",
|
||
"stale_ratio_recheck",
|
||
],
|
||
"stage_by_dispatch_status": {
|
||
"pending": "queued_kb_healthcheck",
|
||
"dispatched": "queued_kb_healthcheck",
|
||
"executing": "draft_km_updates",
|
||
"succeeded": "stale_ratio_recheck",
|
||
"failed": "needs_manual_km_triage",
|
||
"skipped": "waiting_owner_review",
|
||
"cancelled": "cancelled",
|
||
},
|
||
"next_action": next_action,
|
||
"needs_human_review": True,
|
||
"writes_km_without_approval": False,
|
||
"impact": impact,
|
||
},
|
||
"extra": {
|
||
"event_id": event_id,
|
||
"stale_count": impact.get("stale_count"),
|
||
"total_count": impact.get("total_count"),
|
||
"stale_days": impact.get("stale_days"),
|
||
"ownership": ownership,
|
||
},
|
||
}
|
||
|
||
|
||
# =============================================================================
|
||
# Singleton + 排程迴圈
|
||
# =============================================================================
|
||
|
||
# Process-wide GovernanceAgent instance, created on first request.
_agent: GovernanceAgent | None = None


def get_governance_agent() -> GovernanceAgent:
    """Return the shared GovernanceAgent, creating it on first use.

    2026-04-26 P2.2 by Claude
    """
    global _agent
    if _agent is not None:
        return _agent
    _agent = GovernanceAgent()
    return _agent
|
||
|
||
|
||
def reset_governance_agent() -> None:
    """Drop the shared GovernanceAgent so the next lookup builds a new one (for tests).

    2026-04-26 P2.2 by Claude
    """
    global _agent
    _agent = None
|
||
|
||
|
||
async def run_governance_loop(interval_seconds: int = 3600) -> None:
    """Run ``GovernanceAgent.run_self_check()`` forever, once per interval.

    Follows the asyncio.create_task + sleep loop pattern the original notes
    attribute to main.py (no APScheduler). The loop sleeps
    ``interval_seconds`` after every cycle, so runs never pile up. A cycle
    runs only when the cross-replica lease is acquired; otherwise it is
    skipped. Any exception raised in a cycle is logged and the loop goes on.

    Args:
        interval_seconds: Sleep between cycles; also passed to the lease
            helper.

    2026-04-26 P2.2 by Claude
    """
    agent = get_governance_agent()
    while True:
        try:
            lease_acquired = await _try_acquire_governance_self_check_lease(interval_seconds)
            if not lease_acquired:
                logger.debug(
                    "governance_self_check_cycle_skipped",
                    reason="cycle_lease_held",
                )
            else:
                await agent.run_self_check()
        except Exception as e:
            logger.warning("governance_loop_error", error=str(e))
        await asyncio.sleep(interval_seconds)
|
||
|
||
|
||
async def _try_acquire_governance_self_check_lease(interval_seconds: int) -> bool:
    """Try to take the cross-pod lease for one self-check cycle.

    This is a per-cycle cooldown, not a critical-section lock: once taken it
    is never released explicitly. Until the TTL expires other replicas skip
    their cycle, so one governance state is not written as several events
    and several Hermes KM drafts by multiple pods.

    Args:
        interval_seconds: Cycle length; the lease TTL is this value with a
            floor of 60 seconds.

    Returns:
        True when the lease was acquired. Also True when Redis cannot be
        reached (fail-open: the check runs rather than being skipped).
    """
    ttl = max(60, int(interval_seconds))
    try:
        from src.core.redis_client import get_redis

        client = get_redis()
        # SET key value NX EX ttl: only succeeds when the key does not exist.
        acquired = await client.set(
            GOVERNANCE_SELF_CHECK_LEASE_KEY,
            "1",
            ex=ttl,
            nx=True,
        )
    except Exception as exc:
        logger.warning(
            "governance_self_check_lease_unavailable_fail_open",
            error=str(exc),
        )
        return True
    return bool(acquired)
|