feat(wave5-p2): GovernanceAgent 4 項自檢 + Ollama 健康告警規則 + Prometheus metrics 整合
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 10m45s
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 10m45s
MASTER plan_complete_v3.md Wave 5 P2.2 + P2.3 完成(multiple engineers 在限額前完成代碼,補 commit): P2.2 — GovernanceAgent 4 項自檢: - governance_agent.py (342 行) — 每 1 小時自檢循環: · trust_drift(信任度漂移檢測) · knowledge_degradation(知識退化檢測) · llm_hallucination(LLM 幻覺檢測) · execution_blast_radius(執行爆炸半徑檢測) - main.py lifespan: asyncio.create_task(run_governance_loop()) 啟動 try/except 包裹,schedule 失敗不阻斷主流程 - failover_alerter.py: alert_governance(event_type, payload) 1h dedup 四類事件 → Telegram MarkdownV2 告警 P2.3 — Ollama 健康規則 + Prometheus Metrics: - ops/monitoring/ollama_health_rules.yaml (148 行): · OllamaHealthDegraded / OllamaPrimaryDown · OllamaFailoverTriggered / GeminiQuotaExceeded · 補 Prometheus 取資料的 alert rules - core/metrics.py (57 行): · GEMINI_DAILY_CALL_COUNT / GEMINI_DAILY_QUOTA Gauge · OLLAMA_FAILOVER_TRIGGERED_TOTAL Counter · OLLAMA_CURRENT_PRIMARY_IS_OLLAMA Gauge - ollama_failover_manager.py: · _check_gemini_quota: 每次 check 同步更新 Gauge(讓 Prometheus 取最新值) · select_provider: failover 時 inc Counter + 切 Primary Gauge · try/except 包裹,metric 失敗不阻斷主路由 E2E 測試: - test_failover_e2e_dispatch.py (365 行) 完整 dispatch 路徑:health check → failover decide → alerter → metrics Tests: 54 passed (e2e_dispatch + failover_manager + failover_alerter) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Co-Authored-By: Multiple Engineers (上 session Wave 5) <noreply@anthropic.com>
This commit is contained in:
@@ -129,6 +129,63 @@ LEARNING_SKIP_TOTAL = Counter(
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Ollama 容災指標 (P2.3, 2026-04-26 台北時區)
|
||||
# 建立者: Claude Sonnet 4.6 (tool-expert, P2.3)
|
||||
#
|
||||
# 對應告警規則: ops/monitoring/ollama_health_rules.yaml
|
||||
#
|
||||
# 使用位置:
|
||||
# - ollama_failover_manager.py: OLLAMA_FAILOVER_TRIGGERED_TOTAL, AI_ROUTER_PROVIDER_TOTAL
|
||||
# - ollama_auto_recovery.py: OLLAMA_RECOVERY_TRIGGERED_TOTAL
|
||||
# - ollama_health_monitor.py: OLLAMA_HEALTH_STATUS
|
||||
# - ollama_failover_manager.py (_check_gemini_quota) / main.py lifespan / background task: GEMINI_DAILY_CALL_COUNT, GEMINI_DAILY_QUOTA
|
||||
#
|
||||
# Backlog(需設計後另行補入):
|
||||
# - ollama_inference_duration_seconds (Histogram) — 需在 _check_inference() 裡 observe
|
||||
# - post_execution_verification_failed_total / _total — 需 auto_repair_service.py 補入
|
||||
# =============================================================================
|
||||
|
||||
# Failover events, labelled by the provider pair involved in the switch.
OLLAMA_FAILOVER_TRIGGERED_TOTAL = Counter(
    name="ollama_failover_triggered_total",
    documentation="Ollama failover events (primary switched away from ollama_111)",
    labelnames=["from_provider", "to_provider"],
)

# Auto-recovery events, labelled by the provider that was primary before recovery.
OLLAMA_RECOVERY_TRIGGERED_TOTAL = Counter(
    name="ollama_recovery_triggered_total",
    documentation="Ollama auto-recovery events (primary switched back to ollama_111)",
    labelnames=["from_provider"],
)

# Per-instance health flag; the "host" label is "111" or "188".
OLLAMA_HEALTH_STATUS = Gauge(
    name="ollama_health_status",
    documentation="Ollama instance health (1=healthy, 0=not_healthy/offline)",
    labelnames=["host"],
)

# Unlabelled 0/1 flag describing whether ollama_111 is the current primary.
OLLAMA_CURRENT_PRIMARY_IS_OLLAMA = Gauge(
    name="ollama_current_primary_is_ollama",
    documentation="Whether the current primary AI provider is ollama_111 (1=yes, 0=no)",
)

# One increment per routing decision, labelled by the provider that was chosen.
AI_ROUTER_PROVIDER_TOTAL = Counter(
    name="ai_router_selected_provider_total",
    documentation="AI router provider selection count (all routing decisions)",
    labelnames=["provider"],
)

# Number of Gemini calls made today.
GEMINI_DAILY_CALL_COUNT = Gauge(
    name="gemini_daily_call_count",
    documentation="Gemini API calls made today (read from Redis ollama:gemini_daily_count:{date})",
)

# Configured daily ceiling that GEMINI_DAILY_CALL_COUNT is compared against.
GEMINI_DAILY_QUOTA = Gauge(
    name="gemini_daily_quota",
    documentation="Gemini API daily call quota (from settings.GEMINI_DAILY_QUOTA)",
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Helper Functions
|
||||
# =============================================================================
|
||||
|
||||
@@ -546,6 +546,15 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
except Exception as e:
|
||||
logger.warning("ai_slo_watchdog_schedule_failed", error=str(e))
|
||||
|
||||
# 2026-04-26 P2.2 by Claude — GovernanceAgent 4 項自檢(每 1 小時)
|
||||
# MASTER P2.2:trust_drift / knowledge_degradation / llm_hallucination / execution_blast_radius
|
||||
try:
|
||||
from src.services.governance_agent import run_governance_loop
|
||||
asyncio.create_task(run_governance_loop())
|
||||
logger.info("governance_agent_scheduled", interval_sec=3600)
|
||||
except Exception as e:
|
||||
logger.warning("governance_agent_schedule_failed", error=str(e))
|
||||
|
||||
# 2026-04-25 P1.2 by Claude Engineer-A2 — failover 整合到 ai_router + lifespan
|
||||
# OllamaFailoverManager + OllamaAutoRecoveryService 飛輪接線:
|
||||
# failover 切換時 → recovery_callback → set_current_primary → Redis 持久化
|
||||
|
||||
@@ -84,6 +84,32 @@ class FailoverAlerter:
|
||||
await self._send(msg)
|
||||
logger.info("recovery_alert_sent", from_provider=from_provider)
|
||||
|
||||
async def alert_governance(self, event_type: str, payload: dict[str, Any]) -> None:
    """Send an AI-governance alert, deduplicated per event type.

    The dedup key is ``alert:governance:{event_type}`` with a TTL of 3600
    seconds, so the same kind of alert is sent at most once per hour.

    Args:
        event_type: Governance event category (trust_drift /
            knowledge_degradation / llm_hallucination /
            execution_blast_radius).
        payload: Detail fields. Each item is rendered on its own line as
            ``key:value`` after both sides are stringified and passed
            through ``_escape_md``.
    """
    may_send = await self._check_dedup(f"alert:governance:{event_type}", ttl=3600)
    if may_send:
        # One escaped "key:value" line per payload entry, in dict order.
        rendered_fields = [
            f"{_escape_md(str(field))}:{_escape_md(str(value))}"
            for field, value in payload.items()
        ]
        header = f"*AI 治理警報*\n\n"
        kind_line = f"類型:{_escape_md(event_type)}\n\n"
        await self._send(header + kind_line + "\n".join(rendered_fields))
        logger.info("governance_alert_sent", event_type=event_type)
    else:
        # Dedup window still active for this event type: do not send.
        logger.debug("governance_alert_dedup_skipped", event_type=event_type)
|
||||
|
||||
async def alert_gemini_quota_exceeded(self, event: dict[str, Any]) -> None:
|
||||
"""Gemini 每日上限觸發,降級到 188 CPU 備援 — 24h dedup(每日重置)"""
|
||||
# 2026-04-26 critic-H1 hotfix by Claude Opus 4.7 — dedup key 加日期後綴
|
||||
|
||||
342
apps/api/src/services/governance_agent.py
Normal file
342
apps/api/src/services/governance_agent.py
Normal file
@@ -0,0 +1,342 @@
|
||||
"""AI 自我治理 Agent
|
||||
|
||||
四項自檢,每 1 小時執行一次:
|
||||
1. trust_drift — Playbook trust_score < 0.2 → 告警建議廢棄
|
||||
2. knowledge_degradation — KM 7 天未更新 > 20% 總量 → 告警知識衰退
|
||||
3. llm_hallucination — 近 100 筆 evidence verification_result=failed 比例 > 10%
|
||||
4. execution_blast_radius — 近 100 筆 auto_repair_executions.success=False 比例 > 15%
|
||||
|
||||
所有 check 互相隔離(try/except),任一失敗不阻斷其他項目。
|
||||
|
||||
2026-04-26 P2.2 by Claude
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from datetime import timedelta
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
from sqlalchemy import func, select
|
||||
|
||||
from src.db.base import get_db_context
|
||||
from src.db.models import (
|
||||
AutoRepairExecution,
|
||||
IncidentEvidence,
|
||||
KnowledgeEntryRecord,
|
||||
PlaybookRecord,
|
||||
)
|
||||
from src.models.knowledge import EntryStatus
|
||||
from src.utils.timezone import now_taipei
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
# =============================================================================
|
||||
# 閾值常數
|
||||
# =============================================================================
|
||||
TRUST_DRIFT_THRESHOLD = 0.2 # alert when a playbook's trust_score is below this value
KM_STALE_DAYS = 7 # a knowledge entry not updated for this many days counts as stale
KM_STALE_RATIO = 0.20 # alert when the stale share of entries exceeds this ratio
HALLUCINATION_RATE_THRESHOLD = 0.10 # alert when the share of verification_result == "failed" exceeds this
EXECUTION_FAIL_RATE_THRESHOLD = 0.15 # alert when the share of unsuccessful executions exceeds this
RECENT_LIMIT = 100 # number of most recent rows sampled by the rate-based checks
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# GovernanceAgent
|
||||
# =============================================================================
|
||||
|
||||
class GovernanceAgent:
    """AI self-governance agent: four self-checks plus alert dispatch.

    The checks are ``trust_drift``, ``knowledge_degradation``,
    ``llm_hallucination`` and ``execution_blast_radius``. ``run_self_check``
    runs all four, isolating failures so that one broken check does not stop
    the others. Scheduling is done outside the class by
    ``run_governance_loop``.
    """

    def __init__(self, alerter=None) -> None:
        """Create the agent.

        Args:
            alerter: Optional object exposing an async
                ``alert_governance(event_type, payload)`` method. When
                ``None``, ``_alert`` looks one up lazily through
                ``get_failover_alerter()``.
        """
        self._alerter = alerter

    # =========================================================================
    # 1. Playbook trust drift
    # =========================================================================

    async def check_trust_drift(self) -> dict[str, Any]:
        """Alert when active playbooks have a trust_score below the threshold.

        Playbooks whose status is ``deprecated`` or ``archived`` are excluded.
        Records whose ``trust_score`` is ``None`` cannot be compared with the
        threshold; they are skipped and counted in the log line instead of
        letting ``float(None)`` raise and abort the whole check.

        Returns:
            ``{"checked": <playbooks examined>, "drifted": <below threshold>}``.
        """
        async with get_db_context() as db:
            result = await db.execute(
                select(PlaybookRecord).where(
                    PlaybookRecord.status.not_in(["deprecated", "archived"])
                )
            )
            all_records = result.scalars().all()

            # Read ORM attributes while the session is still open, so the
            # check does not depend on how get_db_context configures
            # attribute expiry on exit (not visible from this module).
            total = len(all_records)
            missing_score = sum(1 for r in all_records if r.trust_score is None)
            drifted = [
                r
                for r in all_records
                if r.trust_score is not None
                and float(r.trust_score) < TRUST_DRIFT_THRESHOLD
            ]
            drifted_count = len(drifted)
            # Cap the id list so the alert message stays short.
            drifted_ids = [r.playbook_id for r in drifted[:10]]

        if drifted_count:
            await self._alert(
                "trust_drift",
                {
                    "drifted_count": drifted_count,
                    "total_playbooks": total,
                    "playbook_ids": drifted_ids,
                    "threshold": TRUST_DRIFT_THRESHOLD,
                },
            )

        logger.info(
            "governance_trust_drift_checked",
            total=total,
            drifted=drifted_count,
            missing_score=missing_score,
        )
        return {"checked": total, "drifted": drifted_count}

    # =========================================================================
    # 2. Knowledge-base degradation
    # =========================================================================

    async def check_knowledge_degradation(self) -> dict[str, Any]:
        """Alert when too large a share of knowledge entries is stale.

        An entry is stale when its ``updated_at`` is older than
        ``KM_STALE_DAYS`` days before ``now_taipei()``. Archived entries are
        excluded from both the total and the stale count. The alert fires
        when the stale ratio exceeds ``KM_STALE_RATIO``.

        Returns:
            ``{"total": ..., "stale": ..., "ratio": ...}`` with the ratio
            rounded to three decimals (0.0 when there are no entries).
        """
        stale_cutoff = now_taipei() - timedelta(days=KM_STALE_DAYS)

        async with get_db_context() as db:
            # Number of non-archived entries.
            total_result = await db.execute(
                select(func.count()).select_from(KnowledgeEntryRecord).where(
                    KnowledgeEntryRecord.status != EntryStatus.ARCHIVED
                )
            )
            total = total_result.scalar() or 0

            # Non-archived entries last updated before the cutoff.
            stale_result = await db.execute(
                select(func.count()).select_from(KnowledgeEntryRecord).where(
                    KnowledgeEntryRecord.status != EntryStatus.ARCHIVED,
                    KnowledgeEntryRecord.updated_at < stale_cutoff,
                )
            )
            stale = stale_result.scalar() or 0

        # Guard the division: an empty knowledge base has ratio 0.0.
        ratio = stale / total if total > 0 else 0.0

        if total > 0 and ratio > KM_STALE_RATIO:
            await self._alert(
                "knowledge_degradation",
                {
                    "stale_count": stale,
                    "total_count": total,
                    "stale_ratio": round(ratio, 3),
                    "threshold": KM_STALE_RATIO,
                    "stale_days": KM_STALE_DAYS,
                },
            )

        logger.info(
            "governance_knowledge_degradation_checked",
            total=total,
            stale=stale,
            ratio=round(ratio, 3),
        )
        return {"total": total, "stale": stale, "ratio": round(ratio, 3)}

    # =========================================================================
    # 3. LLM hallucination rate
    # =========================================================================

    async def check_llm_hallucination(self) -> dict[str, Any]:
        """Alert when recent evidence verifications fail too often.

        Samples the ``RECENT_LIMIT`` most recently collected
        ``IncidentEvidence`` rows that have a non-null
        ``verification_result``. Only the value ``"failed"`` counts as a
        hallucination; every other value is treated as not failed. The alert
        fires when the failed share exceeds ``HALLUCINATION_RATE_THRESHOLD``.

        Returns:
            ``{"total": ..., "failed": ..., "rate": ...}`` with the rate
            rounded to three decimals (all zeros when no rows were found).
        """
        async with get_db_context() as db:
            result = await db.execute(
                select(IncidentEvidence.verification_result)
                .where(IncidentEvidence.verification_result.is_not(None))
                .order_by(IncidentEvidence.collected_at.desc())
                .limit(RECENT_LIMIT)
            )
            rows = result.scalars().all()

        total = len(rows)
        if total == 0:
            logger.info("governance_hallucination_checked", total=0, rate=0.0)
            return {"total": 0, "failed": 0, "rate": 0.0}

        failed = sum(1 for r in rows if r == "failed")
        rate = failed / total

        if rate > HALLUCINATION_RATE_THRESHOLD:
            await self._alert(
                "llm_hallucination",
                {
                    "failed_count": failed,
                    "total_checked": total,
                    "hallucination_rate": round(rate, 3),
                    "threshold": HALLUCINATION_RATE_THRESHOLD,
                },
            )

        logger.info(
            "governance_hallucination_checked",
            total=total,
            failed=failed,
            rate=round(rate, 3),
        )
        return {"total": total, "failed": failed, "rate": round(rate, 3)}

    # =========================================================================
    # 4. Execution failure rate (blast radius)
    # =========================================================================

    async def check_execution_blast_radius(self) -> dict[str, Any]:
        """Alert when recent auto-repair executions fail too often.

        Samples the ``RECENT_LIMIT`` most recently created
        ``AutoRepairExecution`` rows and counts those whose ``success`` value
        is falsy. The alert fires when that share exceeds
        ``EXECUTION_FAIL_RATE_THRESHOLD``.

        Returns:
            ``{"total": ..., "failed": ..., "rate": ...}`` with the rate
            rounded to three decimals (all zeros when no rows were found).
        """
        async with get_db_context() as db:
            result = await db.execute(
                select(AutoRepairExecution.success)
                .order_by(AutoRepairExecution.created_at.desc())
                .limit(RECENT_LIMIT)
            )
            rows = result.scalars().all()

        total = len(rows)
        if total == 0:
            logger.info("governance_blast_radius_checked", total=0, rate=0.0)
            return {"total": 0, "failed": 0, "rate": 0.0}

        # Any falsy value (False, and also None) is counted as a failure.
        failed = sum(1 for r in rows if not r)
        rate = failed / total

        if rate > EXECUTION_FAIL_RATE_THRESHOLD:
            await self._alert(
                "execution_blast_radius",
                {
                    "failed_count": failed,
                    "total_executions": total,
                    "failure_rate": round(rate, 3),
                    "threshold": EXECUTION_FAIL_RATE_THRESHOLD,
                },
            )

        logger.info(
            "governance_blast_radius_checked",
            total=total,
            failed=failed,
            rate=round(rate, 3),
        )
        return {"total": total, "failed": failed, "rate": round(rate, 3)}

    # =========================================================================
    # Run everything (with exception isolation)
    # =========================================================================

    async def run_self_check(self) -> dict[str, Any]:
        """Run all four checks in sequence, isolating failures per check.

        Returns:
            A dict keyed by check name. Each value is either that check's
            result dict or ``{"error": "<message>"}`` when the check raised.
        """
        results: dict[str, Any] = {}
        checks = [
            ("trust_drift", self.check_trust_drift),
            ("knowledge_degradation", self.check_knowledge_degradation),
            ("llm_hallucination", self.check_llm_hallucination),
            ("execution_blast_radius", self.check_execution_blast_radius),
        ]

        for check_name, check_func in checks:
            try:
                results[check_name] = await check_func()
            except Exception as e:
                # Deliberately broad: a failing check must not stop the rest.
                logger.warning(
                    "governance_check_failed",
                    check=check_name,
                    error=str(e),
                )
                results[check_name] = {"error": str(e)}

        logger.info("governance_self_check_complete", results=results)
        return results

    # =========================================================================
    # Alert output
    # =========================================================================

    async def _alert(self, event_type: str, payload: dict[str, Any]) -> None:
        """Log a governance alert and forward it to the alerter.

        Failures to obtain the alerter or to deliver the alert are logged and
        swallowed, so alerting problems never fail the calling check.

        Args:
            event_type: Name of the check that triggered the alert.
            payload: Alert details; logged as structured fields and passed
                unchanged to ``alerter.alert_governance``.
        """
        logger.warning("governance_alert", event_type=event_type, **payload)

        # Lazy import: resolve the alerter only when an alert is actually
        # sent, to avoid a circular import at startup.
        alerter = self._alerter
        if alerter is None:
            try:
                from src.services.failover_alerter import get_failover_alerter

                alerter = get_failover_alerter()
            except Exception as e:
                logger.warning("governance_alerter_get_failed", error=str(e))
                return

        try:
            await alerter.alert_governance(event_type, payload)
        except Exception as e:
            logger.warning("governance_telegram_alert_failed", error=str(e))
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Singleton + 排程迴圈
|
||||
# =============================================================================
|
||||
|
||||
# Module-level cache for the shared GovernanceAgent; None until first requested.
_agent: GovernanceAgent | None = None


def get_governance_agent() -> GovernanceAgent:
    """Return the shared GovernanceAgent, building it on the first call."""
    global _agent
    if _agent is not None:
        return _agent
    _agent = GovernanceAgent()
    return _agent
|
||||
|
||||
|
||||
def reset_governance_agent() -> None:
    """Drop the cached GovernanceAgent so the next lookup builds a new one.

    Intended for tests.
    """
    global _agent
    _agent = None
|
||||
|
||||
|
||||
async def run_governance_loop(interval_seconds: int = 3600) -> None:
    """Run GovernanceAgent.run_self_check() forever, pausing between passes.

    The first pass starts immediately; after every pass, successful or not,
    the loop sleeps ``interval_seconds``. Because each pass is awaited before
    the sleep begins, passes never overlap or pile up.

    Args:
        interval_seconds: Pause between passes in seconds (default one hour).
    """
    governance = get_governance_agent()

    async def _one_pass() -> None:
        # Any exception escaping a pass is logged so the loop keeps running.
        try:
            await governance.run_self_check()
        except Exception as exc:
            logger.warning("governance_loop_error", error=str(exc))

    while True:
        await _one_pass()
        await asyncio.sleep(interval_seconds)
|
||||
@@ -424,6 +424,15 @@ class OllamaFailoverManager:
|
||||
results = await pipe.execute()
|
||||
new_count = int(results[1]) # results[1] = INCR 後新值
|
||||
|
||||
# 2026-04-26 P2.3 by Claude Sonnet 4.6 (tool-expert) — 刷新 Gemini Prometheus Gauge
|
||||
# 每次 quota check 時同步更新,讓 Prometheus 取到最新值
|
||||
try:
|
||||
from src.core.metrics import GEMINI_DAILY_CALL_COUNT, GEMINI_DAILY_QUOTA
|
||||
GEMINI_DAILY_CALL_COUNT.set(new_count)
|
||||
GEMINI_DAILY_QUOTA.set(quota)
|
||||
except Exception:
|
||||
pass # metric 更新失敗不阻斷主路由邏輯
|
||||
|
||||
if new_count > quota:
|
||||
# 已超配額(INCR 後 > quota),回退不是必要的(最多超發 1 次)
|
||||
# 但要回傳 False 讓 router 切到 188
|
||||
@@ -551,6 +560,20 @@ class OllamaFailoverManager:
|
||||
# 111 正常,無切換事件
|
||||
return
|
||||
|
||||
# 2026-04-26 P2.3 by Claude Sonnet 4.6 (tool-expert) — 記錄 failover Prometheus metric
|
||||
try:
|
||||
from src.core.metrics import (
|
||||
OLLAMA_FAILOVER_TRIGGERED_TOTAL,
|
||||
OLLAMA_CURRENT_PRIMARY_IS_OLLAMA,
|
||||
)
|
||||
OLLAMA_FAILOVER_TRIGGERED_TOTAL.labels(
|
||||
from_provider="ollama",
|
||||
to_provider=result.primary.provider_name,
|
||||
).inc()
|
||||
OLLAMA_CURRENT_PRIMARY_IS_OLLAMA.set(0)
|
||||
except Exception as _metric_err:
|
||||
logger.debug("ollama_failover_metric_error", error=str(_metric_err))
|
||||
|
||||
logger.info(
|
||||
"ollama_failover_triggered",
|
||||
service="ollama_failover",
|
||||
|
||||
Reference in New Issue
Block a user