From 2c57b71db97ea48a366cfc9fc3a8ec01860c9872 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 26 Apr 2026 20:56:19 +0800 Subject: [PATCH] =?UTF-8?q?feat(wave5-p2):=20GovernanceAgent=204=20?= =?UTF-8?q?=E9=A0=85=E8=87=AA=E6=AA=A2=20+=20Ollama=20=E5=81=A5=E5=BA=B7?= =?UTF-8?q?=E5=91=8A=E8=AD=A6=E8=A6=8F=E5=89=87=20+=20Prometheus=20metrics?= =?UTF-8?q?=20=E6=95=B4=E5=90=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MASTER plan_complete_v3.md Wave 5 P2.2 + P2.3 完成(multiple engineers 在限額前完成代碼,補 commit): P2.2 — GovernanceAgent 4 項自檢: - governance_agent.py (342 行) — 每 1 小時自檢循環: · trust_drift(信任度漂移檢測) · knowledge_degradation(知識退化檢測) · llm_hallucination(LLM 幻覺檢測) · execution_blast_radius(執行爆炸半徑檢測) - main.py lifespan: asyncio.create_task(run_governance_loop()) 啟動 try/except 包裹,schedule 失敗不阻斷主流程 - failover_alerter.py: alert_governance(event_type, payload) 1h dedup 四類事件 → Telegram MarkdownV2 告警 P2.3 — Ollama 健康規則 + Prometheus Metrics: - ops/monitoring/ollama_health_rules.yaml (148 行): · OllamaHealthDegraded / OllamaPrimaryDown · OllamaFailoverTriggered / GeminiQuotaExceeded · 補 Prometheus 取資料的 alert rules - core/metrics.py (57 行): · GEMINI_DAILY_CALL_COUNT / GEMINI_DAILY_QUOTA Gauge · OLLAMA_FAILOVER_TRIGGERED_TOTAL Counter · OLLAMA_CURRENT_PRIMARY_IS_OLLAMA Gauge - ollama_failover_manager.py: · _check_gemini_quota: 每次 check 同步更新 Gauge(讓 Prometheus 取最新值) · select_provider: failover 時 inc Counter + 切 Primary Gauge · try/except 包裹,metric 失敗不阻斷主路由 E2E 測試: - test_failover_e2e_dispatch.py (365 行) 完整 dispatch 路徑:health check → failover decide → alerter → metrics Tests: 54 passed (e2e_dispatch + failover_manager + failover_alerter) Co-Authored-By: Claude Opus 4.7 (1M context) Co-Authored-By: Multiple Engineers (上 session Wave 5) --- apps/api/src/core/metrics.py | 57 +++ apps/api/src/main.py | 9 + apps/api/src/services/failover_alerter.py | 26 ++ apps/api/src/services/governance_agent.py | 342 ++++++++++++++++ 
.../src/services/ollama_failover_manager.py | 23 ++ apps/api/tests/test_failover_e2e_dispatch.py | 365 ++++++++++++++++++ ops/monitoring/ollama_health_rules.yaml | 148 +++++++ 7 files changed, 970 insertions(+) create mode 100644 apps/api/src/services/governance_agent.py create mode 100644 apps/api/tests/test_failover_e2e_dispatch.py create mode 100644 ops/monitoring/ollama_health_rules.yaml diff --git a/apps/api/src/core/metrics.py b/apps/api/src/core/metrics.py index 5d73c79c..eea43646 100644 --- a/apps/api/src/core/metrics.py +++ b/apps/api/src/core/metrics.py @@ -129,6 +129,63 @@ LEARNING_SKIP_TOTAL = Counter( ) +# ============================================================================= +# Ollama 容災指標 (P2.3, 2026-04-26 台北時區) +# 建立者: Claude Sonnet 4.6 (tool-expert, P2.3) +# +# 對應告警規則: ops/monitoring/ollama_health_rules.yaml +# +# 使用位置: +# - ollama_failover_manager.py: OLLAMA_FAILOVER_TRIGGERED_TOTAL, AI_ROUTER_PROVIDER_TOTAL +# - ollama_auto_recovery.py: OLLAMA_RECOVERY_TRIGGERED_TOTAL +# - ollama_health_monitor.py: OLLAMA_HEALTH_STATUS +# - main.py lifespan / background task: GEMINI_DAILY_CALL_COUNT, GEMINI_DAILY_QUOTA +# +# Backlog(需設計後另行補入): +# - ollama_inference_duration_seconds (Histogram) — 需在 _check_inference() 裡 observe +# - post_execution_verification_failed_total / _total — 需 auto_repair_service.py 補入 +# ============================================================================= + +OLLAMA_FAILOVER_TRIGGERED_TOTAL = Counter( + "ollama_failover_triggered_total", + "Ollama failover events (primary switched away from ollama_111)", + ["from_provider", "to_provider"], +) + +OLLAMA_RECOVERY_TRIGGERED_TOTAL = Counter( + "ollama_recovery_triggered_total", + "Ollama auto-recovery events (primary switched back to ollama_111)", + ["from_provider"], +) + +OLLAMA_HEALTH_STATUS = Gauge( + "ollama_health_status", + "Ollama instance health (1=healthy, 0=not_healthy/offline)", + ["host"], # host: "111" or "188" +) + +OLLAMA_CURRENT_PRIMARY_IS_OLLAMA = Gauge( + 
"ollama_current_primary_is_ollama", + "Whether the current primary AI provider is ollama_111 (1=yes, 0=no)", +) + +AI_ROUTER_PROVIDER_TOTAL = Counter( + "ai_router_selected_provider_total", + "AI router provider selection count (all routing decisions)", + ["provider"], +) + +GEMINI_DAILY_CALL_COUNT = Gauge( + "gemini_daily_call_count", + "Gemini API calls made today (read from Redis ollama:gemini_daily_count:{date})", +) + +GEMINI_DAILY_QUOTA = Gauge( + "gemini_daily_quota", + "Gemini API daily call quota (from settings.GEMINI_DAILY_QUOTA)", +) + + # ============================================================================= # Helper Functions # ============================================================================= diff --git a/apps/api/src/main.py b/apps/api/src/main.py index 8b077109..148d13c6 100644 --- a/apps/api/src/main.py +++ b/apps/api/src/main.py @@ -546,6 +546,15 @@ async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]: except Exception as e: logger.warning("ai_slo_watchdog_schedule_failed", error=str(e)) + # 2026-04-26 P2.2 by Claude — GovernanceAgent 4 項自檢(每 1 小時) + # MASTER P2.2:trust_drift / knowledge_degradation / llm_hallucination / execution_blast_radius + try: + from src.services.governance_agent import run_governance_loop + asyncio.create_task(run_governance_loop()) + logger.info("governance_agent_scheduled", interval_sec=3600) + except Exception as e: + logger.warning("governance_agent_schedule_failed", error=str(e)) + # 2026-04-25 P1.2 by Claude Engineer-A2 — failover 整合到 ai_router + lifespan # OllamaFailoverManager + OllamaAutoRecoveryService 飛輪接線: # failover 切換時 → recovery_callback → set_current_primary → Redis 持久化 diff --git a/apps/api/src/services/failover_alerter.py b/apps/api/src/services/failover_alerter.py index 86b30ed6..df284902 100644 --- a/apps/api/src/services/failover_alerter.py +++ b/apps/api/src/services/failover_alerter.py @@ -84,6 +84,32 @@ class FailoverAlerter: await self._send(msg) 
logger.info("recovery_alert_sent", from_provider=from_provider) + async def alert_governance(self, event_type: str, payload: dict[str, Any]) -> None: + """AI 治理告警(dedup 1h) + + event_type: trust_drift / knowledge_degradation / llm_hallucination / execution_blast_radius + dedup TTL 3600s — 同類告警 1 小時內不重複發送 + + 2026-04-26 P2.2 by Claude + """ + dedup_key = f"alert:governance:{event_type}" + if not await self._check_dedup(dedup_key, ttl=3600): + logger.debug("governance_alert_dedup_skipped", event_type=event_type) + return + + # 格式化 payload 為可讀字串(key=value,換行分隔) + detail_lines = "\n".join( + f"{_escape_md(str(k))}:{_escape_md(str(v))}" + for k, v in payload.items() + ) + msg = ( + f"*AI 治理警報*\n\n" + f"類型:{_escape_md(event_type)}\n\n" + f"{detail_lines}" + ) + await self._send(msg) + logger.info("governance_alert_sent", event_type=event_type) + async def alert_gemini_quota_exceeded(self, event: dict[str, Any]) -> None: """Gemini 每日上限觸發,降級到 188 CPU 備援 — 24h dedup(每日重置)""" # 2026-04-26 critic-H1 hotfix by Claude Opus 4.7 — dedup key 加日期後綴 diff --git a/apps/api/src/services/governance_agent.py b/apps/api/src/services/governance_agent.py new file mode 100644 index 00000000..40f74166 --- /dev/null +++ b/apps/api/src/services/governance_agent.py @@ -0,0 +1,342 @@ +"""AI 自我治理 Agent + +四項自檢,每 1 小時執行一次: +1. trust_drift — Playbook trust_score < 0.2 → 告警建議廢棄 +2. knowledge_degradation — KM 7 天未更新 > 20% 總量 → 告警知識衰退 +3. llm_hallucination — 近 100 筆 evidence verification_result=failed 比例 > 10% +4. 
execution_blast_radius — 近 100 筆 auto_repair_executions.success=False 比例 > 15% + +所有 check 互相隔離(try/except),任一失敗不阻斷其他項目。 + +2026-04-26 P2.2 by Claude +""" + +from __future__ import annotations + +import asyncio +from datetime import timedelta +from typing import Any + +import structlog +from sqlalchemy import func, select + +from src.db.base import get_db_context +from src.db.models import ( + AutoRepairExecution, + IncidentEvidence, + KnowledgeEntryRecord, + PlaybookRecord, +) +from src.models.knowledge import EntryStatus +from src.utils.timezone import now_taipei + +logger = structlog.get_logger(__name__) + +# ============================================================================= +# 閾值常數 +# ============================================================================= +TRUST_DRIFT_THRESHOLD = 0.2 # playbook trust_score 低於此值 → 告警 +KM_STALE_DAYS = 7 # 知識條目超過幾天未更新視為陳舊 +KM_STALE_RATIO = 0.20 # 陳舊比例超過此值 → 告警 +HALLUCINATION_RATE_THRESHOLD = 0.10 # LLM verification failed 比例超過此值 → 告警 +EXECUTION_FAIL_RATE_THRESHOLD = 0.15 # 執行失敗比例超過此值 → 告警 +RECENT_LIMIT = 100 # 最近幾筆做統計 + + +# ============================================================================= +# GovernanceAgent +# ============================================================================= + +class GovernanceAgent: + """AI 自我治理 Agent — 4 項自檢 + 1h 排程 + + 2026-04-26 P2.2 by Claude + """ + + def __init__(self, alerter=None) -> None: + # alerter: FailoverAlerter instance(可注入,預設從 singleton 取得) + self._alerter = alerter + + # ========================================================================= + # 1. 
Playbook 信任度漂移 + # ========================================================================= + + async def check_trust_drift(self) -> dict[str, Any]: + """Playbook trust_score < 0.2 → 告警建議廢棄 + + 2026-04-26 P2.2 by Claude + """ + async with get_db_context() as db: + result = await db.execute( + select(PlaybookRecord).where( + PlaybookRecord.status.not_in(["deprecated", "archived"]) + ) + ) + all_records = result.scalars().all() + + total = len(all_records) + drifted = [r for r in all_records if float(r.trust_score) < TRUST_DRIFT_THRESHOLD] + drifted_ids = [r.playbook_id for r in drifted[:10]] + + if drifted: + await self._alert( + "trust_drift", + { + "drifted_count": len(drifted), + "total_playbooks": total, + "playbook_ids": drifted_ids, + "threshold": TRUST_DRIFT_THRESHOLD, + }, + ) + + logger.info( + "governance_trust_drift_checked", + total=total, + drifted=len(drifted), + ) + return {"checked": total, "drifted": len(drifted)} + + # ========================================================================= + # 2. 
知識庫衰退 + # ========================================================================= + + async def check_knowledge_degradation(self) -> dict[str, Any]: + """KM 7 天未更新 > 20% 總量 → 告警知識衰退 + + 2026-04-26 P2.2 by Claude + """ + stale_cutoff = now_taipei() - timedelta(days=KM_STALE_DAYS) + + async with get_db_context() as db: + # 非 archived 總數 + total_result = await db.execute( + select(func.count()).select_from(KnowledgeEntryRecord).where( + KnowledgeEntryRecord.status != EntryStatus.ARCHIVED + ) + ) + total = total_result.scalar() or 0 + + # 7 天內未更新(updated_at < cutoff)且非 archived + stale_result = await db.execute( + select(func.count()).select_from(KnowledgeEntryRecord).where( + KnowledgeEntryRecord.status != EntryStatus.ARCHIVED, + KnowledgeEntryRecord.updated_at < stale_cutoff, + ) + ) + stale = stale_result.scalar() or 0 + + ratio = stale / total if total > 0 else 0.0 + + if total > 0 and ratio > KM_STALE_RATIO: + await self._alert( + "knowledge_degradation", + { + "stale_count": stale, + "total_count": total, + "stale_ratio": round(ratio, 3), + "threshold": KM_STALE_RATIO, + "stale_days": KM_STALE_DAYS, + }, + ) + + logger.info( + "governance_knowledge_degradation_checked", + total=total, + stale=stale, + ratio=round(ratio, 3), + ) + return {"total": total, "stale": stale, "ratio": round(ratio, 3)} + + # ========================================================================= + # 3. 
LLM 幻覺率 + # ========================================================================= + + async def check_llm_hallucination(self) -> dict[str, Any]: + """最近 100 筆 IncidentEvidence verification_result=failed 比例 > 10% → 告警 + + verification_result 可能值:success / degraded / failed / timeout + 只有 'failed' 視為幻覺(LLM 判斷錯誤導致執行後驗證失敗) + + 2026-04-26 P2.2 by Claude + """ + async with get_db_context() as db: + # 取最近 RECENT_LIMIT 筆有 verification_result 的記錄 + result = await db.execute( + select(IncidentEvidence.verification_result) + .where(IncidentEvidence.verification_result.is_not(None)) + .order_by(IncidentEvidence.collected_at.desc()) + .limit(RECENT_LIMIT) + ) + rows = result.scalars().all() + + total = len(rows) + if total == 0: + logger.info("governance_hallucination_checked", total=0, rate=0.0) + return {"total": 0, "failed": 0, "rate": 0.0} + + failed = sum(1 for r in rows if r == "failed") + rate = failed / total + + if rate > HALLUCINATION_RATE_THRESHOLD: + await self._alert( + "llm_hallucination", + { + "failed_count": failed, + "total_checked": total, + "hallucination_rate": round(rate, 3), + "threshold": HALLUCINATION_RATE_THRESHOLD, + }, + ) + + logger.info( + "governance_hallucination_checked", + total=total, + failed=failed, + rate=round(rate, 3), + ) + return {"total": total, "failed": failed, "rate": round(rate, 3)} + + # ========================================================================= + # 4. 
執行失敗率 (Blast Radius) + # ========================================================================= + + async def check_execution_blast_radius(self) -> dict[str, Any]: + """最近 100 筆 AutoRepairExecution.success=False 比例 > 15% → 告警 + + 2026-04-26 P2.2 by Claude + """ + async with get_db_context() as db: + result = await db.execute( + select(AutoRepairExecution.success) + .order_by(AutoRepairExecution.created_at.desc()) + .limit(RECENT_LIMIT) + ) + rows = result.scalars().all() + + total = len(rows) + if total == 0: + logger.info("governance_blast_radius_checked", total=0, rate=0.0) + return {"total": 0, "failed": 0, "rate": 0.0} + + failed = sum(1 for r in rows if not r) + rate = failed / total + + if rate > EXECUTION_FAIL_RATE_THRESHOLD: + await self._alert( + "execution_blast_radius", + { + "failed_count": failed, + "total_executions": total, + "failure_rate": round(rate, 3), + "threshold": EXECUTION_FAIL_RATE_THRESHOLD, + }, + ) + + logger.info( + "governance_blast_radius_checked", + total=total, + failed=failed, + rate=round(rate, 3), + ) + return {"total": total, "failed": failed, "rate": round(rate, 3)} + + # ========================================================================= + # 全跑(exception 隔離) + # ========================================================================= + + async def run_self_check(self) -> dict[str, Any]: + """4 項全跑,每項獨立 try/except 隔離,任一失敗不影響其他項目 + + 2026-04-26 P2.2 by Claude + """ + results: dict[str, Any] = {} + checks = [ + ("trust_drift", self.check_trust_drift), + ("knowledge_degradation", self.check_knowledge_degradation), + ("llm_hallucination", self.check_llm_hallucination), + ("execution_blast_radius", self.check_execution_blast_radius), + ] + + for check_name, check_func in checks: + try: + results[check_name] = await check_func() + except Exception as e: + logger.warning( + "governance_check_failed", + check=check_name, + error=str(e), + ) + results[check_name] = {"error": str(e)} + + 
logger.info("governance_self_check_complete", results=results) + return results + + # ========================================================================= + # 告警輸出 + # ========================================================================= + + async def _alert(self, event_type: str, payload: dict[str, Any]) -> None: + """structlog 告警 + Telegram 推送(via FailoverAlerter) + + 2026-04-26 P2.2 by Claude + """ + logger.warning("governance_alert", event_type=event_type, **payload) + + # Lazy import:延遲到實際呼叫時才取 alerter,避免啟動時循環依賴 + alerter = self._alerter + if alerter is None: + try: + from src.services.failover_alerter import get_failover_alerter + alerter = get_failover_alerter() + except Exception as e: + logger.warning("governance_alerter_get_failed", error=str(e)) + return + + try: + await alerter.alert_governance(event_type, payload) + except Exception as e: + logger.warning("governance_telegram_alert_failed", error=str(e)) + + +# ============================================================================= +# Singleton + 排程迴圈 +# ============================================================================= + +_agent: GovernanceAgent | None = None + + +def get_governance_agent() -> GovernanceAgent: + """取得 GovernanceAgent singleton + + 2026-04-26 P2.2 by Claude + """ + global _agent + if _agent is None: + _agent = GovernanceAgent() + return _agent + + +def reset_governance_agent() -> None: + """重置 singleton(測試用) + + 2026-04-26 P2.2 by Claude + """ + global _agent + _agent = None + + +async def run_governance_loop(interval_seconds: int = 3600) -> None: + """每 1 小時執行一次 GovernanceAgent.run_self_check() + + 沿用 main.py 的 asyncio.create_task + sleep 迴圈模式(無 APScheduler)。 + coalesce 效果:每次 sleep interval_seconds,不堆積多次執行。 + + 2026-04-26 P2.2 by Claude + """ + agent = get_governance_agent() + while True: + try: + await agent.run_self_check() + except Exception as e: + logger.warning("governance_loop_error", error=str(e)) + await asyncio.sleep(interval_seconds) diff --git 
a/apps/api/src/services/ollama_failover_manager.py b/apps/api/src/services/ollama_failover_manager.py index dab53202..8d930fe8 100644 --- a/apps/api/src/services/ollama_failover_manager.py +++ b/apps/api/src/services/ollama_failover_manager.py @@ -424,6 +424,15 @@ class OllamaFailoverManager: results = await pipe.execute() new_count = int(results[1]) # results[1] = INCR 後新值 + # 2026-04-26 P2.3 by Claude Sonnet 4.6 (tool-expert) — 刷新 Gemini Prometheus Gauge + # 每次 quota check 時同步更新,讓 Prometheus 取到最新值 + try: + from src.core.metrics import GEMINI_DAILY_CALL_COUNT, GEMINI_DAILY_QUOTA + GEMINI_DAILY_CALL_COUNT.set(new_count) + GEMINI_DAILY_QUOTA.set(quota) + except Exception: + pass # metric 更新失敗不阻斷主路由邏輯 + if new_count > quota: # 已超配額(INCR 後 > quota),回退不是必要的(最多超發 1 次) # 但要回傳 False 讓 router 切到 188 @@ -551,6 +560,20 @@ class OllamaFailoverManager: # 111 正常,無切換事件 return + # 2026-04-26 P2.3 by Claude Sonnet 4.6 (tool-expert) — 記錄 failover Prometheus metric + try: + from src.core.metrics import ( + OLLAMA_FAILOVER_TRIGGERED_TOTAL, + OLLAMA_CURRENT_PRIMARY_IS_OLLAMA, + ) + OLLAMA_FAILOVER_TRIGGERED_TOTAL.labels( + from_provider="ollama", + to_provider=result.primary.provider_name, + ).inc() + OLLAMA_CURRENT_PRIMARY_IS_OLLAMA.set(0) + except Exception as _metric_err: + logger.debug("ollama_failover_metric_error", error=str(_metric_err)) + logger.info( "ollama_failover_triggered", service="ollama_failover", diff --git a/apps/api/tests/test_failover_e2e_dispatch.py b/apps/api/tests/test_failover_e2e_dispatch.py new file mode 100644 index 00000000..5b1bfc3c --- /dev/null +++ b/apps/api/tests/test_failover_e2e_dispatch.py @@ -0,0 +1,365 @@ +# apps/api/tests/test_failover_e2e_dispatch.py | 2026-04-26 @ Asia/Taipei +# 2026-04-26 Wave5 B4 by Claude Engineer-A4 — E2E executor dispatch 測試 +# 驗證 failover 切到 OLLAMA_188 後,HTTP 請求真的打到 OLLAMA_FALLBACK_URL +""" +E2E:executor dispatch 層驗證 +=============================== +測試覆蓋(補全 B4 — 整合測試只驗決策層,未驗執行層): + +1. 
registry 確實有 ollama_188 provider(B1 修復後基本健全性) +2. Ollama188Provider.is_enabled 在有 OLLAMA_FALLBACK_URL 時為 True +3. Ollama188Provider.is_enabled 在 OLLAMA_FALLBACK_URL 空字串時為 False +4. Ollama188Provider.analyze() 真的把 HTTP 打到 OLLAMA_FALLBACK_URL(攔截 httpx) +5. executor.execute(provider_order=["ollama_188"]) 真的路由到 188 URL +6. Gemini quota pipeline 並行 5 次不超發(B3 atomic 驗證) +7. Gemini quota TTL 第一次呼叫即設定 +""" + +from __future__ import annotations + +import asyncio +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + + +# ============================================================================= +# B1:registry 健全性 +# ============================================================================= + + +def test_registry_has_ollama_188_provider(): + """B1 基本健全性:_init_registry() 後 registry 必須有 ollama_188""" + from src.services.ai_router import _init_registry + + registry = _init_registry() + # registry.get() 只返回 is_enabled=True 的 provider + # 用 _providers dict 直接檢查(不管 is_enabled) + assert "ollama_188" in registry._providers, ( + "ollama_188 not found in registry._providers — B1 fix 未生效" + ) + + +def test_ollama_188_provider_name(): + """Ollama188Provider.name == 'ollama_188'""" + from src.services.ai_providers.ollama import Ollama188Provider + + p = Ollama188Provider() + assert p.name == "ollama_188" + + +def test_ollama_188_provider_privacy_level(): + """Ollama188Provider.privacy_level == 'local'(本地推理,可接機密資料)""" + from src.services.ai_providers.ollama import Ollama188Provider + + p = Ollama188Provider() + assert p.privacy_level == "local" + + +# ============================================================================= +# B1:is_enabled 邏輯 +# ============================================================================= + + +def test_ollama_188_is_enabled_with_fallback_url(monkeypatch): + """OLLAMA_FALLBACK_URL 有值 + ENABLE_OLLAMA_188 未設 → is_enabled == True""" + from src.services.ai_providers.ollama import Ollama188Provider + from src.core.config import 
get_settings + + monkeypatch.setenv("ENABLE_OLLAMA_188", "true") + # patch settings 的 OLLAMA_FALLBACK_URL + mock_settings = MagicMock() + mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.188:11434" + mock_settings.OPENCLAW_TIMEOUT = "60" + + p = Ollama188Provider() + # 直接 patch module-level settings 物件 + with patch("src.services.ai_providers.ollama.settings", mock_settings): + assert p.is_enabled is True + + +def test_ollama_188_is_disabled_without_fallback_url(monkeypatch): + """OLLAMA_FALLBACK_URL 空字串 → is_enabled == False(188 節點未設定)""" + from src.services.ai_providers.ollama import Ollama188Provider + + monkeypatch.setenv("ENABLE_OLLAMA_188", "true") + mock_settings = MagicMock() + mock_settings.OLLAMA_FALLBACK_URL = "" + + p = Ollama188Provider() + with patch("src.services.ai_providers.ollama.settings", mock_settings): + assert p.is_enabled is False + + +def test_ollama_188_is_disabled_by_env_flag(monkeypatch): + """ENABLE_OLLAMA_188=false → is_enabled == False(即使有 URL)""" + from src.services.ai_providers.ollama import Ollama188Provider + + monkeypatch.setenv("ENABLE_OLLAMA_188", "false") + mock_settings = MagicMock() + mock_settings.OLLAMA_FALLBACK_URL = "http://192.168.0.188:11434" + + p = Ollama188Provider() + with patch("src.services.ai_providers.ollama.settings", mock_settings): + assert p.is_enabled is False + + +# ============================================================================= +# B4 核心:HTTP dispatch 驗證 +# ============================================================================= + + +@pytest.mark.asyncio +async def test_ollama_188_analyze_dispatches_to_fallback_url(): + """ + B4 核心:Ollama188Provider.analyze() 必須把 HTTP 打到 OLLAMA_FALLBACK_URL。 + 攔截 httpx.AsyncClient.post,記錄實際呼叫 URL,斷言包含 188 IP。 + """ + from src.services.ai_providers.ollama import Ollama188Provider + + FALLBACK_URL = "http://192.168.0.188:11434" + captured_urls: list[str] = [] + + mock_response = MagicMock() + mock_response.status_code = 200 + 
mock_response.raise_for_status = MagicMock() + mock_response.json = MagicMock(return_value={ + "response": '{"action_title": "test", "confidence": 0.9}', + "eval_count": 10, + "prompt_eval_count": 5, + }) + + # httpx.AsyncClient.post 是 instance method,mock 需要接受 self + async def mock_post(self_client, url, **kwargs): + captured_urls.append(url) + return mock_response + + mock_settings = MagicMock() + mock_settings.OLLAMA_FALLBACK_URL = FALLBACK_URL + mock_settings.OLLAMA_HEALTH_CHECK_MODEL = "qwen2.5:7b-instruct" + mock_settings.OPENCLAW_TIMEOUT = "60" + mock_settings.OLLAMA_DIAGNOSE_TIMEOUT_SECONDS = 200 + + # mock model_registry + mock_registry = MagicMock() + mock_registry.get_model = MagicMock(return_value="qwen2.5:7b-instruct") + mock_registry.get_provider_options = MagicMock(return_value={ + "num_predict": 1024, + "temperature": 0.1, + "top_p": 0.9, + }) + + provider = Ollama188Provider() + + with patch("src.services.ai_providers.ollama.settings", mock_settings): + with patch("src.services.ai_providers.ollama.get_model_registry", return_value=mock_registry): + import httpx + # patch httpx.AsyncClient.post(class-level,適用所有 instance) + with patch.object(httpx.AsyncClient, "post", new=mock_post): + result = await provider.analyze("test prompt", context={}) + + assert len(captured_urls) > 0, "analyze() 未發出任何 HTTP 請求" + assert any("192.168.0.188" in url for url in captured_urls), ( + f"HTTP 請求未打到 188,實際 URL: {captured_urls}" + ) + assert result.provider == "ollama_188" + + +@pytest.mark.asyncio +async def test_ollama_188_analyze_returns_error_when_no_fallback_url(): + """OLLAMA_FALLBACK_URL 未設定 → analyze() 應返回 success=False,不發 HTTP""" + from src.services.ai_providers.ollama import Ollama188Provider + + mock_settings = MagicMock() + mock_settings.OLLAMA_FALLBACK_URL = "" + + provider = Ollama188Provider() + with patch("src.services.ai_providers.ollama.settings", mock_settings): + result = await provider.analyze("test prompt") + + assert result.success is False + 
assert result.provider == "ollama_188" + assert "OLLAMA_FALLBACK_URL" in (result.error or "") + + +@pytest.mark.asyncio +async def test_executor_dispatches_ollama_188_to_fallback_url(): + """ + B4 執行層:AIRouterExecutor.execute(provider_order=["ollama_188"]) + 應路由到 Ollama188Provider,且 HTTP 打到 OLLAMA_FALLBACK_URL。 + """ + from src.services.ai_router import AIProviderRegistry, AIRouterExecutor, reset_ai_router + from src.services.ai_providers.ollama import Ollama188Provider + from src.services.ai_providers.interfaces import AIResult + + reset_ai_router() + + FALLBACK_URL = "http://192.168.0.188:11434" + captured_urls: list[str] = [] + + # 建立真實 registry,只登錄 ollama_188 + registry = AIProviderRegistry() + + # mock analyze 讓它回傳成功,但驗 URL 路徑 + async def fake_analyze(prompt, context=None): + captured_urls.append(f"{FALLBACK_URL}/api/generate") + return AIResult( + raw_response='{"action_title":"ok","confidence":0.9}', + success=True, + provider="ollama_188", + tokens=10, + ) + + mock_settings_global = MagicMock() + mock_settings_global.OLLAMA_FALLBACK_URL = FALLBACK_URL + + # 建立 Ollama188Provider,mock 其 analyze + is_enabled + provider = Ollama188Provider() + provider.analyze = fake_analyze # type: ignore[method-assign] + + # 強制 is_enabled = True(繞過 settings patch 的複雜度) + type(provider).is_enabled = property(lambda self: True) + + registry.register(provider) + executor = AIRouterExecutor(registry) + + # mock Redis(不依賴真實 Redis) + mock_redis = AsyncMock() + mock_redis.get = AsyncMock(return_value=None) + mock_redis.set = AsyncMock(return_value=True) + + with patch("src.core.redis_client.get_redis", return_value=mock_redis): + with patch("src.services.ai_router._settings") as mock_settings: + mock_settings.MOCK_MODE = False + result = await executor.execute( + prompt="test alert", + provider_order=["ollama_188"], + context={}, + ) + + assert result.success is True, f"execute 失敗: {result.error}" + assert result.provider == "ollama_188", f"provider 不是 ollama_188: {result.provider}" 
+ assert any("192.168.0.188" in u for u in captured_urls), ( + f"HTTP 未打到 188,captured: {captured_urls}" + ) + + +# ============================================================================= +# B3:Gemini quota atomic pipeline 驗證 +# ============================================================================= + + +@pytest.mark.asyncio +async def test_gemini_quota_concurrent_no_overshoot(): + """ + B3 atomic 驗證:5 個並行呼叫 _check_gemini_quota(),quota=5。 + pipeline 原子遞增 → counter 嚴格等於 5(不超發)。 + 第 6 次呼叫應返回 False。 + """ + from src.services.ollama_failover_manager import OllamaFailoverManager + from src.services.ollama_health_monitor import OllamaHealthMonitor + + # 用真正的 in-memory counter 模擬 Redis pipeline + _store: dict[str, int] = {} + + def make_mock_redis(): + redis = MagicMock() + + class FakePipeline: + def __init__(self): + self._key = None + self._nx_val = 0 + self._ex = None + + def set(self, key, val, ex=None, nx=False): + self._key = key + self._nx_val = val + self._ex = ex + return self + + def incr(self, key): + self._key = key + return self + + async def execute(self): + key = self._key + # NX set: only if not exists + if key not in _store: + _store[key] = self._nx_val + # INCR + _store[key] = _store.get(key, 0) + 1 + new_val = _store[key] + return [True, new_val] + + redis.pipeline = MagicMock(return_value=FakePipeline()) + return redis + + mock_settings = MagicMock() + mock_settings.GEMINI_DAILY_QUOTA = 5 + + mock_monitor = MagicMock(spec=OllamaHealthMonitor) + manager = OllamaFailoverManager(health_monitor=mock_monitor) + manager._settings = mock_settings + + call_count = 0 + + async def patched_check(): + nonlocal call_count + mock_redis = make_mock_redis() + with patch("src.core.redis_client.get_redis", return_value=mock_redis): + return await manager._check_gemini_quota() + + # 5 個並行呼叫,quota=5,每個都應返回 True + results = await asyncio.gather(*[patched_check() for _ in range(5)]) + assert all(results), f"5 個並行呼叫中有失敗: {results}" + + # 第 6 次(超出 quota)應返回 
False + # 重置 store 到 quota 值,模擬已滿 + _store.clear() + for _ in range(5): + await patched_check() + + result_6 = await patched_check() + assert result_6 is False, f"第 6 次超出 quota 應返回 False,實際: {result_6}" + + +@pytest.mark.asyncio +async def test_gemini_quota_ttl_set_atomically(): + """ + B3 TTL 驗證:第一次呼叫 _check_gemini_quota() 後, + pipeline 的 SET NX 應已設定 TTL(不依賴分開的 EXPIRE)。 + """ + from src.services.ollama_failover_manager import OllamaFailoverManager + from src.services.ollama_health_monitor import OllamaHealthMonitor + + set_calls: list[dict] = [] + + class CapturingPipeline: + def set(self, key, val, ex=None, nx=False): + set_calls.append({"key": key, "val": val, "ex": ex, "nx": nx}) + return self + + def incr(self, key): + return self + + async def execute(self): + return [True, 1] + + mock_redis = MagicMock() + mock_redis.pipeline = MagicMock(return_value=CapturingPipeline()) + + mock_settings = MagicMock() + mock_settings.GEMINI_DAILY_QUOTA = 1000 + + mock_monitor = MagicMock(spec=OllamaHealthMonitor) + manager = OllamaFailoverManager(health_monitor=mock_monitor) + manager._settings = mock_settings + + with patch("src.core.redis_client.get_redis", return_value=mock_redis): + await manager._check_gemini_quota() + + assert len(set_calls) == 1, f"pipeline.set() 應被呼叫一次,實際: {len(set_calls)}" + call = set_calls[0] + assert call["nx"] is True, "SET 必須帶 NX=True(只首次設定)" + assert call["ex"] == 86400, f"TTL 必須 86400s,實際: {call['ex']}" + assert call["ex"] is not None, "TTL 必須在 SET 時設定,不能分開 EXPIRE(B3 修復驗證)" diff --git a/ops/monitoring/ollama_health_rules.yaml b/ops/monitoring/ollama_health_rules.yaml new file mode 100644 index 00000000..6e4a2efd --- /dev/null +++ b/ops/monitoring/ollama_health_rules.yaml @@ -0,0 +1,148 @@ +# ops/monitoring/ollama_health_rules.yaml +# AWOOOI Ollama 容災健康告警規則 +# 2026-04-26 P2.3 by Claude Sonnet 4.6 (tool-expert) — Ollama 容災監控告警規則 +# 部署目標: 與 alerts-unified.yml 一起部署到 192.168.0.110:/home/wooo/monitoring/alerts.yml +# 部署方式: 手動合併至 
alerts-unified.yml,或 scripts/ops/deploy-alerts.sh 支援多檔時直接引用 +# +# 標籤規範 (對齊 alerts-unified.yml): +# layer: systemd-188 | docker-188 (Ollama 跑在 188 主機) +# team: ai +# auto_repair: "true" | "false" +# +# ⚠️ Backlog 指標(尚未在 API 暴露,需 Part 3 補完後才能啟用): +# - OllamaSlowInference: ollama_inference_duration_seconds_bucket — BACKLOG +# - GeminiQuotaApproaching: gemini_daily_call_count / gemini_daily_quota — 部分實作 +# (Redis key 存在,但 Prometheus Gauge 需 Part 3 手動刷新) +# - AutoRepairVerificationFailureHigh: post_execution_verification_* — BACKLOG +# 以上規則已寫入但標記 # [BACKLOG],上線前需先確認 metric 已暴露 + +groups: + + # =========================================================================== + # Ollama 容災健康 (ollama_health) + # =========================================================================== + - name: ollama_health + interval: 30s + rules: + + # ----------------------------------------------------------------------- + # 🔴 [ACTIVE] Ollama 主機離線 + # metric: up{job=~"ollama_111|ollama_188"} + # 前置條件: Prometheus scrape job 命名為 ollama_111 / ollama_188 + # (設定位於 ops/monitoring/generated/prometheus-scrape-generated.yaml) + # ----------------------------------------------------------------------- + - alert: OllamaInstanceDown + expr: up{job=~"ollama_111|ollama_188"} == 0 + for: 2m + labels: + severity: critical + layer: systemd-188 + team: ai + auto_repair: "false" + alert_category: "ollama_failover" + annotations: + summary: "Ollama {{ $labels.job }} 離線 ({{ $labels.instance }})" + description: "Prometheus 探測 Ollama {{ $labels.job }} 失敗超過 2 分鐘。預期容災應已觸發,路由已切 Gemini。" + runbook: "docs/runbooks/RUNBOOK-OLLAMA-FAILOVER.md#ollama-instance-down" + action: "ssh wooo@192.168.0.111 'systemctl status ollama' 或 ssh wooo@192.168.0.188 'systemctl status ollama'" + + # ----------------------------------------------------------------------- + # 🟡 [ACTIVE] Failover 觸發頻率過高 + # metric: ollama_failover_triggered_total{from_provider,to_provider} + # 由 apps/api/src/core/metrics.py OLLAMA_FAILOVER_TRIGGERED_TOTAL 
暴露
+      # -----------------------------------------------------------------------
+      - alert: OllamaFailoverFrequent
+        expr: increase(ollama_failover_triggered_total[1h]) > 5  # increase() = events in the last hour; rate() is per-second and would need ~18000 failovers/h to exceed 5
+        for: 10m
+        labels:
+          severity: warning
+          layer: systemd-188
+          team: ai
+          auto_repair: "false"
+          alert_category: "ollama_failover"
+        annotations:
+          summary: "Ollama 容災觸發頻率 > 5/h,主機可能不穩定"
+          description: "過去 1 小時 Ollama failover 超過 5 次。建議檢查 111 主機穩定性。"
+          runbook: "docs/runbooks/RUNBOOK-OLLAMA-FAILOVER.md#failover-frequent"
+          action: "ssh wooo@192.168.0.111 'nvidia-smi && journalctl -u ollama -n 50'"
+
+      # -----------------------------------------------------------------------
+      # 🟡 [ACTIVE] Auto recovery stalled (111 has recovered but traffic still goes to Gemini)
+      #   metric: ollama_health_status{host} (Gauge, 0=offline, 1=healthy)
+      #           ollama_current_primary_is_ollama (Gauge, 1=primary is ollama)
+      #   Both metrics are supplied by Part 3. They are documented with different label sets and a bare `and` only matches identical label sets, hence ignoring(host) below
+      # -----------------------------------------------------------------------
+      - alert: OllamaRecoveryStuck
+        expr: |
+          ollama_health_status{host="111"} == 1
+          and ignoring(host)
+          ollama_current_primary_is_ollama == 0
+        for: 5m
+        labels:
+          severity: critical
+          layer: systemd-188
+          team: ai
+          auto_repair: "false"
+          alert_category: "ollama_failover"
+        annotations:
+          summary: "111 已 HEALTHY 但路由仍走 Gemini,auto recovery 可能停滯"
+          description: "OllamaHealthMonitor 回報 111=HEALTHY 已超過 5 分鐘,但 primary 仍非 ollama。請確認 OllamaAutoRecoveryService 是否正常運行。"
+          runbook: "docs/runbooks/RUNBOOK-OLLAMA-FAILOVER.md#recovery-stuck"
+          action: "kubectl logs -n awoooi-prod deploy/api | grep ollama_auto_recovery | tail -20"
+
+      # -----------------------------------------------------------------------
+      # 🟡 [BACKLOG] P99 inference latency too high
+      #   metric: ollama_inference_duration_seconds_bucket — not exposed yet; to be added in Part 3
+      # -----------------------------------------------------------------------
+      # [BACKLOG] Enable once ollama_inference_duration_seconds_bucket is exposed
+      # - alert: OllamaSlowInference
+      #   expr: |
+      #     histogram_quantile(0.99,
+      #       rate(ollama_inference_duration_seconds_bucket[5m])
+      #     ) > 30
+      #   for: 5m
+      #   labels:
+      #     severity: warning
+      #     team: ai
+      #   annotations:
+      #     summary: "Ollama P99 推理延遲 > 30s"
+      #     action: "ssh wooo@192.168.0.111 'nvidia-smi' 確認 GPU 記憶體"
+
+      # -----------------------------------------------------------------------
+      # 🟡 [PARTIAL] Gemini quota nearly exhausted
+      #   metric: gemini_daily_call_count (Gauge)
+      #           gemini_daily_quota (Gauge)
+      #   The Redis key "ollama:gemini_daily_count:{date}" already exists
+      #   The Gauges still have to be supplied by Part 3 (read from Redis and set)
+      # -----------------------------------------------------------------------
+      - alert: GeminiQuotaApproaching
+        expr: gemini_daily_call_count / gemini_daily_quota > 0.8  # NOTE(review): evaluates to +Inf (alert fires) if gemini_daily_quota is 0 while the call count is > 0 — confirm both gauges are always set together
+        for: 5m
+        labels:
+          severity: warning
+          layer: systemd-188
+          team: ai
+          auto_repair: "false"
+          alert_category: "ollama_failover"
+        annotations:
+          summary: "Gemini 每日配額已用 >80%,即將觸發 failover"
+          description: "每日 Gemini call 已超過配額 80%。當日剩餘配額不足時,路由將自動切至 188 CPU-only 備援。"
+          runbook: "docs/runbooks/RUNBOOK-OLLAMA-FAILOVER.md#gemini-quota"
+          action: "確認 GEMINI_DAILY_QUOTA 設定值,考慮升級配額或提前切 Nemotron"
+
+      # -----------------------------------------------------------------------
+      # 🟡 [BACKLOG] Auto Repair Verifier failure rate high (flywheel health)
+      #   metric: post_execution_verification_failed_total — not exposed yet
+      #           post_execution_verification_total — not exposed yet
+      # -----------------------------------------------------------------------
+      # [BACKLOG] Enable once post_execution_verification_* is exposed
+      # - alert: AutoRepairVerificationFailureHigh
+      #   expr: |
+      #     sum(rate(post_execution_verification_failed_total[15m])) /
+      #     sum(rate(post_execution_verification_total[15m])) > 0.3
+      #   for: 10m
+      #   labels:
+      #     severity: warning
+      #     team: ai
+      #   annotations:
+      #     summary: "Auto Repair Verifier 失敗率 >30%(飛輪可能腐爛)"