diff --git a/apps/api/src/core/config.py b/apps/api/src/core/config.py index dcb43443..a81e8a2b 100644 --- a/apps/api/src/core/config.py +++ b/apps/api/src/core/config.py @@ -500,7 +500,7 @@ class Settings(BaseSettings): default=False, description=( "Allow LocalCodeReviewService to fall back to Gemini when the " - "GCP-B/Ollama code-review lane fails. Default false to avoid " + "local Ollama code-review lane fails. Default false to avoid " "unexpected cloud spend from Gitea push/PR alerts." ), ) diff --git a/apps/api/src/hermes/nl_gateway.py b/apps/api/src/hermes/nl_gateway.py index c9779387..0d633557 100644 --- a/apps/api/src/hermes/nl_gateway.py +++ b/apps/api/src/hermes/nl_gateway.py @@ -9,6 +9,7 @@ Layer 1 意圖路由(關鍵字正則)→ Ollama 本地模型(111)→ Tel debugger/vuln → deepseek-r1:14b(推理); code agents → qwen2.5-coder:7b; 其他 → qwen2.5:7b-instruct """ from __future__ import annotations + import asyncio import re import time @@ -17,7 +18,6 @@ import httpx import structlog from sqlalchemy import text -from src.core.config import settings from src.core.redis_client import get_redis from src.db.base import get_db_context from src.hermes.agent_loader import get_agent_system_prompt @@ -266,7 +266,9 @@ async def process_nl_message( success = False error_type: str | None = None try: - ollama_base = getattr(settings, "OLLAMA_URL", "http://34.143.170.20:11434") # 2026-05-03 ogt: ADR-110 GCP-A Primary + from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint + + ollama_base = resolve_ollama_endpoint("hermes") async with httpx.AsyncClient(timeout=_OLLAMA_TIMEOUT) as _hc: resp = await _hc.post( f"{ollama_base}/api/chat", diff --git a/apps/api/src/services/decision_fusion_adapter.py b/apps/api/src/services/decision_fusion_adapter.py index 6628b614..529f2e90 100644 --- a/apps/api/src/services/decision_fusion_adapter.py +++ b/apps/api/src/services/decision_fusion_adapter.py @@ -26,7 +26,7 @@ from __future__ import annotations import asyncio import re -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Literal import httpx @@ -125,7 +125,7 @@ class DecisionFusionAdapter: # Public API # ========================================================================= - async def fuse_decision(self, event: "AiGovernanceEvent") -> FusedDecision: + async def fuse_decision(self, event: AiGovernanceEvent) -> FusedDecision: """三維融合:LLM × Playbook × MCP → FusedDecision。 三個維度並行評估(asyncio.gather),任一失敗靜默降為 0.5。 @@ -226,7 +226,7 @@ class DecisionFusionAdapter: # ========================================================================= async def _score_llm( - self, event: "AiGovernanceEvent" + self, event: AiGovernanceEvent ) -> tuple[float, str, dict[str, Any]]: """Ollama LLM 推理:治理事件情境 → 建議動作 + 信心度。 @@ -254,7 +254,9 @@ class DecisionFusionAdapter: "只輸出 CONFIDENCE 和 ACTION 兩行,不要其他解釋。" ) - ollama_url = getattr(self._settings, "OLLAMA_URL", "http://192.168.0.111:11434") # 2026-05-04 ogt: ADR-110 修正 — 111 primary + from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint + + ollama_url = resolve_ollama_endpoint("deep_rca") try: async with httpx.AsyncClient( @@ -320,7 +322,7 @@ class DecisionFusionAdapter: # ========================================================================= async def _score_playbook( - self, event: "AiGovernanceEvent" + self, event: AiGovernanceEvent ) -> tuple[float, str | None, float | None]: """Playbook 相似度比對 → 取最高 trust_score。 @@ -373,7 +375,7 @@ class DecisionFusionAdapter: # ========================================================================= async def _score_mcp( - self, event: "AiGovernanceEvent" + self, event: AiGovernanceEvent ) -> tuple[float, dict[str, Any]]: """Prometheus 情報採集 → MCP 感官品質分數。 diff --git a/apps/api/src/services/drift_narrator_service.py b/apps/api/src/services/drift_narrator_service.py index f33d928d..e09448e6 100644 --- a/apps/api/src/services/drift_narrator_service.py +++ b/apps/api/src/services/drift_narrator_service.py @@ -33,10 +33,11 @@ logger = structlog.get_logger(__name__) # ============================================================ # 設定 # ============================================================ -# 2026-05-03 ogt: ADR-110 GCP-A Primary — 改從 settings 讀取,不再硬編碼 111 +# 2026-05-05 Codex: 重摘要走 111 lane,避免污染 GCP alert-fast lane def _get_ollama_url() -> str: - from src.core.config import get_settings - return get_settings().OLLAMA_URL + from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint + + return resolve_ollama_endpoint("deep_rca") # D1 集中化 2026-04-11: 從 models.json providers.ollama.models.drift_summary 讀取 NARRATOR_MODEL = get_model("ollama", "drift_summary") NARRATOR_TIMEOUT = 90.0 # seconds @@ -120,8 +121,8 @@ class DriftNarratorService: async def narrate_and_notify( self, - report: "DriftReport", - interpretation: "DriftInterpretation | None" = None, + report: DriftReport, + interpretation: DriftInterpretation | None = None, ) -> None: """ 生成人話摘要並推送 Telegram @@ -166,7 +167,7 @@ class DriftNarratorService: medium=report.medium_count, ) - def _should_narrate(self, report: "DriftReport") -> bool: + def _should_narrate(self, report: DriftReport) -> bool: """觸發條件:high >= 1 or medium >= 3""" # 過濾 HPA 白名單後重算 non_hpa_items = [ @@ -180,8 +181,8 @@ class DriftNarratorService: async def _generate_narrative_and_items( self, - report: "DriftReport", - interpretation: "DriftInterpretation | None", + report: DriftReport, + interpretation: DriftInterpretation | None, ) -> tuple[str, list[dict], dict]: """ 2026-04-18 ogt + Claude Opus 4.7: B 方案 — LLM 產生 narrative + 結構化 items @@ -354,8 +355,8 @@ class DriftNarratorService: def _fallback_recommendation( self, - report: "DriftReport", - interpretation: "DriftInterpretation | None", + report: DriftReport, + interpretation: DriftInterpretation | None, ) -> dict: """ 2026-04-20 P0.2 ogt + Claude Opus 4.7: LLM 沒給 recommendation 時的 Python fallback @@ -397,7 +398,7 @@ class DriftNarratorService: async def _log_ai_action_to_db( self, - report: "DriftReport", + report: DriftReport, prompt: str, raw_response: str | None, narrative: str, @@ -416,7 +417,9 @@ class DriftNarratorService: - 若能找到該 drift 的 incident 關聯,設 parent_op_id """ import json as _json + from sqlalchemy import text as _sql + from src.db.base import get_db_context input_json = _json.dumps({ @@ -511,7 +514,7 @@ class DriftNarratorService: items_count=len(items), ) - def _format_drift_for_llm(self, report: "DriftReport") -> str: + def _format_drift_for_llm(self, report: DriftReport) -> str: """ 2026-04-18 ogt + Claude Opus 4.7: B 方案 — 餵 LLM 用的 JSON 序列化 保留更多原始 context 給 LLM 推理,不做 30 字元暴力截斷 @@ -582,7 +585,7 @@ class DriftNarratorService: # 一般變化 return f"{from_val} → {to_val}" - def _fallback_items(self, report: "DriftReport") -> list[dict]: + def _fallback_items(self, report: DriftReport) -> list[dict]: """ LLM 失敗時的 Python 智能摘要 (取代舊 str()[:30]) - 過濾白名單 @@ -605,7 +608,7 @@ class DriftNarratorService: }) return items - def _format_intent_summary(self, interpretation: "DriftInterpretation | None") -> str: + def _format_intent_summary(self, interpretation: DriftInterpretation | None) -> str: if not interpretation: return "無意圖分析" return ( @@ -616,8 +619,8 @@ class DriftNarratorService: def _fallback_narrative( self, - report: "DriftReport", - interpretation: "DriftInterpretation | None", + report: DriftReport, + interpretation: DriftInterpretation | None, ) -> str: """LLM 失敗時的結構化 fallback""" resources = list({ @@ -636,7 +639,7 @@ class DriftNarratorService: async def _send_telegram( self, - report: "DriftReport", + report: DriftReport, narrative: str, items: list[dict], recommendation: dict | None = None, @@ -667,7 +670,7 @@ class DriftNarratorService: except Exception as e: logger.warning("drift_narrator_telegram_error", error=str(e)) - def _count_nontrivial_drift(self, report: "DriftReport") -> int: + def _count_nontrivial_drift(self, report: DriftReport) -> int: """ 計算非白名單、非 trivial (K8s 自動補齊) 的 drift 數 用於 Telegram 底部「還有 N 項」顯示實際可操作數量 @@ -704,7 +707,7 @@ class DriftNarratorService: def _render_telegram_body( self, - report: "DriftReport", + report: DriftReport, narrative: str, items: list[dict], recommendation: dict | None = None, diff --git a/apps/api/src/services/embedding_service.py b/apps/api/src/services/embedding_service.py index 60e8d1c3..76603071 100644 --- a/apps/api/src/services/embedding_service.py +++ b/apps/api/src/services/embedding_service.py @@ -21,8 +21,8 @@ from typing import Protocol import httpx import structlog -from src.core.config import settings from src.services.model_registry import get_model as _get_model +from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint logger = structlog.get_logger(__name__) @@ -98,7 +98,7 @@ class OllamaEmbeddingService: P1 修復 (2026-03-29): 維度配置化,支援更多模型 """ self._model = model - self._ollama_url = ollama_url or settings.OLLAMA_URL + self._ollama_url = ollama_url or resolve_ollama_endpoint("embedding") self._timeout = timeout self._default_dimension = default_dimension or self.MODEL_DIMENSIONS.get( model, self.DEFAULT_DIMENSION diff --git a/apps/api/src/services/image_analysis_service.py b/apps/api/src/services/image_analysis_service.py index 4440fbae..d40728d1 100644 --- a/apps/api/src/services/image_analysis_service.py +++ b/apps/api/src/services/image_analysis_service.py @@ -21,7 +21,6 @@ AWOOOI — Image Analysis Service (Phase 34, ADR-067) from __future__ import annotations import base64 -import os import time from pathlib import Path from typing import TYPE_CHECKING @@ -29,14 +28,13 @@ from typing import TYPE_CHECKING import httpx import structlog -from src.core.config import get_settings from src.services.model_registry import get_model +from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint if TYPE_CHECKING: pass logger = structlog.get_logger(__name__) -settings = get_settings() # D1 集中化 2026-04-11: 從 models.json providers.ollama.models.image_analysis 讀取 _MODEL = get_model("ollama", "image_analysis") @@ -124,7 +122,7 @@ class ImageAnalysisService: image_b64 = base64.b64encode(image_path.read_bytes()).decode() http = await self._get_http() resp = await http.post( - f"{settings.OLLAMA_URL}/api/generate", + f"{resolve_ollama_endpoint('image_analysis')}/api/generate", json={ "model": _MODEL, "prompt": question, diff --git a/apps/api/src/services/intent_classifier.py b/apps/api/src/services/intent_classifier.py index 12ee6e38..d5d44938 100644 --- a/apps/api/src/services/intent_classifier.py +++ b/apps/api/src/services/intent_classifier.py @@ -31,8 +31,8 @@ from typing import Protocol, runtime_checkable import httpx import structlog -from src.core.config import settings from src.services.model_registry import get_model_registry +from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint logger = structlog.get_logger(__name__) @@ -549,7 +549,7 @@ class IntentClassifier: # 呼叫 Ollama async with httpx.AsyncClient() as client: response = await client.post( - f"{settings.OLLAMA_URL}/api/generate", + f"{resolve_ollama_endpoint('hermes')}/api/generate", json={ "model": model_name, "prompt": prompt, diff --git a/apps/api/src/services/knowledge_extractor_service.py b/apps/api/src/services/knowledge_extractor_service.py index bbbb8874..7326a6c6 100644 --- a/apps/api/src/services/knowledge_extractor_service.py +++ b/apps/api/src/services/knowledge_extractor_service.py @@ -15,10 +15,11 @@ import structlog logger = structlog.get_logger(__name__) -# 2026-05-03 ogt: ADR-110 GCP-A Primary — 改從 settings 讀取,不再硬編碼 111 +# 2026-05-05 Codex: KB 萃取走 111 lane,避免污染 GCP alert-fast lane def _get_ollama_base() -> str: - from src.core.config import get_settings - return get_settings().OLLAMA_URL + from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint + + return resolve_ollama_endpoint("deep_rca") _EXTRACT_MODEL = "llama3.2:3b" _EXTRACT_TIMEOUT = 30.0 # 秒,容忍慢速 @@ -117,7 +118,11 @@ class KnowledgeExtractorService: category = self._infer_category(incident) # 5. 建立 KB 條目 - from src.models.knowledge import EntrySource, EntryType, KnowledgeEntryCreate + from src.models.knowledge import ( + EntrySource, + EntryType, + KnowledgeEntryCreate, + ) from src.services.knowledge_service import get_knowledge_service entry_data = KnowledgeEntryCreate( diff --git a/apps/api/src/services/knowledge_rag_service.py b/apps/api/src/services/knowledge_rag_service.py index 3e68ce67..93191722 100644 --- a/apps/api/src/services/knowledge_rag_service.py +++ b/apps/api/src/services/knowledge_rag_service.py @@ -20,11 +20,10 @@ from pathlib import Path import httpx import structlog -from src.core.config import get_settings import src.repositories.rag_chunk_repository as rag_repo +from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint logger = structlog.get_logger(__name__) -settings = get_settings() _EMBED_MODEL = "nomic-embed-text" _GEN_MODEL = "qwen2.5:7b-instruct" @@ -131,7 +130,7 @@ class KnowledgeRAGService: try: http = await self._get_http() resp = await http.post( - f"{settings.OLLAMA_URL}/api/embeddings", + f"{resolve_ollama_endpoint('embedding')}/api/embeddings", json={"model": _EMBED_MODEL, "prompt": text}, ) if resp.status_code == 200: @@ -150,7 +149,7 @@ class KnowledgeRAGService: try: http = await self._get_http() resp = await http.post( - f"{settings.OLLAMA_URL}/api/generate", + f"{resolve_ollama_endpoint('rag')}/api/generate", json={ "model": _GEN_MODEL, "prompt": prompt, diff --git a/apps/api/src/services/local_code_review_service.py b/apps/api/src/services/local_code_review_service.py index 9e7bdcd0..c7f1b95c 100644 --- a/apps/api/src/services/local_code_review_service.py +++ b/apps/api/src/services/local_code_review_service.py @@ -186,7 +186,7 @@ class LocalCodeReviewService: ) return { "review_text": ( - "⚠️ Code Review:GCP-B/Ollama 審查未完成," + "⚠️ Code Review:本地 Ollama 審查未完成," "已依成本策略跳過 Gemini fallback。" ), "issues_count": 1, diff --git a/apps/api/src/services/log_summary_service.py b/apps/api/src/services/log_summary_service.py index 6d92fe7b..5574bfaf 100644 --- a/apps/api/src/services/log_summary_service.py +++ b/apps/api/src/services/log_summary_service.py @@ -35,8 +35,9 @@ logger = structlog.get_logger(__name__) # ============================================================ # 2026-05-03 ogt: ADR-110 GCP-A Primary — 改從 settings 讀取,不再硬編碼 111 def _get_ollama_url() -> str: - from src.core.config import get_settings - return get_settings().OLLAMA_URL + from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint + + return resolve_ollama_endpoint("deep_rca") # D1 集中化 2026-04-11: 從 models.json providers.ollama.models.log_anomaly 讀取 SUMMARY_MODEL = get_model("ollama", "log_anomaly") LLM_TIMEOUT = 180.0 # deepseek-r1 硬超時 @@ -145,7 +146,7 @@ class LogSummaryService: self.summarize(pod_name, namespace), timeout=SOFT_TIMEOUT, ) - except asyncio.TimeoutError: + except TimeoutError: logger.info( "log_summary_soft_timeout", pod=pod_name, @@ -182,7 +183,7 @@ class LogSummaryService: def _extract_anomaly_lines(self, raw_logs: str) -> list[str]: """過濾異常行 + 敏感資料遮蔽,取最後 ANOMALY_TAIL_LINES 行""" lines = raw_logs.splitlines() - anomaly = [l for l in lines if _ANOMALY_PATTERN.search(l)] + anomaly = [line for line in lines if _ANOMALY_PATTERN.search(line)] # 取最後 N 行 anomaly = anomaly[-ANOMALY_TAIL_LINES:] # 遮蔽敏感資料 diff --git a/apps/api/src/services/ollama_endpoint_resolver.py b/apps/api/src/services/ollama_endpoint_resolver.py index bbdd8ade..d51dd38b 100644 --- a/apps/api/src/services/ollama_endpoint_resolver.py +++ b/apps/api/src/services/ollama_endpoint_resolver.py @@ -16,27 +16,37 @@ from src.core.config import settings OllamaWorkloadType = Literal[ "interactive", "healthcheck", + "alert_fast", "batch", "embedding", "rag", "code_review", "shadow", "canary", + "deep_rca", + "image_analysis", + "hermes", "local_required", "privacy_sensitive", "dr", ] -_GCP_B_PREFERRED_WORKLOADS = { +_GCP_A_PREFERRED_WORKLOADS = { + "interactive", + "healthcheck", + "alert_fast", +} + +_LOCAL_PREFERRED_WORKLOADS = { "batch", "embedding", "rag", "code_review", "shadow", "canary", -} - -_LOCAL_PREFERRED_WORKLOADS = { + "deep_rca", + "image_analysis", + "hermes", "local_required", "privacy_sensitive", "dr", @@ -68,20 +78,28 @@ def resolve_ollama_selection( secondary = cfg.OLLAMA_SECONDARY_URL fallback = cfg.OLLAMA_FALLBACK_URL - if workload_type in _GCP_B_PREFERRED_WORKLOADS and secondary: + if workload_type in _LOCAL_PREFERRED_WORKLOADS: + if fallback: + return OllamaEndpointSelection( + url=fallback, + provider_name="ollama_local", + workload_type=workload_type, + reason="local_heavy_or_privacy_lane", + ) + if secondary: + return OllamaEndpointSelection( + url=secondary, + provider_name="ollama_gcp_b", + workload_type=workload_type, + reason="local_missing_gcp_b_fallback", + ) + + if workload_type not in _GCP_A_PREFERRED_WORKLOADS and secondary: return OllamaEndpointSelection( url=secondary, provider_name="ollama_gcp_b", workload_type=workload_type, - reason="gcp_b_batch_lane", - ) - - if workload_type in _LOCAL_PREFERRED_WORKLOADS and fallback: - return OllamaEndpointSelection( - url=fallback, - provider_name="ollama_local", - workload_type=workload_type, - reason="local_privacy_or_dr_lane", + reason="gcp_b_default_non_alert_lane", ) return OllamaEndpointSelection( diff --git a/apps/api/src/services/playbook_rag.py b/apps/api/src/services/playbook_rag.py index 41a0c9f9..70694ffb 100644 --- a/apps/api/src/services/playbook_rag.py +++ b/apps/api/src/services/playbook_rag.py @@ -29,9 +29,9 @@ from typing import Any import httpx import structlog -from src.core.config import settings from src.models.playbook import Playbook, SymptomPattern from src.repositories.interfaces import IEmbeddingCacheRepository +from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint logger = structlog.get_logger(__name__) @@ -146,7 +146,7 @@ class PlaybookRAGService: """ self._http_client = http_client self._embedding_cache = embedding_cache - self.ollama_url = settings.OLLAMA_URL + self.ollama_url = resolve_ollama_endpoint("embedding") self.embedding_model = EMBEDDING_MODEL # ========================================================================= diff --git a/apps/api/tests/test_local_code_review_cloud_fallback.py b/apps/api/tests/test_local_code_review_cloud_fallback.py index 77c0c227..c4e05139 100644 --- a/apps/api/tests/test_local_code_review_cloud_fallback.py +++ b/apps/api/tests/test_local_code_review_cloud_fallback.py @@ -33,7 +33,7 @@ async def _noop_save(*args: Any, **kwargs: Any) -> None: @pytest.mark.asyncio -async def test_large_pr_uses_gcp_b_ollama_when_gemini_fallback_disabled( +async def test_large_pr_uses_local_ollama_when_gemini_fallback_disabled( monkeypatch: pytest.MonkeyPatch, ) -> None: monkeypatch.setattr( @@ -44,7 +44,7 @@ async def test_large_pr_uses_gcp_b_ollama_when_gemini_fallback_disabled( monkeypatch.setattr( review_module, "resolve_ollama_endpoint", - lambda workload_type: "http://gcp-b:11436", + lambda workload_type: "http://local-111:11434", ) client = _FakeClient() @@ -69,7 +69,7 @@ async def test_large_pr_uses_gcp_b_ollama_when_gemini_fallback_disabled( assert result is not None assert result["provider"] == "ollama" - assert client.posted_urls == ["http://gcp-b:11436/api/generate"] + assert client.posted_urls == ["http://local-111:11434/api/generate"] @pytest.mark.asyncio @@ -84,7 +84,7 @@ async def test_ollama_failure_does_not_fall_back_to_gemini_by_default( monkeypatch.setattr( review_module, "resolve_ollama_endpoint", - lambda workload_type: "http://gcp-b:11436", + lambda workload_type: "http://local-111:11434", ) client = _FakeClient(fail=True) @@ -110,7 +110,7 @@ async def test_ollama_failure_does_not_fall_back_to_gemini_by_default( assert result is not None assert result["provider"] == "ollama_unavailable" assert result["cloud_fallback_skipped"] is True - assert client.posted_urls == ["http://gcp-b:11436/api/generate"] + assert client.posted_urls == ["http://local-111:11434/api/generate"] @pytest.mark.asyncio diff --git a/apps/api/tests/test_ollama_endpoint_resolver.py b/apps/api/tests/test_ollama_endpoint_resolver.py index 5519ec16..0f5d57d4 100644 --- a/apps/api/tests/test_ollama_endpoint_resolver.py +++ b/apps/api/tests/test_ollama_endpoint_resolver.py @@ -3,7 +3,6 @@ from __future__ import annotations from types import SimpleNamespace from src.services.ollama_endpoint_resolver import ( - resolve_ollama_endpoint, resolve_ollama_selection, ) @@ -21,20 +20,30 @@ def _settings( ) -def test_batch_workloads_prefer_gcp_b() -> None: +def test_heavy_workloads_prefer_local_lane() -> None: cfg = _settings() - for workload in ("batch", "embedding", "rag", "code_review", "shadow", "canary"): + for workload in ( + "batch", + "embedding", + "rag", + "code_review", + "shadow", + "canary", + "deep_rca", + "image_analysis", + "hermes", + ): selection = resolve_ollama_selection(workload, config=cfg) - assert selection.url == "http://192.168.0.110:11436" - assert selection.provider_name == "ollama_gcp_b" - assert selection.reason == "gcp_b_batch_lane" + assert selection.url == "http://192.168.0.110:11437" + assert selection.provider_name == "ollama_local" + assert selection.reason == "local_heavy_or_privacy_lane" def test_interactive_workloads_stay_on_gcp_a() -> None: cfg = _settings() - for workload in ("interactive", "healthcheck"): + for workload in ("interactive", "healthcheck", "alert_fast"): selection = resolve_ollama_selection(workload, config=cfg) assert selection.url == "http://192.168.0.110:11435" assert selection.provider_name == "ollama_gcp_a" @@ -49,7 +58,10 @@ def test_local_required_workloads_use_local_lane() -> None: assert selection.provider_name == "ollama_local" -def test_batch_workloads_fall_back_to_primary_when_secondary_missing() -> None: - cfg = _settings(secondary="") +def test_heavy_workloads_fall_back_to_gcp_b_when_local_missing() -> None: + cfg = _settings(fallback="") - assert resolve_ollama_endpoint("embedding", config=cfg) == "http://192.168.0.110:11435" + selection = resolve_ollama_selection("embedding", config=cfg) + assert selection.url == "http://192.168.0.110:11436" + assert selection.provider_name == "ollama_gcp_b" + assert selection.reason == "local_missing_gcp_b_fallback" diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 71c9c04b..45aa2727 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -3193,3 +3193,26 @@ bash scripts/ops/ollama-topology-check.sh ``` 結論:GCP-A/B 可作 `alert-fast` lane,但目前不應承擔 14B/32B 同步告警推理;重模型必須由 AwoooP Inference Gateway 隔離到 async / 111 / GPU 節點。 + +### Runtime 過渡護欄 + +在 Inference Gateway 尚未接管所有 provider 前,先調整 `ollama_endpoint_resolver`: + +- `interactive` / `healthcheck` / `alert_fast` 保持 GCP-A 優先 +- `code_review` / `rag` / `embedding` / `deep_rca` / `image_analysis` / `hermes` 改為 111 優先 +- 111 不可用時才回 GCP-B,避免 GCP-A/B 在告警 canary 期間被 7B/14B/32B 模型污染 + +驗證: + +```bash +/Users/ogt/awoooi/apps/api/.venv/bin/python -m ruff check apps/api/src/core/config.py apps/api/src/services/ollama_endpoint_resolver.py apps/api/src/services/knowledge_rag_service.py apps/api/src/services/playbook_rag.py apps/api/src/services/log_summary_service.py apps/api/src/services/image_analysis_service.py apps/api/src/services/local_code_review_service.py apps/api/src/hermes/nl_gateway.py apps/api/tests/test_ollama_endpoint_resolver.py apps/api/tests/test_local_code_review_cloud_fallback.py +# All checks passed + +DATABASE_URL=postgresql+asyncpg://u:p@localhost:5432/test REDIS_URL=redis://localhost:6379/0 \ + /Users/ogt/awoooi/apps/api/.venv/bin/python -m pytest \ + apps/api/tests/test_ollama_endpoint_resolver.py \ + apps/api/tests/test_local_code_review_cloud_fallback.py \ + apps/api/tests/test_ollama_provider_endpoints.py \ + apps/api/tests/test_openclaw_alert_cloud_fallback_gate.py -q +# 15 passed +``` diff --git a/docs/runbooks/AWOOOP-INFERENCE-GATEWAY.md b/docs/runbooks/AWOOOP-INFERENCE-GATEWAY.md index 8de6d15b..3a92711d 100644 --- a/docs/runbooks/AWOOOP-INFERENCE-GATEWAY.md +++ b/docs/runbooks/AWOOOP-INFERENCE-GATEWAY.md @@ -35,8 +35,8 @@ gateway must own runtime scheduling. | Lane | Model | Allowed hosts | Notes | |------|-------|---------------|-------| | `alert-fast` | `gemma3:4b` | GCP-A, GCP-B, 111 | Synchronous, protected | -| `code-review` | `qwen2.5-coder:7b` | GCP-B, 111 | Never 32B on GCP during alert canary | -| `embedding` | `bge-m3` | GCP-A, GCP-B, 111 | Short timeout | +| `code-review` | `qwen2.5-coder:7b` | 111, then GCP-B | Transitional: keep GCP-B clean during alert canary | +| `embedding` | `bge-m3` | 111, then GCP-B | Transitional: keep GCP-A/B clean during alert canary | | `deep-rca` | 14B-class model | 111 or GPU node | Async only | | `paid-emergency` | Gemini / Claude | Cloud | Budget-gated emergency fallback | @@ -88,8 +88,8 @@ if lane == alert-fast: if lane == code-review: model = qwen2.5-coder:7b - try GCP-B with 90s timeout try 111 with 120s timeout + try GCP-B with 90s timeout only if 111 is unavailable if lane == deep-rca: reject synchronous request @@ -150,4 +150,3 @@ OLLAMA_FALLBACK_URL: "http://192.168.0.111:11434" ``` Do not disable budget hard kill during rollback. -