fix(ai): isolate heavy Ollama workloads from GCP alert lane
This commit is contained in:
@@ -500,7 +500,7 @@ class Settings(BaseSettings):
|
||||
default=False,
|
||||
description=(
|
||||
"Allow LocalCodeReviewService to fall back to Gemini when the "
|
||||
"GCP-B/Ollama code-review lane fails. Default false to avoid "
|
||||
"local Ollama code-review lane fails. Default false to avoid "
|
||||
"unexpected cloud spend from Gitea push/PR alerts."
|
||||
),
|
||||
)
|
||||
|
||||
@@ -9,6 +9,7 @@ Layer 1 意圖路由(關鍵字正則)→ Ollama 本地模型(111)→ Tel
|
||||
debugger/vuln → deepseek-r1:14b(推理); code agents → qwen2.5-coder:7b; 其他 → qwen2.5:7b-instruct
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
import time
|
||||
@@ -17,7 +18,6 @@ import httpx
|
||||
import structlog
|
||||
from sqlalchemy import text
|
||||
|
||||
from src.core.config import settings
|
||||
from src.core.redis_client import get_redis
|
||||
from src.db.base import get_db_context
|
||||
from src.hermes.agent_loader import get_agent_system_prompt
|
||||
@@ -266,7 +266,9 @@ async def process_nl_message(
|
||||
success = False
|
||||
error_type: str | None = None
|
||||
try:
|
||||
ollama_base = getattr(settings, "OLLAMA_URL", "http://34.143.170.20:11434") # 2026-05-03 ogt: ADR-110 GCP-A Primary
|
||||
from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint
|
||||
|
||||
ollama_base = resolve_ollama_endpoint("hermes")
|
||||
async with httpx.AsyncClient(timeout=_OLLAMA_TIMEOUT) as _hc:
|
||||
resp = await _hc.post(
|
||||
f"{ollama_base}/api/chat",
|
||||
|
||||
@@ -26,7 +26,7 @@ from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, Any, Literal
|
||||
|
||||
import httpx
|
||||
@@ -125,7 +125,7 @@ class DecisionFusionAdapter:
|
||||
# Public API
|
||||
# =========================================================================
|
||||
|
||||
async def fuse_decision(self, event: "AiGovernanceEvent") -> FusedDecision:
|
||||
async def fuse_decision(self, event: AiGovernanceEvent) -> FusedDecision:
|
||||
"""三維融合:LLM × Playbook × MCP → FusedDecision。
|
||||
|
||||
三個維度並行評估(asyncio.gather),任一失敗靜默降為 0.5。
|
||||
@@ -226,7 +226,7 @@ class DecisionFusionAdapter:
|
||||
# =========================================================================
|
||||
|
||||
async def _score_llm(
|
||||
self, event: "AiGovernanceEvent"
|
||||
self, event: AiGovernanceEvent
|
||||
) -> tuple[float, str, dict[str, Any]]:
|
||||
"""Ollama LLM 推理:治理事件情境 → 建議動作 + 信心度。
|
||||
|
||||
@@ -254,7 +254,9 @@ class DecisionFusionAdapter:
|
||||
"只輸出 CONFIDENCE 和 ACTION 兩行,不要其他解釋。"
|
||||
)
|
||||
|
||||
ollama_url = getattr(self._settings, "OLLAMA_URL", "http://192.168.0.111:11434") # 2026-05-04 ogt: ADR-110 修正 — 111 primary
|
||||
from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint
|
||||
|
||||
ollama_url = resolve_ollama_endpoint("deep_rca")
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(
|
||||
@@ -320,7 +322,7 @@ class DecisionFusionAdapter:
|
||||
# =========================================================================
|
||||
|
||||
async def _score_playbook(
|
||||
self, event: "AiGovernanceEvent"
|
||||
self, event: AiGovernanceEvent
|
||||
) -> tuple[float, str | None, float | None]:
|
||||
"""Playbook 相似度比對 → 取最高 trust_score。
|
||||
|
||||
@@ -373,7 +375,7 @@ class DecisionFusionAdapter:
|
||||
# =========================================================================
|
||||
|
||||
async def _score_mcp(
|
||||
self, event: "AiGovernanceEvent"
|
||||
self, event: AiGovernanceEvent
|
||||
) -> tuple[float, dict[str, Any]]:
|
||||
"""Prometheus 情報採集 → MCP 感官品質分數。
|
||||
|
||||
|
||||
@@ -33,10 +33,11 @@ logger = structlog.get_logger(__name__)
|
||||
# ============================================================
|
||||
# 設定
|
||||
# ============================================================
|
||||
# 2026-05-03 ogt: ADR-110 GCP-A Primary — 改從 settings 讀取,不再硬編碼 111
|
||||
# 2026-05-05 Codex: 重摘要走 111 lane,避免污染 GCP alert-fast lane
|
||||
def _get_ollama_url() -> str:
|
||||
from src.core.config import get_settings
|
||||
return get_settings().OLLAMA_URL
|
||||
from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint
|
||||
|
||||
return resolve_ollama_endpoint("deep_rca")
|
||||
# D1 集中化 2026-04-11: 從 models.json providers.ollama.models.drift_summary 讀取
|
||||
NARRATOR_MODEL = get_model("ollama", "drift_summary")
|
||||
NARRATOR_TIMEOUT = 90.0 # seconds
|
||||
@@ -120,8 +121,8 @@ class DriftNarratorService:
|
||||
|
||||
async def narrate_and_notify(
|
||||
self,
|
||||
report: "DriftReport",
|
||||
interpretation: "DriftInterpretation | None" = None,
|
||||
report: DriftReport,
|
||||
interpretation: DriftInterpretation | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
生成人話摘要並推送 Telegram
|
||||
@@ -166,7 +167,7 @@ class DriftNarratorService:
|
||||
medium=report.medium_count,
|
||||
)
|
||||
|
||||
def _should_narrate(self, report: "DriftReport") -> bool:
|
||||
def _should_narrate(self, report: DriftReport) -> bool:
|
||||
"""觸發條件:high >= 1 or medium >= 3"""
|
||||
# 過濾 HPA 白名單後重算
|
||||
non_hpa_items = [
|
||||
@@ -180,8 +181,8 @@ class DriftNarratorService:
|
||||
|
||||
async def _generate_narrative_and_items(
|
||||
self,
|
||||
report: "DriftReport",
|
||||
interpretation: "DriftInterpretation | None",
|
||||
report: DriftReport,
|
||||
interpretation: DriftInterpretation | None,
|
||||
) -> tuple[str, list[dict], dict]:
|
||||
"""
|
||||
2026-04-18 ogt + Claude Opus 4.7: B 方案 — LLM 產生 narrative + 結構化 items
|
||||
@@ -354,8 +355,8 @@ class DriftNarratorService:
|
||||
|
||||
def _fallback_recommendation(
|
||||
self,
|
||||
report: "DriftReport",
|
||||
interpretation: "DriftInterpretation | None",
|
||||
report: DriftReport,
|
||||
interpretation: DriftInterpretation | None,
|
||||
) -> dict:
|
||||
"""
|
||||
2026-04-20 P0.2 ogt + Claude Opus 4.7: LLM 沒給 recommendation 時的 Python fallback
|
||||
@@ -397,7 +398,7 @@ class DriftNarratorService:
|
||||
|
||||
async def _log_ai_action_to_db(
|
||||
self,
|
||||
report: "DriftReport",
|
||||
report: DriftReport,
|
||||
prompt: str,
|
||||
raw_response: str | None,
|
||||
narrative: str,
|
||||
@@ -416,7 +417,9 @@ class DriftNarratorService:
|
||||
- 若能找到該 drift 的 incident 關聯,設 parent_op_id
|
||||
"""
|
||||
import json as _json
|
||||
|
||||
from sqlalchemy import text as _sql
|
||||
|
||||
from src.db.base import get_db_context
|
||||
|
||||
input_json = _json.dumps({
|
||||
@@ -511,7 +514,7 @@ class DriftNarratorService:
|
||||
items_count=len(items),
|
||||
)
|
||||
|
||||
def _format_drift_for_llm(self, report: "DriftReport") -> str:
|
||||
def _format_drift_for_llm(self, report: DriftReport) -> str:
|
||||
"""
|
||||
2026-04-18 ogt + Claude Opus 4.7: B 方案 — 餵 LLM 用的 JSON 序列化
|
||||
保留更多原始 context 給 LLM 推理,不做 30 字元暴力截斷
|
||||
@@ -582,7 +585,7 @@ class DriftNarratorService:
|
||||
# 一般變化
|
||||
return f"{from_val} → {to_val}"
|
||||
|
||||
def _fallback_items(self, report: "DriftReport") -> list[dict]:
|
||||
def _fallback_items(self, report: DriftReport) -> list[dict]:
|
||||
"""
|
||||
LLM 失敗時的 Python 智能摘要 (取代舊 str()[:30])
|
||||
- 過濾白名單
|
||||
@@ -605,7 +608,7 @@ class DriftNarratorService:
|
||||
})
|
||||
return items
|
||||
|
||||
def _format_intent_summary(self, interpretation: "DriftInterpretation | None") -> str:
|
||||
def _format_intent_summary(self, interpretation: DriftInterpretation | None) -> str:
|
||||
if not interpretation:
|
||||
return "無意圖分析"
|
||||
return (
|
||||
@@ -616,8 +619,8 @@ class DriftNarratorService:
|
||||
|
||||
def _fallback_narrative(
|
||||
self,
|
||||
report: "DriftReport",
|
||||
interpretation: "DriftInterpretation | None",
|
||||
report: DriftReport,
|
||||
interpretation: DriftInterpretation | None,
|
||||
) -> str:
|
||||
"""LLM 失敗時的結構化 fallback"""
|
||||
resources = list({
|
||||
@@ -636,7 +639,7 @@ class DriftNarratorService:
|
||||
|
||||
async def _send_telegram(
|
||||
self,
|
||||
report: "DriftReport",
|
||||
report: DriftReport,
|
||||
narrative: str,
|
||||
items: list[dict],
|
||||
recommendation: dict | None = None,
|
||||
@@ -667,7 +670,7 @@ class DriftNarratorService:
|
||||
except Exception as e:
|
||||
logger.warning("drift_narrator_telegram_error", error=str(e))
|
||||
|
||||
def _count_nontrivial_drift(self, report: "DriftReport") -> int:
|
||||
def _count_nontrivial_drift(self, report: DriftReport) -> int:
|
||||
"""
|
||||
計算非白名單、非 trivial (K8s 自動補齊) 的 drift 數
|
||||
用於 Telegram 底部「還有 N 項」顯示實際可操作數量
|
||||
@@ -704,7 +707,7 @@ class DriftNarratorService:
|
||||
|
||||
def _render_telegram_body(
|
||||
self,
|
||||
report: "DriftReport",
|
||||
report: DriftReport,
|
||||
narrative: str,
|
||||
items: list[dict],
|
||||
recommendation: dict | None = None,
|
||||
|
||||
@@ -21,8 +21,8 @@ from typing import Protocol
|
||||
import httpx
|
||||
import structlog
|
||||
|
||||
from src.core.config import settings
|
||||
from src.services.model_registry import get_model as _get_model
|
||||
from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
@@ -98,7 +98,7 @@ class OllamaEmbeddingService:
|
||||
P1 修復 (2026-03-29): 維度配置化,支援更多模型
|
||||
"""
|
||||
self._model = model
|
||||
self._ollama_url = ollama_url or settings.OLLAMA_URL
|
||||
self._ollama_url = ollama_url or resolve_ollama_endpoint("embedding")
|
||||
self._timeout = timeout
|
||||
self._default_dimension = default_dimension or self.MODEL_DIMENSIONS.get(
|
||||
model, self.DEFAULT_DIMENSION
|
||||
|
||||
@@ -21,7 +21,6 @@ AWOOOI — Image Analysis Service (Phase 34, ADR-067)
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
@@ -29,14 +28,13 @@ from typing import TYPE_CHECKING
|
||||
import httpx
|
||||
import structlog
|
||||
|
||||
from src.core.config import get_settings
|
||||
from src.services.model_registry import get_model
|
||||
from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint
|
||||
|
||||
if TYPE_CHECKING:
|
||||
pass
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
settings = get_settings()
|
||||
|
||||
# D1 集中化 2026-04-11: 從 models.json providers.ollama.models.image_analysis 讀取
|
||||
_MODEL = get_model("ollama", "image_analysis")
|
||||
@@ -124,7 +122,7 @@ class ImageAnalysisService:
|
||||
image_b64 = base64.b64encode(image_path.read_bytes()).decode()
|
||||
http = await self._get_http()
|
||||
resp = await http.post(
|
||||
f"{settings.OLLAMA_URL}/api/generate",
|
||||
f"{resolve_ollama_endpoint('image_analysis')}/api/generate",
|
||||
json={
|
||||
"model": _MODEL,
|
||||
"prompt": question,
|
||||
|
||||
@@ -31,8 +31,8 @@ from typing import Protocol, runtime_checkable
|
||||
import httpx
|
||||
import structlog
|
||||
|
||||
from src.core.config import settings
|
||||
from src.services.model_registry import get_model_registry
|
||||
from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
@@ -549,7 +549,7 @@ class IntentClassifier:
|
||||
# 呼叫 Ollama
|
||||
async with httpx.AsyncClient() as client:
|
||||
response = await client.post(
|
||||
f"{settings.OLLAMA_URL}/api/generate",
|
||||
f"{resolve_ollama_endpoint('hermes')}/api/generate",
|
||||
json={
|
||||
"model": model_name,
|
||||
"prompt": prompt,
|
||||
|
||||
@@ -15,10 +15,11 @@ import structlog
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
# 2026-05-03 ogt: ADR-110 GCP-A Primary — 改從 settings 讀取,不再硬編碼 111
|
||||
# 2026-05-05 Codex: KB 萃取走 111 lane,避免污染 GCP alert-fast lane
|
||||
def _get_ollama_base() -> str:
|
||||
from src.core.config import get_settings
|
||||
return get_settings().OLLAMA_URL
|
||||
from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint
|
||||
|
||||
return resolve_ollama_endpoint("deep_rca")
|
||||
_EXTRACT_MODEL = "llama3.2:3b"
|
||||
_EXTRACT_TIMEOUT = 30.0 # 秒,容忍慢速
|
||||
|
||||
@@ -117,7 +118,11 @@ class KnowledgeExtractorService:
|
||||
category = self._infer_category(incident)
|
||||
|
||||
# 5. 建立 KB 條目
|
||||
from src.models.knowledge import EntrySource, EntryType, KnowledgeEntryCreate
|
||||
from src.models.knowledge import (
|
||||
EntrySource,
|
||||
EntryType,
|
||||
KnowledgeEntryCreate,
|
||||
)
|
||||
from src.services.knowledge_service import get_knowledge_service
|
||||
|
||||
entry_data = KnowledgeEntryCreate(
|
||||
|
||||
@@ -20,11 +20,10 @@ from pathlib import Path
|
||||
import httpx
|
||||
import structlog
|
||||
|
||||
from src.core.config import get_settings
|
||||
import src.repositories.rag_chunk_repository as rag_repo
|
||||
from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
settings = get_settings()
|
||||
|
||||
_EMBED_MODEL = "nomic-embed-text"
|
||||
_GEN_MODEL = "qwen2.5:7b-instruct"
|
||||
@@ -131,7 +130,7 @@ class KnowledgeRAGService:
|
||||
try:
|
||||
http = await self._get_http()
|
||||
resp = await http.post(
|
||||
f"{settings.OLLAMA_URL}/api/embeddings",
|
||||
f"{resolve_ollama_endpoint('embedding')}/api/embeddings",
|
||||
json={"model": _EMBED_MODEL, "prompt": text},
|
||||
)
|
||||
if resp.status_code == 200:
|
||||
@@ -150,7 +149,7 @@ class KnowledgeRAGService:
|
||||
try:
|
||||
http = await self._get_http()
|
||||
resp = await http.post(
|
||||
f"{settings.OLLAMA_URL}/api/generate",
|
||||
f"{resolve_ollama_endpoint('rag')}/api/generate",
|
||||
json={
|
||||
"model": _GEN_MODEL,
|
||||
"prompt": prompt,
|
||||
|
||||
@@ -186,7 +186,7 @@ class LocalCodeReviewService:
|
||||
)
|
||||
return {
|
||||
"review_text": (
|
||||
"⚠️ Code Review:GCP-B/Ollama 審查未完成,"
|
||||
"⚠️ Code Review:本地 Ollama 審查未完成,"
|
||||
"已依成本策略跳過 Gemini fallback。"
|
||||
),
|
||||
"issues_count": 1,
|
||||
|
||||
@@ -35,8 +35,9 @@ logger = structlog.get_logger(__name__)
|
||||
# ============================================================
|
||||
# ADR-110: log summaries resolve the `deep_rca` lane via ollama_endpoint_resolver (local 111 first), keeping heavy summarisation off the GCP alert-fast lane
|
||||
def _get_ollama_url() -> str:
|
||||
from src.core.config import get_settings
|
||||
return get_settings().OLLAMA_URL
|
||||
from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint
|
||||
|
||||
return resolve_ollama_endpoint("deep_rca")
|
||||
# D1 集中化 2026-04-11: 從 models.json providers.ollama.models.log_anomaly 讀取
|
||||
SUMMARY_MODEL = get_model("ollama", "log_anomaly")
|
||||
LLM_TIMEOUT = 180.0 # deepseek-r1 硬超時
|
||||
@@ -145,7 +146,7 @@ class LogSummaryService:
|
||||
self.summarize(pod_name, namespace),
|
||||
timeout=SOFT_TIMEOUT,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
except TimeoutError:
|
||||
logger.info(
|
||||
"log_summary_soft_timeout",
|
||||
pod=pod_name,
|
||||
@@ -182,7 +183,7 @@ class LogSummaryService:
|
||||
def _extract_anomaly_lines(self, raw_logs: str) -> list[str]:
|
||||
"""過濾異常行 + 敏感資料遮蔽,取最後 ANOMALY_TAIL_LINES 行"""
|
||||
lines = raw_logs.splitlines()
|
||||
anomaly = [l for l in lines if _ANOMALY_PATTERN.search(l)]
|
||||
anomaly = [line for line in lines if _ANOMALY_PATTERN.search(line)]
|
||||
# 取最後 N 行
|
||||
anomaly = anomaly[-ANOMALY_TAIL_LINES:]
|
||||
# 遮蔽敏感資料
|
||||
|
||||
@@ -16,27 +16,37 @@ from src.core.config import settings
|
||||
OllamaWorkloadType = Literal[
|
||||
"interactive",
|
||||
"healthcheck",
|
||||
"alert_fast",
|
||||
"batch",
|
||||
"embedding",
|
||||
"rag",
|
||||
"code_review",
|
||||
"shadow",
|
||||
"canary",
|
||||
"deep_rca",
|
||||
"image_analysis",
|
||||
"hermes",
|
||||
"local_required",
|
||||
"privacy_sensitive",
|
||||
"dr",
|
||||
]
|
||||
|
||||
_GCP_B_PREFERRED_WORKLOADS = {
|
||||
_GCP_A_PREFERRED_WORKLOADS = {
|
||||
"interactive",
|
||||
"healthcheck",
|
||||
"alert_fast",
|
||||
}
|
||||
|
||||
_LOCAL_PREFERRED_WORKLOADS = {
|
||||
"batch",
|
||||
"embedding",
|
||||
"rag",
|
||||
"code_review",
|
||||
"shadow",
|
||||
"canary",
|
||||
}
|
||||
|
||||
_LOCAL_PREFERRED_WORKLOADS = {
|
||||
"deep_rca",
|
||||
"image_analysis",
|
||||
"hermes",
|
||||
"local_required",
|
||||
"privacy_sensitive",
|
||||
"dr",
|
||||
@@ -68,20 +78,28 @@ def resolve_ollama_selection(
|
||||
secondary = cfg.OLLAMA_SECONDARY_URL
|
||||
fallback = cfg.OLLAMA_FALLBACK_URL
|
||||
|
||||
if workload_type in _GCP_B_PREFERRED_WORKLOADS and secondary:
|
||||
if workload_type in _LOCAL_PREFERRED_WORKLOADS:
|
||||
if fallback:
|
||||
return OllamaEndpointSelection(
|
||||
url=fallback,
|
||||
provider_name="ollama_local",
|
||||
workload_type=workload_type,
|
||||
reason="local_heavy_or_privacy_lane",
|
||||
)
|
||||
if secondary:
|
||||
return OllamaEndpointSelection(
|
||||
url=secondary,
|
||||
provider_name="ollama_gcp_b",
|
||||
workload_type=workload_type,
|
||||
reason="local_missing_gcp_b_fallback",
|
||||
)
|
||||
|
||||
if workload_type not in _GCP_A_PREFERRED_WORKLOADS and secondary:
|
||||
return OllamaEndpointSelection(
|
||||
url=secondary,
|
||||
provider_name="ollama_gcp_b",
|
||||
workload_type=workload_type,
|
||||
reason="gcp_b_batch_lane",
|
||||
)
|
||||
|
||||
if workload_type in _LOCAL_PREFERRED_WORKLOADS and fallback:
|
||||
return OllamaEndpointSelection(
|
||||
url=fallback,
|
||||
provider_name="ollama_local",
|
||||
workload_type=workload_type,
|
||||
reason="local_privacy_or_dr_lane",
|
||||
reason="gcp_b_default_non_alert_lane",
|
||||
)
|
||||
|
||||
return OllamaEndpointSelection(
|
||||
|
||||
@@ -29,9 +29,9 @@ from typing import Any
|
||||
import httpx
|
||||
import structlog
|
||||
|
||||
from src.core.config import settings
|
||||
from src.models.playbook import Playbook, SymptomPattern
|
||||
from src.repositories.interfaces import IEmbeddingCacheRepository
|
||||
from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
@@ -146,7 +146,7 @@ class PlaybookRAGService:
|
||||
"""
|
||||
self._http_client = http_client
|
||||
self._embedding_cache = embedding_cache
|
||||
self.ollama_url = settings.OLLAMA_URL
|
||||
self.ollama_url = resolve_ollama_endpoint("embedding")
|
||||
self.embedding_model = EMBEDDING_MODEL
|
||||
|
||||
# =========================================================================
|
||||
|
||||
@@ -33,7 +33,7 @@ async def _noop_save(*args: Any, **kwargs: Any) -> None:
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_large_pr_uses_gcp_b_ollama_when_gemini_fallback_disabled(
|
||||
async def test_large_pr_uses_local_ollama_when_gemini_fallback_disabled(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
monkeypatch.setattr(
|
||||
@@ -44,7 +44,7 @@ async def test_large_pr_uses_gcp_b_ollama_when_gemini_fallback_disabled(
|
||||
monkeypatch.setattr(
|
||||
review_module,
|
||||
"resolve_ollama_endpoint",
|
||||
lambda workload_type: "http://gcp-b:11436",
|
||||
lambda workload_type: "http://local-111:11434",
|
||||
)
|
||||
|
||||
client = _FakeClient()
|
||||
@@ -69,7 +69,7 @@ async def test_large_pr_uses_gcp_b_ollama_when_gemini_fallback_disabled(
|
||||
|
||||
assert result is not None
|
||||
assert result["provider"] == "ollama"
|
||||
assert client.posted_urls == ["http://gcp-b:11436/api/generate"]
|
||||
assert client.posted_urls == ["http://local-111:11434/api/generate"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -84,7 +84,7 @@ async def test_ollama_failure_does_not_fall_back_to_gemini_by_default(
|
||||
monkeypatch.setattr(
|
||||
review_module,
|
||||
"resolve_ollama_endpoint",
|
||||
lambda workload_type: "http://gcp-b:11436",
|
||||
lambda workload_type: "http://local-111:11434",
|
||||
)
|
||||
|
||||
client = _FakeClient(fail=True)
|
||||
@@ -110,7 +110,7 @@ async def test_ollama_failure_does_not_fall_back_to_gemini_by_default(
|
||||
assert result is not None
|
||||
assert result["provider"] == "ollama_unavailable"
|
||||
assert result["cloud_fallback_skipped"] is True
|
||||
assert client.posted_urls == ["http://gcp-b:11436/api/generate"]
|
||||
assert client.posted_urls == ["http://local-111:11434/api/generate"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
@@ -3,7 +3,6 @@ from __future__ import annotations
|
||||
from types import SimpleNamespace
|
||||
|
||||
from src.services.ollama_endpoint_resolver import (
|
||||
resolve_ollama_endpoint,
|
||||
resolve_ollama_selection,
|
||||
)
|
||||
|
||||
@@ -21,20 +20,30 @@ def _settings(
|
||||
)
|
||||
|
||||
|
||||
def test_batch_workloads_prefer_gcp_b() -> None:
|
||||
def test_heavy_workloads_prefer_local_lane() -> None:
|
||||
cfg = _settings()
|
||||
|
||||
for workload in ("batch", "embedding", "rag", "code_review", "shadow", "canary"):
|
||||
for workload in (
|
||||
"batch",
|
||||
"embedding",
|
||||
"rag",
|
||||
"code_review",
|
||||
"shadow",
|
||||
"canary",
|
||||
"deep_rca",
|
||||
"image_analysis",
|
||||
"hermes",
|
||||
):
|
||||
selection = resolve_ollama_selection(workload, config=cfg)
|
||||
assert selection.url == "http://192.168.0.110:11436"
|
||||
assert selection.provider_name == "ollama_gcp_b"
|
||||
assert selection.reason == "gcp_b_batch_lane"
|
||||
assert selection.url == "http://192.168.0.110:11437"
|
||||
assert selection.provider_name == "ollama_local"
|
||||
assert selection.reason == "local_heavy_or_privacy_lane"
|
||||
|
||||
|
||||
def test_interactive_workloads_stay_on_gcp_a() -> None:
|
||||
cfg = _settings()
|
||||
|
||||
for workload in ("interactive", "healthcheck"):
|
||||
for workload in ("interactive", "healthcheck", "alert_fast"):
|
||||
selection = resolve_ollama_selection(workload, config=cfg)
|
||||
assert selection.url == "http://192.168.0.110:11435"
|
||||
assert selection.provider_name == "ollama_gcp_a"
|
||||
@@ -49,7 +58,10 @@ def test_local_required_workloads_use_local_lane() -> None:
|
||||
assert selection.provider_name == "ollama_local"
|
||||
|
||||
|
||||
def test_batch_workloads_fall_back_to_primary_when_secondary_missing() -> None:
|
||||
cfg = _settings(secondary="")
|
||||
def test_heavy_workloads_fall_back_to_gcp_b_when_local_missing() -> None:
|
||||
cfg = _settings(fallback="")
|
||||
|
||||
assert resolve_ollama_endpoint("embedding", config=cfg) == "http://192.168.0.110:11435"
|
||||
selection = resolve_ollama_selection("embedding", config=cfg)
|
||||
assert selection.url == "http://192.168.0.110:11436"
|
||||
assert selection.provider_name == "ollama_gcp_b"
|
||||
assert selection.reason == "local_missing_gcp_b_fallback"
|
||||
|
||||
@@ -3193,3 +3193,26 @@ bash scripts/ops/ollama-topology-check.sh
|
||||
```
|
||||
|
||||
結論:GCP-A/B 可作 `alert-fast` lane,但目前不應承擔 14B/32B 同步告警推理;重模型必須由 AwoooP Inference Gateway 隔離到 async / 111 / GPU 節點。
|
||||
|
||||
### Runtime 過渡護欄
|
||||
|
||||
在 Inference Gateway 尚未接管所有 provider 前,先調整 `ollama_endpoint_resolver`:
|
||||
|
||||
- `interactive` / `healthcheck` / `alert_fast` 保持 GCP-A 優先
|
||||
- `batch` / `code_review` / `rag` / `embedding` / `shadow` / `canary` / `deep_rca` / `image_analysis` / `hermes` 改為 111 優先
|
||||
- 111 未設定(`OLLAMA_FALLBACK_URL` 為空)時才回 GCP-B(resolver 依設定值選擇 lane),避免 GCP-A/B 在告警 canary 期間被 7B/14B/32B 模型污染
|
||||
|
||||
驗證:
|
||||
|
||||
```bash
|
||||
/Users/ogt/awoooi/apps/api/.venv/bin/python -m ruff check apps/api/src/core/config.py apps/api/src/services/ollama_endpoint_resolver.py apps/api/src/services/knowledge_rag_service.py apps/api/src/services/playbook_rag.py apps/api/src/services/log_summary_service.py apps/api/src/services/image_analysis_service.py apps/api/src/services/local_code_review_service.py apps/api/src/hermes/nl_gateway.py apps/api/tests/test_ollama_endpoint_resolver.py apps/api/tests/test_local_code_review_cloud_fallback.py
|
||||
# All checks passed
|
||||
|
||||
DATABASE_URL=postgresql+asyncpg://u:p@localhost:5432/test REDIS_URL=redis://localhost:6379/0 \
|
||||
/Users/ogt/awoooi/apps/api/.venv/bin/python -m pytest \
|
||||
apps/api/tests/test_ollama_endpoint_resolver.py \
|
||||
apps/api/tests/test_local_code_review_cloud_fallback.py \
|
||||
apps/api/tests/test_ollama_provider_endpoints.py \
|
||||
apps/api/tests/test_openclaw_alert_cloud_fallback_gate.py -q
|
||||
# 15 passed
|
||||
```
|
||||
|
||||
@@ -35,8 +35,8 @@ gateway must own runtime scheduling.
|
||||
| Lane | Model | Allowed hosts | Notes |
|
||||
|------|-------|---------------|-------|
|
||||
| `alert-fast` | `gemma3:4b` | GCP-A, GCP-B, 111 | Synchronous, protected |
|
||||
| `code-review` | `qwen2.5-coder:7b` | GCP-B, 111 | Never 32B on GCP during alert canary |
|
||||
| `embedding` | `bge-m3` | GCP-A, GCP-B, 111 | Short timeout |
|
||||
| `code-review` | `qwen2.5-coder:7b` | 111, then GCP-B | Transitional: keep GCP-B clean during alert canary |
|
||||
| `embedding` | `bge-m3` | 111, then GCP-B | Transitional: keep GCP-A/B clean during alert canary |
|
||||
| `deep-rca` | 14B-class model | 111 or GPU node | Async only |
|
||||
| `paid-emergency` | Gemini / Claude | Cloud | Budget-gated emergency fallback |
|
||||
|
||||
@@ -88,8 +88,8 @@ if lane == alert-fast:
|
||||
|
||||
if lane == code-review:
|
||||
model = qwen2.5-coder:7b
|
||||
try GCP-B with 90s timeout
|
||||
try 111 with 120s timeout
|
||||
try GCP-B with 90s timeout only if 111 is unavailable
|
||||
|
||||
if lane == deep-rca:
|
||||
reject synchronous request
|
||||
@@ -150,4 +150,3 @@ OLLAMA_FALLBACK_URL: "http://192.168.0.111:11434"
|
||||
```
|
||||
|
||||
Do not disable budget hard kill during rollback.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user