fix(ollama): cooldown noisy failed endpoints
This commit is contained in:
@@ -22,6 +22,11 @@ import httpx
|
||||
import structlog
|
||||
|
||||
from src.services.model_registry import get_model as _get_model
|
||||
from src.services.ollama_endpoint_circuit_breaker import (
|
||||
filter_ollama_urls_with_cooldown,
|
||||
record_ollama_endpoint_failure,
|
||||
record_ollama_endpoint_success,
|
||||
)
|
||||
from src.services.ollama_endpoint_resolver import resolve_ollama_order
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
@@ -135,6 +140,17 @@ class OllamaEmbeddingService:
|
||||
)
|
||||
return self._client
|
||||
|
||||
def _active_ollama_endpoints(self) -> tuple[tuple[str, str], ...]:
|
||||
active_urls = filter_ollama_urls_with_cooldown(
|
||||
url for url, _provider_name in self._ollama_endpoints
|
||||
)
|
||||
active_url_set = set(active_urls)
|
||||
return tuple(
|
||||
(url, provider_name)
|
||||
for url, provider_name in self._ollama_endpoints
|
||||
if url in active_url_set
|
||||
)
|
||||
|
||||
async def embed_text(self, text: str) -> list[float]:
|
||||
"""
|
||||
將單一文本轉換為向量
|
||||
@@ -151,7 +167,7 @@ class OllamaEmbeddingService:
|
||||
client = await self._get_client()
|
||||
|
||||
last_error: Exception | None = None
|
||||
for endpoint_url, provider_name in self._ollama_endpoints:
|
||||
for endpoint_url, provider_name in self._active_ollama_endpoints():
|
||||
try:
|
||||
response = await client.post(
|
||||
f"{endpoint_url}/api/embeddings",
|
||||
@@ -161,6 +177,7 @@ class OllamaEmbeddingService:
|
||||
},
|
||||
)
|
||||
response.raise_for_status()
|
||||
record_ollama_endpoint_success(endpoint_url)
|
||||
|
||||
data = response.json()
|
||||
embedding = data.get("embedding", [])
|
||||
@@ -179,6 +196,7 @@ class OllamaEmbeddingService:
|
||||
|
||||
except httpx.TimeoutException as e:
|
||||
last_error = e
|
||||
record_ollama_endpoint_failure(endpoint_url)
|
||||
logger.error(
|
||||
"embedding_timeout",
|
||||
model=self._model,
|
||||
@@ -187,6 +205,8 @@ class OllamaEmbeddingService:
|
||||
)
|
||||
except httpx.HTTPStatusError as e:
|
||||
last_error = e
|
||||
if e.response.status_code >= 500:
|
||||
record_ollama_endpoint_failure(endpoint_url)
|
||||
logger.error(
|
||||
"embedding_http_error",
|
||||
status=e.response.status_code,
|
||||
@@ -195,6 +215,8 @@ class OllamaEmbeddingService:
|
||||
)
|
||||
except Exception as e:
|
||||
last_error = e
|
||||
if isinstance(e, httpx.TransportError):
|
||||
record_ollama_endpoint_failure(endpoint_url)
|
||||
logger.error(
|
||||
"embedding_error",
|
||||
error=str(e),
|
||||
|
||||
@@ -22,7 +22,11 @@ import structlog
|
||||
|
||||
import src.repositories.rag_chunk_repository as rag_repo
|
||||
from src.core.config import settings
|
||||
from src.services.ollama_endpoint_resolver import resolve_ollama_order
|
||||
from src.services.ollama_endpoint_circuit_breaker import (
|
||||
record_ollama_endpoint_failure,
|
||||
record_ollama_endpoint_success,
|
||||
resolve_ollama_order_with_cooldown,
|
||||
)
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
@@ -129,7 +133,7 @@ class KnowledgeRAGService:
|
||||
|
||||
async def _embed(self, text: str) -> list[float] | None:
|
||||
http = await self._get_http()
|
||||
for endpoint in resolve_ollama_order("embedding"):
|
||||
for endpoint in resolve_ollama_order_with_cooldown("embedding"):
|
||||
if not endpoint.url:
|
||||
continue
|
||||
try:
|
||||
@@ -141,17 +145,22 @@ class KnowledgeRAGService:
|
||||
},
|
||||
)
|
||||
if resp.status_code == 200:
|
||||
record_ollama_endpoint_success(endpoint.url)
|
||||
logger.debug(
|
||||
"rag_embed_success",
|
||||
provider=endpoint.provider_name,
|
||||
)
|
||||
return resp.json().get("embedding")
|
||||
if resp.status_code >= 500:
|
||||
record_ollama_endpoint_failure(endpoint.url)
|
||||
logger.warning(
|
||||
"rag_embed_http_error",
|
||||
provider=endpoint.provider_name,
|
||||
status=resp.status_code,
|
||||
)
|
||||
except Exception as e:
|
||||
if isinstance(e, (httpx.TimeoutException, httpx.TransportError)):
|
||||
record_ollama_endpoint_failure(endpoint.url)
|
||||
logger.warning(
|
||||
"rag_embed_failed",
|
||||
provider=endpoint.provider_name,
|
||||
@@ -167,7 +176,7 @@ class KnowledgeRAGService:
|
||||
f"=== 問題 ===\n{question}"
|
||||
)
|
||||
http = await self._get_http()
|
||||
for endpoint in resolve_ollama_order("rag"):
|
||||
for endpoint in resolve_ollama_order_with_cooldown("rag"):
|
||||
if not endpoint.url:
|
||||
continue
|
||||
try:
|
||||
@@ -182,17 +191,22 @@ class KnowledgeRAGService:
|
||||
timeout=httpx.Timeout(90.0, connect=10.0),
|
||||
)
|
||||
if resp.status_code == 200:
|
||||
record_ollama_endpoint_success(endpoint.url)
|
||||
logger.debug(
|
||||
"rag_generate_success",
|
||||
provider=endpoint.provider_name,
|
||||
)
|
||||
return resp.json().get("response", "").strip()
|
||||
if resp.status_code >= 500:
|
||||
record_ollama_endpoint_failure(endpoint.url)
|
||||
logger.warning(
|
||||
"rag_generate_http_error",
|
||||
provider=endpoint.provider_name,
|
||||
status=resp.status_code,
|
||||
)
|
||||
except Exception as e:
|
||||
if isinstance(e, (httpx.TimeoutException, httpx.TransportError)):
|
||||
record_ollama_endpoint_failure(endpoint.url)
|
||||
logger.error(
|
||||
"rag_generate_failed",
|
||||
provider=endpoint.provider_name,
|
||||
|
||||
90
apps/api/src/services/ollama_endpoint_circuit_breaker.py
Normal file
90
apps/api/src/services/ollama_endpoint_circuit_breaker.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""
|
||||
Lightweight in-process cooldown for noisy Ollama endpoint failures.
|
||||
|
||||
This does not change ADR-110 policy order. It only suppresses endpoints that
|
||||
just failed for short-lived high-volume callers such as embedding/RAG, while
|
||||
leaving health checks and failover status free to probe the full topology.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from collections.abc import Iterable
|
||||
|
||||
from src.services.ollama_endpoint_resolver import (
|
||||
OllamaEndpointSelection,
|
||||
OllamaWorkloadType,
|
||||
resolve_ollama_order,
|
||||
)
|
||||
|
||||
DEFAULT_OLLAMA_ENDPOINT_COOLDOWN_SECONDS = 60.0
|
||||
|
||||
_blocked_until_by_url: dict[str, float] = {}
|
||||
|
||||
|
||||
def _normalize_url(url: str) -> str:
|
||||
return url.rstrip("/")
|
||||
|
||||
|
||||
def record_ollama_endpoint_failure(
|
||||
url: str,
|
||||
*,
|
||||
cooldown_seconds: float = DEFAULT_OLLAMA_ENDPOINT_COOLDOWN_SECONDS,
|
||||
now: float | None = None,
|
||||
) -> None:
|
||||
"""Temporarily suppress an endpoint after network/5xx failure."""
|
||||
if not url:
|
||||
return
|
||||
current_time = time.monotonic() if now is None else now
|
||||
_blocked_until_by_url[_normalize_url(url)] = current_time + cooldown_seconds
|
||||
|
||||
|
||||
def record_ollama_endpoint_success(url: str) -> None:
|
||||
"""Clear cooldown when an endpoint succeeds again."""
|
||||
if not url:
|
||||
return
|
||||
_blocked_until_by_url.pop(_normalize_url(url), None)
|
||||
|
||||
|
||||
def is_ollama_endpoint_blocked(url: str, *, now: float | None = None) -> bool:
|
||||
if not url:
|
||||
return False
|
||||
current_time = time.monotonic() if now is None else now
|
||||
normalized = _normalize_url(url)
|
||||
blocked_until = _blocked_until_by_url.get(normalized)
|
||||
if blocked_until is None:
|
||||
return False
|
||||
if blocked_until <= current_time:
|
||||
_blocked_until_by_url.pop(normalized, None)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def filter_ollama_urls_with_cooldown(
|
||||
urls: Iterable[str],
|
||||
*,
|
||||
now: float | None = None,
|
||||
) -> tuple[str, ...]:
|
||||
"""Return non-cooldown URLs, preserving original order and all-blocked recovery."""
|
||||
ordered_urls = tuple(url for url in urls if url)
|
||||
available = tuple(url for url in ordered_urls if not is_ollama_endpoint_blocked(url, now=now))
|
||||
return available or ordered_urls
|
||||
|
||||
|
||||
def resolve_ollama_order_with_cooldown(
|
||||
workload_type: OllamaWorkloadType = "interactive",
|
||||
*,
|
||||
now: float | None = None,
|
||||
) -> tuple[OllamaEndpointSelection, ...]:
|
||||
"""Return resolver order with short-lived noisy-failure cooldown applied."""
|
||||
order = resolve_ollama_order(workload_type)
|
||||
available = tuple(
|
||||
endpoint
|
||||
for endpoint in order
|
||||
if endpoint.url and not is_ollama_endpoint_blocked(endpoint.url, now=now)
|
||||
)
|
||||
return available or order
|
||||
|
||||
|
||||
def reset_ollama_endpoint_cooldown_for_tests() -> None:
|
||||
_blocked_until_by_url.clear()
|
||||
@@ -32,6 +32,11 @@ import structlog
|
||||
from src.core.config import settings
|
||||
from src.models.playbook import Playbook, SymptomPattern
|
||||
from src.repositories.interfaces import IEmbeddingCacheRepository
|
||||
from src.services.ollama_endpoint_circuit_breaker import (
|
||||
filter_ollama_urls_with_cooldown,
|
||||
record_ollama_endpoint_failure,
|
||||
record_ollama_endpoint_success,
|
||||
)
|
||||
from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
@@ -204,7 +209,7 @@ class PlaybookRAGService:
|
||||
try:
|
||||
client = await self._get_http_client()
|
||||
last_error = ""
|
||||
for endpoint_url in self.ollama_urls:
|
||||
for endpoint_url in filter_ollama_urls_with_cooldown(self.ollama_urls):
|
||||
try:
|
||||
response = await client.post(
|
||||
f"{endpoint_url}/api/embeddings",
|
||||
@@ -217,6 +222,8 @@ class PlaybookRAGService:
|
||||
|
||||
if response.status_code != 200:
|
||||
last_error = f"http_{response.status_code}"
|
||||
if response.status_code >= 500:
|
||||
record_ollama_endpoint_failure(endpoint_url)
|
||||
logger.warning(
|
||||
"ollama_embedding_failed",
|
||||
endpoint=endpoint_url,
|
||||
@@ -237,10 +244,16 @@ class PlaybookRAGService:
|
||||
)
|
||||
continue
|
||||
|
||||
record_ollama_endpoint_success(endpoint_url)
|
||||
logger.info("ollama_embedding_success", endpoint=endpoint_url)
|
||||
return normalize_vector(embedding)
|
||||
except Exception as endpoint_error:
|
||||
last_error = str(endpoint_error)
|
||||
if isinstance(
|
||||
endpoint_error,
|
||||
(httpx.TimeoutException, httpx.TransportError),
|
||||
):
|
||||
record_ollama_endpoint_failure(endpoint_url)
|
||||
logger.warning(
|
||||
"ollama_embedding_endpoint_error",
|
||||
endpoint=endpoint_url,
|
||||
|
||||
72
apps/api/tests/test_ollama_endpoint_circuit_breaker.py
Normal file
72
apps/api/tests/test_ollama_endpoint_circuit_breaker.py
Normal file
@@ -0,0 +1,72 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from types import SimpleNamespace
|
||||
|
||||
from src.services.ollama_endpoint_circuit_breaker import (
|
||||
filter_ollama_urls_with_cooldown,
|
||||
is_ollama_endpoint_blocked,
|
||||
record_ollama_endpoint_failure,
|
||||
record_ollama_endpoint_success,
|
||||
reset_ollama_endpoint_cooldown_for_tests,
|
||||
resolve_ollama_order_with_cooldown,
|
||||
)
|
||||
from src.services.ollama_endpoint_resolver import resolve_ollama_order
|
||||
|
||||
|
||||
def setup_function() -> None:
|
||||
reset_ollama_endpoint_cooldown_for_tests()
|
||||
|
||||
|
||||
def teardown_function() -> None:
|
||||
reset_ollama_endpoint_cooldown_for_tests()
|
||||
|
||||
|
||||
def test_cooldown_filters_failed_urls_without_reordering() -> None:
|
||||
urls = ("http://gcp-a:11434", "http://gcp-b:11434", "http://local-111:11434")
|
||||
|
||||
record_ollama_endpoint_failure("http://gcp-a:11434", now=100.0)
|
||||
record_ollama_endpoint_failure("http://gcp-b:11434", now=100.0)
|
||||
|
||||
assert filter_ollama_urls_with_cooldown(urls, now=101.0) == ("http://local-111:11434",)
|
||||
|
||||
|
||||
def test_cooldown_expires_and_success_clears_block() -> None:
|
||||
record_ollama_endpoint_failure("http://gcp-a:11434", cooldown_seconds=10.0, now=100.0)
|
||||
|
||||
assert is_ollama_endpoint_blocked("http://gcp-a:11434", now=109.9)
|
||||
assert not is_ollama_endpoint_blocked("http://gcp-a:11434", now=110.1)
|
||||
|
||||
record_ollama_endpoint_failure("http://gcp-a:11434", now=200.0)
|
||||
record_ollama_endpoint_success("http://gcp-a:11434")
|
||||
assert not is_ollama_endpoint_blocked("http://gcp-a:11434", now=201.0)
|
||||
|
||||
|
||||
def test_all_blocked_returns_full_order_for_recovery_probe() -> None:
|
||||
urls = ("http://gcp-a:11434", "http://gcp-b:11434")
|
||||
|
||||
record_ollama_endpoint_failure("http://gcp-a:11434", now=100.0)
|
||||
record_ollama_endpoint_failure("http://gcp-b:11434", now=100.0)
|
||||
|
||||
assert filter_ollama_urls_with_cooldown(urls, now=101.0) == urls
|
||||
|
||||
|
||||
def test_resolver_order_itself_remains_global_policy(monkeypatch) -> None:
|
||||
cfg = SimpleNamespace(
|
||||
OLLAMA_URL="http://gcp-a:11434",
|
||||
OLLAMA_SECONDARY_URL="http://gcp-b:11434",
|
||||
OLLAMA_FALLBACK_URL="http://local-111:11434",
|
||||
)
|
||||
import src.services.ollama_endpoint_resolver as resolver
|
||||
|
||||
monkeypatch.setattr(resolver, "settings", cfg)
|
||||
|
||||
record_ollama_endpoint_failure("http://gcp-a:11434", now=100.0)
|
||||
cooled = resolve_ollama_order_with_cooldown("embedding", now=101.0)
|
||||
policy = resolve_ollama_order("embedding", config=cfg)
|
||||
|
||||
assert [endpoint.provider_name for endpoint in cooled] == ["ollama_gcp_b", "ollama_local"]
|
||||
assert [endpoint.provider_name for endpoint in policy] == [
|
||||
"ollama_gcp_a",
|
||||
"ollama_gcp_b",
|
||||
"ollama_local",
|
||||
]
|
||||
@@ -1,3 +1,81 @@
|
||||
## 2026-05-25|T173 GCP upstream 紅燈驗證與 Ollama endpoint cooldown 降噪
|
||||
|
||||
**背景**:
|
||||
|
||||
- T172 已恢復 production `GCP-A -> GCP-B -> 111 -> Gemini` 順序,且 111 fallback 已正常承接。
|
||||
- production `/api/v1/health` 仍 degraded,原因是 110 proxy 對 GCP-A/GCP-B upstream 回 502 / connection refused。
|
||||
- nginx error log 顯示大量 `POST /api/embeddings` 反覆打 `11435/11436`,在 GCP-A/B 已知 offline 時造成不必要的 latency 與 log 噪音。
|
||||
|
||||
**live 驗證結論**:
|
||||
|
||||
- 本機與 110 直打:
|
||||
- `34.143.170.20:11434` -> connection refused。
|
||||
- `34.21.145.224:11434` -> connection refused。
|
||||
- SSH:
|
||||
- GCP-A `34.143.170.20:22` -> connection refused。
|
||||
- GCP-B `34.21.145.224:22` -> port open,但現有 `owen_taipei` / `oleetsai` / 常見 user + 本機 key 均 publickey denied。
|
||||
- GCP 控制面:
|
||||
- `owen.hy.tsai@gmail.com` 與 `owen.tsai@gmail.com` 均缺 `compute.instances.list` / `compute.firewalls.list` / `getIamPolicy` 權限。
|
||||
- 判讀:目前無法從本 session 直接修 GCP VM / firewall / OS service;需要具 Compute/IAM 權限者恢復 GCP-A/B,或提供可用 SSH key。
|
||||
|
||||
**本次修補**:
|
||||
|
||||
- 新增 `ollama_endpoint_circuit_breaker.py`:
|
||||
- 短 TTL in-process cooldown,預設 60 秒。
|
||||
- 不改 ADR-110 policy order;只在高頻 embedding/RAG 路徑暫時略過剛失敗的 endpoint。
|
||||
- 若所有 endpoint 都被 cooldown,回到完整順序,避免永遠不探測恢復。
|
||||
- 偵測到 main 上曾短暫把 `OLLAMA_URL` / `OLLAMA_SECONDARY_URL` 都切到 `11437`;這會讓 `ollama_gcp_a` label 實際指向 111,造成 Operator 誤判。
|
||||
- 本輪改回 manifest:`11435 -> GCP-A`、`11436 -> GCP-B`、`11437 -> 111`,讓 policy / label / frontend 事實保持一致。
|
||||
- 接入高頻路徑:
|
||||
- `embedding_service.py`
|
||||
- `knowledge_rag_service.py`
|
||||
- `playbook_rag.py`
|
||||
- 成功時會清除 cooldown;network/timeout/5xx 失敗才短暫標記,不因 4xx 或資料錯誤誤封 endpoint。
|
||||
- 前端/health/failover status 不套用 cooldown,仍會顯示 GCP-A/B 真實紅燈,不消音、不假裝 healthy。
|
||||
|
||||
**local verification**:
|
||||
|
||||
```text
|
||||
py_compile:
|
||||
ollama_endpoint_circuit_breaker.py
|
||||
embedding_service.py
|
||||
knowledge_rag_service.py
|
||||
playbook_rag.py
|
||||
test_ollama_endpoint_circuit_breaker.py
|
||||
-> ok
|
||||
ruff F/E9 targeted -> passed
|
||||
pytest:
|
||||
test_ollama_endpoint_circuit_breaker.py
|
||||
test_ollama_endpoint_resolver.py
|
||||
test_playbook_service.py
|
||||
-> 23 passed
|
||||
git diff --check -> ok
|
||||
```
|
||||
|
||||
**注意 / 下一步**:
|
||||
|
||||
- 這不是修復 GCP-A/B;它是 runtime 降噪與 latency 保護。
|
||||
- GCP-A/B 的真正修復仍需:
|
||||
- GCP Compute IAM:`compute.instances.get/list`、`compute.firewalls.list/update`、必要時 serial console / reset 權限。
|
||||
- 或可用 SSH key,能進 GCP-B 檢查 `systemctl status ollama`、`ss -tlnp`、`ufw/iptables`、`OLLAMA_HOST`。
|
||||
- 權限恢復後,優先修 GCP-B,因為 SSH port 已開;GCP-A 則先查 VM 狀態 / firewall / SSH service。
|
||||
|
||||
**目前整體進度**:
|
||||
|
||||
- AwoooP 告警可觀測鏈:約 99.2%。
|
||||
- 低風險自動修復閉環:約 95.5%。
|
||||
- 前端 AI 自動化管理介面同步:約 96.4%。
|
||||
- Telegram detail/history 可解釋性:約 95.5%。
|
||||
- Callback evidence / DB 回放性:約 95.6%。
|
||||
- MCP / 自建 MCP 使用可視性:約 88%。
|
||||
- Sentry / SigNoz source correlation 可視性:約 88%。
|
||||
- Ansible / PlayBook 決策可視性:約 84.8%。
|
||||
- KM owner-review / completion 可治理鏈:約 84%。
|
||||
- AI Provider lane 健康度:約 84%(111 fallback 正常;GCP-A/B 仍待 Compute/SSH 權限修復;高頻路徑已降噪)。
|
||||
- 完整 AI 自動化管理產品化:約 93.9%。
|
||||
|
||||
---
|
||||
|
||||
## 2026-05-25|T172 Ollama Provider lane 恢復 111 fallback 接線
|
||||
|
||||
**背景**:
|
||||
|
||||
@@ -17,10 +17,11 @@ data:
|
||||
# 服務端點 (非機密)
|
||||
# 2026-04-16 ogt + Claude Sonnet 4.6: 改指向 111(GPU 機,RTX)
|
||||
# 188 = CPU-only Ollama,推理極慢(>60s);111 有 GPU,avg 10s
|
||||
# 2026-05-25 Codex: emergency failover while both GCP-A and GCP-B refuse 11434.
|
||||
# Keep all Ollama slots on the live 111 proxy until GCP SSH/Ollama is repaired.
|
||||
OLLAMA_URL: "http://192.168.0.110:11437"
|
||||
OLLAMA_SECONDARY_URL: "http://192.168.0.110:11437"
|
||||
# 2026-05-25 Codex: keep ADR-110 production policy visible and ordered.
|
||||
# High-volume callers use short endpoint cooldowns when GCP-A/B are down,
|
||||
# but health/status must still show GCP-A -> GCP-B -> 111 before Gemini.
|
||||
OLLAMA_URL: "http://192.168.0.110:11435"
|
||||
OLLAMA_SECONDARY_URL: "http://192.168.0.110:11436"
|
||||
OLLAMA_FALLBACK_URL: "http://192.168.0.110:11437"
|
||||
OPENCLAW_URL: "http://192.168.0.188:8088"
|
||||
KALI_SCANNER_URL: "http://192.168.0.112:8080"
|
||||
|
||||
@@ -70,9 +70,9 @@ spec:
|
||||
- name: TELEGRAM_ENABLE_POLLING
|
||||
value: "true"
|
||||
- name: OLLAMA_URL
|
||||
value: "http://192.168.0.110:11437" # 2026-05-25 Codex: emergency local 111 failover while GCP-A/B refuse 11434
|
||||
value: "http://192.168.0.110:11435" # 2026-05-25 Codex: GCP-A via 110 proxy; cooldown protects noisy offline retries
|
||||
- name: OLLAMA_SECONDARY_URL
|
||||
value: "http://192.168.0.110:11437" # 2026-05-25 Codex: emergency local 111 failover while GCP-A/B refuse 11434
|
||||
value: "http://192.168.0.110:11436" # 2026-05-25 Codex: GCP-B via 110 proxy; cooldown protects noisy offline retries
|
||||
- name: OLLAMA_FALLBACK_URL
|
||||
value: "http://192.168.0.110:11437" # 2026-05-25 Codex: 111 via 110 proxy before Gemini
|
||||
- name: ALERT_AI_ALLOW_CLOUD_FALLBACK
|
||||
|
||||
Reference in New Issue
Block a user