fix(ollama): cooldown noisy failed endpoints
All checks were successful
CD Pipeline / tests (push) Successful in 58s
Code Review / ai-code-review (push) Successful in 11s
CD Pipeline / build-and-deploy (push) Successful in 3m45s
CD Pipeline / post-deploy-checks (push) Successful in 1m17s

This commit is contained in:
Your Name
2026-05-25 12:09:40 +08:00
parent 8a78344bcc
commit 2dcd214156
8 changed files with 301 additions and 11 deletions

View File

@@ -22,6 +22,11 @@ import httpx
import structlog
from src.services.model_registry import get_model as _get_model
from src.services.ollama_endpoint_circuit_breaker import (
filter_ollama_urls_with_cooldown,
record_ollama_endpoint_failure,
record_ollama_endpoint_success,
)
from src.services.ollama_endpoint_resolver import resolve_ollama_order
logger = structlog.get_logger(__name__)
@@ -135,6 +140,17 @@ class OllamaEmbeddingService:
)
return self._client
def _active_ollama_endpoints(self) -> tuple[tuple[str, str], ...]:
active_urls = filter_ollama_urls_with_cooldown(
url for url, _provider_name in self._ollama_endpoints
)
active_url_set = set(active_urls)
return tuple(
(url, provider_name)
for url, provider_name in self._ollama_endpoints
if url in active_url_set
)
async def embed_text(self, text: str) -> list[float]:
"""
將單一文本轉換為向量
@@ -151,7 +167,7 @@ class OllamaEmbeddingService:
client = await self._get_client()
last_error: Exception | None = None
for endpoint_url, provider_name in self._ollama_endpoints:
for endpoint_url, provider_name in self._active_ollama_endpoints():
try:
response = await client.post(
f"{endpoint_url}/api/embeddings",
@@ -161,6 +177,7 @@ class OllamaEmbeddingService:
},
)
response.raise_for_status()
record_ollama_endpoint_success(endpoint_url)
data = response.json()
embedding = data.get("embedding", [])
@@ -179,6 +196,7 @@ class OllamaEmbeddingService:
except httpx.TimeoutException as e:
last_error = e
record_ollama_endpoint_failure(endpoint_url)
logger.error(
"embedding_timeout",
model=self._model,
@@ -187,6 +205,8 @@ class OllamaEmbeddingService:
)
except httpx.HTTPStatusError as e:
last_error = e
if e.response.status_code >= 500:
record_ollama_endpoint_failure(endpoint_url)
logger.error(
"embedding_http_error",
status=e.response.status_code,
@@ -195,6 +215,8 @@ class OllamaEmbeddingService:
)
except Exception as e:
last_error = e
if isinstance(e, httpx.TransportError):
record_ollama_endpoint_failure(endpoint_url)
logger.error(
"embedding_error",
error=str(e),

View File

@@ -22,7 +22,11 @@ import structlog
import src.repositories.rag_chunk_repository as rag_repo
from src.core.config import settings
from src.services.ollama_endpoint_resolver import resolve_ollama_order
from src.services.ollama_endpoint_circuit_breaker import (
record_ollama_endpoint_failure,
record_ollama_endpoint_success,
resolve_ollama_order_with_cooldown,
)
logger = structlog.get_logger(__name__)
@@ -129,7 +133,7 @@ class KnowledgeRAGService:
async def _embed(self, text: str) -> list[float] | None:
http = await self._get_http()
for endpoint in resolve_ollama_order("embedding"):
for endpoint in resolve_ollama_order_with_cooldown("embedding"):
if not endpoint.url:
continue
try:
@@ -141,17 +145,22 @@ class KnowledgeRAGService:
},
)
if resp.status_code == 200:
record_ollama_endpoint_success(endpoint.url)
logger.debug(
"rag_embed_success",
provider=endpoint.provider_name,
)
return resp.json().get("embedding")
if resp.status_code >= 500:
record_ollama_endpoint_failure(endpoint.url)
logger.warning(
"rag_embed_http_error",
provider=endpoint.provider_name,
status=resp.status_code,
)
except Exception as e:
if isinstance(e, (httpx.TimeoutException, httpx.TransportError)):
record_ollama_endpoint_failure(endpoint.url)
logger.warning(
"rag_embed_failed",
provider=endpoint.provider_name,
@@ -167,7 +176,7 @@ class KnowledgeRAGService:
f"=== 問題 ===\n{question}"
)
http = await self._get_http()
for endpoint in resolve_ollama_order("rag"):
for endpoint in resolve_ollama_order_with_cooldown("rag"):
if not endpoint.url:
continue
try:
@@ -182,17 +191,22 @@ class KnowledgeRAGService:
timeout=httpx.Timeout(90.0, connect=10.0),
)
if resp.status_code == 200:
record_ollama_endpoint_success(endpoint.url)
logger.debug(
"rag_generate_success",
provider=endpoint.provider_name,
)
return resp.json().get("response", "").strip()
if resp.status_code >= 500:
record_ollama_endpoint_failure(endpoint.url)
logger.warning(
"rag_generate_http_error",
provider=endpoint.provider_name,
status=resp.status_code,
)
except Exception as e:
if isinstance(e, (httpx.TimeoutException, httpx.TransportError)):
record_ollama_endpoint_failure(endpoint.url)
logger.error(
"rag_generate_failed",
provider=endpoint.provider_name,

View File

@@ -0,0 +1,90 @@
"""
Lightweight in-process cooldown for noisy Ollama endpoint failures.
This does not change ADR-110 policy order. It only suppresses endpoints that
just failed for short-lived high-volume callers such as embedding/RAG, while
leaving health checks and failover status free to probe the full topology.
"""
from __future__ import annotations
import time
from collections.abc import Iterable
from src.services.ollama_endpoint_resolver import (
OllamaEndpointSelection,
OllamaWorkloadType,
resolve_ollama_order,
)
DEFAULT_OLLAMA_ENDPOINT_COOLDOWN_SECONDS = 60.0
_blocked_until_by_url: dict[str, float] = {}
def _normalize_url(url: str) -> str:
return url.rstrip("/")
def record_ollama_endpoint_failure(
url: str,
*,
cooldown_seconds: float = DEFAULT_OLLAMA_ENDPOINT_COOLDOWN_SECONDS,
now: float | None = None,
) -> None:
"""Temporarily suppress an endpoint after network/5xx failure."""
if not url:
return
current_time = time.monotonic() if now is None else now
_blocked_until_by_url[_normalize_url(url)] = current_time + cooldown_seconds
def record_ollama_endpoint_success(url: str) -> None:
"""Clear cooldown when an endpoint succeeds again."""
if not url:
return
_blocked_until_by_url.pop(_normalize_url(url), None)
def is_ollama_endpoint_blocked(url: str, *, now: float | None = None) -> bool:
if not url:
return False
current_time = time.monotonic() if now is None else now
normalized = _normalize_url(url)
blocked_until = _blocked_until_by_url.get(normalized)
if blocked_until is None:
return False
if blocked_until <= current_time:
_blocked_until_by_url.pop(normalized, None)
return False
return True
def filter_ollama_urls_with_cooldown(
urls: Iterable[str],
*,
now: float | None = None,
) -> tuple[str, ...]:
"""Return non-cooldown URLs, preserving original order and all-blocked recovery."""
ordered_urls = tuple(url for url in urls if url)
available = tuple(url for url in ordered_urls if not is_ollama_endpoint_blocked(url, now=now))
return available or ordered_urls
def resolve_ollama_order_with_cooldown(
workload_type: OllamaWorkloadType = "interactive",
*,
now: float | None = None,
) -> tuple[OllamaEndpointSelection, ...]:
"""Return resolver order with short-lived noisy-failure cooldown applied."""
order = resolve_ollama_order(workload_type)
available = tuple(
endpoint
for endpoint in order
if endpoint.url and not is_ollama_endpoint_blocked(endpoint.url, now=now)
)
return available or order
def reset_ollama_endpoint_cooldown_for_tests() -> None:
_blocked_until_by_url.clear()

View File

@@ -32,6 +32,11 @@ import structlog
from src.core.config import settings
from src.models.playbook import Playbook, SymptomPattern
from src.repositories.interfaces import IEmbeddingCacheRepository
from src.services.ollama_endpoint_circuit_breaker import (
filter_ollama_urls_with_cooldown,
record_ollama_endpoint_failure,
record_ollama_endpoint_success,
)
from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint
logger = structlog.get_logger(__name__)
@@ -204,7 +209,7 @@ class PlaybookRAGService:
try:
client = await self._get_http_client()
last_error = ""
for endpoint_url in self.ollama_urls:
for endpoint_url in filter_ollama_urls_with_cooldown(self.ollama_urls):
try:
response = await client.post(
f"{endpoint_url}/api/embeddings",
@@ -217,6 +222,8 @@ class PlaybookRAGService:
if response.status_code != 200:
last_error = f"http_{response.status_code}"
if response.status_code >= 500:
record_ollama_endpoint_failure(endpoint_url)
logger.warning(
"ollama_embedding_failed",
endpoint=endpoint_url,
@@ -237,10 +244,16 @@ class PlaybookRAGService:
)
continue
record_ollama_endpoint_success(endpoint_url)
logger.info("ollama_embedding_success", endpoint=endpoint_url)
return normalize_vector(embedding)
except Exception as endpoint_error:
last_error = str(endpoint_error)
if isinstance(
endpoint_error,
(httpx.TimeoutException, httpx.TransportError),
):
record_ollama_endpoint_failure(endpoint_url)
logger.warning(
"ollama_embedding_endpoint_error",
endpoint=endpoint_url,

View File

@@ -0,0 +1,72 @@
from __future__ import annotations
from types import SimpleNamespace
from src.services.ollama_endpoint_circuit_breaker import (
filter_ollama_urls_with_cooldown,
is_ollama_endpoint_blocked,
record_ollama_endpoint_failure,
record_ollama_endpoint_success,
reset_ollama_endpoint_cooldown_for_tests,
resolve_ollama_order_with_cooldown,
)
from src.services.ollama_endpoint_resolver import resolve_ollama_order
def setup_function() -> None:
reset_ollama_endpoint_cooldown_for_tests()
def teardown_function() -> None:
reset_ollama_endpoint_cooldown_for_tests()
def test_cooldown_filters_failed_urls_without_reordering() -> None:
urls = ("http://gcp-a:11434", "http://gcp-b:11434", "http://local-111:11434")
record_ollama_endpoint_failure("http://gcp-a:11434", now=100.0)
record_ollama_endpoint_failure("http://gcp-b:11434", now=100.0)
assert filter_ollama_urls_with_cooldown(urls, now=101.0) == ("http://local-111:11434",)
def test_cooldown_expires_and_success_clears_block() -> None:
record_ollama_endpoint_failure("http://gcp-a:11434", cooldown_seconds=10.0, now=100.0)
assert is_ollama_endpoint_blocked("http://gcp-a:11434", now=109.9)
assert not is_ollama_endpoint_blocked("http://gcp-a:11434", now=110.1)
record_ollama_endpoint_failure("http://gcp-a:11434", now=200.0)
record_ollama_endpoint_success("http://gcp-a:11434")
assert not is_ollama_endpoint_blocked("http://gcp-a:11434", now=201.0)
def test_all_blocked_returns_full_order_for_recovery_probe() -> None:
urls = ("http://gcp-a:11434", "http://gcp-b:11434")
record_ollama_endpoint_failure("http://gcp-a:11434", now=100.0)
record_ollama_endpoint_failure("http://gcp-b:11434", now=100.0)
assert filter_ollama_urls_with_cooldown(urls, now=101.0) == urls
def test_resolver_order_itself_remains_global_policy(monkeypatch) -> None:
cfg = SimpleNamespace(
OLLAMA_URL="http://gcp-a:11434",
OLLAMA_SECONDARY_URL="http://gcp-b:11434",
OLLAMA_FALLBACK_URL="http://local-111:11434",
)
import src.services.ollama_endpoint_resolver as resolver
monkeypatch.setattr(resolver, "settings", cfg)
record_ollama_endpoint_failure("http://gcp-a:11434", now=100.0)
cooled = resolve_ollama_order_with_cooldown("embedding", now=101.0)
policy = resolve_ollama_order("embedding", config=cfg)
assert [endpoint.provider_name for endpoint in cooled] == ["ollama_gcp_b", "ollama_local"]
assert [endpoint.provider_name for endpoint in policy] == [
"ollama_gcp_a",
"ollama_gcp_b",
"ollama_local",
]

View File

@@ -1,3 +1,81 @@
## 2026-05-25T173 GCP upstream 紅燈驗證與 Ollama endpoint cooldown 降噪
**背景**
- T172 已恢復 production `GCP-A -> GCP-B -> 111 -> Gemini` 順序,且 111 fallback 已正常承接。
- production `/api/v1/health` 仍 degraded原因是 110 proxy 對 GCP-A/GCP-B upstream 回 502 / connection refused。
- nginx error log 顯示大量 `POST /api/embeddings` 反覆打 `11435/11436`,在 GCP-A/B 已知 offline 時造成不必要的 latency 與 log 噪音。
**live 驗證結論**
- 本機與 110 直打:
- `34.143.170.20:11434` -> connection refused。
- `34.21.145.224:11434` -> connection refused。
- SSH
- GCP-A `34.143.170.20:22` -> connection refused。
- GCP-B `34.21.145.224:22` -> port open但現有 `owen_taipei` / `oleetsai` / 常見 user + 本機 key 均 publickey denied。
- GCP 控制面:
- `owen.hy.tsai@gmail.com``owen.tsai@gmail.com` 均缺 `compute.instances.list` / `compute.firewalls.list` / `getIamPolicy` 權限。
- 判讀:目前無法從本 session 直接修 GCP VM / firewall / OS service需要具 Compute/IAM 權限者恢復 GCP-A/B或提供可用 SSH key。
**本次修補**
- 新增 `ollama_endpoint_circuit_breaker.py`
- 短 TTL in-process cooldown預設 60 秒。
- 不改 ADR-110 policy order只在高頻 embedding/RAG 路徑暫時略過剛失敗的 endpoint。
- 若所有 endpoint 都被 cooldown回到完整順序避免永遠不探測恢復。
- 偵測到 main 上曾短暫把 `OLLAMA_URL` / `OLLAMA_SECONDARY_URL` 都切到 `11437`;這會讓 `ollama_gcp_a` label 實際指向 111造成 Operator 誤判。
- 本輪改回 manifest`11435 -> GCP-A``11436 -> GCP-B``11437 -> 111`,讓 policy / label / frontend 事實保持一致。
- 接入高頻路徑:
- `embedding_service.py`
- `knowledge_rag_service.py`
- `playbook_rag.py`
- 成功時會清除 cooldownnetwork/timeout/5xx 失敗才短暫標記,不因 4xx 或資料錯誤誤封 endpoint。
- 前端/health/failover status 不套用 cooldown仍會顯示 GCP-A/B 真實紅燈,不消音、不假裝 healthy。
**local verification**
```text
py_compile:
ollama_endpoint_circuit_breaker.py
embedding_service.py
knowledge_rag_service.py
playbook_rag.py
test_ollama_endpoint_circuit_breaker.py
-> ok
ruff F/E9 targeted -> passed
pytest:
test_ollama_endpoint_circuit_breaker.py
test_ollama_endpoint_resolver.py
test_playbook_service.py
-> 23 passed
git diff --check -> ok
```
**注意 / 下一步**
- 這不是修復 GCP-A/B它是 runtime 降噪與 latency 保護。
- GCP-A/B 的真正修復仍需:
- GCP Compute IAM`compute.instances.get/list``compute.firewalls.list/update`、必要時 serial console / reset 權限。
- 或可用 SSH key能進 GCP-B 檢查 `systemctl status ollama``ss -tlnp``ufw/iptables``OLLAMA_HOST`
- 權限恢復後,優先修 GCP-B因為 SSH port 已開GCP-A 則先查 VM 狀態 / firewall / SSH service。
**目前整體進度**
- AwoooP 告警可觀測鏈:約 99.2%。
- 低風險自動修復閉環:約 95.5%。
- 前端 AI 自動化管理介面同步:約 96.4%。
- Telegram detail/history 可解釋性:約 95.5%。
- Callback evidence / DB 回放性:約 95.6%。
- MCP / 自建 MCP 使用可視性:約 88%。
- Sentry / SigNoz source correlation 可視性:約 88%。
- Ansible / PlayBook 決策可視性:約 84.8%。
- KM owner-review / completion 可治理鏈:約 84%。
- AI Provider lane 健康度:約 84%111 fallback 正常GCP-A/B 仍待 Compute/SSH 權限修復;高頻路徑已降噪)。
- 完整 AI 自動化管理產品化:約 93.9%。
---
## 2026-05-25T172 Ollama Provider lane 恢復 111 fallback 接線
**背景**

View File

@@ -17,10 +17,11 @@ data:
# 服務端點 (非機密)
# 2026-04-16 ogt + Claude Sonnet 4.6: 改指向 111GPU 機RTX
# 188 = CPU-only Ollama推理極慢>60s111 有 GPUavg 10s
# 2026-05-25 Codex: emergency failover while both GCP-A and GCP-B refuse 11434.
# Keep all Ollama slots on the live 111 proxy until GCP SSH/Ollama is repaired.
OLLAMA_URL: "http://192.168.0.110:11437"
OLLAMA_SECONDARY_URL: "http://192.168.0.110:11437"
# 2026-05-25 Codex: keep ADR-110 production policy visible and ordered.
# High-volume callers use short endpoint cooldowns when GCP-A/B are down,
# but health/status must still show GCP-A -> GCP-B -> 111 before Gemini.
OLLAMA_URL: "http://192.168.0.110:11435"
OLLAMA_SECONDARY_URL: "http://192.168.0.110:11436"
OLLAMA_FALLBACK_URL: "http://192.168.0.110:11437"
OPENCLAW_URL: "http://192.168.0.188:8088"
KALI_SCANNER_URL: "http://192.168.0.112:8080"

View File

@@ -70,9 +70,9 @@ spec:
- name: TELEGRAM_ENABLE_POLLING
value: "true"
- name: OLLAMA_URL
value: "http://192.168.0.110:11437" # 2026-05-25 Codex: emergency local 111 failover while GCP-A/B refuse 11434
value: "http://192.168.0.110:11435" # 2026-05-25 Codex: GCP-A via 110 proxy; cooldown protects noisy offline retries
- name: OLLAMA_SECONDARY_URL
value: "http://192.168.0.110:11437" # 2026-05-25 Codex: emergency local 111 failover while GCP-A/B refuse 11434
value: "http://192.168.0.110:11436" # 2026-05-25 Codex: GCP-B via 110 proxy; cooldown protects noisy offline retries
- name: OLLAMA_FALLBACK_URL
value: "http://192.168.0.110:11437" # 2026-05-25 Codex: 111 via 110 proxy before Gemini
- name: ALERT_AI_ALLOW_CLOUD_FALLBACK