diff --git a/apps/api/src/services/embedding_service.py b/apps/api/src/services/embedding_service.py index 49f6f442..1409e518 100644 --- a/apps/api/src/services/embedding_service.py +++ b/apps/api/src/services/embedding_service.py @@ -22,6 +22,11 @@ import httpx import structlog from src.services.model_registry import get_model as _get_model +from src.services.ollama_endpoint_circuit_breaker import ( + filter_ollama_urls_with_cooldown, + record_ollama_endpoint_failure, + record_ollama_endpoint_success, +) from src.services.ollama_endpoint_resolver import resolve_ollama_order logger = structlog.get_logger(__name__) @@ -135,6 +140,17 @@ class OllamaEmbeddingService: ) return self._client + def _active_ollama_endpoints(self) -> tuple[tuple[str, str], ...]: + active_urls = filter_ollama_urls_with_cooldown( + url for url, _provider_name in self._ollama_endpoints + ) + active_url_set = set(active_urls) + return tuple( + (url, provider_name) + for url, provider_name in self._ollama_endpoints + if url in active_url_set + ) + async def embed_text(self, text: str) -> list[float]: """ 將單一文本轉換為向量 @@ -151,7 +167,7 @@ class OllamaEmbeddingService: client = await self._get_client() last_error: Exception | None = None - for endpoint_url, provider_name in self._ollama_endpoints: + for endpoint_url, provider_name in self._active_ollama_endpoints(): try: response = await client.post( f"{endpoint_url}/api/embeddings", @@ -161,6 +177,7 @@ class OllamaEmbeddingService: }, ) response.raise_for_status() + record_ollama_endpoint_success(endpoint_url) data = response.json() embedding = data.get("embedding", []) @@ -179,6 +196,7 @@ class OllamaEmbeddingService: except httpx.TimeoutException as e: last_error = e + record_ollama_endpoint_failure(endpoint_url) logger.error( "embedding_timeout", model=self._model, @@ -187,6 +205,8 @@ class OllamaEmbeddingService: ) except httpx.HTTPStatusError as e: last_error = e + if e.response.status_code >= 500: + record_ollama_endpoint_failure(endpoint_url) logger.error( "embedding_http_error", status=e.response.status_code, @@ -195,6 +215,8 @@ class OllamaEmbeddingService: ) except Exception as e: last_error = e + if isinstance(e, httpx.TransportError): + record_ollama_endpoint_failure(endpoint_url) logger.error( "embedding_error", error=str(e), diff --git a/apps/api/src/services/knowledge_rag_service.py b/apps/api/src/services/knowledge_rag_service.py index 9685f0a6..c3031076 100644 --- a/apps/api/src/services/knowledge_rag_service.py +++ b/apps/api/src/services/knowledge_rag_service.py @@ -22,7 +22,11 @@ import structlog import src.repositories.rag_chunk_repository as rag_repo from src.core.config import settings -from src.services.ollama_endpoint_resolver import resolve_ollama_order +from src.services.ollama_endpoint_circuit_breaker import ( + record_ollama_endpoint_failure, + record_ollama_endpoint_success, + resolve_ollama_order_with_cooldown, +) logger = structlog.get_logger(__name__) @@ -129,7 +133,7 @@ class KnowledgeRAGService: async def _embed(self, text: str) -> list[float] | None: http = await self._get_http() - for endpoint in resolve_ollama_order("embedding"): + for endpoint in resolve_ollama_order_with_cooldown("embedding"): if not endpoint.url: continue try: @@ -141,17 +145,22 @@ class KnowledgeRAGService: }, ) if resp.status_code == 200: + record_ollama_endpoint_success(endpoint.url) logger.debug( "rag_embed_success", provider=endpoint.provider_name, ) return resp.json().get("embedding") + if resp.status_code >= 500: + record_ollama_endpoint_failure(endpoint.url) logger.warning( "rag_embed_http_error", provider=endpoint.provider_name, status=resp.status_code, ) except Exception as e: + if isinstance(e, (httpx.TimeoutException, httpx.TransportError)): + record_ollama_endpoint_failure(endpoint.url) logger.warning( "rag_embed_failed", provider=endpoint.provider_name, @@ -167,7 +176,7 @@ class KnowledgeRAGService: f"=== 問題 ===\n{question}" ) http = await self._get_http() - for endpoint in resolve_ollama_order("rag"): + for endpoint in resolve_ollama_order_with_cooldown("rag"): if not endpoint.url: continue try: @@ -182,17 +191,22 @@ class KnowledgeRAGService: timeout=httpx.Timeout(90.0, connect=10.0), ) if resp.status_code == 200: + record_ollama_endpoint_success(endpoint.url) logger.debug( "rag_generate_success", provider=endpoint.provider_name, ) return resp.json().get("response", "").strip() + if resp.status_code >= 500: + record_ollama_endpoint_failure(endpoint.url) logger.warning( "rag_generate_http_error", provider=endpoint.provider_name, status=resp.status_code, ) except Exception as e: + if isinstance(e, (httpx.TimeoutException, httpx.TransportError)): + record_ollama_endpoint_failure(endpoint.url) logger.error( "rag_generate_failed", provider=endpoint.provider_name, diff --git a/apps/api/src/services/ollama_endpoint_circuit_breaker.py b/apps/api/src/services/ollama_endpoint_circuit_breaker.py new file mode 100644 index 00000000..84271f61 --- /dev/null +++ b/apps/api/src/services/ollama_endpoint_circuit_breaker.py @@ -0,0 +1,90 @@ +""" +Lightweight in-process cooldown for noisy Ollama endpoint failures. + +This does not change ADR-110 policy order. It only suppresses endpoints that +just failed for short-lived high-volume callers such as embedding/RAG, while +leaving health checks and failover status free to probe the full topology. +""" + +from __future__ import annotations + +import time +from collections.abc import Iterable + +from src.services.ollama_endpoint_resolver import ( + OllamaEndpointSelection, + OllamaWorkloadType, + resolve_ollama_order, +) + +DEFAULT_OLLAMA_ENDPOINT_COOLDOWN_SECONDS = 60.0 + +_blocked_until_by_url: dict[str, float] = {} + + +def _normalize_url(url: str) -> str: + return url.rstrip("/") + + +def record_ollama_endpoint_failure( + url: str, + *, + cooldown_seconds: float = DEFAULT_OLLAMA_ENDPOINT_COOLDOWN_SECONDS, + now: float | None = None, +) -> None: + """Temporarily suppress an endpoint after network/5xx failure.""" + if not url: + return + current_time = time.monotonic() if now is None else now + _blocked_until_by_url[_normalize_url(url)] = current_time + cooldown_seconds + + +def record_ollama_endpoint_success(url: str) -> None: + """Clear cooldown when an endpoint succeeds again.""" + if not url: + return + _blocked_until_by_url.pop(_normalize_url(url), None) + + +def is_ollama_endpoint_blocked(url: str, *, now: float | None = None) -> bool: + if not url: + return False + current_time = time.monotonic() if now is None else now + normalized = _normalize_url(url) + blocked_until = _blocked_until_by_url.get(normalized) + if blocked_until is None: + return False + if blocked_until <= current_time: + _blocked_until_by_url.pop(normalized, None) + return False + return True + + +def filter_ollama_urls_with_cooldown( + urls: Iterable[str], + *, + now: float | None = None, +) -> tuple[str, ...]: + """Return non-cooldown URLs, preserving original order and all-blocked recovery.""" + ordered_urls = tuple(url for url in urls if url) + available = tuple(url for url in ordered_urls if not is_ollama_endpoint_blocked(url, now=now)) + return available or ordered_urls + + +def resolve_ollama_order_with_cooldown( + workload_type: OllamaWorkloadType = "interactive", + *, + now: float | None = None, +) -> tuple[OllamaEndpointSelection, ...]: + """Return resolver order with short-lived noisy-failure cooldown applied.""" + order = resolve_ollama_order(workload_type) + available = tuple( + endpoint + for endpoint in order + if endpoint.url and not is_ollama_endpoint_blocked(endpoint.url, now=now) + ) + return available or order + + +def reset_ollama_endpoint_cooldown_for_tests() -> None: + _blocked_until_by_url.clear() diff --git a/apps/api/src/services/playbook_rag.py b/apps/api/src/services/playbook_rag.py index db818c73..8d10bd2b 100644 --- a/apps/api/src/services/playbook_rag.py +++ b/apps/api/src/services/playbook_rag.py @@ -32,6 +32,11 @@ import structlog from src.core.config import settings from src.models.playbook import Playbook, SymptomPattern from src.repositories.interfaces import IEmbeddingCacheRepository +from src.services.ollama_endpoint_circuit_breaker import ( + filter_ollama_urls_with_cooldown, + record_ollama_endpoint_failure, + record_ollama_endpoint_success, +) from src.services.ollama_endpoint_resolver import resolve_ollama_endpoint logger = structlog.get_logger(__name__) @@ -204,7 +209,7 @@ class PlaybookRAGService: try: client = await self._get_http_client() last_error = "" - for endpoint_url in self.ollama_urls: + for endpoint_url in filter_ollama_urls_with_cooldown(self.ollama_urls): try: response = await client.post( f"{endpoint_url}/api/embeddings", @@ -217,6 +222,8 @@ class PlaybookRAGService: if response.status_code != 200: last_error = f"http_{response.status_code}" + if response.status_code >= 500: + record_ollama_endpoint_failure(endpoint_url) logger.warning( "ollama_embedding_failed", endpoint=endpoint_url, @@ -237,10 +244,16 @@ class PlaybookRAGService: ) continue + record_ollama_endpoint_success(endpoint_url) logger.info("ollama_embedding_success", endpoint=endpoint_url) return normalize_vector(embedding) except Exception as endpoint_error: last_error = str(endpoint_error) + if isinstance( + endpoint_error, + (httpx.TimeoutException, httpx.TransportError), + ): + record_ollama_endpoint_failure(endpoint_url) logger.warning( "ollama_embedding_endpoint_error", endpoint=endpoint_url, diff --git a/apps/api/tests/test_ollama_endpoint_circuit_breaker.py b/apps/api/tests/test_ollama_endpoint_circuit_breaker.py new file mode 100644 index 00000000..4af0855f --- /dev/null +++ b/apps/api/tests/test_ollama_endpoint_circuit_breaker.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +from types import SimpleNamespace + +from src.services.ollama_endpoint_circuit_breaker import ( + filter_ollama_urls_with_cooldown, + is_ollama_endpoint_blocked, + record_ollama_endpoint_failure, + record_ollama_endpoint_success, + reset_ollama_endpoint_cooldown_for_tests, + resolve_ollama_order_with_cooldown, +) +from src.services.ollama_endpoint_resolver import resolve_ollama_order + + +def setup_function() -> None: + reset_ollama_endpoint_cooldown_for_tests() + + +def teardown_function() -> None: + reset_ollama_endpoint_cooldown_for_tests() + + +def test_cooldown_filters_failed_urls_without_reordering() -> None: + urls = ("http://gcp-a:11434", "http://gcp-b:11434", "http://local-111:11434") + + record_ollama_endpoint_failure("http://gcp-a:11434", now=100.0) + record_ollama_endpoint_failure("http://gcp-b:11434", now=100.0) + + assert filter_ollama_urls_with_cooldown(urls, now=101.0) == ("http://local-111:11434",) + + +def test_cooldown_expires_and_success_clears_block() -> None: + record_ollama_endpoint_failure("http://gcp-a:11434", cooldown_seconds=10.0, now=100.0) + + assert is_ollama_endpoint_blocked("http://gcp-a:11434", now=109.9) + assert not is_ollama_endpoint_blocked("http://gcp-a:11434", now=110.1) + + record_ollama_endpoint_failure("http://gcp-a:11434", now=200.0) + record_ollama_endpoint_success("http://gcp-a:11434") + assert not is_ollama_endpoint_blocked("http://gcp-a:11434", now=201.0) + + +def test_all_blocked_returns_full_order_for_recovery_probe() -> None: + urls = ("http://gcp-a:11434", "http://gcp-b:11434") + + record_ollama_endpoint_failure("http://gcp-a:11434", now=100.0) + record_ollama_endpoint_failure("http://gcp-b:11434", now=100.0) + + assert filter_ollama_urls_with_cooldown(urls, now=101.0) == urls + + +def test_resolver_order_itself_remains_global_policy(monkeypatch) -> None: + cfg = SimpleNamespace( + OLLAMA_URL="http://gcp-a:11434", + OLLAMA_SECONDARY_URL="http://gcp-b:11434", + OLLAMA_FALLBACK_URL="http://local-111:11434", + ) + import src.services.ollama_endpoint_resolver as resolver + + monkeypatch.setattr(resolver, "settings", cfg) + + record_ollama_endpoint_failure("http://gcp-a:11434", now=100.0) + cooled = resolve_ollama_order_with_cooldown("embedding", now=101.0) + policy = resolve_ollama_order("embedding", config=cfg) + + assert [endpoint.provider_name for endpoint in cooled] == ["ollama_gcp_b", "ollama_local"] + assert [endpoint.provider_name for endpoint in policy] == [ + "ollama_gcp_a", + "ollama_gcp_b", + "ollama_local", + ] diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index c59b6f79..050a6458 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,81 @@ +## 2026-05-25|T173 GCP upstream 紅燈驗證與 Ollama endpoint cooldown 降噪 + +**背景**: + +- T172 已恢復 production `GCP-A -> GCP-B -> 111 -> Gemini` 順序,且 111 fallback 已正常承接。 +- production `/api/v1/health` 仍 degraded,原因是 110 proxy 對 GCP-A/GCP-B upstream 回 502 / connection refused。 +- nginx error log 顯示大量 `POST /api/embeddings` 反覆打 `11435/11436`,在 GCP-A/B 已知 offline 時造成不必要的 latency 與 log 噪音。 + +**live 驗證結論**: + +- 本機與 110 直打: + - `34.143.170.20:11434` -> connection refused。 + - `34.21.145.224:11434` -> connection refused。 +- SSH: + - GCP-A `34.143.170.20:22` -> connection refused。 + - GCP-B `34.21.145.224:22` -> port open,但現有 `owen_taipei` / `oleetsai` / 常見 user + 本機 key 均 publickey denied。 +- GCP 控制面: + - `owen.hy.tsai@gmail.com` 與 `owen.tsai@gmail.com` 均缺 `compute.instances.list` / `compute.firewalls.list` / `getIamPolicy` 權限。 +- 判讀:目前無法從本 session 直接修 GCP VM / firewall / OS service;需要具 Compute/IAM 權限者恢復 GCP-A/B,或提供可用 SSH key。 + +**本次修補**: + +- 新增 `ollama_endpoint_circuit_breaker.py`: + - 短 TTL in-process cooldown,預設 60 秒。 + - 不改 ADR-110 policy order;只在高頻 embedding/RAG 路徑暫時略過剛失敗的 endpoint。 + - 若所有 endpoint 都被 cooldown,回到完整順序,避免永遠不探測恢復。 +- 偵測到 main 上曾短暫把 `OLLAMA_URL` / `OLLAMA_SECONDARY_URL` 都切到 `11437`;這會讓 `ollama_gcp_a` label 實際指向 111,造成 Operator 誤判。 +- 本輪改回 manifest:`11435 -> GCP-A`、`11436 -> GCP-B`、`11437 -> 111`,讓 policy / label / frontend 事實保持一致。 +- 接入高頻路徑: + - `embedding_service.py` + - `knowledge_rag_service.py` + - `playbook_rag.py` +- 成功時會清除 cooldown;network/timeout/5xx 失敗才短暫標記,不因 4xx 或資料錯誤誤封 endpoint。 +- 前端/health/failover status 不套用 cooldown,仍會顯示 GCP-A/B 真實紅燈,不消音、不假裝 healthy。 + +**local verification**: + +```text +py_compile: + ollama_endpoint_circuit_breaker.py + embedding_service.py + knowledge_rag_service.py + playbook_rag.py + test_ollama_endpoint_circuit_breaker.py + -> ok +ruff F/E9 targeted -> passed +pytest: + test_ollama_endpoint_circuit_breaker.py + test_ollama_endpoint_resolver.py + test_playbook_service.py + -> 23 passed +git diff --check -> ok +``` + +**注意 / 下一步**: + +- 這不是修復 GCP-A/B;它是 runtime 降噪與 latency 保護。 +- GCP-A/B 的真正修復仍需: + - GCP Compute IAM:`compute.instances.get/list`、`compute.firewalls.list/update`、必要時 serial console / reset 權限。 + - 或可用 SSH key,能進 GCP-B 檢查 `systemctl status ollama`、`ss -tlnp`、`ufw/iptables`、`OLLAMA_HOST`。 +- 權限恢復後,優先修 GCP-B,因為 SSH port 已開;GCP-A 則先查 VM 狀態 / firewall / SSH service。 + +**目前整體進度**: + +- AwoooP 告警可觀測鏈:約 99.2%。 +- 低風險自動修復閉環:約 95.5%。 +- 前端 AI 自動化管理介面同步:約 96.4%。 +- Telegram detail/history 可解釋性:約 95.5%。 +- Callback evidence / DB 回放性:約 95.6%。 +- MCP / 自建 MCP 使用可視性:約 88%。 +- Sentry / SigNoz source correlation 可視性:約 88%。 +- Ansible / PlayBook 決策可視性:約 84.8%。 +- KM owner-review / completion 可治理鏈:約 84%。 +- AI Provider lane 健康度:約 84%(111 fallback 正常;GCP-A/B 仍待 Compute/SSH 權限修復;高頻路徑已降噪)。 +- 完整 AI 自動化管理產品化:約 93.9%。 + +--- + ## 2026-05-25|T172 Ollama Provider lane 恢復 111 fallback 接線 **背景**: diff --git a/k8s/awoooi-prod/04-configmap.yaml b/k8s/awoooi-prod/04-configmap.yaml index 7db07e61..8a35d2ea 100644 --- a/k8s/awoooi-prod/04-configmap.yaml +++ b/k8s/awoooi-prod/04-configmap.yaml @@ -17,10 +17,11 @@ data: # 服務端點 (非機密) # 2026-04-16 ogt + Claude Sonnet 4.6: 改指向 111(GPU 機,RTX) # 188 = CPU-only Ollama,推理極慢(>60s);111 有 GPU,avg 10s - # 2026-05-25 Codex: emergency failover while both GCP-A and GCP-B refuse 11434. - # Keep all Ollama slots on the live 111 proxy until GCP SSH/Ollama is repaired. - OLLAMA_URL: "http://192.168.0.110:11437" - OLLAMA_SECONDARY_URL: "http://192.168.0.110:11437" + # 2026-05-25 Codex: keep ADR-110 production policy visible and ordered. + # High-volume callers use short endpoint cooldowns when GCP-A/B are down, + # but health/status must still show GCP-A -> GCP-B -> 111 before Gemini. + OLLAMA_URL: "http://192.168.0.110:11435" + OLLAMA_SECONDARY_URL: "http://192.168.0.110:11436" OLLAMA_FALLBACK_URL: "http://192.168.0.110:11437" OPENCLAW_URL: "http://192.168.0.188:8088" KALI_SCANNER_URL: "http://192.168.0.112:8080" diff --git a/k8s/awoooi-prod/06-deployment-api.yaml b/k8s/awoooi-prod/06-deployment-api.yaml index 7c278b22..00774551 100644 --- a/k8s/awoooi-prod/06-deployment-api.yaml +++ b/k8s/awoooi-prod/06-deployment-api.yaml @@ -70,9 +70,9 @@ spec: - name: TELEGRAM_ENABLE_POLLING value: "true" - name: OLLAMA_URL - value: "http://192.168.0.110:11437" # 2026-05-25 Codex: emergency local 111 failover while GCP-A/B refuse 11434 + value: "http://192.168.0.110:11435" # 2026-05-25 Codex: GCP-A via 110 proxy; cooldown protects noisy offline retries - name: OLLAMA_SECONDARY_URL - value: "http://192.168.0.110:11437" # 2026-05-25 Codex: emergency local 111 failover while GCP-A/B refuse 11434 + value: "http://192.168.0.110:11436" # 2026-05-25 Codex: GCP-B via 110 proxy; cooldown protects noisy offline retries - name: OLLAMA_FALLBACK_URL value: "http://192.168.0.110:11437" # 2026-05-25 Codex: 111 via 110 proxy before Gemini - name: ALERT_AI_ALLOW_CLOUD_FALLBACK