diff --git a/apps/api/src/services/nvidia_provider.py b/apps/api/src/services/nvidia_provider.py index 8f95fb83..2cf1f6bd 100644 --- a/apps/api/src/services/nvidia_provider.py +++ b/apps/api/src/services/nvidia_provider.py @@ -18,12 +18,16 @@ NVIDIA Nemotron Provider - ADR-036 from __future__ import annotations +import asyncio import json +import random import time +from enum import Enum from typing import Any, Protocol, runtime_checkable # 2026-03-29 ogt: P2-1 Protocol import httpx import structlog +from prometheus_client import Counter, Histogram # 2026-03-29 ogt: P3-3 Prometheus from src.core.config import get_settings from src.core.telemetry import get_tracer # 2026-03-29 ogt: P1-2 OTEL 追蹤 @@ -105,6 +109,131 @@ NVIDIA_TIMEOUT = 60.0 # 重試次數 MAX_RETRIES = 2 +# ============================================================================= +# P3-1: Circuit Breaker 配置 (2026-03-29 ogt) +# ============================================================================= + +# Circuit Breaker 閾值 +CIRCUIT_BREAKER_FAILURE_THRESHOLD = 3 # 連續失敗次數觸發斷路 +CIRCUIT_BREAKER_RECOVERY_TIMEOUT = 60 # 斷路後等待恢復時間 (秒) +CIRCUIT_BREAKER_HALF_OPEN_REQUESTS = 1 # 半開狀態允許的測試請求數 + +# P3-2: 指數退避配置 +RETRY_BASE_DELAY = 1.0 # 基礎延遲 (秒) +RETRY_MAX_DELAY = 30.0 # 最大延遲 (秒) +RETRY_EXPONENTIAL_BASE = 2 # 指數基數 + +# ============================================================================= +# P3-3: Prometheus Metrics (2026-03-29 ogt) +# ============================================================================= + +NVIDIA_REQUESTS_TOTAL = Counter( + "nvidia_tool_call_requests_total", + "Total NVIDIA Tool Calling requests", + ["status", "tool_name"], +) + +NVIDIA_LATENCY_HISTOGRAM = Histogram( + "nvidia_tool_call_latency_seconds", + "NVIDIA Tool Calling latency in seconds", + buckets=[1, 5, 10, 15, 20, 30, 45, 60], +) + +NVIDIA_CIRCUIT_BREAKER_STATE = Counter( + "nvidia_circuit_breaker_state_changes_total", + "Circuit breaker state changes", + ["from_state", "to_state"], +) + + +# 
# =============================================================================
# P3-1: Circuit Breaker state machine (2026-03-29 ogt)
# =============================================================================


class CircuitState(Enum):
    """States of the circuit breaker."""

    CLOSED = "closed"  # normal operation, requests pass through
    OPEN = "open"  # tripped, requests are rejected
    HALF_OPEN = "half_open"  # probing whether the backend has recovered


class CircuitBreaker:
    """
    Circuit breaker guarding calls to the NVIDIA API (P3-1).

    Prevents cascading failures by rejecting requests after repeated
    consecutive failures.

    State transitions:
        CLOSED    -> (consecutive failures >= failure_threshold) -> OPEN
        OPEN      -> (recovery_timeout seconds elapsed)          -> HALF_OPEN
        HALF_OPEN -> (success)                                   -> CLOSED
        HALF_OPEN -> (failure)                                   -> OPEN

    While HALF_OPEN only ``half_open_max_requests`` probe requests are
    admitted; further callers are rejected until a probe reports back.

    All timing uses ``time.monotonic()`` so wall-clock adjustments cannot
    shorten or extend the recovery window.

    NOTE(review): no locking is done here; this presumably runs on a single
    asyncio event loop (none of the methods await) - confirm before sharing
    an instance across threads.
    """

    def __init__(
        self,
        failure_threshold: int = CIRCUIT_BREAKER_FAILURE_THRESHOLD,
        recovery_timeout: float = CIRCUIT_BREAKER_RECOVERY_TIMEOUT,
        half_open_max_requests: int = CIRCUIT_BREAKER_HALF_OPEN_REQUESTS,
    ) -> None:
        """
        Args:
            failure_threshold: Consecutive failures that trip the breaker.
            recovery_timeout: Seconds to stay OPEN before probing again.
            half_open_max_requests: Probe requests admitted while HALF_OPEN.
                Values below 1 are treated as 1, otherwise the breaker could
                never leave HALF_OPEN.
        """
        self._state = CircuitState.CLOSED
        self._failure_count = 0
        self._last_failure_time: float = 0.0
        self._failure_threshold = failure_threshold
        self._recovery_timeout = recovery_timeout
        self._half_open_max_requests = max(1, half_open_max_requests)
        self._half_open_admitted = 0  # probes admitted in the current HALF_OPEN phase
        self._last_probe_time: float = 0.0  # monotonic time of the last admitted probe

    @property
    def state(self) -> CircuitState:
        """Return the current state, promoting OPEN to HALF_OPEN once the
        recovery timeout has elapsed since the last recorded failure."""
        if (
            self._state is CircuitState.OPEN
            and time.monotonic() - self._last_failure_time >= self._recovery_timeout
        ):
            self._transition_to(CircuitState.HALF_OPEN)
        return self._state

    def _transition_to(self, new_state: CircuitState) -> None:
        """Switch to ``new_state``; real changes are counted in Prometheus and logged."""
        old_state = self._state
        if old_state is new_state:
            return

        NVIDIA_CIRCUIT_BREAKER_STATE.labels(
            from_state=old_state.value, to_state=new_state.value
        ).inc()
        logger.info(
            "circuit_breaker_state_change",
            from_state=old_state.value,
            to_state=new_state.value,
        )
        self._state = new_state

        if new_state is CircuitState.HALF_OPEN:
            # Every HALF_OPEN phase starts with a full probe budget.
            self._half_open_admitted = 0

    def can_execute(self) -> bool:
        """
        Decide whether a request may be sent right now.

        Returns:
            True when CLOSED, or when HALF_OPEN and a probe slot is free
            (the slot is consumed by this call). False when OPEN or when
            every HALF_OPEN probe slot is already taken.
        """
        current = self.state  # triggers the OPEN -> HALF_OPEN check
        if current is CircuitState.CLOSED:
            return True
        if current is CircuitState.OPEN:
            return False

        # HALF_OPEN: admit a limited number of probe requests.
        now = time.monotonic()
        if (
            self._half_open_admitted >= self._half_open_max_requests
            and now - self._last_probe_time >= self._recovery_timeout
        ):
            # An admitted caller may return early without recording an
            # outcome; reclaim its slot after one recovery window so the
            # breaker cannot stay stuck in HALF_OPEN.
            self._half_open_admitted = 0

        if self._half_open_admitted < self._half_open_max_requests:
            self._half_open_admitted += 1
            self._last_probe_time = now
            return True
        return False

    def record_success(self) -> None:
        """Record a successful request: close the breaker if it was probing
        and reset the consecutive-failure counter."""
        if self._state is CircuitState.HALF_OPEN:
            self._transition_to(CircuitState.CLOSED)
        self._failure_count = 0

    def record_failure(self) -> None:
        """Record a failed request and trip the breaker when warranted."""
        self._failure_count += 1
        self._last_failure_time = time.monotonic()

        if self._state is CircuitState.HALF_OPEN:
            # The probe failed: go straight back to OPEN.
            self._transition_to(CircuitState.OPEN)
        elif self._failure_count >= self._failure_threshold:
            # Consecutive failures reached the threshold: trip the breaker.
            self._transition_to(CircuitState.OPEN)
class TestCircuitBreaker:
    """P3-1: circuit breaker state-machine tests."""

    @staticmethod
    def _expire_recovery_window(cb) -> None:
        """Make the breaker's recovery timeout appear elapsed without sleeping.

        Rewinds the recorded last-failure timestamp past the recovery timeout,
        which keeps the tests fast and independent of scheduler timing.
        """
        cb._last_failure_time -= cb._recovery_timeout + 1.0

    def test_circuit_breaker_initial_state(self):
        """A new breaker starts CLOSED and admits requests."""
        from src.services.nvidia_provider import CircuitBreaker, CircuitState

        cb = CircuitBreaker()
        assert cb.state == CircuitState.CLOSED
        assert cb.can_execute()

    def test_circuit_breaker_opens_after_failures(self):
        """The breaker opens only once the failure threshold is reached."""
        from src.services.nvidia_provider import CircuitBreaker, CircuitState

        cb = CircuitBreaker(failure_threshold=3)

        # Three consecutive failures: the third one trips the breaker.
        cb.record_failure()
        assert cb.state == CircuitState.CLOSED
        cb.record_failure()
        assert cb.state == CircuitState.CLOSED
        cb.record_failure()
        assert cb.state == CircuitState.OPEN
        assert not cb.can_execute()

    def test_circuit_breaker_success_resets_count(self):
        """A success resets the consecutive-failure counter."""
        from src.services.nvidia_provider import CircuitBreaker, CircuitState

        cb = CircuitBreaker(failure_threshold=3)

        cb.record_failure()
        cb.record_failure()
        cb.record_success()  # resets the counter

        # Three more failures are needed before the breaker opens.
        cb.record_failure()
        cb.record_failure()
        assert cb.state == CircuitState.CLOSED
        cb.record_failure()
        assert cb.state == CircuitState.OPEN

    def test_circuit_breaker_half_open_recovery(self):
        """After the recovery timeout a successful probe closes the breaker."""
        from src.services.nvidia_provider import CircuitBreaker, CircuitState

        cb = CircuitBreaker(failure_threshold=1, recovery_timeout=60)

        cb.record_failure()  # trips the breaker
        assert cb.state == CircuitState.OPEN

        self._expire_recovery_window(cb)

        # Reading the state performs the OPEN -> HALF_OPEN transition.
        assert cb.state == CircuitState.HALF_OPEN
        assert cb.can_execute()

        # A successful probe returns the breaker to CLOSED.
        cb.record_success()
        assert cb.state == CircuitState.CLOSED

    def test_circuit_breaker_half_open_failure(self):
        """A failed probe while HALF_OPEN re-opens the breaker."""
        from src.services.nvidia_provider import CircuitBreaker, CircuitState

        cb = CircuitBreaker(failure_threshold=1, recovery_timeout=60)

        cb.record_failure()
        assert cb.state == CircuitState.OPEN

        self._expire_recovery_window(cb)
        assert cb.state == CircuitState.HALF_OPEN

        # The probe fails: back to OPEN.
        cb.record_failure()
        assert cb.state == CircuitState.OPEN
        assert not cb.can_execute()

    def test_provider_has_circuit_breaker(self):
        """NvidiaProvider owns a CircuitBreaker instance."""
        from src.services.nvidia_provider import CircuitBreaker, NvidiaProvider

        provider = NvidiaProvider()
        assert isinstance(provider._circuit_breaker, CircuitBreaker)


class TestPrometheusMetrics:
    """P3-3: Prometheus metrics tests."""

    def test_metrics_defined(self):
        """The Prometheus metric objects are exported by the module."""
        from src.services.nvidia_provider import (
            NVIDIA_CIRCUIT_BREAKER_STATE,
            NVIDIA_LATENCY_HISTOGRAM,
            NVIDIA_REQUESTS_TOTAL,
        )

        assert NVIDIA_REQUESTS_TOTAL is not None
        assert NVIDIA_LATENCY_HISTOGRAM is not None
        assert NVIDIA_CIRCUIT_BREAKER_STATE is not None

    def test_metrics_labels(self):
        """The request counter accepts the (status, tool_name) label pair."""
        from src.services.nvidia_provider import NVIDIA_REQUESTS_TOTAL

        # Creating a labelled child must succeed with both labels supplied.
        metric = NVIDIA_REQUESTS_TOTAL.labels(status="test", tool_name="test_tool")
        assert metric is not None


class TestExponentialBackoff:
    """P3-2: exponential backoff tests."""

    def test_backoff_constants_defined(self):
        """The backoff constants keep their expected values."""
        from src.services.nvidia_provider import (
            RETRY_BASE_DELAY,
            RETRY_EXPONENTIAL_BASE,
            RETRY_MAX_DELAY,
        )

        assert RETRY_BASE_DELAY == 1.0
        assert RETRY_MAX_DELAY == 30.0
        assert RETRY_EXPONENTIAL_BASE == 2