feat(ai): Phase 20 P3 優化 - Circuit Breaker + 指數退避 + Prometheus
P3-1: Circuit Breaker 狀態機 (CLOSED/OPEN/HALF_OPEN)
- 連續 3 次失敗觸發斷路
- 60 秒後自動嘗試恢復
- 防止連鎖故障

P3-2: 指數退避重試
- 基礎延遲 1s,最大 30s
- 含 10% jitter,避免驚群效應 (thundering herd)

P3-3: Prometheus Metrics
- nvidia_tool_call_requests_total (status, tool_name)
- nvidia_tool_call_latency_seconds (histogram)
- nvidia_circuit_breaker_state_changes_total (from_state, to_state)

測試: 25 → 34 PASSED

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -18,12 +18,16 @@ NVIDIA Nemotron Provider - ADR-036
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
from enum import Enum
|
||||
from typing import Any, Protocol, runtime_checkable # 2026-03-29 ogt: P2-1 Protocol
|
||||
|
||||
import httpx
|
||||
import structlog
|
||||
from prometheus_client import Counter, Histogram # 2026-03-29 ogt: P3-3 Prometheus
|
||||
|
||||
from src.core.config import get_settings
|
||||
from src.core.telemetry import get_tracer # 2026-03-29 ogt: P1-2 OTEL 追蹤
|
||||
@@ -105,6 +109,131 @@ NVIDIA_TIMEOUT = 60.0
|
||||
# 重試次數
|
||||
MAX_RETRIES = 2
|
||||
|
||||
# =============================================================================
|
||||
# P3-1: Circuit Breaker 配置 (2026-03-29 ogt)
|
||||
# =============================================================================
|
||||
|
||||
# Circuit breaker thresholds (P3-1).
# Number of consecutive recorded failures that opens the circuit.
CIRCUIT_BREAKER_FAILURE_THRESHOLD = 3
# Seconds to stay OPEN before a recovery attempt is allowed.
CIRCUIT_BREAKER_RECOVERY_TIMEOUT = 60
# Number of probe requests intended for the HALF_OPEN state.
# NOTE(review): no visible code reads this constant yet - confirm whether the
# half-open probe limit is enforced elsewhere.
CIRCUIT_BREAKER_HALF_OPEN_REQUESTS = 1

# Exponential backoff settings (P3-2).
# Delay before retry k is min(RETRY_BASE_DELAY * RETRY_EXPONENTIAL_BASE ** k,
# RETRY_MAX_DELAY) seconds, plus jitter added by the caller.
RETRY_BASE_DELAY = 1.0  # base delay in seconds
RETRY_MAX_DELAY = 30.0  # upper bound on the delay in seconds
RETRY_EXPONENTIAL_BASE = 2  # growth factor per attempt
|
||||
|
||||
# =============================================================================
|
||||
# P3-3: Prometheus Metrics (2026-03-29 ogt)
|
||||
# =============================================================================
|
||||
|
||||
# Request counter, labelled by outcome and by the tool that was called.
# Callers in this module use an empty tool_name when no tool is known.
NVIDIA_REQUESTS_TOTAL = Counter(
    "nvidia_tool_call_requests_total",
    "Total NVIDIA Tool Calling requests",
    ["status", "tool_name"],
)

# End-to-end latency of one tool-calling invocation, in seconds.
# The observed value spans the whole retry loop, so it can exceed a single
# request timeout; the buckets above 60 s keep that tail out of +Inf.
# NOTE(review): the upper buckets assume up to three 60 s attempts plus
# backoff - adjust if the timeout or retry count changes.
NVIDIA_LATENCY_HISTOGRAM = Histogram(
    "nvidia_tool_call_latency_seconds",
    "NVIDIA Tool Calling latency in seconds",
    buckets=[1, 5, 10, 15, 20, 30, 45, 60, 90, 120, 180, 240],
)

# Counts circuit breaker transitions, labelled by source and target state.
NVIDIA_CIRCUIT_BREAKER_STATE = Counter(
    "nvidia_circuit_breaker_state_changes_total",
    "Circuit breaker state changes",
    ["from_state", "to_state"],
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# P3-1: Circuit Breaker 狀態機 (2026-03-29 ogt)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class CircuitState(Enum):
    """States of the circuit breaker state machine."""

    # Normal operation: requests are allowed.
    CLOSED = "closed"
    # Circuit tripped: requests are rejected.
    OPEN = "open"
    # Recovery probe: requests are allowed to test the upstream.
    HALF_OPEN = "half_open"
|
||||
|
||||
|
||||
class CircuitBreaker:
    """Circuit breaker for the NVIDIA API (P3-1).

    Rejects requests after repeated failures so that a failing upstream does
    not cause cascading failures.

    State transitions:
        CLOSED    -> OPEN       when recorded failures reach the threshold
        OPEN      -> HALF_OPEN  once the recovery timeout has elapsed
        HALF_OPEN -> CLOSED     on a recorded success
        HALF_OPEN -> OPEN       on a recorded failure

    The recovery interval is measured with ``time.monotonic()`` so that
    wall-clock adjustments cannot shorten or extend the OPEN period.

    NOTE(review): HALF_OPEN admits every caller; this class does not limit
    the number of concurrent probe requests.
    """

    def __init__(
        self,
        failure_threshold: int = CIRCUIT_BREAKER_FAILURE_THRESHOLD,
        recovery_timeout: float = CIRCUIT_BREAKER_RECOVERY_TIMEOUT,
    ) -> None:
        """Create a breaker in the CLOSED state.

        Args:
            failure_threshold: Failures recorded since the last success that
                open the circuit.
            recovery_timeout: Seconds to remain OPEN before moving to
                HALF_OPEN.
        """
        self._state = CircuitState.CLOSED
        self._failure_count = 0
        # time.monotonic() reading taken by the latest record_failure().
        # Only consulted while OPEN, which is always entered via a failure.
        self._last_failure_time: float = 0.0
        self._failure_threshold = failure_threshold
        self._recovery_timeout = recovery_timeout

    @property
    def state(self) -> CircuitState:
        """Return the current state, promoting OPEN to HALF_OPEN when due.

        Reading this property has a side effect: once the recovery timeout
        has elapsed since the last failure, the breaker moves to HALF_OPEN.
        """
        if self._state == CircuitState.OPEN:
            elapsed = time.monotonic() - self._last_failure_time
            if elapsed >= self._recovery_timeout:
                self._transition_to(CircuitState.HALF_OPEN)
        return self._state

    def _transition_to(self, new_state: CircuitState) -> None:
        """Switch to ``new_state``, recording a metric and a log entry.

        A transition to the current state is a no-op and records nothing.
        """
        old_state = self._state
        if old_state == new_state:
            return
        NVIDIA_CIRCUIT_BREAKER_STATE.labels(
            from_state=old_state.value, to_state=new_state.value
        ).inc()
        logger.info(
            "circuit_breaker_state_change",
            from_state=old_state.value,
            to_state=new_state.value,
        )
        self._state = new_state

    def can_execute(self) -> bool:
        """Return True when a request may be sent (CLOSED or HALF_OPEN)."""
        # Reading self.state also applies the OPEN -> HALF_OPEN promotion.
        return self.state != CircuitState.OPEN

    def record_success(self) -> None:
        """Record a successful request.

        Closes the circuit if it was HALF_OPEN and always resets the
        failure counter.
        """
        if self._state == CircuitState.HALF_OPEN:
            self._transition_to(CircuitState.CLOSED)
        self._failure_count = 0

    def record_failure(self) -> None:
        """Record a failed request and open the circuit when warranted.

        A failure while HALF_OPEN re-opens the circuit immediately;
        otherwise the circuit opens once the failure count reaches the
        threshold.
        """
        self._failure_count += 1
        self._last_failure_time = time.monotonic()

        if self._state == CircuitState.HALF_OPEN:
            # The recovery probe failed: trip again.
            self._transition_to(CircuitState.OPEN)
        elif self._failure_count >= self._failure_threshold:
            # Failure threshold reached: trip.
            self._transition_to(CircuitState.OPEN)
|
||||
|
||||
|
||||
# 高風險 Tool 清單 (需要 HITL 審核)
|
||||
HIGH_RISK_TOOLS: set[str] = {
|
||||
"delete_pod",
|
||||
@@ -152,9 +281,12 @@ class NvidiaProvider:
|
||||
|
||||
Args:
|
||||
api_key: NVIDIA API Key (預設從 settings 取得)
|
||||
|
||||
2026-03-29 ogt: P3-1 加入 Circuit Breaker
|
||||
"""
|
||||
self._api_key = api_key or settings.NVIDIA_API_KEY
|
||||
self._client: httpx.AsyncClient | None = None
|
||||
self._circuit_breaker = CircuitBreaker() # P3-1: Circuit Breaker
|
||||
|
||||
async def _get_client(self) -> httpx.AsyncClient:
|
||||
"""取得或建立 HTTP Client"""
|
||||
@@ -193,6 +325,7 @@ class NvidiaProvider:
|
||||
NvidiaProviderResult: 包含驗證後的 Tool Calls
|
||||
|
||||
2026-03-29 ogt: P1-1/P1-2 修復 - 加入 OTEL + Langfuse 追蹤
|
||||
2026-03-29 ogt: P3-1/P3-2/P3-3 - Circuit Breaker + 指數退避 + Prometheus
|
||||
"""
|
||||
start_time = time.perf_counter()
|
||||
|
||||
@@ -202,9 +335,24 @@ class NvidiaProvider:
|
||||
span.set_attribute("ai.model", model)
|
||||
span.set_attribute("ai.tool_count", len(tools))
|
||||
|
||||
# P3-1: Circuit Breaker 檢查
|
||||
if not self._circuit_breaker.can_execute():
|
||||
span.set_attribute("ai.error", "circuit_breaker_open")
|
||||
NVIDIA_REQUESTS_TOTAL.labels(status="circuit_open", tool_name="").inc()
|
||||
logger.warning(
|
||||
"nvidia_circuit_breaker_open",
|
||||
state=self._circuit_breaker.state.value,
|
||||
)
|
||||
return NvidiaProviderResult(
|
||||
success=False,
|
||||
error="Circuit Breaker OPEN - NVIDIA API 暫時不可用",
|
||||
fallback_triggered=True,
|
||||
)
|
||||
|
||||
# 檢查 API Key
|
||||
if not self._api_key:
|
||||
span.set_attribute("ai.error", "api_key_not_set")
|
||||
NVIDIA_REQUESTS_TOTAL.labels(status="error", tool_name="").inc()
|
||||
return NvidiaProviderResult(
|
||||
success=False,
|
||||
error="NVIDIA_API_KEY 未設定",
|
||||
@@ -240,13 +388,14 @@ class NvidiaProvider:
|
||||
metadata={"model": model, "tool_count": len(tools)},
|
||||
) as langfuse_ctx:
|
||||
|
||||
# 執行請求 (含重試)
|
||||
# 執行請求 (含 P3-2 指數退避重試)
|
||||
response_data: dict | None = None
|
||||
last_error: str | None = None
|
||||
|
||||
for attempt in range(MAX_RETRIES + 1):
|
||||
try:
|
||||
response_data = await self._send_request(request_body)
|
||||
self._circuit_breaker.record_success() # P3-1
|
||||
break
|
||||
except Exception as e:
|
||||
last_error = str(e)
|
||||
@@ -258,15 +407,26 @@ class NvidiaProvider:
|
||||
error=last_error,
|
||||
)
|
||||
if attempt == MAX_RETRIES:
|
||||
self._circuit_breaker.record_failure() # P3-1
|
||||
break
|
||||
# P3-2: 指數退避 (含 jitter)
|
||||
delay = min(
|
||||
RETRY_BASE_DELAY * (RETRY_EXPONENTIAL_BASE ** attempt),
|
||||
RETRY_MAX_DELAY,
|
||||
)
|
||||
jitter = random.uniform(0, delay * 0.1) # 10% jitter
|
||||
await asyncio.sleep(delay + jitter)
|
||||
|
||||
latency_ms = (time.perf_counter() - start_time) * 1000
|
||||
latency_seconds = latency_ms / 1000
|
||||
span.set_attribute("ai.latency_ms", round(latency_ms, 2))
|
||||
NVIDIA_LATENCY_HISTOGRAM.observe(latency_seconds) # P3-3
|
||||
|
||||
# 請求失敗
|
||||
if response_data is None:
|
||||
span.set_attribute("ai.success", False)
|
||||
span.set_attribute("ai.error", last_error or "unknown")
|
||||
NVIDIA_REQUESTS_TOTAL.labels(status="error", tool_name="").inc()
|
||||
logger.error(
|
||||
"nvidia_request_failed",
|
||||
error=last_error,
|
||||
@@ -335,6 +495,13 @@ class NvidiaProvider:
|
||||
},
|
||||
)
|
||||
|
||||
# P3-3: Prometheus 成功指標
|
||||
for tc in tool_calls:
|
||||
if tc.valid and tc.tool_name:
|
||||
NVIDIA_REQUESTS_TOTAL.labels(
|
||||
status="success", tool_name=tc.tool_name
|
||||
).inc()
|
||||
|
||||
logger.info(
|
||||
"nvidia_tool_call_completed",
|
||||
success=True,
|
||||
|
||||
@@ -471,3 +471,132 @@ class TestRateLimiterIntegration:
|
||||
|
||||
assert "nvidia" in COST_LIMITS
|
||||
assert COST_LIMITS["nvidia"]["total_cost_usd"] == 0.0 # 免費
|
||||
|
||||
|
||||
class TestCircuitBreaker:
    """P3-1: tests for the circuit breaker state machine."""

    def test_circuit_breaker_initial_state(self):
        """A new breaker starts CLOSED and admits requests."""
        from src.services.nvidia_provider import CircuitBreaker, CircuitState

        breaker = CircuitBreaker()
        assert breaker.state == CircuitState.CLOSED
        assert breaker.can_execute()

    def test_circuit_breaker_opens_after_failures(self):
        """The breaker opens once the failure threshold is reached."""
        from src.services.nvidia_provider import CircuitBreaker, CircuitState

        breaker = CircuitBreaker(failure_threshold=3)

        # The first two failures stay below the threshold.
        for _ in range(2):
            breaker.record_failure()
            assert breaker.state == CircuitState.CLOSED

        # The third failure trips the breaker.
        breaker.record_failure()
        assert breaker.state == CircuitState.OPEN
        assert not breaker.can_execute()

    def test_circuit_breaker_success_resets_count(self):
        """A success resets the failure counter."""
        from src.services.nvidia_provider import CircuitBreaker, CircuitState

        breaker = CircuitBreaker(failure_threshold=3)

        breaker.record_failure()
        breaker.record_failure()
        breaker.record_success()  # resets the counter

        # Three more failures are needed to trip the breaker.
        breaker.record_failure()
        breaker.record_failure()
        assert breaker.state == CircuitState.CLOSED
        breaker.record_failure()
        assert breaker.state == CircuitState.OPEN

    def test_circuit_breaker_half_open_recovery(self):
        """A success while HALF_OPEN closes the breaker."""
        import time

        from src.services.nvidia_provider import CircuitBreaker, CircuitState

        breaker = CircuitBreaker(failure_threshold=1, recovery_timeout=0.1)

        breaker.record_failure()  # trips the breaker
        assert breaker.state == CircuitState.OPEN

        time.sleep(0.15)  # wait past the recovery timeout

        # Reading the state triggers the HALF_OPEN transition.
        assert breaker.state == CircuitState.HALF_OPEN
        assert breaker.can_execute()

        # A success returns the breaker to CLOSED.
        breaker.record_success()
        assert breaker.state == CircuitState.CLOSED

    def test_circuit_breaker_half_open_failure(self):
        """A failure while HALF_OPEN re-opens the breaker."""
        import time

        from src.services.nvidia_provider import CircuitBreaker, CircuitState

        breaker = CircuitBreaker(failure_threshold=1, recovery_timeout=0.1)

        breaker.record_failure()
        assert breaker.state == CircuitState.OPEN

        time.sleep(0.15)

        assert breaker.state == CircuitState.HALF_OPEN

        # The probe fails, so the breaker trips again.
        breaker.record_failure()
        assert breaker.state == CircuitState.OPEN

    def test_provider_has_circuit_breaker(self):
        """NvidiaProvider exposes a circuit breaker attribute."""
        from src.services.nvidia_provider import NvidiaProvider

        provider = NvidiaProvider()
        assert hasattr(provider, "_circuit_breaker")
|
||||
|
||||
|
||||
class TestPrometheusMetrics:
    """P3-3: tests for the Prometheus metric definitions."""

    def test_metrics_defined(self):
        """All three metrics can be imported and are not None."""
        from src.services.nvidia_provider import (
            NVIDIA_CIRCUIT_BREAKER_STATE,
            NVIDIA_LATENCY_HISTOGRAM,
            NVIDIA_REQUESTS_TOTAL,
        )

        assert NVIDIA_REQUESTS_TOTAL is not None
        assert NVIDIA_LATENCY_HISTOGRAM is not None
        assert NVIDIA_CIRCUIT_BREAKER_STATE is not None

    def test_metrics_labels(self):
        """The request counter accepts the status and tool_name labels."""
        from src.services.nvidia_provider import NVIDIA_REQUESTS_TOTAL

        # Creating a labelled child must succeed.
        labelled = NVIDIA_REQUESTS_TOTAL.labels(status="test", tool_name="test_tool")
        assert labelled is not None
|
||||
|
||||
|
||||
class TestExponentialBackoff:
    """P3-2: tests for the exponential backoff settings."""

    def test_backoff_constants_defined(self):
        """The backoff constants exist and hold the expected values."""
        from src.services.nvidia_provider import (
            RETRY_BASE_DELAY,
            RETRY_EXPONENTIAL_BASE,
            RETRY_MAX_DELAY,
        )

        assert RETRY_BASE_DELAY == 1.0
        assert RETRY_MAX_DELAY == 30.0
        assert RETRY_EXPONENTIAL_BASE == 2
|
||||
|
||||
Reference in New Issue
Block a user