feat(ai): Phase 20 P3 優化 - Circuit Breaker + 指數退避 + Prometheus

P3-1: Circuit Breaker 狀態機 (CLOSED/OPEN/HALF_OPEN)
- 連續 3 次請求失敗觸發斷路 (每次請求用盡重試後才計為 1 次失敗)
- 60 秒後自動嘗試恢復
- 防止連鎖故障

P3-2: 指數退避重試
- 基礎延遲 1s,最大 30s
- 含最多 10% 的隨機 jitter,避免驚群效應 (thundering herd)

P3-3: Prometheus Metrics
- nvidia_tool_call_requests_total (status, tool_name)
- nvidia_tool_call_latency_seconds (histogram)
- nvidia_circuit_breaker_state_changes_total

測試: 25 → 34 PASSED

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-03-29 01:49:08 +08:00
parent d9a6f9d066
commit ae21ba2cc6
2 changed files with 297 additions and 1 deletions

View File

@@ -18,12 +18,16 @@ NVIDIA Nemotron Provider - ADR-036
from __future__ import annotations
import asyncio
import json
import random
import time
from enum import Enum
from typing import Any, Protocol, runtime_checkable # 2026-03-29 ogt: P2-1 Protocol
import httpx
import structlog
from prometheus_client import Counter, Histogram # 2026-03-29 ogt: P3-3 Prometheus
from src.core.config import get_settings
from src.core.telemetry import get_tracer # 2026-03-29 ogt: P1-2 OTEL 追蹤
@@ -105,6 +109,131 @@ NVIDIA_TIMEOUT = 60.0
# Number of retries after the first attempt (total attempts = MAX_RETRIES + 1)
MAX_RETRIES: int = 2

# =============================================================================
# P3-1: Circuit Breaker configuration (2026-03-29 ogt)
# =============================================================================

# Consecutive recorded failures that trip the breaker
CIRCUIT_BREAKER_FAILURE_THRESHOLD: int = 3
# Seconds to stay open before a recovery attempt is allowed
CIRCUIT_BREAKER_RECOVERY_TIMEOUT: float = 60
# Number of trial requests allowed while half-open
CIRCUIT_BREAKER_HALF_OPEN_REQUESTS: int = 1

# P3-2: exponential backoff configuration
# Delay before the first retry, in seconds
RETRY_BASE_DELAY: float = 1.0
# Upper bound applied to the computed delay, in seconds
RETRY_MAX_DELAY: float = 30.0
# Multiplier base: delay grows as RETRY_BASE_DELAY * RETRY_EXPONENTIAL_BASE ** attempt
RETRY_EXPONENTIAL_BASE: int = 2
# =============================================================================
# P3-3: Prometheus metrics (2026-03-29 ogt)
# =============================================================================

# Request counter, labelled by outcome ("status") and tool name.
NVIDIA_REQUESTS_TOTAL = Counter(
    name="nvidia_tool_call_requests_total",
    documentation="Total NVIDIA Tool Calling requests",
    labelnames=["status", "tool_name"],
)

# Latency histogram, in seconds.
# NOTE(review): the top finite bucket is 60s; if the observed latency spans
# several retried attempts it may land in +Inf - confirm the buckets suffice.
NVIDIA_LATENCY_HISTOGRAM = Histogram(
    name="nvidia_tool_call_latency_seconds",
    documentation="NVIDIA Tool Calling latency in seconds",
    buckets=[1, 5, 10, 15, 20, 30, 45, 60],
)

# Counter of circuit-breaker transitions, labelled by source and target state.
NVIDIA_CIRCUIT_BREAKER_STATE = Counter(
    name="nvidia_circuit_breaker_state_changes_total",
    documentation="Circuit breaker state changes",
    labelnames=["from_state", "to_state"],
)
# =============================================================================
# P3-1: Circuit Breaker 狀態機 (2026-03-29 ogt)
# =============================================================================
class CircuitState(Enum):
    """States of the circuit breaker state machine."""

    # Normal operation: requests are allowed through.
    CLOSED = "closed"
    # Tripped: requests are rejected.
    OPEN = "open"
    # Probing: trial requests are allowed to test recovery.
    HALF_OPEN = "half_open"
class CircuitBreaker:
    """Circuit breaker for the NVIDIA API (P3-1).

    Stops sending requests once the API has failed repeatedly, so that a
    failing upstream does not cascade into the callers.

    State transitions:
        CLOSED    -> (failure count >= failure_threshold)  -> OPEN
        OPEN      -> (recovery_timeout seconds elapsed)    -> HALF_OPEN
        HALF_OPEN -> (success recorded)                    -> CLOSED
        HALF_OPEN -> (failure recorded)                    -> OPEN

    Elapsed time is measured with ``time.monotonic()`` so that wall-clock
    adjustments cannot shorten or extend the open period.
    """

    def __init__(
        self,
        failure_threshold: int = CIRCUIT_BREAKER_FAILURE_THRESHOLD,
        recovery_timeout: float = CIRCUIT_BREAKER_RECOVERY_TIMEOUT,
        half_open_max_requests: int = CIRCUIT_BREAKER_HALF_OPEN_REQUESTS,
    ):
        """Create a breaker in the CLOSED state.

        Args:
            failure_threshold: Recorded failures (without an intervening
                success) that trip the breaker.
            recovery_timeout: Seconds to stay OPEN before moving to HALF_OPEN.
            half_open_max_requests: Trial requests admitted per HALF_OPEN
                window. Values below 1 are treated as 1 so the breaker can
                always probe for recovery.
        """
        self._state = CircuitState.CLOSED
        self._failure_count = 0
        self._last_failure_time: float = 0
        self._failure_threshold = failure_threshold
        self._recovery_timeout = recovery_timeout
        self._half_open_max_requests = max(1, half_open_max_requests)
        # Trial requests admitted in the current HALF_OPEN window, and the
        # monotonic timestamp at which that window started.
        self._half_open_admitted = 0
        self._half_open_window_start: float = 0

    @property
    def state(self) -> CircuitState:
        """Return the current state, first applying the OPEN -> HALF_OPEN timeout."""
        if self._state is CircuitState.OPEN:
            elapsed = time.monotonic() - self._last_failure_time
            if elapsed >= self._recovery_timeout:
                self._transition_to(CircuitState.HALF_OPEN)
        return self._state

    def _transition_to(self, new_state: CircuitState) -> None:
        """Switch to ``new_state``; count and log the change if the state differs."""
        old_state = self._state
        if old_state != new_state:
            NVIDIA_CIRCUIT_BREAKER_STATE.labels(
                from_state=old_state.value, to_state=new_state.value
            ).inc()
            logger.info(
                "circuit_breaker_state_change",
                from_state=old_state.value,
                to_state=new_state.value,
            )
            # Every real transition starts with a fresh trial budget.
            self._half_open_admitted = 0
        self._state = new_state

    def can_execute(self) -> bool:
        """Return whether a request may be sent now.

        CLOSED always admits and OPEN always rejects. HALF_OPEN admits at most
        ``half_open_max_requests`` trial requests per window; each admitted
        call consumes one slot, so callers should follow up with
        ``record_success()`` or ``record_failure()``.
        """
        state = self.state  # also applies the OPEN -> HALF_OPEN timeout
        if state is CircuitState.CLOSED:
            return True
        if state is CircuitState.OPEN:
            return False

        # HALF_OPEN: bound the number of concurrent trial requests.
        now = time.monotonic()
        if self._half_open_admitted >= self._half_open_max_requests:
            if now - self._half_open_window_start < self._recovery_timeout:
                return False
            # Admitted trials never reported an outcome; refill the budget so
            # the breaker cannot stay stuck in HALF_OPEN.
            self._half_open_admitted = 0
        if self._half_open_admitted == 0:
            self._half_open_window_start = now
        self._half_open_admitted += 1
        return True

    def record_success(self) -> None:
        """Record a successful request: close a half-open breaker and reset the failure count."""
        if self._state is CircuitState.HALF_OPEN:
            self._transition_to(CircuitState.CLOSED)
        self._failure_count = 0

    def record_failure(self) -> None:
        """Record a failed request and open the breaker when warranted."""
        self._failure_count += 1
        self._last_failure_time = time.monotonic()
        if self._state is CircuitState.HALF_OPEN:
            # A failed trial request re-opens the breaker immediately.
            self._transition_to(CircuitState.OPEN)
        elif self._failure_count >= self._failure_threshold:
            # Failure threshold reached: trip the breaker.
            self._transition_to(CircuitState.OPEN)
# 高風險 Tool 清單 (需要 HITL 審核)
HIGH_RISK_TOOLS: set[str] = {
"delete_pod",
@@ -152,9 +281,12 @@ class NvidiaProvider:
Args:
api_key: NVIDIA API Key (預設從 settings 取得)
2026-03-29 ogt: P3-1 加入 Circuit Breaker
"""
self._api_key = api_key or settings.NVIDIA_API_KEY
self._client: httpx.AsyncClient | None = None
self._circuit_breaker = CircuitBreaker() # P3-1: Circuit Breaker
async def _get_client(self) -> httpx.AsyncClient:
"""取得或建立 HTTP Client"""
@@ -193,6 +325,7 @@ class NvidiaProvider:
NvidiaProviderResult: 包含驗證後的 Tool Calls
2026-03-29 ogt: P1-1/P1-2 修復 - 加入 OTEL + Langfuse 追蹤
2026-03-29 ogt: P3-1/P3-2/P3-3 - Circuit Breaker + 指數退避 + Prometheus
"""
start_time = time.perf_counter()
@@ -202,9 +335,24 @@ class NvidiaProvider:
span.set_attribute("ai.model", model)
span.set_attribute("ai.tool_count", len(tools))
# P3-1: Circuit Breaker 檢查
if not self._circuit_breaker.can_execute():
span.set_attribute("ai.error", "circuit_breaker_open")
NVIDIA_REQUESTS_TOTAL.labels(status="circuit_open", tool_name="").inc()
logger.warning(
"nvidia_circuit_breaker_open",
state=self._circuit_breaker.state.value,
)
return NvidiaProviderResult(
success=False,
error="Circuit Breaker OPEN - NVIDIA API 暫時不可用",
fallback_triggered=True,
)
# 檢查 API Key
if not self._api_key:
span.set_attribute("ai.error", "api_key_not_set")
NVIDIA_REQUESTS_TOTAL.labels(status="error", tool_name="").inc()
return NvidiaProviderResult(
success=False,
error="NVIDIA_API_KEY 未設定",
@@ -240,13 +388,14 @@ class NvidiaProvider:
metadata={"model": model, "tool_count": len(tools)},
) as langfuse_ctx:
# 執行請求 (含重試)
# 執行請求 (含 P3-2 指數退避重試)
response_data: dict | None = None
last_error: str | None = None
for attempt in range(MAX_RETRIES + 1):
try:
response_data = await self._send_request(request_body)
self._circuit_breaker.record_success() # P3-1
break
except Exception as e:
last_error = str(e)
@@ -258,15 +407,26 @@ class NvidiaProvider:
error=last_error,
)
if attempt == MAX_RETRIES:
self._circuit_breaker.record_failure() # P3-1
break
# P3-2: 指數退避 (含 jitter)
delay = min(
RETRY_BASE_DELAY * (RETRY_EXPONENTIAL_BASE ** attempt),
RETRY_MAX_DELAY,
)
jitter = random.uniform(0, delay * 0.1) # 10% jitter
await asyncio.sleep(delay + jitter)
latency_ms = (time.perf_counter() - start_time) * 1000
latency_seconds = latency_ms / 1000
span.set_attribute("ai.latency_ms", round(latency_ms, 2))
NVIDIA_LATENCY_HISTOGRAM.observe(latency_seconds) # P3-3
# 請求失敗
if response_data is None:
span.set_attribute("ai.success", False)
span.set_attribute("ai.error", last_error or "unknown")
NVIDIA_REQUESTS_TOTAL.labels(status="error", tool_name="").inc()
logger.error(
"nvidia_request_failed",
error=last_error,
@@ -335,6 +495,13 @@ class NvidiaProvider:
},
)
# P3-3: Prometheus 成功指標
for tc in tool_calls:
if tc.valid and tc.tool_name:
NVIDIA_REQUESTS_TOTAL.labels(
status="success", tool_name=tc.tool_name
).inc()
logger.info(
"nvidia_tool_call_completed",
success=True,

View File

@@ -471,3 +471,132 @@ class TestRateLimiterIntegration:
assert "nvidia" in COST_LIMITS
assert COST_LIMITS["nvidia"]["total_cost_usd"] == 0.0 # 免費
class TestCircuitBreaker:
    """P3-1: Circuit Breaker tests."""

    def test_circuit_breaker_initial_state(self):
        """A fresh breaker is CLOSED and admits requests."""
        from src.services.nvidia_provider import CircuitBreaker, CircuitState

        breaker = CircuitBreaker()

        assert breaker.state == CircuitState.CLOSED
        assert breaker.can_execute()

    def test_circuit_breaker_opens_after_failures(self):
        """The breaker opens once the failure threshold is reached."""
        from src.services.nvidia_provider import CircuitBreaker, CircuitState

        breaker = CircuitBreaker(failure_threshold=3)

        # The first two failures stay below the threshold.
        for _ in range(2):
            breaker.record_failure()
            assert breaker.state == CircuitState.CLOSED

        # The third failure trips the breaker.
        breaker.record_failure()
        assert breaker.state == CircuitState.OPEN
        assert not breaker.can_execute()

    def test_circuit_breaker_success_resets_count(self):
        """A success resets the failure count."""
        from src.services.nvidia_provider import CircuitBreaker, CircuitState

        breaker = CircuitBreaker(failure_threshold=3)

        for _ in range(2):
            breaker.record_failure()
        breaker.record_success()  # reset

        # Three more failures are needed to trip the breaker.
        for _ in range(2):
            breaker.record_failure()
        assert breaker.state == CircuitState.CLOSED

        breaker.record_failure()
        assert breaker.state == CircuitState.OPEN

    def test_circuit_breaker_half_open_recovery(self):
        """A success while half-open closes the breaker."""
        from src.services.nvidia_provider import CircuitBreaker, CircuitState
        import time

        breaker = CircuitBreaker(failure_threshold=1, recovery_timeout=0.1)

        breaker.record_failure()  # trips the breaker
        assert breaker.state == CircuitState.OPEN

        time.sleep(0.15)  # wait past the recovery timeout

        # Reading the state triggers the HALF_OPEN transition.
        assert breaker.state == CircuitState.HALF_OPEN
        assert breaker.can_execute()

        # A success returns the breaker to CLOSED.
        breaker.record_success()
        assert breaker.state == CircuitState.CLOSED

    def test_circuit_breaker_half_open_failure(self):
        """A failure while half-open re-opens the breaker."""
        from src.services.nvidia_provider import CircuitBreaker, CircuitState
        import time

        breaker = CircuitBreaker(failure_threshold=1, recovery_timeout=0.1)

        breaker.record_failure()
        assert breaker.state == CircuitState.OPEN

        time.sleep(0.15)
        assert breaker.state == CircuitState.HALF_OPEN

        # The failed trial re-opens the breaker.
        breaker.record_failure()
        assert breaker.state == CircuitState.OPEN

    def test_provider_has_circuit_breaker(self):
        """NvidiaProvider owns a circuit breaker."""
        from src.services.nvidia_provider import NvidiaProvider

        nvidia = NvidiaProvider()

        assert hasattr(nvidia, "_circuit_breaker")
class TestPrometheusMetrics:
    """P3-3: Prometheus metrics tests."""

    def test_metrics_defined(self):
        """The three Prometheus metrics are defined by the provider module."""
        from src.services.nvidia_provider import (
            NVIDIA_REQUESTS_TOTAL,
            NVIDIA_LATENCY_HISTOGRAM,
            NVIDIA_CIRCUIT_BREAKER_STATE,
        )

        exported = (
            NVIDIA_REQUESTS_TOTAL,
            NVIDIA_LATENCY_HISTOGRAM,
            NVIDIA_CIRCUIT_BREAKER_STATE,
        )
        for metric in exported:
            assert metric is not None

    def test_metrics_labels(self):
        """The request counter accepts the status / tool_name labels."""
        from src.services.nvidia_provider import NVIDIA_REQUESTS_TOTAL

        # Creating a labelled child must succeed.
        labelled = NVIDIA_REQUESTS_TOTAL.labels(status="test", tool_name="test_tool")

        assert labelled is not None
class TestExponentialBackoff:
    """P3-2: exponential backoff tests."""

    def test_backoff_constants_defined(self):
        """The backoff constants are defined with the expected values."""
        from src.services.nvidia_provider import (
            RETRY_BASE_DELAY,
            RETRY_MAX_DELAY,
            RETRY_EXPONENTIAL_BASE,
        )

        actual = (RETRY_BASE_DELAY, RETRY_MAX_DELAY, RETRY_EXPONENTIAL_BASE)

        assert actual == (1.0, 30.0, 2)