# NVIDIA Nemotron Tool Calling alert rules
# =================================================
# Version: v1.0
# Created: 2026-03-29
# ADR: ADR-036
# Purpose: monitor the NVIDIA NIM API and the Circuit Breaker state
#
# Deployment:
#   kubectl apply -f k8s/monitoring/nvidia-alerts.yaml
# =================================================
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: nvidia-tool-calling-rules
  namespace: monitoring
  labels:
    app: prometheus
    release: prometheus
spec:
  groups:
    # =========================================================================
    # NVIDIA Tool Calling alert group
    # =========================================================================
    - name: nvidia_tool_calling
      interval: 30s
      rules:
        # -------------------------------------------------------------------
        # Circuit Breaker opened (P1)
        # -------------------------------------------------------------------
        # The metric is a cumulative counter, so comparing it to 0 would keep
        # the alert firing forever after the first transition. increase()
        # limits the condition to transitions seen in the last 5 minutes.
        # NOTE(review): if the service exports a "current state" gauge, that
        # would track "breaker is open right now" more precisely - confirm.
        - alert: NvidiaCircuitBreakerOpen
          expr: increase(nvidia_circuit_breaker_state_changes_total{to_state="open"}[5m]) > 0
          for: 1m
          labels:
            severity: warning
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Circuit Breaker 已斷路"
            description: "Circuit Breaker 已切換至 OPEN 狀態,API 請求將被拒絕"
            runbook: "docs/runbooks/NVIDIA-CIRCUIT-BREAKER.md"
            auto_repair: "fallback_to_gemini"

        # -------------------------------------------------------------------
        # Tool Calling high latency (P2)
        # -------------------------------------------------------------------
        # histogram_quantile needs per-bucket rates aggregated by `le`;
        # feeding it raw cumulative buckets yields a lifetime quantile and
        # mixes unrelated label sets.
        # NOTE(review): the alert can only fire if the histogram has a finite
        # bucket boundary above 45s - confirm against the instrumentation.
        - alert: NvidiaToolCallingHighLatency
          expr: |
            histogram_quantile(
              0.95,
              sum by (le) (rate(nvidia_tool_call_latency_seconds_bucket[5m]))
            ) > 45
          for: 5m
          labels:
            severity: warning
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Tool Calling P95 延遲 > 45s"
            description: "Tool Calling 請求延遲過高,可能影響用戶體驗"
            auto_repair: "switch_model"

        # -------------------------------------------------------------------
        # Tool Calling high error rate (P0)
        # -------------------------------------------------------------------
        # Both sides are summed so the division is a true error/total ratio.
        # Without aggregation, one-to-one label matching pairs the
        # status="error" series only with itself and the ratio is always 1.
        # With zero traffic the result is NaN, which does not satisfy "> 0.1".
        - alert: NvidiaToolCallingHighErrorRate
          expr: |
            sum(rate(nvidia_tool_call_requests_total{status="error"}[5m]))
              /
            sum(rate(nvidia_tool_call_requests_total[5m]))
              > 0.1
          for: 5m
          labels:
            severity: critical
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Tool Calling 錯誤率 > 10%"
            description: "Tool Calling 錯誤率過高,可能是 API 問題或網路問題"
            auto_repair: "fallback_to_gemini"

        # -------------------------------------------------------------------
        # Circuit Breaker half-open recovery notice (Info)
        # -------------------------------------------------------------------
        # Uses increase() for the same reason as NvidiaCircuitBreakerOpen:
        # the raw counter never returns to 0.
        - alert: NvidiaCircuitBreakerHalfOpen
          expr: increase(nvidia_circuit_breaker_state_changes_total{to_state="half_open"}[5m]) > 0
          for: 30s
          labels:
            severity: info
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Circuit Breaker 正在恢復測試"
            description: "Circuit Breaker 進入 HALF_OPEN 狀態,正在測試 API 是否恢復"

        # -------------------------------------------------------------------
        # Circuit Breaker recovered notice (Info)
        # -------------------------------------------------------------------
        # Fires on a recent transition to "closed", but only if the breaker
        # has opened at least once. `on()` is required because the two sides
        # carry different to_state label values and would otherwise never
        # match under the default all-label matching of `and`.
        # NOTE(review): `on()` also ignores instance/pod labels; with several
        # replicas consider `on(instance)` - confirm which labels exist.
        - alert: NvidiaCircuitBreakerClosed
          expr: |
            increase(nvidia_circuit_breaker_state_changes_total{to_state="closed"}[5m]) > 0
            and on()
            nvidia_circuit_breaker_state_changes_total{to_state="open"} > 0
          for: 30s
          labels:
            severity: info
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Circuit Breaker 已恢復正常"
            description: "Circuit Breaker 已從斷路狀態恢復"

        # -------------------------------------------------------------------
        # No-requests alert (service may be unhealthy)
        # -------------------------------------------------------------------
        # The rate is summed across all series so that an idle label
        # combination (e.g. no errors) does not fire while other traffic is
        # flowing. The `unless` clause suppresses the alert whenever a
        # matching container reports a running status of 0.
        # NOTE(review): if the metric has no series at all, this expression
        # returns nothing and the alert stays silent; add an absent() rule if
        # that case must be detected.
        - alert: NvidiaNoRequests
          expr: |
            sum(rate(nvidia_tool_call_requests_total[15m])) == 0
            unless on()
            (kube_pod_container_status_running{pod=~"awoooi-api.*"} == 0)
          for: 30m
          labels:
            severity: warning
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Tool Calling 30 分鐘內無請求"
            description: "可能是整合問題或服務未被使用"