服務註冊表: - 新增 nvidia-nemotron AI 服務 - 3 個 Prometheus metrics 定義 - 4 個告警規則 (circuit_breaker, timeout, error_rate, rate_limit) - fallback 策略 (nvidia → gemini → ollama) Alertmanager 規則: - NvidiaCircuitBreakerOpen (P1) - NvidiaToolCallingHighLatency (P2) - NvidiaToolCallingHighErrorRate (P0) - NvidiaCircuitBreakerHalfOpen (Info) - NvidiaCircuitBreakerClosed (Info) - NvidiaNoRequests (P3) 自動修復: - fallback_to_gemini - fallback_to_ollama - switch_model ADR: ADR-036 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
123 lines
4.9 KiB
YAML
123 lines
4.9 KiB
YAML
# NVIDIA Nemotron Tool Calling 告警規則
|
||
# =================================================
|
||
# 版本: v1.0
|
||
# 建立日期: 2026-03-29
|
||
# ADR: ADR-036
|
||
# 用途: 監控 NVIDIA NIM API + Circuit Breaker 狀態
|
||
#
|
||
# 部署方式:
|
||
# kubectl apply -f k8s/monitoring/nvidia-alerts.yaml
|
||
# =================================================
|
||
|
||
# PrometheusRule defining the alerts for the NVIDIA Nemotron tool-calling
# integration: circuit-breaker transitions, latency, error rate and traffic.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: nvidia-tool-calling-rules
  namespace: monitoring
  labels:
    app: prometheus
    release: prometheus
spec:
  groups:
    # =========================================================================
    # NVIDIA tool-calling alert group
    # =========================================================================
    - name: nvidia_tool_calling
      interval: 30s
      rules:
        # -------------------------------------------------------------------
        # Circuit breaker opened (P1)
        # -------------------------------------------------------------------
        # The metric is a monotonically increasing transition counter, so a
        # plain "> 0" comparison would keep firing forever after the first
        # trip. increase() limits the alert to transitions seen in the last
        # five minutes.
        # NOTE(review): if the exporter also exposes a current-state gauge,
        # alerting on that gauge would track a long-lived OPEN state better.
        - alert: NvidiaCircuitBreakerOpen
          expr: increase(nvidia_circuit_breaker_state_changes_total{to_state="open"}[5m]) > 0
          for: 1m
          labels:
            severity: warning
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Circuit Breaker 已斷路"
            description: "Circuit Breaker 已切換至 OPEN 狀態,API 請求將被拒絕"
            runbook: "docs/runbooks/NVIDIA-CIRCUIT-BREAKER.md"
            auto_repair: "fallback_to_gemini"

        # -------------------------------------------------------------------
        # Tool-calling high latency (P2)
        # -------------------------------------------------------------------
        # histogram_quantile() needs per-second bucket rates; feeding it the
        # raw cumulative buckets yields an all-time quantile. Buckets are
        # summed by "le" so one service-wide P95 is evaluated.
        - alert: NvidiaToolCallingHighLatency
          expr: |
            histogram_quantile(
              0.95,
              sum by (le) (rate(nvidia_tool_call_latency_seconds_bucket[5m]))
            ) > 45
          for: 5m
          labels:
            severity: warning
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Tool Calling P95 延遲 > 45s"
            description: "Tool Calling 請求延遲過高,可能影響用戶體驗"
            auto_repair: "switch_model"

        # -------------------------------------------------------------------
        # Tool-calling high error rate (P0)
        # -------------------------------------------------------------------
        # Both sides are aggregated with sum() before dividing. Without it,
        # one-to-one label matching pairs the status="error" series with
        # itself, so the ratio is always 1 (or NaN) instead of errors / total.
        - alert: NvidiaToolCallingHighErrorRate
          expr: |
            (
              sum(rate(nvidia_tool_call_requests_total{status="error"}[5m]))
              /
              sum(rate(nvidia_tool_call_requests_total[5m]))
            ) > 0.1
          for: 5m
          labels:
            severity: critical
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Tool Calling 錯誤率 > 10%"
            description: "Tool Calling 錯誤率過高,可能是 API 問題或網路問題"
            auto_repair: "fallback_to_gemini"

        # -------------------------------------------------------------------
        # Circuit breaker half-open, recovery probe in progress (Info)
        # -------------------------------------------------------------------
        # Uses increase() for the same reason as NvidiaCircuitBreakerOpen:
        # the raw counter never returns to zero.
        - alert: NvidiaCircuitBreakerHalfOpen
          expr: increase(nvidia_circuit_breaker_state_changes_total{to_state="half_open"}[5m]) > 0
          for: 30s
          labels:
            severity: info
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Circuit Breaker 正在恢復測試"
            description: "Circuit Breaker 進入 HALF_OPEN 狀態,正在測試 API 是否恢復"

        # -------------------------------------------------------------------
        # Circuit breaker closed again, recovered (Info)
        # -------------------------------------------------------------------
        # "and on()" is required: the two operands differ in their to_state
        # label, so a default "and" (which matches on every label) can never
        # find a pair and the alert would never fire.
        - alert: NvidiaCircuitBreakerClosed
          expr: |
            increase(nvidia_circuit_breaker_state_changes_total{to_state="closed"}[5m]) > 0
            and on()
            (nvidia_circuit_breaker_state_changes_total{to_state="open"} > 0)
          for: 30s
          labels:
            severity: info
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Circuit Breaker 已恢復正常"
            description: "Circuit Breaker 已從斷路狀態恢復"

        # -------------------------------------------------------------------
        # No requests (service may be unhealthy)
        # -------------------------------------------------------------------
        # The rate is summed over all series so that one idle label
        # combination (for example, no errors on a healthy service) does not
        # trigger the alert. It is suppressed while any matching awoooi-api
        # container reports not-running.
        - alert: NvidiaNoRequests
          expr: |
            sum(rate(nvidia_tool_call_requests_total[15m])) == 0
            unless on() (kube_pod_container_status_running{pod=~"awoooi-api.*"} == 0)
          for: 30m
          labels:
            severity: warning
            service: nvidia-nemotron
            owner: ai-team
          annotations:
            summary: "NVIDIA Tool Calling 30 分鐘內無請求"
            description: "可能是整合問題或服務未被使用"