---
# SigNoz alert rule configuration
# Owner: DevOps Commander
# Version: v1.0
# Date: 2026-03-29
# ADR: ADR-037 (monitoring enhancement architecture)
#
# Deployment target: 192.168.0.188 SigNoz (Docker)
# Webhook: http://awoooi-api.awoooi-prod:8000/api/v1/webhooks/signoz

groups:
  # =========================================================================
  # API error rate alerts
  # =========================================================================
  - name: api_errors
    rules:
      # Ratio of awoooi-api spans with a 5xx status_code to all awoooi-api
      # spans (5m rate); fires when it stays above 5% for 5 minutes.
      - alert: APIHighErrorRate
        expr: |
          sum(rate(signoz_spans_total{
            service_name="awoooi-api",
            status_code=~"5.."
          }[5m])) by (service_name)
          /
          sum(rate(signoz_spans_total{
            service_name="awoooi-api"
          }[5m])) by (service_name)
          > 0.05
        for: 5m
        labels:
          severity: critical
          source: signoz
          team: backend
        annotations:
          summary: "API 錯誤率 > 5%"
          description: "服務 {{ $labels.service_name }} 錯誤率: {{ $value | humanizePercentage }}"
          runbook_url: "https://awoooi.internal/runbooks/api-error-rate"

  # =========================================================================
  # Latency alerts
  # =========================================================================
  # NOTE(review): the thresholds below (2 and 1) are reported as seconds in
  # the annotations; confirm signoz_spans_duration_bucket is recorded in
  # seconds rather than milliseconds.
  - name: latency
    rules:
      # P99 of awoooi-api span duration above 2 for 5 minutes.
      - alert: APIHighLatencyP99
        expr: |
          histogram_quantile(0.99,
            sum(rate(signoz_spans_duration_bucket{
              service_name="awoooi-api"
            }[5m])) by (le, service_name)
          ) > 2
        for: 5m
        labels:
          severity: warning
          source: signoz
          team: backend
        annotations:
          summary: "API P99 延遲 > 2s"
          description: "服務 {{ $labels.service_name }} P99: {{ $value }}s"

      # P95 of awoooi-api span duration above 1 for 10 minutes.
      - alert: APIHighLatencyP95
        expr: |
          histogram_quantile(0.95,
            sum(rate(signoz_spans_duration_bucket{
              service_name="awoooi-api"
            }[5m])) by (le, service_name)
          ) > 1
        for: 10m
        labels:
          severity: warning
          source: signoz
          team: backend
        annotations:
          summary: "API P95 延遲 > 1s"
          description: "服務 {{ $labels.service_name }} P95: {{ $value }}s"

  # =========================================================================
  # Trace anomaly alerts
  # =========================================================================
  - name: traces
    rules:
      # Fires when no spans have been counted for 15 minutes.
      # `or vector(0)` substitutes 0 when the selector matches no series at
      # all; without it the comparison runs on an empty result and the alert
      # can never fire once signoz_spans_total disappears entirely.
      - alert: NoTracesReceived
        expr: |
          (
            sum(rate(signoz_spans_total[15m]))
            or vector(0)
          ) == 0
        for: 15m
        labels:
          severity: warning
          source: signoz
          team: platform
        annotations:
          summary: "15 分鐘內無 Trace 數據"
          description: "可能是 OTEL Collector 或應用程式問題,請檢查 192.168.0.188:24318 端點"

      # Share of export attempts that failed. The denominator is
      # sent + failed so the value is a true fraction of attempted spans
      # (and stays finite when nothing was sent successfully).
      - alert: HighSpanDropRate
        expr: |
          sum(rate(otelcol_exporter_send_failed_spans[5m]))
          /
          (
            sum(rate(otelcol_exporter_sent_spans[5m]))
            +
            sum(rate(otelcol_exporter_send_failed_spans[5m]))
          )
          > 0.01
        for: 5m
        labels:
          severity: warning
          source: signoz
          team: platform
        annotations:
          summary: "Span 丟棄率 > 1%"
          description: "OTEL Collector 可能有性能問題或目標不可達"

      # ADR-037 Phase E: long-running trace alert.
      # Only spans whose status_code does not match 5xx are considered.
      # NOTE(review): this groups by trace_id, which implies one series per
      # trace; confirm signoz_spans_duration is exported with a trace_id
      # label and as a plain (non-histogram) series, and that a per-trace
      # series can stay above the threshold for the full `for: 1m` window.
      - alert: LongRunningTrace
        expr: |
          max(signoz_spans_duration{
            service_name="awoooi-api",
            status_code!~"5.."
          }) by (trace_id, operation)
          > 10
        for: 1m
        labels:
          severity: critical
          source: signoz
          team: backend
        annotations:
          summary: "Trace 執行超過 10 秒"
          description: "操作 {{ $labels.operation }} 執行時間 {{ $value }}s (trace: {{ $labels.trace_id }})"
          runbook_url: "https://awoooi.internal/runbooks/long-trace"

  # =========================================================================
  # NVIDIA Nemotron monitoring (ADR-036)
  # =========================================================================
  - name: nvidia_api
    rules:
      # P95 duration of awoooi-api spans whose operation contains "nvidia".
      - alert: NVIDIAHighLatency
        expr: |
          histogram_quantile(0.95,
            sum(rate(signoz_spans_duration_bucket{
              service_name="awoooi-api",
              operation=~".*nvidia.*"
            }[5m])) by (le)
          ) > 5
        for: 5m
        labels:
          severity: warning
          source: signoz
          team: ai
        annotations:
          summary: "NVIDIA API P95 延遲 > 5s"
          description: "Tool Calling 可能有性能問題"

      # 5xx share of awoooi-api spans whose operation contains "nvidia".
      - alert: NVIDIAHighErrorRate
        expr: |
          sum(rate(signoz_spans_total{
            service_name="awoooi-api",
            operation=~".*nvidia.*",
            status_code=~"5.."
          }[5m]))
          /
          sum(rate(signoz_spans_total{
            service_name="awoooi-api",
            operation=~".*nvidia.*"
          }[5m]))
          > 0.1
        for: 5m
        labels:
          severity: warning
          source: signoz
          team: ai
        annotations:
          summary: "NVIDIA API 錯誤率 > 10%"
          description: "可能需要 Fallback 到 Ollama"