Files
awoooi/ops/signoz/alerting/rules.yaml
OG T a5a6bd3408 feat(monitoring): K8s alert rules + Grafana dashboards + ops 腳本
- k8s/monitoring/alert-chain-monitor.yaml
- k8s/monitoring/database-alerts.yaml
- ops/grafana/ Grafana dashboards
- ops/signoz/ SignOz 配置
- ops/scripts/ 維運腳本

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-29 16:04:14 +08:00

166 lines
5.2 KiB
YAML

# SigNoz 告警規則配置
# 負責人: DevOps Commander
# 版本: v1.0
# 日期: 2026-03-29
# ADR: ADR-037 (監控增強架構)
#
# 部署目標: 192.168.0.188 SigNoz (Docker)
# Webhook: http://awoooi-api.awoooi-prod:8000/api/v1/webhooks/signoz
groups:
# =========================================================================
# API error-rate alerts
# =========================================================================
- name: api_errors
  rules:
  # Ratio of awoooi-api spans whose status_code matches 5.. to all
  # awoooi-api spans, both as 5m rates grouped by service_name.
  # Fires once the ratio has stayed above 0.05 (5%) for 5 minutes.
  - alert: APIHighErrorRate
    expr: |
      sum(rate(signoz_spans_total{
        service_name="awoooi-api",
        status_code=~"5.."
      }[5m])) by (service_name)
      /
      sum(rate(signoz_spans_total{
        service_name="awoooi-api"
      }[5m])) by (service_name)
      > 0.05
    for: 5m
    labels:
      severity: critical
      source: signoz
      team: backend
    annotations:
      summary: "API 錯誤率 > 5%"
      # $value is the raw ratio; humanizePercentage renders it as a percent.
      description: "服務 {{ $labels.service_name }} 錯誤率: {{ $value | humanizePercentage }}"
      runbook_url: "https://awoooi.internal/runbooks/api-error-rate"
# =========================================================================
# Latency alerts
# =========================================================================
- name: latency
  rules:
  # NOTE(review): both thresholds below are compared directly against the
  # histogram_quantile result and the annotations print the value with an
  # "s" suffix, so the buckets are presumably in seconds — confirm against
  # the unit actually used by signoz_spans_duration_bucket.

  # P99 of awoooi-api span duration (5m rate) above 2 for 5 minutes.
  - alert: APIHighLatencyP99
    expr: |
      histogram_quantile(0.99,
        sum(rate(signoz_spans_duration_bucket{
          service_name="awoooi-api"
        }[5m])) by (le, service_name)
      ) > 2
    for: 5m
    labels:
      severity: warning
      source: signoz
      team: backend
    annotations:
      summary: "API P99 延遲 > 2s"
      description: "服務 {{ $labels.service_name }} P99: {{ $value }}s"
  # P95 of awoooi-api span duration (5m rate) above 1 for 10 minutes.
  - alert: APIHighLatencyP95
    expr: |
      histogram_quantile(0.95,
        sum(rate(signoz_spans_duration_bucket{
          service_name="awoooi-api"
        }[5m])) by (le, service_name)
      ) > 1
    for: 10m
    labels:
      severity: warning
      source: signoz
      team: backend
    annotations:
      summary: "API P95 延遲 > 1s"
      description: "服務 {{ $labels.service_name }} P95: {{ $value }}s"
# =========================================================================
# Trace anomaly alerts
# =========================================================================
- name: traces
  rules:
  # Fires when the total span rate over a 15m window has been zero for
  # 15 minutes.
  # `or vector(0)` is required: when no signoz_spans_total series exist in
  # the window, sum(rate(...)) yields an empty vector rather than 0, and a
  # bare `== 0` comparison would then never match — the alert would stay
  # silent in exactly the outage it is meant to detect.
  - alert: NoTracesReceived
    expr: |
      (sum(rate(signoz_spans_total[15m])) or vector(0)) == 0
    for: 15m
    labels:
      severity: warning
      source: signoz
      team: platform
    annotations:
      summary: "15 分鐘內無 Trace 數據"
      description: "可能是 OTEL Collector 或應用程式問題,請檢查 192.168.0.188:24318 端點"
  # Share of spans the collector's exporters failed to send, over 5m.
  # The denominator is every attempted span (sent + failed); dividing by
  # successfully sent spans alone overstates the rate and produces +Inf
  # when nothing gets through.
  - alert: HighSpanDropRate
    expr: |
      sum(rate(otelcol_exporter_send_failed_spans[5m]))
      /
      (
        sum(rate(otelcol_exporter_sent_spans[5m]))
        +
        sum(rate(otelcol_exporter_send_failed_spans[5m]))
      )
      > 0.01
    for: 5m
    labels:
      severity: warning
      source: signoz
      team: platform
    annotations:
      summary: "Span 丟棄率 > 1%"
      description: "OTEL Collector 可能有性能問題或目標不可達"
  # ADR-037 Phase E: long-running trace alert.
  # Takes the max signoz_spans_duration per (trace_id, operation) for
  # awoooi-api, excluding series whose status_code matches 5.., and fires
  # when it has been above 10 for 1 minute.
  # NOTE(review): grouping by trace_id presumes the metric carries a
  # per-trace label — confirm it exists and that its cardinality is
  # acceptable.
  - alert: LongRunningTrace
    expr: |
      max(signoz_spans_duration{
        service_name="awoooi-api",
        status_code!~"5.."
      }) by (trace_id, operation) > 10
    for: 1m
    labels:
      severity: critical
      source: signoz
      team: backend
    annotations:
      summary: "Trace 執行超過 10 秒"
      description: "操作 {{ $labels.operation }} 執行時間 {{ $value }}s (trace: {{ $labels.trace_id }})"
      runbook_url: "https://awoooi.internal/runbooks/long-trace"
# =========================================================================
# NVIDIA Nemotron monitoring (ADR-036)
# =========================================================================
- name: nvidia_api
  rules:
  # P95 duration (5m rate) of awoooi-api spans whose operation matches
  # .*nvidia.*, aggregated across all such operations (grouped by `le`
  # only). Fires after 5 minutes above 5.
  - alert: NVIDIAHighLatency
    expr: |
      histogram_quantile(0.95,
        sum(rate(signoz_spans_duration_bucket{
          service_name="awoooi-api",
          operation=~".*nvidia.*"
        }[5m])) by (le)
      ) > 5
    for: 5m
    labels:
      severity: warning
      source: signoz
      team: ai
    annotations:
      summary: "NVIDIA API P95 延遲 > 5s"
      description: "Tool Calling 可能有性能問題"
  # Ratio of NVIDIA-operation spans whose status_code matches 5.. to all
  # NVIDIA-operation spans (5m rates, no grouping labels). Fires after
  # 5 minutes above 0.1 (10%).
  - alert: NVIDIAHighErrorRate
    expr: |
      sum(rate(signoz_spans_total{
        service_name="awoooi-api",
        operation=~".*nvidia.*",
        status_code=~"5.."
      }[5m]))
      /
      sum(rate(signoz_spans_total{
        service_name="awoooi-api",
        operation=~".*nvidia.*"
      }[5m]))
      > 0.1
    for: 5m
    labels:
      severity: warning
      source: signoz
      team: ai
    annotations:
      summary: "NVIDIA API 錯誤率 > 10%"
      description: "可能需要 Fallback 到 Ollama"