- k8s/monitoring/alert-chain-monitor.yaml - k8s/monitoring/database-alerts.yaml - ops/grafana/ Grafana dashboards - ops/signoz/ SignOz 配置 - ops/scripts/ 維運腳本 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
166 lines
5.2 KiB
YAML
166 lines
5.2 KiB
YAML
# SigNoz alert rule configuration
# Owner: DevOps Commander
# Version: v1.0
# Date: 2026-03-29
# ADR: ADR-037 (monitoring enhancement architecture)
#
# Deployment target: 192.168.0.188 SigNoz (Docker)
# Webhook: http://awoooi-api.awoooi-prod:8000/api/v1/webhooks/signoz
groups:
# =========================================================================
# API error-rate alerts
# =========================================================================
- name: api_errors
  rules:
  # Ratio of awoooi-api spans whose status_code matches "5.." to all
  # awoooi-api spans, both as 5-minute rates. Fires once the ratio has
  # stayed above 0.05 for 5 minutes.
  - alert: APIHighErrorRate
    expr: |
      sum(rate(signoz_spans_total{
        service_name="awoooi-api",
        status_code=~"5.."
      }[5m])) by (service_name)
      /
      sum(rate(signoz_spans_total{
        service_name="awoooi-api"
      }[5m])) by (service_name)
      > 0.05
    for: 5m
    labels:
      severity: critical
      source: signoz
      team: backend
    annotations:
      summary: "API 錯誤率 > 5%"
      description: "服務 {{ $labels.service_name }} 錯誤率: {{ $value | humanizePercentage }}"
      runbook_url: "https://awoooi.internal/runbooks/api-error-rate"
# =========================================================================
# Latency alerts
# =========================================================================
- name: latency
  rules:
  # 99th-percentile span duration for awoooi-api over a 5-minute rate
  # window; fires after the quantile has stayed above 2 for 5 minutes.
  # NOTE(review): the threshold is presumably in seconds (the description
  # appends "s") - confirm the unit of signoz_spans_duration_bucket.
  - alert: APIHighLatencyP99
    expr: |
      histogram_quantile(0.99,
        sum(rate(signoz_spans_duration_bucket{
          service_name="awoooi-api"
        }[5m])) by (le, service_name)
      ) > 2
    for: 5m
    labels:
      severity: warning
      source: signoz
      team: backend
    annotations:
      summary: "API P99 延遲 > 2s"
      description: "服務 {{ $labels.service_name }} P99: {{ $value }}s"

  # 95th-percentile variant of the rule above: lower threshold (1) with a
  # longer hold time (10 minutes).
  - alert: APIHighLatencyP95
    expr: |
      histogram_quantile(0.95,
        sum(rate(signoz_spans_duration_bucket{
          service_name="awoooi-api"
        }[5m])) by (le, service_name)
      ) > 1
    for: 10m
    labels:
      severity: warning
      source: signoz
      team: backend
    annotations:
      summary: "API P95 延遲 > 1s"
      description: "服務 {{ $labels.service_name }} P95: {{ $value }}s"
# =========================================================================
# Trace anomaly alerts
# =========================================================================
- name: traces
  rules:
  # Fires when the overall span rate has been zero for 15 minutes.
  # `sum(rate(...))` yields an EMPTY result (not 0) when no
  # signoz_spans_total series exist in the window, and an empty result can
  # never satisfy `== 0`. `or vector(0)` substitutes a literal 0 in that
  # case so the alert also fires when the metric is missing entirely.
  - alert: NoTracesReceived
    expr: |
      (
        sum(rate(signoz_spans_total[15m]))
        or vector(0)
      ) == 0
    for: 15m
    labels:
      severity: warning
      source: signoz
      team: platform
    annotations:
      summary: "15 分鐘內無 Trace 數據"
      description: "可能是 OTEL Collector 或應用程式問題,請檢查 192.168.0.188:24318 端點"

  # Share of spans the exporter failed to send, over a 5-minute rate
  # window. The denominator is sent + failed (all attempted spans) so the
  # value is a true fraction in [0, 1] that matches the "> 1%" summary;
  # dividing by sent spans alone overstates the rate and becomes +Inf when
  # nothing is sent successfully.
  - alert: HighSpanDropRate
    expr: |
      sum(rate(otelcol_exporter_send_failed_spans[5m]))
      /
      (
        sum(rate(otelcol_exporter_sent_spans[5m]))
        + sum(rate(otelcol_exporter_send_failed_spans[5m]))
      )
      > 0.01
    for: 5m
    labels:
      severity: warning
      source: signoz
      team: platform
    annotations:
      summary: "Span 丟棄率 > 1%"
      description: "OTEL Collector 可能有性能問題或目標不可達"

  # ADR-037 Phase E: long-running trace alert.
  # Takes the maximum signoz_spans_duration per (trace_id, operation) for
  # awoooi-api spans whose status_code does not match "5..", and fires
  # when it has stayed above 10 for 1 minute.
  # NOTE(review): grouping by trace_id assumes the metric carries a
  # per-trace label; if it does, series cardinality is unbounded, and if
  # it does not, this rule never matches - confirm against the exporter.
  - alert: LongRunningTrace
    expr: |
      max(signoz_spans_duration{
        service_name="awoooi-api",
        status_code!~"5.."
      }) by (trace_id, operation) > 10
    for: 1m
    labels:
      severity: critical
      source: signoz
      team: backend
    annotations:
      summary: "Trace 執行超過 10 秒"
      description: "操作 {{ $labels.operation }} 執行時間 {{ $value }}s (trace: {{ $labels.trace_id }})"
      runbook_url: "https://awoooi.internal/runbooks/long-trace"
# =========================================================================
# NVIDIA Nemotron monitoring (ADR-036)
# =========================================================================
- name: nvidia_api
  rules:
  # 95th-percentile duration of awoooi-api spans whose operation name
  # contains "nvidia", over a 5-minute rate window; fires after the
  # quantile has stayed above 5 for 5 minutes.
  - alert: NVIDIAHighLatency
    expr: |
      histogram_quantile(0.95,
        sum(rate(signoz_spans_duration_bucket{
          service_name="awoooi-api",
          operation=~".*nvidia.*"
        }[5m])) by (le)
      ) > 5
    for: 5m
    labels:
      severity: warning
      source: signoz
      team: ai
    annotations:
      summary: "NVIDIA API P95 延遲 > 5s"
      description: "Tool Calling 可能有性能問題"

  # Ratio of "nvidia" operation spans whose status_code matches "5.." to
  # all "nvidia" operation spans, both as 5-minute rates; fires once the
  # ratio has stayed above 0.1 for 5 minutes.
  - alert: NVIDIAHighErrorRate
    expr: |
      sum(rate(signoz_spans_total{
        service_name="awoooi-api",
        operation=~".*nvidia.*",
        status_code=~"5.."
      }[5m]))
      /
      sum(rate(signoz_spans_total{
        service_name="awoooi-api",
        operation=~".*nvidia.*"
      }[5m]))
      > 0.1
    for: 5m
    labels:
      severity: warning
      source: signoz
      team: ai
    annotations:
      summary: "NVIDIA API 錯誤率 > 10%"
      description: "可能需要 Fallback 到 Ollama"