feat(monitoring): K8s alert rules + Grafana dashboards + ops 腳本

- k8s/monitoring/alert-chain-monitor.yaml
- k8s/monitoring/database-alerts.yaml
- ops/grafana/ Grafana dashboards
- ops/signoz/ SignOz 配置
- ops/scripts/ 維運腳本

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-03-29 16:04:14 +08:00
parent 89e05e6ea2
commit a5a6bd3408
6 changed files with 2452 additions and 0 deletions

View File

@@ -0,0 +1,147 @@
# One-off migration Job: adds the Phase 18 columns and indexes to audit_logs.
apiVersion: batch/v1
kind: Job
metadata:
  name: migrate-phase18-audit-logs
  namespace: awoooi-prod
  labels:
    app: awoooi-migration
    phase: "18"
spec:
  ttlSecondsAfterFinished: 300  # finished Job is cleaned up after 5 minutes
  backoffLimit: 1
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: migrate
          image: postgres:15-alpine
          command:
            - /bin/sh
            - -c
            - |
              # Abort on the first failing command or unset variable so the
              # Job is marked Failed instead of printing a false success
              # banner after a connection or SQL error.
              set -eu
              : "${DATABASE_URL:?DATABASE_URL is not set}"
              echo "=========================================="
              echo "Phase 18 AuditLog Migration"
              echo "=========================================="
              # Split DATABASE_URL (expected to be provided through envFrom)
              # into the discrete parameters passed to psql. The value is
              # quoted so special characters are not word-split or globbed.
              # NOTE(review): the patterns assume user:pass@host:port/db with
              # an explicit port and no URL-encoded characters - confirm.
              DB_HOST=$(printf '%s\n' "$DATABASE_URL" | sed 's/.*@\([^:]*\):.*/\1/')
              DB_PORT=$(printf '%s\n' "$DATABASE_URL" | sed 's/.*:\([0-9]*\)\/.*/\1/')
              DB_NAME=$(printf '%s\n' "$DATABASE_URL" | sed 's/.*\/\([^?]*\).*/\1/')
              DB_USER=$(printf '%s\n' "$DATABASE_URL" | sed 's/.*\/\/\([^:]*\):.*/\1/')
              DB_PASS=$(printf '%s\n' "$DATABASE_URL" | sed 's/.*:\/\/[^:]*:\([^@]*\)@.*/\1/')
              echo "Connecting to: $DB_HOST:$DB_PORT/$DB_NAME"
              export PGPASSWORD="$DB_PASS"
              # Run the migration. ON_ERROR_STOP makes psql stop at the first
              # failing statement and exit non-zero; without it psql keeps
              # going and returns 0 for a script read from stdin.
              psql -v ON_ERROR_STOP=1 -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" <<'EOSQL'
              -- ADD COLUMN IF NOT EXISTS checks the table that is actually
              -- being altered, so a same-named table in another schema cannot
              -- cause a column to be skipped. PostgreSQL emits a NOTICE when
              -- a column already exists.
              -- 1. authorization_channel
              ALTER TABLE audit_logs ADD COLUMN IF NOT EXISTS authorization_channel VARCHAR(20);
              -- 2. retry_count
              ALTER TABLE audit_logs ADD COLUMN IF NOT EXISTS retry_count INTEGER DEFAULT 0 NOT NULL;
              -- 3. failure_classification
              ALTER TABLE audit_logs ADD COLUMN IF NOT EXISTS failure_classification VARCHAR(50);
              -- 4. source_approval_id
              ALTER TABLE audit_logs ADD COLUMN IF NOT EXISTS source_approval_id VARCHAR(36);
              -- 5. auto_repair_attempted
              ALTER TABLE audit_logs ADD COLUMN IF NOT EXISTS auto_repair_attempted BOOLEAN DEFAULT FALSE NOT NULL;
              -- 6. auto_repair_result
              ALTER TABLE audit_logs ADD COLUMN IF NOT EXISTS auto_repair_result TEXT;
              -- Create indexes
              CREATE INDEX IF NOT EXISTS ix_audit_authorization_channel ON audit_logs(authorization_channel);
              CREATE INDEX IF NOT EXISTS ix_audit_failure_classification ON audit_logs(failure_classification);
              CREATE INDEX IF NOT EXISTS ix_audit_source_approval_id ON audit_logs(source_approval_id);
              -- Verify: list the resulting columns
              SELECT column_name, data_type
              FROM information_schema.columns
              WHERE table_name = 'audit_logs'
              ORDER BY ordinal_position;
              EOSQL
              echo "=========================================="
              echo "Migration completed!"
              echo "=========================================="
          envFrom:
            - secretRef:
                name: awoooi-secrets
          resources:
            requests:
              memory: "64Mi"
              cpu: "100m"
            limits:
              memory: "128Mi"
              cpu: "200m"

View File

@@ -0,0 +1,183 @@
# AWOOOI alert-chain monitoring
# Owner: DevOps Commander
# Version: v1.0
# Date: 2026-03-29
# ADR: ADR-037 (monitoring enhancement architecture) Wave B.1
#
# Purpose: watch the health of the alert delivery chain so that the
# 2026-03-26 incident (a wrong path left the system without alerts)
# cannot happen again.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: alert-chain-monitor
  namespace: monitoring
  labels:
    release: prometheus
    app: prometheus
spec:
  groups:
    # =====================================================================
    # Alert-chain health
    # =====================================================================
    - name: alert_chain
      rules:
        # -------------------------------------------------------------
        # Alertmanager webhook chain failing
        # -------------------------------------------------------------
        - alert: AlertChainBroken_Alertmanager
          expr: |
            sum(rate(awoooi_webhook_requests_total{
              source="alertmanager",
              status!="success"
            }[5m])) / sum(rate(awoooi_webhook_requests_total{
              source="alertmanager"
            }[5m])) > 0.1
          for: 10m
          labels:
            severity: critical
            service: alert-chain
            team: platform
          annotations:
            summary: "Alertmanager Webhook 錯誤率 > 10%"
            description: "告警可能無法正確送達 AWOOOI API"
            runbook_url: "https://awoooi.internal/runbooks/alert-chain"
        # -------------------------------------------------------------
        # Sentry webhook chain failing
        # -------------------------------------------------------------
        - alert: AlertChainBroken_Sentry
          expr: |
            sum(rate(awoooi_webhook_requests_total{
              source="sentry",
              status!="success"
            }[5m])) / sum(rate(awoooi_webhook_requests_total{
              source="sentry"
            }[5m])) > 0.1
          for: 10m
          labels:
            severity: warning
            service: alert-chain
            team: platform
          annotations:
            summary: "Sentry Webhook 錯誤率 > 10%"
            description: "Sentry 錯誤可能無法正確處理"
        # -------------------------------------------------------------
        # SignOz webhook chain failing
        # -------------------------------------------------------------
        - alert: AlertChainBroken_SignOz
          expr: |
            sum(rate(awoooi_webhook_requests_total{
              source="signoz",
              status!="success"
            }[5m])) / sum(rate(awoooi_webhook_requests_total{
              source="signoz"
            }[5m])) > 0.1
          for: 10m
          labels:
            severity: warning
            service: alert-chain
            team: platform
          annotations:
            summary: "SignOz Webhook 錯誤率 > 10%"
            description: "SignOz 告警可能無法正確處理"
        # -------------------------------------------------------------
        # Alert chain fully broken (no alerts at all)
        # P1-1 fix: grouped by source (2026-03-29)
        # -------------------------------------------------------------
        - alert: NoAlertsReceived2Hours
          expr: |
            time() - max by (source)(awoooi_alert_chain_last_success_timestamp) > 7200
          for: 5m
          labels:
            severity: warning
            service: alert-chain
            team: platform
          annotations:
            summary: "2 小時內未收到任何告警"
            description: "可能是告警鏈路問題或系統異常穩定。請執行 Smoke Test: python ops/scripts/alert_chain_smoke_test.py"
        # -------------------------------------------------------------
        # Alert-chain health flag
        # -------------------------------------------------------------
        - alert: AlertChainUnhealthy
          expr: |
            awoooi_alert_chain_healthy == 0
          for: 5m
          labels:
            severity: critical
            service: alert-chain
            team: platform
          annotations:
            summary: "告警鏈路不健康"
            description: "{{ $labels.source }} 告警鏈路標記為不健康,最近處理失敗"
    # =====================================================================
    # Telegram notification monitoring
    # =====================================================================
    - name: telegram_notifications
      rules:
        - alert: TelegramNotificationsFailing
          expr: |
            sum(rate(awoooi_telegram_notifications_total{
              status="failed"
            }[15m])) / sum(rate(awoooi_telegram_notifications_total[15m])) > 0.3
          for: 15m
          labels:
            severity: warning
            service: telegram
            team: platform
          annotations:
            summary: "Telegram 通知失敗率 > 30%"
            description: "告警可能無法正確發送到 Telegram"
    # =====================================================================
    # Anomaly escalation frequency monitoring (ADR-037)
    # =====================================================================
    - name: anomaly_escalation
      rules:
        - alert: FrequentAnomalyEscalation
          # increase() yields the number of escalations over the window,
          # which is what the description promises ("more than 5 in the
          # past hour"). rate() is a per-second average, so "> 5" on it
          # would require more than 5 escalations every second.
          expr: |
            sum(increase(awoooi_anomaly_escalation_total{
              level=~"ESCALATE|PERMANENT_FIX"
            }[1h])) > 5
          for: 5m
          labels:
            severity: warning
            service: anomaly-counter
            team: backend
          annotations:
            summary: "頻繁異常升級告警"
            description: "過去 1 小時有超過 5 次異常升級到 ESCALATE 或 PERMANENT_FIX 級別"
        - alert: PermanentFixRequired
          # Any non-zero rate means at least one escalation in the window.
          expr: |
            sum(rate(awoooi_anomaly_escalation_total{
              level="PERMANENT_FIX"
            }[1h])) > 0
          for: 1m
          labels:
            severity: critical
            service: anomaly-counter
            team: backend
          annotations:
            summary: "需要永久修復的異常"
            description: "有異常升級到 PERMANENT_FIX 級別,需要根本修復而非重啟"
    # =====================================================================
    # Auto-repair monitoring
    # =====================================================================
    - name: auto_repair
      rules:
        - alert: AutoRepairLowSuccessRate
          expr: |
            awoooi_auto_repair_success_rate < 0.3
          for: 30m
          labels:
            severity: warning
            service: auto-repair
            team: backend
          annotations:
            summary: "自動修復成功率過低 (< 30%)"
            description: "動作 {{ $labels.action }} 的成功率只有 {{ $value | humanizePercentage }},建議檢查修復邏輯"

View File

@@ -0,0 +1,264 @@
# =============================================================================
# AWOOOI Database Alerts
# =============================================================================
# Owner: DevOps Commander
# Version: v1.0
# Date: 2026-03-29
# ADR: ADR-037 Phase B (Database Exporters)
#
# Alert targets: PostgreSQL (192.168.0.188:5432) + Redis (192.168.0.188:6380)
# =============================================================================
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: database-alerts
  namespace: monitoring
  labels:
    release: prometheus
    app: prometheus
spec:
  groups:
    # =====================================================================
    # PostgreSQL alerts
    # =====================================================================
    - name: postgresql
      rules:
        # -------------------------------------------------------------
        # Connection pool nearing its limit (> 80%)
        # NOTE(review): the expression compares an absolute number of
        # active connections, which matches "80%" only if
        # max_connections is 100 - confirm against the server setting.
        # -------------------------------------------------------------
        - alert: PostgreSQLConnectionPoolNearLimit
          expr: |
            sum(pg_stat_activity_count{state="active"}) by (datname) > 80
          for: 5m
          labels:
            severity: warning
            service: postgres
            team: infra
          annotations:
            summary: "PostgreSQL 活躍連接數過高"
            description: "Database {{ $labels.datname }} 活躍連接: {{ $value }}"
            runbook_url: "https://awoooi.internal/runbooks/postgres-connections"
        # -------------------------------------------------------------
        # Connection pool exhausted (> 95%)
        # NOTE(review): absolute count as well; same max_connections
        # assumption as the rule above.
        # -------------------------------------------------------------
        - alert: PostgreSQLConnectionPoolExhausted
          expr: |
            sum(pg_stat_activity_count{state="active"}) by (datname) > 95
          for: 2m
          labels:
            severity: critical
            service: postgres
            team: infra
          annotations:
            summary: "PostgreSQL 連接池即將耗盡"
            description: "Database {{ $labels.datname }} 活躍連接 > 95"
        # -------------------------------------------------------------
        # Too many slow queries
        # -------------------------------------------------------------
        - alert: PostgreSQLSlowQueries
          expr: pg_slow_queries > 5
          for: 5m
          labels:
            severity: warning
            service: postgres
            team: backend
          annotations:
            summary: "PostgreSQL 慢查詢數量過多"
            description: "User {{ $labels.usename }} 有 {{ $value }} 個慢查詢 (> 1s)"
        # -------------------------------------------------------------
        # Too many queries waiting on locks
        # -------------------------------------------------------------
        - alert: PostgreSQLLockWaiting
          expr: sum(pg_locks_waiting) > 10
          for: 2m
          labels:
            severity: warning
            service: postgres
            team: backend
          annotations:
            summary: "PostgreSQL 鎖等待過多"
            description: "{{ $value }} 個查詢正在等待鎖"
        # -------------------------------------------------------------
        # Table bloat (dead tuples > 20%)
        # -------------------------------------------------------------
        - alert: PostgreSQLTableBloat
          expr: pg_stat_user_tables_bloat_dead_tuple_ratio > 20
          for: 30m
          labels:
            severity: warning
            service: postgres
            team: infra
          annotations:
            summary: "PostgreSQL 表膨脹嚴重"
            description: "Table {{ $labels.relname }} dead tuple 比例: {{ $value }}%"
        # -------------------------------------------------------------
        # Database unreachable
        # -------------------------------------------------------------
        - alert: PostgreSQLDown
          expr: pg_up == 0
          for: 1m
          labels:
            severity: critical
            service: postgres
            team: infra
          annotations:
            summary: "PostgreSQL 無法連線"
            description: "PostgreSQL Exporter 無法連接到資料庫"
        # -------------------------------------------------------------
        # Long-running query (> 60s)
        # -------------------------------------------------------------
        - alert: PostgreSQLLongRunningQuery
          expr: pg_longest_query_seconds > 60
          for: 1m
          labels:
            severity: warning
            service: postgres
            team: backend
          annotations:
            summary: "PostgreSQL 長時間執行查詢"
            description: "User {{ $labels.usename }} 查詢已執行 {{ $value | humanizeDuration }}"
        # -------------------------------------------------------------
        # High rollback ratio (> 5%)
        # -------------------------------------------------------------
        - alert: PostgreSQLHighRollbackRate
          expr: pg_stat_database_transactions_rollback_ratio > 5
          for: 15m
          labels:
            severity: warning
            service: postgres
            team: backend
          annotations:
            summary: "PostgreSQL 事務回滾率過高"
            description: "Database {{ $labels.datname }} 回滾率: {{ $value }}%"
    # =====================================================================
    # Redis alerts
    # =====================================================================
    - name: redis
      rules:
        # -------------------------------------------------------------
        # Memory usage high (> 85%)
        # The "> 0" filter drops instances without a maxmemory limit;
        # their max_bytes is 0, and dividing by it yields +Inf, which
        # would keep the alert firing permanently.
        # -------------------------------------------------------------
        - alert: RedisMemoryHigh
          expr: |
            redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.85
          for: 5m
          labels:
            severity: warning
            service: redis
            team: infra
          annotations:
            summary: "Redis 記憶體使用 > 85%"
            description: "Redis 記憶體使用: {{ $value | humanizePercentage }}"
        # -------------------------------------------------------------
        # Memory about to run out (> 95%)
        # Same maxmemory guard as RedisMemoryHigh.
        # -------------------------------------------------------------
        - alert: RedisMemoryCritical
          expr: |
            redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.95
          for: 2m
          labels:
            severity: critical
            service: redis
            team: infra
          annotations:
            summary: "Redis 記憶體即將耗盡"
            description: "Redis 記憶體使用 > 95%"
        # -------------------------------------------------------------
        # Cache hit rate low (< 80%)
        # -------------------------------------------------------------
        - alert: RedisCacheHitRateLow
          expr: |
            rate(redis_keyspace_hits_total[5m])
            /
            (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))
            < 0.8
          for: 15m
          labels:
            severity: warning
            service: redis
            team: backend
          annotations:
            summary: "Redis 快取命中率過低"
            description: "命中率: {{ $value | humanizePercentage }}"
        # -------------------------------------------------------------
        # Too many client connections
        # -------------------------------------------------------------
        - alert: RedisConnectionsHigh
          expr: redis_connected_clients > 500
          for: 5m
          labels:
            severity: warning
            service: redis
            team: infra
          annotations:
            summary: "Redis 連接數過高"
            description: "連接數: {{ $value }}"
        # -------------------------------------------------------------
        # Frequent key evictions
        # -------------------------------------------------------------
        - alert: RedisEvictedKeys
          expr: rate(redis_evicted_keys_total[5m]) > 100
          for: 5m
          labels:
            severity: warning
            service: redis
            team: backend
          annotations:
            summary: "Redis Key 驅逐頻繁"
            description: "每秒驅逐 {{ $value }} 個 key"
        # -------------------------------------------------------------
        # Redis unreachable
        # -------------------------------------------------------------
        - alert: RedisDown
          expr: redis_up == 0
          for: 1m
          labels:
            severity: critical
            service: redis
            team: infra
          annotations:
            summary: "Redis 無法連線"
            description: "Redis Exporter 無法連接到 Redis"
        # -------------------------------------------------------------
        # Command latency high
        # Both counters are converted to 5m rates so the result is the
        # recent average seconds per command, not the average since the
        # process started. Aggregating by instance lets the two sides
        # match even when their label sets differ.
        # NOTE(review): assumes both series carry an "instance" label
        # and that the duration counter may have extra labels - confirm
        # against the deployed exporter.
        # -------------------------------------------------------------
        - alert: RedisLatencyHigh
          expr: |
            sum by (instance) (rate(redis_commands_duration_seconds_total[5m]))
            /
            sum by (instance) (rate(redis_commands_processed_total[5m]))
            > 0.01
          for: 5m
          labels:
            severity: warning
            service: redis
            team: backend
          annotations:
            summary: "Redis 命令延遲過高"
            description: "平均命令延遲 > 10ms"
        # -------------------------------------------------------------
        # Blocked clients
        # -------------------------------------------------------------
        - alert: RedisBlockedClients
          expr: redis_blocked_clients > 10
          for: 5m
          labels:
            severity: warning
            service: redis
            team: backend
          annotations:
            summary: "Redis 有阻塞的客戶端"
            description: "{{ $value }} 個客戶端被阻塞 (BLPOP/BRPOP)"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,267 @@
#!/usr/bin/env python3
"""
AWOOOI 告警鏈路 E2E Smoke Test
==============================
ADR-037 Wave A.6: 告警鏈路端到端驗證腳本
執行方式:
python ops/scripts/alert_chain_smoke_test.py
驗證項目:
1. Health Endpoint 可達
2. Alertmanager Webhook 可達
3. Sentry Webhook 可達
4. SignOz Webhook 可達
5. SignOz Webhook Health 可達
6. Telegram 連通性
版本: v1.0
建立: 2026-03-29 (台北時區)
建立者: Claude Code (Phase 21 ADR-037)
"""
import asyncio
import os
import sys
from datetime import datetime
import httpx
# Base URL of the API under test; the AWOOOI_API_BASE environment variable
# overrides the in-cluster default.
API_BASE = os.environ.get(
    "AWOOOI_API_BASE",
    "http://awoooi-api.awoooi-prod.svc.cluster.local:8000",
)
# For local testing:
# API_BASE = "http://localhost:8000"

# Value passed as ``timeout`` to every httpx.AsyncClient in this script.
TIMEOUT = 30
async def test_health_endpoint() -> bool:
    """Check that GET /api/v1/health answers with HTTP 200.

    Returns:
        True on HTTP 200; False on any other status code or when the
        request raises.
    """
    print("1. Testing Health Endpoint...")
    async with httpx.AsyncClient(timeout=TIMEOUT) as http:
        try:
            reply = await http.get(f"{API_BASE}/api/v1/health")
            if reply.status_code != 200:
                print(f" ❌ Health: HTTP {reply.status_code}")
                return False
            print(" ✅ Health: OK")
            return True
        except Exception as exc:
            print(f" ❌ Health: {exc}")
            return False
async def test_alertmanager_webhook() -> bool:
    """POST a synthetic firing alert to the Alertmanager webhook.

    Returns:
        True when the endpoint answers HTTP 200; False on any other status
        code or when the request raises.
    """
    # Function-scope import: the module only imports ``datetime`` itself.
    from datetime import timezone

    print("2. Testing Alertmanager Webhook...")
    # "startsAt" carries a "Z" (UTC) suffix, so it has to be derived from
    # UTC. Appending "Z" to naive local time mislabels the timestamp by the
    # local UTC offset.
    starts_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    test_payload = {
        "version": "4",
        "status": "firing",
        "alerts": [{
            "status": "firing",
            "labels": {
                "alertname": "E2E_SMOKE_TEST",
                "severity": "info",
                "service": "smoke-test",
                "namespace": "test",
            },
            "annotations": {
                "summary": "E2E Smoke Test - Please Ignore",
                "description": f"Auto test @ {datetime.now().isoformat()}",
            },
            "startsAt": starts_at,
        }]
    }
    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        try:
            response = await client.post(
                f"{API_BASE}/api/v1/webhooks/alertmanager",
                json=test_payload,
            )
            if response.status_code == 200:
                print(" ✅ Alertmanager Webhook: OK")
                return True
            print(f" ❌ Alertmanager Webhook: HTTP {response.status_code}")
            # Only the first 200 characters of the body are echoed.
            print(f" Response: {response.text[:200]}")
            return False
        except Exception as e:
            print(f" ❌ Alertmanager Webhook: {e}")
            return False
async def test_sentry_webhook() -> bool:
    """POST a synthetic Sentry issue event to the Sentry webhook.

    Returns:
        True when the endpoint answers HTTP 200 and the JSON body's
        "status" is "accepted", "deduplicated" or "ignored"; False
        otherwise, including when the request or JSON decoding raises.
    """
    print("3. Testing Sentry Webhook...")
    # Use a unique ID so the event is not dropped by de-duplication.
    test_id = f"smoke-test-{datetime.now().strftime('%Y%m%d%H%M%S%f')}"
    test_payload = {
        "action": "triggered",
        "data": {
            "issue": {
                "id": test_id,
                "title": "E2E Smoke Test Error",
                "level": "info",  # "info" avoids triggering a real alert
                "culprit": "smoke_test.py:test",
                "project": {"slug": "awoooi-api"},
                "firstSeen": datetime.now().isoformat(),
                "count": 1,
            },
            "event": {
                "message": "E2E Smoke Test - Please Ignore",
                "platform": "python",
            },
        },
    }
    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        try:
            response = await client.post(
                f"{API_BASE}/api/v1/webhooks/sentry/error",
                json=test_payload,
            )
            if response.status_code != 200:
                print(f" ❌ Sentry Webhook: HTTP {response.status_code}")
                return False
            status = response.json().get("status")
            if status in ("accepted", "deduplicated", "ignored"):
                print(f" ✅ Sentry Webhook: OK (status={status})")
                return True
            # HTTP 200 with an unexpected body: report the body status;
            # printing "HTTP 200" alone would hide the reason for failure.
            print(f" ❌ Sentry Webhook: unexpected status={status!r} (HTTP 200)")
            return False
        except Exception as e:
            print(f" ❌ Sentry Webhook: {e}")
            return False
async def test_signoz_webhook() -> bool:
    """POST a synthetic firing alert to the SignOz webhook.

    Returns:
        True when the endpoint answers HTTP 200 and the JSON body's
        "status" is "ok"; False otherwise, including when the request or
        JSON decoding raises.
    """
    # Function-scope import: the module only imports ``datetime`` itself.
    from datetime import timezone

    print("4. Testing SignOz Webhook...")
    # "startsAt" carries a "Z" (UTC) suffix, so it has to be derived from
    # UTC. Appending "Z" to naive local time mislabels the timestamp by the
    # local UTC offset.
    starts_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    test_payload = {
        "alertname": "E2E_SMOKE_TEST",
        "status": "firing",
        "labels": {
            "alertname": "E2E_SMOKE_TEST",
            "severity": "info",
            "service_name": "smoke-test",
            "source": "signoz",
        },
        "annotations": {
            "summary": "E2E Smoke Test - Please Ignore",
            "description": f"Auto test @ {datetime.now().isoformat()}",
        },
        "startsAt": starts_at,
    }
    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        try:
            response = await client.post(
                f"{API_BASE}/api/v1/webhooks/signoz/alert",
                json=test_payload,
            )
            if response.status_code != 200:
                print(f" ❌ SignOz Webhook: HTTP {response.status_code}")
                return False
            status = response.json().get("status")
            if status == "ok":
                print(" ✅ SignOz Webhook: OK")
                return True
            # HTTP 200 with an unexpected body: report the body status;
            # printing "HTTP 200" alone would hide the reason for failure.
            print(f" ❌ SignOz Webhook: unexpected status={status!r} (HTTP 200)")
            return False
        except Exception as e:
            print(f" ❌ SignOz Webhook: {e}")
            return False
async def test_signoz_health() -> bool:
    """Check that GET /api/v1/webhooks/signoz/health answers with HTTP 200.

    Returns:
        True on HTTP 200; False on any other status code or when the
        request raises.
    """
    print("5. Testing SignOz Webhook Health...")
    async with httpx.AsyncClient(timeout=TIMEOUT) as http:
        try:
            reply = await http.get(
                f"{API_BASE}/api/v1/webhooks/signoz/health"
            )
            if reply.status_code != 200:
                print(f" ❌ SignOz Health: HTTP {reply.status_code}")
                return False
            print(" ✅ SignOz Health: OK")
            return True
        except Exception as exc:
            print(f" ❌ SignOz Health: {exc}")
            return False
async def test_telegram_connectivity() -> bool:
    """Probe GET /api/v1/telegram/status as a best-effort check.

    Returns:
        False only when the endpoint answers with a status other than 200
        or 404. A 200 (connected or not), a 404 and a raised exception are
        all reported and counted as passing.
    """
    print("6. Testing Telegram Connectivity...")
    async with httpx.AsyncClient(timeout=TIMEOUT) as http:
        try:
            reply = await http.get(f"{API_BASE}/api/v1/telegram/status")
            code = reply.status_code
            if code == 404:
                # A missing endpoint does not fail the overall run.
                print(" ⚠️ Telegram: Endpoint not found (skipped)")
                return True
            if code != 200:
                print(f" ❌ Telegram: HTTP {code}")
                return False
            if reply.json().get("connected"):
                print(" ✅ Telegram: Connected")
            else:
                # A reachable endpoint is enough to pass.
                print(" ⚠️ Telegram: Not Connected (endpoint reachable)")
            return True
        except Exception as exc:
            # Errors here do not fail the overall run.
            print(f" ⚠️ Telegram: {exc} (skipped)")
            return True
async def main():
    """Run every smoke test in order and exit with the overall result.

    Exits the process with status 0 when all tests return True and with
    status 1 otherwise.
    """
    rule = "=" * 60
    started = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(rule)
    print(" AWOOOI Alert Chain E2E Smoke Test")
    print(f" Time: {started}")
    print(f" Target: {API_BASE}")
    print(rule)
    print()
    # Run the tests one after another (not concurrently) so the output is
    # easy to follow when debugging.
    checks = (
        test_health_endpoint,
        test_alertmanager_webhook,
        test_sentry_webhook,
        test_signoz_webhook,
        test_signoz_health,
        test_telegram_connectivity,
    )
    results = []
    for check in checks:
        results.append(await check())
    print()
    print(rule)
    total = len(results)
    passed = sum(results)
    if passed != total:
        print(f" ❌ FAILED ({total - passed}/{total} tests failed)")
        print(rule)
        sys.exit(1)
    print(f" ✅ ALL PASSED ({passed}/{total})")
    print(rule)
    sys.exit(0)


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,165 @@
# SignOz alert rule configuration
# Owner: DevOps Commander
# Version: v1.0
# Date: 2026-03-29
# ADR: ADR-037 (monitoring enhancement architecture)
#
# Deploy target: 192.168.0.188 SignOz (Docker)
# Webhook: http://awoooi-api.awoooi-prod:8000/api/v1/webhooks/signoz
groups:
  # =======================================================================
  # API error-rate alerts
  # =======================================================================
  - name: api_errors
    rules:
      - alert: APIHighErrorRate
        expr: |
          sum(rate(signoz_spans_total{
            service_name="awoooi-api",
            status_code=~"5.."
          }[5m])) by (service_name)
          /
          sum(rate(signoz_spans_total{
            service_name="awoooi-api"
          }[5m])) by (service_name)
          > 0.05
        for: 5m
        labels:
          severity: critical
          source: signoz
          team: backend
        annotations:
          summary: "API 錯誤率 > 5%"
          description: "服務 {{ $labels.service_name }} 錯誤率: {{ $value | humanizePercentage }}"
          runbook_url: "https://awoooi.internal/runbooks/api-error-rate"
  # =======================================================================
  # Latency alerts
  # =======================================================================
  - name: latency
    rules:
      - alert: APIHighLatencyP99
        expr: |
          histogram_quantile(0.99,
            sum(rate(signoz_spans_duration_bucket{
              service_name="awoooi-api"
            }[5m])) by (le, service_name)
          ) > 2
        for: 5m
        labels:
          severity: warning
          source: signoz
          team: backend
        annotations:
          summary: "API P99 延遲 > 2s"
          description: "服務 {{ $labels.service_name }} P99: {{ $value }}s"
      - alert: APIHighLatencyP95
        expr: |
          histogram_quantile(0.95,
            sum(rate(signoz_spans_duration_bucket{
              service_name="awoooi-api"
            }[5m])) by (le, service_name)
          ) > 1
        for: 10m
        labels:
          severity: warning
          source: signoz
          team: backend
        annotations:
          summary: "API P95 延遲 > 1s"
          description: "服務 {{ $labels.service_name }} P95: {{ $value }}s"
  # =======================================================================
  # Trace anomaly alerts
  # =======================================================================
  - name: traces
    rules:
      - alert: NoTracesReceived
        # "or vector(0)" supplies a 0 sample when the metric is missing
        # altogether. Without it, sum() over no series returns an empty
        # result and the alert cannot fire in exactly the case it exists
        # for: no span data arriving at all.
        expr: |
          (sum(rate(signoz_spans_total[15m])) or vector(0)) == 0
        for: 15m
        labels:
          severity: warning
          source: signoz
          team: platform
        annotations:
          summary: "15 分鐘內無 Trace 數據"
          description: "可能是 OTEL Collector 或應用程式問題,請檢查 192.168.0.188:24318 端點"
      - alert: HighSpanDropRate
        expr: |
          sum(rate(otelcol_exporter_send_failed_spans[5m]))
          /
          sum(rate(otelcol_exporter_sent_spans[5m]))
          > 0.01
        for: 5m
        labels:
          severity: warning
          source: signoz
          team: platform
        annotations:
          summary: "Span 丟棄率 > 1%"
          description: "OTEL Collector 可能有性能問題或目標不可達"
      # ADR-037 Phase E: long-running trace alert
      # NOTE(review): grouping by trace_id presumes the metric exposes a
      # per-trace label - confirm that it exists and its cardinality.
      - alert: LongRunningTrace
        expr: |
          max(signoz_spans_duration{
            service_name="awoooi-api",
            status_code!~"5.."
          }) by (trace_id, operation) > 10
        for: 1m
        labels:
          severity: critical
          source: signoz
          team: backend
        annotations:
          summary: "Trace 執行超過 10 秒"
          description: "操作 {{ $labels.operation }} 執行時間 {{ $value }}s (trace: {{ $labels.trace_id }})"
          runbook_url: "https://awoooi.internal/runbooks/long-trace"
  # =======================================================================
  # NVIDIA Nemotron monitoring (ADR-036)
  # =======================================================================
  - name: nvidia_api
    rules:
      - alert: NVIDIAHighLatency
        expr: |
          histogram_quantile(0.95,
            sum(rate(signoz_spans_duration_bucket{
              service_name="awoooi-api",
              operation=~".*nvidia.*"
            }[5m])) by (le)
          ) > 5
        for: 5m
        labels:
          severity: warning
          source: signoz
          team: ai
        annotations:
          summary: "NVIDIA API P95 延遲 > 5s"
          description: "Tool Calling 可能有性能問題"
      - alert: NVIDIAHighErrorRate
        expr: |
          sum(rate(signoz_spans_total{
            service_name="awoooi-api",
            operation=~".*nvidia.*",
            status_code=~"5.."
          }[5m]))
          /
          sum(rate(signoz_spans_total{
            service_name="awoooi-api",
            operation=~".*nvidia.*"
          }[5m]))
          > 0.1
        for: 5m
        labels:
          severity: warning
          source: signoz
          team: ai
        annotations:
          summary: "NVIDIA API 錯誤率 > 10%"
          description: "可能需要 Fallback 到 Ollama"