feat(monitoring): K8s alert rules + Grafana dashboards + ops 腳本
- k8s/monitoring/alert-chain-monitor.yaml - k8s/monitoring/database-alerts.yaml - ops/grafana/ Grafana dashboards - ops/signoz/ SignOz 配置 - ops/scripts/ 維運腳本 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
147
k8s/jobs/migrate-phase18-audit-logs.yaml
Normal file
147
k8s/jobs/migrate-phase18-audit-logs.yaml
Normal file
@@ -0,0 +1,147 @@
|
||||
# One-off Job that applies the Phase 18 schema migration to the audit_logs
# table: six new columns plus three indexes. Every step is guarded by an
# existence check, so re-running the Job is harmless.
apiVersion: batch/v1
kind: Job
metadata:
  name: migrate-phase18-audit-logs
  namespace: awoooi-prod
  labels:
    app: awoooi-migration
    phase: "18"
spec:
  ttlSecondsAfterFinished: 300  # finished Job is cleaned up after 5 minutes
  backoffLimit: 1
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: migrate
          image: postgres:15-alpine
          command:
            - /bin/sh
            - -c
            - |
              # Abort on the first failing command or unset variable so that a
              # broken migration marks the Job as failed instead of printing
              # "Migration completed!" and exiting 0.
              set -eu

              echo "=========================================="
              echo "Phase 18 AuditLog Migration"
              echo "=========================================="

              # DATABASE_URL is expected to come from the envFrom secret below.
              : "${DATABASE_URL:?DATABASE_URL is not set}"

              # Split scheme://user:pass@host:port/dbname?params into its parts.
              DB_HOST=$(echo "$DATABASE_URL" | sed 's/.*@\([^:]*\):.*/\1/')
              DB_PORT=$(echo "$DATABASE_URL" | sed 's/.*:\([0-9]*\)\/.*/\1/')
              DB_NAME=$(echo "$DATABASE_URL" | sed 's/.*\/\([^?]*\).*/\1/')
              DB_USER=$(echo "$DATABASE_URL" | sed 's/.*\/\/\([^:]*\):.*/\1/')
              DB_PASS=$(echo "$DATABASE_URL" | sed 's/.*:\/\/[^:]*:\([^@]*\)@.*/\1/')

              echo "Connecting to: $DB_HOST:$DB_PORT/$DB_NAME"

              export PGPASSWORD="$DB_PASS"

              # Run the migration. ON_ERROR_STOP makes psql exit non-zero on the
              # first SQL error; without it psql keeps going and returns 0.
              psql -v ON_ERROR_STOP=1 -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" <<'EOSQL'
              -- 1. authorization_channel
              DO $$
              BEGIN
                IF NOT EXISTS (
                  SELECT 1 FROM information_schema.columns
                  WHERE table_name = 'audit_logs' AND column_name = 'authorization_channel'
                ) THEN
                  ALTER TABLE audit_logs ADD COLUMN authorization_channel VARCHAR(20);
                  RAISE NOTICE 'Added: authorization_channel';
                ELSE
                  RAISE NOTICE 'Exists: authorization_channel';
                END IF;
              END $$;

              -- 2. retry_count
              DO $$
              BEGIN
                IF NOT EXISTS (
                  SELECT 1 FROM information_schema.columns
                  WHERE table_name = 'audit_logs' AND column_name = 'retry_count'
                ) THEN
                  ALTER TABLE audit_logs ADD COLUMN retry_count INTEGER DEFAULT 0 NOT NULL;
                  RAISE NOTICE 'Added: retry_count';
                ELSE
                  RAISE NOTICE 'Exists: retry_count';
                END IF;
              END $$;

              -- 3. failure_classification
              DO $$
              BEGIN
                IF NOT EXISTS (
                  SELECT 1 FROM information_schema.columns
                  WHERE table_name = 'audit_logs' AND column_name = 'failure_classification'
                ) THEN
                  ALTER TABLE audit_logs ADD COLUMN failure_classification VARCHAR(50);
                  RAISE NOTICE 'Added: failure_classification';
                ELSE
                  RAISE NOTICE 'Exists: failure_classification';
                END IF;
              END $$;

              -- 4. source_approval_id
              DO $$
              BEGIN
                IF NOT EXISTS (
                  SELECT 1 FROM information_schema.columns
                  WHERE table_name = 'audit_logs' AND column_name = 'source_approval_id'
                ) THEN
                  ALTER TABLE audit_logs ADD COLUMN source_approval_id VARCHAR(36);
                  RAISE NOTICE 'Added: source_approval_id';
                ELSE
                  RAISE NOTICE 'Exists: source_approval_id';
                END IF;
              END $$;

              -- 5. auto_repair_attempted
              DO $$
              BEGIN
                IF NOT EXISTS (
                  SELECT 1 FROM information_schema.columns
                  WHERE table_name = 'audit_logs' AND column_name = 'auto_repair_attempted'
                ) THEN
                  ALTER TABLE audit_logs ADD COLUMN auto_repair_attempted BOOLEAN DEFAULT FALSE NOT NULL;
                  RAISE NOTICE 'Added: auto_repair_attempted';
                ELSE
                  RAISE NOTICE 'Exists: auto_repair_attempted';
                END IF;
              END $$;

              -- 6. auto_repair_result
              DO $$
              BEGIN
                IF NOT EXISTS (
                  SELECT 1 FROM information_schema.columns
                  WHERE table_name = 'audit_logs' AND column_name = 'auto_repair_result'
                ) THEN
                  ALTER TABLE audit_logs ADD COLUMN auto_repair_result TEXT;
                  RAISE NOTICE 'Added: auto_repair_result';
                ELSE
                  RAISE NOTICE 'Exists: auto_repair_result';
                END IF;
              END $$;

              -- Create indexes
              CREATE INDEX IF NOT EXISTS ix_audit_authorization_channel ON audit_logs(authorization_channel);
              CREATE INDEX IF NOT EXISTS ix_audit_failure_classification ON audit_logs(failure_classification);
              CREATE INDEX IF NOT EXISTS ix_audit_source_approval_id ON audit_logs(source_approval_id);

              -- Verify: list the resulting column layout
              SELECT column_name, data_type
              FROM information_schema.columns
              WHERE table_name = 'audit_logs'
              ORDER BY ordinal_position;
              EOSQL

              echo "=========================================="
              echo "Migration completed!"
              echo "=========================================="
          envFrom:
            - secretRef:
                name: awoooi-secrets
          resources:
            requests:
              memory: "64Mi"
              cpu: "100m"
            limits:
              memory: "128Mi"
              cpu: "200m"
|
||||
183
k8s/monitoring/alert-chain-monitor.yaml
Normal file
183
k8s/monitoring/alert-chain-monitor.yaml
Normal file
@@ -0,0 +1,183 @@
|
||||
# AWOOOI alert-chain monitoring
# Owner: DevOps Commander
# Version: v1.0
# Date: 2026-03-29
# ADR: ADR-037 (monitoring enhancement architecture) Wave B.1
#
# Purpose: watch the health of the alert delivery chain so that the
# 2026-03-26 incident (a wrong path left the system without alerts) cannot
# happen again unnoticed.

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: alert-chain-monitor
  namespace: monitoring
  labels:
    release: prometheus
    app: prometheus
spec:
  groups:
    # =========================================================================
    # Alert-chain health
    # =========================================================================
    - name: alert_chain
      rules:
        # -----------------------------------------------------------------
        # Alertmanager webhook: more than 10% of requests are not "success"
        # -----------------------------------------------------------------
        - alert: AlertChainBroken_Alertmanager
          expr: |
            sum(rate(awoooi_webhook_requests_total{
              source="alertmanager",
              status!="success"
            }[5m])) / sum(rate(awoooi_webhook_requests_total{
              source="alertmanager"
            }[5m])) > 0.1
          for: 10m
          labels:
            severity: critical
            service: alert-chain
            team: platform
          annotations:
            summary: "Alertmanager Webhook 錯誤率 > 10%"
            description: "告警可能無法正確送達 AWOOOI API"
            runbook_url: "https://awoooi.internal/runbooks/alert-chain"

        # -----------------------------------------------------------------
        # Sentry webhook: more than 10% of requests are not "success"
        # -----------------------------------------------------------------
        - alert: AlertChainBroken_Sentry
          expr: |
            sum(rate(awoooi_webhook_requests_total{
              source="sentry",
              status!="success"
            }[5m])) / sum(rate(awoooi_webhook_requests_total{
              source="sentry"
            }[5m])) > 0.1
          for: 10m
          labels:
            severity: warning
            service: alert-chain
            team: platform
          annotations:
            summary: "Sentry Webhook 錯誤率 > 10%"
            description: "Sentry 錯誤可能無法正確處理"

        # -----------------------------------------------------------------
        # SignOz webhook: more than 10% of requests are not "success"
        # -----------------------------------------------------------------
        - alert: AlertChainBroken_SignOz
          expr: |
            sum(rate(awoooi_webhook_requests_total{
              source="signoz",
              status!="success"
            }[5m])) / sum(rate(awoooi_webhook_requests_total{
              source="signoz"
            }[5m])) > 0.1
          for: 10m
          labels:
            severity: warning
            service: alert-chain
            team: platform
          annotations:
            summary: "SignOz Webhook 錯誤率 > 10%"
            description: "SignOz 告警可能無法正確處理"

        # -----------------------------------------------------------------
        # Alert chain completely silent (no alert received for 2 hours)
        # P1-1 fix: grouped by source (2026-03-29)
        # -----------------------------------------------------------------
        - alert: NoAlertsReceived2Hours
          expr: |
            time() - max by (source)(awoooi_alert_chain_last_success_timestamp) > 7200
          for: 5m
          labels:
            severity: warning
            service: alert-chain
            team: platform
          annotations:
            summary: "2 小時內未收到任何告警"
            description: "可能是告警鏈路問題或系統異常穩定。請執行 Smoke Test: python ops/scripts/alert_chain_smoke_test.py"

        # -----------------------------------------------------------------
        # Alert chain flagged unhealthy by the application gauge
        # -----------------------------------------------------------------
        - alert: AlertChainUnhealthy
          expr: |
            awoooi_alert_chain_healthy == 0
          for: 5m
          labels:
            severity: critical
            service: alert-chain
            team: platform
          annotations:
            summary: "告警鏈路不健康"
            description: "{{ $labels.source }} 告警鏈路標記為不健康,最近處理失敗"

    # =========================================================================
    # Telegram notification monitoring
    # =========================================================================
    - name: telegram_notifications
      rules:
        - alert: TelegramNotificationsFailing
          expr: |
            sum(rate(awoooi_telegram_notifications_total{
              status="failed"
            }[15m])) / sum(rate(awoooi_telegram_notifications_total[15m])) > 0.3
          for: 15m
          labels:
            severity: warning
            service: telegram
            team: platform
          annotations:
            summary: "Telegram 通知失敗率 > 30%"
            description: "告警可能無法正確發送到 Telegram"

    # =========================================================================
    # Anomaly escalation frequency monitoring (ADR-037)
    # =========================================================================
    - name: anomaly_escalation
      rules:
        - alert: FrequentAnomalyEscalation
          # increase() counts events over the window. rate() would yield a
          # per-second value, so "> 5" would mean 5 escalations per second
          # rather than the "more than 5 in the past hour" the description states.
          expr: |
            sum(increase(awoooi_anomaly_escalation_total{
              level=~"ESCALATE|PERMANENT_FIX"
            }[1h])) > 5
          for: 5m
          labels:
            severity: warning
            service: anomaly-counter
            team: backend
          annotations:
            summary: "頻繁異常升級告警"
            description: "過去 1 小時有超過 5 次異常升級到 ESCALATE 或 PERMANENT_FIX 級別"

        - alert: PermanentFixRequired
          # Fires as soon as any PERMANENT_FIX escalation is counted in the
          # last hour; increase() is used for consistency with the rule above.
          expr: |
            sum(increase(awoooi_anomaly_escalation_total{
              level="PERMANENT_FIX"
            }[1h])) > 0
          for: 1m
          labels:
            severity: critical
            service: anomaly-counter
            team: backend
          annotations:
            summary: "需要永久修復的異常"
            description: "有異常升級到 PERMANENT_FIX 級別,需要根本修復而非重啟"

    # =========================================================================
    # Auto-repair monitoring
    # =========================================================================
    - name: auto_repair
      rules:
        - alert: AutoRepairLowSuccessRate
          expr: |
            awoooi_auto_repair_success_rate < 0.3
          for: 30m
          labels:
            severity: warning
            service: auto-repair
            team: backend
          annotations:
            summary: "自動修復成功率過低 (< 30%)"
            description: "動作 {{ $labels.action }} 的成功率只有 {{ $value | humanizePercentage }},建議檢查修復邏輯"
|
||||
264
k8s/monitoring/database-alerts.yaml
Normal file
264
k8s/monitoring/database-alerts.yaml
Normal file
@@ -0,0 +1,264 @@
|
||||
# =============================================================================
# AWOOOI Database Alerts
# =============================================================================
# Owner: DevOps Commander
# Version: v1.0
# Date: 2026-03-29
# ADR: ADR-037 Phase B (Database Exporters)
#
# Alert targets: PostgreSQL (192.168.0.188:5432) + Redis (192.168.0.188:6380)
# =============================================================================

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: database-alerts
  namespace: monitoring
  labels:
    release: prometheus
    app: prometheus
spec:
  groups:
    # =========================================================================
    # PostgreSQL alerts
    # =========================================================================
    - name: postgresql
      rules:
        # -----------------------------------------------------------------
        # Active connections high (> 80 per database).
        # NOTE(review): this is an absolute count, not a percentage of
        # max_connections - confirm the threshold against the server setting.
        # -----------------------------------------------------------------
        - alert: PostgreSQLConnectionPoolNearLimit
          expr: |
            sum(pg_stat_activity_count{state="active"}) by (datname) > 80
          for: 5m
          labels:
            severity: warning
            service: postgres
            team: infra
          annotations:
            summary: "PostgreSQL 活躍連接數過高"
            description: "Database {{ $labels.datname }} 活躍連接: {{ $value }}"
            runbook_url: "https://awoooi.internal/runbooks/postgres-connections"

        # -----------------------------------------------------------------
        # Active connections nearly exhausted (> 95 per database, absolute)
        # -----------------------------------------------------------------
        - alert: PostgreSQLConnectionPoolExhausted
          expr: |
            sum(pg_stat_activity_count{state="active"}) by (datname) > 95
          for: 2m
          labels:
            severity: critical
            service: postgres
            team: infra
          annotations:
            summary: "PostgreSQL 連接池即將耗盡"
            description: "Database {{ $labels.datname }} 活躍連接 > 95"

        # -----------------------------------------------------------------
        # Too many slow queries
        # -----------------------------------------------------------------
        - alert: PostgreSQLSlowQueries
          expr: pg_slow_queries > 5
          for: 5m
          labels:
            severity: warning
            service: postgres
            team: backend
          annotations:
            summary: "PostgreSQL 慢查詢數量過多"
            description: "User {{ $labels.usename }} 有 {{ $value }} 個慢查詢 (> 1s)"

        # -----------------------------------------------------------------
        # Too many queries waiting on locks
        # -----------------------------------------------------------------
        - alert: PostgreSQLLockWaiting
          expr: sum(pg_locks_waiting) > 10
          for: 2m
          labels:
            severity: warning
            service: postgres
            team: backend
          annotations:
            summary: "PostgreSQL 鎖等待過多"
            description: "{{ $value }} 個查詢正在等待鎖"

        # -----------------------------------------------------------------
        # Table bloat (dead tuple ratio > 20%)
        # -----------------------------------------------------------------
        - alert: PostgreSQLTableBloat
          expr: pg_stat_user_tables_bloat_dead_tuple_ratio > 20
          for: 30m
          labels:
            severity: warning
            service: postgres
            team: infra
          annotations:
            summary: "PostgreSQL 表膨脹嚴重"
            description: "Table {{ $labels.relname }} dead tuple 比例: {{ $value }}%"

        # -----------------------------------------------------------------
        # Database unreachable
        # -----------------------------------------------------------------
        - alert: PostgreSQLDown
          expr: pg_up == 0
          for: 1m
          labels:
            severity: critical
            service: postgres
            team: infra
          annotations:
            summary: "PostgreSQL 無法連線"
            description: "PostgreSQL Exporter 無法連接到資料庫"

        # -----------------------------------------------------------------
        # Long-running query (> 60s)
        # -----------------------------------------------------------------
        - alert: PostgreSQLLongRunningQuery
          expr: pg_longest_query_seconds > 60
          for: 1m
          labels:
            severity: warning
            service: postgres
            team: backend
          annotations:
            summary: "PostgreSQL 長時間執行查詢"
            description: "User {{ $labels.usename }} 查詢已執行 {{ $value | humanizeDuration }}"

        # -----------------------------------------------------------------
        # High rollback ratio (> 5%)
        # -----------------------------------------------------------------
        - alert: PostgreSQLHighRollbackRate
          expr: pg_stat_database_transactions_rollback_ratio > 5
          for: 15m
          labels:
            severity: warning
            service: postgres
            team: backend
          annotations:
            summary: "PostgreSQL 事務回滾率過高"
            description: "Database {{ $labels.datname }} 回滾率: {{ $value }}%"

    # =========================================================================
    # Redis alerts
    # =========================================================================
    - name: redis
      rules:
        # -----------------------------------------------------------------
        # Memory usage high (> 85% of maxmemory).
        # The "> 0" filter drops instances with maxmemory unset (reported as
        # 0); dividing by 0 would yield +Inf and fire permanently.
        # -----------------------------------------------------------------
        - alert: RedisMemoryHigh
          expr: |
            redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.85
          for: 5m
          labels:
            severity: warning
            service: redis
            team: infra
          annotations:
            summary: "Redis 記憶體使用 > 85%"
            description: "Redis 記憶體使用: {{ $value | humanizePercentage }}"

        # -----------------------------------------------------------------
        # Memory nearly exhausted (> 95% of maxmemory); same zero guard
        # -----------------------------------------------------------------
        - alert: RedisMemoryCritical
          expr: |
            redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.95
          for: 2m
          labels:
            severity: critical
            service: redis
            team: infra
          annotations:
            summary: "Redis 記憶體即將耗盡"
            description: "Redis 記憶體使用 > 95%"

        # -----------------------------------------------------------------
        # Cache hit rate low (< 80%)
        # -----------------------------------------------------------------
        - alert: RedisCacheHitRateLow
          expr: |
            rate(redis_keyspace_hits_total[5m])
            /
            (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))
            < 0.8
          for: 15m
          labels:
            severity: warning
            service: redis
            team: backend
          annotations:
            summary: "Redis 快取命中率過低"
            description: "命中率: {{ $value | humanizePercentage }}"

        # -----------------------------------------------------------------
        # Too many client connections
        # -----------------------------------------------------------------
        - alert: RedisConnectionsHigh
          expr: redis_connected_clients > 500
          for: 5m
          labels:
            severity: warning
            service: redis
            team: infra
          annotations:
            summary: "Redis 連接數過高"
            description: "連接數: {{ $value }}"

        # -----------------------------------------------------------------
        # Frequent key eviction
        # -----------------------------------------------------------------
        - alert: RedisEvictedKeys
          expr: rate(redis_evicted_keys_total[5m]) > 100
          for: 5m
          labels:
            severity: warning
            service: redis
            team: backend
          annotations:
            summary: "Redis Key 驅逐頻繁"
            description: "每秒驅逐 {{ $value }} 個 key"

        # -----------------------------------------------------------------
        # Redis unreachable
        # -----------------------------------------------------------------
        - alert: RedisDown
          expr: redis_up == 0
          for: 1m
          labels:
            severity: critical
            service: redis
            team: infra
          annotations:
            summary: "Redis 無法連線"
            description: "Redis Exporter 無法連接到 Redis"

        # -----------------------------------------------------------------
        # Command latency high (average > 10ms over the last 5 minutes).
        # Both counters are turned into rates and summed per instance:
        # the raw counters give a lifetime average, and the per-command
        # duration series would not label-match the unlabelled command
        # counter, so the plain division returns nothing.
        # NOTE(review): assumes redis_exporter's per-`cmd` duration series
        # and the standard `instance` scrape label - confirm.
        # -----------------------------------------------------------------
        - alert: RedisLatencyHigh
          expr: |
            sum by (instance) (rate(redis_commands_duration_seconds_total[5m]))
            /
            sum by (instance) (rate(redis_commands_processed_total[5m]))
            > 0.01
          for: 5m
          labels:
            severity: warning
            service: redis
            team: backend
          annotations:
            summary: "Redis 命令延遲過高"
            description: "平均命令延遲 > 10ms"

        # -----------------------------------------------------------------
        # Blocked clients
        # -----------------------------------------------------------------
        - alert: RedisBlockedClients
          expr: redis_blocked_clients > 10
          for: 5m
          labels:
            severity: warning
            service: redis
            team: backend
          annotations:
            summary: "Redis 有阻塞的客戶端"
            description: "{{ $value }} 個客戶端被阻塞 (BLPOP/BRPOP)"
|
||||
1426
ops/grafana/dashboards/nvidia-nemotron.json
Normal file
1426
ops/grafana/dashboards/nvidia-nemotron.json
Normal file
File diff suppressed because it is too large
Load Diff
267
ops/scripts/alert_chain_smoke_test.py
Executable file
267
ops/scripts/alert_chain_smoke_test.py
Executable file
@@ -0,0 +1,267 @@
|
||||
#!/usr/bin/env python3
"""
AWOOOI alert-chain E2E smoke test
=================================

ADR-037 Wave A.6: end-to-end verification script for the alert chain.

Usage:
    python ops/scripts/alert_chain_smoke_test.py

Checks performed:
    1. Health endpoint is reachable
    2. Alertmanager webhook is reachable
    3. Sentry webhook is reachable
    4. SignOz webhook is reachable
    5. SignOz webhook health endpoint is reachable
    6. Telegram connectivity

Version: v1.0
Created: 2026-03-29 (Taipei time zone)
Author: Claude Code (Phase 21 ADR-037)
"""

import asyncio
import os
import sys
from datetime import datetime

import httpx

# API base URL (can be overridden through the AWOOOI_API_BASE environment variable)
API_BASE = os.getenv(
    "AWOOOI_API_BASE",
    "http://awoooi-api.awoooi-prod.svc.cluster.local:8000"
)

# For local testing:
# API_BASE = "http://localhost:8000"

# Value passed as `timeout` to every httpx.AsyncClient in this script
TIMEOUT = 30
|
||||
|
||||
|
||||
async def test_health_endpoint() -> bool:
    """Check that the API health endpoint answers with HTTP 200.

    Returns:
        True when GET {API_BASE}/api/v1/health returns 200; False for any
        other status code or when the request raises.
    """
    print("1. Testing Health Endpoint...")

    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        try:
            response = await client.get(f"{API_BASE}/api/v1/health")
            if response.status_code == 200:
                print(" ✅ Health: OK")
                return True
            else:
                print(f" ❌ Health: HTTP {response.status_code}")
                return False
        except Exception as e:
            print(f" ❌ Health: {e}")
            return False
|
||||
|
||||
|
||||
async def test_alertmanager_webhook() -> bool:
    """Post a synthetic firing alert to the Alertmanager webhook.

    Returns:
        True when the webhook answers HTTP 200; False for any other status
        code (the first 200 characters of the body are printed) or when the
        request raises.
    """
    # Local import: the module only imports `datetime` from the datetime module.
    from datetime import timezone

    print("2. Testing Alertmanager Webhook...")

    # The trailing "Z" means UTC, so the timestamp must really be taken in
    # UTC; a naive local datetime.now() with "Z" appended is off by the local
    # UTC offset.
    starts_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")

    test_payload = {
        "version": "4",
        "status": "firing",
        "alerts": [{
            "status": "firing",
            "labels": {
                "alertname": "E2E_SMOKE_TEST",
                "severity": "info",
                "service": "smoke-test",
                "namespace": "test",
            },
            "annotations": {
                "summary": "E2E Smoke Test - Please Ignore",
                "description": f"Auto test @ {datetime.now().isoformat()}",
            },
            "startsAt": starts_at,
        }]
    }

    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        try:
            response = await client.post(
                f"{API_BASE}/api/v1/webhooks/alertmanager",
                json=test_payload,
            )
            if response.status_code == 200:
                print(" ✅ Alertmanager Webhook: OK")
                return True
            else:
                print(f" ❌ Alertmanager Webhook: HTTP {response.status_code}")
                print(f" Response: {response.text[:200]}")
                return False
        except Exception as e:
            print(f" ❌ Alertmanager Webhook: {e}")
            return False
|
||||
|
||||
|
||||
async def test_sentry_webhook() -> bool:
    """Post a synthetic Sentry issue event to the Sentry webhook.

    Returns:
        True when the webhook answers HTTP 200 and the JSON body's "status"
        is one of "accepted", "deduplicated" or "ignored"; False otherwise
        or when the request (or JSON decoding) raises.
    """
    print("3. Testing Sentry Webhook...")

    # Unique ID so the event is not dropped by deduplication
    test_id = f"smoke-test-{datetime.now().strftime('%Y%m%d%H%M%S%f')}"

    test_payload = {
        "action": "triggered",
        "data": {
            "issue": {
                "id": test_id,
                "title": "E2E Smoke Test Error",
                "level": "info",  # info level so no real alert is triggered
                "culprit": "smoke_test.py:test",
                "project": {"slug": "awoooi-api"},
                "firstSeen": datetime.now().isoformat(),
                "count": 1,
            },
            "event": {
                "message": "E2E Smoke Test - Please Ignore",
                "platform": "python",
            },
        },
    }

    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        try:
            response = await client.post(
                f"{API_BASE}/api/v1/webhooks/sentry/error",
                json=test_payload,
            )
            if response.status_code != 200:
                print(f" ❌ Sentry Webhook: HTTP {response.status_code}")
                return False

            status = response.json().get("status")
            if status in ["accepted", "deduplicated", "ignored"]:
                print(f" ✅ Sentry Webhook: OK (status={status})")
                return True

            # HTTP 200 with an unexpected body: report the body status rather
            # than "HTTP 200", which would hide the actual cause.
            print(f" ❌ Sentry Webhook: unexpected status={status!r}")
            return False
        except Exception as e:
            print(f" ❌ Sentry Webhook: {e}")
            return False
|
||||
|
||||
|
||||
async def test_signoz_webhook() -> bool:
    """Post a synthetic firing alert to the SignOz webhook.

    Returns:
        True when the webhook answers HTTP 200 and the JSON body's "status"
        is "ok"; False otherwise or when the request (or JSON decoding)
        raises.
    """
    # Local import: the module only imports `datetime` from the datetime module.
    from datetime import timezone

    print("4. Testing SignOz Webhook...")

    # The trailing "Z" means UTC, so take the timestamp in UTC instead of
    # appending "Z" to a naive local time.
    starts_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")

    test_payload = {
        "alertname": "E2E_SMOKE_TEST",
        "status": "firing",
        "labels": {
            "alertname": "E2E_SMOKE_TEST",
            "severity": "info",
            "service_name": "smoke-test",
            "source": "signoz",
        },
        "annotations": {
            "summary": "E2E Smoke Test - Please Ignore",
            "description": f"Auto test @ {datetime.now().isoformat()}",
        },
        "startsAt": starts_at,
    }

    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        try:
            response = await client.post(
                f"{API_BASE}/api/v1/webhooks/signoz/alert",
                json=test_payload,
            )
            if response.status_code != 200:
                print(f" ❌ SignOz Webhook: HTTP {response.status_code}")
                return False

            status = response.json().get("status")
            if status == "ok":
                print(" ✅ SignOz Webhook: OK")
                return True

            # HTTP 200 with an unexpected body: report the body status rather
            # than "HTTP 200", which would hide the actual cause.
            print(f" ❌ SignOz Webhook: unexpected status={status!r}")
            return False
        except Exception as e:
            print(f" ❌ SignOz Webhook: {e}")
            return False
|
||||
|
||||
|
||||
async def test_signoz_health() -> bool:
    """Check that the SignOz webhook health endpoint answers with HTTP 200.

    Returns:
        True when GET {API_BASE}/api/v1/webhooks/signoz/health returns 200;
        False for any other status code or when the request raises.
    """
    print("5. Testing SignOz Webhook Health...")

    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        try:
            response = await client.get(
                f"{API_BASE}/api/v1/webhooks/signoz/health"
            )
            if response.status_code == 200:
                print(" ✅ SignOz Health: OK")
                return True
            else:
                print(f" ❌ SignOz Health: HTTP {response.status_code}")
                return False
        except Exception as e:
            print(f" ❌ SignOz Health: {e}")
            return False
|
||||
|
||||
|
||||
async def test_telegram_connectivity() -> bool:
    """Probe the Telegram status endpoint.

    This check is deliberately lenient: only an HTTP response other than
    200 or 404 counts as a failure.

    Returns:
        True when the endpoint answers 200 (connected or not), answers 404,
        or the request raises; False for any other status code.
    """
    print("6. Testing Telegram Connectivity...")

    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        try:
            response = await client.get(f"{API_BASE}/api/v1/telegram/status")
            if response.status_code == 200:
                data = response.json()
                if data.get("connected"):
                    print(" ✅ Telegram: Connected")
                    return True
                else:
                    print(" ⚠️ Telegram: Not Connected (endpoint reachable)")
                    return True  # a reachable endpoint is enough
            elif response.status_code == 404:
                print(" ⚠️ Telegram: Endpoint not found (skipped)")
                return True  # does not affect the overall result
            else:
                print(f" ❌ Telegram: HTTP {response.status_code}")
                return False
        except Exception as e:
            print(f" ⚠️ Telegram: {e} (skipped)")
            return True  # does not affect the overall result
|
||||
|
||||
|
||||
async def main():
    """Run every smoke test in order and exit with the overall result.

    Prints a banner, runs the six checks one after another, prints a
    summary, then calls sys.exit(0) when all checks passed and sys.exit(1)
    otherwise.
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    print("=" * 60)
    print(" AWOOOI Alert Chain E2E Smoke Test")
    print(f" Time: {now}")
    print(f" Target: {API_BASE}")
    print("=" * 60)
    print()

    # Run the checks sequentially (not concurrently) so the output stays
    # ordered and easy to debug.
    results = []
    results.append(await test_health_endpoint())
    results.append(await test_alertmanager_webhook())
    results.append(await test_sentry_webhook())
    results.append(await test_signoz_webhook())
    results.append(await test_signoz_health())
    results.append(await test_telegram_connectivity())

    print()
    print("=" * 60)
    passed = sum(results)
    total = len(results)

    if passed == total:
        print(f" ✅ ALL PASSED ({passed}/{total})")
        print("=" * 60)
        sys.exit(0)
    else:
        failed = total - passed
        print(f" ❌ FAILED ({failed}/{total} tests failed)")
        print("=" * 60)
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())
|
||||
165
ops/signoz/alerting/rules.yaml
Normal file
165
ops/signoz/alerting/rules.yaml
Normal file
@@ -0,0 +1,165 @@
|
||||
# SignOz alerting rules
# Owner: DevOps Commander
# Version: v1.0
# Date: 2026-03-29
# ADR: ADR-037 (monitoring enhancement architecture)
#
# Deployment target: 192.168.0.188 SignOz (Docker)
# Webhook: http://awoooi-api.awoooi-prod:8000/api/v1/webhooks/signoz

groups:
  # =========================================================================
  # API error-rate alerts
  # =========================================================================
  - name: api_errors
    rules:
      - alert: APIHighErrorRate
        expr: |
          sum(rate(signoz_spans_total{
            service_name="awoooi-api",
            status_code=~"5.."
          }[5m])) by (service_name)
          /
          sum(rate(signoz_spans_total{
            service_name="awoooi-api"
          }[5m])) by (service_name)
          > 0.05
        for: 5m
        labels:
          severity: critical
          source: signoz
          team: backend
        annotations:
          summary: "API 錯誤率 > 5%"
          description: "服務 {{ $labels.service_name }} 錯誤率: {{ $value | humanizePercentage }}"
          runbook_url: "https://awoooi.internal/runbooks/api-error-rate"

  # =========================================================================
  # Latency alerts
  # =========================================================================
  - name: latency
    rules:
      - alert: APIHighLatencyP99
        expr: |
          histogram_quantile(0.99,
            sum(rate(signoz_spans_duration_bucket{
              service_name="awoooi-api"
            }[5m])) by (le, service_name)
          ) > 2
        for: 5m
        labels:
          severity: warning
          source: signoz
          team: backend
        annotations:
          summary: "API P99 延遲 > 2s"
          description: "服務 {{ $labels.service_name }} P99: {{ $value }}s"

      - alert: APIHighLatencyP95
        expr: |
          histogram_quantile(0.95,
            sum(rate(signoz_spans_duration_bucket{
              service_name="awoooi-api"
            }[5m])) by (le, service_name)
          ) > 1
        for: 10m
        labels:
          severity: warning
          source: signoz
          team: backend
        annotations:
          summary: "API P95 延遲 > 1s"
          description: "服務 {{ $labels.service_name }} P95: {{ $value }}s"

  # =========================================================================
  # Trace anomaly alerts
  # =========================================================================
  - name: traces
    rules:
      - alert: NoTracesReceived
        expr: |
          sum(rate(signoz_spans_total[15m])) == 0
        for: 15m
        labels:
          severity: warning
          source: signoz
          team: platform
        annotations:
          summary: "15 分鐘內無 Trace 數據"
          description: "可能是 OTEL Collector 或應用程式問題,請檢查 192.168.0.188:24318 端點"

      - alert: HighSpanDropRate
        expr: |
          sum(rate(otelcol_exporter_send_failed_spans[5m]))
          /
          sum(rate(otelcol_exporter_sent_spans[5m]))
          > 0.01
        for: 5m
        labels:
          severity: warning
          source: signoz
          team: platform
        annotations:
          summary: "Span 丟棄率 > 1%"
          description: "OTEL Collector 可能有性能問題或目標不可達"

      # ADR-037 Phase E: long-running trace alert
      - alert: LongRunningTrace
        expr: |
          max(signoz_spans_duration{
            service_name="awoooi-api",
            status_code!~"5.."
          }) by (trace_id, operation) > 10
        for: 1m
        labels:
          severity: critical
          source: signoz
          team: backend
        annotations:
          summary: "Trace 執行超過 10 秒"
          description: "操作 {{ $labels.operation }} 執行時間 {{ $value }}s (trace: {{ $labels.trace_id }})"
          runbook_url: "https://awoooi.internal/runbooks/long-trace"

  # =========================================================================
  # NVIDIA Nemotron monitoring (ADR-036)
  # =========================================================================
  - name: nvidia_api
    rules:
      - alert: NVIDIAHighLatency
        expr: |
          histogram_quantile(0.95,
            sum(rate(signoz_spans_duration_bucket{
              service_name="awoooi-api",
              operation=~".*nvidia.*"
            }[5m])) by (le)
          ) > 5
        for: 5m
        labels:
          severity: warning
          source: signoz
          team: ai
        annotations:
          summary: "NVIDIA API P95 延遲 > 5s"
          description: "Tool Calling 可能有性能問題"

      - alert: NVIDIAHighErrorRate
        expr: |
          sum(rate(signoz_spans_total{
            service_name="awoooi-api",
            operation=~".*nvidia.*",
            status_code=~"5.."
          }[5m]))
          /
          sum(rate(signoz_spans_total{
            service_name="awoooi-api",
            operation=~".*nvidia.*"
          }[5m]))
          > 0.1
        for: 5m
        labels:
          severity: warning
          source: signoz
          team: ai
        annotations:
          summary: "NVIDIA API 錯誤率 > 10%"
          description: "可能需要 Fallback 到 Ollama"
|
||||
Reference in New Issue
Block a user