From a5a6bd34083c97c08f322c619e43ae8743d67bfb Mon Sep 17 00:00:00 2001 From: OG T Date: Sun, 29 Mar 2026 16:04:14 +0800 Subject: [PATCH] =?UTF-8?q?feat(monitoring):=20K8s=20alert=20rules=20+=20G?= =?UTF-8?q?rafana=20dashboards=20+=20ops=20=E8=85=B3=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - k8s/monitoring/alert-chain-monitor.yaml - k8s/monitoring/database-alerts.yaml - ops/grafana/ Grafana dashboards - ops/signoz/ SignOz 配置 - ops/scripts/ 維運腳本 Co-Authored-By: Claude Opus 4.5 --- k8s/jobs/migrate-phase18-audit-logs.yaml | 147 ++ k8s/monitoring/alert-chain-monitor.yaml | 183 +++ k8s/monitoring/database-alerts.yaml | 264 ++++ ops/grafana/dashboards/nvidia-nemotron.json | 1426 +++++++++++++++++++ ops/scripts/alert_chain_smoke_test.py | 267 ++++ ops/signoz/alerting/rules.yaml | 165 +++ 6 files changed, 2452 insertions(+) create mode 100644 k8s/jobs/migrate-phase18-audit-logs.yaml create mode 100644 k8s/monitoring/alert-chain-monitor.yaml create mode 100644 k8s/monitoring/database-alerts.yaml create mode 100644 ops/grafana/dashboards/nvidia-nemotron.json create mode 100755 ops/scripts/alert_chain_smoke_test.py create mode 100644 ops/signoz/alerting/rules.yaml diff --git a/k8s/jobs/migrate-phase18-audit-logs.yaml b/k8s/jobs/migrate-phase18-audit-logs.yaml new file mode 100644 index 00000000..c09599df --- /dev/null +++ b/k8s/jobs/migrate-phase18-audit-logs.yaml @@ -0,0 +1,147 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: migrate-phase18-audit-logs + namespace: awoooi-prod + labels: + app: awoooi-migration + phase: "18" +spec: + ttlSecondsAfterFinished: 300 # 5 分鐘後自動清理 + backoffLimit: 1 + template: + spec: + restartPolicy: Never + containers: + - name: migrate + image: postgres:15-alpine + command: + - /bin/sh + - -c + - | + echo "==========================================" + echo "Phase 18 AuditLog Migration" + echo "==========================================" + + # 從 SECRET 讀取連線資訊 + DB_HOST=$(echo 
"$DATABASE_URL" | sed 's/.*@\([^:]*\):.*/\1/')
+              DB_PORT=$(echo "$DATABASE_URL" | sed 's/.*:\([0-9]*\)\/.*/\1/')
+              DB_NAME=$(echo "$DATABASE_URL" | sed 's/.*\/\([^?]*\).*/\1/')
+              DB_USER=$(echo "$DATABASE_URL" | sed 's/.*\/\/\([^:]*\):.*/\1/')
+              DB_PASS=$(echo "$DATABASE_URL" | sed 's/.*:\/\/[^:]*:\([^@]*\)@.*/\1/')
+
+              echo "Connecting to: $DB_HOST:$DB_PORT/$DB_NAME"
+
+              export PGPASSWORD="$DB_PASS"
+              # Run the migration; set -e plus ON_ERROR_STOP=1 make any SQL error fail the Job instead of printing "completed".
+              set -e
+              psql -v ON_ERROR_STOP=1 -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" <<'EOSQL'
+              -- 1. authorization_channel (existence is checked via regclass, i.e. on the same relation ALTER TABLE resolves through search_path)
+              DO $$
+              BEGIN
+                IF NOT EXISTS (
+                  SELECT 1 FROM pg_attribute
+                  WHERE attrelid = 'audit_logs'::regclass AND attname = 'authorization_channel' AND NOT attisdropped
+                ) THEN
+                  ALTER TABLE audit_logs ADD COLUMN authorization_channel VARCHAR(20);
+                  RAISE NOTICE 'Added: authorization_channel';
+                ELSE
+                  RAISE NOTICE 'Exists: authorization_channel';
+                END IF;
+              END $$;
+
+              -- 2. retry_count (NOT NULL, defaults to 0)
+              DO $$
+              BEGIN
+                IF NOT EXISTS (
+                  SELECT 1 FROM pg_attribute
+                  WHERE attrelid = 'audit_logs'::regclass AND attname = 'retry_count' AND NOT attisdropped
+                ) THEN
+                  ALTER TABLE audit_logs ADD COLUMN retry_count INTEGER DEFAULT 0 NOT NULL;
+                  RAISE NOTICE 'Added: retry_count';
+                ELSE
+                  RAISE NOTICE 'Exists: retry_count';
+                END IF;
+              END $$;
+
+              -- 3. failure_classification
+              DO $$
+              BEGIN
+                IF NOT EXISTS (
+                  SELECT 1 FROM pg_attribute
+                  WHERE attrelid = 'audit_logs'::regclass AND attname = 'failure_classification' AND NOT attisdropped
+                ) THEN
+                  ALTER TABLE audit_logs ADD COLUMN failure_classification VARCHAR(50);
+                  RAISE NOTICE 'Added: failure_classification';
+                ELSE
+                  RAISE NOTICE 'Exists: failure_classification';
+                END IF;
+              END $$;
+
+              -- 4. source_approval_id
+              DO $$
+              BEGIN
+                IF NOT EXISTS (
+                  SELECT 1 FROM pg_attribute
+                  WHERE attrelid = 'audit_logs'::regclass AND attname = 'source_approval_id' AND NOT attisdropped
+                ) THEN
+                  ALTER TABLE audit_logs ADD COLUMN source_approval_id VARCHAR(36);
+                  RAISE NOTICE 'Added: source_approval_id';
+                ELSE
+                  RAISE NOTICE 'Exists: source_approval_id';
+                END IF;
+              END $$;
+
+              -- 5. auto_repair_attempted (NOT NULL, defaults to FALSE)
+              DO $$
+              BEGIN
+                IF NOT EXISTS (
+                  SELECT 1 FROM pg_attribute
+                  WHERE attrelid = 'audit_logs'::regclass AND attname = 'auto_repair_attempted' AND NOT attisdropped
+                ) THEN
+                  ALTER TABLE audit_logs ADD COLUMN auto_repair_attempted BOOLEAN DEFAULT FALSE NOT NULL;
+                  RAISE NOTICE 'Added: auto_repair_attempted';
+                ELSE
+                  RAISE NOTICE 'Exists: auto_repair_attempted';
+                END IF;
+              END $$;
+
+              -- 6. auto_repair_result
+              DO $$
+              BEGIN
+                IF NOT EXISTS (
+                  SELECT 1 FROM pg_attribute
+                  WHERE attrelid = 'audit_logs'::regclass AND attname = 'auto_repair_result' AND NOT attisdropped
+                ) THEN
+                  ALTER TABLE audit_logs ADD COLUMN auto_repair_result TEXT;
+                  RAISE NOTICE 'Added: auto_repair_result';
+                ELSE
+                  RAISE NOTICE 'Exists: auto_repair_result';
+                END IF;
+              END $$;
+
+              -- Create indexes (IF NOT EXISTS keeps re-runs idempotent)
+              CREATE INDEX IF NOT EXISTS ix_audit_authorization_channel ON audit_logs(authorization_channel);
+              CREATE INDEX IF NOT EXISTS ix_audit_failure_classification ON audit_logs(failure_classification);
+              CREATE INDEX IF NOT EXISTS ix_audit_source_approval_id ON audit_logs(source_approval_id);
+
+              -- Verify: print the resulting column list to the Job log
+              SELECT column_name, data_type
+              FROM information_schema.columns
+              WHERE table_name = 'audit_logs'
+              ORDER BY ordinal_position;
+              EOSQL
+
+              echo "=========================================="
+              echo "Migration completed!"
+              echo "=========================================="
+          envFrom:
+            - secretRef:
+                name: awoooi-secrets
+          resources:
+            requests:
+              memory: "64Mi"
+              cpu: "100m"
+            limits:
+              memory: "128Mi"
+              cpu: "200m"
diff --git a/k8s/monitoring/alert-chain-monitor.yaml b/k8s/monitoring/alert-chain-monitor.yaml
new file mode 100644
index 00000000..189905c9
--- /dev/null
+++ b/k8s/monitoring/alert-chain-monitor.yaml
@@ -0,0 +1,183 @@
+# AWOOOI alert-chain monitoring
+# Owner: DevOps Commander
+# Version: v1.0
+# Date: 2026-03-29
+# ADR: ADR-037 (monitoring enhancement architecture) Wave B.1
+#
+# Purpose: monitor alert-chain health so the 2026-03-26 incident (a wrong path left the system with no alerts) cannot recur
+
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: alert-chain-monitor
+  namespace: monitoring
+  labels:
+    release: prometheus
+    app: prometheus
+spec:
+  groups:
+    # =========================================================================
+    # Alert-chain health monitoring
+    # =========================================================================
+    - name: alert_chain
+      rules:
+        # -----------------------------------------------------------------
+        # Alertmanager webhook chain failing (non-success ratio > 10% over 5m)
+        # -----------------------------------------------------------------
+        - alert: AlertChainBroken_Alertmanager
+          expr: |
+            sum(rate(awoooi_webhook_requests_total{
+              source="alertmanager",
+              status!="success"
+            }[5m])) / sum(rate(awoooi_webhook_requests_total{
+              source="alertmanager"
+            }[5m])) > 0.1
+          for: 10m
+          labels:
+            severity: critical
+            service: alert-chain
+            team: platform
+          annotations:
+            summary: "Alertmanager Webhook 錯誤率 > 10%"
+            description: "告警可能無法正確送達 AWOOOI API"
+            runbook_url: "https://awoooi.internal/runbooks/alert-chain"
+
+        # -----------------------------------------------------------------
+        # Sentry webhook chain failing (non-success ratio > 10% over 5m)
+        # -----------------------------------------------------------------
+        - alert: AlertChainBroken_Sentry
+          expr: |
+            sum(rate(awoooi_webhook_requests_total{
+              source="sentry",
+              status!="success"
+            }[5m])) / sum(rate(awoooi_webhook_requests_total{
+              source="sentry"
+            }[5m])) > 0.1
+          for: 10m
+          labels:
+            severity: warning
+            service: alert-chain
+            team: platform
+          annotations:
+            summary: "Sentry Webhook 錯誤率 > 10%"
+            description: "Sentry 錯誤可能無法正確處理"
+
+        # -----------------------------------------------------------------
+        # SignOz webhook chain failing (non-success ratio > 10% over 5m)
+        # -----------------------------------------------------------------
+        - alert: AlertChainBroken_SignOz
+          expr: |
+            sum(rate(awoooi_webhook_requests_total{
+              source="signoz",
+              status!="success"
+            }[5m])) / sum(rate(awoooi_webhook_requests_total{
+              source="signoz"
+            }[5m])) > 0.1
+          for: 10m
+          labels:
+            severity: warning
+            service: alert-chain
+            team: platform
+          annotations:
+            summary: "SignOz Webhook 錯誤率 > 10%"
+            description: "SignOz 告警可能無法正確處理"
+
+        # -----------------------------------------------------------------
+        # A source has delivered nothing for 2h; evaluated per source, so annotations name the source
+        # P1-1 fix (2026-03-29): grouped by source. Yields no result (no alert) while the metric is absent.
+        # -----------------------------------------------------------------
+        - alert: NoAlertsReceived2Hours
+          expr: |
+            time() - max by (source)(awoooi_alert_chain_last_success_timestamp) > 7200
+          for: 5m
+          labels:
+            severity: warning
+            service: alert-chain
+            team: platform
+          annotations:
+            summary: "{{ $labels.source }} 2 小時內未收到任何告警"
+            description: "來源 {{ $labels.source }}: 可能是告警鏈路問題或系統異常穩定。請執行 Smoke Test: python ops/scripts/alert_chain_smoke_test.py"
+
+        # -----------------------------------------------------------------
+        # Alert-chain health flag reported as 0
+        # -----------------------------------------------------------------
+        - alert: AlertChainUnhealthy
+          expr: |
+            awoooi_alert_chain_healthy == 0
+          for: 5m
+          labels:
+            severity: critical
+            service: alert-chain
+            team: platform
+          annotations:
+            summary: "告警鏈路不健康"
+            description: "{{ $labels.source }} 告警鏈路標記為不健康,最近處理失敗"
+
+    # =========================================================================
+    # Telegram notification monitoring
+    # =========================================================================
+    - name: telegram_notifications
+      rules:
+        - alert: TelegramNotificationsFailing
+          expr: |
+            sum(rate(awoooi_telegram_notifications_total{
+              status="failed"
+            }[15m])) / sum(rate(awoooi_telegram_notifications_total[15m])) > 0.3
+          for: 15m
+          labels:
+            severity: warning
+            service: telegram
+            team: platform
+          annotations:
+            summary: "Telegram 通知失敗率 > 30%"
+            description: "告警可能無法正確發送到 Telegram"
+
+    # =========================================================================
+    # Anomaly escalation frequency (ADR-037); thresholds are event counts, hence increase() rather than per-second rate()
+    # =========================================================================
+    - name: anomaly_escalation
+      rules:
+        - alert: FrequentAnomalyEscalation
+          expr: |
+            sum(increase(awoooi_anomaly_escalation_total{
+              level=~"ESCALATE|PERMANENT_FIX"
+            }[1h])) > 5
+          for: 5m
+          labels:
+            severity: warning
+            service: anomaly-counter
+            team: backend
+          annotations:
+            summary: "頻繁異常升級告警"
+            description: "過去 1 小時有超過 5 次異常升級到 ESCALATE 或 PERMANENT_FIX 級別"
+
+        - alert: PermanentFixRequired
+          expr: |
+            sum(increase(awoooi_anomaly_escalation_total{
+              level="PERMANENT_FIX"
+            }[1h])) > 0
+          for: 1m
+          labels:
+            severity: critical
+            service: anomaly-counter
+            team: backend
+          annotations:
+            summary: "需要永久修復的異常"
+            description: "有異常升級到 PERMANENT_FIX 級別,需要根本修復而非重啟"
+
+    # =========================================================================
+    # Auto-repair monitoring
+    # =========================================================================
+    - name: auto_repair
+      rules:
+        - alert: AutoRepairLowSuccessRate
+          expr: |
+            awoooi_auto_repair_success_rate < 0.3
+          for: 30m
+          labels:
+            severity: warning
+            service: auto-repair
+            team: backend
+          annotations:
+            summary: "自動修復成功率過低 (< 30%)"
+            description: "動作 {{ $labels.action }} 的成功率只有 {{ $value | humanizePercentage }},建議檢查修復邏輯"
diff --git a/k8s/monitoring/database-alerts.yaml b/k8s/monitoring/database-alerts.yaml
new file mode 100644
index 00000000..4a9c4f2e
--- /dev/null
+++ b/k8s/monitoring/database-alerts.yaml
@@ -0,0 +1,264 @@
+# =============================================================================
+# AWOOOI Database Alerts
+# =============================================================================
+# Owner: DevOps Commander
+# Version: 
v1.0
+# Date: 2026-03-29
+# ADR: ADR-037 Phase B (Database Exporters)
+#
+# Alert targets: PostgreSQL (192.168.0.188:5432) + Redis (192.168.0.188:6380)
+# =============================================================================
+
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: database-alerts
+  namespace: monitoring
+  labels:
+    release: prometheus
+    app: prometheus
+spec:
+  groups:
+    # =========================================================================
+    # PostgreSQL alerts
+    # =========================================================================
+    - name: postgresql
+      rules:
+        # -----------------------------------------------------------------
+        # Active connections near limit (> 80 per database; equals 80% only if max_connections = 100 - TODO confirm)
+        # -----------------------------------------------------------------
+        - alert: PostgreSQLConnectionPoolNearLimit
+          expr: |
+            sum(pg_stat_activity_count{state="active"}) by (datname) > 80
+          for: 5m
+          labels:
+            severity: warning
+            service: postgres
+            team: infra
+          annotations:
+            summary: "PostgreSQL 活躍連接數過高"
+            description: "Database {{ $labels.datname }} 活躍連接: {{ $value }}"
+            runbook_url: "https://awoooi.internal/runbooks/postgres-connections"
+
+        # -----------------------------------------------------------------
+        # Active connections almost exhausted (> 95 per database; same max_connections = 100 assumption - TODO confirm)
+        # -----------------------------------------------------------------
+        - alert: PostgreSQLConnectionPoolExhausted
+          expr: |
+            sum(pg_stat_activity_count{state="active"}) by (datname) > 95
+          for: 2m
+          labels:
+            severity: critical
+            service: postgres
+            team: infra
+          annotations:
+            summary: "PostgreSQL 連接池即將耗盡"
+            description: "Database {{ $labels.datname }} 活躍連接 > 95"
+
+        # -----------------------------------------------------------------
+        # Too many slow queries
+        # -----------------------------------------------------------------
+        - alert: PostgreSQLSlowQueries
+          expr: pg_slow_queries > 5
+          for: 5m
+          labels:
+            severity: warning
+            service: postgres
+            team: backend
+          annotations:
+            summary: "PostgreSQL 慢查詢數量過多"
+            description: "User {{ $labels.usename }} 有 {{ $value }} 個慢查詢 (> 1s)"
+
+        # -----------------------------------------------------------------
+        # Too many queries waiting on locks
+        # -----------------------------------------------------------------
+        - alert: PostgreSQLLockWaiting
+          expr: sum(pg_locks_waiting) > 10
+          for: 2m
+          labels:
+            severity: warning
+            service: postgres
+            team: backend
+          annotations:
+            summary: "PostgreSQL 鎖等待過多"
+            description: "{{ $value }} 個查詢正在等待鎖"
+
+        # -----------------------------------------------------------------
+        # Table bloat (dead tuple ratio > 20%)
+        # -----------------------------------------------------------------
+        - alert: PostgreSQLTableBloat
+          expr: pg_stat_user_tables_bloat_dead_tuple_ratio > 20
+          for: 30m
+          labels:
+            severity: warning
+            service: postgres
+            team: infra
+          annotations:
+            summary: "PostgreSQL 表膨脹嚴重"
+            description: "Table {{ $labels.relname }} dead tuple 比例: {{ $value }}%"
+
+        # -----------------------------------------------------------------
+        # Database unreachable
+        # -----------------------------------------------------------------
+        - alert: PostgreSQLDown
+          expr: pg_up == 0
+          for: 1m
+          labels:
+            severity: critical
+            service: postgres
+            team: infra
+          annotations:
+            summary: "PostgreSQL 無法連線"
+            description: "PostgreSQL Exporter 無法連接到資料庫"
+
+        # -----------------------------------------------------------------
+        # Long-running query (> 60s)
+        # -----------------------------------------------------------------
+        - alert: PostgreSQLLongRunningQuery
+          expr: pg_longest_query_seconds > 60
+          for: 1m
+          labels:
+            severity: warning
+            service: postgres
+            team: backend
+          annotations:
+            summary: "PostgreSQL 長時間執行查詢"
+            description: "User {{ $labels.usename }} 查詢已執行 {{ $value | humanizeDuration }}"
+
+        # -----------------------------------------------------------------
+        # High rollback ratio (> 5%)
+        # -----------------------------------------------------------------
+        - alert: PostgreSQLHighRollbackRate
+          expr: pg_stat_database_transactions_rollback_ratio > 5
+          for: 15m
+          labels:
+            severity: warning
+            service: postgres
+            team: backend
+          annotations:
+            summary: "PostgreSQL 事務回滾率過高"
+            description: "Database {{ $labels.datname }} 回滾率: {{ $value }}%"
+
+    # =========================================================================
+    # Redis alerts
+    # =========================================================================
+    - name: redis
+      rules:
+        # -----------------------------------------------------------------
+        # Memory usage high (> 85% of maxmemory); skipped when maxmemory is 0 (unlimited), which would divide by zero
+        # -----------------------------------------------------------------
+        - alert: RedisMemoryHigh
+          expr: |
+            redis_memory_used_bytes / redis_memory_max_bytes > 0.85 and redis_memory_max_bytes > 0
+          for: 5m
+          labels:
+            severity: warning
+            service: redis
+            team: infra
+          annotations:
+            summary: "Redis 記憶體使用 > 85%"
+            description: "Redis 記憶體使用: {{ $value | humanizePercentage }}"
+
+        # -----------------------------------------------------------------
+        # Memory nearly exhausted (> 95% of maxmemory); same maxmemory = 0 guard as above
+        # -----------------------------------------------------------------
+        - alert: RedisMemoryCritical
+          expr: |
+            redis_memory_used_bytes / redis_memory_max_bytes > 0.95 and redis_memory_max_bytes > 0
+          for: 2m
+          labels:
+            severity: critical
+            service: redis
+            team: infra
+          annotations:
+            summary: "Redis 記憶體即將耗盡"
+            description: "Redis 記憶體使用 > 95%"
+
+        # -----------------------------------------------------------------
+        # Cache hit rate too low (< 80%)
+        # -----------------------------------------------------------------
+        - alert: RedisCacheHitRateLow
+          expr: |
+            rate(redis_keyspace_hits_total[5m])
+            /
+            (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))
+            < 0.8
+          for: 15m
+          labels:
+            severity: warning
+            service: redis
+            team: backend
+          annotations:
+            summary: "Redis 快取命中率過低"
+            description: "命中率: {{ $value | humanizePercentage }}"
+
+        # -----------------------------------------------------------------
+        # Too many client connections
+        # -----------------------------------------------------------------
+        - alert: RedisConnectionsHigh
+          expr: redis_connected_clients > 500
+          for: 5m
+          labels:
+            severity: warning
+            service: redis
+            team: infra
+          annotations:
+            summary: "Redis 連接數過高"
+            description: "連接數: {{ $value }}"
+
+        # -----------------------------------------------------------------
+        # Frequent key evictions (per-second rate over 5m)
+        # -----------------------------------------------------------------
+        - alert: RedisEvictedKeys
+          expr: rate(redis_evicted_keys_total[5m]) > 100
+          for: 5m
+          labels:
+            severity: warning
+            service: redis
+            team: backend
+          annotations:
+            summary: "Redis Key 驅逐頻繁"
+            description: "每秒驅逐 {{ $value }} 個 key"
+
+        # -----------------------------------------------------------------
+        # Redis unreachable
+        # -----------------------------------------------------------------
+        - alert: RedisDown
+          expr: redis_up == 0
+          for: 1m
+          labels:
+            severity: critical
+            service: redis
+            team: infra
+          annotations:
+            summary: "Redis 無法連線"
+            description: "Redis Exporter 無法連接到 Redis"
+
+        # -----------------------------------------------------------------
+        # Command latency high: mean seconds per command over the last 5m (rate of both counters, summed per instance so labels match)
+        # -----------------------------------------------------------------
+        - alert: RedisLatencyHigh
+          expr: |
+            sum by (instance) (rate(redis_commands_duration_seconds_total[5m])) / sum by (instance) (rate(redis_commands_processed_total[5m])) > 0.01
+          for: 5m
+          labels:
+            severity: warning
+            service: redis
+            team: backend
+          annotations:
+            summary: "Redis 命令延遲過高"
+            description: "平均命令延遲 > 10ms"
+
+        # -----------------------------------------------------------------
+        # Blocked clients
+        # -----------------------------------------------------------------
+        - alert: RedisBlockedClients
+          expr: redis_blocked_clients > 10
+          for: 5m
+          labels:
+            severity: warning
+            service: redis
+            team: backend
+          annotations:
+            summary: "Redis 有阻塞的客戶端"
+            description: "{{ $value }} 個客戶端被阻塞 (BLPOP/BRPOP)"
diff --git a/ops/grafana/dashboards/nvidia-nemotron.json b/ops/grafana/dashboards/nvidia-nemotron.json
new file mode 100644
index 00000000..579dd7e7
--- /dev/null
+++ b/ops/grafana/dashboards/nvidia-nemotron.json
@@ -0,0 +1,1426 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": {
+          "type": "grafana",
+          "uid": "-- Grafana --"
+        },
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 
1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "NVIDIA Nemotron Tool Calling 監控儀表板 (ADR-036 / Phase 20)", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [], + "title": "Health Overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "index": 1, + "text": "OPEN" + }, + "1": { + "color": "green", + "index": 0, + "text": "CLOSED" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "nvidia_circuit_breaker_state{service=\"nemotron\"}", + "legendFormat": "Circuit Breaker", + "refId": "A" + } + ], + "title": "Circuit Breaker", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 10, + "x": 4, + "y": 1 + }, + "id": 3, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(nvidia_tool_call_requests_total[5m])) by (status)", + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "title": "Tool Call Request Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 30 + }, + { + "color": "red", + "value": 45 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + 
"h": 8, + "w": 10, + "x": 14, + "y": 1 + }, + "id": 4, + "options": { + "legend": { + "calcs": ["mean", "max", "p95"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum(rate(nvidia_tool_call_latency_seconds_bucket[5m])) by (le))", + "legendFormat": "P50", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum(rate(nvidia_tool_call_latency_seconds_bucket[5m])) by (le))", + "legendFormat": "P95", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum(rate(nvidia_tool_call_latency_seconds_bucket[5m])) by (le))", + "legendFormat": "P99", + "refId": "C" + } + ], + "title": "Tool Call Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.05 + }, + { + "color": "red", + "value": 0.1 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 5 + }, + "id": 5, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(nvidia_tool_call_requests_total{status=\"error\"}[5m])) / sum(rate(nvidia_tool_call_requests_total[5m]))", + "legendFormat": 
"Error Rate", + "refId": "A" + } + ], + "title": "Error Rate", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 6, + "panels": [], + "title": "Circuit Breaker Details", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "stepAfter", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "index": 0, + "text": "OPEN" + }, + "1": { + "color": "green", + "index": 1, + "text": "CLOSED" + }, + "2": { + "color": "yellow", + "index": 2, + "text": "HALF-OPEN" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "nvidia_circuit_breaker_state{service=\"nemotron\"}", + "legendFormat": "State", + "refId": "A" + } + ], + "title": "Circuit Breaker State Over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 8, + "options": { + "legend": { + "calcs": ["sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "increase(nvidia_circuit_breaker_state_changes_total[1h])", + "legendFormat": "{{from_state}} → {{to_state}}", + "refId": "A" + } + ], + "title": "Circuit Breaker State Changes (Last Hour)", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 9, + "panels": [], + "title": "Fallback & Degradation", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "nvidia" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#76b900", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "gemini" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#4285f4", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "ollama" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#ff6b6b", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 10, + "options": { + "legend": { + "calcs": ["mean", "sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(awoooi_ai_requests_total{provider=\"nvidia\"}[5m]))", + "legendFormat": "nvidia", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(awoooi_ai_requests_total{provider=\"gemini\"}[5m]))", + "legendFormat": "gemini", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(awoooi_ai_requests_total{provider=\"ollama\"}[5m]))", + "legendFormat": "ollama", + "refId": "C" + } + ], + "title": "AI Provider Traffic Distribution", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "nvidia" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#76b900", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "gemini" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#4285f4", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "ollama" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#ff6b6b", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 17 + }, + "id": 11, + "options": { + "displayLabels": ["percent"], + "legend": { + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "values": ["value"] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": ["sum"], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(increase(awoooi_ai_requests_total{provider=\"nvidia\"}[24h]))", + "legendFormat": "nvidia", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(increase(awoooi_ai_requests_total{provider=\"gemini\"}[24h]))", + "legendFormat": "gemini", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(increase(awoooi_ai_requests_total{provider=\"ollama\"}[24h]))", + "legendFormat": "ollama", + "refId": "C" + } + ], + "title": "Provider Distribution (24h)", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" 
+ }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 50 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 17 + }, + "id": 12, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "increase(nvidia_fallback_total{target=\"gemini\"}[1h])", + "legendFormat": "→ Gemini", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "increase(nvidia_fallback_total{target=\"ollama\"}[1h])", + "legendFormat": "→ Ollama", + "refId": "B" + } + ], + "title": "Fallback Count (Last Hour)", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 13, + "panels": [], + "title": "Tool Calling Details", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": 
"absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 14, + "options": { + "legend": { + "calcs": ["sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "increase(nvidia_tool_calls_total[1h])", + "legendFormat": "{{tool_name}}", + "refId": "A" + } + ], + "title": "Tool Calls by Tool Name (Last Hour)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5000 + }, + { + "color": "red", + "value": 10000 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "id": 15, + "options": { + "legend": { + "calcs": ["mean", "sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": 
"sum(rate(nvidia_tokens_used_total{type=\"input\"}[5m])) * 60", + "legendFormat": "Input Tokens/min", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(nvidia_tokens_used_total{type=\"output\"}[5m])) * 60", + "legendFormat": "Output Tokens/min", + "refId": "B" + } + ], + "title": "Token Usage Rate", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 16, + "panels": [], + "title": "Anomaly Frequency (ADR-037)", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "dashed" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 3 + }, + { + "color": "orange", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 35 + }, + "id": 17, + "options": { + "legend": { + "calcs": ["max", "last"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "awoooi_anomaly_frequency{service=\"nvidia-nemotron\"}", 
+ "legendFormat": "{{alert_name}}", + "refId": "A" + } + ], + "title": "NVIDIA Anomaly Frequency (24h)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "REPEAT" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "ESCALATE" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PERMANENT_FIX" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 35 + }, + "id": 18, + "options": { + "legend": { + "calcs": ["sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "increase(awoooi_anomaly_escalation_total{service=\"nvidia-nemotron\"}[1h])", + 
"legendFormat": "{{level}}", + "refId": "A" + } + ], + "title": "Escalation Events (Last Hour)", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "style": "dark", + "tags": ["nvidia", "nemotron", "tool-calling", "ai", "adr-037"], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "Asia/Taipei", + "title": "NVIDIA Nemotron Tool Calling", + "uid": "nvidia-nemotron", + "version": 1, + "weekStart": "monday" +} diff --git a/ops/scripts/alert_chain_smoke_test.py b/ops/scripts/alert_chain_smoke_test.py new file mode 100755 index 00000000..33a819e1 --- /dev/null +++ b/ops/scripts/alert_chain_smoke_test.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python3 +""" +AWOOOI 告警鏈路 E2E Smoke Test +============================== + +ADR-037 Wave A.6: 告警鏈路端到端驗證腳本 + +執行方式: + python ops/scripts/alert_chain_smoke_test.py + +驗證項目: +1. Health Endpoint 可達 +2. Alertmanager Webhook 可達 +3. Sentry Webhook 可達 +4. SignOz Webhook 可達 +5. Telegram 連通性 + +版本: v1.0 +建立: 2026-03-29 (台北時區) +建立者: Claude Code (Phase 21 ADR-037) +""" + +import asyncio +import os +import sys +from datetime import datetime + +import httpx + +# API 基礎位址 (可透過環境變數覆蓋) +API_BASE = os.getenv( + "AWOOOI_API_BASE", + "http://awoooi-api.awoooi-prod.svc.cluster.local:8000" +) + +# 本地測試用: +# API_BASE = "http://localhost:8000" + +TIMEOUT = 30 + + +async def test_health_endpoint() -> bool: + """測試 Health Endpoint""" + print("1. Testing Health Endpoint...") + + async with httpx.AsyncClient(timeout=TIMEOUT) as client: + try: + response = await client.get(f"{API_BASE}/api/v1/health") + if response.status_code == 200: + print(" ✅ Health: OK") + return True + else: + print(f" ❌ Health: HTTP {response.status_code}") + return False + except Exception as e: + print(f" ❌ Health: {e}") + return False + + +async def test_alertmanager_webhook() -> bool: + """測試 Alertmanager Webhook""" + print("2. 
Testing Alertmanager Webhook...") + + test_payload = { + "version": "4", + "status": "firing", + "alerts": [{ + "status": "firing", + "labels": { + "alertname": "E2E_SMOKE_TEST", + "severity": "info", + "service": "smoke-test", + "namespace": "test", + }, + "annotations": { + "summary": "E2E Smoke Test - Please Ignore", + "description": f"Auto test @ {datetime.now().isoformat()}", + }, + "startsAt": datetime.now().isoformat() + "Z", + }] + } + + async with httpx.AsyncClient(timeout=TIMEOUT) as client: + try: + response = await client.post( + f"{API_BASE}/api/v1/webhooks/alertmanager", + json=test_payload, + ) + if response.status_code == 200: + print(" ✅ Alertmanager Webhook: OK") + return True + else: + print(f" ❌ Alertmanager Webhook: HTTP {response.status_code}") + print(f" Response: {response.text[:200]}") + return False + except Exception as e: + print(f" ❌ Alertmanager Webhook: {e}") + return False + + +async def test_sentry_webhook() -> bool: + """測試 Sentry Webhook""" + print("3. 
Testing Sentry Webhook...") + + # 使用唯一 ID 避免去重 + test_id = f"smoke-test-{datetime.now().strftime('%Y%m%d%H%M%S%f')}" + + test_payload = { + "action": "triggered", + "data": { + "issue": { + "id": test_id, + "title": "E2E Smoke Test Error", + "level": "info", # 使用 info 避免觸發實際告警 + "culprit": "smoke_test.py:test", + "project": {"slug": "awoooi-api"}, + "firstSeen": datetime.now().isoformat(), + "count": 1, + }, + "event": { + "message": "E2E Smoke Test - Please Ignore", + "platform": "python", + }, + }, + } + + async with httpx.AsyncClient(timeout=TIMEOUT) as client: + try: + response = await client.post( + f"{API_BASE}/api/v1/webhooks/sentry/error", + json=test_payload, + ) + if response.status_code == 200: + result = response.json() + status = result.get("status") + if status in ["accepted", "deduplicated", "ignored"]: + print(f" ✅ Sentry Webhook: OK (status={status})") + return True + print(f" ❌ Sentry Webhook: HTTP {response.status_code}") + return False + except Exception as e: + print(f" ❌ Sentry Webhook: {e}") + return False + + +async def test_signoz_webhook() -> bool: + """測試 SignOz Webhook""" + print("4. 
Testing SignOz Webhook...") + + test_payload = { + "alertname": "E2E_SMOKE_TEST", + "status": "firing", + "labels": { + "alertname": "E2E_SMOKE_TEST", + "severity": "info", + "service_name": "smoke-test", + "source": "signoz", + }, + "annotations": { + "summary": "E2E Smoke Test - Please Ignore", + "description": f"Auto test @ {datetime.now().isoformat()}", + }, + "startsAt": datetime.now().isoformat() + "Z", + } + + async with httpx.AsyncClient(timeout=TIMEOUT) as client: + try: + response = await client.post( + f"{API_BASE}/api/v1/webhooks/signoz/alert", + json=test_payload, + ) + if response.status_code == 200: + result = response.json() + if result.get("status") == "ok": + print(" ✅ SignOz Webhook: OK") + return True + print(f" ❌ SignOz Webhook: HTTP {response.status_code}") + return False + except Exception as e: + print(f" ❌ SignOz Webhook: {e}") + return False + + +async def test_signoz_health() -> bool: + """測試 SignOz Webhook Health""" + print("5. Testing SignOz Webhook Health...") + + async with httpx.AsyncClient(timeout=TIMEOUT) as client: + try: + response = await client.get( + f"{API_BASE}/api/v1/webhooks/signoz/health" + ) + if response.status_code == 200: + print(" ✅ SignOz Health: OK") + return True + else: + print(f" ❌ SignOz Health: HTTP {response.status_code}") + return False + except Exception as e: + print(f" ❌ SignOz Health: {e}") + return False + + +async def test_telegram_connectivity() -> bool: + """測試 Telegram 連通性""" + print("6. 
Testing Telegram Connectivity...") + + async with httpx.AsyncClient(timeout=TIMEOUT) as client: + try: + response = await client.get(f"{API_BASE}/api/v1/telegram/status") + if response.status_code == 200: + data = response.json() + if data.get("connected"): + print(" ✅ Telegram: Connected") + return True + else: + print(" ⚠️ Telegram: Not Connected (endpoint reachable)") + return True # 端點可達即可 + elif response.status_code == 404: + print(" ⚠️ Telegram: Endpoint not found (skipped)") + return True # 不影響整體測試 + else: + print(f" ❌ Telegram: HTTP {response.status_code}") + return False + except Exception as e: + print(f" ⚠️ Telegram: {e} (skipped)") + return True # 不影響整體測試 + + +async def main(): + """執行所有 Smoke Test""" + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + print("=" * 60) + print(" AWOOOI Alert Chain E2E Smoke Test") + print(f" Time: {now}") + print(f" Target: {API_BASE}") + print("=" * 60) + print() + + # 依序執行測試 (非並行,方便除錯) + results = [] + results.append(await test_health_endpoint()) + results.append(await test_alertmanager_webhook()) + results.append(await test_sentry_webhook()) + results.append(await test_signoz_webhook()) + results.append(await test_signoz_health()) + results.append(await test_telegram_connectivity()) + + print() + print("=" * 60) + passed = sum(results) + total = len(results) + + if passed == total: + print(f" ✅ ALL PASSED ({passed}/{total})") + print("=" * 60) + sys.exit(0) + else: + failed = total - passed + print(f" ❌ FAILED ({failed}/{total} tests failed)") + print("=" * 60) + sys.exit(1) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/ops/signoz/alerting/rules.yaml b/ops/signoz/alerting/rules.yaml new file mode 100644 index 00000000..75461f1d --- /dev/null +++ b/ops/signoz/alerting/rules.yaml @@ -0,0 +1,165 @@ +# SignOz 告警規則配置 +# 負責人: DevOps Commander +# 版本: v1.0 +# 日期: 2026-03-29 +# ADR: ADR-037 (監控增強架構) +# +# 部署目標: 192.168.0.188 SignOz (Docker) +# Webhook: 
http://awoooi-api.awoooi-prod:8000/api/v1/webhooks/signoz
+
+groups:
+  # =========================================================================
+  # API error-rate alerts
+  # =========================================================================
+  - name: api_errors
+    rules:
+      - alert: APIHighErrorRate
+        expr: |
+          sum(rate(signoz_spans_total{
+            service_name="awoooi-api",
+            status_code=~"5.."
+          }[5m])) by (service_name)
+          /
+          sum(rate(signoz_spans_total{
+            service_name="awoooi-api"
+          }[5m])) by (service_name)
+          > 0.05
+        for: 5m
+        labels:
+          severity: critical
+          source: signoz
+          team: backend
+        annotations:
+          summary: "API 錯誤率 > 5%"
+          description: "服務 {{ $labels.service_name }} 錯誤率: {{ $value | humanizePercentage }}"
+          runbook_url: "https://awoooi.internal/runbooks/api-error-rate"
+
+  # =========================================================================
+  # Latency alerts (thresholds are compared against the histogram's native unit; the annotations read it as seconds)
+  # =========================================================================
+  - name: latency
+    rules:
+      - alert: APIHighLatencyP99
+        expr: |
+          histogram_quantile(0.99,
+            sum(rate(signoz_spans_duration_bucket{
+              service_name="awoooi-api"
+            }[5m])) by (le, service_name)
+          ) > 2
+        for: 5m
+        labels:
+          severity: warning
+          source: signoz
+          team: backend
+        annotations:
+          summary: "API P99 延遲 > 2s"
+          description: "服務 {{ $labels.service_name }} P99: {{ $value }}s"
+
+      - alert: APIHighLatencyP95
+        expr: |
+          histogram_quantile(0.95,
+            sum(rate(signoz_spans_duration_bucket{
+              service_name="awoooi-api"
+            }[5m])) by (le, service_name)
+          ) > 1
+        for: 10m
+        labels:
+          severity: warning
+          source: signoz
+          team: backend
+        annotations:
+          summary: "API P95 延遲 > 1s"
+          description: "服務 {{ $labels.service_name }} P95: {{ $value }}s"
+
+  # =========================================================================
+  # Trace anomaly alerts ("or vector(0)" lets NoTracesReceived fire even when no span series exist at all)
+  # =========================================================================
+  - name: traces
+    rules:
+      - alert: NoTracesReceived
+        expr: |
+          (sum(rate(signoz_spans_total[15m])) or vector(0)) == 0
+        for: 15m
+        labels:
+          severity: warning
+          source: signoz
+          team: platform
+        annotations:
+          summary: "15 分鐘內無 Trace 數據"
+          description: "可能是 OTEL Collector 或應用程式問題,請檢查 192.168.0.188:24318 端點"
+      # Drop ratio below is failed / (sent + failed): the share of all export attempts that failed.
+      - alert: HighSpanDropRate
+        expr: |
+          sum(rate(otelcol_exporter_send_failed_spans[5m]))
+          /
+          (sum(rate(otelcol_exporter_sent_spans[5m])) + sum(rate(otelcol_exporter_send_failed_spans[5m])))
+          > 0.01
+        for: 5m
+        labels:
+          severity: warning
+          source: signoz
+          team: platform
+        annotations:
+          summary: "Span 丟棄率 > 1%"
+          description: "OTEL Collector 可能有性能問題或目標不可達"
+
+      # ADR-037 Phase E: long-running trace alert. NOTE(review): grouping by trace_id presumes a per-trace series exists (unbounded label cardinality) — confirm.
+      - alert: LongRunningTrace
+        expr: |
+          max(signoz_spans_duration{
+            service_name="awoooi-api",
+            status_code!~"5.."
+          }) by (trace_id, operation) > 10
+        for: 1m
+        labels:
+          severity: critical
+          source: signoz
+          team: backend
+        annotations:
+          summary: "Trace 執行超過 10 秒"
+          description: "操作 {{ $labels.operation }} 執行時間 {{ $value }}s (trace: {{ $labels.trace_id }})"
+          runbook_url: "https://awoooi.internal/runbooks/long-trace"
+
+  # =========================================================================
+  # NVIDIA Nemotron monitoring (ADR-036)
+  # =========================================================================
+  - name: nvidia_api
+    rules:
+      - alert: NVIDIAHighLatency
+        expr: |
+          histogram_quantile(0.95,
+            sum(rate(signoz_spans_duration_bucket{
+              service_name="awoooi-api",
+              operation=~".*nvidia.*"
+            }[5m])) by (le)
+          ) > 5
+        for: 5m
+        labels:
+          severity: warning
+          source: signoz
+          team: ai
+        annotations:
+          summary: "NVIDIA API P95 延遲 > 5s"
+          description: "Tool Calling 可能有性能問題"
+
+      - alert: NVIDIAHighErrorRate
+        expr: |
+          sum(rate(signoz_spans_total{
+            service_name="awoooi-api",
+            operation=~".*nvidia.*",
+            status_code=~"5.."
+          }[5m]))
+          /
+          sum(rate(signoz_spans_total{
+            service_name="awoooi-api",
+            operation=~".*nvidia.*"
+          }[5m]))
+          > 0.1
+        for: 5m
+        labels:
+          severity: warning
+          source: signoz
+          team: ai
+        annotations:
+          summary: "NVIDIA API 錯誤率 > 10%"
+          description: "可能需要 Fallback 到 Ollama"