From b43e1f18185ef7a9c7f3cdc4f692d17b5866afcf Mon Sep 17 00:00:00 2001 From: OG T Date: Thu, 9 Apr 2026 11:49:28 +0800 Subject: [PATCH] =?UTF-8?q?feat(rules):=20L2-2=20alerts-unified=20?= =?UTF-8?q?=E2=80=94=20=E8=A3=9C=E5=85=85=2014=20=E6=A2=9D=20Prometheus=20?= =?UTF-8?q?=E5=91=8A=E8=AD=A6=E8=A6=8F=E5=89=87=20+=20target=5Fdown=20?= =?UTF-8?q?=E8=87=AA=E5=8B=95=E4=BF=AE=E5=BE=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 新增規則: - postgresql_down / postgresql_connection_pool / postgresql_slow_queries - redis_down / ollama_down / minio_down / minio_disk_high / harbor_down - k3s_node_down / awoooi_api_down / alert_chain_broken / nvidia_circuit_breaker 修正: - target_down: kubectl_command 從診斷改為自動重啟 exporter (docker restart / systemctl) Co-Authored-By: Claude Sonnet 4.6 --- apps/api/alert_rules.yaml | 351 +++++++++++++++++++++++++++++++++++++- 1 file changed, 342 insertions(+), 9 deletions(-) diff --git a/apps/api/alert_rules.yaml b/apps/api/alert_rules.yaml index 12fad747..c0f09a97 100644 --- a/apps/api/alert_rules.yaml +++ b/apps/api/alert_rules.yaml @@ -48,26 +48,26 @@ rules: - id: target_down priority: 20 - description: Prometheus scrape target 下線 + description: Prometheus scrape target 下線 — 自動重啟 exporter match: alertname: - TargetDown - InstanceDown response: - action_title: "確認 {job} ({instance}) 服務存活" - description: "⚙️ 規則匹配: Prometheus 無法抓取 {instance} ({job}) 指標。常見原因: 主機離線、exporter crash、防火牆封鎖。" + action_title: "重啟 {job} exporter on {host}" + description: "⚙️ 規則匹配: Prometheus 無法抓取 {instance} ({job}) 指標。自動重啟主機上的 exporter container。" suggested_action: RESTART_DEPLOYMENT - kubectl_command: "ssh {host} 'systemctl status node_exporter 2>/dev/null || docker ps | grep exporter'" - estimated_downtime: "監控盲區持續中" + kubectl_command: "ssh {host} 'docker restart $(docker ps -a --filter name=exporter --format \"{{.Names}}\" | head -1) 2>/dev/null || systemctl restart node_exporter 2>/dev/null || systemctl restart 
prometheus-node-exporter'" + estimated_downtime: "~30s" risk: medium responsibility: INFRA - responsibility_reasoning: "Prometheus scrape 目標下線屬基礎設施監控範疇" + responsibility_reasoning: "Prometheus scrape 目標下線屬基礎設施監控範疇,自動重啟 exporter" secondary_teams: [] optimization: - type: MONITORING - description: "確認 exporter 進程是否存活" - command: "ssh {host} 'ps aux | grep exporter | grep -v grep'" - reasoning: "[規則匹配] Prometheus target 下線,先 SSH 確認主機存活再重啟 exporter。" + description: "確認 exporter 重啟後可被 Prometheus scrape" + command: "ssh {host} 'curl -s http://localhost:{port}/metrics | head -3'" + reasoning: "[規則匹配] Prometheus target 下線,SSH 到主機重啟 exporter container 或 systemd service。" # ── K8s Pod 層 ────────────────────────────────────────────── @@ -177,6 +177,339 @@ rules: command: "# 調整 initialDelaySeconds >= 應用啟動時間" reasoning: "[規則匹配] 先查 previous log 確認 crash 原因,再決定修復策略。" + # ── 資料庫層 ───────────────────────────────────────────────── + + - id: postgresql_down + priority: 70 + description: PostgreSQL 服務下線 + match: + alertname: + - PostgreSQLDown + message: + - postgresql + - postgres + - pg down + response: + action_title: "重啟 PostgreSQL {target}" + description: "⚙️ 規則匹配: PostgreSQL ({instance}) 無法連線。常見原因: 程序崩潰、磁碟空間不足、連線數超限。" + suggested_action: RESTART_DEPLOYMENT + kubectl_command: "kubectl rollout restart deployment/postgresql -n {namespace}" + estimated_downtime: "~2 min" + risk: critical + responsibility: DB + responsibility_reasoning: "PostgreSQL 下線屬資料庫團隊責任,需立即確認資料完整性" + secondary_teams: [INFRA, BE] + optimization: + - type: HEALTH_CHECK + description: "確認 PostgreSQL 連線與資料完整性" + command: "kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c 'SELECT 1'" + reasoning: "[規則匹配] PostgreSQL 下線影響所有依賴服務,優先重啟恢復,同時確認資料無損。" + + - id: postgresql_connection_pool + priority: 75 + description: PostgreSQL 連線池耗盡或接近上限 + match: + alertname: + - PostgreSQLConnectionPoolNearLimit + - PostgreSQLConnectionPoolExhausted + message: + - connection pool + - connections + - pgbouncer + 
response: + action_title: "清理 PostgreSQL 閒置連線" + description: "⚙️ 規則匹配: PostgreSQL 連線池使用率過高,可能導致新請求被拒絕。" + suggested_action: RESTART_DEPLOYMENT + kubectl_command: "kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c \"SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE state = 'idle' AND state_change < NOW() - INTERVAL '5 minutes';\"" + estimated_downtime: "0" + risk: critical + responsibility: DB + responsibility_reasoning: "連線池管理屬資料庫設定範疇" + secondary_teams: [BE] + optimization: + - type: CONNECTION_POOL + description: "調整 max_connections 或啟用 PgBouncer 連線池" + command: "kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c 'SHOW max_connections;'" + reasoning: "[規則匹配] 清理閒置連線是最快恢復手段,同時需排查連線洩漏。" + + - id: postgresql_slow_queries + priority: 80 + description: PostgreSQL 慢查詢告警 + match: + alertname: + - PostgreSQLSlowQueries + - PostgreSQLLockWaiting + message: + - slow query + - lock wait + - deadlock + response: + action_title: "診斷 PostgreSQL 慢查詢 + 索引優化" + description: "⚙️ 規則匹配: PostgreSQL 存在慢查詢或鎖等待,影響系統整體性能。" + suggested_action: RESTART_DEPLOYMENT + kubectl_command: "kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c \"SELECT pid, query, state, wait_event_type, wait_event FROM pg_stat_activity WHERE state != 'idle' ORDER BY query_start;\"" + estimated_downtime: "0" + risk: medium + responsibility: DB + responsibility_reasoning: "慢查詢優化屬資料庫效能調優範疇" + secondary_teams: [BE] + optimization: + - type: INDEX + description: "使用 EXPLAIN ANALYZE 找出缺少索引的查詢" + command: "kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c 'SELECT * FROM pg_stat_user_tables ORDER BY seq_scan DESC LIMIT 10;'" + reasoning: "[規則匹配] 先找出阻塞查詢,必要時 pg_terminate_backend 解除鎖定。" + + # ── 基礎設施服務層 ────────────────────────────────────────── + + - id: redis_down + priority: 85 + description: Redis 服務下線 + match: + alertname: + - RedisDown + message: + - redis + - cache down + response: + action_title: "重啟 Redis {target}" 
+ description: "⚙️ 規則匹配: Redis ({instance}) 無法連線。影響 Session 管理、去重快取、AI Router 狀態。" + suggested_action: RESTART_DEPLOYMENT + kubectl_command: "kubectl rollout restart deployment/redis -n {namespace}" + estimated_downtime: "~30s" + risk: critical + responsibility: INFRA + responsibility_reasoning: "Redis 屬基礎設施快取層,下線影響多個上層服務" + secondary_teams: [BE] + optimization: + - type: HEALTH_CHECK + description: "確認 Redis 連線" + command: "kubectl exec -n {namespace} deployment/redis -- redis-cli ping" + reasoning: "[規則匹配] Redis 下線會導致去重失效和 AI Router 狀態丟失,需立即重啟。" + + - id: ollama_down + priority: 90 + description: Ollama AI 服務下線 + match: + alertname: + - OllamaDown + message: + - ollama + - llm down + - ai service + response: + action_title: "重啟 Ollama 服務 on {host}" + description: "⚙️ 規則匹配: Ollama ({instance}) 無法連線。影響 AI 規則自動生成和本地推理。" + suggested_action: RESTART_DEPLOYMENT + kubectl_command: "ssh {host} 'systemctl restart ollama || docker restart ollama'" + estimated_downtime: "~2 min (model reload)" + risk: medium + responsibility: INFRA + responsibility_reasoning: "Ollama 屬 AI 推理基礎設施,由基礎設施團隊管理" + secondary_teams: [] + optimization: + - type: HEALTH_CHECK + description: "確認 Ollama 狀態和已載入模型" + command: "curl -s http://{host}:11434/api/tags | jq '.models[].name'" + reasoning: "[規則匹配] Ollama 下線觸發 AI Router fallback 至 Gemini,重啟恢復本地推理能力。" + + - id: minio_down + priority: 95 + description: MinIO 物件儲存下線 + match: + alertname: + - MinioDown + message: + - minio + - s3 + - object storage + response: + action_title: "重啟 MinIO {target}" + description: "⚙️ 規則匹配: MinIO ({instance}) 無法連線。影響靜態資源和備份儲存。" + suggested_action: RESTART_DEPLOYMENT + kubectl_command: "ssh {host} 'docker restart minio'" + estimated_downtime: "~1 min" + risk: critical + responsibility: INFRA + responsibility_reasoning: "MinIO 屬物件儲存基礎設施" + secondary_teams: [] + optimization: + - type: DISK_CHECK + description: "確認磁碟空間充足" + command: "ssh {host} 'df -h /data/minio'" + reasoning: "[規則匹配] MinIO 下線需先確認磁碟空間,再重啟服務。" + + - id: 
minio_disk_high + priority: 96 + description: MinIO 磁碟使用率過高 + match: + alertname: + - MinioDiskUsageHigh + - MinioDiskUsageCritical + message: + - disk usage + - disk full + - storage + response: + action_title: "清理 MinIO 過期資料 on {host}" + description: "⚙️ 規則匹配: MinIO 磁碟使用率過高,需清理舊資料或擴展儲存空間。" + suggested_action: RESTART_DEPLOYMENT + kubectl_command: "ssh {host} 'df -h /data/minio && du -sh /data/minio/* | sort -rh | head -10'" + estimated_downtime: "0" + risk: critical + responsibility: INFRA + responsibility_reasoning: "磁碟空間管理屬基礎設施團隊責任" + secondary_teams: [] + optimization: + - type: CLEANUP + description: "清理 MinIO 舊備份和 lifecycle policy" + command: "mc ilm rule add --expire-days 30 local/<bucket>" + reasoning: "[規則匹配] 磁碟滿會導致寫入失敗,需立即清理最大的目錄。" + + - id: harbor_down + priority: 97 + description: Harbor Registry 下線 + match: + alertname: + - HarborDown + message: + - harbor + - registry + - docker registry + response: + action_title: "重啟 Harbor Registry on {host}" + description: "⚙️ 規則匹配: Harbor ({instance}) 無法連線。影響 CD 部署流程。" + suggested_action: RESTART_DEPLOYMENT + kubectl_command: "ssh {host} 'cd /data/harbor && docker-compose up -d'" + estimated_downtime: "~2 min" + risk: critical + responsibility: INFRA + responsibility_reasoning: "Harbor 是 CD 部署的核心依賴,屬基礎設施團隊責任" + secondary_teams: [] + optimization: + - type: HEALTH_CHECK + description: "確認 Harbor 各組件狀態" + command: "ssh {host} 'cd /data/harbor && docker-compose ps'" + reasoning: "[規則匹配] Harbor 下線會阻塞所有 CD 部署,需立即重啟。" + + # ── K8s 叢集層 ────────────────────────────────────────────── + + - id: k3s_node_down + priority: 100 + description: K3s 節點下線 + match: + alertname: + - K3sNodeDown + - K3sVIPDown + message: + - node down + - node not ready + - k3s + response: + action_title: "確認 K3s 節點 {target} 狀態" + description: "⚙️ 規則匹配: K3s 節點下線,影響叢集可用性和 Pod 調度。" + suggested_action: RESTART_DEPLOYMENT + kubectl_command: "kubectl get nodes -o wide && kubectl describe node {target}" + estimated_downtime: "依節點恢復時間" + risk: critical + 
responsibility: INFRA + responsibility_reasoning: "K3s 叢集節點管理屬基礎設施團隊責任" + secondary_teams: [] + optimization: + - type: NODE_DRAIN + description: "先 drain 節點確保 Pod 安全遷移" + command: "kubectl drain {target} --ignore-daemonsets --delete-emptydir-data" + reasoning: "[規則匹配] 節點下線需先確認主機可達性,必要時手動遷移 workload。" + + - id: awoooi_api_down + priority: 105 + description: AWOOOI API 服務下線 + match: + alertname: + - AWOOOIApiDown + - OpenClawDown + message: + - awoooi api + - openclaw + - api down + response: + action_title: "重啟 AWOOOI API deployment" + description: "⚙️ 規則匹配: AWOOOI API 無法連線。影響所有告警處理和 AI 決策流程。" + suggested_action: RESTART_DEPLOYMENT + kubectl_command: "kubectl rollout restart deployment/awoooi-api -n awoooi" + estimated_downtime: "~1 min" + risk: critical + responsibility: BE + responsibility_reasoning: "AWOOOI API 是核心服務,屬後端團隊直接責任" + secondary_teams: [INFRA] + optimization: + - type: HEALTH_CHECK + description: "確認 API Pod 狀態和最近 log" + command: "kubectl get pods -n awoooi && kubectl logs -n awoooi deployment/awoooi-api --tail=50" + reasoning: "[規則匹配] AWOOOI API 下線需立即重啟,同時查 Pod log 確認根因。" + + # ── 告警鏈路監控 ──────────────────────────────────────────── + + - id: alert_chain_broken + priority: 110 + description: 告警鏈路中斷 + match: + alertname: + - AlertChainBroken_Alertmanager + - AlertChainBroken_Sentry + - AlertChainBroken_SignOz + - AlertChainUnhealthy + - NoAlertsReceived2Hours + message: + - alert chain + - alertmanager + - no alerts + response: + action_title: "診斷告警鏈路中斷" + description: "⚙️ 規則匹配: 告警鏈路異常,可能導致真實告警無法送達 Telegram。" + suggested_action: RESTART_DEPLOYMENT + kubectl_command: "kubectl get pods -n monitoring && curl -s http://192.168.0.120:9093/api/v2/status | jq '.uptime'" + estimated_downtime: "監控盲區持續中" + risk: critical + responsibility: INFRA + responsibility_reasoning: "告警鏈路屬基礎設施監控體系,需立即修復確保可觀測性" + secondary_teams: [BE] + optimization: + - type: E2E_TEST + description: "發送測試告警驗證整條鏈路" + command: "curl -X POST http://192.168.0.125:32334/api/v1/test-alert 
-H 'Content-Type: application/json' -d '{\"test\": true}'" + reasoning: "[規則匹配] 告警鏈路中斷等同監控失明,最高優先修復。" + + # ── GPU / AI 基礎設施 ──────────────────────────────────────── + + - id: nvidia_circuit_breaker + priority: 115 + description: NVIDIA/Nemotron 熔斷器開啟 + match: + alertname: + - NvidiaCircuitBreakerOpen + - NvidiaToolCallingHighErrorRate + - NvidiaToolCallingHighLatency + message: + - circuit breaker + - nvidia + - nemotron + - tool calling + response: + action_title: "確認 NVIDIA API 熔斷狀態" + description: "⚙️ 規則匹配: NVIDIA/Nemotron 熔斷器開啟或錯誤率過高,AI Router 已自動降級。" + suggested_action: RESTART_DEPLOYMENT + kubectl_command: "curl -s http://192.168.0.125:32334/api/v1/ai-router/status | jq '.providers'" + estimated_downtime: "0 (已自動 fallback)" + risk: medium + responsibility: BE + responsibility_reasoning: "AI Provider 熔斷管理屬後端 AI Router 責任範圍" + secondary_teams: [] + optimization: + - type: CIRCUIT_BREAKER_RESET + description: "等待熔斷器自動恢復 (half-open 狀態)" + command: "curl -s http://192.168.0.125:32334/api/v1/ai-router/reset -X POST" + reasoning: "[規則匹配] AI Router 已自動降級至備援 Provider,監控熔斷器恢復狀態即可。" + # ── 通用兜底 ──────────────────────────────────────────────── - id: generic_fallback