feat(rules): L2-2 alerts-unified — 補充 14 條 Prometheus 告警規則 + target_down 自動修復

新增規則: - postgresql_down / postgresql_connection_pool / postgresql_slow_queries - redis_down / ollama_down / minio_down / minio_disk_high / harbor_down - k3s_node_down / awoooi_api_down / alert_chain_broken / nvidia_circuit_breaker 修正: - target_down: kubectl_command 從診斷改為自動重啟 exporter (docker restart / systemctl) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-09 11:49:28 +08:00
parent afe52c2c70
commit b43e1f1818
1 changed files with 342 additions and 9 deletions
--- a/apps/api/alert_rules.yaml
+++ b/apps/api/alert_rules.yaml
@@ -48,26 +48,26 @@ rules:

  - id: target_down
    priority: 20
-    description: Prometheus scrape target 下線
+    description: Prometheus scrape target 下線 — 自動重啟 exporter
    match:
      alertname:
        - TargetDown
        - InstanceDown
    response:
-      action_title: "確認 {job} ({instance}) 服務存活"
-      description: "⚙️ 規則匹配: Prometheus 無法抓取 {instance} ({job}) 指標。常見原因: 主機離線、exporter crash、防火牆封鎖。"
+      action_title: "重啟 {job} exporter on {host}"
+      description: "⚙️ 規則匹配: Prometheus 無法抓取 {instance} ({job}) 指標。自動重啟主機上的 exporter container。"
      suggested_action: RESTART_DEPLOYMENT
-      kubectl_command: "ssh {host} 'systemctl status node_exporter 2>/dev/null || docker ps | grep exporter'"
-      estimated_downtime: "監控盲區持續中"
+      kubectl_command: "ssh {host} 'docker restart $(docker ps -a --filter name=exporter --format \"{{.Names}}\" | head -1) 2>/dev/null || systemctl restart node_exporter 2>/dev/null || systemctl restart prometheus-node-exporter'"
+      estimated_downtime: "~30s"
      risk: medium
      responsibility: INFRA
-      responsibility_reasoning: "Prometheus scrape 目標下線屬基礎設施監控範疇"
+      responsibility_reasoning: "Prometheus scrape 目標下線屬基礎設施監控範疇，自動重啟 exporter"
      secondary_teams: []
      optimization:
        - type: MONITORING
-          description: "確認 exporter 進程是否存活"
-          command: "ssh {host} 'ps aux | grep exporter | grep -v grep'"
-      reasoning: "[規則匹配] Prometheus target 下線，先 SSH 確認主機存活再重啟 exporter。"
+          description: "確認 exporter 重啟後可被 Prometheus scrape"
+          command: "ssh {host} 'curl -s http://localhost:{port}/metrics | head -3'"
+      reasoning: "[規則匹配] Prometheus target 下線，SSH 到主機重啟 exporter container 或 systemd service。"

  # ── K8s Pod 層 ──────────────────────────────────────────────

@@ -177,6 +177,339 @@ rules:
          command: "# 調整 initialDelaySeconds >= 應用啟動時間"
      reasoning: "[規則匹配] 先查 previous log 確認 crash 原因，再決定修復策略。"

+  # ── 資料庫層 ─────────────────────────────────────────────────
+
+  - id: postgresql_down
+    priority: 70
+    description: PostgreSQL 服務下線
+    match:
+      alertname:
+        - PostgreSQLDown
+      message:
+        - postgresql
+        - postgres
+        - pg down
+    response:
+      action_title: "重啟 PostgreSQL {target}"
+      description: "⚙️ 規則匹配: PostgreSQL ({instance}) 無法連線。常見原因: 程序崩潰、磁碟空間不足、連線數超限。"
+      suggested_action: RESTART_DEPLOYMENT
+      kubectl_command: "kubectl rollout restart deployment/postgresql -n {namespace}"
+      estimated_downtime: "~2 min"
+      risk: critical
+      responsibility: DB
+      responsibility_reasoning: "PostgreSQL 下線屬資料庫團隊責任，需立即確認資料完整性"
+      secondary_teams: [INFRA, BE]
+      optimization:
+        - type: HEALTH_CHECK
+          description: "確認 PostgreSQL 連線與資料完整性"
+          command: "kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c 'SELECT 1'"
+      reasoning: "[規則匹配] PostgreSQL 下線影響所有依賴服務，優先重啟恢復，同時確認資料無損。"
+
+  - id: postgresql_connection_pool
+    priority: 75
+    description: PostgreSQL 連線池耗盡或接近上限
+    match:
+      alertname:
+        - PostgreSQLConnectionPoolNearLimit
+        - PostgreSQLConnectionPoolExhausted
+      message:
+        - connection pool
+        - connections
+        - pgbouncer
+    response:
+      action_title: "清理 PostgreSQL 閒置連線"
+      description: "⚙️ 規則匹配: PostgreSQL 連線池使用率過高，可能導致新請求被拒絕。"
+      suggested_action: RESTART_DEPLOYMENT
+      kubectl_command: "kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c 'SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE state = ''idle'' AND state_change < NOW() - INTERVAL ''5 minutes'';'"
+      estimated_downtime: "0"
+      risk: critical
+      responsibility: DB
+      responsibility_reasoning: "連線池管理屬資料庫設定範疇"
+      secondary_teams: [BE]
+      optimization:
+        - type: CONNECTION_POOL
+          description: "調整 max_connections 或啟用 PgBouncer 連線池"
+          command: "kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c 'SHOW max_connections;'"
+      reasoning: "[規則匹配] 清理閒置連線是最快恢復手段，同時需排查連線洩漏。"
+
+  - id: postgresql_slow_queries
+    priority: 80
+    description: PostgreSQL 慢查詢告警
+    match:
+      alertname:
+        - PostgreSQLSlowQueries
+        - PostgreSQLLockWaiting
+      message:
+        - slow query
+        - lock wait
+        - deadlock
+    response:
+      action_title: "診斷 PostgreSQL 慢查詢 + 索引優化"
+      description: "⚙️ 規則匹配: PostgreSQL 存在慢查詢或鎖等待，影響系統整體性能。"
+      suggested_action: RESTART_DEPLOYMENT
+      kubectl_command: "kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c 'SELECT pid, query, state, wait_event_type, wait_event FROM pg_stat_activity WHERE state != ''idle'' ORDER BY query_start;'"
+      estimated_downtime: "0"
+      risk: medium
+      responsibility: DB
+      responsibility_reasoning: "慢查詢優化屬資料庫效能調優範疇"
+      secondary_teams: [BE]
+      optimization:
+        - type: INDEX
+          description: "使用 EXPLAIN ANALYZE 找出缺少索引的查詢"
+          command: "kubectl exec -n {namespace} deployment/postgresql -- psql -U postgres -c 'SELECT * FROM pg_stat_user_tables ORDER BY seq_scan DESC LIMIT 10;'"
+      reasoning: "[規則匹配] 先找出阻塞查詢，必要時 pg_terminate_backend 解除鎖定。"
+
+  # ── 基礎設施服務層 ──────────────────────────────────────────
+
+  - id: redis_down
+    priority: 85
+    description: Redis 服務下線
+    match:
+      alertname:
+        - RedisDown
+      message:
+        - redis
+        - cache down
+    response:
+      action_title: "重啟 Redis {target}"
+      description: "⚙️ 規則匹配: Redis ({instance}) 無法連線。影響 Session 管理、去重快取、AI Router 狀態。"
+      suggested_action: RESTART_DEPLOYMENT
+      kubectl_command: "kubectl rollout restart deployment/redis -n {namespace}"
+      estimated_downtime: "~30s"
+      risk: critical
+      responsibility: INFRA
+      responsibility_reasoning: "Redis 屬基礎設施快取層，下線影響多個上層服務"
+      secondary_teams: [BE]
+      optimization:
+        - type: HEALTH_CHECK
+          description: "確認 Redis 連線"
+          command: "kubectl exec -n {namespace} deployment/redis -- redis-cli ping"
+      reasoning: "[規則匹配] Redis 下線會導致去重失效和 AI Router 狀態丟失，需立即重啟。"
+
+  - id: ollama_down
+    priority: 90
+    description: Ollama AI 服務下線
+    match:
+      alertname:
+        - OllamaDown
+      message:
+        - ollama
+        - llm down
+        - ai service
+    response:
+      action_title: "重啟 Ollama 服務 on {host}"
+      description: "⚙️ 規則匹配: Ollama ({instance}) 無法連線。影響 AI 規則自動生成和本地推理。"
+      suggested_action: RESTART_DEPLOYMENT
+      kubectl_command: "ssh {host} 'systemctl restart ollama || docker restart ollama'"
+      estimated_downtime: "~2 min (model reload)"
+      risk: medium
+      responsibility: INFRA
+      responsibility_reasoning: "Ollama 屬 AI 推理基礎設施，由基礎設施團隊管理"
+      secondary_teams: []
+      optimization:
+        - type: HEALTH_CHECK
+          description: "確認 Ollama 狀態和已載入模型"
+          command: "curl -s http://{host}:11434/api/tags | jq '.models[].name'"
+      reasoning: "[規則匹配] Ollama 下線觸發 AI Router fallback 至 Gemini，重啟恢復本地推理能力。"
+
+  - id: minio_down
+    priority: 95
+    description: MinIO 物件儲存下線
+    match:
+      alertname:
+        - MinioDown
+      message:
+        - minio
+        - s3
+        - object storage
+    response:
+      action_title: "重啟 MinIO {target}"
+      description: "⚙️ 規則匹配: MinIO ({instance}) 無法連線。影響靜態資源和備份儲存。"
+      suggested_action: RESTART_DEPLOYMENT
+      kubectl_command: "ssh {host} 'docker restart minio'"
+      estimated_downtime: "~1 min"
+      risk: critical
+      responsibility: INFRA
+      responsibility_reasoning: "MinIO 屬物件儲存基礎設施"
+      secondary_teams: []
+      optimization:
+        - type: DISK_CHECK
+          description: "確認磁碟空間充足"
+          command: "ssh {host} 'df -h /data/minio'"
+      reasoning: "[規則匹配] MinIO 下線需先確認磁碟空間，再重啟服務。"
+
+  - id: minio_disk_high
+    priority: 96
+    description: MinIO 磁碟使用率過高
+    match:
+      alertname:
+        - MinioDiskUsageHigh
+        - MinioDiskUsageCritical
+      message:
+        - disk usage
+        - disk full
+        - storage
+    response:
+      action_title: "清理 MinIO 過期資料 on {host}"
+      description: "⚙️ 規則匹配: MinIO 磁碟使用率過高，需清理舊資料或擴展儲存空間。"
+      suggested_action: RESTART_DEPLOYMENT
+      kubectl_command: "ssh {host} 'df -h /data/minio && du -sh /data/minio/* | sort -rh | head -10'"
+      estimated_downtime: "0"
+      risk: critical
+      responsibility: INFRA
+      responsibility_reasoning: "磁碟空間管理屬基礎設施團隊責任"
+      secondary_teams: []
+      optimization:
+        - type: CLEANUP
+          description: "清理 MinIO 舊備份和 lifecycle policy"
+          command: "mc admin lifecycle add local --expiry-days 30"
+      reasoning: "[規則匹配] 磁碟滿會導致寫入失敗，需立即清理最大的目錄。"
+
+  - id: harbor_down
+    priority: 97
+    description: Harbor Registry 下線
+    match:
+      alertname:
+        - HarborDown
+      message:
+        - harbor
+        - registry
+        - docker registry
+    response:
+      action_title: "重啟 Harbor Registry on {host}"
+      description: "⚙️ 規則匹配: Harbor ({instance}) 無法連線。影響 CD 部署流程。"
+      suggested_action: RESTART_DEPLOYMENT
+      kubectl_command: "ssh {host} 'cd /data/harbor && docker-compose up -d'"
+      estimated_downtime: "~2 min"
+      risk: critical
+      responsibility: INFRA
+      responsibility_reasoning: "Harbor 是 CD 部署的核心依賴，屬基礎設施團隊責任"
+      secondary_teams: []
+      optimization:
+        - type: HEALTH_CHECK
+          description: "確認 Harbor 各組件狀態"
+          command: "ssh {host} 'cd /data/harbor && docker-compose ps'"
+      reasoning: "[規則匹配] Harbor 下線會阻塞所有 CD 部署，需立即重啟。"
+
+  # ── K8s 叢集層 ──────────────────────────────────────────────
+
+  - id: k3s_node_down
+    priority: 100
+    description: K3s 節點下線
+    match:
+      alertname:
+        - K3sNodeDown
+        - K3sVIPDown
+      message:
+        - node down
+        - node not ready
+        - k3s
+    response:
+      action_title: "確認 K3s 節點 {target} 狀態"
+      description: "⚙️ 規則匹配: K3s 節點下線，影響叢集可用性和 Pod 調度。"
+      suggested_action: RESTART_DEPLOYMENT
+      kubectl_command: "kubectl get nodes -o wide && kubectl describe node {target}"
+      estimated_downtime: "依節點恢復時間"
+      risk: critical
+      responsibility: INFRA
+      responsibility_reasoning: "K3s 叢集節點管理屬基礎設施團隊責任"
+      secondary_teams: []
+      optimization:
+        - type: NODE_DRAIN
+          description: "先 drain 節點確保 Pod 安全遷移"
+          command: "kubectl drain {target} --ignore-daemonsets --delete-emptydir-data"
+      reasoning: "[規則匹配] 節點下線需先確認主機可達性，必要時手動遷移 workload。"
+
+  - id: awoooi_api_down
+    priority: 105
+    description: AWOOOI API 服務下線
+    match:
+      alertname:
+        - AWOOOIApiDown
+        - OpenClawDown
+      message:
+        - awoooi api
+        - openclaw
+        - api down
+    response:
+      action_title: "重啟 AWOOOI API deployment"
+      description: "⚙️ 規則匹配: AWOOOI API 無法連線。影響所有告警處理和 AI 決策流程。"
+      suggested_action: RESTART_DEPLOYMENT
+      kubectl_command: "kubectl rollout restart deployment/awoooi-api -n awoooi"
+      estimated_downtime: "~1 min"
+      risk: critical
+      responsibility: BE
+      responsibility_reasoning: "AWOOOI API 是核心服務，屬後端團隊直接責任"
+      secondary_teams: [INFRA]
+      optimization:
+        - type: HEALTH_CHECK
+          description: "確認 API Pod 狀態和最近 log"
+          command: "kubectl get pods -n awoooi && kubectl logs -n awoooi deployment/awoooi-api --tail=50"
+      reasoning: "[規則匹配] AWOOOI API 下線需立即重啟，同時查 Pod log 確認根因。"
+
+  # ── 告警鏈路監控 ────────────────────────────────────────────
+
+  - id: alert_chain_broken
+    priority: 110
+    description: 告警鏈路中斷
+    match:
+      alertname:
+        - AlertChainBroken_Alertmanager
+        - AlertChainBroken_Sentry
+        - AlertChainBroken_SignOz
+        - AlertChainUnhealthy
+        - NoAlertsReceived2Hours
+      message:
+        - alert chain
+        - alertmanager
+        - no alerts
+    response:
+      action_title: "診斷告警鏈路中斷"
+      description: "⚙️ 規則匹配: 告警鏈路異常，可能導致真實告警無法送達 Telegram。"
+      suggested_action: RESTART_DEPLOYMENT
+      kubectl_command: "kubectl get pods -n monitoring && curl -s http://192.168.0.120:9093/api/v1/status | jq '.data.uptime'"
+      estimated_downtime: "監控盲區持續中"
+      risk: critical
+      responsibility: INFRA
+      responsibility_reasoning: "告警鏈路屬基礎設施監控體系，需立即修復確保可觀測性"
+      secondary_teams: [BE]
+      optimization:
+        - type: E2E_TEST
+          description: "發送測試告警驗證整條鏈路"
+          command: "curl -X POST http://192.168.0.125:32334/api/v1/test-alert -H 'Content-Type: application/json' -d '{\"test\": true}'"
+      reasoning: "[規則匹配] 告警鏈路中斷等同監控失明，最高優先修復。"
+
+  # ── GPU / AI 基礎設施 ────────────────────────────────────────
+
+  - id: nvidia_circuit_breaker
+    priority: 115
+    description: NVIDIA/Nemotron 熔斷器開啟
+    match:
+      alertname:
+        - NvidiaCircuitBreakerOpen
+        - NvidiaToolCallingHighErrorRate
+        - NvidiaToolCallingHighLatency
+      message:
+        - circuit breaker
+        - nvidia
+        - nemotron
+        - tool calling
+    response:
+      action_title: "確認 NVIDIA API 熔斷狀態"
+      description: "⚙️ 規則匹配: NVIDIA/Nemotron 熔斷器開啟或錯誤率過高，AI Router 已自動降級。"
+      suggested_action: RESTART_DEPLOYMENT
+      kubectl_command: "curl -s http://192.168.0.125:32334/api/v1/ai-router/status | jq '.providers'"
+      estimated_downtime: "0 (已自動 fallback)"
+      risk: medium
+      responsibility: BE
+      responsibility_reasoning: "AI Provider 熔斷管理屬後端 AI Router 責任範圍"
+      secondary_teams: []
+      optimization:
+        - type: CIRCUIT_BREAKER_RESET
+          description: "等待熔斷器自動恢復 (half-open 狀態)"
+          command: "curl -s http://192.168.0.125:32334/api/v1/ai-router/reset -X POST"
+      reasoning: "[規則匹配] AI Router 已自動降級至備援 Provider，監控熔斷器恢復狀態即可。"
+
  # ── 通用兜底 ────────────────────────────────────────────────

  - id: generic_fallback