---
# PrometheusRule with the alerting rules for the "momo" workloads.
#
# Every rule carries an `auto_repair` label. Per the section headers below,
# its value names the repair script that the alert is meant to trigger
# ("none" marks a rule that is notification-only). The `repair_action`
# annotation is the human-readable description of that repair.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: momo-complete-alerts
  namespace: monitoring
  labels:
    release: prometheus
    app: kube-prometheus-stack
spec:
  groups:
    # ============================================
    # 1. OOM / memory alerts (trigger oom-handler.sh)
    # ============================================
    - name: memory-alerts
      rules:
        # A container's last termination reason was OOMKilled.
        # NOTE(review): this gauge keeps the value 1 until the container
        # terminates for a different reason, so the alert can stay active
        # long after the OOM event - confirm the handler tolerates repeats.
        - alert: PodOOMKilled
          expr: |
            kube_pod_container_status_last_terminated_reason{reason="OOMKilled", namespace="momo"} == 1
          for: 0m
          labels:
            severity: critical
            auto_repair: "oom-handler"
          annotations:
            summary: "Pod {{ $labels.pod }} OOM Killed"
            description: "容器 {{ $labels.container }} 因記憶體不足被終止,將自動增加記憶體限制"
            repair_action: "自動增加記憶體限制 +50%"

        # Memory usage is high (early warning).
        # The `> 0` filter drops containers without a memory limit: their
        # limit is reported as 0, which would make the ratio +Inf and keep
        # the alert firing forever.
        - alert: HighMemoryUsage
          expr: |
            (
              container_memory_working_set_bytes{namespace="momo", container!=""}
              /
              (container_spec_memory_limit_bytes{namespace="momo", container!=""} > 0)
            ) > 0.85
          for: 5m
          labels:
            severity: warning
            auto_repair: "none"
          annotations:
            summary: "Pod {{ $labels.pod }} 記憶體使用率 > 85%"
            description: "容器 {{ $labels.container }} 記憶體使用率偏高,可能即將 OOM"

        # Memory usage is close to the limit (OOM imminent).
        # Same `> 0` guard as above so that containers without a limit do
        # not trigger the automatic repair.
        - alert: MemoryNearLimit
          expr: |
            (
              container_memory_working_set_bytes{namespace="momo", container!=""}
              /
              (container_spec_memory_limit_bytes{namespace="momo", container!=""} > 0)
            ) > 0.95
          for: 2m
          labels:
            severity: critical
            auto_repair: "oom-handler"
          annotations:
            summary: "Pod {{ $labels.pod }} 記憶體即將耗盡"
            description: "記憶體使用率 > 95%,將自動增加記憶體限制"
            repair_action: "自動增加記憶體限制 +50%"

    # ============================================
    # 2. PostgreSQL alerts (trigger postgres-repair.sh)
    # ============================================
    - name: postgres-alerts
      rules:
        # PostgreSQL is unreachable.
        - alert: PostgresDown
          expr: pg_up{namespace="momo"} == 0
          for: 1m
          labels:
            severity: critical
            auto_repair: "postgres-repair"
          annotations:
            summary: "PostgreSQL 連線失敗"
            description: "momo namespace 的 PostgreSQL 無法連線,將自動重啟 Pod"
            repair_action: "自動重啟 PostgreSQL Pod"

        # Connection count is above 80% of max_connections.
        # Both sides are aggregated to (namespace, instance) first: the
        # activity metric is split by extra labels that the settings metric
        # does not have, so a direct division would match no series at all.
        - alert: PostgresHighConnections
          expr: |
            (
              sum by (namespace, instance) (pg_stat_activity_count{namespace="momo"})
              /
              max by (namespace, instance) (pg_settings_max_connections{namespace="momo"})
            ) > 0.8
          for: 5m
          labels:
            severity: warning
            auto_repair: "postgres-repair"
          annotations:
            summary: "PostgreSQL 連線數 > 80%"
            description: "連線數過高,將自動終止閒置連線"
            repair_action: "終止閒置超過 30 分鐘的連線"

        # Deadlocks detected.
        - alert: PostgresDeadlocks
          expr: rate(pg_stat_database_deadlocks{namespace="momo"}[5m]) > 0
          for: 5m
          labels:
            severity: warning
            auto_repair: "postgres-repair"
          annotations:
            summary: "PostgreSQL 檢測到死鎖"
            description: "資料庫發生死鎖,將自動終止阻塞查詢"
            repair_action: "終止長時間阻塞的查詢"

        # Long-running transactions (threshold 300, i.e. the 5 minutes
        # named in the summary).
        - alert: PostgresSlowQueries
          expr: |
            pg_stat_activity_max_tx_duration{namespace="momo"} > 300
          for: 5m
          labels:
            severity: warning
            auto_repair: "postgres-repair"
          annotations:
            summary: "PostgreSQL 存在超過 5 分鐘的長查詢"
            description: "長時間執行的查詢可能影響效能"
            repair_action: "自動 VACUUM ANALYZE"

    # ============================================
    # 3. Application health alerts (trigger auto-rollback.sh)
    # ============================================
    - name: app-health-alerts
      rules:
        # MOMO App (UAT, mo.wooo.work) is not responding.
        # The host is matched exactly, with escaped dots: the looser
        # pattern ".*mo.wooo.work.*" also matches "momo.wooo.work", which
        # made this alert react to the GCP target and kept absent() quiet
        # while only the UAT probe was missing.
        # NOTE(review): assumes the blackbox `instance` label is the probed
        # URL or host (optional scheme, port and path) - confirm.
        - alert: MomoAppDown
          expr: |
            probe_success{job="blackbox", instance=~"(https?://)?mo\\.wooo\\.work(:[0-9]+)?(/.*)?"} == 0
            or
            absent(probe_success{job="blackbox", instance=~"(https?://)?mo\\.wooo\\.work(:[0-9]+)?(/.*)?"})
          for: 2m
          labels:
            severity: critical
            auto_repair: "auto-rollback"
          annotations:
            summary: "MOMO App UAT 無回應"
            description: "https://mo.wooo.work 健康檢查失敗超過 2 分鐘"
            repair_action: "連續 5 次失敗將自動回滾"

        # MOMO App (GCP, momo.wooo.work) is not responding.
        - alert: MomoAppGCPDown
          expr: |
            probe_success{job="blackbox", instance=~".*momo.wooo.work.*"} == 0
            or
            absent(probe_success{job="blackbox", instance=~".*momo.wooo.work.*"})
          for: 2m
          labels:
            severity: critical
            auto_repair: "domain-health-monitor"
          annotations:
            summary: "MOMO App GCP 無回應"
            description: "https://momo.wooo.work 健康檢查失敗"
            repair_action: "自動 kubectl rollout restart"

        # Share of HTTP 5xx responses is above 5%.
        - alert: HighHTTP5xxRate
          expr: |
            (
              sum(rate(http_requests_total{status=~"5..", namespace="momo"}[5m]))
              /
              sum(rate(http_requests_total{namespace="momo"}[5m]))
            ) > 0.05
          for: 5m
          labels:
            severity: warning
            auto_repair: "auto-rollback"
          annotations:
            summary: "HTTP 5xx 錯誤率 > 5%"
            description: "可能存在程式碼問題,將監控是否需要回滾"
            repair_action: "連續錯誤將觸發自動回滾"

        # A container restarted more than 5 times within one hour.
        - alert: PodRestartTooMany
          expr: |
            increase(kube_pod_container_status_restarts_total{namespace="momo"}[1h]) > 5
          for: 5m
          labels:
            severity: warning
            auto_repair: "auto-rollback"
          annotations:
            summary: "Pod {{ $labels.pod }} 1 小時內重啟超過 5 次"
            description: "頻繁重啟可能表示程式碼有問題"
            repair_action: "將考慮自動回滾到上一版本"

    # ============================================
    # 4. Infrastructure alerts (trigger domain-health-monitor.sh)
    # ============================================
    - name: infrastructure-alerts
      rules:
        # Less than 15% free space on the root filesystem.
        - alert: DiskSpaceLow
          expr: |
            (
              node_filesystem_avail_bytes{mountpoint="/"}
              /
              node_filesystem_size_bytes{mountpoint="/"}
            ) < 0.15
          for: 5m
          labels:
            severity: warning
            auto_repair: "disk-cleanup"
          annotations:
            summary: "磁碟剩餘空間 < 15%"
            description: "磁碟空間不足,將自動清理"
            repair_action: "自動清理 Docker、日誌"

        # Less than 5% free space on the root filesystem.
        - alert: DiskSpaceCritical
          expr: |
            (
              node_filesystem_avail_bytes{mountpoint="/"}
              /
              node_filesystem_size_bytes{mountpoint="/"}
            ) < 0.05
          for: 2m
          labels:
            severity: critical
            auto_repair: "disk-cleanup"
          annotations:
            summary: "磁碟剩餘空間 < 5%"
            description: "磁碟空間嚴重不足!"
            repair_action: "緊急自動清理"

        # The momo-scheduler deployment has no available replicas.
        - alert: MomoSchedulerDown
          expr: |
            kube_deployment_status_replicas_available{deployment="momo-scheduler", namespace="momo"} == 0
          for: 5m
          labels:
            severity: critical
            auto_repair: "domain-health-monitor"
          annotations:
            summary: "MOMO Scheduler 已停止"
            description: "排程器無可用副本"
            repair_action: "自動 kubectl rollout restart"