# =============================================================================
# WOOO TECH - Momo Pro System
# Prometheus Alerting Rules
#
# PrometheusRule (Prometheus Operator CRD) defining the alerts for the "momo"
# namespace workloads plus a small set of cluster-wide node alerts.
# The `release: prometheus` label is what lets the Prometheus instance's
# ruleSelector pick this object up -- NOTE(review): confirm it matches the
# selector configured on your Prometheus resource.
# =============================================================================
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: momo-alerts
  namespace: monitoring
  labels:
    release: prometheus
spec:
  groups:
    # -------------------------------------------------------------------------
    # Pod health alerts
    # -------------------------------------------------------------------------
    - name: momo-pod-alerts
      rules:
        # A container was OOM-killed.
        # The last_terminated_reason series stays at 1 until the container
        # terminates again for a different reason, so on its own it would keep
        # this critical alert firing indefinitely. Requiring a restart within
        # the last 10 minutes makes the alert fire on new OOM kills only and
        # resolve afterwards.
        - alert: PodOOMKilled
          expr: |
            increase(kube_pod_container_status_restarts_total{namespace="momo"}[10m]) > 0
            and on(namespace, pod, container)
            kube_pod_container_status_last_terminated_reason{reason="OOMKilled", namespace="momo"} == 1
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Pod OOMKilled: {{ $labels.pod }}"
            description: "Pod {{ $labels.pod }} 因記憶體不足被終止 (OOMKilled)"

        # Too many restarts within one hour. The `and` clause restricts the
        # alert to pods that are currently in the Running phase.
        - alert: PodRestartTooMany
          expr: |
            increase(kube_pod_container_status_restarts_total{namespace="momo"}[1h]) > 3
            and on(pod, namespace)
            kube_pod_status_phase{namespace="momo", phase="Running"} == 1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod 重啟頻繁: {{ $labels.pod }}"
            description: "Pod {{ $labels.pod }} 在過去 1 小時內重啟超過 3 次"

        # Pod has not been Ready for 5 minutes.
        # Pods in the Succeeded phase (finished Jobs/CronJobs) are never Ready
        # again, so they are excluded to avoid a permanent critical alert.
        - alert: PodNotReady
          expr: |
            kube_pod_status_ready{condition="true", namespace="momo"} == 0
            unless on(namespace, pod)
            kube_pod_status_phase{namespace="momo", phase="Succeeded"} == 1
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Pod 未就緒: {{ $labels.pod }}"
            description: "Pod {{ $labels.pod }} 超過 5 分鐘未就緒"

        # Pod stuck in the Pending phase for 10 minutes.
        - alert: PodPending
          expr: |
            kube_pod_status_phase{phase="Pending", namespace="momo"} == 1
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Pod 處於 Pending: {{ $labels.pod }}"
            description: "Pod {{ $labels.pod }} 超過 10 分鐘無法啟動"

    # -------------------------------------------------------------------------
    # Resource usage alerts
    # -------------------------------------------------------------------------
    - name: momo-resource-alerts
      rules:
        # Working-set memory above 80% of the container memory limit.
        # `> 0` on the denominator drops containers without a limit (reported
        # as 0), which would otherwise divide to +Inf and always fire.
        # `container!=""` drops the pod-level cgroup series so each container
        # alerts once.
        - alert: HighMemoryUsage
          expr: |
            (
              container_memory_working_set_bytes{namespace="momo", container!=""}
              /
              (container_spec_memory_limit_bytes{namespace="momo", container!=""} > 0)
            ) > 0.8
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "記憶體使用率過高: {{ $labels.pod }}"
            description: "Pod {{ $labels.pod }} 記憶體使用率超過 80%,當前值: {{ $value | humanizePercentage }}"

        # Working-set memory above 95% of the limit: OOM kill is imminent.
        # Same limit guard and container filter as HighMemoryUsage.
        - alert: MemoryNearLimit
          expr: |
            (
              container_memory_working_set_bytes{namespace="momo", container!=""}
              /
              (container_spec_memory_limit_bytes{namespace="momo", container!=""} > 0)
            ) > 0.95
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "記憶體即將耗盡: {{ $labels.pod }}"
            description: "Pod {{ $labels.pod }} 記憶體使用率超過 95%,即將 OOM!"

        # CPU usage above 80% of the container CPU limit.
        # The limit in cores is quota / period; the period is read from
        # container_spec_cpu_period instead of assuming the 100000us default.
        - alert: HighCPUUsage
          expr: |
            (
              rate(container_cpu_usage_seconds_total{namespace="momo", container!=""}[5m])
              /
              (
                container_spec_cpu_quota{namespace="momo", container!=""}
                /
                container_spec_cpu_period{namespace="momo", container!=""}
              )
            ) > 0.8
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "CPU 使用率過高: {{ $labels.pod }}"
            description: "Pod {{ $labels.pod }} CPU 使用率超過 80%"

    # -------------------------------------------------------------------------
    # PostgreSQL alerts
    # -------------------------------------------------------------------------
    - name: momo-postgres-alerts
      rules:
        # PostgreSQL unavailable: fires when no momo-postgres* pod in the
        # namespace reports Ready (either not ready or missing entirely).
        - alert: PostgresDown
          expr: |
            absent(kube_pod_status_ready{pod=~"momo-postgres.*", condition="true", namespace="momo"} == 1)
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "PostgreSQL 服務異常"
            description: "momo-postgres Pod 未就緒或不存在"

        # PVC usage above 85% of its capacity.
        - alert: PVCSpaceLow
          expr: |
            (
              kubelet_volume_stats_used_bytes{namespace="momo"}
              /
              kubelet_volume_stats_capacity_bytes{namespace="momo"}
            ) > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PVC 空間不足: {{ $labels.persistentvolumeclaim }}"
            description: "PVC {{ $labels.persistentvolumeclaim }} 使用率超過 85%"

    # -------------------------------------------------------------------------
    # Application health alerts
    # -------------------------------------------------------------------------
    - name: momo-app-alerts
      rules:
        # HTTP 5xx responses exceed 5% of all requests over 5 minutes.
        # NOTE(review): depending on honorLabels / relabeling of the ingress
        # controller scrape, the Ingress namespace may be exposed as
        # `exported_namespace` rather than `namespace` -- confirm this selector
        # actually returns series.
        - alert: HighHTTP5xxRate
          expr: |
            sum(rate(nginx_ingress_controller_requests{status=~"5.*", namespace="momo"}[5m]))
            /
            sum(rate(nginx_ingress_controller_requests{namespace="momo"}[5m]))
            > 0.05
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "HTTP 5xx 錯誤率過高"
            description: "momo 應用 5xx 錯誤率超過 5%"

        # Application down: the momo-app Deployment has zero available
        # replicas (detected through Kubernetes state, not an HTTP probe).
        - alert: MomoAppDown
          expr: |
            kube_deployment_status_replicas_available{deployment="momo-app", namespace="momo"} == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "MOMO 應用服務異常"
            description: "momo-app 部署沒有可用的副本,服務完全中斷"

        # Scheduler down: the momo-scheduler Deployment has zero available
        # replicas.
        - alert: MomoSchedulerDown
          expr: |
            kube_deployment_status_replicas_available{deployment="momo-scheduler", namespace="momo"} == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "MOMO Scheduler 服務異常"
            description: "momo-scheduler 部署沒有可用的副本,排程任務已停止"

    # -------------------------------------------------------------------------
    # Node alerts (cluster-wide, not limited to the momo namespace)
    # -------------------------------------------------------------------------
    - name: node-alerts
      rules:
        # Node available memory below 10% of total.
        - alert: NodeMemoryLow
          expr: |
            (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "節點記憶體不足: {{ $labels.instance }}"
            description: "節點 {{ $labels.instance }} 可用記憶體低於 10%"

        # Node root filesystem has less than 15% space available.
        - alert: NodeDiskLow
          expr: |
            (
              node_filesystem_avail_bytes{mountpoint="/"}
              /
              node_filesystem_size_bytes{mountpoint="/"}
            ) < 0.15
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "節點磁碟空間不足: {{ $labels.instance }}"
            description: "節點 {{ $labels.instance }} 根目錄可用空間低於 15%"

        # Node has been NotReady for 5 minutes.
        - alert: NodeNotReady
          expr: |
            kube_node_status_condition{condition="Ready", status="true"} == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "節點不可用: {{ $labels.node }}"
            description: "節點 {{ $labels.node }} 超過 5 分鐘處於 NotReady 狀態"