# =============================================================================
# WOOO TECH - Momo Pro System
# Prometheus Alert Rules
# Version: 1.0
# =============================================================================
#
# Alert severity levels:
#   - critical: serious problem that requires immediate action
#   - warning: needs attention but is not urgent
#   - info: informational notification
#
# =============================================================================
---
groups:
  # ===========================================================================
  # Host resource monitoring alerts
  # ===========================================================================
  - name: host_alerts
    rules:
      # -----------------------------------------------------------------------
      # CPU usage alerts
      # -----------------------------------------------------------------------
      # CPU usage is derived as 100 minus the per-instance average idle
      # percentage. rate() is used instead of irate(): irate() only looks at
      # the last two samples, so a brief dip would reset the `for` timer.
      - alert: HostHighCpuUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 50
        for: 5m
        labels:
          severity: warning
          category: cpu
        annotations:
          summary: "主機 CPU 使用率過高"
          description: "主機 {{ $labels.instance }} CPU 使用率超過 50% 持續 5 分鐘,當前值: {{ $value | printf \"%.1f\" }}%"
          value: "{{ $value | printf \"%.1f\" }}%"

      # Same expression as HostHighCpuUsage with an 80% threshold.
      - alert: HostCriticalCpuUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: critical
          category: cpu
        annotations:
          summary: "主機 CPU 使用率嚴重過高"
          description: "主機 {{ $labels.instance }} CPU 使用率超過 80%,當前值: {{ $value | printf \"%.1f\" }}%"
          value: "{{ $value | printf \"%.1f\" }}%"

      # -----------------------------------------------------------------------
      # Memory usage alerts
      # -----------------------------------------------------------------------
      # Used percentage = (1 - MemAvailable / MemTotal) * 100.
      - alert: HostHighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 50
        for: 5m
        labels:
          severity: warning
          category: memory
        annotations:
          summary: "主機記憶體使用率過高"
          description: "主機 {{ $labels.instance }} 記憶體使用率超過 50% 持續 5 分鐘,當前值: {{ $value | printf \"%.1f\" }}%"
          value: "{{ $value | printf \"%.1f\" }}%"

      - alert: HostCriticalMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: critical
          category: memory
        annotations:
          summary: "主機記憶體使用率嚴重過高"
          description: "主機 {{ $labels.instance }} 記憶體使用率超過 85%,當前值: {{ $value | printf \"%.1f\" }}%"
          value: "{{ $value | printf \"%.1f\" }}%"

      # -----------------------------------------------------------------------
      # Disk usage alerts
      # -----------------------------------------------------------------------
      # Used percentage per filesystem; tmpfs and overlay filesystems are
      # excluded by the fstype matcher.
      - alert: HostHighDiskUsage
        expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 80
        for: 5m
        labels:
          severity: warning
          category: disk
        annotations:
          summary: "主機磁碟使用率過高"
          description: "主機 {{ $labels.instance }} 磁碟 {{ $labels.mountpoint }} 使用率超過 80%,當前值: {{ $value | printf \"%.1f\" }}%"
          value: "{{ $value | printf \"%.1f\" }}%"

      - alert: HostCriticalDiskUsage
        expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 90
        for: 5m
        labels:
          severity: critical
          category: disk
        annotations:
          summary: "主機磁碟空間嚴重不足"
          description: "主機 {{ $labels.instance }} 磁碟 {{ $labels.mountpoint }} 使用率超過 90%,當前值: {{ $value | printf \"%.1f\" }}%"
          value: "{{ $value | printf \"%.1f\" }}%"

      # -----------------------------------------------------------------------
      # System load alerts
      # -----------------------------------------------------------------------
      # 5-minute load average divided by the number of idle-mode CPU series
      # (counted after dropping the cpu and mode labels), i.e. load per CPU.
      - alert: HostHighLoadAverage
        expr: node_load5 / count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}) > 0.8
        for: 5m
        labels:
          severity: warning
          category: load
        annotations:
          summary: "主機系統負載過高"
          description: "主機 {{ $labels.instance }} 5分鐘負載平均值過高,當前值: {{ $value | printf \"%.2f\" }}"
          value: "{{ $value | printf \"%.2f\" }}"

  # ===========================================================================
  # Container monitoring alerts
  # ===========================================================================
  - name: container_alerts
    rules:
      # -----------------------------------------------------------------------
      # Container CPU usage
      # -----------------------------------------------------------------------
      # Per-second rate of consumed CPU seconds scaled by 100; only series
      # with a non-empty `name` label are considered.
      - alert: ContainerHighCpuUsage
        expr: (rate(container_cpu_usage_seconds_total{name!=""}[5m]) * 100) > 50
        for: 5m
        labels:
          severity: warning
          category: container_cpu
        annotations:
          summary: "容器 CPU 使用率過高"
          description: "容器 {{ $labels.name }} CPU 使用率超過 50%,當前值: {{ $value | printf \"%.1f\" }}%"
          container: "{{ $labels.name }}"
          value: "{{ $value | printf \"%.1f\" }}%"

      # -----------------------------------------------------------------------
      # Container memory usage
      # -----------------------------------------------------------------------
      # Usage as a percentage of the configured memory limit. The `> 0`
      # filter drops containers whose limit is reported as 0 (no limit set);
      # without it the division yields +Inf and the alert fires permanently.
      - alert: ContainerHighMemoryUsage
        expr: >-
          (container_memory_usage_bytes{name!=""}
          / (container_spec_memory_limit_bytes{name!=""} > 0)) * 100 > 50
        for: 5m
        labels:
          severity: warning
          category: container_memory
        annotations:
          summary: "容器記憶體使用率過高"
          description: "容器 {{ $labels.name }} 記憶體使用率超過 50%,當前值: {{ $value | printf \"%.1f\" }}%"
          container: "{{ $labels.name }}"
          value: "{{ $value | printf \"%.1f\" }}%"

  # ===========================================================================
  # Website health monitoring alerts
  # ===========================================================================
  - name: website_alerts
    rules:
      # -----------------------------------------------------------------------
      # Website unreachable
      # -----------------------------------------------------------------------
      # Fires when the probe from any job matching blackbox-http.* has been
      # failing for 1 minute.
      - alert: WebsiteDown
        expr: probe_success{job=~"blackbox-http.*"} == 0
        for: 1m
        labels:
          severity: critical
          category: website
        annotations:
          summary: "網站無法訪問"
          description: "網站 {{ $labels.instance }} 無法訪問,請立即檢查"

      # -----------------------------------------------------------------------
      # Website response time too long
      # -----------------------------------------------------------------------
      # probe_http_duration_seconds is reported per request phase (`phase`
      # label), so the phases are summed to compare the total response time
      # against the 5 second threshold instead of each phase on its own.
      - alert: WebsiteSlowResponse
        expr: >-
          sum without (phase)
          (probe_http_duration_seconds{job=~"blackbox-http.*"}) > 5
        for: 2m
        labels:
          severity: warning
          category: website
        annotations:
          summary: "網站響應緩慢"
          description: "網站 {{ $labels.instance }} 響應時間超過 5 秒,當前值: {{ $value | printf \"%.2f\" }} 秒"
          value: "{{ $value | printf \"%.2f\" }}s"

  # ===========================================================================
  # Network connectivity alerts
  # ===========================================================================
  - name: network_alerts
    rules:
      # -----------------------------------------------------------------------
      # Host cannot be pinged
      # -----------------------------------------------------------------------
      - alert: HostUnreachable
        expr: probe_success{job="blackbox-icmp"} == 0
        for: 1m
        labels:
          severity: critical
          category: network
        annotations:
          summary: "主機無法連通"
          description: "主機 {{ $labels.instance }} 無法 ping 通,可能已離線"

      # -----------------------------------------------------------------------
      # TCP port cannot be connected
      # -----------------------------------------------------------------------
      - alert: ServicePortDown
        expr: probe_success{job=~"blackbox-tcp.*"} == 0
        for: 1m
        labels:
          severity: critical
          category: network
        annotations:
          summary: "服務端口無法連接"
          description: "服務 {{ $labels.instance }} 無法連接,請檢查服務狀態"

  # ===========================================================================
  # PostgreSQL database monitoring alerts
  # ===========================================================================
  - name: postgres_alerts
    rules:
      # -----------------------------------------------------------------------
      # PostgreSQL unreachable
      # -----------------------------------------------------------------------
      # Fires when pg_up has been 0 for 1 minute.
      - alert: PostgresDown
        expr: pg_up == 0
        for: 1m
        labels:
          severity: critical
          category: database
        annotations:
          summary: "PostgreSQL 無法連接"
          description: "PostgreSQL 資料庫無法連接,請立即檢查"