Files
ewoooc/k8s/monitoring/alerting-rules.yaml
ogt 1b4f3a7bbe
Some checks failed
CD Pipeline / deploy (push) Failing after 59s
feat: EwoooC 初始化 — 完整專案推版至 Gitea
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml)
- 部署模式: rsync Python 檔案至 188 → docker restart (volume mount)
- Dockerfile/requirements 變動時自動重建 Docker image
- 部署通知: Telegram (開始/成功/失敗)
- 健康檢查: https://mo.wooo.work/health (最多 5 次重試)
- 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-19 01:21:13 +08:00

196 lines
7.0 KiB
YAML
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# =============================================================================
# WOOO TECH - Momo Pro System
# Prometheus Alerting Rules
# =============================================================================
# PrometheusRule defining the alerts for the "momo" namespace workloads
# (pods, resources, PostgreSQL, application) plus cluster node alerts.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: momo-alerts
  namespace: monitoring
  labels:
    # NOTE(review): presumably matched by the Prometheus Operator's
    # ruleSelector — confirm against the Prometheus custom resource.
    release: prometheus
spec:
  groups:
    # Pod health alerts
    - name: momo-pod-alerts
      rules:
        # Pod container was OOMKilled.
        # The last-terminated-reason gauge keeps reporting OOMKilled long
        # after the event, so it is gated on a restart within the last
        # 10 minutes; otherwise the alert would never resolve.
        - alert: PodOOMKilled
          expr: |
            (kube_pod_container_status_last_terminated_reason{reason="OOMKilled", namespace="momo"} == 1)
            and on(namespace, pod, container)
            (increase(kube_pod_container_status_restarts_total{namespace="momo"}[10m]) > 0)
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Pod OOMKilled: {{ $labels.pod }}"
            description: "Pod {{ $labels.pod }} 因記憶體不足被終止 (OOMKilled)"
        # Too many pod restarts (only tracks pods that are currently Running)
        - alert: PodRestartTooMany
          expr: |
            increase(kube_pod_container_status_restarts_total{namespace="momo"}[1h]) > 3
            and on(pod, namespace)
            kube_pod_status_phase{namespace="momo", phase="Running"} == 1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod 重啟頻繁: {{ $labels.pod }}"
            description: "Pod {{ $labels.pod }} 在過去 1 小時內重啟超過 3 次"
        # Pod not ready for more than 5 minutes
        - alert: PodNotReady
          expr: |
            kube_pod_status_ready{condition="true", namespace="momo"} == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Pod 未就緒: {{ $labels.pod }}"
            description: "Pod {{ $labels.pod }} 超過 5 分鐘未就緒"
        # Pod stuck in Pending for more than 10 minutes
        - alert: PodPending
          expr: |
            kube_pod_status_phase{phase="Pending", namespace="momo"} == 1
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Pod 處於 Pending: {{ $labels.pod }}"
            description: "Pod {{ $labels.pod }} 超過 10 分鐘無法啟動"
    # Resource usage alerts
    - name: momo-resource-alerts
      rules:
        # High memory usage (working set above 80% of the limit).
        # The "> 0" filter drops containers without a memory limit: their
        # limit is reported as 0, which would make the ratio +Inf and fire
        # the alert spuriously.
        - alert: HighMemoryUsage
          expr: |
            (
              container_memory_working_set_bytes{namespace="momo"}
              / (container_spec_memory_limit_bytes{namespace="momo"} > 0)
            ) > 0.8
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "記憶體使用率過高: {{ $labels.pod }}"
            description: "Pod {{ $labels.pod }} 記憶體使用率超過 80%,當前值: {{ $value | humanizePercentage }}"
        # Memory about to be exhausted (working set above 95% of the limit).
        # Containers without a limit (limit == 0) are excluded, see above.
        - alert: MemoryNearLimit
          expr: |
            (
              container_memory_working_set_bytes{namespace="momo"}
              / (container_spec_memory_limit_bytes{namespace="momo"} > 0)
            ) > 0.95
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "記憶體即將耗盡: {{ $labels.pod }}"
            description: "Pod {{ $labels.pod }} 記憶體使用率超過 95%,即將 OOM"
        # High CPU usage (above 80% of the CPU limit).
        # "sum without (cpu)" removes the extra "cpu" label that the usage
        # counter carries but the spec metrics do not; without it the
        # one-to-one division matches no series. The limit in cores is
        # quota / period, using the reported period instead of assuming
        # the default of 100000.
        - alert: HighCPUUsage
          expr: |
            (
              sum without (cpu) (
                rate(container_cpu_usage_seconds_total{namespace="momo"}[5m])
              )
              / (
                container_spec_cpu_quota{namespace="momo"}
                / container_spec_cpu_period{namespace="momo"}
              )
            ) > 0.8
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "CPU 使用率過高: {{ $labels.pod }}"
            description: "Pod {{ $labels.pod }} CPU 使用率超過 80%"
    # PostgreSQL alerts
    - name: momo-postgres-alerts
      rules:
        # PostgreSQL down: no momo-postgres pod is reporting ready
        - alert: PostgresDown
          expr: |
            absent(kube_pod_status_ready{pod=~"momo-postgres.*", condition="true", namespace="momo"} == 1)
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "PostgreSQL 服務異常"
            description: "momo-postgres Pod 未就緒或不存在"
        # PVC running out of space (usage above 85%)
        - alert: PVCSpaceLow
          expr: |
            (kubelet_volume_stats_used_bytes{namespace="momo"} / kubelet_volume_stats_capacity_bytes{namespace="momo"}) > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PVC 空間不足: {{ $labels.persistentvolumeclaim }}"
            description: "PVC {{ $labels.persistentvolumeclaim }} 使用率超過 85%"
    # Application health alerts
    - name: momo-app-alerts
      rules:
        # HTTP 5xx error rate too high (above 5% of all requests)
        - alert: HighHTTP5xxRate
          expr: |
            sum(rate(nginx_ingress_controller_requests{status=~"5.*", namespace="momo"}[5m]))
            / sum(rate(nginx_ingress_controller_requests{namespace="momo"}[5m])) > 0.05
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "HTTP 5xx 錯誤率過高"
            description: "momo 應用 5xx 錯誤率超過 5%"
        # Service unavailable (detected via the Deployment's available replicas)
        - alert: MomoAppDown
          expr: |
            kube_deployment_status_replicas_available{deployment="momo-app", namespace="momo"} == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "MOMO 應用服務異常"
            description: "momo-app 部署沒有可用的副本,服務完全中斷"
        # Scheduler unavailable
        - alert: MomoSchedulerDown
          expr: |
            kube_deployment_status_replicas_available{deployment="momo-scheduler", namespace="momo"} == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "MOMO Scheduler 服務異常"
            description: "momo-scheduler 部署沒有可用的副本,排程任務已停止"
    # Node alerts
    - name: node-alerts
      rules:
        # Node low on memory (less than 10% available)
        - alert: NodeMemoryLow
          expr: |
            (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "節點記憶體不足: {{ $labels.instance }}"
            description: "節點 {{ $labels.instance }} 可用記憶體低於 10%"
        # Node low on disk space (root filesystem less than 15% available)
        - alert: NodeDiskLow
          expr: |
            (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "節點磁碟空間不足: {{ $labels.instance }}"
            description: "節點 {{ $labels.instance }} 根目錄可用空間低於 15%"
        # Node not ready for more than 5 minutes
        - alert: NodeNotReady
          expr: |
            kube_node_status_condition{condition="Ready", status="true"} == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "節點不可用: {{ $labels.node }}"
            description: "節點 {{ $labels.node }} 超過 5 分鐘處於 NotReady 狀態"