Files
ewoooc/k8s/monitoring/complete-alerting-rules.yaml
ogt 1b4f3a7bbe
Some checks failed
CD Pipeline / deploy (push) Failing after 59s
feat: EwoooC 初始化 — 完整專案推版至 Gitea
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml)
- 部署模式: rsync Python 檔案至 188 → docker restart (volume mount)
- Dockerfile/requirements 變動時自動重建 Docker image
- 部署通知: Telegram (開始/成功/失敗)
- 健康檢查: https://mo.wooo.work/health (最多 5 次重試)
- 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-19 01:21:13 +08:00

227 lines
8.2 KiB
YAML

---
# PrometheusRule for the "momo" workloads, installed in the "monitoring"
# namespace. Rules are split into four groups (memory, PostgreSQL, application
# health, infrastructure). Every rule carries:
#   - labels.severity         : "warning" or "critical"
#   - labels.auto_repair      : name of the repair handler for this alert
#                               ("none" when no automatic repair is intended)
#   - annotations.repair_action : human-readable description of that repair
# NOTE(review): how auto_repair is routed to the *.sh handlers named in the
# section banners is not visible in this file - confirm against the
# Alertmanager / webhook configuration.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: momo-complete-alerts
  namespace: monitoring
  labels:
    release: prometheus
    app: kube-prometheus-stack
spec:
  groups:
    # ============================================
    # 1. OOM / memory alerts (trigger oom-handler.sh)
    # ============================================
    - name: memory-alerts
      rules:
        # Container was OOM-killed.
        # The last_terminated_reason gauge keeps reporting "OOMKilled" until
        # the container terminates again for a different reason, so on its own
        # it would keep this alert (and its auto-repair) firing indefinitely.
        # It is therefore gated on a restart having happened in the last 10m.
        # NOTE(review): assumes an OOM kill is followed by a container restart
        # (restartPolicy Always/OnFailure) - confirm for all momo workloads.
        - alert: PodOOMKilled
          expr: |
            (
              kube_pod_container_status_last_terminated_reason{reason="OOMKilled", namespace="momo"} == 1
            )
            and on (namespace, pod, container)
            (
              increase(kube_pod_container_status_restarts_total{namespace="momo"}[10m]) > 0
            )
          for: 0m
          labels:
            severity: critical
            auto_repair: "oom-handler"
          annotations:
            summary: "Pod {{ $labels.pod }} OOM Killed"
            description: "容器 {{ $labels.container }} 因記憶體不足被終止,將自動增加記憶體限制"
            repair_action: "自動增加記憶體限制 +50%"

        # High memory usage (early warning, no automatic repair).
        # The "> 0" filter drops containers without a memory limit: their
        # limit is exported as 0, which would turn the ratio into +Inf and
        # make the alert fire permanently.
        - alert: HighMemoryUsage
          expr: |
            (
              container_memory_working_set_bytes{namespace="momo", container!=""}
              /
              (container_spec_memory_limit_bytes{namespace="momo", container!=""} > 0)
            ) > 0.85
          for: 5m
          labels:
            severity: warning
            auto_repair: "none"
          annotations:
            summary: "Pod {{ $labels.pod }} 記憶體使用率 > 85%"
            description: "容器 {{ $labels.container }} 記憶體使用率偏高,可能即將 OOM"

        # Memory close to the limit (OOM imminent).
        # Same zero-limit guard as HighMemoryUsage.
        - alert: MemoryNearLimit
          expr: |
            (
              container_memory_working_set_bytes{namespace="momo", container!=""}
              /
              (container_spec_memory_limit_bytes{namespace="momo", container!=""} > 0)
            ) > 0.95
          for: 2m
          labels:
            severity: critical
            auto_repair: "oom-handler"
          annotations:
            summary: "Pod {{ $labels.pod }} 記憶體即將耗盡"
            description: "記憶體使用率 > 95%,將自動增加記憶體限制"
            repair_action: "自動增加記憶體限制 +50%"

    # ============================================
    # 2. PostgreSQL alerts (trigger postgres-repair.sh)
    # ============================================
    - name: postgres-alerts
      rules:
        # PostgreSQL unreachable (exporter reports pg_up == 0).
        - alert: PostgresDown
          expr: pg_up{namespace="momo"} == 0
          for: 1m
          labels:
            severity: critical
            auto_repair: "postgres-repair"
          annotations:
            summary: "PostgreSQL 連線失敗"
            description: "momo namespace 的 PostgreSQL 無法連線,將自動重啟 Pod"
            repair_action: "自動重啟 PostgreSQL Pod"

        # Connection usage above 80% of max_connections.
        # Both sides are aggregated per instance before dividing: the activity
        # count is usually exported with extra labels (e.g. per database and
        # state) that max_connections does not have, so a plain "a / b" would
        # find no matching series and the alert could never fire.
        # NOTE(review): label layout depends on the exporter version - confirm.
        - alert: PostgresHighConnections
          expr: |
            (
              sum by (instance) (pg_stat_activity_count{namespace="momo"})
              / on (instance)
              max by (instance) (pg_settings_max_connections{namespace="momo"})
            ) > 0.8
          for: 5m
          labels:
            severity: warning
            auto_repair: "postgres-repair"
          annotations:
            summary: "PostgreSQL 連線數 > 80%"
            description: "連線數過高,將自動終止閒置連線"
            repair_action: "終止閒置超過 30 分鐘的連線"

        # Deadlocks detected (deadlock counter increasing over 5m).
        - alert: PostgresDeadlocks
          expr: rate(pg_stat_database_deadlocks{namespace="momo"}[5m]) > 0
          for: 5m
          labels:
            severity: warning
            auto_repair: "postgres-repair"
          annotations:
            summary: "PostgreSQL 檢測到死鎖"
            description: "資料庫發生死鎖,將自動終止阻塞查詢"
            repair_action: "終止長時間阻塞的查詢"

        # Long-running queries: longest transaction exceeds 300 seconds.
        - alert: PostgresSlowQueries
          expr: |
            pg_stat_activity_max_tx_duration{namespace="momo"} > 300
          for: 5m
          labels:
            severity: warning
            auto_repair: "postgres-repair"
          annotations:
            summary: "PostgreSQL 存在超過 5 分鐘的長查詢"
            description: "長時間執行的查詢可能影響效能"
            repair_action: "自動 VACUUM ANALYZE"

    # ============================================
    # 3. Application health alerts (trigger auto-rollback.sh)
    # ============================================
    - name: app-health-alerts
      rules:
        # MOMO App (UAT, mo.wooo.work) not responding, or probe series missing.
        # The host is matched exactly with escaped dots; a loose
        # ".*mo.wooo.work.*" pattern would also match "momo.wooo.work" and
        # make this alert react to the GCP endpoint as well.
        # NOTE(review): the pattern accepts an optional scheme, port and path;
        # confirm it matches the instance label your blackbox job produces.
        - alert: MomoAppDown
          expr: |
            probe_success{job="blackbox", instance=~"(https?://)?mo\\.wooo\\.work(:[0-9]+)?(/.*)?"} == 0
            or absent(probe_success{job="blackbox", instance=~"(https?://)?mo\\.wooo\\.work(:[0-9]+)?(/.*)?"})
          for: 2m
          labels:
            severity: critical
            auto_repair: "auto-rollback"
          annotations:
            summary: "MOMO App UAT 無回應"
            description: "https://mo.wooo.work 健康檢查失敗超過 2 分鐘"
            repair_action: "連續 5 次失敗將自動回滾"

        # MOMO App (GCP, momo.wooo.work) not responding, or probe series
        # missing. Host matched exactly, same pattern shape as MomoAppDown.
        - alert: MomoAppGCPDown
          expr: |
            probe_success{job="blackbox", instance=~"(https?://)?momo\\.wooo\\.work(:[0-9]+)?(/.*)?"} == 0
            or absent(probe_success{job="blackbox", instance=~"(https?://)?momo\\.wooo\\.work(:[0-9]+)?(/.*)?"})
          for: 2m
          labels:
            severity: critical
            auto_repair: "domain-health-monitor"
          annotations:
            summary: "MOMO App GCP 無回應"
            description: "https://momo.wooo.work 健康檢查失敗"
            repair_action: "自動 kubectl rollout restart"

        # HTTP 5xx share of all requests above 5% over 5m.
        - alert: HighHTTP5xxRate
          expr: |
            (
              sum(rate(http_requests_total{status=~"5..", namespace="momo"}[5m]))
              / sum(rate(http_requests_total{namespace="momo"}[5m]))
            ) > 0.05
          for: 5m
          labels:
            severity: warning
            auto_repair: "auto-rollback"
          annotations:
            summary: "HTTP 5xx 錯誤率 > 5%"
            description: "可能存在程式碼問題,將監控是否需要回滾"
            repair_action: "連續錯誤將觸發自動回滾"

        # Pod restarting too often: more than 5 restarts within 1h.
        - alert: PodRestartTooMany
          expr: |
            increase(kube_pod_container_status_restarts_total{namespace="momo"}[1h]) > 5
          for: 5m
          labels:
            severity: warning
            auto_repair: "auto-rollback"
          annotations:
            summary: "Pod {{ $labels.pod }} 1 小時內重啟超過 5 次"
            description: "頻繁重啟可能表示程式碼有問題"
            repair_action: "將考慮自動回滾到上一版本"

    # ============================================
    # 4. Infrastructure alerts
    #    (auto_repair: disk-cleanup / domain-health-monitor)
    # ============================================
    - name: infrastructure-alerts
      rules:
        # Root filesystem has less than 15% space available.
        - alert: DiskSpaceLow
          expr: |
            (
              node_filesystem_avail_bytes{mountpoint="/"}
              / node_filesystem_size_bytes{mountpoint="/"}
            ) < 0.15
          for: 5m
          labels:
            severity: warning
            auto_repair: "disk-cleanup"
          annotations:
            summary: "磁碟剩餘空間 < 15%"
            description: "磁碟空間不足,將自動清理"
            repair_action: "自動清理 Docker、日誌"

        # Root filesystem has less than 5% space available.
        - alert: DiskSpaceCritical
          expr: |
            (
              node_filesystem_avail_bytes{mountpoint="/"}
              / node_filesystem_size_bytes{mountpoint="/"}
            ) < 0.05
          for: 2m
          labels:
            severity: critical
            auto_repair: "disk-cleanup"
          annotations:
            summary: "磁碟剩餘空間 < 5%"
            description: "磁碟空間嚴重不足!"
            repair_action: "緊急自動清理"

        # Scheduler stopped: the momo-scheduler deployment has no available
        # replicas.
        - alert: MomoSchedulerDown
          expr: |
            kube_deployment_status_replicas_available{deployment="momo-scheduler", namespace="momo"} == 0
          for: 5m
          labels:
            severity: critical
            auto_repair: "domain-health-monitor"
          annotations:
            summary: "MOMO Scheduler 已停止"
            description: "排程器無可用副本"
            repair_action: "自動 kubectl rollout restart"