Some checks failed
CD Pipeline / deploy (push) Failing after 59s
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml)
- 部署模式: rsync Python 檔案至 188 → docker restart (volume mount)
- Dockerfile/requirements 變動時自動重建 Docker image
- 部署通知: Telegram (開始/成功/失敗)
- 健康檢查: https://mo.wooo.work/health (最多 5 次重試)
- 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
227 lines
8.2 KiB
YAML
227 lines
8.2 KiB
YAML
---
# PrometheusRule holding the alert definitions for the "momo" namespace.
#
# Every rule carries an `auto_repair` label. Per the section headers below,
# its value names the repair script the alert is meant to trigger
# (oom-handler.sh, postgres-repair.sh, auto-rollback.sh,
# domain-health-monitor.sh); "none" marks a notify-only alert.
# NOTE(review): the routing from `auto_repair` to those scripts lives outside
# this file (presumably an Alertmanager webhook receiver) - confirm there.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: momo-complete-alerts
  namespace: monitoring
  labels:
    # NOTE(review): presumably these match the Prometheus operator's
    # ruleSelector so the rule gets loaded - verify against the Prometheus CR.
    release: prometheus
    app: kube-prometheus-stack
spec:
  groups:
    # ============================================
    # 1. OOM / memory alerts (trigger oom-handler.sh)
    # ============================================
    - name: memory-alerts
      rules:
        # OOM-killed event.
        # last_terminated_reason stays at 1 until the container next
        # terminates for a *different* reason, so on its own it would keep
        # this alert (and its "+50% memory" repair) firing indefinitely after
        # a single OOM. Requiring a restart within the last 10 minutes turns
        # it back into an event: the alert resolves once the container has
        # been stable for 10 minutes.
        - alert: PodOOMKilled
          expr: |
            (
              kube_pod_container_status_last_terminated_reason{reason="OOMKilled", namespace="momo"} == 1
            )
            and on (namespace, pod, container)
            (
              increase(kube_pod_container_status_restarts_total{namespace="momo"}[10m]) > 0
            )
          for: 0m
          labels:
            severity: critical
            auto_repair: "oom-handler"
          annotations:
            summary: "Pod {{ $labels.pod }} OOM Killed"
            description: "容器 {{ $labels.container }} 因記憶體不足被終止,將自動增加記憶體限制"
            repair_action: "自動增加記憶體限制 +50%"

        # Memory usage high (early warning).
        # Containers without a memory limit report a limit of 0; dividing by
        # that yields +Inf and would always fire. The `> 0` filter drops
        # those series from the divisor so only limited containers are
        # evaluated.
        - alert: HighMemoryUsage
          expr: |
            (
              container_memory_working_set_bytes{namespace="momo", container!=""}
              / (container_spec_memory_limit_bytes{namespace="momo", container!=""} > 0)
            ) > 0.85
          for: 5m
          labels:
            severity: warning
            auto_repair: "none"
          annotations:
            summary: "Pod {{ $labels.pod }} 記憶體使用率 > 85%"
            description: "容器 {{ $labels.container }} 記憶體使用率偏高,可能即將 OOM"

        # Memory close to the limit (OOM imminent).
        # Same `> 0` guard as HighMemoryUsage: without it an unlimited
        # container would trigger the automatic memory-limit increase.
        - alert: MemoryNearLimit
          expr: |
            (
              container_memory_working_set_bytes{namespace="momo", container!=""}
              / (container_spec_memory_limit_bytes{namespace="momo", container!=""} > 0)
            ) > 0.95
          for: 2m
          labels:
            severity: critical
            auto_repair: "oom-handler"
          annotations:
            summary: "Pod {{ $labels.pod }} 記憶體即將耗盡"
            description: "記憶體使用率 > 95%,將自動增加記憶體限制"
            repair_action: "自動增加記憶體限制 +50%"

    # ============================================
    # 2. PostgreSQL alerts (trigger postgres-repair.sh)
    # ============================================
    - name: postgres-alerts
      rules:
        # PostgreSQL connection failure.
        # NOTE(review): pg_up == 0 only fires while the exporter itself is
        # still being scraped; a missing exporter is not covered here.
        - alert: PostgresDown
          expr: pg_up{namespace="momo"} == 0
          for: 1m
          labels:
            severity: critical
            auto_repair: "postgres-repair"
          annotations:
            summary: "PostgreSQL 連線失敗"
            description: "momo namespace 的 PostgreSQL 無法連線,將自動重啟 Pod"
            repair_action: "自動重啟 PostgreSQL Pod"

        # Connection count too high.
        # pg_stat_activity_count is split per database/state while
        # pg_settings_max_connections is one series per server, so a plain
        # `/` finds no matching label sets and never fires. Aggregate both
        # sides to (namespace, instance) so they match one-to-one and the
        # ratio is total connections / max_connections.
        - alert: PostgresHighConnections
          expr: |
            (
              sum by (namespace, instance) (pg_stat_activity_count{namespace="momo"})
              / max by (namespace, instance) (pg_settings_max_connections{namespace="momo"})
            ) > 0.8
          for: 5m
          labels:
            severity: warning
            auto_repair: "postgres-repair"
          annotations:
            summary: "PostgreSQL 連線數 > 80%"
            description: "連線數過高,將自動終止閒置連線"
            repair_action: "終止閒置超過 30 分鐘的連線"

        # Deadlock detection.
        - alert: PostgresDeadlocks
          expr: rate(pg_stat_database_deadlocks{namespace="momo"}[5m]) > 0
          for: 5m
          labels:
            severity: warning
            auto_repair: "postgres-repair"
          annotations:
            summary: "PostgreSQL 檢測到死鎖"
            description: "資料庫發生死鎖,將自動終止阻塞查詢"
            repair_action: "終止長時間阻塞的查詢"

        # Too many slow queries: longest open transaction exceeds 300 s.
        # NOTE(review): repair_action advertises VACUUM ANALYZE while the
        # condition is a long-running transaction - confirm that is the
        # intended remediation in postgres-repair.sh.
        - alert: PostgresSlowQueries
          expr: |
            pg_stat_activity_max_tx_duration{namespace="momo"} > 300
          for: 5m
          labels:
            severity: warning
            auto_repair: "postgres-repair"
          annotations:
            summary: "PostgreSQL 存在超過 5 分鐘的長查詢"
            description: "長時間執行的查詢可能影響效能"
            repair_action: "自動 VACUUM ANALYZE"

    # ============================================
    # 3. Application health alerts (trigger auto-rollback.sh)
    # ============================================
    - name: app-health-alerts
      rules:
        # MOMO App (UAT, mo.wooo.work) not responding.
        # PromQL regexes are fully anchored. The previous pattern
        # `.*mo.wooo.work.*` also matched "momo.wooo.work", so a GCP outage
        # fired this alert (and its rollback) and the absent() arm could
        # never be true while the GCP probe existed. The host must now start
        # the label or follow a non-hostname character (e.g. the "/" of
        # "https://"), and the dots are escaped.
        - alert: MomoAppDown
          expr: |
            probe_success{job="blackbox", instance=~"(.*[^0-9A-Za-z.-])?mo\\.wooo\\.work.*"} == 0
            or absent(probe_success{job="blackbox", instance=~"(.*[^0-9A-Za-z.-])?mo\\.wooo\\.work.*"})
          for: 2m
          labels:
            severity: critical
            auto_repair: "auto-rollback"
          annotations:
            summary: "MOMO App UAT 無回應"
            description: "https://mo.wooo.work 健康檢查失敗超過 2 分鐘"
            repair_action: "連續 5 次失敗將自動回滾"

        # MOMO App (GCP, momo.wooo.work) not responding.
        # Same host-boundary regex as MomoAppDown so only this exact host
        # is selected (dots escaped, no arbitrary hostname prefix).
        - alert: MomoAppGCPDown
          expr: |
            probe_success{job="blackbox", instance=~"(.*[^0-9A-Za-z.-])?momo\\.wooo\\.work.*"} == 0
            or absent(probe_success{job="blackbox", instance=~"(.*[^0-9A-Za-z.-])?momo\\.wooo\\.work.*"})
          for: 2m
          labels:
            severity: critical
            auto_repair: "domain-health-monitor"
          annotations:
            summary: "MOMO App GCP 無回應"
            description: "https://momo.wooo.work 健康檢查失敗"
            repair_action: "自動 kubectl rollout restart"

        # HTTP 5xx error rate too high (share of 5xx over all requests, 5m).
        - alert: HighHTTP5xxRate
          expr: |
            (
              sum(rate(http_requests_total{status=~"5..", namespace="momo"}[5m]))
              / sum(rate(http_requests_total{namespace="momo"}[5m]))
            ) > 0.05
          for: 5m
          labels:
            severity: warning
            auto_repair: "auto-rollback"
          annotations:
            summary: "HTTP 5xx 錯誤率 > 5%"
            description: "可能存在程式碼問題,將監控是否需要回滾"
            repair_action: "連續錯誤將觸發自動回滾"

        # Pod restarting too often.
        - alert: PodRestartTooMany
          expr: |
            increase(kube_pod_container_status_restarts_total{namespace="momo"}[1h]) > 5
          for: 5m
          labels:
            severity: warning
            auto_repair: "auto-rollback"
          annotations:
            summary: "Pod {{ $labels.pod }} 1 小時內重啟超過 5 次"
            description: "頻繁重啟可能表示程式碼有問題"
            repair_action: "將考慮自動回滾到上一版本"

    # ============================================
    # 4. Infrastructure alerts (trigger domain-health-monitor.sh)
    # ============================================
    - name: infrastructure-alerts
      rules:
        # Disk space low: less than 15% of the root filesystem available.
        - alert: DiskSpaceLow
          expr: |
            (
              node_filesystem_avail_bytes{mountpoint="/"}
              / node_filesystem_size_bytes{mountpoint="/"}
            ) < 0.15
          for: 5m
          labels:
            severity: warning
            auto_repair: "disk-cleanup"
          annotations:
            summary: "磁碟剩餘空間 < 15%"
            description: "磁碟空間不足,將自動清理"
            repair_action: "自動清理 Docker、日誌"

        # Disk space critically low: less than 5% of the root filesystem
        # available.
        - alert: DiskSpaceCritical
          expr: |
            (
              node_filesystem_avail_bytes{mountpoint="/"}
              / node_filesystem_size_bytes{mountpoint="/"}
            ) < 0.05
          for: 2m
          labels:
            severity: critical
            auto_repair: "disk-cleanup"
          annotations:
            summary: "磁碟剩餘空間 < 5%"
            description: "磁碟空間嚴重不足!"
            repair_action: "緊急自動清理"

        # Scheduler stopped: the momo-scheduler deployment has no available
        # replicas.
        # NOTE(review): this also fires when the deployment is deliberately
        # scaled to 0 - confirm that is acceptable.
        - alert: MomoSchedulerDown
          expr: |
            kube_deployment_status_replicas_available{deployment="momo-scheduler", namespace="momo"} == 0
          for: 5m
          labels:
            severity: critical
            auto_repair: "domain-health-monitor"
          annotations:
            summary: "MOMO Scheduler 已停止"
            description: "排程器無可用副本"
            repair_action: "自動 kubectl rollout restart"