Some checks failed
CD Pipeline / deploy (push) Failing after 59s
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml) - 部署模式: rsync Python 檔案至 188 → docker restart (volume mount) - Dockerfile/requirements 變動時自動重建 Docker image - 部署通知: Telegram (開始/成功/失敗) - 健康檢查: https://mo.wooo.work/health (最多 5 次重試) - 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
196 lines
7.0 KiB
YAML
196 lines
7.0 KiB
YAML
# =============================================================================
# WOOO TECH - Momo Pro System
# Prometheus Alerting Rules
#
# PrometheusRule resource (Prometheus Operator CRD). It is labelled
# `release: prometheus`; presumably that is the label the Prometheus
# instance's ruleSelector matches on -- confirm against the Prometheus CR.
# Rule groups:
#   momo-pod-alerts       pod lifecycle (OOM, restarts, readiness, pending)
#   momo-resource-alerts  container memory / CPU relative to their limits
#   momo-postgres-alerts  PostgreSQL pod readiness and PVC usage
#   momo-app-alerts       ingress 5xx ratio and deployment availability
#   node-alerts           node memory, root filesystem and readiness
# =============================================================================
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: momo-alerts
  namespace: monitoring
  labels:
    release: prometheus
spec:
  groups:
    # Pod health alerts
    - name: momo-pod-alerts
      rules:
        # Pod OOMKilled alert.
        # The last_terminated_reason gauge keeps reporting 1 after the kill
        # (until the container terminates for a different reason or the pod
        # is replaced), so on its own it would keep the alert firing long
        # after the incident. It is therefore gated on a restart having
        # happened within the last 10 minutes, which lets the alert resolve.
        - alert: PodOOMKilled
          expr: |
            (kube_pod_container_status_last_terminated_reason{reason="OOMKilled", namespace="momo"} == 1)
            and on(namespace, pod, container)
            (increase(kube_pod_container_status_restarts_total{namespace="momo"}[10m]) > 0)
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Pod OOMKilled: {{ $labels.pod }}"
            description: "Pod {{ $labels.pod }} 因記憶體不足被終止 (OOMKilled)"

        # Too many pod restarts (only pods that are currently Running are
        # considered, via the `and on(pod, namespace)` join below).
        - alert: PodRestartTooMany
          expr: |
            increase(kube_pod_container_status_restarts_total{namespace="momo"}[1h]) > 3
            and on(pod, namespace)
            kube_pod_status_phase{namespace="momo", phase="Running"} == 1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod 重啟頻繁: {{ $labels.pod }}"
            description: "Pod {{ $labels.pod }} 在過去 1 小時內重啟超過 3 次"

        # Pod not ready for more than 5 minutes.
        - alert: PodNotReady
          expr: |
            kube_pod_status_ready{condition="true", namespace="momo"} == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Pod 未就緒: {{ $labels.pod }}"
            description: "Pod {{ $labels.pod }} 超過 5 分鐘未就緒"

        # Pod stuck in Pending for more than 10 minutes.
        - alert: PodPending
          expr: |
            kube_pod_status_phase{phase="Pending", namespace="momo"} == 1
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Pod 處於 Pending: {{ $labels.pod }}"
            description: "Pod {{ $labels.pod }} 超過 10 分鐘無法啟動"

    # Resource usage alerts
    - name: momo-resource-alerts
      rules:
        # High memory usage: working set above 80% of the memory limit.
        # The `> 0` filter on the limit drops containers that have no memory
        # limit (reported as 0); dividing by 0 would yield +Inf and fire the
        # alert for every unlimited container.
        - alert: HighMemoryUsage
          expr: |
            (
              container_memory_working_set_bytes{namespace="momo"}
              / (container_spec_memory_limit_bytes{namespace="momo"} > 0)
            ) > 0.8
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "記憶體使用率過高: {{ $labels.pod }}"
            description: "Pod {{ $labels.pod }} 記憶體使用率超過 80%,當前值: {{ $value | humanizePercentage }}"

        # Memory nearly exhausted: working set above 95% of the memory limit.
        # Same `> 0` guard as HighMemoryUsage to skip unlimited containers.
        - alert: MemoryNearLimit
          expr: |
            (
              container_memory_working_set_bytes{namespace="momo"}
              / (container_spec_memory_limit_bytes{namespace="momo"} > 0)
            ) > 0.95
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "記憶體即將耗盡: {{ $labels.pod }}"
            description: "Pod {{ $labels.pod }} 記憶體使用率超過 95%,即將 OOM!"

        # High CPU usage: cores used above 80% of the CPU limit.
        # The limit in cores is quota / period, so the actual CFS period
        # metric is used instead of assuming the default of 100000 us.
        # `ignoring(cpu)`: the usage counter may carry a `cpu="total"` label
        # that the spec metrics lack (depends on the cAdvisor version --
        # verify in your cluster); without it the division matches nothing.
        - alert: HighCPUUsage
          expr: |
            (
              rate(container_cpu_usage_seconds_total{namespace="momo"}[5m])
              / ignoring(cpu)
              (container_spec_cpu_quota{namespace="momo"} / container_spec_cpu_period{namespace="momo"})
            ) > 0.8
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "CPU 使用率過高: {{ $labels.pod }}"
            description: "Pod {{ $labels.pod }} CPU 使用率超過 80%"

    # PostgreSQL alerts
    - name: momo-postgres-alerts
      rules:
        # PostgreSQL unavailable: fires when no momo-postgres* pod reports
        # ready == 1, which covers both "not ready" and "pod does not exist".
        - alert: PostgresDown
          expr: |
            absent(kube_pod_status_ready{pod=~"momo-postgres.*", condition="true", namespace="momo"} == 1)
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "PostgreSQL 服務異常"
            description: "momo-postgres Pod 未就緒或不存在"

        # PVC space low: used bytes above 85% of capacity.
        - alert: PVCSpaceLow
          expr: |
            (kubelet_volume_stats_used_bytes{namespace="momo"} / kubelet_volume_stats_capacity_bytes{namespace="momo"}) > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PVC 空間不足: {{ $labels.persistentvolumeclaim }}"
            description: "PVC {{ $labels.persistentvolumeclaim }} 使用率超過 85%"

    # Application health alerts
    - name: momo-app-alerts
      rules:
        # HTTP 5xx error ratio above 5% of all ingress requests.
        - alert: HighHTTP5xxRate
          expr: |
            sum(rate(nginx_ingress_controller_requests{status=~"5.*", namespace="momo"}[5m]))
            / sum(rate(nginx_ingress_controller_requests{namespace="momo"}[5m])) > 0.05
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "HTTP 5xx 錯誤率過高"
            description: "momo 應用 5xx 錯誤率超過 5%"

        # Application unavailable: the momo-app deployment reports zero
        # available replicas.
        - alert: MomoAppDown
          expr: |
            kube_deployment_status_replicas_available{deployment="momo-app", namespace="momo"} == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "MOMO 應用服務異常"
            description: "momo-app 部署沒有可用的副本,服務完全中斷"

        # Scheduler unavailable: the momo-scheduler deployment reports zero
        # available replicas.
        - alert: MomoSchedulerDown
          expr: |
            kube_deployment_status_replicas_available{deployment="momo-scheduler", namespace="momo"} == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "MOMO Scheduler 服務異常"
            description: "momo-scheduler 部署沒有可用的副本,排程任務已停止"

    # Node alerts
    - name: node-alerts
      rules:
        # Node memory low: available memory below 10% of total.
        - alert: NodeMemoryLow
          expr: |
            (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "節點記憶體不足: {{ $labels.instance }}"
            description: "節點 {{ $labels.instance }} 可用記憶體低於 10%"

        # Node disk low: root filesystem has less than 15% space available.
        - alert: NodeDiskLow
          expr: |
            (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "節點磁碟空間不足: {{ $labels.instance }}"
            description: "節點 {{ $labels.instance }} 根目錄可用空間低於 15%"

        # Node unavailable: Ready condition has not been true for 5 minutes.
        - alert: NodeNotReady
          expr: |
            kube_node_status_condition{condition="Ready", status="true"} == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "節點不可用: {{ $labels.node }}"
            description: "節點 {{ $labels.node }} 超過 5 分鐘處於 NotReady 狀態"