Files
ewoooc/docker/prometheus/alert_rules.yml
ogt 1b4f3a7bbe
Some checks failed
CD Pipeline / deploy (push) Failing after 59s
feat: EwoooC 初始化 — 完整專案推版至 Gitea
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml)
- 部署模式: rsync Python 檔案至 188 → docker restart (volume mount)
- Dockerfile/requirements 變動時自動重建 Docker image
- 部署通知: Telegram (開始/成功/失敗)
- 健康檢查: https://mo.wooo.work/health (最多 5 次重試)
- 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-19 01:21:13 +08:00

224 lines
9.6 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# =============================================================================
# WOOO TECH - Momo Pro System
# Prometheus Alert Rules
# Version: 1.0
# =============================================================================
#
# 告警嚴重程度定義:
# - critical: 需要立即處理的嚴重問題
# - warning: 需要關注但不緊急的問題
# - info: 資訊性通知
#
# =============================================================================
groups:
# ===========================================================================
# Host resource alerts
#
# Rules in this group are built from node_cpu_seconds_total,
# node_memory_*, node_filesystem_* and node_load5. Each resource has a
# "warning" tier and (except load) a "critical" tier at a higher threshold.
# ===========================================================================
- name: host_alerts
  rules:
  # -----------------------------------------------------------------------
  # CPU usage
  #
  # Usage is 100 minus the per-instance average idle percentage.
  # rate() is used instead of irate(): irate() only considers the last two
  # samples in the window, so a single quiet scrape can drop the value below
  # the threshold and reset the `for` timer. rate() averages over the full
  # 5m window, which is what a sustained-load alert needs.
  # -----------------------------------------------------------------------
  - alert: HostHighCpuUsage
    expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 50'
    for: 5m
    labels:
      severity: warning
      category: cpu
    annotations:
      summary: '主機 CPU 使用率過高'
      description: '主機 {{ $labels.instance }} CPU 使用率超過 50% 持續 5 分鐘,當前值: {{ $value | printf "%.1f" }}%'
      value: '{{ $value | printf "%.1f" }}%'
  - alert: HostCriticalCpuUsage
    expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80'
    for: 5m
    labels:
      severity: critical
      category: cpu
    annotations:
      summary: '主機 CPU 使用率嚴重過高'
      description: '主機 {{ $labels.instance }} CPU 使用率超過 80%,當前值: {{ $value | printf "%.1f" }}%'
      value: '{{ $value | printf "%.1f" }}%'
  # -----------------------------------------------------------------------
  # Memory usage
  #
  # Used percentage = (1 - MemAvailable / MemTotal) * 100.
  # -----------------------------------------------------------------------
  - alert: HostHighMemoryUsage
    expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 50'
    for: 5m
    labels:
      severity: warning
      category: memory
    annotations:
      summary: '主機記憶體使用率過高'
      description: '主機 {{ $labels.instance }} 記憶體使用率超過 50% 持續 5 分鐘,當前值: {{ $value | printf "%.1f" }}%'
      value: '{{ $value | printf "%.1f" }}%'
  - alert: HostCriticalMemoryUsage
    expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85'
    for: 5m
    labels:
      severity: critical
      category: memory
    annotations:
      summary: '主機記憶體使用率嚴重過高'
      description: '主機 {{ $labels.instance }} 記憶體使用率超過 85%,當前值: {{ $value | printf "%.1f" }}%'
      value: '{{ $value | printf "%.1f" }}%'
  # -----------------------------------------------------------------------
  # Disk usage
  #
  # Evaluated per filesystem series; tmpfs and overlay filesystems are
  # excluded by the fstype matcher on both sides of the division.
  # -----------------------------------------------------------------------
  - alert: HostHighDiskUsage
    expr: '(1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 80'
    for: 5m
    labels:
      severity: warning
      category: disk
    annotations:
      summary: '主機磁碟使用率過高'
      description: '主機 {{ $labels.instance }} 磁碟 {{ $labels.mountpoint }} 使用率超過 80%,當前值: {{ $value | printf "%.1f" }}%'
      value: '{{ $value | printf "%.1f" }}%'
  - alert: HostCriticalDiskUsage
    expr: '(1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 90'
    for: 5m
    labels:
      severity: critical
      category: disk
    annotations:
      summary: '主機磁碟空間嚴重不足'
      description: '主機 {{ $labels.instance }} 磁碟 {{ $labels.mountpoint }} 使用率超過 90%,當前值: {{ $value | printf "%.1f" }}%'
      value: '{{ $value | printf "%.1f" }}%'
  # -----------------------------------------------------------------------
  # System load
  #
  # 5-minute load average divided by the number of idle-mode CPU series
  # (after dropping the cpu and mode labels), i.e. load per reported CPU.
  # -----------------------------------------------------------------------
  - alert: HostHighLoadAverage
    expr: 'node_load5 / count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}) > 0.8'
    for: 5m
    labels:
      severity: warning
      category: load
    annotations:
      summary: '主機系統負載過高'
      # Separator added before the current-value clause to match the other
      # descriptions in this group.
      description: '主機 {{ $labels.instance }} 5分鐘負載平均值過高,當前值: {{ $value | printf "%.2f" }}'
      value: '{{ $value | printf "%.2f" }}'
# ===========================================================================
# 容器監控告警
# ===========================================================================
# Container alerts: per-container CPU and memory rules. The name!=""
# matcher restricts both rules to series that carry a container name.
- name: container_alerts
  rules:
  # -----------------------------------------------------------------------
  # Container CPU usage
  #
  # rate() of CPU seconds times 100, so 100 equals one full CPU core of
  # CPU time consumed per second.
  # -----------------------------------------------------------------------
  - alert: ContainerHighCpuUsage
    expr: '(rate(container_cpu_usage_seconds_total{name!=""}[5m]) * 100) > 50'
    for: 5m
    labels:
      severity: warning
      category: container_cpu
    annotations:
      summary: '容器 CPU 使用率過高'
      description: '容器 {{ $labels.name }} CPU 使用率超過 50%,當前值: {{ $value | printf "%.1f" }}%'
      container: '{{ $labels.name }}'
      value: '{{ $value | printf "%.1f" }}%'
  # -----------------------------------------------------------------------
  # Container memory usage (as a percentage of the configured limit)
  #
  # The denominator is filtered with "> 0": a container without a memory
  # limit reports a limit of 0, and dividing by it yields +Inf, which would
  # keep this alert firing permanently for every unlimited container.
  # Containers without a limit are therefore not evaluated by this rule.
  # -----------------------------------------------------------------------
  - alert: ContainerHighMemoryUsage
    expr: '(container_memory_usage_bytes{name!=""} / (container_spec_memory_limit_bytes{name!=""} > 0)) * 100 > 50'
    for: 5m
    labels:
      severity: warning
      category: container_memory
    annotations:
      summary: '容器記憶體使用率過高'
      description: '容器 {{ $labels.name }} 記憶體使用率超過 50%,當前值: {{ $value | printf "%.1f" }}%'
      container: '{{ $labels.name }}'
      value: '{{ $value | printf "%.1f" }}%'
# ===========================================================================
# 網站健康監控告警
# ===========================================================================
# Website health alerts: rules over probe metrics from jobs whose name
# matches blackbox-http.*.
- name: website_alerts
  rules:
  # -----------------------------------------------------------------------
  # Website unreachable
  #
  # Fires when the probe has reported failure (probe_success == 0) for 1m.
  # -----------------------------------------------------------------------
  - alert: WebsiteDown
    expr: 'probe_success{job=~"blackbox-http.*"} == 0'
    for: 1m
    labels:
      severity: critical
      category: website
    annotations:
      summary: '網站無法訪問'
      description: '網站 {{ $labels.instance }} 無法訪問,請立即檢查'
  # -----------------------------------------------------------------------
  # Website slow response
  #
  # probe_http_duration_seconds is reported as one series per request
  # phase, so comparing it directly to 5 only checks whether a single phase
  # exceeds 5s. The phases are summed so the threshold applies to the whole
  # HTTP request, which is what the description states.
  # -----------------------------------------------------------------------
  - alert: WebsiteSlowResponse
    expr: 'sum without(phase) (probe_http_duration_seconds{job=~"blackbox-http.*"}) > 5'
    for: 2m
    labels:
      severity: warning
      category: website
    annotations:
      summary: '網站響應緩慢'
      description: '網站 {{ $labels.instance }} 響應時間超過 5 秒,當前值: {{ $value | printf "%.2f" }} 秒'
      value: '{{ $value | printf "%.2f" }}s'
# ===========================================================================
# 網路連通性告警
# ===========================================================================
# Network connectivity alerts: both rules fire when probe_success has been
# 0 for one minute, for the ICMP job and for TCP jobs respectively.
- name: network_alerts
  rules:
  # -----------------------------------------------------------------------
  # Host does not answer ping (job is exactly "blackbox-icmp")
  # -----------------------------------------------------------------------
  - alert: HostUnreachable
    expr: 'probe_success{job="blackbox-icmp"} == 0'
    for: 1m
    annotations:
      summary: '主機無法連通'
      description: '主機 {{ $labels.instance }} 無法 ping 通,可能已離線'
    labels:
      category: network
      severity: critical
  # -----------------------------------------------------------------------
  # TCP port cannot be connected (any job matching blackbox-tcp.*)
  # -----------------------------------------------------------------------
  - alert: ServicePortDown
    expr: 'probe_success{job=~"blackbox-tcp.*"} == 0'
    for: 1m
    annotations:
      summary: '服務端口無法連接'
      description: '服務 {{ $labels.instance }} 無法連接,請檢查服務狀態'
    labels:
      category: network
      severity: critical
# ===========================================================================
# PostgreSQL 資料庫監控告警
# ===========================================================================
# PostgreSQL database alerts.
- name: postgres_alerts
  rules:
  # -----------------------------------------------------------------------
  # PostgreSQL unreachable
  #
  # Fires when pg_up has been 0 for one minute. The rule only matches
  # existing pg_up series with value 0.
  # -----------------------------------------------------------------------
  - alert: PostgresDown
    expr: 'pg_up == 0'
    for: 1m
    annotations:
      summary: 'PostgreSQL 無法連接'
      description: 'PostgreSQL 資料庫無法連接,請立即檢查'
    labels:
      category: database
      severity: critical