Some checks failed
CD Pipeline / deploy (push) Failing after 59s
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml) - 部署模式: rsync Python 檔案至 188 → docker restart (volume mount) - Dockerfile/requirements 變動時自動重建 Docker image - 部署通知: Telegram (開始/成功/失敗) - 健康檢查: https://mo.wooo.work/health (最多 5 次重試) - 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
224 lines
9.6 KiB
YAML
224 lines
9.6 KiB
YAML
# =============================================================================
# WOOO TECH - Momo Pro System
# Prometheus Alert Rules
# Version: 1.0
# =============================================================================
#
# 告警嚴重程度定義:
# - critical: 需要立即處理的嚴重問題
# - warning: 需要關注但不緊急的問題
# - info: 資訊性通知
#
# =============================================================================
|
||
|
||
groups:
|
||
# ===========================================================================
|
||
# 主機資源監控告警
|
||
# ===========================================================================
|
||
- name: host_alerts
  # Host resource alerts: CPU, memory, filesystem usage and load average.
  # Every rule must hold for 5 minutes before it fires.
  rules:
    # -----------------------------------------------------------------------
    # CPU usage
    # -----------------------------------------------------------------------
    # CPU usage is derived as 100 minus the average idle percentage per
    # instance. rate() is used rather than irate(): irate() only looks at the
    # last two samples in the window, so a single short dip would reset the
    # `for` timer and keep a sustained-load alert from ever firing.
    - alert: HostHighCpuUsage
      expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 50
      for: 5m
      labels:
        severity: warning
        category: cpu
      annotations:
        summary: "主機 CPU 使用率過高"
        description: "主機 {{ $labels.instance }} CPU 使用率超過 50% 持續 5 分鐘,當前值: {{ $value | printf \"%.1f\" }}%"
        value: "{{ $value | printf \"%.1f\" }}%"

    # Same expression as HostHighCpuUsage with a critical threshold of 80%.
    - alert: HostCriticalCpuUsage
      expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
      for: 5m
      labels:
        severity: critical
        category: cpu
      annotations:
        summary: "主機 CPU 使用率嚴重過高"
        description: "主機 {{ $labels.instance }} CPU 使用率超過 80%,當前值: {{ $value | printf \"%.1f\" }}%"
        value: "{{ $value | printf \"%.1f\" }}%"

    # -----------------------------------------------------------------------
    # Memory usage
    # -----------------------------------------------------------------------
    # Used memory is expressed as the share of MemTotal that is not
    # MemAvailable. Warning above 50%.
    - alert: HostHighMemoryUsage
      expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 50
      for: 5m
      labels:
        severity: warning
        category: memory
      annotations:
        summary: "主機記憶體使用率過高"
        description: "主機 {{ $labels.instance }} 記憶體使用率超過 50% 持續 5 分鐘,當前值: {{ $value | printf \"%.1f\" }}%"
        value: "{{ $value | printf \"%.1f\" }}%"

    # Critical above 85%.
    - alert: HostCriticalMemoryUsage
      expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
      for: 5m
      labels:
        severity: critical
        category: memory
      annotations:
        summary: "主機記憶體使用率嚴重過高"
        description: "主機 {{ $labels.instance }} 記憶體使用率超過 85%,當前值: {{ $value | printf \"%.1f\" }}%"
        value: "{{ $value | printf \"%.1f\" }}%"

    # -----------------------------------------------------------------------
    # Disk usage
    # -----------------------------------------------------------------------
    # Evaluated per filesystem; tmpfs and overlay filesystems are excluded on
    # both sides of the division so the series still match. Warning above 80%.
    - alert: HostHighDiskUsage
      expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 80
      for: 5m
      labels:
        severity: warning
        category: disk
      annotations:
        summary: "主機磁碟使用率過高"
        description: "主機 {{ $labels.instance }} 磁碟 {{ $labels.mountpoint }} 使用率超過 80%,當前值: {{ $value | printf \"%.1f\" }}%"
        value: "{{ $value | printf \"%.1f\" }}%"

    # Critical above 90%.
    - alert: HostCriticalDiskUsage
      expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 90
      for: 5m
      labels:
        severity: critical
        category: disk
      annotations:
        summary: "主機磁碟空間嚴重不足"
        description: "主機 {{ $labels.instance }} 磁碟 {{ $labels.mountpoint }} 使用率超過 90%,當前值: {{ $value | printf \"%.1f\" }}%"
        value: "{{ $value | printf \"%.1f\" }}%"

    # -----------------------------------------------------------------------
    # System load
    # -----------------------------------------------------------------------
    # 5-minute load average divided by the number of idle-mode CPU series
    # (one per CPU), i.e. load per core. Fires above 0.8 per core.
    - alert: HostHighLoadAverage
      expr: node_load5 / count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}) > 0.8
      for: 5m
      labels:
        severity: warning
        category: load
      annotations:
        summary: "主機系統負載過高"
        description: "主機 {{ $labels.instance }} 5分鐘負載平均值過高,當前值: {{ $value | printf \"%.2f\" }}"
        value: "{{ $value | printf \"%.2f\" }}"
|
||
|
||
# ===========================================================================
|
||
# 容器監控告警
|
||
# ===========================================================================
|
||
- name: container_alerts
  # Per-container resource alerts. Series with an empty `name` label are
  # filtered out of every expression.
  rules:
    # -----------------------------------------------------------------------
    # Container CPU usage
    # -----------------------------------------------------------------------
    # CPU seconds consumed per second over 5 minutes, scaled to a percentage
    # (100 == one full core).
    # NOTE(review): if the exporter emits one series per CPU core, this
    # compares each core separately rather than the container total - confirm.
    - alert: ContainerHighCpuUsage
      expr: (rate(container_cpu_usage_seconds_total{name!=""}[5m]) * 100) > 50
      for: 5m
      labels:
        severity: warning
        category: container_cpu
      annotations:
        summary: "容器 CPU 使用率過高"
        description: "容器 {{ $labels.name }} CPU 使用率超過 50%,當前值: {{ $value | printf \"%.1f\" }}%"
        container: "{{ $labels.name }}"
        value: "{{ $value | printf \"%.1f\" }}%"

    # -----------------------------------------------------------------------
    # Container memory usage
    # -----------------------------------------------------------------------
    # Memory usage as a percentage of the container's configured limit.
    # The `> 0` filter on the denominator drops containers that have no memory
    # limit (reported as 0); without it the division yields +Inf and the alert
    # fires permanently for every unlimited container.
    - alert: ContainerHighMemoryUsage
      expr: (container_memory_usage_bytes{name!=""} / (container_spec_memory_limit_bytes{name!=""} > 0)) * 100 > 50
      for: 5m
      labels:
        severity: warning
        category: container_memory
      annotations:
        summary: "容器記憶體使用率過高"
        description: "容器 {{ $labels.name }} 記憶體使用率超過 50%,當前值: {{ $value | printf \"%.1f\" }}%"
        container: "{{ $labels.name }}"
        value: "{{ $value | printf \"%.1f\" }}%"
|
||
|
||
# ===========================================================================
|
||
# 網站健康監控告警
|
||
# ===========================================================================
|
||
- name: website_alerts
  # HTTP probe alerts for every job whose name starts with "blackbox-http".
  rules:
    # -----------------------------------------------------------------------
    # Website unreachable
    # -----------------------------------------------------------------------
    # Fires when the probe has reported failure for 1 minute.
    - alert: WebsiteDown
      expr: probe_success{job=~"blackbox-http.*"} == 0
      for: 1m
      labels:
        severity: critical
        category: website
      annotations:
        summary: "網站無法訪問"
        description: "網站 {{ $labels.instance }} 無法訪問,請立即檢查"

    # -----------------------------------------------------------------------
    # Website slow response
    # -----------------------------------------------------------------------
    # probe_http_duration_seconds is reported per request phase (`phase`
    # label), so comparing the raw series to 5 would only fire when a single
    # phase exceeds 5 s. Summing the phases gives the total HTTP time that the
    # description refers to; all other labels are preserved.
    - alert: WebsiteSlowResponse
      expr: sum without (phase) (probe_http_duration_seconds{job=~"blackbox-http.*"}) > 5
      for: 2m
      labels:
        severity: warning
        category: website
      annotations:
        summary: "網站響應緩慢"
        description: "網站 {{ $labels.instance }} 響應時間超過 5 秒,當前值: {{ $value | printf \"%.2f\" }} 秒"
        value: "{{ $value | printf \"%.2f\" }}s"
|
||
|
||
# ===========================================================================
|
||
# 網路連通性告警
|
||
# ===========================================================================
|
||
- name: network_alerts
  # Connectivity alerts based on probe_success from the ICMP and TCP probe
  # jobs. Both rules are critical and fire after 1 minute of failed probes.
  rules:
    # -----------------------------------------------------------------------
    # Host does not answer ping
    # -----------------------------------------------------------------------
    # Matches only the job named exactly "blackbox-icmp".
    - alert: HostUnreachable
      expr: 'probe_success{job="blackbox-icmp"} == 0'
      for: 1m
      labels:
        category: network
        severity: critical
      annotations:
        description: '主機 {{ $labels.instance }} 無法 ping 通,可能已離線'
        summary: '主機無法連通'

    # -----------------------------------------------------------------------
    # TCP port not accepting connections
    # -----------------------------------------------------------------------
    # Matches every job whose name starts with "blackbox-tcp".
    - alert: ServicePortDown
      expr: 'probe_success{job=~"blackbox-tcp.*"} == 0'
      for: 1m
      labels:
        category: network
        severity: critical
      annotations:
        description: '服務 {{ $labels.instance }} 無法連接,請檢查服務狀態'
        summary: '服務端口無法連接'
|
||
|
||
# ===========================================================================
|
||
# PostgreSQL 資料庫監控告警
|
||
# ===========================================================================
|
||
- name: postgres_alerts
|
||
rules:
|
||
# -----------------------------------------------------------------------
|
||
# PostgreSQL 無法連接
|
||
# -----------------------------------------------------------------------
|
||
- alert: PostgresDown
|
||
expr: pg_up == 0
|
||
for: 1m
|
||
labels:
|
||
severity: critical
|
||
category: database
|
||
annotations:
|
||
summary: "PostgreSQL 無法連接"
|
||
description: "PostgreSQL 資料庫無法連接,請立即檢查"
|