Files
awoooi/ops/monitoring/alerts-unified.yml
OG T dc27f8f811 ops(monitoring): 統一 Prometheus 告警規則 — 40+條含統一 layer 標籤
修正:
- ClawBotDown → OpenClawDown (舊命名廢棄)
- 加入 SentryDown/HarborDown/GiteaDown/AlertmanagerDown
- 所有規則補齊 layer/component/host/auto_repair 統一標籤
- 整合 k8s/monitoring/*.yaml → ops/monitoring/alerts-unified.yml

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 02:26:18 +08:00

417 lines
14 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# ops/monitoring/alerts-unified.yml
# AWOOOI 統一 Prometheus 告警規則
# 2026-04-05 Claude Code: 整合所有規則,加入統一 layer 標籤
# 部署目標: 192.168.0.110:/home/wooo/monitoring/alerts.yml
# 部署方式: scripts/ops/deploy-alerts.sh (CD 自動部署)
#
# 標籤規範:
# layer: k8s | docker-110 | docker-188 | systemd-188
# component: 服務名稱
# team: ops | backend | ai | platform
# host: "110" | "188" | "120" | "121"
# auto_repair: "true" | "false"
groups:
# =========================================================================
# Host-level alerts (host_alerts)
# =========================================================================
- name: host_alerts
  rules:
  # A scrape target whose job name starts with "node-exporter" has reported
  # up == 0 for one minute.
  # NOTE(review): every rule in this group is tagged layer=systemd-188 even
  # though the selectors are not restricted to a single host — confirm.
  - alert: HostDown
    for: 1m
    expr: 'up{job=~"node-exporter.*"} == 0'
    labels:
      team: ops
      layer: systemd-188
      severity: critical
      auto_repair: "false"
    annotations:
      description: 'Node Exporter 無回應超過 1 分鐘'
      summary: '主機 {{ $labels.host }} 不可達'
  # CPU usage, derived from the idle-mode rate averaged per `host` label,
  # stays above 80% for five minutes.
  - alert: HostHighCpuLoad
    for: 5m
    expr: '100 - (avg by(host) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80'
    labels:
      team: ops
      layer: systemd-188
      severity: warning
      auto_repair: "false"
    annotations:
      description: 'CPU 使用率超過 80%'
      summary: '主機 {{ $labels.host }} CPU 高負載'
  # The share of memory not reported as available stays above 85% for five
  # minutes.
  - alert: HostOutOfMemory
    for: 5m
    expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85'
    labels:
      team: ops
      layer: systemd-188
      severity: warning
      auto_repair: "false"
    annotations:
      description: '記憶體使用率超過 85%'
      summary: '主機 {{ $labels.host }} 記憶體不足'
  # The used share of a filesystem stays above 85% for five minutes; tmpfs
  # and overlay filesystems are excluded by the fstype matcher.
  - alert: HostOutOfDiskSpace
    for: 5m
    expr: '(1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 85'
    labels:
      team: ops
      layer: systemd-188
      severity: warning
      auto_repair: "false"
    annotations:
      description: '磁碟使用率超過 85%'
      summary: '主機 {{ $labels.host }} 磁碟空間不足'
# =========================================================================
# Kubernetes cluster alerts (kubernetes_alerts)
# =========================================================================
- name: kubernetes_alerts
  rules:
  # The Ready=true condition series of a node has been 0 for two minutes.
  - alert: K3sNodeNotReady
    expr: kube_node_status_condition{condition="Ready", status="true"} == 0
    for: 2m
    labels:
      severity: critical
      layer: k8s
      team: ops
      auto_repair: "false"
    annotations:
      summary: "K3s 節點 {{ $labels.node }} 未就緒"
      description: "節點超過 2 分鐘未達到 Ready 狀態"
  # The container restart counter in awoooi-prod has a non-zero 15m rate
  # for five minutes.
  - alert: KubePodCrashLooping
    expr: rate(kube_pod_container_status_restarts_total{namespace="awoooi-prod"}[15m]) > 0
    for: 5m
    labels:
      severity: warning
      layer: k8s
      team: ops
      auto_repair: "true"
    annotations:
      summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} 持續重啟"
      description: "Pod 在過去 15 分鐘內重啟次數異常"
  # A pod in awoooi-prod is in the Running phase but not Ready for five
  # minutes. The join on kube_pod_status_phase keeps the rule aligned with
  # its description ("Running" pods): without it, pods that have finished
  # (e.g. completed Jobs) also report ready == 0 and would fire forever.
  - alert: KubePodNotReady
    expr: >-
      kube_pod_status_ready{condition="true",namespace="awoooi-prod"} == 0
      and on(namespace, pod)
      kube_pod_status_phase{phase="Running",namespace="awoooi-prod"} == 1
    for: 5m
    labels:
      severity: warning
      layer: k8s
      team: ops
      auto_repair: "true"
    annotations:
      summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} 未就緒"
      description: "Running 中的 Pod 超過 5 分鐘未達到 Ready 狀態"
  # Desired and available replica counts of a Deployment in awoooi-prod
  # differ for ten minutes.
  - alert: KubeDeploymentReplicasMismatch
    expr: kube_deployment_spec_replicas{namespace="awoooi-prod"} != kube_deployment_status_replicas_available{namespace="awoooi-prod"}
    for: 10m
    labels:
      severity: warning
      layer: k8s
      team: ops
      auto_repair: "true"
    annotations:
      summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} 副本數不匹配"
      description: "期望副本數與可用副本數不一致超過 10 分鐘"
  # The Velero backup failure counter increased within the last 24 hours.
  - alert: VeleroBackupFailed
    expr: increase(velero_backup_failure_total[24h]) > 0
    for: 5m
    labels:
      severity: warning
      layer: k8s
      team: ops
      component: velero
      auto_repair: "false"
    annotations:
      summary: "Velero 備份失敗"
      description: "過去 24 小時有備份失敗"
  # The last successful backup timestamp is more than 86400 s (24 h) old.
  - alert: VeleroBackupNotRun
    expr: time() - velero_backup_last_successful_timestamp > 86400
    for: 10m
    labels:
      severity: critical
      layer: k8s
      team: ops
      component: velero
      auto_repair: "false"
    annotations:
      summary: "Velero 超過 24 小時未成功備份"
      description: "最後一次成功備份超過 24 小時"
# =========================================================================
# Database alerts (database_alerts)
# =========================================================================
- name: database_alerts
  rules:
  # Either the postgres-exporter scrape fails or the exporter reports
  # pg_up == 0, for one minute.
  - alert: PostgreSQLDown
    expr: up{job="postgres-exporter"} == 0 or pg_up == 0
    for: 1m
    labels:
      severity: critical
      layer: systemd-188
      component: postgres
      host: "188"
      team: ops
      auto_repair: "false"
    annotations:
      summary: "PostgreSQL 資料庫離線"
      description: "PostgreSQL Exporter 無法連接資料庫超過 1 分鐘"
  # Either the redis-exporter scrape fails or the exporter reports
  # redis_up == 0, for one minute.
  - alert: RedisDown
    expr: up{job="redis-exporter"} == 0 or redis_up == 0
    for: 1m
    labels:
      severity: critical
      layer: systemd-188
      component: redis
      host: "188"
      team: ops
      auto_repair: "false"
    annotations:
      summary: "Redis 快取服務離線"
      description: "Redis Exporter 無法連接 Redis 超過 1 分鐘"
  # Total connections per exporter instance exceed 80 for five minutes.
  # pg_stat_activity_count is exported as one series per database/state,
  # so the series are summed first; comparing each series to 80 on its own
  # would miss a high total spread across several databases or states.
  - alert: PostgreSQLHighConnections
    expr: sum by (instance) (pg_stat_activity_count) > 80
    for: 5m
    labels:
      severity: warning
      layer: systemd-188
      component: postgres
      # Same host label as PostgreSQLDown, per the file's label convention.
      host: "188"
      team: ops
      auto_repair: "false"
    annotations:
      summary: "PostgreSQL 連接數過高"
      description: "當前連接數 {{ $value }} 超過 80"
  # Used memory exceeds 80% of the configured limit for five minutes.
  # The trailing guard skips instances where redis_memory_max_bytes is 0
  # (no maxmemory configured): there the division yields +Inf and the
  # alert would otherwise fire permanently.
  - alert: RedisMemoryHigh
    expr: >-
      redis_memory_used_bytes / redis_memory_max_bytes * 100 > 80
      and on(instance) redis_memory_max_bytes > 0
    for: 5m
    labels:
      severity: warning
      layer: systemd-188
      component: redis
      # Same host label as RedisDown, per the file's label convention.
      host: "188"
      team: ops
      auto_repair: "false"
    annotations:
      summary: "Redis 記憶體使用過高"
      description: "Redis 記憶體使用率超過 80%"
# =========================================================================
# Service availability alerts (service_alerts) — the layer label selects
# the repair path
# =========================================================================
- name: service_alerts
  rules:
  # ---- Docker layer on host 188 ----
  # 2026-04-05 Claude Code: renamed from the legacy ClawBotDown. The scrape
  # job selector is still job="clawbot".
  - alert: OpenClawDown
    expr: up{job="clawbot"} == 0
    for: 2m
    labels:
      severity: critical
      layer: docker-188
      component: openclaw
      host: "188"
      team: ops
      auto_repair: "true"
    annotations:
      summary: "OpenClaw 服務離線"
      description: "OpenClaw (192.168.0.188:8088) 已離線超過 2 分鐘"
  # The matcher is anchored to ":3301" followed by end-of-string or a path,
  # so unrelated targets that merely contain the digits (":13301",
  # ":33010", ...) no longer match.
  # NOTE(review): assumes the blackbox instance label has the form
  # [scheme://]host:port[/path] — confirm against the scrape config.
  - alert: SignOzDown
    expr: probe_success{job="blackbox-http", instance=~".*:3301(/.*)?"} == 0
    for: 2m
    labels:
      severity: warning
      layer: docker-188
      component: signoz
      host: "188"
      team: ops
      auto_repair: "true"
    annotations:
      summary: "SignOz 服務離線"
      description: "SignOz (192.168.0.188:3301) 已離線超過 2 分鐘"
  # ---- Docker layer on host 110 ----
  # TCP probe against 192.168.0.110:9000 has failed for two minutes.
  - alert: SentryDown
    expr: probe_success{job="blackbox-tcp", instance="192.168.0.110:9000"} == 0
    for: 2m
    labels:
      severity: warning
      layer: docker-110
      component: sentry
      host: "110"
      team: ops
      auto_repair: "true"
    annotations:
      summary: "Sentry 服務離線"
      description: "Sentry (192.168.0.110:9000) 已離線超過 2 分鐘"
  # Port-anchored matcher, same reasoning and assumption as SignOzDown.
  - alert: HarborDown
    expr: probe_success{job="blackbox-http", instance=~".*:5000(/.*)?"} == 0
    for: 2m
    labels:
      severity: critical
      layer: docker-110
      component: harbor
      host: "110"
      team: ops
      auto_repair: "true"
    annotations:
      summary: "Harbor Registry 離線"
      description: "Harbor (192.168.0.110:5000) 已離線超過 2 分鐘,CD pipeline 將無法拉取映像"
  # HTTP probe against http://192.168.0.110:3001 has failed for two minutes.
  - alert: GiteaDown
    expr: probe_success{job="blackbox-http", instance="http://192.168.0.110:3001"} == 0
    for: 2m
    labels:
      severity: critical
      layer: docker-110
      component: gitea
      host: "110"
      team: ops
      auto_repair: "true"
    annotations:
      summary: "Gitea Git 服務離線"
      description: "Gitea (192.168.0.110:3001) 已離線超過 2 分鐘,CD pipeline 失效"
  # TCP probe against 192.168.0.110:9093 has failed for two minutes.
  # NOTE(review): if notifications for this rule are routed through the same
  # Alertmanager that is being probed, they cannot be delivered while it is
  # down — confirm an out-of-band path exists.
  - alert: AlertmanagerDown
    expr: probe_success{job="blackbox-tcp", instance="192.168.0.110:9093"} == 0
    for: 2m
    labels:
      severity: critical
      layer: docker-110
      component: alertmanager
      host: "110"
      team: ops
      auto_repair: "true"
    annotations:
      summary: "Alertmanager 離線"
      description: "Alertmanager (192.168.0.110:9093) 已離線,所有告警將靜默"
# =========================================================================
# Alert-chain monitoring (alert_chain) — guards against a repeat of the
# 2026-03-26 / 04-05 incidents
# =========================================================================
- name: alert_chain
  rules:
  # More than 10% of webhook requests with source="alertmanager" have a
  # status other than "success" (5m rates), sustained for ten minutes.
  - alert: AlertChainBroken_Alertmanager
    for: 10m
    expr: |
      sum(rate(awoooi_webhook_requests_total{source="alertmanager",status!="success"}[5m]))
      / sum(rate(awoooi_webhook_requests_total{source="alertmanager"}[5m])) > 0.1
    labels:
      team: platform
      layer: k8s
      severity: critical
      auto_repair: "false"
    annotations:
      description: '告警鏈路可能斷裂,請執行 E2E 驗證'
      summary: 'Alertmanager Webhook 錯誤率 > 10%'
  # Same ratio as above for source="sentry", at warning severity.
  - alert: AlertChainBroken_Sentry
    for: 10m
    expr: |
      sum(rate(awoooi_webhook_requests_total{source="sentry",status!="success"}[5m]))
      / sum(rate(awoooi_webhook_requests_total{source="sentry"}[5m])) > 0.1
    labels:
      team: platform
      layer: k8s
      severity: warning
      auto_repair: "false"
    annotations:
      description: 'Sentry 錯誤可能無法正確處理'
      summary: 'Sentry Webhook 錯誤率 > 10%'
  # The newest last-success timestamp per source is older than 7200 s (2 h).
  - alert: NoAlertsReceived2Hours
    for: 5m
    expr: 'time() - max by (source)(awoooi_alert_chain_last_success_timestamp) > 7200'
    labels:
      team: platform
      layer: k8s
      severity: warning
      auto_repair: "false"
    annotations:
      description: '可能是告警鏈路問題,請執行 Smoke Test'
      summary: '2 小時內未收到任何告警 ({{ $labels.source }})'
  # The health gauge has been 0 for five minutes.
  - alert: AlertChainUnhealthy
    for: 5m
    expr: 'awoooi_alert_chain_healthy == 0'
    labels:
      team: platform
      layer: k8s
      severity: critical
      auto_repair: "false"
    annotations:
      description: '告警鏈路標記為不健康,最近處理失敗'
      summary: '告警鏈路不健康 ({{ $labels.source }})'
# =========================================================================
# Auto-repair monitoring (auto_repair)
# =========================================================================
- name: auto_repair
  rules:
  # The success-rate gauge stays below 0.3 for thirty minutes; the
  # description reports the `action` label of the offending series.
  - alert: AutoRepairLowSuccessRate
    for: 30m
    expr: 'awoooi_auto_repair_success_rate < 0.3'
    labels:
      team: backend
      layer: k8s
      severity: warning
      auto_repair: "false"
    annotations:
      description: '動作 {{ $labels.action }} 的成功率低於 30%,建議檢查 Playbook'
      summary: '自動修復成功率過低 ({{ $value | humanizePercentage }})'
  # The escalation counter for level="PERMANENT_FIX" has a non-zero 1h rate
  # for one minute.
  - alert: PermanentFixRequired
    for: 1m
    expr: 'sum(rate(awoooi_anomaly_escalation_total{level="PERMANENT_FIX"}[1h])) > 0'
    labels:
      team: backend
      layer: k8s
      severity: critical
      auto_repair: "false"
    annotations:
      description: '有異常升級到 PERMANENT_FIX 級別,需要根本修復'
      summary: '需要永久修復的異常升級'
# =========================================================================
# MinIO / Kali alerts
# =========================================================================
- name: minio_kali_alerts
  rules:
  # The matcher is anchored to ":9001" followed by end-of-string or a path,
  # so targets that merely contain the digits (":19001", ":90010", ...) no
  # longer match.
  # NOTE(review): assumes the blackbox instance label has the form
  # [scheme://]host:port[/path] — confirm against the scrape config.
  - alert: MinIODown
    expr: probe_success{job="blackbox-http", instance=~".*:9001(/.*)?"} == 0
    for: 2m
    labels:
      severity: warning
      layer: docker-188
      component: minio
      host: "188"
      team: ops
      auto_repair: "true"
    annotations:
      summary: "MinIO (Velero 備份) 離線"
      description: "MinIO (192.168.0.188:9001) 已離線超過 2 分鐘,Velero 備份可能失敗"
  # Dots in the address are written as [.] so they match a literal dot
  # rather than any character.
  # NOTE(review): layer is docker-188 while host is "112", and "112" is not
  # among the host values listed in the label convention at the top of this
  # file — confirm which is intended. Values are left unchanged here.
  - alert: KaliScannerDown
    expr: probe_success{job="blackbox-http", instance=~".*192[.]168[.]0[.]112.*"} == 0
    for: 5m
    labels:
      severity: info
      layer: docker-188
      component: kali
      host: "112"
      team: ops
      auto_repair: "false"
    annotations:
      summary: "Kali Scanner 離線"
      description: "Kali (192.168.0.112:8080) 離線,安全掃描功能暫停"