# 修正:
#   - ClawBotDown → OpenClawDown (舊命名廢棄)
#   - 加入 SentryDown/HarborDown/GiteaDown/AlertmanagerDown
#   - 所有規則補齊 layer/component/host/auto_repair 統一標籤
#   - 整合 k8s/monitoring/*.yaml → ops/monitoring/alerts-unified.yml
#
# Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
# 417 lines · 14 KiB · YAML
# ops/monitoring/alerts-unified.yml
# AWOOOI 統一 Prometheus 告警規則
# 2026-04-05 Claude Code: 整合所有規則,加入統一 layer 標籤
# 部署目標: 192.168.0.110:/home/wooo/monitoring/alerts.yml
# 部署方式: scripts/ops/deploy-alerts.sh (CD 自動部署)
#
# 標籤規範:
#   layer: k8s | docker-110 | docker-188 | systemd-188
#   component: 服務名稱
#   team: ops | backend | ai | platform
#   host: "110" | "188" | "120" | "121"
#   auto_repair: "true" | "false"

groups:

# =========================================================================
# Host-level alerts (host_alerts)
# =========================================================================
# NOTE(review): every rule in this group carries the static label
# layer: systemd-188, while the selectors are not restricted to a single
# host and the summaries template {{ $labels.host }} — confirm this is
# intended.
- name: host_alerts
  rules:
  # Fires when a node-exporter scrape target has reported up == 0 for 1m.
  - alert: HostDown
    expr: 'up{job=~"node-exporter.*"} == 0'
    for: 1m
    labels:
      severity: critical
      layer: systemd-188
      team: ops
      auto_repair: "false"
    annotations:
      summary: "主機 {{ $labels.host }} 不可達"
      description: "Node Exporter 無回應超過 1 分鐘"

  # CPU busy percentage per host label: 100 minus the average 5m idle rate.
  # Fires above 80 sustained for 5m.
  - alert: HostHighCpuLoad
    expr: '100 - (avg by(host) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80'
    for: 5m
    labels:
      severity: warning
      layer: systemd-188
      team: ops
      auto_repair: "false"
    annotations:
      summary: "主機 {{ $labels.host }} CPU 高負載"
      description: "CPU 使用率超過 80%"

  # Used-memory percentage derived from MemAvailable / MemTotal.
  # Fires above 85 sustained for 5m.
  - alert: HostOutOfMemory
    expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85'
    for: 5m
    labels:
      severity: warning
      layer: systemd-188
      team: ops
      auto_repair: "false"
    annotations:
      summary: "主機 {{ $labels.host }} 記憶體不足"
      description: "記憶體使用率超過 85%"

  # Used-space percentage per filesystem series, with tmpfs and overlay
  # filesystems excluded. Fires above 85 sustained for 5m.
  - alert: HostOutOfDiskSpace
    expr: '(1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 85'
    for: 5m
    labels:
      severity: warning
      layer: systemd-188
      team: ops
      auto_repair: "false"
    annotations:
      summary: "主機 {{ $labels.host }} 磁碟空間不足"
      description: "磁碟使用率超過 85%"

# =========================================================================
# K8s cluster alerts (kubernetes_alerts)
# =========================================================================
- name: kubernetes_alerts
  rules:
  # Fires when a node's Ready condition has not been "true" for 2m.
  - alert: K3sNodeNotReady
    expr: 'kube_node_status_condition{condition="Ready", status="true"} == 0'
    for: 2m
    labels:
      severity: critical
      layer: k8s
      team: ops
      auto_repair: "false"
    annotations:
      summary: "K3s 節點 {{ $labels.node }} 未就緒"
      description: "節點超過 2 分鐘未達到 Ready 狀態"

  # Fires when a container in awoooi-prod has a non-zero restart rate over
  # the last 15m, sustained for 5m.
  - alert: KubePodCrashLooping
    expr: 'rate(kube_pod_container_status_restarts_total{namespace="awoooi-prod"}[15m]) > 0'
    for: 5m
    labels:
      severity: warning
      layer: k8s
      team: ops
      auto_repair: "true"
    annotations:
      summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} 持續重啟"
      description: "Pod 在過去 15 分鐘內重啟次數異常"

  # Fires when a Running pod in awoooi-prod has not been Ready for 5m.
  # The readiness series is also 0 for pods that are not running at all
  # (for example Pending pods or completed Job pods), so it is intersected
  # with the Running phase. Without that filter such pods would fire this
  # alert and, with auto_repair "true", trigger a repair for a pod that
  # the description does not cover.
  - alert: KubePodNotReady
    expr: |
      kube_pod_status_ready{condition="true",namespace="awoooi-prod"} == 0
      and on(namespace, pod)
      kube_pod_status_phase{phase="Running",namespace="awoooi-prod"} == 1
    for: 5m
    labels:
      severity: warning
      layer: k8s
      team: ops
      auto_repair: "true"
    annotations:
      summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} 未就緒"
      description: "Running 中的 Pod 超過 5 分鐘未達到 Ready 狀態"

  # Fires when a Deployment's desired replica count differs from its
  # available replica count for 10m.
  - alert: KubeDeploymentReplicasMismatch
    expr: 'kube_deployment_spec_replicas{namespace="awoooi-prod"} != kube_deployment_status_replicas_available{namespace="awoooi-prod"}'
    for: 10m
    labels:
      severity: warning
      layer: k8s
      team: ops
      auto_repair: "true"
    annotations:
      summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} 副本數不匹配"
      description: "期望副本數與可用副本數不一致超過 10 分鐘"

  # Fires when the Velero failure counter increased within the last 24h.
  - alert: VeleroBackupFailed
    expr: 'increase(velero_backup_failure_total[24h]) > 0'
    for: 5m
    labels:
      severity: warning
      layer: k8s
      team: ops
      component: velero
      auto_repair: "false"
    annotations:
      summary: "Velero 備份失敗"
      description: "過去 24 小時有備份失敗"

  # Fires when the last successful backup timestamp is older than
  # 86400 seconds (24h).
  - alert: VeleroBackupNotRun
    expr: 'time() - velero_backup_last_successful_timestamp > 86400'
    for: 10m
    labels:
      severity: critical
      layer: k8s
      team: ops
      component: velero
      auto_repair: "false"
    annotations:
      summary: "Velero 超過 24 小時未成功備份"
      description: "最後一次成功備份超過 24 小時"

# =========================================================================
# Database alerts (database_alerts)
# =========================================================================
- name: database_alerts
  rules:
  # Fires when either the postgres-exporter target is down or the exporter
  # reports pg_up == 0, for 1m.
  - alert: PostgreSQLDown
    expr: 'up{job="postgres-exporter"} == 0 or pg_up == 0'
    for: 1m
    labels:
      severity: critical
      layer: systemd-188
      component: postgres
      host: "188"
      team: ops
      auto_repair: "false"
    annotations:
      summary: "PostgreSQL 資料庫離線"
      description: "PostgreSQL Exporter 無法連接資料庫超過 1 分鐘"

  # Fires when either the redis-exporter target is down or the exporter
  # reports redis_up == 0, for 1m.
  - alert: RedisDown
    expr: 'up{job="redis-exporter"} == 0 or redis_up == 0'
    for: 1m
    labels:
      severity: critical
      layer: systemd-188
      component: redis
      host: "188"
      team: ops
      auto_repair: "false"
    annotations:
      summary: "Redis 快取服務離線"
      description: "Redis Exporter 無法連接 Redis 超過 1 分鐘"

  # Fires when a pg_stat_activity_count series exceeds 80 for 5m.
  # host: "188" added for consistency with PostgreSQLDown, which carries
  # the same layer and component.
  - alert: PostgreSQLHighConnections
    expr: 'pg_stat_activity_count > 80'
    for: 5m
    labels:
      severity: warning
      layer: systemd-188
      component: postgres
      host: "188"
      team: ops
      auto_repair: "false"
    annotations:
      summary: "PostgreSQL 連接數過高"
      description: "當前連接數 {{ $value }} 超過 80"

  # Fires when used memory exceeds 80% of the configured maximum for 5m.
  # The divisor is filtered to series where the maximum is > 0: with no
  # memory limit configured the maximum is reported as 0, and dividing by
  # it yields +Inf, which would keep this alert firing permanently.
  # host: "188" added for consistency with RedisDown.
  - alert: RedisMemoryHigh
    expr: 'redis_memory_used_bytes / (redis_memory_max_bytes > 0) * 100 > 80'
    for: 5m
    labels:
      severity: warning
      layer: systemd-188
      component: redis
      host: "188"
      team: ops
      auto_repair: "false"
    annotations:
      summary: "Redis 記憶體使用過高"
      description: "Redis 記憶體使用率超過 80%"

# =========================================================================
# Service availability alerts (service_alerts)
# The layer label determines which repair path is taken.
# =========================================================================
- name: service_alerts
  rules:
  # ---- Docker layer on host 188 ----
  # 2026-04-05 Claude Code: renamed from the legacy ClawBotDown to
  # OpenClawDown. The scrape job selector still uses job="clawbot".
  - alert: OpenClawDown
    expr: 'up{job="clawbot"} == 0'
    for: 2m
    labels:
      severity: critical
      layer: docker-188
      component: openclaw
      host: "188"
      team: ops
      auto_repair: "true"
    annotations:
      summary: "OpenClaw 服務離線"
      description: "OpenClaw (192.168.0.188:8088) 已離線超過 2 分鐘"

  # HTTP probe failure on any blackbox-http instance whose name
  # contains "3301".
  - alert: SignOzDown
    expr: 'probe_success{job="blackbox-http", instance=~".*3301.*"} == 0'
    for: 2m
    labels:
      severity: warning
      layer: docker-188
      component: signoz
      host: "188"
      team: ops
      auto_repair: "true"
    annotations:
      summary: "SignOz 服務離線"
      description: "SignOz (192.168.0.188:3301) 已離線超過 2 分鐘"

  # ---- Docker layer on host 110 ----
  # TCP probe failure on 192.168.0.110:9000.
  - alert: SentryDown
    expr: 'probe_success{job="blackbox-tcp", instance="192.168.0.110:9000"} == 0'
    for: 2m
    labels:
      severity: warning
      layer: docker-110
      component: sentry
      host: "110"
      team: ops
      auto_repair: "true"
    annotations:
      summary: "Sentry 服務離線"
      description: "Sentry (192.168.0.110:9000) 已離線超過 2 分鐘"

  # HTTP probe failure on any blackbox-http instance whose name
  # contains "5000".
  - alert: HarborDown
    expr: 'probe_success{job="blackbox-http", instance=~".*5000.*"} == 0'
    for: 2m
    labels:
      severity: critical
      layer: docker-110
      component: harbor
      host: "110"
      team: ops
      auto_repair: "true"
    annotations:
      summary: "Harbor Registry 離線"
      description: "Harbor (192.168.0.110:5000) 已離線超過 2 分鐘,CD pipeline 將無法拉取映像"

  # HTTP probe failure on the exact instance http://192.168.0.110:3001.
  - alert: GiteaDown
    expr: 'probe_success{job="blackbox-http", instance="http://192.168.0.110:3001"} == 0'
    for: 2m
    labels:
      severity: critical
      layer: docker-110
      component: gitea
      host: "110"
      team: ops
      auto_repair: "true"
    annotations:
      summary: "Gitea Git 服務離線"
      description: "Gitea (192.168.0.110:3001) 已離線超過 2 分鐘,CD pipeline 失效"

  # TCP probe failure on 192.168.0.110:9093.
  - alert: AlertmanagerDown
    expr: 'probe_success{job="blackbox-tcp", instance="192.168.0.110:9093"} == 0'
    for: 2m
    labels:
      severity: critical
      layer: docker-110
      component: alertmanager
      host: "110"
      team: ops
      auto_repair: "true"
    annotations:
      summary: "Alertmanager 離線"
      description: "Alertmanager (192.168.0.110:9093) 已離線,所有告警將靜默"

# =========================================================================
# Alert-chain monitoring (alert_chain)
# Guards against a repeat of the 2026-03-26 / 04-05 incidents.
# =========================================================================
- name: alert_chain
  rules:
  # Share of non-success webhook requests from source "alertmanager" over
  # 5m. Fires above 0.1 (10%) sustained for 10m.
  - alert: AlertChainBroken_Alertmanager
    expr: |
      sum(rate(awoooi_webhook_requests_total{source="alertmanager",status!="success"}[5m]))
      / sum(rate(awoooi_webhook_requests_total{source="alertmanager"}[5m])) > 0.1
    for: 10m
    labels:
      severity: critical
      layer: k8s
      team: platform
      auto_repair: "false"
    annotations:
      summary: "Alertmanager Webhook 錯誤率 > 10%"
      description: "告警鏈路可能斷裂,請執行 E2E 驗證"

  # Same ratio for source "sentry", at warning severity.
  - alert: AlertChainBroken_Sentry
    expr: |
      sum(rate(awoooi_webhook_requests_total{source="sentry",status!="success"}[5m]))
      / sum(rate(awoooi_webhook_requests_total{source="sentry"}[5m])) > 0.1
    for: 10m
    labels:
      severity: warning
      layer: k8s
      team: platform
      auto_repair: "false"
    annotations:
      summary: "Sentry Webhook 錯誤率 > 10%"
      description: "Sentry 錯誤可能無法正確處理"

  # Per source: fires when the newest last-success timestamp is more than
  # 7200 seconds (2h) old.
  - alert: NoAlertsReceived2Hours
    expr: 'time() - max by (source)(awoooi_alert_chain_last_success_timestamp) > 7200'
    for: 5m
    labels:
      severity: warning
      layer: k8s
      team: platform
      auto_repair: "false"
    annotations:
      summary: "2 小時內未收到任何告警 ({{ $labels.source }})"
      description: "可能是告警鏈路問題,請執行 Smoke Test"

  # Fires when the health gauge has been 0 for 5m.
  - alert: AlertChainUnhealthy
    expr: 'awoooi_alert_chain_healthy == 0'
    for: 5m
    labels:
      severity: critical
      layer: k8s
      team: platform
      auto_repair: "false"
    annotations:
      summary: "告警鏈路不健康 ({{ $labels.source }})"
      description: "告警鏈路標記為不健康,最近處理失敗"

# =========================================================================
# Auto-repair monitoring (auto_repair)
# =========================================================================
- name: auto_repair
  rules:
  # Fires when the success-rate gauge stays below 0.3 for 30m. The summary
  # renders the value as a percentage, and the description names the
  # action label.
  - alert: AutoRepairLowSuccessRate
    expr: 'awoooi_auto_repair_success_rate < 0.3'
    for: 30m
    labels:
      severity: warning
      layer: k8s
      team: backend
      auto_repair: "false"
    annotations:
      summary: "自動修復成功率過低 ({{ $value | humanizePercentage }})"
      description: "動作 {{ $labels.action }} 的成功率低於 30%,建議檢查 Playbook"

  # Fires when any escalation with level="PERMANENT_FIX" was counted
  # within the last hour.
  - alert: PermanentFixRequired
    expr: 'sum(rate(awoooi_anomaly_escalation_total{level="PERMANENT_FIX"}[1h])) > 0'
    for: 1m
    labels:
      severity: critical
      layer: k8s
      team: backend
      auto_repair: "false"
    annotations:
      summary: "需要永久修復的異常升級"
      description: "有異常升級到 PERMANENT_FIX 級別,需要根本修復"

# =========================================================================
# MinIO / Kali alerts
# =========================================================================
- name: minio_kali_alerts
  rules:
  # HTTP probe failure on any blackbox-http instance whose name
  # contains "9001".
  - alert: MinIODown
    expr: 'probe_success{job="blackbox-http", instance=~".*9001.*"} == 0'
    for: 2m
    labels:
      severity: warning
      layer: docker-188
      component: minio
      host: "188"
      team: ops
      auto_repair: "true"
    annotations:
      summary: "MinIO (Velero 備份) 離線"
      description: "MinIO (192.168.0.188:9001) 已離線超過 2 分鐘,Velero 備份可能失敗"

  # HTTP probe failure on any blackbox-http instance matching the
  # 192.168.0.112 pattern (the dots are regex wildcards, not literals).
  # NOTE(review): layer is docker-188 while host is "112" — confirm which
  # layer value this host should carry.
  - alert: KaliScannerDown
    expr: 'probe_success{job="blackbox-http", instance=~".*192.168.0.112.*"} == 0'
    for: 5m
    labels:
      severity: info
      layer: docker-188
      component: kali
      host: "112"
      team: ops
      auto_repair: "false"
    annotations:
      summary: "Kali Scanner 離線"
      description: "Kali (192.168.0.112:8080) 離線,安全掃描功能暫停"