fix(monitoring): 補回 4 個僅存於主機的 Prometheus 規則群組

deploy-alerts.sh 部署時覆寫了這 4 個從未進 repo 的群組：
- awoooi_flywheel_health (5 條：Playbook/Success/Vectorization/NullRate/Stuck)
- awoooi_backup_restore (2 條：RestoreTestFailed/TestStale)
- awoooi_infrastructure_detailed (3 條：Container/RedisStream/DiskGrowth)
- awoooi_host_connectivity (1 條：NetworkPartition)

從 /home/wooo/monitoring/alerts.yml.bak_20260412_183835 還原。
PromQL 的 offset 修飾符已修正為套用在各個 selector 上，而非整個表達式。

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-12 19:14:39 +08:00
parent 5fe049de55
commit edb97fd29b
1 changed files with 185 additions and 0 deletions
--- a/ops/monitoring/alerts-unified.yml
+++ b/ops/monitoring/alerts-unified.yml
@@ -2,6 +2,7 @@
 # AWOOOI 統一 Prometheus 告警規則
 # 2026-04-05 Claude Code: 整合所有規則，加入統一 layer 標籤
 # 2026-04-08 Claude Sonnet 4.6: 補 database_detail_alerts 群組 (6條詳細規則)
+# 2026-04-12 Claude Sonnet 4.6: 補回 4 個僅存在主機的群組 (backup/flywheel/connectivity/infra-detailed)
 # 部署目標: 192.168.0.110:/home/wooo/monitoring/alerts.yml
 # 部署方式: scripts/ops/deploy-alerts.sh (CD 自動部署)
 #
@@ -762,3 +763,187 @@ groups:
        annotations:
          summary: "{{ $value }} 筆 Incident 卡在 INVESTIGATING 超過 24h"
          description: "飛輪推理匹配節點可能堵塞，需人工清理或重新觸發診斷"
+
+  # =========================================================================
+  # Flywheel health detail alerts (awoooi_flywheel_health) — restored from the host copy 2026-04-12
+  # =========================================================================
+  - name: awoooi_flywheel_health  # NOTE(review): no `layer` label on these rules, unlike the infra groups — confirm
+    interval: 5m  # rules in this group are evaluated every 5 minutes
+    rules:
+      - alert: FlywheelPlaybookZero  # playbook-count gauge has been exactly 0 for 1h
+        expr: awoooi_flywheel_playbook_count == 0
+        for: 1h
+        labels:
+          severity: critical
+          alert_category: flywheel_health
+          notification_type: TYPE-8M
+          auto_repair: "false"  # string label value, not a YAML boolean
+        annotations:
+          summary: "飛輪 Playbook 數量為 0"
+          description: "Playbook 數量持續 1 小時為 0，飛輪學習節點完全失效。"
+          runbook: "執行 scripts/cold_start_playbooks.py 冷啟動"
+
+      - alert: FlywheelExecutionSuccessLow  # success rate below 0.1 (the 10% in the summary) for 2h
+        expr: awoooi_flywheel_execution_success_rate < 0.1  # presumably a 0-1 ratio — TODO confirm
+        for: 2h
+        labels:
+          severity: warning
+          alert_category: flywheel_health
+          notification_type: TYPE-8M
+          auto_repair: "false"
+        annotations:
+          summary: "飛輪自動修復成功率低於 10%"
+          description: "執行成功率 {{ $value | humanizePercentage }}，低於健康基線 10%。"
+          runbook: "檢查 decision_manager 日誌，確認 target 解析和 SSH MCP 狀態"
+
+      - alert: FlywheelKMVectorizationLow  # despite "Low" in the name, fires when the unvectorized count exceeds 10
+        expr: awoooi_flywheel_km_unvectorized_count > 10
+        for: 30m
+        labels:
+          severity: warning
+          alert_category: flywheel_health
+          notification_type: TYPE-8M
+          auto_repair: "false"
+        annotations:
+          summary: "飛輪 KM 未向量化數量 > 10"
+          description: "{{ $value }} 筆 KM 條目尚未向量化，RAG 查詢品質下降。"
+          runbook: "執行 scripts/batch_vectorize_km.py 或檢查每日 CronJob 狀態"
+
+      - alert: FlywheelAlertnameNullHigh  # NULL-alertname rate above 0.05 (the 5% in the summary) for 30m
+        expr: awoooi_flywheel_alertname_null_rate > 0.05
+        for: 30m
+        labels:
+          severity: warning
+          alert_category: flywheel_health
+          notification_type: TYPE-8M
+          auto_repair: "false"
+        annotations:
+          summary: "飛輪 alertname NULL 率超過 5%"
+          description: "alertname NULL 率 {{ $value | humanizePercentage }}，影響路由準確性。"
+          runbook: "執行 scripts/backfill_alertname.py 回填"
+
+      - alert: FlywheelIncidentsStuck  # more than 5 stuck incidents for 10m; only rule here without a runbook
+        expr: awoooi_flywheel_incidents_stuck > 5  # NOTE(review): the 24h window is not in this expr — presumably in the metric
+        for: 10m
+        labels:
+          severity: warning
+          alert_category: flywheel_health
+          notification_type: TYPE-8M
+          auto_repair: "false"
+        annotations:  # NOTE(review): the rule right above this group has a near-identical summary — check for a duplicate
+          summary: "{{ $value }} 筆 Incident 卡在 INVESTIGATING 超過 24 小時"
+          description: "大量 Incident 未推進，可能是決策引擎或 Telegram 通知阻塞。"
+
+  # =========================================================================
+  # Backup restore alerts (awoooi_backup_restore) — restored from the host copy 2026-04-12
+  # =========================================================================
+  - name: awoooi_backup_restore
+    interval: 1h  # evaluated hourly: a pending alert can only turn firing at the next run, so `for` < 1h waits ~1h
+    rules:
+      - alert: BackupRestoreTestFailed  # success gauge equals 0; an absent metric yields no series and no alert
+        expr: awoooi_backup_restore_test_success == 0
+        for: 5m
+        labels:
+          severity: critical
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"  # string label value, not a YAML boolean
+        annotations:
+          summary: "備份還原 dry-run 測試失敗"
+          description: "Velero restore dry-run 失敗，備份可能無法還原。立即人工驗證備份狀態。"
+          runbook: "執行 velero backup describe awoooi-daily 及 velero restore create --from-backup awoooi-daily --dry-run"
+
+      - alert: BackupRestoreTestStale  # age of the last test timestamp, in seconds, exceeds 691200 s = 8 days
+        expr: (time() - awoooi_backup_restore_test_timestamp_seconds) > 691200
+        for: 10m  # NOTE(review): if the timestamp metric is missing entirely this rule stays silent — confirm acceptable
+        labels:
+          severity: warning
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "備份還原測試超過 8 天未執行"
+          description: "上次備份測試距今 {{ $value | humanizeDuration }}，週排程 CronJob 可能失效。"
+          runbook: "檢查 velero namespace 中的 backup-restore-test CronJob 狀態"
+
+  # =========================================================================
+  # Detailed infrastructure alerts (awoooi_infrastructure_detailed) — restored from the host copy 2026-04-12
+  # =========================================================================
+  - name: awoooi_infrastructure_detailed
+    interval: 60s
+    rules:
+      # Fires when a named container on the 188 host has container_last_seen older
+      # than 120s for 5m. The old first branch, `count by (name, instance)
+      # (container_tasks_state{state="running", ...}) == 0`, was dropped as dead
+      # code: count() only emits groups that hold >= 1 series, so it is never 0.
+      # NOTE(review): add an explicit liveness check if "not running" must alert.
+      - alert: DockerContainerUnhealthyDetailed
+        expr: container_last_seen{instance=~"192.168.0.188.*", name!=""} < (time() - 120)
+        for: 5m
+        labels:
+          severity: warning
+          layer: docker-188
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "188 主機容器 {{ $labels.name }} 異常"
+          description: "容器 {{ $labels.name }} 在 {{ $labels.instance }} 已超過 2 分鐘無活動。"
+          runbook: "SSH 到 192.168.0.188：docker inspect {{ $labels.name }} 確認健康狀態"
+
+      - alert: RedisStreamBacklogHigh  # stream length above 500 entries for 10m
+        expr: awoooi_redis_stream_len > 500
+        for: 10m
+        labels:
+          severity: warning
+          layer: docker-188
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:  # summary assumes the metric carries a `stream` label — TODO confirm
+          summary: "Redis Stream {{ $labels.stream }} 積壓 {{ $value }} 筆"
+          description: "Redis Stream 積壓超過 500 筆，飛輪消費者可能阻塞。"
+          runbook: "檢查 consumer group lag：XINFO GROUPS <stream-key>"
+
+      - alert: PostgreSQLDiskGrowthRate  # despite the name, measures used bytes on "/" of 188: now minus 1h ago
+        expr: |  # offset follows each selector (PromQL does not accept it on a parenthesised expression)
+          (
+            node_filesystem_size_bytes{instance="192.168.0.188:9100", mountpoint="/"}
+            - node_filesystem_avail_bytes{instance="192.168.0.188:9100", mountpoint="/"}
+          )
+          - (
+            node_filesystem_size_bytes{instance="192.168.0.188:9100", mountpoint="/"} offset 1h
+            - node_filesystem_avail_bytes{instance="192.168.0.188:9100", mountpoint="/"} offset 1h
+          )
+          > 524288000
+        for: 5m  # threshold above is 524288000 B = 500 MiB of growth
+        labels:
+          severity: warning
+          layer: docker-188
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "188 主機磁碟 1 小時增長超過 500MB"
+          description: "磁碟在過去 1 小時增長 {{ $value | humanize1024 }}B，可能是 PostgreSQL WAL 或日誌暴增。"
+          runbook: "SSH 188：df -h / && du -sh /var/lib/postgresql/*/pg_wal"
+
+  # =========================================================================
+  # Host connectivity alerts (awoooi_host_connectivity) — restored from the host copy 2026-04-12
+  # =========================================================================
+  - name: awoooi_host_connectivity
+    interval: 60s  # rules in this group are evaluated every 60 seconds
+    rules:
+      - alert: HostNetworkPartition  # any target of job "host-connectivity" whose probe_success is 0 for 5m
+        expr: probe_success{job="host-connectivity"} == 0
+        for: 5m
+        labels:
+          severity: critical
+          layer: systemd-188  # NOTE(review): fixed to 188 although the expr matches every instance of the job — confirm
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"  # string label value, not a YAML boolean
+        annotations:  # NOTE(review): "TCP probe" below depends on the probe module configured for this job — not visible here
+          summary: "主機 {{ $labels.instance }} 無法連通"
+          description: "TCP probe 到 {{ $labels.instance }} 失敗超過 5 分鐘，可能發生網路分區。"
+          runbook: "SSH 檢查路由和防火牆規則"