From edb97fd29bc7ac020b48f04c9e279b13905f7c7f Mon Sep 17 00:00:00 2001
From: OG T
Date: Sun, 12 Apr 2026 19:14:39 +0800
Subject: [PATCH] =?UTF-8?q?fix(monitoring):=20=E8=A3=9C=E5=9B=9E=204=20?=
 =?UTF-8?q?=E5=80=8B=E5=83=85=E5=AD=98=E6=96=BC=E4=B8=BB=E6=A9=9F=E7=9A=84?=
 =?UTF-8?q?=20Prometheus=20=E8=A6=8F=E5=89=87=E7=BE=A4=E7=B5=84?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

deploy-alerts.sh 部署時覆寫了這 4 個從未進 repo 的群組:
- awoooi_flywheel_health (5條:Playbook/Success/Vectorization/NullRate/Stuck)
- awoooi_backup_restore (2條:RestoreTestFailed/TestStale)
- awoooi_infrastructure_detailed (3條:Container/RedisStream/DiskGrowth)
- awoooi_host_connectivity (1條:NetworkPartition)

從 /home/wooo/monitoring/alerts.yml.bak_20260412_183835 還原。
offset PromQL 已修正為各個 selector 上,而非整個表達式。

Co-Authored-By: Claude Sonnet 4.6
---
 ops/monitoring/alerts-unified.yml | 185 ++++++++++++++++++++++++++++++
 1 file changed, 185 insertions(+)

diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml
index 6551d941..9e5e7326 100644
--- a/ops/monitoring/alerts-unified.yml
+++ b/ops/monitoring/alerts-unified.yml
@@ -2,6 +2,7 @@
 # AWOOOI 統一 Prometheus 告警規則
 # 2026-04-05 Claude Code: 整合所有規則,加入統一 layer 標籤
 # 2026-04-08 Claude Sonnet 4.6: 補 database_detail_alerts 群組 (6條詳細規則)
+# 2026-04-12 Claude Sonnet 4.6: restore 4 rule groups that existed only on the host (backup/flywheel/connectivity/infra-detailed)
 # 部署目標: 192.168.0.110:/home/wooo/monitoring/alerts.yml
 # 部署方式: scripts/ops/deploy-alerts.sh (CD 自動部署)
 #
@@ -762,3 +763,187 @@ groups:
         annotations:
           summary: "{{ $value }} 筆 Incident 卡在 INVESTIGATING 超過 24h"
           description: "飛輪推理匹配節點可能堵塞,需人工清理或重新觸發診斷"
+
+  # =========================================================================
+  # Flywheel health detailed alerts (awoooi_flywheel_health) — restored from the host 2026-04-12
+  # =========================================================================
+  - name: awoooi_flywheel_health
+    interval: 5m
+    rules:
+      - alert: FlywheelPlaybookZero
+        expr: awoooi_flywheel_playbook_count == 0
+        for: 1h
+        labels:
+          severity: critical
+          alert_category: flywheel_health
+          notification_type: TYPE-8M
+          auto_repair: "false"
+        annotations:
+          summary: "飛輪 Playbook 數量為 0"
+          description: "Playbook 數量持續 1 小時為 0,飛輪學習節點完全失效。"
+          runbook: "執行 scripts/cold_start_playbooks.py 冷啟動"
+
+      - alert: FlywheelExecutionSuccessLow
+        expr: awoooi_flywheel_execution_success_rate < 0.1
+        for: 2h
+        labels:
+          severity: warning
+          alert_category: flywheel_health
+          notification_type: TYPE-8M
+          auto_repair: "false"
+        annotations:
+          summary: "飛輪自動修復成功率低於 10%"
+          description: "執行成功率 {{ $value | humanizePercentage }},低於健康基線 10%。"
+          runbook: "檢查 decision_manager 日誌,確認 target 解析和 SSH MCP 狀態"
+
+      - alert: FlywheelKMVectorizationLow
+        expr: awoooi_flywheel_km_unvectorized_count > 10
+        for: 30m
+        labels:
+          severity: warning
+          alert_category: flywheel_health
+          notification_type: TYPE-8M
+          auto_repair: "false"
+        annotations:
+          summary: "飛輪 KM 未向量化數量 > 10"
+          description: "{{ $value }} 筆 KM 條目尚未向量化,RAG 查詢品質下降。"
+          runbook: "執行 scripts/batch_vectorize_km.py 或檢查每日 CronJob 狀態"
+
+      - alert: FlywheelAlertnameNullHigh
+        expr: awoooi_flywheel_alertname_null_rate > 0.05
+        for: 30m
+        labels:
+          severity: warning
+          alert_category: flywheel_health
+          notification_type: TYPE-8M
+          auto_repair: "false"
+        annotations:
+          summary: "飛輪 alertname NULL 率超過 5%"
+          description: "alertname NULL 率 {{ $value | humanizePercentage }},影響路由準確性。"
+          runbook: "執行 scripts/backfill_alertname.py 回填"
+
+      - alert: FlywheelIncidentsStuck
+        expr: awoooi_flywheel_incidents_stuck > 5
+        for: 10m
+        labels:
+          severity: warning
+          alert_category: flywheel_health
+          notification_type: TYPE-8M
+          auto_repair: "false"
+        annotations:
+          summary: "{{ $value }} 筆 Incident 卡在 INVESTIGATING 超過 24 小時"
+          description: "大量 Incident 未推進,可能是決策引擎或 Telegram 通知阻塞。"
+
+  # =========================================================================
+  # Backup restore alerts (awoooi_backup_restore) — restored from the host 2026-04-12
+  # =========================================================================
+  - name: awoooi_backup_restore
+    interval: 1h  # NOTE(review): rules are evaluated hourly, so a "for" shorter than 1h can only fire at the next evaluation - confirm this delay is acceptable for a critical alert
+    rules:
+      - alert: BackupRestoreTestFailed
+        expr: awoooi_backup_restore_test_success == 0
+        for: 5m
+        labels:
+          severity: critical
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "備份還原 dry-run 測試失敗"
+          description: "Velero restore dry-run 失敗,備份可能無法還原。立即人工驗證備份狀態。"
+          runbook: "執行 velero backup describe awoooi-daily 及 velero restore create --from-backup awoooi-daily --dry-run"
+
+      - alert: BackupRestoreTestStale
+        expr: (time() - awoooi_backup_restore_test_timestamp_seconds) > 691200
+        for: 10m
+        labels:
+          severity: warning
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "備份還原測試超過 8 天未執行"
+          description: "上次備份測試距今 {{ $value | humanizeDuration }},週排程 CronJob 可能失效。"
+          runbook: "檢查 velero namespace 中的 backup-restore-test CronJob 狀態"
+
+  # =========================================================================
+  # Infrastructure detailed alerts (awoooi_infrastructure_detailed) — restored from the host 2026-04-12
+  # =========================================================================
+  - name: awoooi_infrastructure_detailed
+    interval: 60s
+    rules:
+      - alert: DockerContainerUnhealthyDetailed
+        expr: |  # NOTE(review): count() over an empty selection yields no sample rather than 0, so the first branch looks unable to fire - confirm whether sum() or absent() was intended
+          count by (name, instance) (
+            container_tasks_state{state="running", instance=~"192\\.168\\.0\\.188.*"}
+          ) == 0
+          or
+          container_last_seen{instance=~"192\\.168\\.0\\.188.*", name!=""} < (time() - 120)
+        for: 5m
+        labels:
+          severity: warning
+          layer: docker-188
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "188 主機容器 {{ $labels.name }} 異常"
+          description: "容器 {{ $labels.name }} 在 {{ $labels.instance }} 已超過 2 分鐘無活動或不在 running 狀態。"
+          runbook: "SSH 到 192.168.0.188:docker inspect {{ $labels.name }} 確認健康狀態"
+
+      - alert: RedisStreamBacklogHigh
+        expr: awoooi_redis_stream_len > 500
+        for: 10m
+        labels:
+          severity: warning
+          layer: docker-188
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "Redis Stream {{ $labels.stream }} 積壓 {{ $value }} 筆"
+          description: "Redis Stream 積壓超過 500 筆,飛輪消費者可能阻塞。"
+          runbook: "檢查 consumer group lag:XINFO GROUPS {{ $labels.stream }}"
+
+      - alert: PostgreSQLDiskGrowthRate
+        expr: |
+          (
+            node_filesystem_size_bytes{instance="192.168.0.188:9100", mountpoint="/"}
+            - node_filesystem_avail_bytes{instance="192.168.0.188:9100", mountpoint="/"}
+          )
+          - (
+            node_filesystem_size_bytes{instance="192.168.0.188:9100", mountpoint="/"} offset 1h
+            - node_filesystem_avail_bytes{instance="192.168.0.188:9100", mountpoint="/"} offset 1h
+          )
+          > 524288000
+        for: 5m
+        labels:
+          severity: warning
+          layer: docker-188
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "188 主機磁碟 1 小時增長超過 500MB"
+          description: "磁碟在過去 1 小時增長 {{ $value | humanize1024 }}B,可能是 PostgreSQL WAL 或日誌暴增。"
+          runbook: "SSH 188:df -h / && du -sh /var/lib/postgresql/*/pg_wal"
+
+  # =========================================================================
+  # Host connectivity alerts (awoooi_host_connectivity) — restored from the host 2026-04-12
+  # =========================================================================
+  - name: awoooi_host_connectivity
+    interval: 60s
+    rules:
+      - alert: HostNetworkPartition
+        expr: probe_success{job="host-connectivity"} == 0
+        for: 5m
+        labels:
+          severity: critical
+          layer: systemd-188
+          alert_category: infrastructure
+          notification_type: TYPE-3
+          auto_repair: "false"
+        annotations:
+          summary: "主機 {{ $labels.instance }} 無法連通"
+          description: "TCP probe 到 {{ $labels.instance }} 失敗超過 5 分鐘,可能發生網路分區。"
+          runbook: "SSH 檢查路由和防火牆規則"