# awoooi/k8s/monitoring/flywheel-alerts.yaml

# =============================================================================
# 🔴 [已封存 / DEPRECATED] 請勿使用此檔案
# =============================================================================
# 2026-04-14 Claude Sonnet 4.6（Backlog 清剿）:
# - 本檔為 PrometheusRule CRD 格式，需 Prometheus Operator 才能載入
# - 但 AWOOOI 的 Prometheus 是 Docker 部署（188），無 Operator
# - 11 條規則已全數遷入 ops/monitoring/alerts-unified.yml（group: awoooi_flywheel_meta_alerts）
# - 本檔保留僅作歷史參考，請勿 kubectl apply
#
# 權威來源：ops/monitoring/alerts-unified.yml
# =============================================================================
# 歷史資訊：飛輪健康度告警規則 — ADR-074 M1
# 數據來源：/api/v1/stats/flywheel/metrics（awoooi-flywheel scrape job）
# 建立：2026-04-12 ogt (ADR-074 M1)
# 封存：2026-04-14 Claude Sonnet 4.6（11/11 規則已遷入 alerts-unified.yml）
# =============================================================================

# PrometheusRule custom resource (monitoring.coreos.com/v1). Per the archive
# notice at the top of this file, it is kept for historical reference only and
# must not be applied to a cluster.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: flywheel-alerts
  namespace: monitoring
  labels:
    # NOTE(review): presumably these labels are what a Prometheus Operator
    # ruleSelector would match on — confirm against the Prometheus CR.
    release: prometheus
    app: prometheus
spec:
  groups:
    # Flywheel health rules; this group is evaluated every 5 minutes.
    - name: awoooi_flywheel_health
      interval: 5m
      rules:

        # P0: no playbooks at all → the flywheel's learning stage is not working.
        # Fires (critical) once awoooi_flywheel_playbook_count has been exactly 0
        # for 1 hour. If the series does not exist the comparison returns nothing,
        # so this rule stays silent when the metric itself disappears.
        - alert: FlywheelPlaybookZero
          expr: awoooi_flywheel_playbook_count == 0
          for: 1h
          labels:
            severity: critical
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "飛輪 Playbook 數量為 0"
            description: "Playbook 數量持續 1 小時為 0，飛輪學習節點完全失效。"
            runbook: "執行 scripts/cold_start_playbooks.py 冷啟動"

        # P0: execution success rate is very low (data is present but under the threshold).
        # 2026-05-03 ogt + Claude Opus 4.7 (APAC) — anti-silencing: paired alert added.
        # Per the original note, the newer flywheel_stats_service emits NaN when it
        # has too few samples. An ordering comparison against NaN is always false,
        # so `< 0.1` filters NaN samples out (a plain NaN sample is not the same
        # thing as Prometheus' staleness marker).
        # Hence this rule fires only when a real value is below 0.1 and is never
        # triggered by NaN; a genuinely broken data pipeline is covered by the
        # FlywheelExecutionRateMissing rule.
        # NOTE(review): labelled P0 here, but severity is `warning` — confirm intent.
        - alert: FlywheelExecutionSuccessLow
          expr: awoooi_flywheel_execution_success_rate < 0.1
          for: 2h
          labels:
            severity: warning
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "飛輪自動修復成功率低於 10%"
            description: "執行成功率 {{ $value | humanizePercentage }}，低於健康基線 10%。"
            runbook: "檢查 decision_manager 日誌，確認 target 解析和 SSH MCP 狀態"

        # P0: the execution-rate data pipeline has stalled (NaN sentinel, or no
        # series at all, for 30 minutes).
        # 2026-05-03 ogt + Claude Opus 4.7 (APAC) — feedback_silencing_alerts_recurring_violation
        # Companion to FlywheelExecutionSuccessLow: when the metric stays NaN (the
        # "insufficient samples" sentinel) for 30 minutes straight, data that should
        # be arriving is not. Per the original note, watchdog W-3b also covers this
        # situation, giving a second layer of protection.
        # How the expression works:
        #   - absent(m) returns a result only when no series for m exists;
        #   - m != m keeps only NaN samples, because NaN is the one value that is
        #     not equal to itself.
        # NOTE(review): labelled P0 here, but severity is `warning` — confirm intent.
        - alert: FlywheelExecutionRateMissing
          expr: absent(awoooi_flywheel_execution_success_rate) or (awoooi_flywheel_execution_success_rate != awoooi_flywheel_execution_success_rate)
          for: 30m
          labels:
            severity: warning
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "飛輪執行率指標連續 30 分鐘無資料"
            description: "execution_success_rate 連續 30 分鐘為 NaN 或不存在，代表 Redis playbook 統計斷流（資料管線壞 / Redis flush / FlywheelStatsService 異常）。"
            runbook: "1) 檢查 Redis playbook:* keys 是否存在 2) 檢查 FlywheelStatsService 日誌 3) /metrics endpoint 直接拉看 NaN 來源"

        # P0: many KM entries are not vectorized → RAG cannot draw on historical cases.
        # Fires when awoooi_flywheel_km_unvectorized_count stays above 10 for 30
        # minutes. Despite the "Low" in the alert name, the condition is a HIGH
        # count of unvectorized entries.
        # NOTE(review): labelled P0 here, but severity is `warning` — confirm intent.
        - alert: FlywheelKMVectorizationLow
          expr: awoooi_flywheel_km_unvectorized_count > 10
          for: 30m
          labels:
            severity: warning
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "飛輪 KM 未向量化數量 > 10"
            description: "{{ $value }} 筆 KM 條目尚未向量化，RAG 查詢品質下降。"
            runbook: "執行 scripts/batch_vectorize_km.py 或檢查每日 CronJob 狀態"

        # P1: abnormal alertname NULL rate.
        # Fires when awoooi_flywheel_alertname_null_rate stays above 0.05 for 30
        # minutes. The threshold and the humanizePercentage template both treat the
        # metric as a 0–1 ratio (0.05 is rendered as 5%).
        - alert: FlywheelAlertnameNullHigh
          expr: awoooi_flywheel_alertname_null_rate > 0.05
          for: 30m
          labels:
            severity: warning
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "飛輪 alertname NULL 率超過 5%"
            description: "alertname NULL 率 {{ $value | humanizePercentage }}，影響路由準確性。"
            runbook: "執行 scripts/backfill_alertname.py 回填"

        # P1: incidents stuck for more than 24 hours.
        # Fires when awoooi_flywheel_incidents_stuck stays above 5 for 10 minutes.
        # The expression only compares the gauge with 5; the 24-hour criterion is
        # presumably applied by whatever exports the metric — verify at the source.
        # NOTE(review): unlike the other rules in this group, this one has no
        # `runbook` annotation.
        - alert: FlywheelIncidentsStuck
          expr: awoooi_flywheel_incidents_stuck > 5
          for: 10m
          labels:
            severity: warning
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "{{ $value }} 筆 Incident 卡在 INVESTIGATING 超過 24 小時"
            description: "大量 Incident 未推進，可能是決策引擎或 Telegram 通知阻塞。"

    # Backup/restore verification rules; this group is evaluated once per hour.
    # A pending alert can only turn firing on a later evaluation, so `for:` values
    # shorter than the interval effectively mean "still true one hour later".
    - name: awoooi_backup_restore
      interval: 1h
      rules:

        # P0: the backup-restore dry-run failed.
        # Fires (critical) when awoooi_backup_restore_test_success is 0 and remains
        # so for the `for:` duration. If the series does not exist the comparison
        # returns nothing, so a test that never reports does not trigger this rule.
        - alert: BackupRestoreTestFailed
          expr: awoooi_backup_restore_test_success == 0
          for: 5m
          labels:
            severity: critical
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "備份還原 dry-run 測試失敗"
            description: "Velero restore dry-run 失敗，備份可能無法還原。立即人工驗證備份狀態。"
            runbook: "執行 velero backup describe awoooi-daily 及 velero restore create --from-backup awoooi-daily --dry-run"

        # P1: the backup-restore test has not run for more than 8 days (the weekly
        # schedule has stopped working).
        # 691200 seconds = 8 * 86400, i.e. 8 days. $value is the age in seconds of
        # the last recorded test, rendered with humanizeDuration.
        # If the timestamp series does not exist the expression returns nothing, so
        # a test that has never run does not trigger this rule.
        - alert: BackupRestoreTestStale
          expr: (time() - awoooi_backup_restore_test_timestamp_seconds) > 691200
          for: 10m
          labels:
            severity: warning
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "備份還原測試超過 8 天未執行"
            description: "上次備份測試距今 {{ $value | humanizeDuration }}，週排程 CronJob 可能失效。"
            runbook: "檢查 velero namespace 中的 backup-restore-test CronJob 狀態"

    # Detailed infrastructure rules for host 192.168.0.188 and Redis streams;
    # this group is evaluated every 60 seconds.
    - name: awoooi_infrastructure_detailed
      interval: 60s
      rules:

        # P1: a Docker container on host 188 is unhealthy (docker inspect health=unhealthy).
        # Per the original note, node-exporter + cAdvisor expose container_last_seen /
        # container_tasks_state.
        # NOTE(review): neither metric used here reports Docker's health-check
        # status; the rule approximates "unhealthy" as "not seen for more than
        # 120 seconds" — confirm this is the intended meaning.
        # NOTE(review): the first branch looks unreachable. count() yields a value
        # >= 1 for every group that has series and nothing for groups that have
        # none, so `count by (...) (...) == 0` never returns a sample. In practice
        # only the container_last_seen branch can fire.
        - alert: DockerContainerUnhealthyDetailed
          expr: |
            count by (name, instance) (
              container_tasks_state{state="running", instance=~"192.168.0.188.*"}
            ) == 0
            or
            container_last_seen{instance=~"192.168.0.188.*", name!=""} < (time() - 120)
          for: 5m
          labels:
            severity: warning
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "188 主機容器 {{ $labels.name }} 異常"
            description: "容器 {{ $labels.name }} 在 {{ $labels.instance }} 已超過 2 分鐘無活動或不在 running 狀態。"
            runbook: "SSH 到 192.168.0.188：docker inspect {{ $labels.name }} 確認健康狀態"

        # P1: Redis Streams backlog too high (alert stream or incident stream).
        # Fires when awoooi_redis_stream_len stays above 500 for 10 minutes. The
        # summary template reads a `stream` label from the series.
        - alert: RedisStreamBacklogHigh
          expr: awoooi_redis_stream_len > 500
          for: 10m
          labels:
            severity: warning
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "Redis Stream {{ $labels.stream }} 積壓 {{ $value }} 筆"
            description: "Redis Stream 積壓超過 500 筆，飛輪消費者可能阻塞。"
            runbook: "檢查 consumer group lag：XINFO GROUPS <stream-key>"

        # P1: disk usage on host 188 is growing too fast (more than 500 MiB in 1 hour).
        # Used bytes = size - avail on the "/" filesystem; the rule subtracts the
        # figure from 1 hour earlier from the current one.
        # The `offset` modifier must directly follow a selector, so it is attached
        # to each selector individually; applying it to a parenthesised expression
        # (`( ... ) offset 1h`) is rejected by the PromQL parser.
        # 524288000 bytes = 500 * 1024 * 1024.
        # NOTE(review): the expression measures the whole "/" filesystem, not only
        # PostgreSQL data — the alert name is narrower than what is measured.
        - alert: PostgreSQLDiskGrowthRate
          expr: |
            (
              node_filesystem_size_bytes{instance="192.168.0.188:9100", mountpoint="/"}
              - node_filesystem_avail_bytes{instance="192.168.0.188:9100", mountpoint="/"}
            )
            - (
              node_filesystem_size_bytes{instance="192.168.0.188:9100", mountpoint="/"} offset 1h
              - node_filesystem_avail_bytes{instance="192.168.0.188:9100", mountpoint="/"} offset 1h
            )
            > 524288000
          for: 5m
          labels:
            severity: warning
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "188 主機磁碟 1 小時增長超過 500MB"
            description: "磁碟在過去 1 小時增長 {{ $value | humanize1024 }}B，可能是 PostgreSQL WAL 或日誌暴增。"
            runbook: "SSH 188：df -h / && du -sh /var/lib/postgresql/*/pg_wal"

    # Host-to-host connectivity rules; this group is evaluated every 60 seconds.
    - name: awoooi_host_connectivity
      interval: 60s
      rules:

        # P0: network partition between hosts.
        # Fires (critical) when a probe_success series from the host-connectivity
        # job has been 0 for 5 minutes; the affected target is taken from the
        # `instance` label. If the job produces no series at all, the comparison
        # returns nothing and this rule stays silent.
        - alert: HostNetworkPartition
          expr: probe_success{job="host-connectivity"} == 0
          for: 5m
          labels:
            severity: critical
            alert_category: infrastructure
            notification_type: TYPE-3
          annotations:
            summary: "主機 {{ $labels.instance }} 無法連通"
            description: "TCP probe 到 {{ $labels.instance }} 失敗超過 5 分鐘，可能發生網路分區。"
            runbook: "SSH 檢查路由和防火牆規則"