# awoooi/ops/monitoring/slo-rules.yml

# ops/monitoring/slo-rules.yml
# AI 自主化飛輪 SLO — Prometheus Recording Rules + Burn Rate Alerts
# 2026-04-27 P3.4 by Claude — AI SLO
# ADR-100: ai-autonomous-slo
#
# 部署目標: Prometheus rule_files 載入（與 alerts-unified.yml 同目錄）
# 部署方式: scripts/ops/deploy-alerts.sh (CD 自動部署)
#
# 4 個 SLO：
#   SLO 1 — 自主化率      ≥ 80%    sli:autonomy_rate:5m
#   SLO 2 — 決策準確率    ≥ 90%    sli:decision_accuracy:5m
#   SLO 3 — 信心校準      ≥ 80%    sli:confidence_calibration:1h
#   SLO 4 — KM 增長率     ≥ 20/day sli:km_growth_rate:24h
#
# Burn rate alerts: SLO 1+2+3 各 3 個視窗 = 9 alerts
# KM growth alerts: SLO 4 用 2 個閾值告警 = 2 alerts
# 合計: 11 alerts

groups:
  - name: ai_autonomous_slo
    interval: 30s
    rules:
      # -----------------------------------------------------------------------
      # Recording Rules — SLI 計算
      # -----------------------------------------------------------------------

      # SLO 1: autonomy rate = auto_executed operations / all operations (5m rates).
      # The numerator falls back to 0 when no auto_executed series has samples in
      # the window. Without the fallback, sum() over an empty vector returns
      # nothing, the SLI disappears, and the autonomy alerts stay silent exactly
      # when the rate is 0. If the denominator has no samples either, the result is
      # still empty (and 0/0 is still NaN), so "no traffic" does not raise alerts.
      - record: sli:autonomy_rate:5m
        expr: |
          (
            sum(rate(automation_operation_log_total{outcome="auto_executed"}[5m]))
            or vector(0)
          )
          /
          sum(rate(automation_operation_log_total[5m]))

      # SLO 2: decision accuracy = successful post-execution verifications divided
      # by auto_executed operations, both taken as 5m rates.
      # NOTE(review): numerator and denominator come from different counters, so
      # the ratio can momentarily exceed 1 — confirm this is acceptable.
      - record: sli:decision_accuracy:5m
        expr: >-
          (sum(rate(post_execution_verification_total{outcome="success"}[5m])))
          /
          (sum(rate(automation_operation_log_total{outcome="auto_executed"}[5m])))

      # SLO 3: confidence calibration = high-confidence successes divided by all
      # high-confidence records, evaluated over a 1h sliding window.
      - record: sli:confidence_calibration:1h
        expr: >-
          (sum(rate(approval_records_high_confidence_success_total[1h])))
          /
          (sum(rate(approval_records_high_confidence_total[1h])))

      # SLO 4: knowledge-base growth = increase of knowledge_entries_total over the
      # last 24h (an absolute count, not a per-second rate).
      # Aggregated with sum() like the other SLIs so the 20/day target is compared
      # against the total across all series rather than against each series
      # separately (which would raise one alert per instance / label set).
      # NOTE(review): assumes every series counts distinct entries — confirm that
      # replicas do not each export the same global count.
      - record: sli:km_growth_rate:24h
        expr: sum(increase(knowledge_entries_total[24h]))

      # -----------------------------------------------------------------------
      # Error Budget Recording Rules（輔助 Grafana 顯示）
      # SLO 1/2/3: error_budget_remaining = 1 - (1 - SLI) / (1 - SLO_target)
      # -----------------------------------------------------------------------
      # Share of the autonomy-rate error budget left, judged from the 5m SLI alone:
      # 1 - (1 - SLI) / 0.20, where 0.20 is the budget of the 80% target.
      # clamp_min stops an SLI above 1 from reporting more than 100% remaining.
      - record: slo:autonomy_rate:error_budget_remaining
        expr: '1 - (clamp_min(1 - sli:autonomy_rate:5m, 0) / 0.20)'

      # Share of the decision-accuracy error budget left, judged from the 5m SLI:
      # 1 - (1 - SLI) / 0.10, where 0.10 is the budget of the 90% target.
      # clamp_min stops an SLI above 1 from reporting more than 100% remaining.
      - record: slo:decision_accuracy:error_budget_remaining
        expr: '1 - (clamp_min(1 - sli:decision_accuracy:5m, 0) / 0.10)'

      # Share of the confidence-calibration error budget left, judged from the 1h
      # SLI: 1 - (1 - SLI) / 0.20, where 0.20 is the budget of the 80% target.
      # clamp_min stops an SLI above 1 from reporting more than 100% remaining.
      - record: slo:confidence_calibration:error_budget_remaining
        expr: '1 - (clamp_min(1 - sli:confidence_calibration:1h, 0) / 0.20)'

      # -----------------------------------------------------------------------
      # Alert Rules — SLO 1: 自主化率（error budget 20%，SLO = 0.80）
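      # Burn-rate formula: alert when error_ratio > (1 - SLO) × burn_rate, where
      # burn_rate = budget_fraction_consumed × (budget_period / window).
      # Example (fast burn, 2% of the budget in 1h): 0.02 × (28d × 24h / 1h) = 13.44
      # for a 28d period, or 0.02 × (30d × 24h / 1h) = 14.4 for a 30d period.
      # error_ratio never exceeds 1, so burn_rate must stay below 1 / 0.20 = 5 here.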
      # -----------------------------------------------------------------------

      - alert: SLO_AutonomyRate_FastBurn
        # Fast burn: the 5m autonomy rate is below 10%, i.e. the error ratio
        # exceeds 0.20 * 4.5 = 0.90 (burn rate 4.5x on the 20% error budget).
        # A threshold above 1 (such as 0.20 * 14.4 = 2.88) can never be exceeded
        # by a ratio that tops out at 1, so the alert would be unable to fire;
        # with an 80% SLO the largest possible burn rate is 1 / 0.20 = 5x.
        # The comparison is written on the SLI itself so that $value in the
        # description is the autonomy rate rather than the error ratio.
        # NOTE(review): 4.5x is a proposed reachable multiplier — confirm with the
        # SLO owners. The input is a 5m rate held for 2m, not a true 1h window.
        expr: |
          sli:autonomy_rate:5m < 1 - (0.20 * 4.5)
        for: 2m
        labels:
          severity: critical
          slo_name: autonomy_rate
          burn_window: 1h
          team: ai
          auto_repair: "false"
        annotations:
          summary: "SLO 自主化率 fast burn（自主化率 <10%，burn rate 4.5×）"
          description: "當前自主化率 {{ $value | humanizePercentage }}，低於 80% 目標，1h burn rate 超標。"
          runbook: "查 automation_operation_log_total，確認 human_required 是否異常增加。"

      - alert: SLO_AutonomyRate_MediumBurn
        # Medium burn: the 5m autonomy rate is below 40%, i.e. the error ratio
        # exceeds 0.20 * 3 = 0.60 (burn rate 3x on the 20% error budget), held
        # for 15 minutes.
        # A threshold above 1 (such as 0.20 * 6 = 1.2) can never be exceeded by a
        # ratio that tops out at 1, so the alert would be unable to fire; with an
        # 80% SLO the largest possible burn rate is 1 / 0.20 = 5x.
        # The comparison is written on the SLI itself so that $value in the
        # description is the autonomy rate rather than the error ratio.
        # NOTE(review): 3x is a proposed reachable multiplier — confirm with the
        # SLO owners. The input is a 5m rate, not a true 6h window.
        expr: |
          sli:autonomy_rate:5m < 1 - (0.20 * 3)
        for: 15m
        labels:
          severity: warning
          slo_name: autonomy_rate
          burn_window: 6h
          team: ai
          auto_repair: "false"
        annotations:
          summary: "SLO 自主化率 medium burn（自主化率 <40%，burn rate 3×）"
          description: "當前自主化率 {{ $value | humanizePercentage }}，6h 趨勢持續偏低。"
          runbook: "檢查 fusion decision threshold 是否過嚴，或 proactive_inspector 是否正常。"

      - alert: SLO_AutonomyRate_SlowBurn
        # Slow burn: error ratio (1 - SLI) above 0.20 * 1.1 = 0.22, i.e. 1.1 times
        # the 20% error budget, held for 1 hour before firing.
        expr: '(1 - sli:autonomy_rate:5m) > (0.20 * 1.1)'
        for: 1h
        labels:
          team: ai
          slo_name: autonomy_rate
          burn_window: 3d
          severity: info
          auto_repair: "false"
        annotations:
          summary: "SLO 自主化率 slow burn（長期趨勢偏低）"
          runbook: "分析近 7d 數據，是否需要重訓或調整 confidence threshold。"
          description: "自主化率長期低於目標，累積 error budget 消耗率偏高，建議本週 review。"

      # -----------------------------------------------------------------------
      # Alert Rules — SLO 2: 決策準確率（error budget 10%，SLO = 0.90）
      # -----------------------------------------------------------------------

      - alert: SLO_DecisionAccuracy_FastBurn
        # Fast burn: the 5m decision accuracy is below 10%, i.e. the error ratio
        # exceeds 0.10 * 9 = 0.90 (burn rate 9x on the 10% error budget).
        # A threshold above 1 (such as 0.10 * 14.4 = 1.44) can never be exceeded
        # by a ratio that tops out at 1, so the alert would be unable to fire;
        # with a 90% SLO the largest possible burn rate is 1 / 0.10 = 10x.
        # The comparison is written on the SLI itself so that $value in the
        # description is the accuracy rather than the error ratio.
        # NOTE(review): 9x is a proposed reachable multiplier — confirm with the
        # SLO owners. The input is a 5m rate held for 2m, not a true 1h window.
        expr: |
          sli:decision_accuracy:5m < 1 - (0.10 * 9)
        for: 2m
        labels:
          severity: critical
          slo_name: decision_accuracy
          burn_window: 1h
          team: ai
          auto_repair: "false"
        annotations:
          summary: "SLO 決策準確率 fast burn（準確率 <10%，burn rate 9×）"
          description: "決策準確率 {{ $value | humanizePercentage }}，低於 90% 目標，需立即調查。"
          runbook: "查 post_execution_verification_total{outcome='failed'}，確認是否 LLM 幻覺或指令執行失敗。"

      - alert: SLO_DecisionAccuracy_MediumBurn
        # Medium burn: error ratio (1 - SLI) above 0.10 * 6 = 0.60, i.e. six times
        # the 10% error budget, held for 15 minutes before firing.
        expr: '(1 - sli:decision_accuracy:5m) > (0.10 * 6)'
        for: 15m
        labels:
          team: ai
          slo_name: decision_accuracy
          burn_window: 6h
          severity: warning
          auto_repair: "false"
        annotations:
          summary: "SLO 決策準確率 medium burn（6h 消耗 >5% budget）"
          runbook: "增加 verifier 二次驗證，或提高 auto_execute confidence 門檻。"
          description: "決策準確率 6h 趨勢持續偏低，建議強化 verifier 邏輯。"

      - alert: SLO_DecisionAccuracy_SlowBurn
        # Slow burn: error ratio (1 - SLI) above 0.10 * 1.1 = 0.11, i.e. 1.1 times
        # the 10% error budget, held for 1 hour before firing.
        expr: '(1 - sli:decision_accuracy:5m) > (0.10 * 1.1)'
        for: 1h
        labels:
          team: ai
          slo_name: decision_accuracy
          burn_window: 3d
          severity: info
          auto_repair: "false"
        annotations:
          summary: "SLO 決策準確率 slow burn（長期趨勢偏低）"
          runbook: "近 7d verifier 失敗分析，考慮 playbook fine-tune。"
          description: "決策準確率長期低於目標，累積 error budget 消耗偏高。"

      # -----------------------------------------------------------------------
      # Alert Rules — SLO 3: 信心校準（error budget 20%，SLO = 0.80）
      # -----------------------------------------------------------------------

      - alert: SLO_ConfidenceCalibration_FastBurn
        # Fast burn: error ratio (1 - SLI) above 0.20 * 4.5 = 0.90, i.e. fewer
        # than 10% of high-confidence decisions succeed (burn rate 4.5x on the
        # 20% error budget), held for 2 minutes.
        # A threshold above 1 (such as 0.20 * 14.4 = 2.88) can never be exceeded
        # by a ratio that tops out at 1, so the alert would be unable to fire;
        # with an 80% SLO the largest possible burn rate is 1 / 0.20 = 5x.
        # NOTE(review): 4.5x is a proposed reachable multiplier — confirm with the
        # SLO owners.
        expr: |
          (1 - sli:confidence_calibration:1h) > (0.20 * 4.5)
        for: 2m
        labels:
          severity: critical
          slo_name: confidence_calibration
          burn_window: 1h
          team: ai
          auto_repair: "false"
        annotations:
          summary: "SLO 信心校準 fast burn（高信心決策準確率驟降）"
          description: "confidence≥0.8 的決策中驗證通過率驟降，AI 信心值失準，需緊急介入。"
          runbook: "查 approval_records_high_confidence_success_total，確認是否新模型或新 playbook 引入偏差。"

      - alert: SLO_ConfidenceCalibration_MediumBurn
        # Medium burn: error ratio (1 - SLI) above 0.20 * 3 = 0.60, i.e. fewer
        # than 40% of high-confidence decisions succeed (burn rate 3x on the 20%
        # error budget), held for 30 minutes.
        # A threshold above 1 (such as 0.20 * 6 = 1.2) can never be exceeded by a
        # ratio that tops out at 1, so the alert would be unable to fire; with an
        # 80% SLO the largest possible burn rate is 1 / 0.20 = 5x.
        # NOTE(review): 3x is a proposed reachable multiplier — confirm with the
        # SLO owners. The input is a 1h rate, not a true 6h window.
        expr: |
          (1 - sli:confidence_calibration:1h) > (0.20 * 3)
        for: 30m
        labels:
          severity: warning
          slo_name: confidence_calibration
          burn_window: 6h
          team: ai
          auto_repair: "false"
        annotations:
          summary: "SLO 信心校準 medium burn（信心校準持續偏差）"
          description: "高信心決策準確率持續偏低，建議提高 auto_execute 信心閾值至 0.85。"
          runbook: "調整 FUSION_CONFIDENCE_THRESHOLD 並觀察 24h 趨勢。"

      - alert: SLO_ConfidenceCalibration_SlowBurn
        # Slow burn: error ratio (1 - SLI) above 0.20 * 1.1 = 0.22, i.e. 1.1 times
        # the 20% error budget, held for 2 hours before firing.
        expr: '(1 - sli:confidence_calibration:1h) > (0.20 * 1.1)'
        for: 2h
        labels:
          team: ai
          slo_name: confidence_calibration
          burn_window: 3d
          severity: info
          auto_repair: "false"
        annotations:
          summary: "SLO 信心校準 slow burn（長期信心校準偏差）"
          runbook: "安排 fine-tune pipeline，以最近 KM 知識更新訓練資料。"
          description: "高信心決策準確率長期不達標，建議觸發 P3.3 fine-tune 重訓。"

      # -----------------------------------------------------------------------
      # Alert Rules — SLO 4: KM 增長率（絕對值告警）
      # -----------------------------------------------------------------------

      - alert: SLO_KMGrowthRate_Low
        # Warning tier: the recorded 24h knowledge-entry increase is under the
        # 20-per-day target, held for 10 minutes. $value is the 24h increase.
        expr: 'sli:km_growth_rate:24h < 20'
        for: 10m
        labels:
          team: ai
          slo_name: km_growth_rate
          severity: warning
          auto_repair: "false"
        annotations:
          summary: "SLO KM 增長率偏低（< 20 筆/day）"
          runbook: "查 KM 寫入路徑（auto_execute 後 _write_execution_result_to_km），確認飛輪 KM 閉環正常。"
          description: "過去 24h KM 新增 {{ $value }} 筆，低於目標 20 筆/day。"

      - alert: SLO_KMGrowthRate_Critical
        # Critical tier: the recorded 24h knowledge-entry increase is under 5,
        # held for 10 minutes. $value is the 24h increase.
        expr: 'sli:km_growth_rate:24h < 5'
        for: 10m
        labels:
          team: ai
          slo_name: km_growth_rate
          severity: critical
          auto_repair: "false"
        annotations:
          summary: "SLO KM 增長率嚴重不足（< 5 筆/day）— 疑似 KM 鏈斷裂"
          description: "過去 24h KM 新增 {{ $value }} 筆，遠低於目標 20 筆/day，飛輪學習迴圈疑似中斷。"
          runbook: |
            1. 確認 knowledge_entries_total counter 是否正常遞增
            2. 查 governance_agent 日誌中 governance_km_growth_slo_violation
            3. 確認 auto_execute 後 KM 寫入路徑（feedback_flywheel_km_write_gap.md）
            4. 手動執行 POST /api/v1/governance/check