# ops/monitoring/slo-rules.yml
# AI autonomy flywheel SLOs — Prometheus recording rules + burn-rate alerts
# 2026-04-27 P3.4 by Claude — AI SLO
# ADR-100: ai-autonomous-slo
#
# Deployment target: loaded through Prometheus rule_files
#                    (same directory as alerts-unified.yml)
# Deployment method: scripts/ops/deploy-alerts.sh (deployed automatically by CD)
#
# 4 SLOs:
#   SLO 1 — autonomy rate          >= 80%     sli:autonomy_rate:5m
#   SLO 2 — decision accuracy      >= 90%     sli:decision_accuracy:5m
#   SLO 3 — confidence calibration >= 80%     sli:confidence_calibration:1h
#   SLO 4 — KM growth rate         >= 20/day  sli:km_growth_rate:24h
#
# Burn-rate alerts: 3 severity tiers each for SLO 1+2+3 = 9 alerts
# KM growth alerts: 2 absolute-threshold alerts for SLO 4 = 2 alerts
# Total: 11 alerts

groups:
  - name: ai_autonomous_slo
    interval: 30s
    rules:
      # -----------------------------------------------------------------------
      # Recording rules — SLI computation
      # -----------------------------------------------------------------------

      # SLO 1: autonomy rate = auto_executed operations / all operations
      - record: sli:autonomy_rate:5m
        expr: |
          sum(rate(automation_operation_log_total{outcome="auto_executed"}[5m]))
          /
          sum(rate(automation_operation_log_total{}[5m]))

      # SLO 2: decision accuracy = verifier successes / auto_executed operations
      - record: sli:decision_accuracy:5m
        expr: |
          sum(rate(post_execution_verification_total{outcome="success"}[5m]))
          /
          sum(rate(automation_operation_log_total{outcome="auto_executed"}[5m]))

      # SLO 3: confidence calibration =
      #        high_confidence_success / high_confidence_total (1h sliding window)
      - record: sli:confidence_calibration:1h
        expr: |
          sum(rate(approval_records_high_confidence_success_total[1h]))
          /
          sum(rate(approval_records_high_confidence_total[1h]))

      # SLO 4: KM growth rate = DB-derived 24h gauge; `or` falls back to the
      #        24h increase of the older counter when the gauge has no sample
      - record: sli:km_growth_rate:24h
        expr: |
          max(knowledge_entries_created_24h)
          or
          max(increase(knowledge_entries_total[24h]))

      # -----------------------------------------------------------------------
      # Error-budget recording rules (helpers for Grafana panels)
      # SLO 1/2/3: error_budget_remaining = 1 - (1 - SLI) / (1 - SLO_target)
      # clamp_min(..., 0) keeps an SLI above 1 from reporting more than 100%.
      # -----------------------------------------------------------------------
      - record: slo:autonomy_rate:error_budget_remaining
        expr: |
          1 - clamp_min(1 - sli:autonomy_rate:5m, 0) / 0.20

      - record: slo:decision_accuracy:error_budget_remaining
        expr: |
          1 - clamp_min(1 - sli:decision_accuracy:5m, 0) / 0.10

      - record: slo:confidence_calibration:error_budget_remaining
        expr: |
          1 - clamp_min(1 - sli:confidence_calibration:1h, 0) / 0.20

      # -----------------------------------------------------------------------
      # Alert rules — SLO 1: autonomy rate (error budget 20%, SLO = 0.80)
      #
      # Burn rate = error_ratio / error_budget, so an alert at burn rate B is
      #   (1 - SLI) > budget * B   <=>   SLI < 1 - budget * B
      #
      # The error ratio cannot exceed 1, so the burn rate cannot exceed
      # 1 / budget (5x for a 20% budget, 10x for a 10% budget). The textbook
      # 14.4x / 6x multipliers assume a tiny budget; applied here they produce
      # thresholds above 1 (0.20 * 14.4 = 2.88, 0.20 * 6 = 1.2) that can never
      # be crossed. The multipliers below are chosen to be reachable.
      # NOTE(review): 4x / 2.5x / 8x are judgement calls — confirm with the team.
      #
      # Expressions compare the SLI itself so that `$value` in the annotations
      # is the SLI, matching what the description text says it is.
      #
      # These alerts evaluate the short-window SLI recorded above; sustained
      # violation is enforced by `for:`. `burn_window` is kept as a tier label.
      # -----------------------------------------------------------------------

      # Fast burn: burn rate > 4x (error ratio > 80%, autonomy rate < 20%)
      - alert: SLO_AutonomyRate_FastBurn
        expr: |
          sli:autonomy_rate:5m < (1 - 0.20 * 4)
        for: 2m
        labels:
          severity: critical
          slo_name: autonomy_rate
          burn_window: 1h
          team: ai
          auto_repair: "false"
        annotations:
          summary: "SLO 自主化率 fast burn（burn rate >4×）"
          description: "當前自主化率 {{ $value | humanizePercentage }}，低於 80% 目標，1h burn rate 超標。"
          runbook: "查 automation_operation_log_total，確認 human_required 是否異常增加。"

      # Medium burn: burn rate > 2.5x (error ratio > 50%, autonomy rate < 50%)
      - alert: SLO_AutonomyRate_MediumBurn
        expr: |
          sli:autonomy_rate:5m < (1 - 0.20 * 2.5)
        for: 15m
        labels:
          severity: warning
          slo_name: autonomy_rate
          burn_window: 6h
          team: ai
          auto_repair: "false"
        annotations:
          summary: "SLO 自主化率 medium burn（burn rate >2.5×）"
          description: "當前自主化率 {{ $value | humanizePercentage }}，6h 趨勢持續偏低。"
          runbook: "檢查 fusion decision threshold 是否過嚴，或 proactive_inspector 是否正常。"

      # Slow burn: burn rate > 1.1x (error ratio > 22%, autonomy rate < 78%)
      - alert: SLO_AutonomyRate_SlowBurn
        expr: |
          sli:autonomy_rate:5m < (1 - 0.20 * 1.1)
        for: 1h
        labels:
          severity: info
          slo_name: autonomy_rate
          burn_window: 3d
          team: ai
          auto_repair: "false"
        annotations:
          summary: "SLO 自主化率 slow burn（長期趨勢偏低）"
          description: "自主化率長期低於目標，累積 error budget 消耗率偏高，建議本週 review。"
          runbook: "分析近 7d 數據，是否需要重訓或調整 confidence threshold。"

      # -----------------------------------------------------------------------
      # Alert rules — SLO 2: decision accuracy (error budget 10%, SLO = 0.90)
      # Maximum reachable burn rate is 10x; 14.4x would need an error ratio of
      # 1.44, so the fast tier uses 8x instead.
      # -----------------------------------------------------------------------

      # Fast burn: burn rate > 8x (error ratio > 80%, accuracy < 20%)
      - alert: SLO_DecisionAccuracy_FastBurn
        expr: |
          sli:decision_accuracy:5m < (1 - 0.10 * 8)
        for: 2m
        labels:
          severity: critical
          slo_name: decision_accuracy
          burn_window: 1h
          team: ai
          auto_repair: "false"
        annotations:
          summary: "SLO 決策準確率 fast burn（burn rate >8×）"
          description: "決策準確率 {{ $value | humanizePercentage }}，低於 90% 目標，需立即調查。"
          runbook: "查 post_execution_verification_total{outcome='failed'}，確認是否 LLM 幻覺或指令執行失敗。"

      # Medium burn: burn rate > 6x (error ratio > 60%, accuracy < 40%)
      - alert: SLO_DecisionAccuracy_MediumBurn
        expr: |
          sli:decision_accuracy:5m < (1 - 0.10 * 6)
        for: 15m
        labels:
          severity: warning
          slo_name: decision_accuracy
          burn_window: 6h
          team: ai
          auto_repair: "false"
        annotations:
          summary: "SLO 決策準確率 medium burn（6h 消耗 >5% budget）"
          description: "決策準確率 6h 趨勢持續偏低，建議強化 verifier 邏輯。"
          runbook: "增加 verifier 二次驗證，或提高 auto_execute confidence 門檻。"

      # Slow burn: burn rate > 1.1x (error ratio > 11%, accuracy < 89%)
      - alert: SLO_DecisionAccuracy_SlowBurn
        expr: |
          sli:decision_accuracy:5m < (1 - 0.10 * 1.1)
        for: 1h
        labels:
          severity: info
          slo_name: decision_accuracy
          burn_window: 3d
          team: ai
          auto_repair: "false"
        annotations:
          summary: "SLO 決策準確率 slow burn（長期趨勢偏低）"
          description: "決策準確率長期低於目標，累積 error budget 消耗偏高。"
          runbook: "近 7d verifier 失敗分析，考慮 playbook fine-tune。"

      # -----------------------------------------------------------------------
      # Alert rules — SLO 3: confidence calibration (error budget 20%, SLO = 0.80)
      # Same budget as SLO 1, so the same reachable multipliers are used.
      # -----------------------------------------------------------------------

      # Fast burn: burn rate > 4x (error ratio > 80%, calibration < 20%)
      - alert: SLO_ConfidenceCalibration_FastBurn
        expr: |
          sli:confidence_calibration:1h < (1 - 0.20 * 4)
        for: 2m
        labels:
          severity: critical
          slo_name: confidence_calibration
          burn_window: 1h
          team: ai
          auto_repair: "false"
        annotations:
          summary: "SLO 信心校準 fast burn（高信心決策準確率驟降）"
          description: "confidence≥0.8 的決策中驗證通過率驟降，AI 信心值失準，需緊急介入。"
          runbook: "查 approval_records_high_confidence_success_total，確認是否新模型或新 playbook 引入偏差。"

      # Medium burn: burn rate > 2.5x (error ratio > 50%, calibration < 50%)
      - alert: SLO_ConfidenceCalibration_MediumBurn
        expr: |
          sli:confidence_calibration:1h < (1 - 0.20 * 2.5)
        for: 30m
        labels:
          severity: warning
          slo_name: confidence_calibration
          burn_window: 6h
          team: ai
          auto_repair: "false"
        annotations:
          summary: "SLO 信心校準 medium burn（信心校準持續偏差）"
          description: "高信心決策準確率持續偏低，建議提高 auto_execute 信心閾值至 0.85。"
          runbook: "調整 FUSION_CONFIDENCE_THRESHOLD 並觀察 24h 趨勢。"

      # Slow burn: burn rate > 1.1x (error ratio > 22%, calibration < 78%)
      - alert: SLO_ConfidenceCalibration_SlowBurn
        expr: |
          sli:confidence_calibration:1h < (1 - 0.20 * 1.1)
        for: 2h
        labels:
          severity: info
          slo_name: confidence_calibration
          burn_window: 3d
          team: ai
          auto_repair: "false"
        annotations:
          summary: "SLO 信心校準 slow burn（長期信心校準偏差）"
          description: "高信心決策準確率長期不達標，建議觸發 P3.3 fine-tune 重訓。"
          runbook: "安排 fine-tune pipeline，以最近 KM 知識更新訓練資料。"

      # -----------------------------------------------------------------------
      # Alert rules — SLO 4: KM growth rate (absolute-value thresholds)
      # `$value` here is the recorded 24h entry count.
      # -----------------------------------------------------------------------

      # Warning: fewer than the 20 entries/day target
      - alert: SLO_KMGrowthRate_Low
        expr: |
          sli:km_growth_rate:24h < 20
        for: 10m
        labels:
          severity: warning
          slo_name: km_growth_rate
          team: ai
          auto_repair: "false"
        annotations:
          summary: "SLO KM 增長率偏低（< 20 筆/day）"
          description: "過去 24h KM 新增 {{ $value }} 筆，低於目標 20 筆/day。"
          runbook: "查 KM 寫入路徑（auto_execute 後 _write_execution_result_to_km），確認飛輪 KM 閉環正常。"

      # Critical: fewer than 5 entries/day
      - alert: SLO_KMGrowthRate_Critical
        expr: |
          sli:km_growth_rate:24h < 5
        for: 10m
        labels:
          severity: critical
          slo_name: km_growth_rate
          team: ai
          auto_repair: "false"
        annotations:
          summary: "SLO KM 增長率嚴重不足（< 5 筆/day）— 疑似 KM 鏈斷裂"
          description: "過去 24h KM 新增 {{ $value }} 筆，遠低於目標 20 筆/day，飛輪學習迴圈疑似中斷。"
          runbook: |
            1. 確認 knowledge_entries_created_24h gauge 與 knowledge_entries_total counter 是否正常遞增
            2. 查 governance_agent 日誌中 governance_km_growth_slo_violation
            3. 確認 auto_execute 後 KM 寫入路徑（feedback_flywheel_km_write_gap.md）
            4. 手動執行 POST /api/v1/governance/check