Files
awoooi/ops/monitoring/slo-rules.yml
Your Name 21dcfbd991
All checks were successful
Code Review / ai-code-review (push) Successful in 11s
Deploy Alert Rules / Deploy Prometheus Alert Rules (push) Successful in 22s
CD Pipeline / tests (push) Successful in 1m6s
CD Pipeline / build-and-deploy (push) Successful in 5m17s
CD Pipeline / post-deploy-checks (push) Successful in 1m38s
fix(governance): collapse km slo fallback series
2026-05-14 19:37:15 +08:00

258 lines
11 KiB
YAML
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# ops/monitoring/slo-rules.yml
# AI 自主化飛輪 SLO — Prometheus Recording Rules + Burn Rate Alerts
# 2026-04-27 P3.4 by Claude — AI SLO
# ADR-100: ai-autonomous-slo
#
# 部署目標: Prometheus rule_files 載入(與 alerts-unified.yml 同目錄)
# 部署方式: scripts/ops/deploy-alerts.sh (CD 自動部署)
#
# 4 個 SLO
# SLO 1 — 自主化率 ≥ 80% sli:autonomy_rate:5m
# SLO 2 — 決策準確率 ≥ 90% sli:decision_accuracy:5m
# SLO 3 — 信心校準 ≥ 80% sli:confidence_calibration:1h
# SLO 4 — KM 增長率 ≥ 20/day sli:km_growth_rate:24h
#
# Burn rate alerts: SLO 1+2+3 各 3 個視窗 = 9 alerts
# KM growth alerts: SLO 4 用 2 個閾值告警 = 2 alerts
# 合計: 11 alerts
groups:
  # Single rule group holding every SLI recording rule, error-budget recording
  # rule and SLO alert defined below; the group is evaluated every 30s.
  - name: ai_autonomous_slo
    interval: 30s
    rules:
# -----------------------------------------------------------------------
# Recording Rules — SLI 計算
# -----------------------------------------------------------------------
# SLO 1 — autonomy rate: the 5m rate of automation_operation_log_total samples
# with outcome="auto_executed", divided by the 5m rate over all outcomes.
# Both sides are summed first, so the result is a single label-less series.
- record: sli:autonomy_rate:5m
  expr: |
    sum(rate(automation_operation_log_total{outcome="auto_executed"}[5m])) /
    sum(rate(automation_operation_log_total[5m]))
# SLO 2 — decision accuracy: 5m rate of successful post-execution
# verifications divided by the 5m rate of auto-executed operations.
# Numerator and denominator come from two different metrics, so the ratio is
# not bounded by 1 by construction.
- record: sli:decision_accuracy:5m
  expr: |
    sum(rate(post_execution_verification_total{outcome="success"}[5m])) /
    sum(rate(automation_operation_log_total{outcome="auto_executed"}[5m]))
# SLO 3 — confidence calibration: high-confidence successes divided by all
# high-confidence records, both taken as rates over a 1h sliding window.
- record: sli:confidence_calibration:1h
  expr: |
    sum(rate(approval_records_high_confidence_success_total[1h])) /
    sum(rate(approval_records_high_confidence_total[1h]))
# SLO 4 — KM growth over 24h. The knowledge_entries_created_24h gauge
# (described by the original comment as DB-derived) is preferred; when it has
# no sample, `or` falls back to the 24h increase of the older
# knowledge_entries_total counter. max() strips the labels on each side, so
# exactly one series is recorded and the fallback only applies when the gauge
# side is empty.
- record: sli:km_growth_rate:24h
  expr: |
    max(knowledge_entries_created_24h)
    or
    max(increase(knowledge_entries_total[24h]))
# -----------------------------------------------------------------------
# Error Budget Recording Rules (輔助 Grafana 顯示)
# SLO 1/2/3: error_budget_remaining = 1 - (1 - SLI) / (1 - SLO_target)
# -----------------------------------------------------------------------
- record: slo:autonomy_rate:error_budget_remaining
  # Remaining error-budget fraction for SLO 1 (budget 0.20 = 1 - 0.80 target).
  # clamp_min keeps the error ratio at >= 0 when the SLI is above 1; the result
  # becomes negative once the error ratio exceeds the 0.20 budget.
  expr: |
    1 - (clamp_min(1 - sli:autonomy_rate:5m, 0) / 0.20)
- record: slo:decision_accuracy:error_budget_remaining
  # Remaining error-budget fraction for SLO 2 (budget 0.10 = 1 - 0.90 target).
  # clamp_min keeps the error ratio at >= 0 when the SLI is above 1; the result
  # becomes negative once the error ratio exceeds the 0.10 budget.
  expr: |
    1 - (clamp_min(1 - sli:decision_accuracy:5m, 0) / 0.10)
- record: slo:confidence_calibration:error_budget_remaining
  # Remaining error-budget fraction for SLO 3 (budget 0.20 = 1 - 0.80 target).
  # clamp_min keeps the error ratio at >= 0 when the SLI is above 1; the result
  # becomes negative once the error ratio exceeds the 0.20 budget.
  expr: |
    1 - (clamp_min(1 - sli:confidence_calibration:1h, 0) / 0.20)
# -----------------------------------------------------------------------
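# Alert Rules — SLO 1: 自主化率 (error budget 20%, SLO = 0.80)
# burn rate 公式: error_rate > budget_ratio × burn_rate
#   burn_rate = consumed_budget_fraction × (budget_period / window)
# 28d budget; fast = 1h burn 2%: threshold = 0.20 × (28d×24h/1h) × 0.02 = 0.20 × 13.44
# NOTE(review): the rules below multiply by 14.4, which is the 30d figure
#   (30d×24h/1h × 0.02 = 14.4), not the 28d figure derived here (13.44).
#   Pick one budget period and align this comment and the rules.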
# -----------------------------------------------------------------------
- alert: SLO_AutonomyRate_FastBurn
  # Fast-burn tier for SLO 1 (autonomy rate, target 0.80, error budget 0.20).
  # Intended meaning per the original comment: more than 2% of the error
  # budget consumed within 1h, i.e. a 14.4x burn rate.
  # NOTE(review): 0.20 * 14.4 = 2.88, but (1 - SLI) cannot exceed 1 for a
  # non-negative ratio, so this condition can never be true as written. With a
  # 20% budget the highest possible burn rate is 1 / 0.20 = 5x. The threshold
  # is left unchanged because choosing a new one is an SLO policy decision.
  # NOTE(review): the expression reads the 5m SLI; the 1h window only appears
  # in the burn_window label.
  expr: |
    (1 - sli:autonomy_rate:5m) > (0.20 * 14.4)
  for: 2m
  labels:
    severity: critical
    slo_name: autonomy_rate
    burn_window: 1h
    team: ai
    auto_repair: "false"
  annotations:
    summary: "SLO 自主化率 fast burn1h 消耗 >2% budget"
    # $value is the expression result (1 - SLI, the error ratio), not the
    # autonomy rate the sentence talks about, so the recorded SLI is queried
    # explicitly instead of printing $value.
    description: '當前自主化率 {{ with query "sli:autonomy_rate:5m" }}{{ . | first | value | humanizePercentage }}{{ end }},低於 80% 目標1h burn rate 超標。'
    runbook: "查 automation_operation_log_total確認 human_required 是否異常增加。"
- alert: SLO_AutonomyRate_MediumBurn
  # Medium-burn tier for SLO 1 (autonomy rate, error budget 0.20).
  # Intended meaning per the original comment: more than 5% of the error
  # budget consumed within 6h, i.e. a 6x burn rate.
  # NOTE(review): 0.20 * 6 = 1.2, but (1 - SLI) cannot exceed 1 for a
  # non-negative ratio, so this condition can never be true as written. The
  # threshold is left unchanged because choosing a new one is an SLO policy
  # decision.
  # NOTE(review): the expression reads the 5m SLI; the 6h window only appears
  # in the burn_window label.
  expr: |
    (1 - sli:autonomy_rate:5m) > (0.20 * 6)
  for: 15m
  labels:
    severity: warning
    slo_name: autonomy_rate
    burn_window: 6h
    team: ai
    auto_repair: "false"
  annotations:
    summary: "SLO 自主化率 medium burn6h 消耗 >5% budget"
    # $value is the expression result (1 - SLI, the error ratio), not the
    # autonomy rate the sentence talks about, so the recorded SLI is queried
    # explicitly instead of printing $value.
    description: '當前自主化率 {{ with query "sli:autonomy_rate:5m" }}{{ . | first | value | humanizePercentage }}{{ end }}6h 趨勢持續偏低。'
    runbook: "檢查 fusion decision threshold 是否過嚴,或 proactive_inspector 是否正常。"
- alert: SLO_AutonomyRate_SlowBurn
  # Slow-burn tier for SLO 1 (autonomy rate, error budget 0.20).
  # Intended meaning per the original comment: more than 10% of the error
  # budget accumulated over 3d, i.e. a 1.1x burn rate.
  # 0.20 * 1.1 = 0.22, so this fires when the 5m SLI stays below 0.78 for 1h.
  # The 3d window only appears in the burn_window label.
  expr: |
    (1 - sli:autonomy_rate:5m) > (0.20 * 1.1)
  for: 1h
  labels:
    severity: info
    slo_name: autonomy_rate
    burn_window: 3d
    team: ai
    auto_repair: "false"
  annotations:
    summary: "SLO 自主化率 slow burn長期趨勢偏低"
    description: "自主化率長期低於目標,累積 error budget 消耗率偏高,建議本週 review。"
    runbook: "分析近 7d 數據,是否需要重訓或調整 confidence threshold。"
# -----------------------------------------------------------------------
# Alert Rules — SLO 2: 決策準確率 (error budget 10%, SLO = 0.90)
# -----------------------------------------------------------------------
- alert: SLO_DecisionAccuracy_FastBurn
  # Fast-burn tier for SLO 2 (decision accuracy, target 0.90, budget 0.10),
  # using the 14.4x burn-rate multiplier.
  # NOTE(review): 0.10 * 14.4 = 1.44, but (1 - SLI) cannot exceed 1 for a
  # non-negative ratio, so this condition can never be true as written. With a
  # 10% budget the highest possible burn rate is 1 / 0.10 = 10x. The threshold
  # is left unchanged because choosing a new one is an SLO policy decision.
  # NOTE(review): the expression reads the 5m SLI; the 1h window only appears
  # in the burn_window label.
  expr: |
    (1 - sli:decision_accuracy:5m) > (0.10 * 14.4)
  for: 2m
  labels:
    severity: critical
    slo_name: decision_accuracy
    burn_window: 1h
    team: ai
    auto_repair: "false"
  annotations:
    summary: "SLO 決策準確率 fast burn1h 消耗 >2% budget"
    # $value is the expression result (1 - SLI, the error ratio), not the
    # decision accuracy the sentence talks about, so the recorded SLI is
    # queried explicitly instead of printing $value.
    description: '決策準確率 {{ with query "sli:decision_accuracy:5m" }}{{ . | first | value | humanizePercentage }}{{ end }},低於 90% 目標,需立即調查。'
    runbook: "查 post_execution_verification_total{outcome='failed'},確認是否 LLM 幻覺或指令執行失敗。"
- alert: SLO_DecisionAccuracy_MediumBurn
  # Medium-burn tier for SLO 2 (decision accuracy, budget 0.10), using the 6x
  # burn-rate multiplier.
  # 0.10 * 6 = 0.6, so this fires when the 5m SLI stays below 0.40 for 15m.
  # The 6h window only appears in the burn_window label.
  expr: |
    (1 - sli:decision_accuracy:5m) > (0.10 * 6)
  for: 15m
  labels:
    severity: warning
    slo_name: decision_accuracy
    burn_window: 6h
    team: ai
    auto_repair: "false"
  annotations:
    summary: "SLO 決策準確率 medium burn6h 消耗 >5% budget"
    description: "決策準確率 6h 趨勢持續偏低,建議強化 verifier 邏輯。"
    runbook: "增加 verifier 二次驗證,或提高 auto_execute confidence 門檻。"
- alert: SLO_DecisionAccuracy_SlowBurn
  # Slow-burn tier for SLO 2 (decision accuracy, budget 0.10), using the 1.1x
  # burn-rate multiplier.
  # 0.10 * 1.1 = 0.11, so this fires when the 5m SLI stays below 0.89 for 1h.
  # The 3d window only appears in the burn_window label.
  expr: |
    (1 - sli:decision_accuracy:5m) > (0.10 * 1.1)
  for: 1h
  labels:
    severity: info
    slo_name: decision_accuracy
    burn_window: 3d
    team: ai
    auto_repair: "false"
  annotations:
    summary: "SLO 決策準確率 slow burn長期趨勢偏低"
    description: "決策準確率長期低於目標,累積 error budget 消耗偏高。"
    runbook: "近 7d verifier 失敗分析,考慮 playbook fine-tune。"
# -----------------------------------------------------------------------
# Alert Rules — SLO 3: 信心校準 (error budget 20%, SLO = 0.80)
# -----------------------------------------------------------------------
- alert: SLO_ConfidenceCalibration_FastBurn
  # Fast-burn tier for SLO 3 (confidence calibration, target 0.80, budget
  # 0.20), using the 14.4x burn-rate multiplier on the 1h SLI.
  # NOTE(review): 0.20 * 14.4 = 2.88, but (1 - SLI) cannot exceed 1 for a
  # non-negative ratio, so this condition can never be true as written. With a
  # 20% budget the highest possible burn rate is 1 / 0.20 = 5x. The expression
  # is kept as-is; re-deriving the threshold is an SLO policy decision.
  expr: |
    (1 - sli:confidence_calibration:1h) > (0.20 * 14.4)
  for: 2m
  labels:
    severity: critical
    slo_name: confidence_calibration
    burn_window: 1h
    team: ai
    auto_repair: "false"
  annotations:
    summary: "SLO 信心校準 fast burn高信心決策準確率驟降"
    description: "confidence≥0.8 的決策中驗證通過率驟降AI 信心值失準,需緊急介入。"
    runbook: "查 approval_records_high_confidence_success_total確認是否新模型或新 playbook 引入偏差。"
- alert: SLO_ConfidenceCalibration_MediumBurn
  # Medium-burn tier for SLO 3 (confidence calibration, budget 0.20), using
  # the 6x burn-rate multiplier on the 1h SLI.
  # NOTE(review): 0.20 * 6 = 1.2, but (1 - SLI) cannot exceed 1 for a
  # non-negative ratio, so this condition can never be true as written. The
  # expression is kept as-is; re-deriving the threshold is an SLO policy
  # decision.
  # The 6h window only appears in the burn_window label.
  expr: |
    (1 - sli:confidence_calibration:1h) > (0.20 * 6)
  for: 30m
  labels:
    severity: warning
    slo_name: confidence_calibration
    burn_window: 6h
    team: ai
    auto_repair: "false"
  annotations:
    summary: "SLO 信心校準 medium burn信心校準持續偏差"
    description: "高信心決策準確率持續偏低,建議提高 auto_execute 信心閾值至 0.85。"
    runbook: "調整 FUSION_CONFIDENCE_THRESHOLD 並觀察 24h 趨勢。"
- alert: SLO_ConfidenceCalibration_SlowBurn
  # Slow-burn tier for SLO 3 (confidence calibration, budget 0.20), using the
  # 1.1x burn-rate multiplier on the 1h SLI.
  # 0.20 * 1.1 = 0.22, so this fires when the 1h SLI stays below 0.78 for 2h.
  # The 3d window only appears in the burn_window label.
  expr: |
    (1 - sli:confidence_calibration:1h) > (0.20 * 1.1)
  for: 2h
  labels:
    severity: info
    slo_name: confidence_calibration
    burn_window: 3d
    team: ai
    auto_repair: "false"
  annotations:
    summary: "SLO 信心校準 slow burn長期信心校準偏差"
    description: "高信心決策準確率長期不達標,建議觸發 P3.3 fine-tune 重訓。"
    runbook: "安排 fine-tune pipeline以最近 KM 知識更新訓練資料。"
# -----------------------------------------------------------------------
# Alert Rules — SLO 4: KM 增長率(絕對值告警)
# -----------------------------------------------------------------------
- alert: SLO_KMGrowthRate_Low
  # Warning tier for SLO 4: the recorded 24h KM growth has been below the
  # target of 20 entries for 10m. Values below 5 satisfy this condition too,
  # so this alert is active alongside SLO_KMGrowthRate_Critical.
  # $value here is the SLI itself, because the SLI is the left-hand side of
  # the comparison.
  expr: |
    sli:km_growth_rate:24h < 20
  for: 10m
  labels:
    severity: warning
    slo_name: km_growth_rate
    team: ai
    auto_repair: "false"
  annotations:
    summary: "SLO KM 增長率偏低(< 20 筆/day"
    description: "過去 24h KM 新增 {{ $value }} 筆,低於目標 20 筆/day。"
    runbook: "查 KM 寫入路徑auto_execute 後 _write_execution_result_to_km確認飛輪 KM 閉環正常。"
- alert: SLO_KMGrowthRate_Critical
  # Critical tier for SLO 4: the recorded 24h KM growth has been below 5
  # entries for 10m.
  # NOTE(review): when sli:km_growth_rate:24h has no sample at all (neither
  # source metric is present) the comparison yields nothing and this alert
  # stays silent — consider a companion absent() alert.
  expr: |
    sli:km_growth_rate:24h < 5
  for: 10m
  labels:
    severity: critical
    slo_name: km_growth_rate
    team: ai
    auto_repair: "false"
  annotations:
    summary: "SLO KM 增長率嚴重不足(< 5 筆/day— 疑似 KM 鏈斷裂"
    description: "過去 24h KM 新增 {{ $value }} 筆,遠低於目標 20 筆/day飛輪學習迴圈疑似中斷。"
    runbook: |
      1. 確認 knowledge_entries_created_24h gauge 與 knowledge_entries_total counter 是否正常遞增
      2. 查 governance_agent 日誌中 governance_km_growth_slo_violation
      3. 確認 auto_execute 後 KM 寫入路徑feedback_flywheel_km_write_gap.md
      4. 手動執行 POST /api/v1/governance/check