Wave 8 P3.2 模型版本追蹤 + ADR-100 SLO 自我治理 + 配套: P3.2 — Model Version Tracking: - model_version_probe.py (268 行) — 探測 Ollama / OpenRouter 等 provider 的 model version - model_version_tracker.py (101 行) — 對齊 PG provider_version_history 表 - migrations/p3_2_provider_version_history.sql + rollback — 25 行 schema - db/models.py +32 行 — ProviderVersionHistory ORM ADR-100 — AI 自主化 SLO: - docs/adr/ADR-100-ai-autonomous-slo.md (167 行) — 飛輪 SLO 設計與閾值 - ops/monitoring/slo-rules.yml (254 行) — Prometheus SLO recording rules + alerts - ops/monitoring/tests/test_slo_rules.yaml (242 行) — promtool unit tests 整合修改: - main.py +72 行 — Lifespan 啟動 model_version_probe + KB rot cleaner schedule - gitea_webhook.py +45 行 — webhook 接收 model 版本變化通知 - ci_auto_repair.py / evidence_snapshot.py / pre_decision_investigator.py — 配合接線 新測試: - test_kb_rot_cleaner_schedule.py (120 行) — 9 tests pass - test_slo_rules.yaml — promtool 驗收 Tests: 9 passed (test_kb_rot_cleaner_schedule) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Co-Authored-By: Multiple Engineers (P3.2 + ADR-100) <noreply@anthropic.com>
255 lines
11 KiB
YAML
255 lines
11 KiB
YAML
# ops/monitoring/slo-rules.yml
|
||
# AI 自主化飛輪 SLO — Prometheus Recording Rules + Burn Rate Alerts
|
||
# 2026-04-27 P3.4 by Claude — AI SLO
|
||
# ADR-100: ai-autonomous-slo
|
||
#
|
||
# 部署目標: Prometheus rule_files 載入(與 alerts-unified.yml 同目錄)
|
||
# 部署方式: scripts/ops/deploy-alerts.sh (CD 自動部署)
|
||
#
|
||
# 4 個 SLO:
|
||
# SLO 1 — 自主化率 ≥ 80% sli:autonomy_rate:5m
|
||
# SLO 2 — 決策準確率 ≥ 90% sli:decision_accuracy:5m
|
||
# SLO 3 — 信心校準 ≥ 80% sli:confidence_calibration:1h
|
||
# SLO 4 — KM 增長率 ≥ 20/day sli:km_growth_rate:24h
|
||
#
|
||
# Burn rate alerts: SLO 1+2+3 各 3 個視窗 = 9 alerts
|
||
# KM growth alerts: SLO 4 用 2 個閾值告警 = 2 alerts
|
||
# 合計: 11 alerts
|
||
|
||
# Single rule group holding every SLI recording rule and SLO alert in this file.
groups:
  - name: ai_autonomous_slo
    # Evaluation interval for all rules in this group.
    interval: 30s
    rules:
|
||
# -----------------------------------------------------------------------
|
||
# Recording Rules — SLI 計算
|
||
# -----------------------------------------------------------------------
|
||
|
||
# SLO 1 SLI: autonomy rate = auto_executed operations / all operations.
# Both sides are 5m rates of automation_operation_log_total summed over all
# series; the numerator keeps only outcome="auto_executed".
- record: 'sli:autonomy_rate:5m'
  expr: |
    sum(rate(automation_operation_log_total{outcome="auto_executed"}[5m]))
    /
    sum(rate(automation_operation_log_total{}[5m]))
|
||
|
||
# SLO 2 SLI: decision accuracy = successful verifications / auto_executed
# operations, both as 5m rates summed over all series.
# NOTE(review): numerator and denominator come from different metrics, so the
# ratio is not bounded to 1 by construction - confirm this is acceptable.
- record: 'sli:decision_accuracy:5m'
  expr: |
    sum(rate(post_execution_verification_total{outcome="success"}[5m]))
    /
    sum(rate(automation_operation_log_total{outcome="auto_executed"}[5m]))
|
||
|
||
# SLO 3 SLI: confidence calibration = high-confidence successes /
# high-confidence total, computed over a 1h sliding window.
- record: 'sli:confidence_calibration:1h'
  expr: |
    sum(rate(approval_records_high_confidence_success_total[1h]))
    /
    sum(rate(approval_records_high_confidence_total[1h]))
|
||
|
||
# SLO 4 SLI: KM growth = absolute increase of knowledge_entries_total over 24h
# (a count, not a per-second rate).
# NOTE(review): there is no sum() here, so each series of the counter yields
# its own result - confirm the metric is exported as a single series.
- record: 'sli:km_growth_rate:24h'
  expr: 'increase(knowledge_entries_total[24h])'
|
||
|
||
# -----------------------------------------------------------------------
|
||
# Error Budget Recording Rules(輔助 Grafana 顯示)
|
||
# SLO 1/2/3: error_budget_remaining = 1 - (1 - SLI) / (1 - SLO_target)
|
||
# -----------------------------------------------------------------------
|
||
# Remaining error budget for SLO 1 (budget 0.20):
# 1 - error_rate / 0.20, with error_rate = 1 - SLI floored at 0 so the result
# never exceeds 1. It goes negative once the error rate is above 0.20.
- record: 'slo:autonomy_rate:error_budget_remaining'
  expr: |
    1 - clamp_min(1 - sli:autonomy_rate:5m, 0) / 0.20
|
||
|
||
# Remaining error budget for SLO 2 (budget 0.10):
# 1 - error_rate / 0.10, with error_rate = 1 - SLI floored at 0 so the result
# never exceeds 1. It goes negative once the error rate is above 0.10.
- record: 'slo:decision_accuracy:error_budget_remaining'
  expr: |
    1 - clamp_min(1 - sli:decision_accuracy:5m, 0) / 0.10
|
||
|
||
# Remaining error budget for SLO 3 (budget 0.20):
# 1 - error_rate / 0.20, with error_rate = 1 - SLI floored at 0 so the result
# never exceeds 1. It goes negative once the error rate is above 0.20.
- record: 'slo:confidence_calibration:error_budget_remaining'
  expr: |
    1 - clamp_min(1 - sli:confidence_calibration:1h, 0) / 0.20
|
||
|
||
# -----------------------------------------------------------------------
|
||
# Alert Rules — SLO 1: 自主化率(error budget 20%,SLO = 0.80)
|
||
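# Burn-rate formula: error_rate > budget_ratio × burn_rate, where burn_rate = (budget_period / window) × fraction_of_budget_consumed
# 30d budget; fast = 2% of the budget burned in 1h: burn_rate = (30d×24h / 1h) × 0.02 = 14.4 (a 28d period would give 13.44), so threshold = 0.20 × 14.4 = 2.88. NOTE: an error rate cannot exceed 1, so for this SLO any burn rate above 1 / 0.20 = 5 is unreachable.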
|
||
# -----------------------------------------------------------------------
|
||
|
||
# SLO 1 fast burn (critical), must hold for 2m.
# Fix 1: the previous condition, error rate > 0.20 * 14.4 = 2.88, could never
# be true because an error rate cannot exceed 1, so the alert was dead. With a
# 0.20 error budget the largest possible burn rate is 1 / 0.20 = 5x; this rule
# now uses 4x (error rate > 0.80, i.e. autonomy rate < 0.20).
# NOTE(review): 4x is a reviewer-chosen reachable value - confirm with ADR-100.
# Fix 2: the comparison is written against the SLI itself so that $value in
# the description is the autonomy rate, as the text says, instead of the
# error rate (1 - SLI) that the old expression exposed.
# NOTE(review): the referenced series is the 5m SLI while burn_window says 1h;
# the label is kept unchanged - confirm the intended window.
- alert: SLO_AutonomyRate_FastBurn
  expr: |
    sli:autonomy_rate:5m < 0.20
  for: 2m
  labels:
    severity: critical
    slo_name: autonomy_rate
    burn_window: 1h
    team: ai
    auto_repair: "false"
  annotations:
    summary: "SLO 自主化率 fast burn(自主化率 <20%,burn rate 4×)"
    description: "當前自主化率 {{ $value | humanizePercentage }},低於 80% 目標,1h burn rate 超標。"
    runbook: "查 automation_operation_log_total,確認 human_required 是否異常增加。"
|
||
|
||
# SLO 1 medium burn (warning), must hold for 15m.
# Fix 1: the previous condition, error rate > 0.20 * 6 = 1.2, could never be
# true because an error rate cannot exceed 1, so the alert was dead. With a
# 0.20 error budget the largest possible burn rate is 1 / 0.20 = 5x; this rule
# now uses 2.5x (error rate > 0.50, i.e. autonomy rate < 0.50).
# NOTE(review): 2.5x is a reviewer-chosen reachable value - confirm with ADR-100.
# Fix 2: the comparison is written against the SLI itself so that $value in
# the description is the autonomy rate, as the text says, instead of the
# error rate (1 - SLI) that the old expression exposed.
- alert: SLO_AutonomyRate_MediumBurn
  expr: |
    sli:autonomy_rate:5m < 0.50
  for: 15m
  labels:
    severity: warning
    slo_name: autonomy_rate
    burn_window: 6h
    team: ai
    auto_repair: "false"
  annotations:
    summary: "SLO 自主化率 medium burn(自主化率 <50%,burn rate 2.5×)"
    description: "當前自主化率 {{ $value | humanizePercentage }},6h 趨勢持續偏低。"
    runbook: "檢查 fusion decision threshold 是否過嚴,或 proactive_inspector 是否正常。"
|
||
|
||
# SLO 1 slow burn (info): error rate (1 - SLI) above 0.20 * 1.1 = 0.22,
# sustained for 1h. The original comment describes this as more than 10% of
# the error budget accumulated over 3d (burn rate 1.1x).
- alert: SLO_AutonomyRate_SlowBurn
  for: 1h
  expr: |
    (1 - sli:autonomy_rate:5m) > (0.20 * 1.1)
  annotations:
    summary: 'SLO 自主化率 slow burn(長期趨勢偏低)'
    description: '自主化率長期低於目標,累積 error budget 消耗率偏高,建議本週 review。'
    runbook: '分析近 7d 數據,是否需要重訓或調整 confidence threshold。'
  labels:
    slo_name: autonomy_rate
    burn_window: 3d
    severity: info
    team: ai
    auto_repair: 'false'
|
||
|
||
# -----------------------------------------------------------------------
|
||
# Alert Rules — SLO 2: 決策準確率(error budget 10%,SLO = 0.90)
|
||
# -----------------------------------------------------------------------
|
||
|
||
# SLO 2 fast burn (critical), must hold for 2m.
# Fix 1: the previous condition, error rate > 0.10 * 14.4 = 1.44, could never
# be true because an error rate cannot exceed 1, so the alert was dead. With a
# 0.10 error budget the largest possible burn rate is 1 / 0.10 = 10x; this
# rule now uses 8x (error rate > 0.80, i.e. decision accuracy < 0.20), which
# stays above the 6x medium-burn tier.
# NOTE(review): 8x is a reviewer-chosen reachable value - confirm with ADR-100.
# Fix 2: the comparison is written against the SLI itself so that $value in
# the description is the decision accuracy, as the text says, instead of the
# error rate (1 - SLI) that the old expression exposed.
- alert: SLO_DecisionAccuracy_FastBurn
  expr: |
    sli:decision_accuracy:5m < 0.20
  for: 2m
  labels:
    severity: critical
    slo_name: decision_accuracy
    burn_window: 1h
    team: ai
    auto_repair: "false"
  annotations:
    summary: "SLO 決策準確率 fast burn(準確率 <20%,burn rate 8×)"
    description: "決策準確率 {{ $value | humanizePercentage }},低於 90% 目標,需立即調查。"
    runbook: "查 post_execution_verification_total{outcome='failed'},確認是否 LLM 幻覺或指令執行失敗。"
|
||
|
||
# SLO 2 medium burn (warning): error rate (1 - SLI) above 0.10 * 6 = 0.60,
# sustained for 15m.
- alert: SLO_DecisionAccuracy_MediumBurn
  for: 15m
  expr: |
    (1 - sli:decision_accuracy:5m) > (0.10 * 6)
  annotations:
    summary: 'SLO 決策準確率 medium burn(6h 消耗 >5% budget)'
    description: '決策準確率 6h 趨勢持續偏低,建議強化 verifier 邏輯。'
    runbook: '增加 verifier 二次驗證,或提高 auto_execute confidence 門檻。'
  labels:
    slo_name: decision_accuracy
    burn_window: 6h
    severity: warning
    team: ai
    auto_repair: 'false'
|
||
|
||
# SLO 2 slow burn (info): error rate (1 - SLI) above 0.10 * 1.1 = 0.11,
# sustained for 1h.
- alert: SLO_DecisionAccuracy_SlowBurn
  for: 1h
  expr: |
    (1 - sli:decision_accuracy:5m) > (0.10 * 1.1)
  annotations:
    summary: 'SLO 決策準確率 slow burn(長期趨勢偏低)'
    description: '決策準確率長期低於目標,累積 error budget 消耗偏高。'
    runbook: '近 7d verifier 失敗分析,考慮 playbook fine-tune。'
  labels:
    slo_name: decision_accuracy
    burn_window: 3d
    severity: info
    team: ai
    auto_repair: 'false'
|
||
|
||
# -----------------------------------------------------------------------
|
||
# Alert Rules — SLO 3: 信心校準(error budget 20%,SLO = 0.80)
|
||
# -----------------------------------------------------------------------
|
||
|
||
# SLO 3 fast burn (critical), must hold for 2m.
# Fix: the previous condition, error rate > 0.20 * 14.4 = 2.88, could never be
# true because an error rate cannot exceed 1, so the alert was dead. With a
# 0.20 error budget the largest possible burn rate is 1 / 0.20 = 5x; this rule
# now uses 4x (error rate > 0.80).
# NOTE(review): 4x is a reviewer-chosen reachable value - confirm with ADR-100.
- alert: SLO_ConfidenceCalibration_FastBurn
  expr: |
    (1 - sli:confidence_calibration:1h) > (0.20 * 4)
  for: 2m
  labels:
    severity: critical
    slo_name: confidence_calibration
    burn_window: 1h
    team: ai
    auto_repair: "false"
  annotations:
    summary: "SLO 信心校準 fast burn(高信心決策準確率驟降)"
    description: "confidence≥0.8 的決策中驗證通過率驟降,AI 信心值失準,需緊急介入。"
    runbook: "查 approval_records_high_confidence_success_total,確認是否新模型或新 playbook 引入偏差。"
|
||
|
||
# SLO 3 medium burn (warning), must hold for 30m.
# Fix: the previous condition, error rate > 0.20 * 6 = 1.2, could never be
# true because an error rate cannot exceed 1, so the alert was dead. With a
# 0.20 error budget the largest possible burn rate is 1 / 0.20 = 5x; this rule
# now uses 2.5x (error rate > 0.50).
# NOTE(review): 2.5x is a reviewer-chosen reachable value - confirm with ADR-100.
- alert: SLO_ConfidenceCalibration_MediumBurn
  expr: |
    (1 - sli:confidence_calibration:1h) > (0.20 * 2.5)
  for: 30m
  labels:
    severity: warning
    slo_name: confidence_calibration
    burn_window: 6h
    team: ai
    auto_repair: "false"
  annotations:
    summary: "SLO 信心校準 medium burn(信心校準持續偏差)"
    description: "高信心決策準確率持續偏低,建議提高 auto_execute 信心閾值至 0.85。"
    runbook: "調整 FUSION_CONFIDENCE_THRESHOLD 並觀察 24h 趨勢。"
|
||
|
||
# SLO 3 slow burn (info): error rate (1 - SLI) above 0.20 * 1.1 = 0.22,
# sustained for 2h.
- alert: SLO_ConfidenceCalibration_SlowBurn
  for: 2h
  expr: |
    (1 - sli:confidence_calibration:1h) > (0.20 * 1.1)
  annotations:
    summary: 'SLO 信心校準 slow burn(長期信心校準偏差)'
    description: '高信心決策準確率長期不達標,建議觸發 P3.3 fine-tune 重訓。'
    runbook: '安排 fine-tune pipeline,以最近 KM 知識更新訓練資料。'
  labels:
    slo_name: confidence_calibration
    burn_window: 3d
    severity: info
    team: ai
    auto_repair: 'false'
|
||
|
||
# -----------------------------------------------------------------------
|
||
# Alert Rules — SLO 4: KM 增長率(絕對值告警)
|
||
# -----------------------------------------------------------------------
|
||
|
||
# SLO 4 warning: the 24h KM growth SLI stays below 20 for 10m.
# $value in the description is the SLI value itself.
- alert: SLO_KMGrowthRate_Low
  for: 10m
  expr: |
    sli:km_growth_rate:24h < 20
  annotations:
    summary: 'SLO KM 增長率偏低(< 20 筆/day)'
    description: '過去 24h KM 新增 {{ $value }} 筆,低於目標 20 筆/day。'
    runbook: '查 KM 寫入路徑(auto_execute 後 _write_execution_result_to_km),確認飛輪 KM 閉環正常。'
  labels:
    slo_name: km_growth_rate
    severity: warning
    team: ai
    auto_repair: 'false'
|
||
|
||
# SLO 4 critical: the 24h KM growth SLI stays below 5 for 10m.
# $value in the description is the SLI value itself.
# NOTE(review): a comparison yields no result when the series is absent, so a
# completely missing metric presumably would not trigger this alert - confirm.
- alert: SLO_KMGrowthRate_Critical
  for: 10m
  expr: |
    sli:km_growth_rate:24h < 5
  annotations:
    summary: 'SLO KM 增長率嚴重不足(< 5 筆/day)— 疑似 KM 鏈斷裂'
    description: '過去 24h KM 新增 {{ $value }} 筆,遠低於目標 20 筆/day,飛輪學習迴圈疑似中斷。'
    runbook: |
      1. 確認 knowledge_entries_total counter 是否正常遞增
      2. 查 governance_agent 日誌中 governance_km_growth_slo_violation
      3. 確認 auto_execute 後 KM 寫入路徑(feedback_flywheel_km_write_gap.md)
      4. 手動執行 POST /api/v1/governance/check
  labels:
    slo_name: km_growth_rate
    severity: critical
    team: ai
    auto_repair: 'false'
|