Files
awoooi/apps/api/src/services/ai_slo_calculator.py
Your Name d610c7386e
All checks were successful
CD Pipeline / tests (push) Successful in 1m20s
Code Review / ai-code-review (push) Successful in 12s
CD Pipeline / build-and-deploy (push) Successful in 7m32s
CD Pipeline / post-deploy-checks (push) Successful in 1m46s
fix(api): explain auto execute slo degradation
2026-06-01 17:45:13 +08:00

688 lines
27 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
AWOOOI AIOps Phase 6 — AI SLO 計算器(決策品質自我監控)
=========================================================
職責:滾動計算三大 AI 決策品質 SLO,違反閾值時寫入 ai_governance_events,
供 decision_manager 自我降級邏輯讀取。
三大 SLO(MASTER §3.6 ADR-087):
SLO-1 auto_execute_success_rate > 85%(7d 滾動)
SLO-2 human_override_rate < 20%(7d 滾動)
SLO-3 verifier_false_neg_rate < 5%(7d 滾動,proxy: 2h 內重複告警)
設計原則:
1. 純讀 + 純寫分離 — calculate() 只讀 DB,save_violation_event() 只寫 DB
2. 計算失敗 → 保守:假設 SLO 違反,寫 violation 事件
3. 所有結果快取 Redis(key: ai:slo:latest, TTL 5min),避免高頻查 DB
4. 不自動解決舊 violation — resolved 只能人工或下次「全部通過」時補填
ADR-087: AI 自我治理閉環
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 6 初始建立
"""
from __future__ import annotations
import json
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from math import ceil
from typing import Any
import structlog
from sqlalchemy import func, select, text
from src.db.base import get_db_context
from src.db.models import AiGovernanceEvent, AutoRepairExecution, ApprovalRecord
from src.utils.timezone import now_taipei
logger = structlog.get_logger(__name__)
# ─────────────────────────────────────────────────────────────────────────────
# SLO thresholds (MASTER §3.6 hard rule; ADR-087 must be updated before changing)
# ─────────────────────────────────────────────────────────────────────────────
SLO_AUTO_SUCCESS_MIN: float = 0.85  # lower bound for the auto_execute success rate
SLO_OVERRIDE_RATE_MAX: float = 0.20  # upper bound for the human override rate
SLO_FALSE_NEG_MAX: float = 0.05  # upper bound for the verifier false-negative rate
SLO_WINDOW_DAYS: int = 7  # rolling window, in days
SLO_MIN_SAMPLES: int = 5  # minimum sample count; below this a metric is not computed
REDIS_KEY = "ai:slo:latest"
REDIS_TTL_SEC = 300  # cache TTL: 5 minutes
# ─────────────────────────────────────────────────────────────────────────────
# Data Types
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class SloMetric:
"""單一 SLO 指標"""
name: str
value: float | None # None = 樣本不足,跳過
threshold: float
direction: str # "above" = 需高於閾值 / "below" = 需低於閾值
sample_count: int
violated: bool # 是否違反None → False不觸發降級
@property
def label(self) -> str:
if self.value is None:
return f"{self.name}: N/A樣本 {self.sample_count} < {SLO_MIN_SAMPLES}"
pct = f"{self.value:.1%}"
thr = f"{self.threshold:.0%}"
op = ">" if self.direction == "above" else "<"
status = "❌ 違反" if self.violated else "✅ 合規"
return f"{self.name}: {pct} (需 {op}{thr} {status}"
@dataclass
class SloReport:
    """Complete result of one SLO calculation run."""

    metrics: list[SloMetric] = field(default_factory=list)
    any_violated: bool = False
    calculated_at: str = field(default_factory=lambda: now_taipei().isoformat())
    window_days: int = SLO_WINDOW_DAYS
    diagnostics: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return a plain-dict form of the report, including each metric's label."""
        metric_attrs = (
            "name",
            "value",
            "threshold",
            "direction",
            "sample_count",
            "violated",
            "label",
        )
        serialized_metrics = [
            {attr: getattr(metric, attr) for attr in metric_attrs}
            for metric in self.metrics
        ]
        payload: dict = {
            "calculated_at": self.calculated_at,
            "window_days": self.window_days,
            "any_violated": self.any_violated,
            "metrics": serialized_metrics,
            "diagnostics": self.diagnostics,
        }
        return payload
# ─────────────────────────────────────────────────────────────────────────────
# Main Service
# ─────────────────────────────────────────────────────────────────────────────
class AiSloCalculator:
    """
    Calculator for the AI decision-quality SLOs.

    Usage:
        calc = AiSloCalculator()
        report = await calc.calculate()
        if report.any_violated:
            await calc.save_violation_event(report)
    """

    async def calculate(self) -> SloReport:
        """
        Compute the three SLO metrics over the rolling SLO_WINDOW_DAYS window.

        Returns:
            SloReport. If anything raises in this method, a conservative report
            holding a single violated "calculation_error" metric is returned
            (any_violated=True).
        """
        try:
            window_start = now_taipei() - timedelta(days=SLO_WINDOW_DAYS)
            async with get_db_context() as session:
                auto_success = await self._calc_auto_success_rate(session, window_start)
                override = await self._calc_human_override_rate(session, window_start)
                false_neg = await self._calc_false_neg_rate(session, window_start)

                # Diagnostics are only collected when SLO-1 is violated.
                diagnostics = {}
                if auto_success.violated:
                    diagnostics["auto_execute_success_rate"] = (
                        await self._build_auto_success_diagnostics(session, window_start)
                    )

                metrics = [auto_success, override, false_neg]
                violated = any(metric.violated for metric in metrics)
                report = SloReport(
                    metrics=metrics,
                    any_violated=violated,
                    diagnostics=diagnostics,
                )
                logger.info(
                    "slo_calculated",
                    any_violated=violated,
                    slo1=auto_success.value,
                    slo2=override.value,
                    slo3=false_neg.value,
                )
                return report
        except Exception as exc:
            logger.error("slo_calculation_error", error=str(exc))
            # Conservative: a failed calculation is reported as a violation.
            return SloReport(
                metrics=[
                    SloMetric(
                        name="calculation_error",
                        value=None,
                        threshold=0.0,
                        direction="above",
                        sample_count=0,
                        violated=True,
                    )
                ],
                any_violated=True,
            )

    async def get_cached_report(self) -> SloReport | None:
        """Read the most recent SLO report from Redis; None if absent or unreadable."""
        try:
            from src.core.redis_client import get_redis

            raw = await get_redis().get(REDIS_KEY)
            if not raw:
                return None
            data = json.loads(raw)
            metric_keys = (
                "name",
                "value",
                "threshold",
                "direction",
                "sample_count",
                "violated",
            )
            return SloReport(
                metrics=[
                    SloMetric(**{key: item[key] for key in metric_keys})
                    for item in data.get("metrics", [])
                ],
                any_violated=data.get("any_violated", False),
                calculated_at=data.get("calculated_at", ""),
                window_days=data.get("window_days", SLO_WINDOW_DAYS),
                diagnostics=data.get("diagnostics", {}),
            )
        except Exception as exc:
            logger.warning("slo_cache_read_error", error=str(exc))
            return None

    async def cache_report(self, report: SloReport) -> None:
        """Store the report in Redis under REDIS_KEY with a REDIS_TTL_SEC expiry."""
        try:
            from src.core.redis_client import get_redis

            serialized = json.dumps(report.to_dict())
            await get_redis().set(REDIS_KEY, serialized, ex=REDIS_TTL_SEC)
        except Exception as exc:
            # Best effort: a cache write failure is logged, never raised.
            logger.warning("slo_cache_write_error", error=str(exc))

    async def save_violation_event(self, report: SloReport) -> None:
        """
        Persist the report as an unresolved "slo_violation" AiGovernanceEvent.

        Meant to be called only when report.any_violated is True. A new event
        is always added; existing events are not inspected.
        """
        try:
            async with get_db_context() as session:
                session.add(
                    AiGovernanceEvent(
                        event_type="slo_violation",
                        details=report.to_dict(),
                        resolved=False,
                    )
                )
                await session.commit()
                violated_names = [m.name for m in report.metrics if m.violated]
                logger.warning(
                    "slo_violation_recorded",
                    violated_metrics=violated_names,
                )
        except Exception as exc:
            logger.error("slo_violation_save_error", error=str(exc))

    async def run(self) -> SloReport:
        """
        Full cycle: calculate, cache, and record an event when violated.

        Returns:
            SloReport
        """
        report = await self.calculate()
        await self.cache_report(report)
        if not report.any_violated:
            return report
        await self.save_violation_event(report)
        return report

    # ──────────────────────────────────────────────────────────────────────────
    # Private: SLO calculation helpers
    # ──────────────────────────────────────────────────────────────────────────
    @staticmethod
    def _skipped_metric(
        name: str,
        threshold: float,
        direction: str,
        sample_count: int = 0,
    ) -> SloMetric:
        """Build a metric with no value (value=None) that is flagged as not violated."""
        return SloMetric(
            name=name,
            value=None,
            threshold=threshold,
            direction=direction,
            sample_count=sample_count,
            violated=False,
        )

    async def _calc_auto_success_rate(self, session, since) -> SloMetric:
        """
        SLO-1: share of AutoRepairExecution rows since `since` with success True.

        Fewer than SLO_MIN_SAMPLES rows, or any error, yields value=None with
        violated=False.
        NOTE(review): the error path is not conservative, unlike calculate().
        """
        metric_name = "auto_execute_success_rate"
        try:
            total_result = await session.execute(
                select(func.count()).where(AutoRepairExecution.created_at >= since)
            )
            total: int = total_result.scalar() or 0
            if total < SLO_MIN_SAMPLES:
                return self._skipped_metric(
                    metric_name, SLO_AUTO_SUCCESS_MIN, "above", total
                )

            success_result = await session.execute(
                select(func.count()).where(
                    AutoRepairExecution.created_at >= since,
                    AutoRepairExecution.success.is_(True),
                )
            )
            succeeded: int = success_result.scalar() or 0
            ratio = succeeded / total
            return SloMetric(
                name=metric_name,
                value=ratio,
                threshold=SLO_AUTO_SUCCESS_MIN,
                direction="above",
                sample_count=total,
                violated=ratio < SLO_AUTO_SUCCESS_MIN,
            )
        except Exception as exc:
            logger.warning("slo1_calc_error", error=str(exc))
            return self._skipped_metric(metric_name, SLO_AUTO_SUCCESS_MIN, "above")

    async def _calc_human_override_rate(self, session, since) -> SloMetric:
        """
        SLO-2: human override rate = rejected ApprovalRecord rows / all
        ApprovalRecord rows created since `since`.

        "Rejected" means ApprovalRecord.status == "rejected".
        NOTE(review): the original description restricted the denominator to
        AI proposals (requested_by LIKE 'ai_%' or 'system'); no such filter is
        applied by the queries below — confirm which is intended.
        """
        metric_name = "human_override_rate"
        try:
            total_result = await session.execute(
                select(func.count()).where(ApprovalRecord.created_at >= since)
            )
            total: int = total_result.scalar() or 0
            if total < SLO_MIN_SAMPLES:
                return self._skipped_metric(
                    metric_name, SLO_OVERRIDE_RATE_MAX, "below", total
                )

            rejected_result = await session.execute(
                select(func.count()).where(
                    ApprovalRecord.created_at >= since,
                    ApprovalRecord.status == "rejected",
                )
            )
            rejected: int = rejected_result.scalar() or 0
            ratio = rejected / total
            return SloMetric(
                name=metric_name,
                value=ratio,
                threshold=SLO_OVERRIDE_RATE_MAX,
                direction="below",
                sample_count=total,
                violated=ratio > SLO_OVERRIDE_RATE_MAX,
            )
        except Exception as exc:
            logger.warning("slo2_calc_error", error=str(exc))
            return self._skipped_metric(metric_name, SLO_OVERRIDE_RATE_MAX, "below")

    async def _calc_false_neg_rate(self, session, since) -> SloMetric:
        """
        SLO-3: verifier false-negative rate (proxy metric).

        A false negative is approximated as a successful auto-repair run that
        is followed, for the same incident_id and within 2 hours, by a failed
        run ("fixed, then broken again"). Implemented with two CTEs and a
        self-join on auto_repair_executions.

        NOTE(review): the numerator counts DISTINCT incident_id values while
        the denominator counts successful runs, so the two are in different
        units — confirm this is intended.
        """
        metric_name = "verifier_false_neg_rate"
        try:
            result = await session.execute(
                text("""
                    WITH success_runs AS (
                        SELECT incident_id, created_at
                        FROM auto_repair_executions
                        WHERE success = TRUE
                        AND created_at >= :since
                    ),
                    false_negs AS (
                        SELECT DISTINCT s.incident_id
                        FROM success_runs s
                        JOIN auto_repair_executions f
                        ON f.incident_id = s.incident_id
                        AND f.success = FALSE
                        AND f.created_at > s.created_at
                        AND f.created_at <= s.created_at + INTERVAL '2 hours'
                    )
                    SELECT
                    (SELECT COUNT(*) FROM success_runs) AS total_success,
                    (SELECT COUNT(*) FROM false_negs) AS false_neg_count
                """),
                {"since": since},
            )
            row = result.fetchone()
            total_success, false_neg = (row[0], row[1]) if row else (0, 0)
            if total_success < SLO_MIN_SAMPLES:
                return self._skipped_metric(
                    metric_name, SLO_FALSE_NEG_MAX, "below", total_success
                )
            ratio = false_neg / total_success
            return SloMetric(
                name=metric_name,
                value=ratio,
                threshold=SLO_FALSE_NEG_MAX,
                direction="below",
                sample_count=total_success,
                violated=ratio > SLO_FALSE_NEG_MAX,
            )
        except Exception as exc:
            logger.warning("slo3_calc_error", error=str(exc))
            return self._skipped_metric(metric_name, SLO_FALSE_NEG_MAX, "below")

    async def _build_auto_success_diagnostics(self, session, since) -> dict[str, Any]:
        """
        Build explainable diagnostics for the W-1 auto_execute_success_rate SLO.

        Loads every auto_repair_executions row since `since` (left-joined to
        incidents to derive an alert name) and delegates to
        build_auto_execute_success_diagnostics(). On error, returns a
        "diagnostics_unavailable" stub carrying the first 200 characters of
        the error text.
        """
        try:
            result = await session.execute(
                text("""
                    SELECT
                    are.incident_id,
                    are.playbook_id,
                    are.playbook_name,
                    are.success,
                    are.error_message,
                    are.created_at,
                    COALESCE(
                    inc.signals->0->>'alertname',
                    inc.signals->0->'labels'->>'alertname',
                    inc.signals->0->>'alert_name',
                    inc.affected_services->>0,
                    'unknown'
                    ) AS alertname
                    FROM auto_repair_executions are
                    LEFT JOIN incidents inc ON inc.incident_id = are.incident_id
                    WHERE are.created_at >= :since
                    ORDER BY are.created_at ASC
                """),
                {"since": since},
            )
            execution_rows = [dict(record._mapping) for record in result]
            return build_auto_execute_success_diagnostics(
                rows=execution_rows,
                now=now_taipei(),
                threshold=SLO_AUTO_SUCCESS_MIN,
                window_days=SLO_WINDOW_DAYS,
                min_samples=SLO_MIN_SAMPLES,
            )
        except Exception as exc:
            logger.warning("slo1_diagnostics_error", error=str(exc))
            return {
                "schema_version": "ai_slo_auto_execute_diagnostics_v1",
                "status": "diagnostics_unavailable",
                "error": str(exc)[:200],
            }
def build_auto_execute_success_diagnostics(
    rows: list[dict[str, Any]],
    now: datetime,
    threshold: float = SLO_AUTO_SUCCESS_MIN,
    window_days: int = SLO_WINDOW_DAYS,
    min_samples: int = SLO_MIN_SAMPLES,
) -> dict[str, Any]:
    """
    Build the W-1 diagnostics payload from auto_repair_executions rows.

    Kept free of I/O so that different callers can share the same semantics
    and the rolling-window recovery projection can be unit-tested.

    Args:
        rows: execution rows as dicts (keys read here: created_at, success;
            the failure-grouping helper reads further keys).
        now: reference time; also the sort fallback for rows without created_at.
        threshold: required success rate.
        window_days: rolling window length in days.
        min_samples: minimum sample count for the projection.

    Returns:
        A dict with schema_version "ai_slo_auto_execute_diagnostics_v1".
    """
    ordered = sorted(rows, key=lambda r: r.get("created_at") or now)
    failures = [row for row in ordered if not bool(row.get("success"))]
    total = len(ordered)
    failed = len(failures)
    success = total - failed
    rate = None if not total else success / total

    failure_groups = _build_failure_groups(failures)
    # Split the groups by whether their closure status marks them as sealed.
    sealed_groups: list[dict[str, Any]] = []
    open_groups: list[dict[str, Any]] = []
    for group in failure_groups:
        is_sealed = str(group.get("closure_status", "")).startswith("sealed_")
        (sealed_groups if is_sealed else open_groups).append(group)

    projected_green_at, projection_reason = _project_auto_success_green_at(
        rows=ordered,
        now=now,
        threshold=threshold,
        window_days=window_days,
        min_samples=min_samples,
    )

    if not failed:
        status = "green"
    elif open_groups:
        status = "needs_investigation"
    elif sealed_groups:
        status = "sealed_waiting_window"
    else:
        status = "insufficient_diagnostics"

    summary = {
        "total": total,
        "success": success,
        "failed": failed,
        "rate": rate,
        "threshold": threshold,
        "window_days": window_days,
        "min_samples": min_samples,
    }
    projected_iso = projected_green_at.isoformat() if projected_green_at else None
    return {
        "schema_version": "ai_slo_auto_execute_diagnostics_v1",
        "status": status,
        "summary": summary,
        "top_failure_groups": failure_groups[:5],
        "sealed_failure_group_count": len(sealed_groups),
        "open_failure_group_count": len(open_groups),
        "immediate_successes_needed": _successes_needed_now(success, total, threshold),
        "projected_green_at": projected_iso,
        "projection_reason": projection_reason,
        "next_action": _auto_execute_diagnostics_next_action(status),
    }
def _build_failure_groups(failures: list[dict[str, Any]]) -> list[dict[str, Any]]:
groups: dict[tuple[str, str, str, str], dict[str, Any]] = {}
for row in failures:
alertname = str(row.get("alertname") or "unknown")
playbook_id = str(row.get("playbook_id") or "unknown")
playbook_name = str(row.get("playbook_name") or "unknown")
error_signature = _auto_repair_error_signature(row.get("error_message"))
key = (alertname, playbook_id, playbook_name, error_signature)
group = groups.setdefault(
key,
{
"alertname": alertname,
"playbook_id": playbook_id,
"playbook_name": playbook_name,
"error_signature": error_signature,
"count": 0,
"first_seen": None,
"last_seen": None,
"example_incident_id": row.get("incident_id"),
},
)
group["count"] += 1
created_at = row.get("created_at")
if isinstance(created_at, datetime):
if group["first_seen"] is None or created_at < group["first_seen"]:
group["first_seen"] = created_at
if group["last_seen"] is None or created_at > group["last_seen"]:
group["last_seen"] = created_at
enriched = []
for group in groups.values():
closure = _classify_auto_repair_failure_closure(group)
enriched.append({
**group,
"first_seen": group["first_seen"].isoformat() if group["first_seen"] else None,
"last_seen": group["last_seen"].isoformat() if group["last_seen"] else None,
**closure,
})
return sorted(enriched, key=lambda item: item["count"], reverse=True)
def _auto_repair_error_signature(error_message: Any) -> str:
error = str(error_message or "").strip().lower()
if not error:
return "missing_error_message"
if "unsupported scheme" in error and "docker restart" in error:
return "legacy_ssh_docker_restart"
if "nodes" in error and "not found" in error:
return "k3s_node_target_not_found"
if "http error" in error:
return "http_error"
if "timeout" in error:
return "timeout"
compact = " ".join(error.split())
return compact[:120] or "unknown_error"
def _classify_auto_repair_failure_closure(group: dict[str, Any]) -> dict[str, str]:
signature = str(group.get("error_signature") or "")
alertname = str(group.get("alertname") or "")
playbook_name = str(group.get("playbook_name") or "")
text = f"{alertname} {playbook_name}".lower()
if signature == "legacy_ssh_docker_restart":
return {
"closure_status": "sealed_by_mcp_grant",
"closure_label": "已封口Docker restart 已改走 ssh_docker_restart/write MCP grant",
"recommended_action": "觀察後續 DockerContainerUnhealthy 執行,不回填舊歷史",
}
if signature == "k3s_node_target_not_found" and (
"stock" in text or "wooo.work" in text or "external" in text
):
return {
"closure_status": "sealed_by_external_site_guard",
"closure_label": "已封口:外部站台告警已阻擋 K3s node PlayBook 誤配",
"recommended_action": "觀察 StockWoooWorkDown 是否改走 external_site_down / NO_ACTION",
}
return {
"closure_status": "open_failure_source",
"closure_label": "待調查:尚未匹配到已封口修復來源",
"recommended_action": "反查 incident truth-chain、PlayBook、MCP 執行紀錄",
}
def _successes_needed_now(success: int, total: int, threshold: float) -> int:
if total <= 0 or threshold >= 1:
return 0
gap = (threshold * total) - success
if gap <= 0:
return 0
return max(0, ceil(gap / (1 - threshold)))
def _project_auto_success_green_at(
rows: list[dict[str, Any]],
now: datetime,
threshold: float,
window_days: int,
min_samples: int,
) -> tuple[datetime | None, str | None]:
window = timedelta(days=window_days)
current_rows = [
row for row in rows
if isinstance(row.get("created_at"), datetime)
and row["created_at"] >= now - window
]
current_total = len(current_rows)
current_success = sum(1 for row in current_rows if bool(row.get("success")))
if current_total < min_samples:
return now, "sample_window_below_min"
if current_success / current_total >= threshold:
return now, "already_green"
candidates = sorted({
row["created_at"] + window + timedelta(seconds=1)
for row in current_rows
if row["created_at"] + window > now
})
for checkpoint in candidates:
active_rows = [
row for row in rows
if isinstance(row.get("created_at"), datetime)
and row["created_at"] >= checkpoint - window
and row["created_at"] <= checkpoint
]
active_total = len(active_rows)
active_success = sum(1 for row in active_rows if bool(row.get("success")))
if active_total < min_samples:
return checkpoint, "sample_window_below_min"
if active_success / active_total >= threshold:
return checkpoint, "rolling_window_if_no_new_failures"
return None, "no_projection_available"
def _auto_execute_diagnostics_next_action(status: str) -> str:
if status == "green":
return "keep_monitoring"
if status == "sealed_waiting_window":
return "observe_rolling_window_no_manual_restart"
if status == "needs_investigation":
return "investigate_open_failure_groups"
return "refresh_truth_chain_and_execution_logs"
# ─────────────────────────────────────────────────────────────────────────────
# Singleton
# ─────────────────────────────────────────────────────────────────────────────
_calculator: AiSloCalculator | None = None


def get_ai_slo_calculator() -> AiSloCalculator:
    """Return the module-level AiSloCalculator, creating it on first call."""
    global _calculator
    instance = _calculator
    if instance is None:
        instance = _calculator = AiSloCalculator()
    return instance