688 lines
27 KiB
Python
688 lines
27 KiB
Python
"""
|
||
AWOOOI AIOps Phase 6 — AI SLO 計算器(決策品質自我監控)
|
||
=========================================================
|
||
職責:滾動計算三大 AI 決策品質 SLO;違反閾值時寫入 ai_governance_events,
|
||
供 decision_manager 自我降級邏輯讀取。
|
||
|
||
三大 SLO(MASTER §3.6 ADR-087):
|
||
SLO-1 auto_execute_success_rate > 85% (7d 滾動)
|
||
SLO-2 human_override_rate < 20% (7d 滾動)
|
||
SLO-3 verifier_false_neg_rate < 5% (7d 滾動,proxy: 2h 內重複告警)
|
||
|
||
設計原則:
|
||
1. 純讀 + 純寫分離 — calculate() 只讀 DB,save_event() 只寫 DB
|
||
2. 計算失敗 → 保守:假設 SLO 違反,寫 violation 事件
|
||
3. 所有結果快取 Redis(key: ai:slo:latest, TTL 5min),避免高頻查 DB
|
||
4. 不自動解決舊 violation — resolved 只能人工或下次「全部通過」時補填
|
||
|
||
ADR-087: AI 自我治理閉環
|
||
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 6 初始建立
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
from dataclasses import dataclass, field
|
||
from datetime import datetime, timedelta
|
||
from math import ceil
|
||
from typing import Any
|
||
|
||
import structlog
|
||
from sqlalchemy import func, select, text
|
||
|
||
from src.db.base import get_db_context
|
||
from src.db.models import AiGovernanceEvent, AutoRepairExecution, ApprovalRecord
|
||
from src.utils.timezone import now_taipei
|
||
|
||
logger = structlog.get_logger(__name__)

# ─────────────────────────────────────────────────────────────────────────────
# SLO thresholds (MASTER §3.6 hard rule; ADR-087 must be updated before changing)
# ─────────────────────────────────────────────────────────────────────────────

SLO_AUTO_SUCCESS_MIN: float = 0.85   # lower bound for the auto_execute success rate
SLO_OVERRIDE_RATE_MAX: float = 0.20  # upper bound for the human override rate
SLO_FALSE_NEG_MAX: float = 0.05      # upper bound for the verifier false-negative rate

SLO_WINDOW_DAYS: int = 7             # rolling window (days)
SLO_MIN_SAMPLES: int = 5             # minimum sample count; below this the metric is not computed

REDIS_KEY = "ai:slo:latest"
REDIS_TTL_SEC = 300                  # 5-minute cache
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Data Types
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
@dataclass
class SloMetric:
    """A single SLO indicator and its evaluation result."""

    name: str
    value: float | None       # None = no measurement (too few samples, or the calculation failed)
    threshold: float
    direction: str            # "above" = must stay above threshold / "below" = must stay below
    sample_count: int
    violated: bool            # whether the SLO is considered violated

    @property
    def label(self) -> str:
        """Return a one-line human-readable summary of this metric.

        A metric without a value is normally reported as "not enough samples".
        When it is nevertheless flagged as violated (the conservative result of
        a failed calculation) the label says so instead of hiding the violation
        behind a sample-count message.
        """
        if self.value is None:
            if self.violated:
                return f"{self.name}: N/A(計算失敗)❌ 違反"
            return f"{self.name}: N/A(樣本 {self.sample_count} < {SLO_MIN_SAMPLES})"
        pct = f"{self.value:.1%}"
        thr = f"{self.threshold:.0%}"
        op = ">" if self.direction == "above" else "<"
        status = "❌ 違反" if self.violated else "✅ 合規"
        return f"{self.name}: {pct} (需 {op}{thr}) {status}"
|
||
|
||
|
||
@dataclass
class SloReport:
    """Full result of one SLO calculation run."""

    metrics: list[SloMetric] = field(default_factory=list)
    any_violated: bool = False
    calculated_at: str = field(default_factory=lambda: now_taipei().isoformat())
    window_days: int = SLO_WINDOW_DAYS
    diagnostics: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return a plain-dict form of the report, including each metric's label."""

        def _metric_payload(metric: SloMetric) -> dict:
            return {
                "name": metric.name,
                "value": metric.value,
                "threshold": metric.threshold,
                "direction": metric.direction,
                "sample_count": metric.sample_count,
                "violated": metric.violated,
                "label": metric.label,
            }

        payload: dict = {
            "calculated_at": self.calculated_at,
            "window_days": self.window_days,
            "any_violated": self.any_violated,
        }
        payload["metrics"] = [_metric_payload(metric) for metric in self.metrics]
        payload["diagnostics"] = self.diagnostics
        return payload
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Main Service
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
class AiSloCalculator:
    """
    Calculator for the AI decision-quality SLOs.

    Usage:
        calc = AiSloCalculator()
        report = await calc.calculate()
        if report.any_violated:
            await calc.save_violation_event(report)
    """

    async def calculate(self) -> SloReport:
        """
        Compute the three SLO metrics over the rolling window.

        Returns:
            SloReport. If the calculation as a whole fails, a conservative
            report with ``any_violated=True`` and a single
            ``calculation_error`` metric is returned instead of raising.
        """
        try:
            since = now_taipei() - timedelta(days=SLO_WINDOW_DAYS)

            async with get_db_context() as session:
                slo1 = await self._calc_auto_success_rate(session, since)
                slo2 = await self._calc_human_override_rate(session, since)
                slo3 = await self._calc_false_neg_rate(session, since)
                diagnostics: dict[str, Any] = {}
                # Diagnostics are only built when SLO-1 is violated.
                if slo1.violated:
                    diagnostics["auto_execute_success_rate"] = (
                        await self._build_auto_success_diagnostics(session, since)
                    )

            metrics = [slo1, slo2, slo3]
            any_violated = any(m.violated for m in metrics)

            report = SloReport(
                metrics=metrics,
                any_violated=any_violated,
                diagnostics=diagnostics,
            )

            logger.info(
                "slo_calculated",
                any_violated=any_violated,
                slo1=slo1.value,
                slo2=slo2.value,
                slo3=slo3.value,
            )
            return report

        except Exception as e:
            logger.error("slo_calculation_error", error=str(e))
            # Conservative: a failed calculation is treated as a violation.
            violated_metric = SloMetric(
                name="calculation_error",
                value=None,
                threshold=0.0,
                direction="above",
                sample_count=0,
                violated=True,
            )
            return SloReport(
                metrics=[violated_metric],
                any_violated=True,
            )

    async def get_cached_report(self) -> SloReport | None:
        """Read the most recent SLO report from Redis.

        Returns None when the key is absent/empty or when reading or decoding
        fails (the failure is logged as a warning).
        """
        try:
            from src.core.redis_client import get_redis

            redis = get_redis()
            raw = await redis.get(REDIS_KEY)
            if not raw:
                return None

            data = json.loads(raw)
            metrics = [
                SloMetric(
                    name=m["name"],
                    value=m["value"],
                    threshold=m["threshold"],
                    direction=m["direction"],
                    sample_count=m["sample_count"],
                    violated=m["violated"],
                )
                for m in data.get("metrics", [])
            ]
            return SloReport(
                metrics=metrics,
                any_violated=data.get("any_violated", False),
                calculated_at=data.get("calculated_at", ""),
                window_days=data.get("window_days", SLO_WINDOW_DAYS),
                diagnostics=data.get("diagnostics", {}),
            )
        except Exception as e:
            logger.warning("slo_cache_read_error", error=str(e))
        return None

    async def cache_report(self, report: SloReport) -> None:
        """Store the report in Redis under REDIS_KEY with a REDIS_TTL_SEC expiry.

        Best-effort: any failure is logged as a warning and not raised.
        """
        try:
            from src.core.redis_client import get_redis

            redis = get_redis()
            await redis.set(REDIS_KEY, json.dumps(report.to_dict()), ex=REDIS_TTL_SEC)
        except Exception as e:
            logger.warning("slo_cache_write_error", error=str(e))

    async def save_violation_event(self, report: SloReport) -> None:
        """
        Write an ``slo_violation`` row to ai_governance_events.

        Intended to be called only when ``report.any_violated`` is True. It
        always inserts a new unresolved event and does not touch older ones.
        Failures are logged and not raised.
        """
        try:
            async with get_db_context() as session:
                event = AiGovernanceEvent(
                    event_type="slo_violation",
                    details=report.to_dict(),
                    resolved=False,
                )
                session.add(event)
                await session.commit()
                logger.warning(
                    "slo_violation_recorded",
                    violated_metrics=[m.name for m in report.metrics if m.violated],
                )
        except Exception as e:
            logger.error("slo_violation_save_error", error=str(e))

    async def run(self) -> SloReport:
        """
        Full run: calculate → cache → write an event if anything is violated.

        Returns:
            SloReport
        """
        report = await self.calculate()
        await self.cache_report(report)
        if report.any_violated:
            await self.save_violation_event(report)
        return report

    # ──────────────────────────────────────────────────────────────────────────
    # Private: SLO calculation helpers
    # ──────────────────────────────────────────────────────────────────────────

    @staticmethod
    def _rate_metric(
        name: str, threshold: float, direction: str, hits: int, total: int
    ) -> SloMetric:
        """Build a metric from ``hits / total``; too few samples yields value=None, not violated."""
        if total <= 0 or total < SLO_MIN_SAMPLES:
            return SloMetric(
                name=name,
                value=None,
                threshold=threshold,
                direction=direction,
                sample_count=total,
                violated=False,
            )
        rate = hits / total
        violated = rate < threshold if direction == "above" else rate > threshold
        return SloMetric(
            name=name,
            value=rate,
            threshold=threshold,
            direction=direction,
            sample_count=total,
            violated=violated,
        )

    @staticmethod
    def _failed_metric(name: str, threshold: float, direction: str) -> SloMetric:
        """Build the conservative result for a metric whose calculation raised.

        The module's design rule is "calculation failure → assume the SLO is
        violated", so the metric is flagged as violated rather than silently
        reported as healthy.
        """
        return SloMetric(
            name=name,
            value=None,
            threshold=threshold,
            direction=direction,
            sample_count=0,
            violated=True,
        )

    async def _calc_auto_success_rate(self, session, since) -> SloMetric:
        """SLO-1: success rate of auto_repair_executions created since ``since``."""
        name = "auto_execute_success_rate"
        try:
            total_q = await session.execute(
                select(func.count()).where(
                    AutoRepairExecution.created_at >= since
                )
            )
            total: int = total_q.scalar() or 0

            # Skip the second query when there are too few samples to judge.
            if total < SLO_MIN_SAMPLES:
                return self._rate_metric(name, SLO_AUTO_SUCCESS_MIN, "above", 0, total)

            success_q = await session.execute(
                select(func.count()).where(
                    AutoRepairExecution.created_at >= since,
                    AutoRepairExecution.success.is_(True),
                )
            )
            success: int = success_q.scalar() or 0
            return self._rate_metric(name, SLO_AUTO_SUCCESS_MIN, "above", success, total)
        except Exception as e:
            logger.warning("slo1_calc_error", error=str(e))
            return self._failed_metric(name, SLO_AUTO_SUCCESS_MIN, "above")

    async def _calc_human_override_rate(self, session, since) -> SloMetric:
        """
        SLO-2: human override rate = rejected approval records / all approval
        records created since ``since``.

        rejected = approval_records.status == 'rejected'

        NOTE(review): the original description restricted the population to AI
        proposals (requested_by LIKE 'ai_%' or 'system'), but the queries below
        apply no requested_by filter — confirm which one is intended.
        """
        name = "human_override_rate"
        try:
            total_q = await session.execute(
                select(func.count()).where(
                    ApprovalRecord.created_at >= since,
                )
            )
            total: int = total_q.scalar() or 0

            if total < SLO_MIN_SAMPLES:
                return self._rate_metric(name, SLO_OVERRIDE_RATE_MAX, "below", 0, total)

            rejected_q = await session.execute(
                select(func.count()).where(
                    ApprovalRecord.created_at >= since,
                    ApprovalRecord.status == "rejected",
                )
            )
            rejected: int = rejected_q.scalar() or 0
            return self._rate_metric(name, SLO_OVERRIDE_RATE_MAX, "below", rejected, total)
        except Exception as e:
            logger.warning("slo2_calc_error", error=str(e))
            return self._failed_metric(name, SLO_OVERRIDE_RATE_MAX, "below")

    async def _calc_false_neg_rate(self, session, since) -> SloMetric:
        """
        SLO-3: verifier false-negative rate (proxy metric).

        A successful auto-repair run that is followed, for the same
        incident_id, by a failed run within 2 hours is taken as "fixed but
        broke again", i.e. the verifier wrongly reported success.

        Implemented as a CTE of successful runs self-joined against later
        failed runs of the same incident.

        NOTE(review): the numerator counts DISTINCT incident_id while the
        denominator counts successful runs, so an incident with several
        successful runs is counted once above and several times below —
        confirm this is the intended definition.
        """
        name = "verifier_false_neg_rate"
        try:
            result = await session.execute(
                text("""
                    WITH success_runs AS (
                        SELECT incident_id, created_at
                        FROM auto_repair_executions
                        WHERE success = TRUE
                          AND created_at >= :since
                    ),
                    false_negs AS (
                        SELECT DISTINCT s.incident_id
                        FROM success_runs s
                        JOIN auto_repair_executions f
                          ON f.incident_id = s.incident_id
                         AND f.success = FALSE
                         AND f.created_at > s.created_at
                         AND f.created_at <= s.created_at + INTERVAL '2 hours'
                    )
                    SELECT
                        (SELECT COUNT(*) FROM success_runs) AS total_success,
                        (SELECT COUNT(*) FROM false_negs) AS false_neg_count
                """),
                {"since": since},
            )
            row = result.fetchone()
            total_success: int = row[0] if row else 0
            false_neg: int = row[1] if row else 0
            return self._rate_metric(
                name, SLO_FALSE_NEG_MAX, "below", false_neg, total_success
            )
        except Exception as e:
            logger.warning("slo3_calc_error", error=str(e))
            return self._failed_metric(name, SLO_FALSE_NEG_MAX, "below")

    async def _build_auto_success_diagnostics(self, session, since) -> dict[str, Any]:
        """Build explainable diagnostics for the auto_execute_success_rate SLO.

        Loads every auto-repair execution since ``since`` (joined to its
        incident to derive an alert name) and hands the rows to
        build_auto_execute_success_diagnostics. On any failure a
        ``diagnostics_unavailable`` stub is returned instead of raising.
        """
        try:
            result = await session.execute(
                text("""
                    SELECT
                        are.incident_id,
                        are.playbook_id,
                        are.playbook_name,
                        are.success,
                        are.error_message,
                        are.created_at,
                        COALESCE(
                            inc.signals->0->>'alertname',
                            inc.signals->0->'labels'->>'alertname',
                            inc.signals->0->>'alert_name',
                            inc.affected_services->>0,
                            'unknown'
                        ) AS alertname
                    FROM auto_repair_executions are
                    LEFT JOIN incidents inc ON inc.incident_id = are.incident_id
                    WHERE are.created_at >= :since
                    ORDER BY are.created_at ASC
                """),
                {"since": since},
            )
            rows = [dict(row._mapping) for row in result]
            return build_auto_execute_success_diagnostics(
                rows=rows,
                now=now_taipei(),
                threshold=SLO_AUTO_SUCCESS_MIN,
                window_days=SLO_WINDOW_DAYS,
                min_samples=SLO_MIN_SAMPLES,
            )
        except Exception as e:
            logger.warning("slo1_diagnostics_error", error=str(e))
            return {
                "schema_version": "ai_slo_auto_execute_diagnostics_v1",
                "status": "diagnostics_unavailable",
                "error": str(e)[:200],
            }
|
||
|
||
|
||
def build_auto_execute_success_diagnostics(
    rows: list[dict[str, Any]],
    now: datetime,
    threshold: float = SLO_AUTO_SUCCESS_MIN,
    window_days: int = SLO_WINDOW_DAYS,
    min_samples: int = SLO_MIN_SAMPLES,
) -> dict[str, Any]:
    """
    Turn auto_repair_executions rows into a diagnostics dict for the
    auto_execute_success_rate SLO.

    Pure logic (no I/O): it summarises totals, groups the failures, splits the
    groups into sealed vs. open by their closure status, and projects when the
    rolling window would be back above the threshold.
    """
    ordered = sorted(rows, key=lambda entry: entry.get("created_at") or now)
    failures = [entry for entry in ordered if not entry.get("success")]
    total = len(ordered)
    failed = len(failures)
    success = total - failed
    rate = success / total if total else None

    groups = _build_failure_groups(failures)
    sealed: list[dict[str, Any]] = []
    unresolved: list[dict[str, Any]] = []
    for group in groups:
        is_sealed = str(group.get("closure_status", "")).startswith("sealed_")
        (sealed if is_sealed else unresolved).append(group)

    green_at, reason = _project_auto_success_green_at(
        rows=ordered,
        now=now,
        threshold=threshold,
        window_days=window_days,
        min_samples=min_samples,
    )

    # Open groups take precedence over sealed ones when choosing the status.
    if not failed:
        status = "green"
    elif unresolved:
        status = "needs_investigation"
    elif sealed:
        status = "sealed_waiting_window"
    else:
        status = "insufficient_diagnostics"

    summary = {
        "total": total,
        "success": success,
        "failed": failed,
        "rate": rate,
        "threshold": threshold,
        "window_days": window_days,
        "min_samples": min_samples,
    }
    return {
        "schema_version": "ai_slo_auto_execute_diagnostics_v1",
        "status": status,
        "summary": summary,
        "top_failure_groups": groups[:5],
        "sealed_failure_group_count": len(sealed),
        "open_failure_group_count": len(unresolved),
        "immediate_successes_needed": _successes_needed_now(success, total, threshold),
        "projected_green_at": green_at.isoformat() if green_at else None,
        "projection_reason": reason,
        "next_action": _auto_execute_diagnostics_next_action(status),
    }
|
||
|
||
|
||
def _build_failure_groups(failures: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
||
groups: dict[tuple[str, str, str, str], dict[str, Any]] = {}
|
||
for row in failures:
|
||
alertname = str(row.get("alertname") or "unknown")
|
||
playbook_id = str(row.get("playbook_id") or "unknown")
|
||
playbook_name = str(row.get("playbook_name") or "unknown")
|
||
error_signature = _auto_repair_error_signature(row.get("error_message"))
|
||
key = (alertname, playbook_id, playbook_name, error_signature)
|
||
group = groups.setdefault(
|
||
key,
|
||
{
|
||
"alertname": alertname,
|
||
"playbook_id": playbook_id,
|
||
"playbook_name": playbook_name,
|
||
"error_signature": error_signature,
|
||
"count": 0,
|
||
"first_seen": None,
|
||
"last_seen": None,
|
||
"example_incident_id": row.get("incident_id"),
|
||
},
|
||
)
|
||
group["count"] += 1
|
||
created_at = row.get("created_at")
|
||
if isinstance(created_at, datetime):
|
||
if group["first_seen"] is None or created_at < group["first_seen"]:
|
||
group["first_seen"] = created_at
|
||
if group["last_seen"] is None or created_at > group["last_seen"]:
|
||
group["last_seen"] = created_at
|
||
|
||
enriched = []
|
||
for group in groups.values():
|
||
closure = _classify_auto_repair_failure_closure(group)
|
||
enriched.append({
|
||
**group,
|
||
"first_seen": group["first_seen"].isoformat() if group["first_seen"] else None,
|
||
"last_seen": group["last_seen"].isoformat() if group["last_seen"] else None,
|
||
**closure,
|
||
})
|
||
|
||
return sorted(enriched, key=lambda item: item["count"], reverse=True)
|
||
|
||
|
||
def _auto_repair_error_signature(error_message: Any) -> str:
|
||
error = str(error_message or "").strip().lower()
|
||
if not error:
|
||
return "missing_error_message"
|
||
if "unsupported scheme" in error and "docker restart" in error:
|
||
return "legacy_ssh_docker_restart"
|
||
if "nodes" in error and "not found" in error:
|
||
return "k3s_node_target_not_found"
|
||
if "http error" in error:
|
||
return "http_error"
|
||
if "timeout" in error:
|
||
return "timeout"
|
||
compact = " ".join(error.split())
|
||
return compact[:120] or "unknown_error"
|
||
|
||
|
||
def _classify_auto_repair_failure_closure(group: dict[str, Any]) -> dict[str, str]:
|
||
signature = str(group.get("error_signature") or "")
|
||
alertname = str(group.get("alertname") or "")
|
||
playbook_name = str(group.get("playbook_name") or "")
|
||
text = f"{alertname} {playbook_name}".lower()
|
||
|
||
if signature == "legacy_ssh_docker_restart":
|
||
return {
|
||
"closure_status": "sealed_by_mcp_grant",
|
||
"closure_label": "已封口:Docker restart 已改走 ssh_docker_restart/write MCP grant",
|
||
"recommended_action": "觀察後續 DockerContainerUnhealthy 執行,不回填舊歷史",
|
||
}
|
||
|
||
if signature == "k3s_node_target_not_found" and (
|
||
"stock" in text or "wooo.work" in text or "external" in text
|
||
):
|
||
return {
|
||
"closure_status": "sealed_by_external_site_guard",
|
||
"closure_label": "已封口:外部站台告警已阻擋 K3s node PlayBook 誤配",
|
||
"recommended_action": "觀察 StockWoooWorkDown 是否改走 external_site_down / NO_ACTION",
|
||
}
|
||
|
||
return {
|
||
"closure_status": "open_failure_source",
|
||
"closure_label": "待調查:尚未匹配到已封口修復來源",
|
||
"recommended_action": "反查 incident truth-chain、PlayBook、MCP 執行紀錄",
|
||
}
|
||
|
||
|
||
def _successes_needed_now(success: int, total: int, threshold: float) -> int:
|
||
if total <= 0 or threshold >= 1:
|
||
return 0
|
||
gap = (threshold * total) - success
|
||
if gap <= 0:
|
||
return 0
|
||
return max(0, ceil(gap / (1 - threshold)))
|
||
|
||
|
||
def _project_auto_success_green_at(
|
||
rows: list[dict[str, Any]],
|
||
now: datetime,
|
||
threshold: float,
|
||
window_days: int,
|
||
min_samples: int,
|
||
) -> tuple[datetime | None, str | None]:
|
||
window = timedelta(days=window_days)
|
||
current_rows = [
|
||
row for row in rows
|
||
if isinstance(row.get("created_at"), datetime)
|
||
and row["created_at"] >= now - window
|
||
]
|
||
current_total = len(current_rows)
|
||
current_success = sum(1 for row in current_rows if bool(row.get("success")))
|
||
|
||
if current_total < min_samples:
|
||
return now, "sample_window_below_min"
|
||
if current_success / current_total >= threshold:
|
||
return now, "already_green"
|
||
|
||
candidates = sorted({
|
||
row["created_at"] + window + timedelta(seconds=1)
|
||
for row in current_rows
|
||
if row["created_at"] + window > now
|
||
})
|
||
for checkpoint in candidates:
|
||
active_rows = [
|
||
row for row in rows
|
||
if isinstance(row.get("created_at"), datetime)
|
||
and row["created_at"] >= checkpoint - window
|
||
and row["created_at"] <= checkpoint
|
||
]
|
||
active_total = len(active_rows)
|
||
active_success = sum(1 for row in active_rows if bool(row.get("success")))
|
||
if active_total < min_samples:
|
||
return checkpoint, "sample_window_below_min"
|
||
if active_success / active_total >= threshold:
|
||
return checkpoint, "rolling_window_if_no_new_failures"
|
||
|
||
return None, "no_projection_available"
|
||
|
||
|
||
def _auto_execute_diagnostics_next_action(status: str) -> str:
|
||
if status == "green":
|
||
return "keep_monitoring"
|
||
if status == "sealed_waiting_window":
|
||
return "observe_rolling_window_no_manual_restart"
|
||
if status == "needs_investigation":
|
||
return "investigate_open_failure_groups"
|
||
return "refresh_truth_chain_and_execution_logs"
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Singleton
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Module-level instance, created on first use by get_ai_slo_calculator().
_calculator: AiSloCalculator | None = None


def get_ai_slo_calculator() -> AiSloCalculator:
    """Return the shared AiSloCalculator, creating it on the first call."""
    global _calculator
    if _calculator is not None:
        return _calculator
    _calculator = AiSloCalculator()
    return _calculator
|