Files
awoooi/apps/api/src/services/model_rollback_service.py
OG T 77a92eb469
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 14m59s
feat(P6): 提交 offline_replay_service + model_rollback_service (漏提)
Phase 6 ADR-087 治理閉環兩個核心服務,
之前建立後沒有 git add,一直是 untracked 狀態。

2026-04-15 Claude Sonnet 4.6 Asia/Taipei
2026-04-15 22:29:09 +08:00

242 lines
9.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
AWOOOI AIOps Phase 6 — Model Rollback 服務(決策衰退偵測)
==========================================================
職責:讀取最近 N 週的離線回放一致率,偵測連續衰退趨勢,
在確認衰退後寫入 ai_governance_events(conservative_mode),
並觸發 Telegram 告警通知 SRE 評估 retrain。
衰退定義(符合任一條件即建議 retrain):
- 連續 4 週一致率下降(week[i] < week[i-1])
- 最新一週一致率 < 55%(絕對閾值)
設計原則:
1. 不直接執行 retrain / model rollback — 只發通知 + 寫 governance event
2. 自我降級連動:偵測到衰退 → conservative_mode event → decision_manager 讀取
3. best-effort — 任何錯誤靜默記錄,不影響主流程
4. feature flag: AIOPS_P6_GOVERNANCE_ENABLED
ADR-087 Phase 6: 自我治理閉環
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 6 初始建立
"""
from __future__ import annotations
import uuid
from dataclasses import dataclass
import structlog
from sqlalchemy import and_, desc, select
from src.db.base import get_db_context
from src.db.models import AiGovernanceEvent
from src.utils.timezone import now_taipei
logger = structlog.get_logger(__name__)
# ─────────────────────────────────────────────────────────────────────────────
# Constants
# ─────────────────────────────────────────────────────────────────────────────
CONSECUTIVE_DECLINE_WEEKS = 4 # Number of back-to-back declines required before triggering
ABSOLUTE_FLOOR_RATE = 0.55 # A consistency rate below this value is treated as severe
RETRAIN_COOLDOWN_DAYS = 14 # Minimum gap between two conservative_mode events
# ─────────────────────────────────────────────────────────────────────────────
# Result Types
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class RollbackCheckResult:
    """Outcome of a single decline-detection pass.

    Every field has a neutral default, so ``RollbackCheckResult()`` stands
    for "nothing checked, nothing triggered".
    """

    # How many replay records were looked at.
    checked_weeks: int = 0
    # Consistency rates of those records, ordered oldest -> newest.
    # ``None`` is only a placeholder default; it is swapped for a fresh
    # list in ``__post_init__`` so instances never share one list object.
    consistency_rates: list[float] = None  # type: ignore[assignment]
    # Length of the unbroken run of declines ending at the newest record.
    consecutive_declines: int = 0
    # True when the newest rate is below the absolute floor.
    absolute_floor_breached: bool = False
    # True when either the decline run or the floor breach calls for action.
    retrain_recommended: bool = False
    # True once a conservative_mode governance event has been written.
    conservative_mode_triggered: bool = False
    # True when a recent conservative_mode event suppressed a new one.
    cooldown_active: bool = False

    def __post_init__(self) -> None:
        """Replace the ``None`` placeholder with a per-instance empty list."""
        if self.consistency_rates is not None:
            return
        self.consistency_rates = []
# ─────────────────────────────────────────────────────────────────────────────
# Service
# ─────────────────────────────────────────────────────────────────────────────
class ModelRollbackService:
    """Detects sustained decline in offline-replay consistency.

    The service never retrains or rolls a model back itself. When a decline
    is confirmed it writes a ``conservative_mode`` governance event and sends
    an alert so that an SRE can decide what to do.

    Usage:
        svc = ModelRollbackService()
        result = await svc.check()
    """

    async def check(self, force: bool = False) -> RollbackCheckResult:
        """Run the decline check as a best-effort operation.

        Args:
            force: When True, run even if the
                ``AIOPS_P6_GOVERNANCE_ENABLED`` feature flag is off.

        Returns:
            The populated check result, or an all-defaults
            ``RollbackCheckResult`` when the check is skipped or fails.
        """
        # Imported lazily, as in the rest of this module's call-time imports.
        from src.core.feature_flags import aiops_flags

        if not force and not aiops_flags.AIOPS_P6_GOVERNANCE_ENABLED:
            logger.debug("model_rollback_check_skipped")
            return RollbackCheckResult()

        try:
            return await self._check()
        except Exception as e:
            # Best-effort boundary: never let governance checks break the
            # caller, but keep the traceback so failures stay diagnosable.
            logger.error("model_rollback_check_error", error=str(e), exc_info=True)
            return RollbackCheckResult()

    async def _check(self) -> RollbackCheckResult:
        """Load the latest replay events and evaluate the decline rules.

        Returns:
            A result describing the rates examined and which rules fired.
            With fewer than two replay events only ``checked_weeks`` is set.
        """
        # N declines need N + 1 data points.
        stmt = (
            select(AiGovernanceEvent)
            .where(
                AiGovernanceEvent.event_type.in_(["replay_ok", "replay_degraded"])
            )
            .order_by(desc(AiGovernanceEvent.triggered_at))
            .limit(CONSECUTIVE_DECLINE_WEEKS + 1)
        )

        async with get_db_context() as db:
            events = list((await db.execute(stmt)).scalars().all())
            # Read ORM attributes while the session is still open: once the
            # context exits they may be expired and can no longer be loaded
            # lazily from async code. Query order is newest first, so flip
            # it to oldest -> newest here.
            # A record without a consistency_rate counts as 1.0 (no decline).
            rates = [
                float((ev.details or {}).get("consistency_rate", 1.0))
                for ev in reversed(events)
            ]

        if len(rates) < 2:
            logger.info("model_rollback_insufficient_history", count=len(rates))
            return RollbackCheckResult(checked_weeks=len(rates))

        consecutive = self._count_consecutive_declines(rates)
        latest_rate = rates[-1]
        absolute_floor = latest_rate < ABSOLUTE_FLOOR_RATE
        # Either rule on its own is enough to recommend action.
        retrain_recommended = (
            consecutive >= CONSECUTIVE_DECLINE_WEEKS or absolute_floor
        )

        result_obj = RollbackCheckResult(
            checked_weeks=len(rates),
            consistency_rates=rates,
            consecutive_declines=consecutive,
            absolute_floor_breached=absolute_floor,
            retrain_recommended=retrain_recommended,
        )

        if retrain_recommended:
            await self._maybe_trigger_conservative_mode(result_obj)

        logger.info(
            "model_rollback_check_done",
            consecutive_declines=consecutive,
            latest_rate=latest_rate,
            retrain_recommended=retrain_recommended,
        )
        return result_obj

    @staticmethod
    def _count_consecutive_declines(rates: list[float]) -> int:
        """Count the unbroken run of declines that ends at the newest rate.

        Args:
            rates: Consistency rates ordered oldest -> newest.

        Returns:
            Number of successive newest-to-older steps where the newer rate
            is strictly lower than the one before it. Equal rates end the
            run. Zero for fewer than two rates.
        """
        streak = 0
        # Pair each rate with its predecessor, walking from newest to oldest.
        for newer, older in zip(reversed(rates), reversed(rates[:-1])):
            if newer >= older:
                break
            streak += 1
        return streak

    async def _maybe_trigger_conservative_mode(
        self, result: RollbackCheckResult
    ) -> None:
        """Write a ``conservative_mode`` event unless one is still recent.

        If a ``conservative_mode`` event exists within the last
        ``RETRAIN_COOLDOWN_DAYS`` days, only ``result.cooldown_active`` is
        set. Otherwise a new event is inserted,
        ``result.conservative_mode_triggered`` is set and an alert is sent.

        Args:
            result: The check result; mutated in place to record the outcome.
        """
        from datetime import timedelta

        from sqlalchemy import insert as sa_insert

        cutoff = now_taipei() - timedelta(days=RETRAIN_COOLDOWN_DAYS)
        cooldown_stmt = (
            select(AiGovernanceEvent)
            .where(
                and_(
                    AiGovernanceEvent.event_type == "conservative_mode",
                    AiGovernanceEvent.triggered_at >= cutoff,
                )
            )
            .limit(1)
        )

        async with get_db_context() as db:
            recent_evt = (await db.execute(cooldown_stmt)).scalar_one_or_none()
            if recent_evt is not None:
                result.cooldown_active = True
                logger.info(
                    "model_rollback_conservative_mode_cooldown",
                    cooldown_days=RETRAIN_COOLDOWN_DAYS,
                )
                return

            # NOTE(review): triggered_at is not set here, so the cooldown
            # query above relies on a column default — confirm on the model.
            # NOTE(review): no explicit commit; presumably get_db_context
            # commits on exit — confirm.
            await db.execute(
                sa_insert(AiGovernanceEvent).values(
                    id=str(uuid.uuid4()),
                    event_type="conservative_mode",
                    details={
                        "consecutive_declines": result.consecutive_declines,
                        "consistency_rates": result.consistency_rates,
                        "absolute_floor_breached": result.absolute_floor_breached,
                        "triggered_by": "model_rollback_service",
                    },
                    resolved=False,
                )
            )

        # The session scope has closed without error, so report the trigger
        # and alert only now; the notification call never holds a session.
        result.conservative_mode_triggered = True
        logger.warning(
            "model_rollback_conservative_mode_triggered",
            consecutive_declines=result.consecutive_declines,
            rates=result.consistency_rates,
        )
        # Best-effort alert.
        await self._notify_retrain_needed(result)

    @staticmethod
    def _format_retrain_alert(result: RollbackCheckResult) -> str:
        """Build the alert text sent when a decline has been confirmed.

        Args:
            result: Any object exposing ``consecutive_declines``,
                ``consistency_rates`` and ``absolute_floor_breached``.

        Returns:
            The multi-line alert message.
        """
        rates = result.consistency_rates
        latest = f"{rates[-1]:.1%}" if rates else "N/A"
        # Both branches used to render as an empty string, which left this
        # line of the alert blank whether or not the floor was breached.
        breached = "是" if result.absolute_floor_breached else "否"
        return (
            "⚠️ AI 決策品質衰退偵測\n\n"
            f"連續衰退次數: {result.consecutive_declines}\n"
            f"最近一致率: {latest}\n"
            f"絕對閾值突破: {breached}\n\n"
            "建議動作: 評估 fine-tune retrain 或回滾至上一版本"
        )

    async def _notify_retrain_needed(self, result: RollbackCheckResult) -> None:
        """Send a warning-level system alert asking for a retrain review.

        Failures are logged and swallowed so that alerting can never undo or
        mask the governance event that was already written.

        Args:
            result: The check result used to fill in the alert text.
        """
        try:
            from src.services.notification_service import get_notification_service

            svc = get_notification_service()
            msg = self._format_retrain_alert(result)
            await svc.send_system_alert(message=msg, level="warning")
        except Exception as e:
            logger.warning("model_rollback_notify_failed", error=str(e))
# ─────────────────────────────────────────────────────────────────────────────
# Singleton
# ─────────────────────────────────────────────────────────────────────────────
# Lazily created, module-wide service instance (None until first requested).
_instance: ModelRollbackService | None = None


def get_model_rollback_service() -> ModelRollbackService:
    """Return the shared ``ModelRollbackService``, creating it on first use."""
    global _instance
    if _instance is not None:
        return _instance
    _instance = ModelRollbackService()
    return _instance