All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 14m59s
Phase 6 ADR-087 治理閉環兩個核心服務, 之前建立後沒有 git add,一直是 untracked 狀態。 2026-04-15 Claude Sonnet 4.6 Asia/Taipei
242 lines
9.9 KiB
Python
242 lines
9.9 KiB
Python
"""
|
||
AWOOOI AIOps Phase 6 — Model Rollback 服務(決策衰退偵測)
|
||
==========================================================
|
||
職責:讀取最近 N 週的離線回放一致率,偵測連續衰退趨勢,
|
||
在確認衰退後寫入 ai_governance_events(conservative_mode)
|
||
並觸發 Telegram 告警通知 SRE 評估 retrain。
|
||
|
||
衰退定義:
|
||
- 連續 4 週一致率下降(week[i] < week[i-1])
|
||
- 最新一週一致率 < 55%(絕對閾值)
|
||
|
||
設計原則:
|
||
1. 不直接執行 retrain / model rollback — 只發通知 + 寫 governance event
|
||
2. 自我降級連動:偵測到衰退 → conservative_mode event → decision_manager 讀取
|
||
3. best-effort — 任何錯誤靜默記錄,不影響主流程
|
||
4. feature flag: AIOPS_P6_GOVERNANCE_ENABLED
|
||
|
||
ADR-087 Phase 6: 自我治理閉環
|
||
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 6 初始建立
|
||
"""
|
||
|
||
from __future__ import annotations

import uuid
from dataclasses import dataclass, field

import structlog
from sqlalchemy import and_, desc, select

from src.db.base import get_db_context
from src.db.models import AiGovernanceEvent
from src.utils.timezone import now_taipei
|
||
|
||
# Module-level structlog logger used by every function in this file.
logger = structlog.get_logger(__name__)
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Constants
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
CONSECUTIVE_DECLINE_WEEKS = 4  # trigger only after N consecutive weekly declines
|
||
ABSOLUTE_FLOOR_RATE = 0.55  # a consistency rate below this value is treated as severe
|
||
RETRAIN_COOLDOWN_DAYS = 14  # minimum gap (days) between two conservative_mode events
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Result Types
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
@dataclass
class RollbackCheckResult:
    """Outcome of one decay-detection run.

    Every field has a neutral default, so a bare ``RollbackCheckResult()``
    represents "nothing checked / nothing detected".
    """

    # Number of replay records that were examined.
    checked_weeks: int = 0
    # Consistency rates of the examined records, ordered oldest → newest.
    # A default factory gives each instance its own list and keeps the
    # declared type honest (no ``None`` default, no ``type: ignore``).
    consistency_rates: list[float] = field(default_factory=list)
    # Length of the most recent unbroken run of week-over-week declines.
    consecutive_declines: int = 0
    # True when the newest rate is below the absolute floor.
    absolute_floor_breached: bool = False
    # True when either decay rule fired.
    retrain_recommended: bool = False
    # True when a new conservative_mode event was written by this run.
    conservative_mode_triggered: bool = False
    # True when writing was suppressed because a recent event already exists.
    cooldown_active: bool = False

    def __post_init__(self) -> None:
        """Normalise an explicitly passed ``None`` to an empty list.

        Kept for backward compatibility with callers that still pass
        ``consistency_rates=None``.
        """
        if self.consistency_rates is None:
            self.consistency_rates = []
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Service
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
class ModelRollbackService:
    """Detect decision-quality decay from offline replay results.

    Reads the most recent ``replay_ok`` / ``replay_degraded`` governance
    events, measures how long the consistency rate has been falling and,
    when decay is confirmed, writes a ``conservative_mode`` governance event
    and sends an alert. It never retrains or rolls a model back itself.

    Usage:
        svc = ModelRollbackService()
        result = await svc.check()
    """

    async def check(self, force: bool = False) -> RollbackCheckResult:
        """Check whether the consistency rate shows a decay trend.

        Best-effort entry point: any exception raised while checking is
        logged and an empty result is returned instead.

        Args:
            force: When True, run even if the
                ``AIOPS_P6_GOVERNANCE_ENABLED`` feature flag is off.

        Returns:
            The detection result, or an empty ``RollbackCheckResult`` when
            the check was skipped or failed.
        """
        # Function-scope import kept from the original; presumably avoids an
        # import cycle — TODO confirm.
        from src.core.feature_flags import aiops_flags

        if not force and not aiops_flags.AIOPS_P6_GOVERNANCE_ENABLED:
            logger.debug("model_rollback_check_skipped")
            return RollbackCheckResult()

        try:
            return await self._check()
        except Exception as e:
            # Deliberate swallow (best-effort), but keep the traceback so a
            # failing check can actually be diagnosed from the logs.
            logger.error("model_rollback_check_error", error=str(e), exc_info=True)
            return RollbackCheckResult()

    @staticmethod
    def _extract_rate(details: object) -> float:
        """Return the ``consistency_rate`` stored in an event's details.

        A missing payload or missing key yields 1.0, as before. A stored
        value that cannot be converted to ``float`` (e.g. ``None``) also
        yields 1.0 and is logged, instead of raising and aborting the whole
        check for as long as that record stays in the window.
        """
        raw = (details or {}).get("consistency_rate", 1.0)
        try:
            return float(raw)
        except (TypeError, ValueError):
            # NOTE(review): 1.0 mirrors the existing missing-key default;
            # confirm whether such records should be skipped instead.
            logger.warning(
                "model_rollback_invalid_consistency_rate", value=repr(raw)
            )
            return 1.0

    @staticmethod
    def _count_consecutive_declines(rates: list[float]) -> int:
        """Count the unbroken run of strict declines ending at the newest rate.

        Args:
            rates: Consistency rates ordered oldest → newest.

        Returns:
            How many of the most recent steps had a rate strictly lower than
            its predecessor; 0 when fewer than two rates are given.
        """
        newest_first = rates[::-1]
        streak = 0
        for current, previous in zip(newest_first, newest_first[1:]):
            if current < previous:
                streak += 1
            else:
                break
        return streak

    async def _check(self) -> RollbackCheckResult:
        """Load the latest replay events and evaluate the decay rules.

        Returns:
            A populated ``RollbackCheckResult``. With fewer than two replay
            records only ``checked_weeks`` is set.
        """
        async with get_db_context() as db:
            # Latest CONSECUTIVE_DECLINE_WEEKS + 1 replay records (ok and
            # degraded): N + 1 points are needed to observe N declines.
            stmt = (
                select(AiGovernanceEvent)
                .where(
                    AiGovernanceEvent.event_type.in_(["replay_ok", "replay_degraded"])
                )
                .order_by(desc(AiGovernanceEvent.triggered_at))
                .limit(CONSECUTIVE_DECLINE_WEEKS + 1)
            )
            events = list((await db.execute(stmt)).scalars().all())
            # Read the payloads while the session is still open so that no
            # attribute access happens on detached ORM objects.
            newest_first = [self._extract_rate(ev.details) for ev in events]

        if len(newest_first) < 2:
            logger.info("model_rollback_insufficient_history", count=len(newest_first))
            return RollbackCheckResult(checked_weeks=len(newest_first))

        # Oldest → newest, the order exposed on the result object.
        rates = newest_first[::-1]
        consecutive = self._count_consecutive_declines(rates)
        absolute_floor = rates[-1] < ABSOLUTE_FLOOR_RATE
        # Either rule on its own is enough to recommend a retrain.
        retrain_recommended = (
            consecutive >= CONSECUTIVE_DECLINE_WEEKS or absolute_floor
        )

        result_obj = RollbackCheckResult(
            checked_weeks=len(rates),
            consistency_rates=rates,
            consecutive_declines=consecutive,
            absolute_floor_breached=absolute_floor,
            retrain_recommended=retrain_recommended,
        )

        if retrain_recommended:
            await self._maybe_trigger_conservative_mode(result_obj)

        logger.info(
            "model_rollback_check_done",
            consecutive_declines=consecutive,
            latest_rate=rates[-1],
            retrain_recommended=retrain_recommended,
        )
        return result_obj

    async def _maybe_trigger_conservative_mode(
        self, result: RollbackCheckResult
    ) -> None:
        """Write a ``conservative_mode`` event unless one is still in cooldown.

        If a ``conservative_mode`` event was triggered within the last
        ``RETRAIN_COOLDOWN_DAYS`` days, only ``result.cooldown_active`` is
        set. Otherwise a new event is inserted, the alert is sent and
        ``result.conservative_mode_triggered`` is set.

        Args:
            result: Detection result; updated in place.
        """
        from datetime import timedelta

        from sqlalchemy import insert as sa_insert

        cutoff = now_taipei() - timedelta(days=RETRAIN_COOLDOWN_DAYS)

        async with get_db_context() as db:
            # Is there already an event inside the cooldown window?
            stmt = (
                select(AiGovernanceEvent)
                .where(
                    and_(
                        AiGovernanceEvent.event_type == "conservative_mode",
                        AiGovernanceEvent.triggered_at >= cutoff,
                    )
                )
                .limit(1)
            )
            recent_evt = (await db.execute(stmt)).scalar_one_or_none()

            if recent_evt is not None:
                result.cooldown_active = True
                logger.info(
                    "model_rollback_conservative_mode_cooldown",
                    cooldown_days=RETRAIN_COOLDOWN_DAYS,
                )
                return

            # NOTE(review): no explicit commit here — this relies on
            # get_db_context committing on a clean exit; confirm.
            await db.execute(
                sa_insert(AiGovernanceEvent).values(
                    id=str(uuid.uuid4()),
                    event_type="conservative_mode",
                    details={
                        "consecutive_declines": result.consecutive_declines,
                        "consistency_rates": result.consistency_rates,
                        "absolute_floor_breached": result.absolute_floor_breached,
                        "triggered_by": "model_rollback_service",
                    },
                    resolved=False,
                )
            )

        # Only reached after the session block exited without raising, so
        # the flag and the alert are never emitted for a failed write.
        result.conservative_mode_triggered = True
        logger.warning(
            "model_rollback_conservative_mode_triggered",
            consecutive_declines=result.consecutive_declines,
            rates=result.consistency_rates,
        )

        # Alert is best-effort; failures are handled inside the helper.
        await self._notify_retrain_needed(result)

    async def _notify_retrain_needed(self, result: RollbackCheckResult) -> None:
        """Send a warning-level system alert asking SRE to evaluate a retrain.

        Described by the original author as a Telegram Tier-2 notification.
        Best-effort: any failure (including building the message) is logged
        and swallowed.

        Args:
            result: Detection result used to build the message text.
        """
        try:
            from src.services.notification_service import get_notification_service

            svc = get_notification_service()
            msg = (
                "⚠️ AI 決策品質衰退偵測\n\n"
                f"連續衰退次數: {result.consecutive_declines} 週\n"
                f"最近一致率: {result.consistency_rates[-1]:.1%}\n"
                f"絕對閾值突破: {'是' if result.absolute_floor_breached else '否'}\n\n"
                "建議動作: 評估 fine-tune retrain 或回滾至上一版本"
            )
            await svc.send_system_alert(message=msg, level="warning")
        except Exception as e:
            logger.warning("model_rollback_notify_failed", error=str(e))
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Singleton
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Lazily created singleton; populated on the first call to get_model_rollback_service().
_instance: ModelRollbackService | None = None
|
||
|
||
|
||
def get_model_rollback_service() -> ModelRollbackService:
    """Return the shared ``ModelRollbackService``, creating it on first use."""
    global _instance
    if _instance is not None:
        return _instance
    _instance = ModelRollbackService()
    return _instance
|