""" AWOOOI AIOps Phase 4 — Dynamic Baseline Service(動態基線服務) ============================================================= 職責:Holt-Winters 指數平滑,偵測 Prometheus metric 異常偏離 核心 API: is_anomaly(metric_name, current_value) -> AnomalyResult update_baseline(metric_name, datapoints) 設計原則: - Shadow Mode(AIOPS_P4_SHADOW_MODE=True):只記錄,不觸發 Alert - 熔斷保護:statsmodels 失敗 → fallback 到滑動平均 - 7 天歷史資料最少訓練量(低於此閾值 → skip,不誤判) - 基線持久化到 Redis(key: baseline:{metric_name},TTL 24h) - 訓練在 background worker 執行,not in webhook handler ADR-084: Phase 4 動態異常偵測源頭升級 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 4 初始建立 """ from __future__ import annotations import json import math from dataclasses import dataclass, field from typing import Any import structlog from src.utils.timezone import now_taipei logger = structlog.get_logger(__name__) # ── 常數 ──────────────────────────────────────────────────────────────────── MIN_DATAPOINTS = 168 # 7 天 × 24h 最少樣本數(才能訓練季節性模型) SIGMA_THRESHOLD = 3.0 # 偏差 ≥ 3σ → 異常 REDIS_TTL_SEC = 86400 # 基線 Redis TTL = 24h REDIS_KEY_PREFIX = "baseline:" HISTORY_WINDOW_HOURS = 336 # 保留 14 天歷史 # ───────────────────────────────────────────────────────────────────────────── # Data Types # ───────────────────────────────────────────────────────────────────────────── @dataclass class MetricDatapoint: """單一 metric 時序資料點""" timestamp: float # Unix epoch value: float @dataclass class BaselineState: """Holt-Winters 訓練後的基線狀態(Redis 持久化)""" metric_name: str mean: float std: float seasonal_factors: list[float] = field(default_factory=list) # 24h 週期 last_trained_at: str = "" datapoint_count: int = 0 def to_dict(self) -> dict[str, Any]: return { "metric_name": self.metric_name, "mean": self.mean, "std": self.std, "seasonal_factors": self.seasonal_factors, "last_trained_at": self.last_trained_at, "datapoint_count": self.datapoint_count, } @classmethod def from_dict(cls, d: dict[str, Any]) -> BaselineState: return cls( metric_name=d["metric_name"], mean=d["mean"], std=d["std"], seasonal_factors=d.get("seasonal_factors", []), last_trained_at=d.get("last_trained_at", ""), datapoint_count=d.get("datapoint_count", 0), ) @dataclass class AnomalyResult: """is_anomaly() 回傳結果""" metric_name: str current_value: float is_anomaly: bool deviation_sigma: float # 偏差 σ 數(>3 = 異常) expected_mean: float expected_std: float direction: str = "none" # "up" / "down" / "none" shadow_mode: bool = True # True = 只記錄,不觸發 reason: str = "" # ───────────────────────────────────────────────────────────────────────────── # Main Service # ───────────────────────────────────────────────────────────────────────────── class DynamicBaselineService: """ 動態基線服務 兩大功能: 1. train_baseline() — 從 Prometheus 抓歷史資料,用 Holt-Winters 訓練 2. is_anomaly() — 即時判斷當前值是否偏離基線 ≥ 3σ """ async def train_baseline( self, metric_name: str, promql: str, lookback_hours: int = HISTORY_WINDOW_HOURS, ) -> BaselineState | None: """ 從 Prometheus 抓取歷史資料並訓練基線。 Args: metric_name: 基線識別名(e.g. "cpu_usage_node_mon") promql: Prometheus query(e.g. "avg(rate(node_cpu_seconds_total[5m]))") lookback_hours: 歷史視窗(預設 14 天) Returns: BaselineState(已存 Redis);資料不足 → None """ try: datapoints = await self._fetch_prometheus_history(promql, lookback_hours) if len(datapoints) < MIN_DATAPOINTS: logger.info( "baseline_insufficient_data", metric=metric_name, count=len(datapoints), required=MIN_DATAPOINTS, ) return None state = self._fit_holt_winters(metric_name, datapoints) await self._save_baseline(state, promql=promql, lookback_hours=lookback_hours) logger.info( "baseline_trained", metric=metric_name, mean=f"{state.mean:.4f}", std=f"{state.std:.4f}", datapoints=len(datapoints), ) return state except Exception: logger.exception("baseline_train_failed", metric=metric_name) return None async def is_anomaly( self, metric_name: str, current_value: float, hour_of_day: int | None = None, ) -> AnomalyResult: """ 即時異常判斷。 Args: metric_name: 基線識別名 current_value: 當前觀測值 hour_of_day: 當前小時(0-23),用於套用 seasonal factor;None = 不套用 Returns: AnomalyResult """ from src.core.feature_flags import aiops_flags shadow_mode = aiops_flags.AIOPS_P4_SHADOW_MODE try: state = await self._load_baseline(metric_name) if state is None: return AnomalyResult( metric_name=metric_name, current_value=current_value, is_anomaly=False, deviation_sigma=0.0, expected_mean=current_value, expected_std=0.0, reason="no_baseline_available", shadow_mode=shadow_mode, ) # 套用 seasonal factor(如果有 24h 週期資料) expected_mean = state.mean if hour_of_day is not None and len(state.seasonal_factors) == 24: expected_mean *= state.seasonal_factors[hour_of_day] expected_std = state.std if state.std > 0 else 1e-9 deviation = abs(current_value - expected_mean) sigma = deviation / expected_std anomaly = sigma >= SIGMA_THRESHOLD direction = "none" if anomaly: direction = "up" if current_value > expected_mean else "down" result = AnomalyResult( metric_name=metric_name, current_value=current_value, is_anomaly=anomaly, deviation_sigma=round(sigma, 2), expected_mean=round(expected_mean, 4), expected_std=round(expected_std, 4), direction=direction, shadow_mode=shadow_mode, reason=f"deviation {sigma:.1f}σ from baseline" if anomaly else "within_normal_range", ) if anomaly: logger.info( "dynamic_anomaly_detected", metric=metric_name, value=current_value, expected=expected_mean, sigma=sigma, direction=direction, shadow_mode=shadow_mode, ) return result except Exception as e: logger.warning("baseline_anomaly_check_failed", metric=metric_name, error=str(e)) return AnomalyResult( metric_name=metric_name, current_value=current_value, is_anomaly=False, deviation_sigma=0.0, expected_mean=0.0, expected_std=0.0, reason=f"check_error:{e}", shadow_mode=shadow_mode, ) # ────────────────────────────────────────────────────────────────────────── # Private Helpers # ────────────────────────────────────────────────────────────────────────── async def _fetch_prometheus_history( self, promql: str, lookback_hours: int, ) -> list[MetricDatapoint]: """從 Prometheus query_range API 抓取歷史資料(1h 步進)。""" import httpx from src.core.config import settings end_ts = now_taipei().timestamp() start_ts = end_ts - lookback_hours * 3600 try: async with httpx.AsyncClient(timeout=30.0) as client: resp = await client.get( f"{settings.PROMETHEUS_URL}/api/v1/query_range", params={ "query": promql, "start": start_ts, "end": end_ts, "step": "3600", # 1h 步進 }, ) resp.raise_for_status() data = resp.json() results = data.get("data", {}).get("result", []) if not results: return [] # 取第一個 time series values = results[0].get("values", []) return [ MetricDatapoint(timestamp=float(ts), value=float(v)) for ts, v in values if v != "NaN" ] except Exception as e: logger.warning("prometheus_history_fetch_failed", error=str(e)) return [] def _fit_holt_winters( self, metric_name: str, datapoints: list[MetricDatapoint], ) -> BaselineState: """ 用 statsmodels Holt-Winters 訓練基線。 Fallback:若 statsmodels 不可用 → 滑動統計。 """ values = [dp.value for dp in datapoints] try: import numpy as np from statsmodels.tsa.holtwinters import ExponentialSmoothing arr = np.array(values, dtype=float) # 確保無 NaN / Inf arr = arr[np.isfinite(arr)] if len(arr) < MIN_DATAPOINTS: return self._fit_simple_stats(metric_name, values) # Holt-Winters:加法趨勢 + 加法季節性(24h 週期) seasonal_periods = min(24, len(arr) // 2) model = ExponentialSmoothing( arr, trend="add", seasonal="add" if len(arr) >= seasonal_periods * 2 else None, seasonal_periods=seasonal_periods, initialization_method="estimated", ).fit(optimized=True) fitted = model.fittedvalues residuals = arr - fitted mean_val = float(np.mean(fitted)) std_val = float(np.std(residuals)) # 24h seasonal factors(正規化為相對倍數) seasonal_factors = [1.0] * 24 if hasattr(model, "season") and model.season is not None: s = model.season if len(s) >= 24: s_arr = np.array(s[-24:]) # 轉為乘法因子(mean-centered) s_mean = abs(np.mean(s_arr)) or 1.0 sf = (s_arr / s_mean).tolist() seasonal_factors = [max(0.1, min(10.0, f)) for f in sf] return BaselineState( metric_name=metric_name, mean=mean_val, std=max(std_val, mean_val * 0.01), # 最小 std = 1% mean seasonal_factors=seasonal_factors, last_trained_at=now_taipei().isoformat(), datapoint_count=len(arr), ) except Exception as e: logger.warning("holt_winters_failed_fallback_to_stats", error=str(e)) return self._fit_simple_stats(metric_name, values) def _fit_simple_stats( self, metric_name: str, values: list[float], ) -> BaselineState: """Fallback:純滑動平均 + 標準差基線。""" if not values: return BaselineState(metric_name=metric_name, mean=0.0, std=1.0) n = len(values) mean_val = sum(values) / n variance = sum((v - mean_val) ** 2 for v in values) / n std_val = math.sqrt(variance) return BaselineState( metric_name=metric_name, mean=mean_val, std=max(std_val, mean_val * 0.01), seasonal_factors=[1.0] * 24, last_trained_at=now_taipei().isoformat(), datapoint_count=n, ) async def _save_baseline(self, state: BaselineState, promql: str = "", lookback_hours: int = HISTORY_WINDOW_HOURS) -> None: """ 儲存基線狀態: 1. 先寫 PostgreSQL(永久保存,source of truth) 2. 再寫 Redis(24h warm cache,加速讀取) Phase 4 ADR-084 架構鐵律:訓練好的 Holt-Winters 模型不能只存 Redis。 Redis 24h TTL 到期 = AI 每天重新學習「正常」的定義 = 不是在學習。 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 4 改為 PG source of truth """ # 1. 寫入 PostgreSQL(主要持久化) await self._pg_upsert_baseline(state, promql, lookback_hours) # 2. 寫入 Redis warm cache(加速讀取,到期後從 PG 復原) try: from src.core.redis_client import get_redis r = get_redis() key = f"{REDIS_KEY_PREFIX}{state.metric_name}" await r.set(key, json.dumps(state.to_dict()), ex=REDIS_TTL_SEC) except Exception as e: logger.warning("baseline_redis_cache_failed", metric=state.metric_name, error=str(e)) async def _load_baseline(self, metric_name: str) -> BaselineState | None: """ 載入基線:Redis-first → miss 時從 PG 載入並回填 Redis。 Phase 4 ADR-084: Redis 只是 warm cache,PG 才是 source of truth。 """ # 1. Redis warm cache hit try: from src.core.redis_client import get_redis r = get_redis() key = f"{REDIS_KEY_PREFIX}{metric_name}" data = await r.get(key) if data is not None: return BaselineState.from_dict(json.loads(data)) except Exception as e: logger.warning("baseline_redis_load_failed", metric=metric_name, error=str(e)) # 2. PG fallback(cache miss) state = await self._pg_load_latest_baseline(metric_name) if state is not None: # 回填 Redis cache try: from src.core.redis_client import get_redis r = get_redis() key = f"{REDIS_KEY_PREFIX}{metric_name}" await r.set(key, json.dumps(state.to_dict()), ex=REDIS_TTL_SEC) except Exception: pass # cache 回填失敗不影響讀取 return state async def _pg_upsert_baseline(self, state: BaselineState, promql: str, lookback_hours: int) -> None: """寫入 DynamicBaselineRecord 到 PostgreSQL(INSERT,不更新舊記錄)""" try: from src.db.base import get_db_context from src.db.models import DynamicBaselineRecord async with get_db_context() as session: record = DynamicBaselineRecord( metric_name=state.metric_name, mean=state.mean, std=state.std, seasonal_factors=state.seasonal_factors, datapoint_count=state.datapoint_count, promql=promql, lookback_hours=lookback_hours, ) session.add(record) await session.commit() logger.info("baseline_pg_saved", metric=state.metric_name, datapoints=state.datapoint_count) except Exception as e: logger.warning("baseline_pg_save_failed", metric=state.metric_name, error=str(e)) async def _pg_load_latest_baseline(self, metric_name: str) -> BaselineState | None: """從 PostgreSQL 載入最新一筆基線記錄""" try: from sqlalchemy import select from src.db.base import get_db_context from src.db.models import DynamicBaselineRecord async with get_db_context() as session: stmt = ( select(DynamicBaselineRecord) .where(DynamicBaselineRecord.metric_name == metric_name) .order_by(DynamicBaselineRecord.trained_at.desc()) .limit(1) ) result = await session.execute(stmt) record = result.scalar_one_or_none() if record is None: return None return BaselineState( metric_name=record.metric_name, mean=record.mean, std=record.std, seasonal_factors=record.seasonal_factors, last_trained_at=record.trained_at.isoformat(), datapoint_count=record.datapoint_count, ) except Exception as e: logger.warning("baseline_pg_load_failed", metric=metric_name, error=str(e)) return None # ───────────────────────────────────────────────────────────────────────────── # Singleton # ───────────────────────────────────────────────────────────────────────────── _baseline_service: DynamicBaselineService | None = None def get_dynamic_baseline_service() -> DynamicBaselineService: global _baseline_service if _baseline_service is None: _baseline_service = DynamicBaselineService() return _baseline_service