490 lines
19 KiB
Python
490 lines
19 KiB
Python
"""
|
||
AWOOOI AIOps Phase 4 — Dynamic Baseline Service(動態基線服務)
|
||
=============================================================
|
||
職責:Holt-Winters 指數平滑,偵測 Prometheus metric 異常偏離
|
||
|
||
核心 API:
|
||
is_anomaly(metric_name, current_value) -> AnomalyResult
|
||
update_baseline(metric_name, datapoints)
|
||
|
||
設計原則:
|
||
- Shadow Mode(AIOPS_P4_SHADOW_MODE=True):只記錄,不觸發 Alert
|
||
- 熔斷保護:statsmodels 失敗 → fallback 到滑動平均
|
||
- 7 天歷史資料最少訓練量(低於此閾值 → skip,不誤判)
|
||
- 基線持久化到 Redis(key: baseline:{metric_name},TTL 24h)
|
||
- 訓練在 background worker 執行,not in webhook handler
|
||
|
||
ADR-084: Phase 4 動態異常偵測源頭升級
|
||
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 4 初始建立
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
import math
|
||
from dataclasses import dataclass, field
|
||
from typing import Any
|
||
|
||
import structlog
|
||
|
||
from src.utils.timezone import now_taipei
|
||
|
||
logger = structlog.get_logger(__name__)
|
||
|
||
# ── 常數 ────────────────────────────────────────────────────────────────────
|
||
MIN_DATAPOINTS = 168 # 7 天 × 24h 最少樣本數(才能訓練季節性模型)
|
||
SIGMA_THRESHOLD = 3.0 # 偏差 ≥ 3σ → 異常
|
||
REDIS_TTL_SEC = 86400 # 基線 Redis TTL = 24h
|
||
REDIS_KEY_PREFIX = "baseline:"
|
||
HISTORY_WINDOW_HOURS = 336 # 保留 14 天歷史
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Data Types
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
@dataclass
|
||
class MetricDatapoint:
|
||
"""單一 metric 時序資料點"""
|
||
timestamp: float # Unix epoch
|
||
value: float
|
||
|
||
|
||
@dataclass
|
||
class BaselineState:
|
||
"""Holt-Winters 訓練後的基線狀態(Redis 持久化)"""
|
||
metric_name: str
|
||
mean: float
|
||
std: float
|
||
seasonal_factors: list[float] = field(default_factory=list) # 24h 週期
|
||
last_trained_at: str = ""
|
||
datapoint_count: int = 0
|
||
|
||
def to_dict(self) -> dict[str, Any]:
|
||
return {
|
||
"metric_name": self.metric_name,
|
||
"mean": self.mean,
|
||
"std": self.std,
|
||
"seasonal_factors": self.seasonal_factors,
|
||
"last_trained_at": self.last_trained_at,
|
||
"datapoint_count": self.datapoint_count,
|
||
}
|
||
|
||
@classmethod
|
||
def from_dict(cls, d: dict[str, Any]) -> BaselineState:
|
||
return cls(
|
||
metric_name=d["metric_name"],
|
||
mean=d["mean"],
|
||
std=d["std"],
|
||
seasonal_factors=d.get("seasonal_factors", []),
|
||
last_trained_at=d.get("last_trained_at", ""),
|
||
datapoint_count=d.get("datapoint_count", 0),
|
||
)
|
||
|
||
|
||
@dataclass
|
||
class AnomalyResult:
|
||
"""is_anomaly() 回傳結果"""
|
||
metric_name: str
|
||
current_value: float
|
||
is_anomaly: bool
|
||
deviation_sigma: float # 偏差 σ 數(>3 = 異常)
|
||
expected_mean: float
|
||
expected_std: float
|
||
direction: str = "none" # "up" / "down" / "none"
|
||
shadow_mode: bool = True # True = 只記錄,不觸發
|
||
reason: str = ""
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Main Service
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
class DynamicBaselineService:
|
||
"""
|
||
動態基線服務
|
||
|
||
兩大功能:
|
||
1. train_baseline() — 從 Prometheus 抓歷史資料,用 Holt-Winters 訓練
|
||
2. is_anomaly() — 即時判斷當前值是否偏離基線 ≥ 3σ
|
||
"""
|
||
|
||
async def train_baseline(
|
||
self,
|
||
metric_name: str,
|
||
promql: str,
|
||
lookback_hours: int = HISTORY_WINDOW_HOURS,
|
||
) -> BaselineState | None:
|
||
"""
|
||
從 Prometheus 抓取歷史資料並訓練基線。
|
||
|
||
Args:
|
||
metric_name: 基線識別名(e.g. "cpu_usage_node_mon")
|
||
promql: Prometheus query(e.g. "avg(rate(node_cpu_seconds_total[5m]))")
|
||
lookback_hours: 歷史視窗(預設 14 天)
|
||
|
||
Returns:
|
||
BaselineState(已存 Redis);資料不足 → None
|
||
"""
|
||
try:
|
||
datapoints = await self._fetch_prometheus_history(promql, lookback_hours)
|
||
if len(datapoints) < MIN_DATAPOINTS:
|
||
logger.info(
|
||
"baseline_insufficient_data",
|
||
metric=metric_name,
|
||
count=len(datapoints),
|
||
required=MIN_DATAPOINTS,
|
||
)
|
||
return None
|
||
|
||
state = self._fit_holt_winters(metric_name, datapoints)
|
||
await self._save_baseline(state, promql=promql, lookback_hours=lookback_hours)
|
||
logger.info(
|
||
"baseline_trained",
|
||
metric=metric_name,
|
||
mean=f"{state.mean:.4f}",
|
||
std=f"{state.std:.4f}",
|
||
datapoints=len(datapoints),
|
||
)
|
||
return state
|
||
|
||
except Exception:
|
||
logger.exception("baseline_train_failed", metric=metric_name)
|
||
return None
|
||
|
||
async def is_anomaly(
|
||
self,
|
||
metric_name: str,
|
||
current_value: float,
|
||
hour_of_day: int | None = None,
|
||
) -> AnomalyResult:
|
||
"""
|
||
即時異常判斷。
|
||
|
||
Args:
|
||
metric_name: 基線識別名
|
||
current_value: 當前觀測值
|
||
hour_of_day: 當前小時(0-23),用於套用 seasonal factor;None = 不套用
|
||
|
||
Returns:
|
||
AnomalyResult
|
||
"""
|
||
from src.core.feature_flags import aiops_flags
|
||
|
||
shadow_mode = aiops_flags.AIOPS_P4_SHADOW_MODE
|
||
|
||
try:
|
||
state = await self._load_baseline(metric_name)
|
||
if state is None:
|
||
return AnomalyResult(
|
||
metric_name=metric_name,
|
||
current_value=current_value,
|
||
is_anomaly=False,
|
||
deviation_sigma=0.0,
|
||
expected_mean=current_value,
|
||
expected_std=0.0,
|
||
reason="no_baseline_available",
|
||
shadow_mode=shadow_mode,
|
||
)
|
||
|
||
# 套用 seasonal factor(如果有 24h 週期資料)
|
||
expected_mean = state.mean
|
||
if hour_of_day is not None and len(state.seasonal_factors) == 24:
|
||
expected_mean *= state.seasonal_factors[hour_of_day]
|
||
|
||
expected_std = state.std if state.std > 0 else 1e-9
|
||
deviation = abs(current_value - expected_mean)
|
||
sigma = deviation / expected_std
|
||
|
||
anomaly = sigma >= SIGMA_THRESHOLD
|
||
direction = "none"
|
||
if anomaly:
|
||
direction = "up" if current_value > expected_mean else "down"
|
||
|
||
result = AnomalyResult(
|
||
metric_name=metric_name,
|
||
current_value=current_value,
|
||
is_anomaly=anomaly,
|
||
deviation_sigma=round(sigma, 2),
|
||
expected_mean=round(expected_mean, 4),
|
||
expected_std=round(expected_std, 4),
|
||
direction=direction,
|
||
shadow_mode=shadow_mode,
|
||
reason=f"deviation {sigma:.1f}σ from baseline" if anomaly else "within_normal_range",
|
||
)
|
||
|
||
if anomaly:
|
||
logger.info(
|
||
"dynamic_anomaly_detected",
|
||
metric=metric_name,
|
||
value=current_value,
|
||
expected=expected_mean,
|
||
sigma=sigma,
|
||
direction=direction,
|
||
shadow_mode=shadow_mode,
|
||
)
|
||
|
||
return result
|
||
|
||
except Exception as e:
|
||
logger.warning("baseline_anomaly_check_failed", metric=metric_name, error=str(e))
|
||
return AnomalyResult(
|
||
metric_name=metric_name,
|
||
current_value=current_value,
|
||
is_anomaly=False,
|
||
deviation_sigma=0.0,
|
||
expected_mean=0.0,
|
||
expected_std=0.0,
|
||
reason=f"check_error:{e}",
|
||
shadow_mode=shadow_mode,
|
||
)
|
||
|
||
# ──────────────────────────────────────────────────────────────────────────
|
||
# Private Helpers
|
||
# ──────────────────────────────────────────────────────────────────────────
|
||
|
||
async def _fetch_prometheus_history(
|
||
self,
|
||
promql: str,
|
||
lookback_hours: int,
|
||
) -> list[MetricDatapoint]:
|
||
"""從 Prometheus query_range API 抓取歷史資料(1h 步進)。"""
|
||
import httpx
|
||
|
||
from src.core.config import settings
|
||
|
||
end_ts = now_taipei().timestamp()
|
||
start_ts = end_ts - lookback_hours * 3600
|
||
|
||
try:
|
||
async with httpx.AsyncClient(timeout=30.0) as client:
|
||
resp = await client.get(
|
||
f"{settings.PROMETHEUS_URL}/api/v1/query_range",
|
||
params={
|
||
"query": promql,
|
||
"start": start_ts,
|
||
"end": end_ts,
|
||
"step": "3600", # 1h 步進
|
||
},
|
||
)
|
||
resp.raise_for_status()
|
||
data = resp.json()
|
||
|
||
results = data.get("data", {}).get("result", [])
|
||
if not results:
|
||
return []
|
||
|
||
# 取第一個 time series
|
||
values = results[0].get("values", [])
|
||
return [
|
||
MetricDatapoint(timestamp=float(ts), value=float(v))
|
||
for ts, v in values
|
||
if v != "NaN"
|
||
]
|
||
except Exception as e:
|
||
logger.warning("prometheus_history_fetch_failed", error=str(e))
|
||
return []
|
||
|
||
def _fit_holt_winters(
|
||
self,
|
||
metric_name: str,
|
||
datapoints: list[MetricDatapoint],
|
||
) -> BaselineState:
|
||
"""
|
||
用 statsmodels Holt-Winters 訓練基線。
|
||
Fallback:若 statsmodels 不可用 → 滑動統計。
|
||
"""
|
||
values = [dp.value for dp in datapoints]
|
||
|
||
try:
|
||
import numpy as np
|
||
from statsmodels.tsa.holtwinters import ExponentialSmoothing
|
||
|
||
arr = np.array(values, dtype=float)
|
||
# 確保無 NaN / Inf
|
||
arr = arr[np.isfinite(arr)]
|
||
|
||
if len(arr) < MIN_DATAPOINTS:
|
||
return self._fit_simple_stats(metric_name, values)
|
||
|
||
# Holt-Winters:加法趨勢 + 加法季節性(24h 週期)
|
||
seasonal_periods = min(24, len(arr) // 2)
|
||
model = ExponentialSmoothing(
|
||
arr,
|
||
trend="add",
|
||
seasonal="add" if len(arr) >= seasonal_periods * 2 else None,
|
||
seasonal_periods=seasonal_periods,
|
||
initialization_method="estimated",
|
||
).fit(optimized=True)
|
||
|
||
fitted = model.fittedvalues
|
||
residuals = arr - fitted
|
||
mean_val = float(np.mean(fitted))
|
||
std_val = float(np.std(residuals))
|
||
|
||
# 24h seasonal factors(正規化為相對倍數)
|
||
seasonal_factors = [1.0] * 24
|
||
if hasattr(model, "season") and model.season is not None:
|
||
s = model.season
|
||
if len(s) >= 24:
|
||
s_arr = np.array(s[-24:])
|
||
# 轉為乘法因子(mean-centered)
|
||
s_mean = abs(np.mean(s_arr)) or 1.0
|
||
sf = (s_arr / s_mean).tolist()
|
||
seasonal_factors = [max(0.1, min(10.0, f)) for f in sf]
|
||
|
||
return BaselineState(
|
||
metric_name=metric_name,
|
||
mean=mean_val,
|
||
std=max(std_val, mean_val * 0.01), # 最小 std = 1% mean
|
||
seasonal_factors=seasonal_factors,
|
||
last_trained_at=now_taipei().isoformat(),
|
||
datapoint_count=len(arr),
|
||
)
|
||
|
||
except Exception as e:
|
||
logger.warning("holt_winters_failed_fallback_to_stats", error=str(e))
|
||
return self._fit_simple_stats(metric_name, values)
|
||
|
||
def _fit_simple_stats(
|
||
self,
|
||
metric_name: str,
|
||
values: list[float],
|
||
) -> BaselineState:
|
||
"""Fallback:純滑動平均 + 標準差基線。"""
|
||
if not values:
|
||
return BaselineState(metric_name=metric_name, mean=0.0, std=1.0)
|
||
|
||
n = len(values)
|
||
mean_val = sum(values) / n
|
||
variance = sum((v - mean_val) ** 2 for v in values) / n
|
||
std_val = math.sqrt(variance)
|
||
|
||
return BaselineState(
|
||
metric_name=metric_name,
|
||
mean=mean_val,
|
||
std=max(std_val, mean_val * 0.01),
|
||
seasonal_factors=[1.0] * 24,
|
||
last_trained_at=now_taipei().isoformat(),
|
||
datapoint_count=n,
|
||
)
|
||
|
||
async def _save_baseline(self, state: BaselineState, promql: str = "", lookback_hours: int = HISTORY_WINDOW_HOURS) -> None:
|
||
"""
|
||
儲存基線狀態:
|
||
1. 先寫 PostgreSQL(永久保存,source of truth)
|
||
2. 再寫 Redis(24h warm cache,加速讀取)
|
||
|
||
Phase 4 ADR-084 架構鐵律:訓練好的 Holt-Winters 模型不能只存 Redis。
|
||
Redis 24h TTL 到期 = AI 每天重新學習「正常」的定義 = 不是在學習。
|
||
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 4 改為 PG source of truth
|
||
"""
|
||
# 1. 寫入 PostgreSQL(主要持久化)
|
||
await self._pg_upsert_baseline(state, promql, lookback_hours)
|
||
|
||
# 2. 寫入 Redis warm cache(加速讀取,到期後從 PG 復原)
|
||
try:
|
||
from src.core.redis_client import get_redis
|
||
r = get_redis()
|
||
key = f"{REDIS_KEY_PREFIX}{state.metric_name}"
|
||
await r.set(key, json.dumps(state.to_dict()), ex=REDIS_TTL_SEC)
|
||
except Exception as e:
|
||
logger.warning("baseline_redis_cache_failed", metric=state.metric_name, error=str(e))
|
||
|
||
async def _load_baseline(self, metric_name: str) -> BaselineState | None:
|
||
"""
|
||
載入基線:Redis-first → miss 時從 PG 載入並回填 Redis。
|
||
|
||
Phase 4 ADR-084: Redis 只是 warm cache,PG 才是 source of truth。
|
||
"""
|
||
# 1. Redis warm cache hit
|
||
try:
|
||
from src.core.redis_client import get_redis
|
||
r = get_redis()
|
||
key = f"{REDIS_KEY_PREFIX}{metric_name}"
|
||
data = await r.get(key)
|
||
if data is not None:
|
||
return BaselineState.from_dict(json.loads(data))
|
||
except Exception as e:
|
||
logger.warning("baseline_redis_load_failed", metric=metric_name, error=str(e))
|
||
|
||
# 2. PG fallback(cache miss)
|
||
state = await self._pg_load_latest_baseline(metric_name)
|
||
if state is not None:
|
||
# 回填 Redis cache
|
||
try:
|
||
from src.core.redis_client import get_redis
|
||
r = get_redis()
|
||
key = f"{REDIS_KEY_PREFIX}{metric_name}"
|
||
await r.set(key, json.dumps(state.to_dict()), ex=REDIS_TTL_SEC)
|
||
except Exception:
|
||
pass # cache 回填失敗不影響讀取
|
||
return state
|
||
|
||
async def _pg_upsert_baseline(self, state: BaselineState, promql: str, lookback_hours: int) -> None:
|
||
"""寫入 DynamicBaselineRecord 到 PostgreSQL(INSERT,不更新舊記錄)"""
|
||
try:
|
||
from src.db.base import get_db_context
|
||
from src.db.models import DynamicBaselineRecord
|
||
|
||
async with get_db_context() as session:
|
||
record = DynamicBaselineRecord(
|
||
metric_name=state.metric_name,
|
||
mean=state.mean,
|
||
std=state.std,
|
||
seasonal_factors=state.seasonal_factors,
|
||
datapoint_count=state.datapoint_count,
|
||
promql=promql,
|
||
lookback_hours=lookback_hours,
|
||
)
|
||
session.add(record)
|
||
await session.commit()
|
||
logger.info("baseline_pg_saved", metric=state.metric_name, datapoints=state.datapoint_count)
|
||
except Exception as e:
|
||
logger.warning("baseline_pg_save_failed", metric=state.metric_name, error=str(e))
|
||
|
||
async def _pg_load_latest_baseline(self, metric_name: str) -> BaselineState | None:
|
||
"""從 PostgreSQL 載入最新一筆基線記錄"""
|
||
try:
|
||
from sqlalchemy import select
|
||
|
||
from src.db.base import get_db_context
|
||
from src.db.models import DynamicBaselineRecord
|
||
|
||
async with get_db_context() as session:
|
||
stmt = (
|
||
select(DynamicBaselineRecord)
|
||
.where(DynamicBaselineRecord.metric_name == metric_name)
|
||
.order_by(DynamicBaselineRecord.trained_at.desc())
|
||
.limit(1)
|
||
)
|
||
result = await session.execute(stmt)
|
||
record = result.scalar_one_or_none()
|
||
if record is None:
|
||
return None
|
||
return BaselineState(
|
||
metric_name=record.metric_name,
|
||
mean=record.mean,
|
||
std=record.std,
|
||
seasonal_factors=record.seasonal_factors,
|
||
last_trained_at=record.trained_at.isoformat(),
|
||
datapoint_count=record.datapoint_count,
|
||
)
|
||
except Exception as e:
|
||
logger.warning("baseline_pg_load_failed", metric=metric_name, error=str(e))
|
||
return None
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Singleton
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
_baseline_service: DynamicBaselineService | None = None
|
||
|
||
|
||
def get_dynamic_baseline_service() -> DynamicBaselineService:
|
||
global _baseline_service
|
||
if _baseline_service is None:
|
||
_baseline_service = DynamicBaselineService()
|
||
return _baseline_service
|