Files
awoooi/apps/api/src/services/dynamic_baseline_service.py
Your Name ff30c61c4c
All checks were successful
Code Review / ai-code-review (push) Successful in 21s
CD Pipeline / tests (push) Successful in 1m20s
CD Pipeline / build-and-deploy (push) Successful in 4m15s
CD Pipeline / post-deploy-checks (push) Successful in 1m58s
fix(rls): 收斂 API DB access context
2026-05-12 19:55:13 +08:00

490 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
AWOOOI AIOps Phase 4 — Dynamic Baseline Service動態基線服務
=============================================================
職責Holt-Winters 指數平滑,偵測 Prometheus metric 異常偏離
核心 API
is_anomaly(metric_name, current_value) -> AnomalyResult
update_baseline(metric_name, datapoints)
設計原則:
- Shadow ModeAIOPS_P4_SHADOW_MODE=True只記錄不觸發 Alert
- 熔斷保護statsmodels 失敗 → fallback 到滑動平均
- 7 天歷史資料最少訓練量(低於此閾值 → skip不誤判
- 基線持久化到 Rediskey: baseline:{metric_name}TTL 24h
- 訓練在 background worker 執行not in webhook handler
ADR-084: Phase 4 動態異常偵測源頭升級
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 4 初始建立
"""
from __future__ import annotations
import json
import math
from dataclasses import dataclass, field
from typing import Any
import structlog
from src.utils.timezone import now_taipei
logger = structlog.get_logger(__name__)
# ── 常數 ────────────────────────────────────────────────────────────────────
MIN_DATAPOINTS = 168 # 7 天 × 24h 最少樣本數(才能訓練季節性模型)
SIGMA_THRESHOLD = 3.0 # 偏差 ≥ 3σ → 異常
REDIS_TTL_SEC = 86400 # 基線 Redis TTL = 24h
REDIS_KEY_PREFIX = "baseline:"
HISTORY_WINDOW_HOURS = 336 # 保留 14 天歷史
# ─────────────────────────────────────────────────────────────────────────────
# Data Types
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class MetricDatapoint:
"""單一 metric 時序資料點"""
timestamp: float # Unix epoch
value: float
@dataclass
class BaselineState:
"""Holt-Winters 訓練後的基線狀態Redis 持久化)"""
metric_name: str
mean: float
std: float
seasonal_factors: list[float] = field(default_factory=list) # 24h 週期
last_trained_at: str = ""
datapoint_count: int = 0
def to_dict(self) -> dict[str, Any]:
return {
"metric_name": self.metric_name,
"mean": self.mean,
"std": self.std,
"seasonal_factors": self.seasonal_factors,
"last_trained_at": self.last_trained_at,
"datapoint_count": self.datapoint_count,
}
@classmethod
def from_dict(cls, d: dict[str, Any]) -> BaselineState:
return cls(
metric_name=d["metric_name"],
mean=d["mean"],
std=d["std"],
seasonal_factors=d.get("seasonal_factors", []),
last_trained_at=d.get("last_trained_at", ""),
datapoint_count=d.get("datapoint_count", 0),
)
@dataclass
class AnomalyResult:
"""is_anomaly() 回傳結果"""
metric_name: str
current_value: float
is_anomaly: bool
deviation_sigma: float # 偏差 σ 數(>3 = 異常)
expected_mean: float
expected_std: float
direction: str = "none" # "up" / "down" / "none"
shadow_mode: bool = True # True = 只記錄,不觸發
reason: str = ""
# ─────────────────────────────────────────────────────────────────────────────
# Main Service
# ─────────────────────────────────────────────────────────────────────────────
class DynamicBaselineService:
"""
動態基線服務
兩大功能:
1. train_baseline() — 從 Prometheus 抓歷史資料,用 Holt-Winters 訓練
2. is_anomaly() — 即時判斷當前值是否偏離基線 ≥ 3σ
"""
async def train_baseline(
self,
metric_name: str,
promql: str,
lookback_hours: int = HISTORY_WINDOW_HOURS,
) -> BaselineState | None:
"""
從 Prometheus 抓取歷史資料並訓練基線。
Args:
metric_name: 基線識別名e.g. "cpu_usage_node_mon"
promql: Prometheus querye.g. "avg(rate(node_cpu_seconds_total[5m]))"
lookback_hours: 歷史視窗(預設 14 天)
Returns:
BaselineState已存 Redis資料不足 → None
"""
try:
datapoints = await self._fetch_prometheus_history(promql, lookback_hours)
if len(datapoints) < MIN_DATAPOINTS:
logger.info(
"baseline_insufficient_data",
metric=metric_name,
count=len(datapoints),
required=MIN_DATAPOINTS,
)
return None
state = self._fit_holt_winters(metric_name, datapoints)
await self._save_baseline(state, promql=promql, lookback_hours=lookback_hours)
logger.info(
"baseline_trained",
metric=metric_name,
mean=f"{state.mean:.4f}",
std=f"{state.std:.4f}",
datapoints=len(datapoints),
)
return state
except Exception:
logger.exception("baseline_train_failed", metric=metric_name)
return None
async def is_anomaly(
self,
metric_name: str,
current_value: float,
hour_of_day: int | None = None,
) -> AnomalyResult:
"""
即時異常判斷。
Args:
metric_name: 基線識別名
current_value: 當前觀測值
hour_of_day: 當前小時0-23用於套用 seasonal factorNone = 不套用
Returns:
AnomalyResult
"""
from src.core.feature_flags import aiops_flags
shadow_mode = aiops_flags.AIOPS_P4_SHADOW_MODE
try:
state = await self._load_baseline(metric_name)
if state is None:
return AnomalyResult(
metric_name=metric_name,
current_value=current_value,
is_anomaly=False,
deviation_sigma=0.0,
expected_mean=current_value,
expected_std=0.0,
reason="no_baseline_available",
shadow_mode=shadow_mode,
)
# 套用 seasonal factor如果有 24h 週期資料)
expected_mean = state.mean
if hour_of_day is not None and len(state.seasonal_factors) == 24:
expected_mean *= state.seasonal_factors[hour_of_day]
expected_std = state.std if state.std > 0 else 1e-9
deviation = abs(current_value - expected_mean)
sigma = deviation / expected_std
anomaly = sigma >= SIGMA_THRESHOLD
direction = "none"
if anomaly:
direction = "up" if current_value > expected_mean else "down"
result = AnomalyResult(
metric_name=metric_name,
current_value=current_value,
is_anomaly=anomaly,
deviation_sigma=round(sigma, 2),
expected_mean=round(expected_mean, 4),
expected_std=round(expected_std, 4),
direction=direction,
shadow_mode=shadow_mode,
reason=f"deviation {sigma:.1f}σ from baseline" if anomaly else "within_normal_range",
)
if anomaly:
logger.info(
"dynamic_anomaly_detected",
metric=metric_name,
value=current_value,
expected=expected_mean,
sigma=sigma,
direction=direction,
shadow_mode=shadow_mode,
)
return result
except Exception as e:
logger.warning("baseline_anomaly_check_failed", metric=metric_name, error=str(e))
return AnomalyResult(
metric_name=metric_name,
current_value=current_value,
is_anomaly=False,
deviation_sigma=0.0,
expected_mean=0.0,
expected_std=0.0,
reason=f"check_error:{e}",
shadow_mode=shadow_mode,
)
# ──────────────────────────────────────────────────────────────────────────
# Private Helpers
# ──────────────────────────────────────────────────────────────────────────
async def _fetch_prometheus_history(
self,
promql: str,
lookback_hours: int,
) -> list[MetricDatapoint]:
"""從 Prometheus query_range API 抓取歷史資料1h 步進)。"""
import httpx
from src.core.config import settings
end_ts = now_taipei().timestamp()
start_ts = end_ts - lookback_hours * 3600
try:
async with httpx.AsyncClient(timeout=30.0) as client:
resp = await client.get(
f"{settings.PROMETHEUS_URL}/api/v1/query_range",
params={
"query": promql,
"start": start_ts,
"end": end_ts,
"step": "3600", # 1h 步進
},
)
resp.raise_for_status()
data = resp.json()
results = data.get("data", {}).get("result", [])
if not results:
return []
# 取第一個 time series
values = results[0].get("values", [])
return [
MetricDatapoint(timestamp=float(ts), value=float(v))
for ts, v in values
if v != "NaN"
]
except Exception as e:
logger.warning("prometheus_history_fetch_failed", error=str(e))
return []
def _fit_holt_winters(
self,
metric_name: str,
datapoints: list[MetricDatapoint],
) -> BaselineState:
"""
用 statsmodels Holt-Winters 訓練基線。
Fallback若 statsmodels 不可用 → 滑動統計。
"""
values = [dp.value for dp in datapoints]
try:
import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing
arr = np.array(values, dtype=float)
# 確保無 NaN / Inf
arr = arr[np.isfinite(arr)]
if len(arr) < MIN_DATAPOINTS:
return self._fit_simple_stats(metric_name, values)
# Holt-Winters加法趨勢 + 加法季節性24h 週期)
seasonal_periods = min(24, len(arr) // 2)
model = ExponentialSmoothing(
arr,
trend="add",
seasonal="add" if len(arr) >= seasonal_periods * 2 else None,
seasonal_periods=seasonal_periods,
initialization_method="estimated",
).fit(optimized=True)
fitted = model.fittedvalues
residuals = arr - fitted
mean_val = float(np.mean(fitted))
std_val = float(np.std(residuals))
# 24h seasonal factors正規化為相對倍數
seasonal_factors = [1.0] * 24
if hasattr(model, "season") and model.season is not None:
s = model.season
if len(s) >= 24:
s_arr = np.array(s[-24:])
# 轉為乘法因子mean-centered
s_mean = abs(np.mean(s_arr)) or 1.0
sf = (s_arr / s_mean).tolist()
seasonal_factors = [max(0.1, min(10.0, f)) for f in sf]
return BaselineState(
metric_name=metric_name,
mean=mean_val,
std=max(std_val, mean_val * 0.01), # 最小 std = 1% mean
seasonal_factors=seasonal_factors,
last_trained_at=now_taipei().isoformat(),
datapoint_count=len(arr),
)
except Exception as e:
logger.warning("holt_winters_failed_fallback_to_stats", error=str(e))
return self._fit_simple_stats(metric_name, values)
def _fit_simple_stats(
self,
metric_name: str,
values: list[float],
) -> BaselineState:
"""Fallback純滑動平均 + 標準差基線。"""
if not values:
return BaselineState(metric_name=metric_name, mean=0.0, std=1.0)
n = len(values)
mean_val = sum(values) / n
variance = sum((v - mean_val) ** 2 for v in values) / n
std_val = math.sqrt(variance)
return BaselineState(
metric_name=metric_name,
mean=mean_val,
std=max(std_val, mean_val * 0.01),
seasonal_factors=[1.0] * 24,
last_trained_at=now_taipei().isoformat(),
datapoint_count=n,
)
async def _save_baseline(self, state: BaselineState, promql: str = "", lookback_hours: int = HISTORY_WINDOW_HOURS) -> None:
"""
儲存基線狀態:
1. 先寫 PostgreSQL永久保存source of truth
2. 再寫 Redis24h warm cache加速讀取
Phase 4 ADR-084 架構鐵律:訓練好的 Holt-Winters 模型不能只存 Redis。
Redis 24h TTL 到期 = AI 每天重新學習「正常」的定義 = 不是在學習。
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 4 改為 PG source of truth
"""
# 1. 寫入 PostgreSQL主要持久化
await self._pg_upsert_baseline(state, promql, lookback_hours)
# 2. 寫入 Redis warm cache加速讀取到期後從 PG 復原)
try:
from src.core.redis_client import get_redis
r = get_redis()
key = f"{REDIS_KEY_PREFIX}{state.metric_name}"
await r.set(key, json.dumps(state.to_dict()), ex=REDIS_TTL_SEC)
except Exception as e:
logger.warning("baseline_redis_cache_failed", metric=state.metric_name, error=str(e))
async def _load_baseline(self, metric_name: str) -> BaselineState | None:
"""
載入基線Redis-first → miss 時從 PG 載入並回填 Redis。
Phase 4 ADR-084: Redis 只是 warm cachePG 才是 source of truth。
"""
# 1. Redis warm cache hit
try:
from src.core.redis_client import get_redis
r = get_redis()
key = f"{REDIS_KEY_PREFIX}{metric_name}"
data = await r.get(key)
if data is not None:
return BaselineState.from_dict(json.loads(data))
except Exception as e:
logger.warning("baseline_redis_load_failed", metric=metric_name, error=str(e))
# 2. PG fallbackcache miss
state = await self._pg_load_latest_baseline(metric_name)
if state is not None:
# 回填 Redis cache
try:
from src.core.redis_client import get_redis
r = get_redis()
key = f"{REDIS_KEY_PREFIX}{metric_name}"
await r.set(key, json.dumps(state.to_dict()), ex=REDIS_TTL_SEC)
except Exception:
pass # cache 回填失敗不影響讀取
return state
async def _pg_upsert_baseline(self, state: BaselineState, promql: str, lookback_hours: int) -> None:
"""寫入 DynamicBaselineRecord 到 PostgreSQLINSERT不更新舊記錄"""
try:
from src.db.base import get_db_context
from src.db.models import DynamicBaselineRecord
async with get_db_context() as session:
record = DynamicBaselineRecord(
metric_name=state.metric_name,
mean=state.mean,
std=state.std,
seasonal_factors=state.seasonal_factors,
datapoint_count=state.datapoint_count,
promql=promql,
lookback_hours=lookback_hours,
)
session.add(record)
await session.commit()
logger.info("baseline_pg_saved", metric=state.metric_name, datapoints=state.datapoint_count)
except Exception as e:
logger.warning("baseline_pg_save_failed", metric=state.metric_name, error=str(e))
async def _pg_load_latest_baseline(self, metric_name: str) -> BaselineState | None:
"""從 PostgreSQL 載入最新一筆基線記錄"""
try:
from sqlalchemy import select
from src.db.base import get_db_context
from src.db.models import DynamicBaselineRecord
async with get_db_context() as session:
stmt = (
select(DynamicBaselineRecord)
.where(DynamicBaselineRecord.metric_name == metric_name)
.order_by(DynamicBaselineRecord.trained_at.desc())
.limit(1)
)
result = await session.execute(stmt)
record = result.scalar_one_or_none()
if record is None:
return None
return BaselineState(
metric_name=record.metric_name,
mean=record.mean,
std=record.std,
seasonal_factors=record.seasonal_factors,
last_trained_at=record.trained_at.isoformat(),
datapoint_count=record.datapoint_count,
)
except Exception as e:
logger.warning("baseline_pg_load_failed", metric=metric_name, error=str(e))
return None
# ─────────────────────────────────────────────────────────────────────────────
# Singleton
# ─────────────────────────────────────────────────────────────────────────────
_baseline_service: DynamicBaselineService | None = None
def get_dynamic_baseline_service() -> DynamicBaselineService:
global _baseline_service
if _baseline_service is None:
_baseline_service = DynamicBaselineService()
return _baseline_service