Files
awoooi/apps/api/src/services/trend_predictor.py
OG T 14a02263ae
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 12m32s
feat(Phase 4): 主動巡檢 + 趨勢預測 + 8D 感官升級 全部完成
## Phase 4 完整交付(ADR-084)

### 新增服務
- trend_predictor.py: numpy 線性回歸,4h 閾值突破預警,R² 信心評分
- proactive_inspector.py: 每 5 分鐘主動巡檢協調器
  - DynamicBaselineService(3σ 偏離)
  - LogAnomalyDetector(新 Drain3 pattern)
  - TrendPredictor(斜率外推 4h 預測)
  - Shadow Mode + 30 分鐘去重 + Holt-Winters 背景重訓

### 8D 感官升級(EvidenceSnapshot Phase 4 增強)
- PreDecisionInvestigator._collect_phase4_anomalies(): 決策前讀取
  ProactiveInspector 最近巡檢快取 + LogAnomalyDetector 新 pattern
- EvidenceSnapshot.anomaly_context: 新欄位,Phase 4 動態異常上下文
- DiagnosticianAgent._build_prompt(): prompt 包含 anomaly_context,
  LLM RCA 可參考動態基線偏差與趨勢預警

### 資料庫遷移
- incident_evidence: ADD COLUMN anomaly_context JSONB(冪等)

### main.py
- 啟動 run_proactive_inspector_loop() asyncio task

2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 4 全部完成

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-15 15:47:05 +08:00

307 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
AWOOOI AIOps Phase 4 — Trend Predictor趨勢預測
===================================================
職責numpy 線性回歸,預測 metric 在未來 N 小時是否超越警戒閾值
核心 API
predict_breach(metric_name, current_value, threshold) -> TrendPrediction
設計原則:
- 不使用 Prophet500MB+ Stan 依賴),改用 numpy 線性回歸
- 從 DynamicBaselineRecord 取歷史窗口資料,計算趨勢斜率
- Shadow Mode預測只記錄 logger.info不觸發 Alert
- 熔斷numpy 失敗 → fallback 到最近值外推
ADR-084: Phase 4 動態異常偵測源頭升級
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 4 初始建立
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
import structlog
from src.utils.timezone import now_taipei
logger = structlog.get_logger(__name__)
# ── Constants ───────────────────────────────────────────────────────────────
# Default forecast horizon, in hours.
FORECAST_HOURS = 4
# Minimum number of stored data points required before a regression is attempted.
# NOTE(review): the original note equated 12 points with 12h, i.e. hourly
# sampling — confirm against the actual sampling cadence.
MIN_DATAPOINTS_FOR_TREND = 12
# R² at or above this value is reported as "high" confidence.
TREND_CONFIDENCE_THRESHOLD = 0.7
# Key prefix: the metric name is appended, and each resulting key holds a Redis
# list of JSON-encoded [timestamp, value] pairs.
REDIS_KEY_HISTORY = "trend:history:"
# Expiry of a history key in seconds (2 days); re-applied on every append.
REDIS_TTL_HISTORY = 86400 * 2
# Maximum number of points retained per metric (336 = 14 days x 24 points/day).
# NOTE(review): that span assumes one point per hour and exceeds the 2-day TTL
# above only in the sense of idle expiry — confirm the intended window.
MAX_HISTORY_POINTS = 336
# ─────────────────────────────────────────────────────────────────────────────
# Data Types
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class TrendPrediction:
    """Result of one trend forecast for a single metric.

    Attributes:
        metric_name: Name of the metric the forecast refers to.
        current_value: Observed value the forecast was made from.
        threshold: Alert threshold the forecast is compared against.
        forecast_hours: Length of the prediction window, in hours.
        predicted_value: Value predicted at the end of the window.
        slope_per_hour: Change per hour (slope of the fitted line).
        r_squared: R² of the linear fit, 0-1; higher means a clearer trend.
        will_breach: Whether the threshold is predicted to be exceeded.
        breach_in_hours: Predicted hours until the threshold is exceeded;
            ``None`` means no breach is predicted.
        confidence: One of ``high`` / ``medium`` / ``low`` /
            ``insufficient_data``.
        shadow_mode: Shadow-mode flag carried with the result; defaults to
            ``True``.
        detected_at: Timestamp string for the forecast; defaults to ``""``.
    """

    # Required fields — order is part of the positional constructor contract.
    metric_name: str
    current_value: float
    threshold: float
    forecast_hours: int
    predicted_value: float
    slope_per_hour: float
    r_squared: float
    will_breach: bool
    breach_in_hours: float | None
    confidence: str

    # Optional fields with defaults.
    shadow_mode: bool = True
    detected_at: str = ""
# ─────────────────────────────────────────────────────────────────────────────
# Main Service
# ─────────────────────────────────────────────────────────────────────────────
class TrendPredictor:
    """Trend prediction service.

    Workflow:
        1. Append the current observation to the metric's sliding-window
           history in Redis, then read the window back.
        2. Fit a straight line to the window with numpy.
        3. Extrapolate the line ``forecast_hours`` past the present.
        4. Report whether the alert threshold is reached within that window.
    """

    async def predict_breach(
        self,
        metric_name: str,
        current_value: float,
        threshold: float,
        forecast_hours: int = FORECAST_HOURS,
    ) -> TrendPrediction:
        """Predict whether a metric reaches ``threshold`` within the window.

        Args:
            metric_name: Baseline identifier (per the original documentation
                it should match the name used by DynamicBaselineService).
            current_value: Current observed value; it is also appended to
                the stored history.
            threshold: Alert threshold; reaching it counts as a breach.
            forecast_hours: Prediction window in hours.

        Returns:
            A ``TrendPrediction``. When the feature flag is off, a neutral
            "no data" result is returned and nothing is stored. With fewer
            than ``MIN_DATAPOINTS_FOR_TREND`` points the result has
            confidence ``insufficient_data`` and a zero slope.
        """
        from src.core.feature_flags import aiops_flags

        if not aiops_flags.AIOPS_P4_TREND_PREDICTOR:
            return self._no_data_result(metric_name, current_value, threshold, forecast_hours)

        shadow_mode = aiops_flags.AIOPS_P4_SHADOW_MODE
        detected_at = now_taipei().isoformat()

        # Record the current observation first so it is part of the window.
        await self._append_history(metric_name, current_value)
        history = await self._get_history(metric_name)

        if len(history) < MIN_DATAPOINTS_FOR_TREND:
            return self._flat_result(
                metric_name,
                current_value,
                threshold,
                forecast_hours,
                confidence="insufficient_data",
                shadow_mode=shadow_mode,
                detected_at=detected_at,
            )

        prediction = self._linear_regression_predict(
            history=history,
            current_value=current_value,
            threshold=threshold,
            forecast_hours=forecast_hours,
            metric_name=metric_name,
        )
        # Fill in the caller-owned fields. metric_name is set here as well so
        # the result is labelled even if the regression helper is overridden.
        prediction.metric_name = metric_name
        prediction.shadow_mode = shadow_mode
        prediction.detected_at = detected_at

        if prediction.will_breach:
            logger.info(
                "trend_breach_predicted",
                metric=metric_name,
                current=current_value,
                predicted=prediction.predicted_value,
                threshold=threshold,
                breach_in_hours=prediction.breach_in_hours,
                slope=prediction.slope_per_hour,
                r2=prediction.r_squared,
                shadow_mode=shadow_mode,
            )
        return prediction

    async def push_datapoint(
        self,
        metric_name: str,
        value: float,
    ) -> None:
        """Append one data point to the metric's history.

        Intended (per the original documentation) to be called by
        ProactiveInspector.
        """
        await self._append_history(metric_name, value)

    # ──────────────────────────────────────────────────────────────────────
    # Private Helpers
    # ──────────────────────────────────────────────────────────────────────
    def _linear_regression_predict(
        self,
        history: list[tuple[float, float]],  # (timestamp, value)
        current_value: float,
        threshold: float,
        forecast_hours: int,
        metric_name: str = "",
    ) -> TrendPrediction:
        """Fit ``y = slope * x + intercept`` and extrapolate it.

        ``x`` is the time in hours relative to the first history point and
        ``y`` is the metric value.

        Args:
            history: ``(timestamp_seconds, value)`` pairs, oldest first.
            current_value: Current observed value.
            threshold: Alert threshold.
            forecast_hours: Prediction window in hours.
            metric_name: Name stored on the returned result (optional for
                backward compatibility).

        Returns:
            A ``TrendPrediction`` with ``shadow_mode`` / ``detected_at`` left
            at their defaults. If the regression cannot be computed, a flat
            "low" confidence result based on ``current_value`` is returned.
        """
        try:
            import numpy as np

            times = np.array([point[0] for point in history], dtype=float)
            values = np.array([point[1] for point in history], dtype=float)

            # A slope is undefined when every sample shares one timestamp;
            # route that case to the fallback instead of a rank-deficient fit.
            if times.max() == times.min():
                raise ValueError("history spans zero time; slope is undefined")

            # Express time in hours relative to the first sample.
            times_rel = (times - times[0]) / 3600.0

            coeffs = np.polyfit(times_rel, values, 1)
            slope = float(coeffs[0])  # change per hour
            intercept = float(coeffs[1])
            if not (np.isfinite(slope) and np.isfinite(intercept)):
                raise ValueError("regression produced non-finite coefficients")

            # R² of the fit, clamped to the documented 0-1 range.
            fitted = np.polyval(coeffs, times_rel)
            ss_res = float(np.sum((values - fitted) ** 2))
            ss_tot = float(np.sum((values - np.mean(values)) ** 2))
            r2 = 1.0 - ss_res / ss_tot if ss_tot > 0 else 0.0
            r2 = max(0.0, min(1.0, r2))

            # Evaluate the fitted line forecast_hours after "now". Cast to a
            # plain float so no numpy scalar (and therefore no np.bool_ from
            # the comparison below) leaks into the result.
            now_rel = (now_taipei().timestamp() - float(times[0])) / 3600.0
            predicted_value = float(slope * (now_rel + forecast_hours) + intercept)

            if r2 >= TREND_CONFIDENCE_THRESHOLD:
                confidence = "high"
            elif r2 >= 0.4:
                confidence = "medium"
            else:
                confidence = "low"

            # A metric already at/above the threshold counts as a breach at
            # hour 0, matching the fallback paths.
            already_breached = bool(current_value >= threshold)
            will_breach = already_breached or predicted_value >= threshold

            breach_in_hours: float | None = None
            if already_breached:
                breach_in_hours = 0.0
            elif will_breach and slope > 0:
                # Solve current_value + slope * h = threshold for h, capped
                # at the forecast window.
                eta = round((threshold - current_value) / slope, 2)
                breach_in_hours = min(eta, float(forecast_hours))
            # Otherwise the fitted line is above the threshold while the
            # current sample is below it and not rising: no ETA is available.

            return TrendPrediction(
                metric_name=metric_name,
                current_value=current_value,
                threshold=threshold,
                forecast_hours=forecast_hours,
                predicted_value=round(predicted_value, 4),
                slope_per_hour=round(slope, 6),
                r_squared=round(r2, 4),
                will_breach=will_breach,
                breach_in_hours=breach_in_hours,
                confidence=confidence,
            )
        except Exception as e:
            # Deliberate circuit breaker: any numpy/import/data failure falls
            # back to extrapolating the most recent value.
            logger.warning("trend_regression_failed", metric=metric_name, error=str(e))
            return self._flat_result(
                metric_name,
                current_value,
                threshold,
                forecast_hours,
                confidence="low",
            )

    def _flat_result(
        self,
        metric_name: str,
        current_value: float,
        threshold: float,
        forecast_hours: int,
        *,
        confidence: str,
        shadow_mode: bool = True,
        detected_at: str = "",
    ) -> TrendPrediction:
        """Build a zero-slope result that only reflects the current value."""
        already_breached = bool(current_value >= threshold)
        return TrendPrediction(
            metric_name=metric_name,
            current_value=current_value,
            threshold=threshold,
            forecast_hours=forecast_hours,
            predicted_value=current_value,
            slope_per_hour=0.0,
            r_squared=0.0,
            will_breach=already_breached,
            breach_in_hours=0.0 if already_breached else None,
            confidence=confidence,
            shadow_mode=shadow_mode,
            detected_at=detected_at,
        )

    async def _append_history(self, metric_name: str, value: float) -> None:
        """Append a data point to the Redis sliding window.

        The list is trimmed to the newest ``MAX_HISTORY_POINTS`` entries and
        its expiry is reset. Failures are logged and swallowed (best effort).
        """
        try:
            import json

            from src.core.redis_client import get_redis

            r = get_redis()
            key = f"{REDIS_KEY_HISTORY}{metric_name}"
            point = json.dumps([now_taipei().timestamp(), value])
            await r.rpush(key, point)
            await r.ltrim(key, -MAX_HISTORY_POINTS, -1)  # keep the newest N points
            await r.expire(key, REDIS_TTL_HISTORY)
        except Exception as e:
            logger.warning("trend_history_append_failed", metric=metric_name, error=str(e))

    async def _get_history(self, metric_name: str) -> list[tuple[float, float]]:
        """Read the metric's history from Redis.

        Returns:
            ``(timestamp, value)`` float pairs in stored order. An empty list
            is returned when Redis cannot be read. Entries that are not a
            JSON pair of numbers are skipped instead of discarding the whole
            window.
        """
        try:
            import json

            from src.core.redis_client import get_redis

            r = get_redis()
            key = f"{REDIS_KEY_HISTORY}{metric_name}"
            raw = await r.lrange(key, 0, -1)
        except Exception as e:
            logger.warning("trend_history_get_failed", metric=metric_name, error=str(e))
            return []

        history: list[tuple[float, float]] = []
        skipped = 0
        for item in raw:
            try:
                ts, value = json.loads(item)
                history.append((float(ts), float(value)))
            except (TypeError, ValueError):
                # json.JSONDecodeError is a ValueError; wrong shapes/types
                # surface as TypeError or ValueError on unpack/convert.
                skipped += 1
        if skipped:
            logger.warning(
                "trend_history_entry_skipped",
                metric=metric_name,
                skipped=skipped,
            )
        return history

    def _no_data_result(
        self,
        metric_name: str,
        current_value: float,
        threshold: float,
        forecast_hours: int,
    ) -> TrendPrediction:
        """Neutral result returned when the feature flag is disabled.

        Unlike the other fallbacks, this never reports a breach.
        """
        return TrendPrediction(
            metric_name=metric_name,
            current_value=current_value,
            threshold=threshold,
            forecast_hours=forecast_hours,
            predicted_value=current_value,
            slope_per_hour=0.0,
            r_squared=0.0,
            will_breach=False,
            breach_in_hours=None,
            confidence="insufficient_data",
            shadow_mode=True,
            detected_at=now_taipei().isoformat(),
        )
# ─────────────────────────────────────────────────────────────────────────────
# Singleton
# ─────────────────────────────────────────────────────────────────────────────
_predictor: TrendPredictor | None = None


def get_trend_predictor() -> TrendPredictor:
    """Return the module-level TrendPredictor, creating it on first call."""
    global _predictor
    if _predictor is not None:
        return _predictor
    # First call: build the instance and remember it for later callers.
    _predictor = TrendPredictor()
    return _predictor