Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 12m32s
## Phase 4 完整交付(ADR-084) ### 新增服務 - trend_predictor.py: numpy 線性回歸,4h 閾值突破預警,R² 信心評分 - proactive_inspector.py: 每 5 分鐘主動巡檢協調器 - DynamicBaselineService(3σ 偏離) - LogAnomalyDetector(新 Drain3 pattern) - TrendPredictor(斜率外推 4h 預測) - Shadow Mode + 30 分鐘去重 + Holt-Winters 背景重訓 ### 8D 感官升級(EvidenceSnapshot Phase 4 增強) - PreDecisionInvestigator._collect_phase4_anomalies(): 決策前讀取 ProactiveInspector 最近巡檢快取 + LogAnomalyDetector 新 pattern - EvidenceSnapshot.anomaly_context: 新欄位,Phase 4 動態異常上下文 - DiagnosticianAgent._build_prompt(): prompt 包含 anomaly_context, LLM RCA 可參考動態基線偏差與趨勢預警 ### 資料庫遷移 - incident_evidence: ADD COLUMN anomaly_context JSONB(冪等) ### main.py - 啟動 run_proactive_inspector_loop() asyncio task 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 4 全部完成 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
307 lines
12 KiB
Python
307 lines
12 KiB
Python
"""
|
||
AWOOOI AIOps Phase 4 — Trend Predictor(趨勢預測)
|
||
===================================================
|
||
職責:numpy 線性回歸,預測 metric 在未來 N 小時是否超越警戒閾值
|
||
|
||
核心 API:
|
||
predict_breach(metric_name, current_value, threshold) -> TrendPrediction
|
||
|
||
設計原則:
|
||
- 不使用 Prophet(500MB+ Stan 依賴),改用 numpy 線性回歸
|
||
- 從 DynamicBaselineRecord 取歷史窗口資料,計算趨勢斜率
|
||
- Shadow Mode:預測只記錄 logger.info,不觸發 Alert
|
||
- 熔斷:numpy 失敗 → fallback 到最近值外推
|
||
|
||
ADR-084: Phase 4 動態異常偵測源頭升級
|
||
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 4 初始建立
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from dataclasses import dataclass
|
||
from typing import Any
|
||
|
||
import structlog
|
||
|
||
from src.utils.timezone import now_taipei
|
||
|
||
logger = structlog.get_logger(__name__)
|
||
|
||
# ── 常數 ────────────────────────────────────────────────────────────────────
|
||
# Default forecast horizon, in hours.
FORECAST_HOURS = 4
# Minimum number of stored data points required before a regression is attempted.
# NOTE(review): the original comment equates this with "12h", which assumes one
# data point per hour — confirm against the real sampling cadence.
MIN_DATAPOINTS_FOR_TREND = 12
# R² at or above this value is reported as "high" confidence.
TREND_CONFIDENCE_THRESHOLD = 0.7
# Key prefix; the metric name is appended to form one Redis list per metric,
# each element being a JSON-encoded [timestamp, value] pair.
REDIS_KEY_HISTORY = "trend:history:"
# TTL in seconds (2 days). It is re-applied on every append, so a key only
# expires after two days without new data points.
REDIS_TTL_HISTORY = 2 * 86400
# Upper bound on stored points per metric (14 days × 24 points per day,
# i.e. 336 — presumably assuming hourly samples; TODO confirm).
MAX_HISTORY_POINTS = 14 * 24
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Data Types
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
@dataclass
|
||
class TrendPrediction:
|
||
"""趨勢預測結果"""
|
||
metric_name: str
|
||
current_value: float
|
||
threshold: float
|
||
forecast_hours: int # 預測時間窗口(小時)
|
||
predicted_value: float # 預測在 forecast_hours 後的值
|
||
slope_per_hour: float # 每小時變化量(線性斜率)
|
||
r_squared: float # 線性回歸 R²(0-1,越高趨勢越清晰)
|
||
will_breach: bool # 是否預測超越 threshold
|
||
breach_in_hours: float | None # 預計幾小時後超越(None = 不會超越)
|
||
confidence: str # high / medium / low / insufficient_data
|
||
shadow_mode: bool = True
|
||
detected_at: str = ""
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Main Service
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
class TrendPredictor:
    """
    Trend prediction service.

    Workflow:
      1. Read the recent data points for the metric from Redis (sliding window).
      2. Fit a numpy linear regression to obtain the trend slope.
      3. Extrapolate the value ``forecast_hours`` ahead (4h by default).
      4. Decide whether the alert threshold is reached within that window.
    """

    async def predict_breach(
        self,
        metric_name: str,
        current_value: float,
        threshold: float,
        forecast_hours: int = FORECAST_HOURS,
    ) -> TrendPrediction:
        """
        Predict whether a metric will reach ``threshold`` within ``forecast_hours``.

        The current value is appended to the metric's history before the
        history is read back, so the newest point takes part in the fit.

        Args:
            metric_name: Baseline identifier (should match the name used by
                DynamicBaselineService).
            current_value: Currently observed value.
            threshold: Alert threshold (reaching it counts as a breach).
            forecast_hours: Forecast window in hours (defaults to FORECAST_HOURS).

        Returns:
            A TrendPrediction. When the feature flag is off an empty
            "insufficient_data" result is returned; when fewer than
            MIN_DATAPOINTS_FOR_TREND points are stored the result simply
            reflects whether the current value already meets the threshold.
        """
        from src.core.feature_flags import aiops_flags

        if not aiops_flags.AIOPS_P4_TREND_PREDICTOR:
            return self._no_data_result(metric_name, current_value, threshold, forecast_hours)

        shadow_mode = aiops_flags.AIOPS_P4_SHADOW_MODE
        detected_at = now_taipei().isoformat()

        # Push the current value into the history first.
        await self._append_history(metric_name, current_value)

        # Read the history back.
        history = await self._get_history(metric_name)
        if len(history) < MIN_DATAPOINTS_FOR_TREND:
            already_breached = current_value >= threshold
            return TrendPrediction(
                metric_name=metric_name,
                current_value=current_value,
                threshold=threshold,
                forecast_hours=forecast_hours,
                predicted_value=current_value,
                slope_per_hour=0.0,
                r_squared=0.0,
                will_breach=already_breached,
                breach_in_hours=0.0 if already_breached else None,
                confidence="insufficient_data",
                shadow_mode=shadow_mode,
                detected_at=detected_at,
            )

        prediction = self._linear_regression_predict(
            history=history,
            current_value=current_value,
            threshold=threshold,
            forecast_hours=forecast_hours,
        )
        # The regression helper does not know the metric name; fill it in here
        # so the returned prediction is attributable to its metric.
        prediction.metric_name = metric_name
        prediction.shadow_mode = shadow_mode
        prediction.detected_at = detected_at

        if prediction.will_breach:
            logger.info(
                "trend_breach_predicted",
                metric=metric_name,
                current=current_value,
                predicted=prediction.predicted_value,
                threshold=threshold,
                breach_in_hours=prediction.breach_in_hours,
                slope=prediction.slope_per_hour,
                r2=prediction.r_squared,
                shadow_mode=shadow_mode,
            )

        return prediction

    async def push_datapoint(
        self,
        metric_name: str,
        value: float,
    ) -> None:
        """Append one data point to the metric's history (intended for ProactiveInspector)."""
        await self._append_history(metric_name, value)

    # ──────────────────────────────────────────────────────────────────────────
    # Private Helpers
    # ──────────────────────────────────────────────────────────────────────────

    def _linear_regression_predict(
        self,
        history: list[tuple[float, float]],  # (timestamp, value)
        current_value: float,
        threshold: float,
        forecast_hours: int,
    ) -> TrendPrediction:
        """
        Fit ``y = slope * x + intercept`` with numpy and extrapolate.

        ``x`` is the time in hours relative to the first history point and
        ``y`` is the metric value. Any exception raised while fitting
        (including non-finite coefficients) results in a fallback prediction
        that simply carries the current value forward with "low" confidence.

        The returned prediction has an empty ``metric_name``, plus the
        dataclass defaults for ``shadow_mode`` and ``detected_at``; the caller
        is expected to fill those in.
        """
        metric_name = ""  # filled in by the caller

        try:
            import numpy as np

            times = np.array([h[0] for h in history], dtype=float)
            values = np.array([h[1] for h in history], dtype=float)

            # Make time relative to the first point; timestamps are epoch
            # seconds, so dividing by 3600 yields hours.
            times_rel = (times - times[0]) / 3600.0

            # Linear regression: polyfit with degree 1.
            coeffs = np.polyfit(times_rel, values, 1)
            slope = float(coeffs[0])  # change per hour
            intercept = float(coeffs[1])
            if not (np.isfinite(slope) and np.isfinite(intercept)):
                # Degenerate input (e.g. NaN values): use the fallback below
                # rather than returning NaN fields.
                raise ValueError("non-finite regression coefficients")

            # R² of the fit; defined as 0 when the values have no variance.
            fitted = np.polyval(coeffs, times_rel)
            ss_res = float(np.sum((values - fitted) ** 2))
            ss_tot = float(np.sum((values - np.mean(values)) ** 2))
            r2 = 1.0 - ss_res / ss_tot if ss_tot > 0 else 0.0

            # Value of the fitted line ``forecast_hours`` from now.
            current_time_rel = (now_taipei().timestamp() - times[0]) / 3600.0
            predicted_value = slope * (current_time_rel + forecast_hours) + intercept

            # Confidence bucket derived from R².
            if r2 >= TREND_CONFIDENCE_THRESHOLD:
                confidence = "high"
            elif r2 >= 0.4:
                confidence = "medium"
            else:
                confidence = "low"

            # Breach decision is based on the extrapolated line.
            will_breach = predicted_value >= threshold
            breach_in_hours: float | None = None

            # NOTE(review): will_breach can be True while breach_in_hours stays
            # None, namely when the fitted line ends at or above the threshold
            # but slope <= 0 and current_value is still below it.
            if will_breach and slope > 0 and current_value < threshold:
                # Solve current_value + slope * h = threshold for h, capped at
                # the forecast window.
                breach_in_hours = round((threshold - current_value) / slope, 2)
                breach_in_hours = min(breach_in_hours, float(forecast_hours))
            elif current_value >= threshold:
                breach_in_hours = 0.0

            return TrendPrediction(
                metric_name=metric_name,
                current_value=current_value,
                threshold=threshold,
                forecast_hours=forecast_hours,
                predicted_value=round(predicted_value, 4),
                slope_per_hour=round(slope, 6),
                r_squared=round(r2, 4),
                will_breach=will_breach,
                breach_in_hours=breach_in_hours,
                confidence=confidence,
            )

        except Exception as e:
            logger.warning("trend_regression_failed", error=str(e))
            # Fallback: carry the most recent value forward unchanged.
            return TrendPrediction(
                metric_name=metric_name,
                current_value=current_value,
                threshold=threshold,
                forecast_hours=forecast_hours,
                predicted_value=current_value,
                slope_per_hour=0.0,
                r_squared=0.0,
                will_breach=current_value >= threshold,
                breach_in_hours=0.0 if current_value >= threshold else None,
                confidence="low",
            )

    async def _append_history(self, metric_name: str, value: float) -> None:
        """
        Append a data point to the metric's Redis list and trim old entries.

        The list is capped at MAX_HISTORY_POINTS (oldest entries are dropped)
        and its TTL is reset to REDIS_TTL_HISTORY. Failures are logged as
        warnings and otherwise ignored (best effort).
        """
        try:
            import json
            from src.core.redis_client import get_redis

            r = get_redis()
            key = f"{REDIS_KEY_HISTORY}{metric_name}"
            point = json.dumps([now_taipei().timestamp(), value])
            await r.rpush(key, point)
            await r.ltrim(key, -MAX_HISTORY_POINTS, -1)  # keep the newest N points
            await r.expire(key, REDIS_TTL_HISTORY)
        except Exception as e:
            logger.warning("trend_history_append_failed", metric=metric_name, error=str(e))

    async def _get_history(self, metric_name: str) -> list[tuple[float, float]]:
        """
        Load all stored data points for the metric from Redis.

        Returns:
            The decoded entries of the metric's list, in list order; an empty
            list if reading or decoding fails (the failure is logged).
        """
        try:
            import json
            from src.core.redis_client import get_redis

            r = get_redis()
            key = f"{REDIS_KEY_HISTORY}{metric_name}"
            raw = await r.lrange(key, 0, -1)
            return [tuple(json.loads(item)) for item in raw]
        except Exception as e:
            logger.warning("trend_history_get_failed", metric=metric_name, error=str(e))
            return []

    def _no_data_result(
        self,
        metric_name: str,
        current_value: float,
        threshold: float,
        forecast_hours: int,
    ) -> TrendPrediction:
        """Build the empty, non-breaching result returned when the feature flag is off."""
        return TrendPrediction(
            metric_name=metric_name,
            current_value=current_value,
            threshold=threshold,
            forecast_hours=forecast_hours,
            predicted_value=current_value,
            slope_per_hour=0.0,
            r_squared=0.0,
            will_breach=False,
            breach_in_hours=None,
            confidence="insufficient_data",
            shadow_mode=True,
            detected_at=now_taipei().isoformat(),
        )
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Singleton
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Module-wide cached instance; created on first request.
_predictor: TrendPredictor | None = None


def get_trend_predictor() -> TrendPredictor:
    """Return the shared TrendPredictor, creating it on the first call."""
    global _predictor
    if _predictor is not None:
        return _predictor
    _predictor = TrendPredictor()
    return _predictor
|