Files
2026FIFAWorldCup/platform/backend/app/analytics/ml_ensemble.py

436 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""機器學習賽果預測引擎Ensemble"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Any, Mapping
from uuid import uuid4
import numpy as np
import pandas as pd
try:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
except Exception: # pragma: no cover - 缺少 scikit-learn 時的 fallback
GradientBoostingClassifier = None
train_test_split = None
# Names of the three model input features; this tuple also fixes the column
# order of the frames used for training and prediction.
FEATURE_COLUMNS = ('rest_days_advantage', 'travel_distance_km', 'recent_5_xg_diff')
# Accepted ``match_result`` labels, listed in home / draw / away order.
OUTCOMES = ('home', 'draw', 'away')
def _sigmoid(value: float) -> float:
    """Return the logistic function ``1 / (1 + e**-value)`` of ``value``.

    The two branches are algebraically identical. Each one only ever
    exponentiates a non-positive number, so inputs of very large magnitude
    cannot overflow ``np.exp`` (which would otherwise emit a RuntimeWarning).

    Args:
        value: Scalar score to squash into the open interval (0, 1).

    Returns:
        The sigmoid of ``value`` as a plain ``float``.
    """
    if value >= 0:
        return float(1.0 / (1.0 + np.exp(-value)))
    exp_value = np.exp(value)
    return float(exp_value / (1.0 + exp_value))
def _softmax(values: np.ndarray) -> np.ndarray:
    """Normalise ``values`` into positive weights that sum to 1.

    The maximum is subtracted before exponentiating; this leaves the result
    unchanged mathematically while keeping the exponentials bounded by 1.
    """
    peak = np.max(values)
    weights = np.exp(values - peak)
    return weights / np.sum(weights)
@dataclass(frozen=True)
class EnsembleModelArtifact:
    """Immutable bundle of a prediction model and its metadata."""

    # Object whose ``predict_proba`` is called to obtain outcome probabilities.
    model: Any
    # Names of the feature columns handed to the model.
    feature_columns: tuple[str, ...]
    # Identifier of this artifact.
    model_id: str
    # Number of rows the artifact was built from.
    training_size: int
    # True when the rule-based fallback model is used.
    is_fallback: bool
    # Accuracy score recorded at training time; None when not measured.
    training_accuracy: float | None = None
class _FallbackMatchModel:
    """Rule-based fallback model used when no trained ML classifier is available."""

    # Columns read from the input frame, in the order the scoring rules unpack them.
    feature_columns = FEATURE_COLUMNS

    def predict_proba(self, row_df: pd.DataFrame) -> np.ndarray:
        """Return heuristic home/draw/away probabilities for every row.

        Each row gets three linear scores that are squashed with a sigmoid
        (the draw score is additionally damped by 0.9) and then normalised
        with softmax so the three values sum to 1.

        Args:
            row_df: Frame containing at least the ``feature_columns`` columns.

        Returns:
            Array of shape ``(len(row_df), 3)`` with columns ordered
            home, draw, away. An empty frame yields a ``(0, 3)`` array.
        """
        if row_df.empty:
            return np.zeros((0, 3))
        # pandas treats a tuple key as ONE column label, so indexing with the
        # tuple of names raises KeyError; a list selects the columns.
        x = row_df[list(self.feature_columns)].to_numpy(float)
        raw_scores = []
        for rest_days_advantage, travel_distance_km, recent_5_xg_diff in x:
            home_score = (
                0.6
                + rest_days_advantage * 0.022
                + recent_5_xg_diff * 0.34
                - travel_distance_km * 0.0012
            )
            # Draws become less likely as either side gains an advantage.
            draw_score = (
                0.30
                - abs(rest_days_advantage) * 0.015
                - abs(recent_5_xg_diff) * 0.22
            )
            away_score = (
                0.1
                - rest_days_advantage * 0.022
                - recent_5_xg_diff * 0.34
                + travel_distance_km * 0.0012
            )
            scores = np.array(
                [
                    _sigmoid(home_score),
                    _sigmoid(draw_score) * 0.9,
                    _sigmoid(away_score),
                ],
                dtype=float,
            )
            raw_scores.append(_softmax(scores))
        return np.vstack(raw_scores)
def _as_float(value: Any, default: float = 0.0) -> float:
    """Coerce ``value`` to ``float``, returning ``default`` when that fails.

    ``default`` is returned when ``value`` cannot be converted (``None``,
    non-numeric text), is an integer too large for a float, or converts to
    NaN. Treating NaN like a missing value keeps it from propagating into
    the derived features and the predicted probabilities.

    Args:
        value: Arbitrary payload value.
        default: Replacement used when ``value`` is unusable.

    Returns:
        The converted number, or ``default``.
    """
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers integers beyond float range, e.g. 10 ** 400.
        return default
    if np.isnan(number):
        return default
    return number
def normalize_feature_payload(payload: Mapping[str, Any]) -> dict[str, float]:
    """Convert the raw home/away inputs to floats and derive the model features.

    The six raw fields are read from ``payload`` (missing or unparseable
    values fall back to the ``_as_float`` default). The three derived
    features are always recomputed as home minus away from those raw
    fields; derived values already present in ``payload`` are not read.

    Returns:
        Dict holding the six raw fields followed by ``rest_days_advantage``,
        ``travel_distance_km`` and ``recent_5_xg_diff``.
    """
    raw_fields = (
        'home_rest_days',
        'away_rest_days',
        'home_travel_distance_km',
        'away_travel_distance_km',
        'recent_5_xg_home',
        'recent_5_xg_away',
    )
    features = {name: _as_float(payload.get(name)) for name in raw_fields}
    features['rest_days_advantage'] = (
        features['home_rest_days'] - features['away_rest_days']
    )
    features['travel_distance_km'] = (
        features['home_travel_distance_km'] - features['away_travel_distance_km']
    )
    features['recent_5_xg_diff'] = (
        features['recent_5_xg_home'] - features['recent_5_xg_away']
    )
    return features
def _validation_frame(rows: list[Mapping[str, Any]]) -> pd.DataFrame:
    """Build and validate the training frame.

    Feature columns are cast to float with missing values replaced by 0.0,
    and ``match_result`` labels are lower-cased and stripped.

    Args:
        rows: Training rows containing the feature columns and
            ``match_result``.

    Returns:
        The cleaned frame.

    Raises:
        ValueError: If fewer than 5 rows are given, a required column is
            missing, or a label is not one of ``OUTCOMES``.
    """
    if len(rows) < 5:
        raise ValueError('訓練樣本少於 5 筆,無法完成穩定訓練')
    frame = pd.DataFrame(rows)
    required_fields = set(FEATURE_COLUMNS) | {'match_result'}
    missing = required_fields - set(frame.columns)
    if missing:
        raise ValueError(f'訓練資料缺欄位:{sorted(missing)}')
    feature_names = list(FEATURE_COLUMNS)
    frame[feature_names] = frame[feature_names].astype(float).fillna(0.0)
    # Cast to str first: the ``.str`` accessor rejects non-string columns and
    # turns non-string cells into NaN, which would surface as AttributeError or
    # TypeError instead of the unknown-label ValueError below.
    frame['match_result'] = frame['match_result'].astype(str).str.lower().str.strip()
    unknown = set(frame['match_result']) - set(OUTCOMES)
    if unknown:
        raise ValueError(f'未知賽果標籤:{sorted(unknown)},僅支援 {OUTCOMES}')
    return frame
def build_default_ml_training_rows() -> list[dict[str, float | str]]:
    """Return the bundled sample rows used when no external training data is available.

    Every call builds a fresh list of fresh dicts. Each dict carries the six
    raw home/away fields, the three derived features and the ``match_result``
    label, in that key order.
    """
    field_names = (
        'home_rest_days',
        'away_rest_days',
        'home_travel_distance_km',
        'away_travel_distance_km',
        'recent_5_xg_home',
        'recent_5_xg_away',
        'rest_days_advantage',
        'travel_distance_km',
        'recent_5_xg_diff',
        'match_result',
    )
    # One tuple per match, values in ``field_names`` order.
    sample_table = (
        (4, 3, 520, 1100, 1.8, 1.0, 1, -580, 0.8, 'home'),
        (2, 5, 220, 780, 1.1, 1.7, -3, -560, -0.6, 'away'),
        (6, 4, 120, 960, 2.3, 1.8, 2, -840, 0.5, 'home'),
        (3, 3, 900, 900, 1.2, 1.3, 0, 0, -0.1, 'draw'),
        (8, 2, 350, 700, 2.0, 1.2, 6, -350, 0.8, 'home'),
        (1, 2, 1600, 2500, 1.4, 2.1, -1, -900, -0.7, 'away'),
        (5, 5, 700, 700, 1.9, 1.9, 0, 0, 0.0, 'draw'),
        (9, 3, 400, 300, 2.4, 1.1, 6, 100, 1.3, 'home'),
        (2, 7, 1800, 250, 1.0, 1.5, -5, 1550, -0.5, 'away'),
        (4, 4, 500, 500, 1.6, 1.4, 0, 0, 0.2, 'home'),
        (6, 1, 300, 1200, 2.8, 0.8, 5, -900, 2.0, 'home'),
        (2, 6, 1000, 200, 1.0, 2.6, -4, 800, -1.6, 'away'),
        (7, 7, 650, 650, 1.8, 1.8, 0, 0, 0.0, 'draw'),
        (3, 1, 260, 900, 2.1, 1.6, 2, -640, 0.5, 'home'),
        (0, 5, 1500, 150, 1.2, 2.0, -5, 1350, -0.8, 'away'),
        (5, 2, 300, 300, 2.2, 1.1, 3, 0, 1.1, 'home'),
        (4, 8, 450, 980, 1.5, 2.4, -4, -530, -0.9, 'away'),
    )
    return [dict(zip(field_names, values)) for values in sample_table]
def train_match_outcome_ensemble(
    training_rows: list[Mapping[str, Any]],
    *,
    model_id: str | None = None,
) -> EnsembleModelArtifact:
    """Train the 1X2 outcome classifier, or return the rule-based fallback.

    The fallback model is used when fewer than 24 validated rows are
    available or when scikit-learn could not be imported. Otherwise a
    gradient-boosting classifier is fitted on a stratified train split and
    scored on the held-out split.

    Args:
        training_rows: Raw rows; each is normalised and validated first.
        model_id: Identifier for the artifact; a random hex id is generated
            when omitted or empty.

    Returns:
        The artifact. ``training_accuracy`` is the held-out accuracy, or
        ``None`` for the fallback model or when the held-out split contains
        a single class.
    """
    frame = _validation_frame([_normalize_training_row(row) for row in training_rows])
    sample_count = len(frame)
    sklearn_ready = GradientBoostingClassifier is not None and train_test_split is not None

    if sample_count < 24 or not sklearn_ready:
        return EnsembleModelArtifact(
            model=_FallbackMatchModel(),
            feature_columns=FEATURE_COLUMNS,
            model_id=model_id or uuid4().hex,
            training_size=sample_count,
            is_fallback=True,
            training_accuracy=None,
        )

    inputs = frame[list(FEATURE_COLUMNS)]
    # Integer class codes: home=0, draw=1, away=2.
    targets = frame['match_result'].map({'home': 0, 'draw': 1, 'away': 2})
    holdout_share = min(0.3, max(0.15, 1 - (30 / sample_count)))
    inputs_fit, inputs_holdout, targets_fit, targets_holdout = train_test_split(
        inputs,
        targets,
        test_size=holdout_share,
        random_state=17,
        stratify=targets,
    )

    classifier = GradientBoostingClassifier(
        random_state=17,
        n_estimators=220,
        max_depth=3,
        learning_rate=0.06,
    )
    classifier.fit(inputs_fit, targets_fit)

    # Skip scoring when the held-out split holds only one class.
    if len(set(targets_holdout)) > 1:
        holdout_accuracy = float(classifier.score(inputs_holdout, targets_holdout))
    else:
        holdout_accuracy = None

    return EnsembleModelArtifact(
        model=classifier,
        feature_columns=FEATURE_COLUMNS,
        model_id=model_id or uuid4().hex,
        training_size=sample_count,
        is_fallback=False,
        training_accuracy=holdout_accuracy,
    )
def _normalize_training_row(row: Mapping[str, Any]) -> dict[str, float | str]:
    """Normalise one training row and attach its cleaned ``match_result`` label.

    Raises:
        ValueError: If ``row`` has no ``match_result`` key.
    """
    features = normalize_feature_payload(row)
    if 'match_result' not in row:
        raise ValueError('訓練資料缺少 match_result')
    label = str(row['match_result']).strip().lower()
    return {**features, 'match_result': label}
def build_default_ensemble_artifact() -> EnsembleModelArtifact:
    """Build the system default artifact (id ``'default'``) from the bundled sample rows.

    The result may be the rule-based fallback, depending on what training
    decides for that data.
    """
    default_rows = build_default_ml_training_rows()
    return train_match_outcome_ensemble(default_rows, model_id='default')
def model_predict_probabilities(
    artifact: EnsembleModelArtifact,
    features: Mapping[str, Any],
) -> dict[str, float]:
    """Return the home/draw/away probabilities for a single match.

    Args:
        artifact: Artifact whose ``model`` provides ``predict_proba``.
        features: Raw feature payload; it is normalised before prediction.

    Returns:
        Dict with the keys ``'home'``, ``'draw'`` and ``'away'``. An outcome
        the model has no class for is reported as 0.0.
    """
    normalized = normalize_feature_payload(features)
    # Pass the column names as a list so a tuple is never taken for one label.
    feature_frame = pd.DataFrame([normalized], columns=list(artifact.feature_columns))
    probs = artifact.model.predict_proba(feature_frame)[0]

    # scikit-learn returns one column per class seen while fitting, ordered
    # like ``classes_``; a model fitted without, say, any draws has only two
    # columns. Models without ``classes_`` are taken to return all three
    # columns in home/draw/away order.
    class_codes = getattr(artifact.model, 'classes_', None)
    if class_codes is None:
        class_codes = range(len(probs))

    # Mirrors the home=0, draw=1, away=2 encoding used when training.
    outcome_by_code = ('home', 'draw', 'away')
    result = {'home': 0.0, 'draw': 0.0, 'away': 0.0}
    for code, prob in zip(class_codes, probs):
        result[outcome_by_code[int(code)]] = float(prob)
    return result
def calculate_model_edges(
    predicted: dict[str, float],
    implied: dict[str, float],
    *,
    strong_buy_threshold: float = 0.04,
) -> dict[str, dict[str, float | bool]]:
    """Compare model probabilities with bookmaker-implied ones per outcome.

    Args:
        predicted: Model probability per outcome; missing outcomes count as 0.
        implied: Implied probability per outcome; missing outcomes count as 0.
        strong_buy_threshold: Minimum edge for the ``strong_buy`` flag.

    Returns:
        For every outcome a dict with ``model_prob``, ``implied_prob`` and
        ``edge`` (each rounded to 6 decimals) plus the ``strong_buy`` flag.
    """
    edges: dict[str, dict[str, float | bool]] = {}
    for key in OUTCOMES:
        p = float(predicted.get(key, 0))
        i = float(implied.get(key, 0))
        # Compare the rounded edge: binary floats make e.g. 0.29 - 0.25 come
        # out just below 0.04, which would contradict the reported edge.
        edge = round(p - i, 6)
        edges[key] = {
            'model_prob': round(p, 6),
            'implied_prob': round(i, 6),
            'edge': edge,
            'strong_buy': edge >= strong_buy_threshold,
        }
    return edges