"""Machine-learning match-outcome prediction engine (ensemble)."""
|
||
|
||
from __future__ import annotations
|
||
|
||
from dataclasses import dataclass
|
||
from typing import Any, Mapping
|
||
from uuid import uuid4
|
||
|
||
import numpy as np
|
||
import pandas as pd
|
||
|
||
try:
|
||
from sklearn.ensemble import GradientBoostingClassifier
|
||
from sklearn.model_selection import train_test_split
|
||
except Exception: # pragma: no cover - 缺少 scikit-learn 時的 fallback
|
||
GradientBoostingClassifier = None
|
||
train_test_split = None
|
||
|
||
|
||
# Names of the model input columns, in the order used for fitting and prediction.
FEATURE_COLUMNS = (
    'rest_days_advantage',
    'travel_distance_km',
    'recent_5_xg_diff',
)

# Supported 1X2 result labels.
OUTCOMES = (
    'home',
    'draw',
    'away',
)
|
||
|
||
|
||
def _sigmoid(value: float) -> float:
|
||
return 1.0 / (1.0 + np.exp(-value))
|
||
|
||
|
||
def _softmax(values: np.ndarray) -> np.ndarray:
|
||
shifted = values - np.max(values)
|
||
exp_values = np.exp(shifted)
|
||
return exp_values / exp_values.sum()
|
||
|
||
|
||
@dataclass(frozen=True)
class EnsembleModelArtifact:
    """Immutable bundle of a trained outcome model and its metadata."""

    # Object exposing ``predict_proba``; a fitted estimator or the rule-based fallback.
    model: Any
    # Column names, in order, that ``model`` expects as input.
    feature_columns: tuple[str, ...]
    # Identifier of this artifact.
    model_id: str
    # Number of validated rows the artifact was built from.
    training_size: int
    # True when ``model`` is the rule-based fallback rather than a fitted estimator.
    is_fallback: bool
    # Accuracy score recorded at training time, when one was computed.
    training_accuracy: float | None = None
|
||
|
||
|
||
class _FallbackMatchModel:
    """Rule-based stand-in model used when no trained estimator is available.

    It offers the same ``predict_proba`` entry point the module calls on a
    fitted classifier, so it can be stored in an artifact in its place.
    """

    feature_columns = FEATURE_COLUMNS

    def predict_proba(self, row_df: pd.DataFrame) -> np.ndarray:
        """Return heuristic home/draw/away probabilities for every row.

        Args:
            row_df: Frame containing each column named in ``feature_columns``.

        Returns:
            Array of shape ``(len(row_df), 3)`` whose columns are ordered
            home, draw, away and whose rows each sum to 1. An empty frame
            yields a ``(0, 3)`` array.

        Raises:
            KeyError: If a required feature column is missing from ``row_df``.
        """
        if row_df.empty:
            return np.zeros((0, 3))

        # Select with a list: pandas interprets a tuple key as ONE column
        # label, so indexing with the tuple itself raises KeyError.
        features = row_df[list(self.feature_columns)].to_numpy(float)
        return np.vstack([self._row_probabilities(*row) for row in features])

    @staticmethod
    def _row_probabilities(
        rest_days_advantage: float,
        travel_distance_km: float,
        recent_5_xg_diff: float,
    ) -> np.ndarray:
        """Score one fixture and normalise the three scores to probabilities."""
        # Home and away use mirrored feature weights; only the intercepts
        # differ (0.6 vs 0.1), which gives the home side a built-in advantage.
        home_score = 0.6 + rest_days_advantage * 0.022 + recent_5_xg_diff * 0.34 - travel_distance_km * 0.0012
        # The draw score shrinks as either side gains a rest or xG advantage.
        draw_score = 0.30 - abs(rest_days_advantage) * 0.015 - abs(recent_5_xg_diff) * 0.22
        away_score = 0.1 - rest_days_advantage * 0.022 - recent_5_xg_diff * 0.34 + travel_distance_km * 0.0012

        scores = np.array(
            [
                _sigmoid(home_score),
                _sigmoid(draw_score) * 0.9,  # draws are damped by a further 10%
                _sigmoid(away_score),
            ],
            dtype=float,
        )
        return _softmax(scores)
|
||
|
||
|
||
def _as_float(value: Any, default: float = 0.0) -> float:
|
||
try:
|
||
return float(value)
|
||
except (TypeError, ValueError):
|
||
return default
|
||
|
||
|
||
def normalize_feature_payload(payload: Mapping[str, Any]) -> dict[str, float]:
    """Extract the three core model features from a front-end or database payload.

    Six raw fields are read from ``payload`` and coerced with ``_as_float``.
    The three derived features are always computed from those raw fields as
    home minus away; derived keys already present in ``payload`` are not read.

    Args:
        payload: Mapping that may contain the raw home/away fields.

    Returns:
        Dict holding the six raw values followed by ``rest_days_advantage``,
        ``travel_distance_km`` and ``recent_5_xg_diff``.
    """
    raw_keys = (
        'home_rest_days',
        'away_rest_days',
        'home_travel_distance_km',
        'away_travel_distance_km',
        'recent_5_xg_home',
        'recent_5_xg_away',
    )
    raw = {key: _as_float(payload.get(key)) for key in raw_keys}

    return {
        **raw,
        'rest_days_advantage': raw['home_rest_days'] - raw['away_rest_days'],
        'travel_distance_km': raw['home_travel_distance_km'] - raw['away_travel_distance_km'],
        'recent_5_xg_diff': raw['recent_5_xg_home'] - raw['recent_5_xg_away'],
    }
|
||
|
||
|
||
def _validation_frame(rows: list[Mapping[str, Any]]) -> pd.DataFrame:
    """Validate training rows and return them as a cleaned DataFrame.

    Args:
        rows: Training samples; each must provide every column in
            ``FEATURE_COLUMNS`` plus ``match_result``.

    Returns:
        A frame whose feature columns are floats with missing values replaced
        by 0.0 and whose ``match_result`` labels are lower-cased and stripped.

    Raises:
        ValueError: If there are fewer than 5 rows, a required column is
            missing, a feature value cannot be converted to float, or a label
            is not one of ``OUTCOMES``.
    """
    if len(rows) < 5:
        raise ValueError('訓練樣本少於 5 筆,無法完成穩定訓練')

    frame = pd.DataFrame(rows)
    feature_names = list(FEATURE_COLUMNS)
    missing = (set(feature_names) | {'match_result'}) - set(frame.columns)
    if missing:
        raise ValueError(f'訓練資料缺欄位:{sorted(missing)}')

    frame = frame.copy()
    frame[feature_names] = frame[feature_names].astype(float).fillna(0.0)
    # Cast to str before using the .str accessor: a non-string label column
    # would raise AttributeError, and NaN/None mixed with strings would make
    # sorted() below raise TypeError instead of the intended ValueError.
    frame['match_result'] = frame['match_result'].astype(str).str.lower().str.strip()

    unknown = set(frame['match_result']) - set(OUTCOMES)
    if unknown:
        raise ValueError(f'未知賽果標籤:{sorted(unknown)},僅支援 {OUTCOMES}')
    return frame
|
||
|
||
|
||
def build_default_ml_training_rows() -> list[dict[str, float | str]]:
    """Build the bundled fallback training samples.

    Used when external training data cannot be fetched at runtime. Every call
    returns a fresh list of fresh dicts, so callers may mutate the result.
    """
    columns = (
        'home_rest_days',
        'away_rest_days',
        'home_travel_distance_km',
        'away_travel_distance_km',
        'recent_5_xg_home',
        'recent_5_xg_away',
        'rest_days_advantage',
        'travel_distance_km',
        'recent_5_xg_diff',
        'match_result',
    )
    # One tuple per fixture, in the column order declared above.
    table = (
        (4, 3, 520, 1100, 1.8, 1.0, 1, -580, 0.8, 'home'),
        (2, 5, 220, 780, 1.1, 1.7, -3, -560, -0.6, 'away'),
        (6, 4, 120, 960, 2.3, 1.8, 2, -840, 0.5, 'home'),
        (3, 3, 900, 900, 1.2, 1.3, 0, 0, -0.1, 'draw'),
        (8, 2, 350, 700, 2.0, 1.2, 6, -350, 0.8, 'home'),
        (1, 2, 1600, 2500, 1.4, 2.1, -1, -900, -0.7, 'away'),
        (5, 5, 700, 700, 1.9, 1.9, 0, 0, 0.0, 'draw'),
        (9, 3, 400, 300, 2.4, 1.1, 6, 100, 1.3, 'home'),
        (2, 7, 1800, 250, 1.0, 1.5, -5, 1550, -0.5, 'away'),
        (4, 4, 500, 500, 1.6, 1.4, 0, 0, 0.2, 'home'),
        (6, 1, 300, 1200, 2.8, 0.8, 5, -900, 2.0, 'home'),
        (2, 6, 1000, 200, 1.0, 2.6, -4, 800, -1.6, 'away'),
        (7, 7, 650, 650, 1.8, 1.8, 0, 0, 0.0, 'draw'),
        (3, 1, 260, 900, 2.1, 1.6, 2, -640, 0.5, 'home'),
        (0, 5, 1500, 150, 1.2, 2.0, -5, 1350, -0.8, 'away'),
        (5, 2, 300, 300, 2.2, 1.1, 3, 0, 1.1, 'home'),
        (4, 8, 450, 980, 1.5, 2.4, -4, -530, -0.9, 'away'),
    )
    return [dict(zip(columns, record)) for record in table]
|
||
|
||
|
||
def train_match_outcome_ensemble(
    training_rows: list[Mapping[str, Any]],
    *,
    model_id: str | None = None,
) -> EnsembleModelArtifact:
    """Train the 1X2 outcome model, falling back to the rule-based model.

    The rule-based fallback is returned (with ``training_accuracy=None``) when
    scikit-learn is unavailable, when fewer than 24 validated rows are given,
    or when any of the three outcomes has fewer than 2 samples. The last case
    would otherwise make the stratified split raise, or produce a model that
    lacks a probability column for the missing outcome.

    Args:
        training_rows: Raw training samples; each needs a ``match_result``.
        model_id: Identifier to store on the artifact; a random hex id is
            generated when omitted or empty.

    Returns:
        The artifact wrapping either the fitted classifier or the fallback.

    Raises:
        ValueError: If a row lacks ``match_result`` or validation of the
            normalised rows fails.
    """
    normalized = [_normalize_training_row(row) for row in training_rows]
    frame = _validation_frame(normalized)
    resolved_id = model_id or uuid4().hex

    x = frame[list(FEATURE_COLUMNS)]
    y = frame['match_result'].map({'home': 0, 'draw': 1, 'away': 2})

    # Stratified splitting needs every class present with at least 2 members.
    class_counts = y.value_counts()
    too_sparse = len(class_counts) < len(OUTCOMES) or class_counts.min() < 2

    if (
        len(frame) < 24
        or too_sparse
        or GradientBoostingClassifier is None
        or train_test_split is None
    ):
        return EnsembleModelArtifact(
            model=_FallbackMatchModel(),
            feature_columns=FEATURE_COLUMNS,
            model_id=resolved_id,
            training_size=len(frame),
            is_fallback=True,
            training_accuracy=None,
        )

    x_train, x_val, y_train, y_val = train_test_split(
        x,
        y,
        # Validation share grows with the sample size, clamped to [0.15, 0.30].
        test_size=min(0.3, max(0.15, 1 - (30 / len(frame)))),
        random_state=17,
        stratify=y,
    )

    model = GradientBoostingClassifier(
        random_state=17,
        n_estimators=220,
        max_depth=3,
        learning_rate=0.06,
    )
    model.fit(x_train, y_train)
    # Accuracy is only recorded when the validation split has more than one class.
    accuracy = float(model.score(x_val, y_val)) if len(set(y_val)) > 1 else None

    return EnsembleModelArtifact(
        model=model,
        feature_columns=FEATURE_COLUMNS,
        model_id=resolved_id,
        training_size=len(frame),
        is_fallback=False,
        training_accuracy=accuracy,
    )
|
||
|
||
|
||
def _normalize_training_row(row: Mapping[str, Any]) -> dict[str, float | str]:
    """Normalise one training sample and attach its cleaned result label.

    Args:
        row: Raw training sample.

    Returns:
        The normalised feature dict plus ``match_result`` as a stripped,
        lower-cased string.

    Raises:
        ValueError: If ``row`` has no ``match_result`` key.
    """
    features = normalize_feature_payload(row)
    if 'match_result' not in row:
        raise ValueError('訓練資料缺少 match_result')

    label = str(row['match_result']).strip().lower()
    return {**features, 'match_result': label}
|
||
|
||
|
||
def build_default_ensemble_artifact() -> EnsembleModelArtifact:
    """Build the system default model artifact (may be the fallback model).

    Trains on the bundled default rows and stores the id ``'default'``.
    """
    default_rows = build_default_ml_training_rows()
    return train_match_outcome_ensemble(default_rows, model_id='default')
|
||
|
||
|
||
def model_predict_probabilities(
    artifact: EnsembleModelArtifact,
    features: Mapping[str, Any],
) -> dict[str, float]:
    """Return the home/draw/away probabilities for a single fixture.

    Args:
        artifact: Artifact whose ``model`` exposes ``predict_proba`` and whose
            ``feature_columns`` names the model inputs.
        features: Raw feature payload; it is normalised before prediction.

    Returns:
        Dict with keys ``home``, ``draw`` and ``away``. An outcome the model
        has no probability column for is reported as 0.0.
    """
    normalized = normalize_feature_payload(features)
    feature_frame = pd.DataFrame([normalized], columns=list(artifact.feature_columns))
    model = artifact.model
    probs = model.predict_proba(feature_frame)[0]

    # Must mirror the label encoding used at training time (home=0, draw=1, away=2).
    labels = ('home', 'draw', 'away')
    # A fitted scikit-learn classifier only emits columns for the classes it
    # saw, listed in ``classes_``; map through it instead of assuming that
    # column i is always class i. Models without ``classes_`` (the rule-based
    # fallback) are read positionally.
    classes = getattr(model, 'classes_', None)
    if classes is None:
        classes = range(len(probs))

    result = dict.fromkeys(labels, 0.0)
    for class_index, prob in zip(classes, probs):
        result[labels[int(class_index)]] = float(prob)
    return result
|
||
|
||
|
||
def calculate_model_edges(
    predicted: dict[str, float],
    implied: dict[str, float],
    *,
    strong_buy_threshold: float = 0.04,
) -> dict[str, dict[str, float | bool]]:
    """Compare model probabilities with bookmaker-implied ones and flag Strong Buys.

    Args:
        predicted: Model probability per outcome; missing outcomes count as 0.
        implied: Bookmaker-implied probability per outcome; missing outcomes
            count as 0.
        strong_buy_threshold: Minimum edge (model minus implied) for an
            outcome to be flagged as a Strong Buy.

    Returns:
        For each outcome in ``OUTCOMES``, a dict with ``model_prob``,
        ``implied_prob`` and ``edge`` (all rounded to 6 decimals) and the
        boolean ``strong_buy``.
    """
    edges: dict[str, dict[str, float | bool]] = {}
    for key in OUTCOMES:
        model_prob = float(predicted.get(key, 0))
        implied_prob = float(implied.get(key, 0))
        # Decide on the rounded edge so the flag always agrees with the value
        # reported: raw 0.44 - 0.40 is 0.03999999999999998, which previously
        # showed edge 0.04 but strong_buy False.
        edge = round(model_prob - implied_prob, 6)
        edges[key] = {
            'model_prob': round(model_prob, 6),
            'implied_prob': round(implied_prob, 6),
            'edge': edge,
            'strong_buy': edge >= strong_buy_threshold,
        }
    return edges
|
||
|