"""Machine-learning match outcome prediction engine (ensemble).

The module turns rest-day, travel-distance and recent-xG inputs into three
model features, trains a gradient-boosting classifier on them when
scikit-learn and enough data are available, and otherwise falls back to a
small rule-based model. It also compares model probabilities against
bookmaker implied probabilities.
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Mapping
from uuid import uuid4

import numpy as np
import pandas as pd

try:
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.model_selection import train_test_split
except Exception:  # pragma: no cover - fallback when scikit-learn is missing
    GradientBoostingClassifier = None
    train_test_split = None

FEATURE_COLUMNS = ('rest_days_advantage', 'travel_distance_km', 'recent_5_xg_diff')
OUTCOMES = ('home', 'draw', 'away')

# (model feature, raw home field, raw away field); each model feature is the
# home value minus the away value.
_FEATURE_SOURCES = (
    ('rest_days_advantage', 'home_rest_days', 'away_rest_days'),
    ('travel_distance_km', 'home_travel_distance_km', 'away_travel_distance_km'),
    ('recent_5_xg_diff', 'recent_5_xg_home', 'recent_5_xg_away'),
)

# Below this many validated rows the rule-based fallback is used instead of
# fitting a classifier.
_MIN_ML_TRAINING_ROWS = 24


def _sigmoid(value: float) -> float:
    """Return the logistic function of *value*."""
    return 1.0 / (1.0 + np.exp(-value))


def _softmax(values: np.ndarray) -> np.ndarray:
    """Return the softmax of a 1-D array of scores."""
    # Subtracting the maximum keeps np.exp from overflowing; the result is
    # mathematically unchanged.
    shifted = values - np.max(values)
    exp_values = np.exp(shifted)
    return exp_values / exp_values.sum()


@dataclass(frozen=True)
class EnsembleModelArtifact:
    """A trained (or fallback) model together with its metadata.

    Attributes:
        model: Object exposing ``predict_proba(DataFrame)``.
        feature_columns: Column names the model is fed, in order.
        model_id: Identifier supplied by the caller or a random hex string.
        training_size: Number of validated rows passed to training.
        is_fallback: True when ``model`` is the rule-based fallback.
        training_accuracy: Accuracy measured on the held-out validation
            split; None for the fallback model or when the validation split
            contains a single class.
    """

    model: Any
    feature_columns: tuple[str, ...]
    model_id: str
    training_size: int
    is_fallback: bool
    training_accuracy: float | None = None


class _FallbackMatchModel:
    """Rule-based stand-in used when a classifier cannot be trained."""

    feature_columns = FEATURE_COLUMNS

    def predict_proba(self, row_df: pd.DataFrame) -> np.ndarray:
        """Return one (home, draw, away) probability row per input row.

        Args:
            row_df: Frame containing every column in ``feature_columns``.

        Returns:
            Array of shape ``(len(row_df), 3)``; each row sums to 1.
        """
        if row_df.empty:
            return np.zeros((0, 3))
        # A tuple key is treated by pandas as ONE column label, so the
        # column names must be passed as a list to select several columns.
        x = row_df[list(self.feature_columns)].to_numpy(float)
        raw_scores = []
        for rest_days_advantage, travel_distance_km, recent_5_xg_diff in x:
            home_score = (
                0.6
                + rest_days_advantage * 0.022
                + recent_5_xg_diff * 0.34
                - travel_distance_km * 0.0012
            )
            draw_score = (
                0.30
                - abs(rest_days_advantage) * 0.015
                - abs(recent_5_xg_diff) * 0.22
            )
            away_score = (
                0.1
                - rest_days_advantage * 0.022
                - recent_5_xg_diff * 0.34
                + travel_distance_km * 0.0012
            )
            scores = np.array(
                [
                    _sigmoid(home_score),
                    _sigmoid(draw_score) * 0.9,
                    _sigmoid(away_score),
                ],
                dtype=float,
            )
            raw_scores.append(_softmax(scores))
        return np.vstack(raw_scores)


def _as_float(value: Any, default: float = 0.0) -> float:
    """Convert *value* to a finite float, returning *default* otherwise.

    NaN and infinities are mapped to *default* as well, so missing numeric
    inputs are handled the same way as the ``fillna(0.0)`` applied to
    training frames.
    """
    try:
        result = float(value)
    except (TypeError, ValueError):
        return default
    return result if np.isfinite(result) else default


def normalize_feature_payload(payload: Mapping[str, Any]) -> dict[str, float]:
    """Extract the raw inputs and the three model features from *payload*.

    Each model feature is the home value minus the away value. When neither
    raw field of a pair is supplied but the payload already carries the
    model feature itself, that supplied value is used instead of silently
    collapsing it to 0.

    Args:
        payload: Mapping coming from the frontend or from database columns.

    Returns:
        Dict with the six raw fields followed by the three model features,
        all as finite floats.
    """
    raw_values: dict[str, float] = {}
    model_values: dict[str, float] = {}
    for feature_key, home_key, away_key in _FEATURE_SOURCES:
        home_value = _as_float(payload.get(home_key))
        away_value = _as_float(payload.get(away_key))
        raw_values[home_key] = home_value
        raw_values[away_key] = away_value

        has_raw = (
            payload.get(home_key) is not None
            or payload.get(away_key) is not None
        )
        supplied = payload.get(feature_key)
        if not has_raw and supplied is not None:
            model_values[feature_key] = _as_float(supplied)
        else:
            model_values[feature_key] = home_value - away_value
    return {**raw_values, **model_values}


def _validation_frame(rows: list[Mapping[str, Any]]) -> pd.DataFrame:
    """Build and validate the training frame.

    Raises:
        ValueError: Fewer than 5 rows, a required column is missing, or a
            ``match_result`` label is not one of ``OUTCOMES``.
    """
    if len(rows) < 5:
        raise ValueError('訓練樣本少於 5 筆,無法完成穩定訓練')

    frame = pd.DataFrame(rows)
    required_fields = set(FEATURE_COLUMNS) | {'match_result'}
    missing = required_fields - set(frame.columns)
    if missing:
        raise ValueError(f'訓練資料缺欄位:{sorted(missing)}')

    feature_names = list(FEATURE_COLUMNS)
    frame[feature_names] = frame[feature_names].astype(float).fillna(0.0)
    # astype(str) keeps the .str accessor usable for non-string label columns
    # and keeps the set below sortable for the error message.
    frame['match_result'] = (
        frame['match_result'].astype(str).str.lower().str.strip()
    )
    unknown = set(frame['match_result']) - set(OUTCOMES)
    if unknown:
        raise ValueError(f'未知賽果標籤:{sorted(unknown)},僅支援 {OUTCOMES}')
    return frame


def build_default_ml_training_rows() -> list[dict[str, float | str]]:
    """Build the built-in training samples.

    Used when external training data cannot be fetched at runtime. A fresh
    list of fresh dicts is returned on every call.
    """
    keys = (
        'home_rest_days',
        'away_rest_days',
        'home_travel_distance_km',
        'away_travel_distance_km',
        'recent_5_xg_home',
        'recent_5_xg_away',
        'rest_days_advantage',
        'travel_distance_km',
        'recent_5_xg_diff',
        'match_result',
    )
    table = (
        (4, 3, 520, 1100, 1.8, 1.0, 1, -580, 0.8, 'home'),
        (2, 5, 220, 780, 1.1, 1.7, -3, -560, -0.6, 'away'),
        (6, 4, 120, 960, 2.3, 1.8, 2, -840, 0.5, 'home'),
        (3, 3, 900, 900, 1.2, 1.3, 0, 0, -0.1, 'draw'),
        (8, 2, 350, 700, 2.0, 1.2, 6, -350, 0.8, 'home'),
        (1, 2, 1600, 2500, 1.4, 2.1, -1, -900, -0.7, 'away'),
        (5, 5, 700, 700, 1.9, 1.9, 0, 0, 0.0, 'draw'),
        (9, 3, 400, 300, 2.4, 1.1, 6, 100, 1.3, 'home'),
        (2, 7, 1800, 250, 1.0, 1.5, -5, 1550, -0.5, 'away'),
        (4, 4, 500, 500, 1.6, 1.4, 0, 0, 0.2, 'home'),
        (6, 1, 300, 1200, 2.8, 0.8, 5, -900, 2.0, 'home'),
        (2, 6, 1000, 200, 1.0, 2.6, -4, 800, -1.6, 'away'),
        (7, 7, 650, 650, 1.8, 1.8, 0, 0, 0.0, 'draw'),
        (3, 1, 260, 900, 2.1, 1.6, 2, -640, 0.5, 'home'),
        (0, 5, 1500, 150, 1.2, 2.0, -5, 1350, -0.8, 'away'),
        (5, 2, 300, 300, 2.2, 1.1, 3, 0, 1.1, 'home'),
        (4, 8, 450, 980, 1.5, 2.4, -4, -530, -0.9, 'away'),
    )
    return [dict(zip(keys, values)) for values in table]


def _normalize_training_row(row: Mapping[str, Any]) -> dict[str, float | str]:
    """Normalize one training row and attach its cleaned label.

    Raises:
        ValueError: The row has no ``match_result`` key.
    """
    normalized = normalize_feature_payload(row)
    if 'match_result' not in row:
        raise ValueError('訓練資料缺少 match_result')
    normalized['match_result'] = str(row['match_result']).strip().lower()
    return normalized


def train_match_outcome_ensemble(
    training_rows: list[Mapping[str, Any]],
    *,
    model_id: str | None = None,
) -> EnsembleModelArtifact:
    """Train the home/draw/away classifier, or return the rule-based fallback.

    The fallback is returned when scikit-learn is unavailable, when there are
    fewer than 24 validated rows, or when the labels cannot be split in a
    stratified way (fewer than two classes, or a class with a single row).

    Args:
        training_rows: Rows with the feature inputs and a ``match_result``.
        model_id: Optional identifier; a random hex string is used if falsy.

    Returns:
        The trained or fallback model wrapped in an ``EnsembleModelArtifact``.

    Raises:
        ValueError: The rows fail validation (too few rows, missing
            ``match_result``, unknown label).
    """
    normalized = [_normalize_training_row(row) for row in training_rows]
    frame = _validation_frame(normalized)
    resolved_id = model_id or uuid4().hex

    class_index = {label: index for index, label in enumerate(OUTCOMES)}
    x = frame[list(FEATURE_COLUMNS)]
    y = frame['match_result'].map(class_index)

    sklearn_ready = (
        GradientBoostingClassifier is not None and train_test_split is not None
    )
    # A stratified split needs at least two rows of every class present.
    class_counts = y.value_counts()
    splittable = len(class_counts) >= 2 and int(class_counts.min()) >= 2

    if len(frame) < _MIN_ML_TRAINING_ROWS or not sklearn_ready or not splittable:
        return EnsembleModelArtifact(
            model=_FallbackMatchModel(),
            feature_columns=FEATURE_COLUMNS,
            model_id=resolved_id,
            training_size=len(frame),
            is_fallback=True,
            training_accuracy=None,
        )

    x_train, x_val, y_train, y_val = train_test_split(
        x,
        y,
        # Validation share is clamped to the range 15%-30%.
        test_size=min(0.3, max(0.15, 1 - (30 / len(frame)))),
        random_state=17,
        stratify=y,
    )
    model = GradientBoostingClassifier(
        random_state=17,
        n_estimators=220,
        max_depth=3,
        learning_rate=0.06,
    )
    model.fit(x_train, y_train)
    accuracy = float(model.score(x_val, y_val)) if len(set(y_val)) > 1 else None
    return EnsembleModelArtifact(
        model=model,
        feature_columns=FEATURE_COLUMNS,
        model_id=resolved_id,
        training_size=len(frame),
        is_fallback=False,
        training_accuracy=accuracy,
    )


def build_default_ensemble_artifact() -> EnsembleModelArtifact:
    """Build the system default model from the built-in training rows."""
    return train_match_outcome_ensemble(
        build_default_ml_training_rows(),
        model_id='default',
    )


def _outcome_label(class_value: Any) -> str | None:
    """Map a classifier class value to an outcome name, or None if unknown.

    Accepts either an outcome name or an integer index into ``OUTCOMES``.
    """
    if isinstance(class_value, str):
        return class_value if class_value in OUTCOMES else None
    try:
        index = int(class_value)
    except (TypeError, ValueError):
        return None
    return OUTCOMES[index] if 0 <= index < len(OUTCOMES) else None


def model_predict_probabilities(
    artifact: EnsembleModelArtifact,
    features: Mapping[str, Any],
) -> dict[str, float]:
    """Return the home/draw/away probabilities for one match.

    Probability columns are aligned with the model's ``classes_`` attribute
    when it has one, so a classifier fitted without one of the outcomes
    reports 0.0 for it instead of mislabelling columns or failing. Models
    without ``classes_`` are assumed to emit columns in ``OUTCOMES`` order.

    Args:
        artifact: Model wrapper produced by the training functions.
        features: Raw feature payload for the match.

    Returns:
        Dict with the keys ``home``, ``draw`` and ``away``.
    """
    normalized = normalize_feature_payload(features)
    feature_frame = pd.DataFrame(
        [normalized],
        columns=list(artifact.feature_columns),
    )
    row = np.asarray(artifact.model.predict_proba(feature_frame), dtype=float)[0]

    classes = getattr(artifact.model, 'classes_', None)
    if classes is None:
        labels = list(OUTCOMES)
    else:
        labels = [_outcome_label(value) for value in classes]

    result = {outcome: 0.0 for outcome in OUTCOMES}
    for label, probability in zip(labels, row):
        if label is not None:
            result[label] = float(probability)
    return result


def calculate_model_edges(
    predicted: dict[str, float],
    implied: dict[str, float],
    *,
    strong_buy_threshold: float = 0.04,
) -> dict[str, dict[str, float | bool]]:
    """Compare model probabilities with bookmaker implied probabilities.

    Args:
        predicted: Model probability per outcome; missing outcomes count as 0.
        implied: Implied probability per outcome; missing outcomes count as 0.
        strong_buy_threshold: Minimum edge flagged as a strong buy.

    Returns:
        Per outcome: ``model_prob``, ``implied_prob`` and ``edge`` rounded to
        six decimals, plus the ``strong_buy`` flag.
    """
    edges: dict[str, dict[str, float | bool]] = {}
    for key in OUTCOMES:
        model_prob = float(predicted.get(key, 0))
        implied_prob = float(implied.get(key, 0))
        edge = round(model_prob - implied_prob, 6)
        edges[key] = {
            'model_prob': round(model_prob, 6),
            'implied_prob': round(implied_prob, 6),
            'edge': edge,
            # Compare the rounded edge so the flag always agrees with the
            # reported value (0.29 - 0.25 is 0.0399999... in binary floats).
            'strong_buy': edge >= strong_buy_threshold,
        }
    return edges