diff --git a/platform/backend/app/analytics/localization.py b/platform/backend/app/analytics/localization.py
index 3ab5fb2..df00bff 100644
--- a/platform/backend/app/analytics/localization.py
+++ b/platform/backend/app/analytics/localization.py
@@ -2,6 +2,9 @@
 
 from __future__ import annotations
 
+import functools
+import re
+import unicodedata
 from typing import Any
 
 TEAM_NAMES = {
@@ -138,11 +141,160 @@ def _clean(value: Any) -> str:
     return str(value or '').strip()
 
 
+# External feeds and bookmakers use several spellings for the same national team.
+# Keep this list intentionally broad so the public API never leaks mixed English names.
+TEAM_NAMES.update(
+    {
+        "South Korea": "南韓",
+        "Korea Republic": "南韓",
+        "Republic of Korea": "南韓",
+        "ROK": "南韓",
+        "南韓": "南韓",
+        "韓國": "南韓",
+        "Czechia": "捷克",
+        "Czech Republic": "捷克",
+        "捷克": "捷克",
+        "Bosnia and Herzegovina": "波士尼亞與赫塞哥維納",
+        "Bosnia & Herzegovina": "波士尼亞與赫塞哥維納",
+        "Bosnia-Herzegovina": "波士尼亞與赫塞哥維納",
+        "Bosnia": "波士尼亞與赫塞哥維納",
+        "Paraguay": "巴拉圭",
+        "Algeria": "阿爾及利亞",
+        "Morocco": "摩洛哥",
+        "Switzerland": "瑞士",
+        "Scotland": "蘇格蘭",
+        "Wales": "威爾斯",
+        "Poland": "波蘭",
+        "Denmark": "丹麥",
+        "Serbia": "塞爾維亞",
+        "Chile": "智利",
+        "Peru": "秘魯",
+        "Venezuela": "委內瑞拉",
+        "Bolivia": "玻利維亞",
+        "Costa Rica": "哥斯大黎加",
+        "Honduras": "宏都拉斯",
+        "Jamaica": "牙買加",
+        "Panama": "巴拿馬",
+        "El Salvador": "薩爾瓦多",
+        "Guatemala": "瓜地馬拉",
+        "Haiti": "海地",
+        "Trinidad and Tobago": "千里達及托巴哥",
+        "Trinidad & Tobago": "千里達及托巴哥",
+        "Curacao": "庫拉索",
+        "Curaçao": "庫拉索",
+        "Dominican Republic": "多明尼加共和國",
+        "New Zealand": "紐西蘭",
+        "Australia": "澳洲",
+        "Saudi Arabia": "沙烏地阿拉伯",
+        "Qatar": "卡達",
+        "United Arab Emirates": "阿拉伯聯合大公國",
+        "UAE": "阿拉伯聯合大公國",
+        "Iraq": "伊拉克",
+        "Jordan": "約旦",
+        "Oman": "阿曼",
+        "Uzbekistan": "烏茲別克",
+        "Iran": "伊朗",
+        "Japan": "日本",
+        "China PR": "中國",
+        "China": "中國",
+        "North Korea": "北韓",
+        "Cameroon": "喀麥隆",
+        "Ghana": "迦納",
+        "Ivory Coast": "象牙海岸",
+        "Cote d'Ivoire": "象牙海岸",
+        "Côte d’Ivoire": "象牙海岸",
+        "Senegal": "塞內加爾",
+        "Nigeria": "奈及利亞",
+        "Tunisia": "突尼西亞",
+        "Egypt": "埃及",
+        "Mali": "馬利",
+        "DR Congo": "剛果民主共和國",
+        "Congo DR": "剛果民主共和國",
+        "South Africa": "南非",
+    }
+)
+
+# Keys are written in the form produced by _normalize_lookup_key (lower-case,
+# single spaces); _lookup resolves each value as a key of TEAM_NAMES.
+TEAM_NAME_ALIASES = {
+    "usmnt": "United States",
+    "usa": "United States",
+    "u s a": "United States",
+    "united states of america": "United States",
+    "mex": "Mexico",
+    "can": "Canada",
+    "kor": "South Korea",
+    "korea republic": "South Korea",
+    "republic of korea": "South Korea",
+    "czechia": "Czechia",
+    "czech republic": "Czechia",
+    "bosnia and herzegovina": "Bosnia and Herzegovina",
+    "bosnia herzegovina": "Bosnia and Herzegovina",
+    "bosnia": "Bosnia and Herzegovina",
+    "paraguay": "Paraguay",
+}
+
+
+# Any run of characters that is not an ASCII letter/digit or a CJK ideograph
+# (U+4E00-U+9FFF) is treated as a single separator.
+_LOOKUP_SEPARATOR_RE = re.compile(r"[^0-9A-Za-z\u4e00-\u9fff]+")
+
+
+@functools.lru_cache(maxsize=2048)
+def _normalize_text(text: str) -> str:
+    """Return *text* folded to the comparison form used for fuzzy lookups.
+
+    Applies NFKD decomposition, drops combining marks, spells "&" as "and",
+    collapses every run of characters other than ASCII letters/digits and
+    CJK ideographs (U+4E00-U+9FFF) to one space, then strips and lower-cases.
+    """
+    text = unicodedata.normalize("NFKD", text)
+    text = "".join(ch for ch in text if not unicodedata.combining(ch))
+    text = text.replace("&", " and ")
+    # Whitespace is itself a separator, so one pass leaves only single spaces.
+    return _LOOKUP_SEPARATOR_RE.sub(" ", text).strip().lower()
+
+
+def _normalize_lookup_key(value: Any) -> str:
+    """Return the normalized lookup key for *value* (memoized per string)."""
+    return _normalize_text(_clean(value))
+
+
 def _lookup(value: Any, mapping: dict[str, str]) -> str:
-    text = _clean(value)
-    if not text:
-        return '待確認'
-    return mapping.get(text, mapping.get(text.lower(), text))
+    """Translate *value* through *mapping*, tolerating spelling variants.
+
+    Tries the exact key, the lower-cased key, TEAM_NAME_ALIASES (only when
+    *mapping* is TEAM_NAMES) and finally any key whose normalized form equals
+    the normalized input. Empty input yields '待確認'; an unmatched value is
+    returned stripped but otherwise unchanged.
+    """
+    text = _clean(value)
+    if not text:
+        return '待確認'
+
+    direct = mapping.get(text) or mapping.get(text.lower())
+    if direct:
+        return direct
+
+    normalized = _normalize_lookup_key(text)
+    if not normalized:
+        # Punctuation-only input must not fuzzy-match a key that also
+        # normalizes to the empty string.
+        return text
+
+    if mapping is TEAM_NAMES:
+        alias = TEAM_NAME_ALIASES.get(normalized)
+        if alias:
+            alias_value = mapping.get(alias) or mapping.get(alias.lower())
+            if alias_value:
+                return alias_value
+
+    # Key normalization is memoized, so this scan costs one cache hit per key.
+    for key, translated in mapping.items():
+        if _normalize_lookup_key(key) == normalized:
+            return translated
+
+    return text
 
 
 def localize_team_name(value: Any) -> str:
diff --git a/platform/backend/app/main.py b/platform/backend/app/main.py
index e2045f7..a42f688 100644
--- a/platform/backend/app/main.py
+++ b/platform/backend/app/main.py
@@ -1283,11 +1283,20 @@ async def _query_match_list(limit: int = 200) -> list[dict[str, Any]]:
             '_raw_status': str(status.value if hasattr(status, 'value') else status),
             '_result_synced_at': result_synced_at,
         }
-        key = (
-            home_label.strip().lower(),
-            away_label.strip().lower(),
-            kickoff_utc.isoformat() if hasattr(kickoff_utc, 'isoformat') else str(kickoff_utc),
+        team_pair_key = tuple(
+            sorted(
+                (
+                    " ".join(home_label.strip().lower().split()),
+                    " ".join(away_label.strip().lower().split()),
+                )
+            )
         )
+        kickoff_key = (
+            kickoff_utc.replace(minute=0, second=0, microsecond=0).isoformat()
+            if hasattr(kickoff_utc, 'isoformat') and hasattr(kickoff_utc, 'minute')
+            else str(kickoff_utc)[:13]
+        )
+        key = (*team_pair_key, kickoff_key)
         current = deduped.get(key)
         if current is None or quality_rank(payload) > quality_rank(current):
             deduped[key] = payload