fix: normalize world cup match feed quality
All checks were successful
2026 World Cup Quant Platform - Production Deployment / Code Quality, Security Gate & Testing (push) Successful in 4m32s
2026 World Cup Quant Platform - Production Deployment / Deploy to Production VM via Gitea CD (push) Successful in 1m15s

This commit is contained in:
wooo
2026-06-18 13:49:33 +08:00
parent c94abe616d
commit 3994996ea0
2 changed files with 136 additions and 8 deletions

View File

@@ -2,6 +2,8 @@
from __future__ import annotations
import re
import unicodedata
from typing import Any
TEAM_NAMES = {
@@ -138,11 +140,128 @@ def _clean(value: Any) -> str:
return str(value or '').strip()
# External feeds and bookmakers use several spellings for the same national
# team. Keep this list intentionally broad so the public API never leaks mixed
# English names. Every spelling variant of a team maps to the same Traditional
# Chinese label; already-localized names map to themselves so they pass
# through unchanged.
TEAM_NAMES.update(
    {
        "South Korea": "南韓",
        "Korea Republic": "南韓",
        "Republic of Korea": "南韓",
        "ROK": "南韓",
        "南韓": "南韓",
        "韓國": "南韓",
        "Czechia": "捷克",
        "Czech Republic": "捷克",
        "捷克": "捷克",
        "Bosnia and Herzegovina": "波士尼亞與赫塞哥維納",
        "Bosnia & Herzegovina": "波士尼亞與赫塞哥維納",
        "Bosnia-Herzegovina": "波士尼亞與赫塞哥維納",
        "Bosnia": "波士尼亞與赫塞哥維納",
        "Paraguay": "巴拉圭",
        "Algeria": "阿爾及利亞",
        "Morocco": "摩洛哥",
        "Switzerland": "瑞士",
        "Scotland": "蘇格蘭",
        "Wales": "威爾斯",
        "Poland": "波蘭",
        "Denmark": "丹麥",
        "Serbia": "塞爾維亞",
        "Chile": "智利",
        "Peru": "秘魯",
        "Venezuela": "委內瑞拉",
        "Bolivia": "玻利維亞",
        "Costa Rica": "哥斯大黎加",
        "Honduras": "宏都拉斯",
        "Jamaica": "牙買加",
        "Panama": "巴拿馬",
        "El Salvador": "薩爾瓦多",
        "Guatemala": "瓜地馬拉",
        "Haiti": "海地",
        "Trinidad and Tobago": "千里達及托巴哥",
        "Trinidad & Tobago": "千里達及托巴哥",
        "Curacao": "庫拉索",
        "Curaçao": "庫拉索",
        "Dominican Republic": "多明尼加共和國",
        "New Zealand": "紐西蘭",
        "Australia": "澳洲",
        "Saudi Arabia": "沙烏地阿拉伯",
        "Qatar": "卡達",
        "United Arab Emirates": "阿拉伯聯合大公國",
        "UAE": "阿拉伯聯合大公國",
        "Iraq": "伊拉克",
        "Jordan": "約旦",
        "Oman": "阿曼",
        "Uzbekistan": "烏茲別克",
        "Iran": "伊朗",
        "Japan": "日本",
        "China PR": "中國",
        "China": "中國",
        "North Korea": "北韓",
        "Cameroon": "喀麥隆",
        "Ghana": "迦納",
        "Ivory Coast": "象牙海岸",
        "Cote d'Ivoire": "象牙海岸",
        # Correct accented spelling, so the most common official form is an
        # exact dictionary hit.
        "Côte d'Ivoire": "象牙海岸",
        # Apostrophe-less variant kept for feeds that drop the punctuation.
        "Côte dIvoire": "象牙海岸",
        "Senegal": "塞內加爾",
        "Nigeria": "奈及利亞",
        "Tunisia": "突尼西亞",
        "Egypt": "埃及",
        "Mali": "馬利",
        "DR Congo": "剛果民主共和國",
        "Congo DR": "剛果民主共和國",
        "South Africa": "南非",
    }
)
# Maps an already-normalized spelling (lowercase, punctuation collapsed to
# single spaces, "&" spelled out as "and") to the canonical English team name.
# Declared per team so all accepted spellings of one team sit side by side.
TEAM_NAME_ALIASES = {
    alias: canonical
    for canonical, aliases in (
        ("United States", ("usmnt", "usa", "u s a", "united states of america")),
        ("Mexico", ("mex",)),
        ("Canada", ("can",)),
        ("South Korea", ("kor", "korea republic", "republic of korea")),
        ("Czechia", ("czechia", "czech republic")),
        (
            "Bosnia and Herzegovina",
            ("bosnia and herzegovina", "bosnia herzegovina", "bosnia"),
        ),
        ("Paraguay", ("paraguay",)),
    )
    for alias in aliases
}
def _normalize_lookup_key(value: Any) -> str:
    """Reduce *value* to a spelling-insensitive key for dictionary matching.

    The cleaned text is NFKD-decomposed and stripped of combining marks (so
    "ç" becomes "c"), "&" is spelled out as "and", every run of characters
    other than ASCII letters, digits or CJK ideographs in U+4E00-U+9FFF
    becomes a single space, and the result is trimmed and lowercased.
    """
    decomposed = unicodedata.normalize("NFKD", _clean(value))
    # Accents survive decomposition as separate combining code points; drop them.
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    spelled_out = "".join(base_chars).replace("&", " and ")
    # Punctuation and symbols act as word separators.
    separated = re.sub(r"[^0-9A-Za-z\u4e00-\u9fff]+", " ", spelled_out)
    collapsed = re.sub(r"\s+", " ", separated)
    return collapsed.strip().lower()
def _lookup(value: Any, mapping: dict[str, str]) -> str:
    """Translate *value* through *mapping*, tolerating spelling variants.

    Resolution order:

    1. Exact key, then the lowercased key.
    2. For ``TEAM_NAMES`` only: the normalized spelling is looked up in
       ``TEAM_NAME_ALIASES`` and the resulting canonical name is translated.
    3. Any key of *mapping* whose normalized form equals the normalized input.

    Args:
        value: Raw label from a feed; it is cleaned with ``_clean`` first.
        mapping: Source label -> localized label dictionary.

    Returns:
        The localized label, ``'待確認'`` when the cleaned input is empty, or
        the cleaned input itself when nothing matches.
    """
    text = _clean(value)
    if not text:
        return '待確認'

    # The stale pre-change early return that used to sit here made all of the
    # matching below unreachable; it has been removed.
    direct = mapping.get(text) or mapping.get(text.lower())
    if direct:
        return direct

    normalized = _normalize_lookup_key(text)

    # The alias table holds team spellings only, so it is consulted solely
    # when the caller is translating team names.
    if mapping is TEAM_NAMES:
        alias = TEAM_NAME_ALIASES.get(normalized)
        if alias:
            alias_value = mapping.get(alias) or mapping.get(alias.lower())
            if alias_value:
                return alias_value

    # Last resort: compare normalized forms so accents, punctuation and
    # spacing differences do not prevent a match.
    for key, translated in mapping.items():
        if _normalize_lookup_key(key) == normalized:
            return translated
    return text
def localize_team_name(value: Any) -> str:

View File

@@ -1283,11 +1283,20 @@ async def _query_match_list(limit: int = 200) -> list[dict[str, Any]]:
'_raw_status': str(status.value if hasattr(status, 'value') else status),
'_result_synced_at': result_synced_at,
}
key = (
home_label.strip().lower(),
away_label.strip().lower(),
kickoff_utc.isoformat() if hasattr(kickoff_utc, 'isoformat') else str(kickoff_utc),
team_pair_key = tuple(
sorted(
(
" ".join(home_label.strip().lower().split()),
" ".join(away_label.strip().lower().split()),
)
)
)
kickoff_key = (
kickoff_utc.replace(minute=0, second=0, microsecond=0).isoformat()
if hasattr(kickoff_utc, 'isoformat') and hasattr(kickoff_utc, 'minute')
else str(kickoff_utc)[:13]
)
key = (*team_pair_key, kickoff_key)
current = deduped.get(key)
if current is None or quality_rank(payload) > quality_rank(current):
deduped[key] = payload