fix: normalize world cup match feed quality
This commit is contained in:
@@ -2,6 +2,8 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import unicodedata
|
||||
from typing import Any
|
||||
|
||||
TEAM_NAMES = {
|
||||
@@ -138,11 +140,128 @@ def _clean(value: Any) -> str:
|
||||
return str(value or '').strip()
|
||||
|
||||
|
||||
# External feeds and bookmakers spell the same national team in several ways.
# The table is deliberately broad so that every known spelling resolves to one
# Traditional Chinese display name and no English name leaks into the public API.
# Entries that map a Chinese name to itself keep already-localized input stable.
TEAM_NAMES.update(
    {
        "South Korea": "南韓",
        "Korea Republic": "南韓",
        "Republic of Korea": "南韓",
        "ROK": "南韓",
        "南韓": "南韓",
        "韓國": "南韓",
        "Czechia": "捷克",
        "Czech Republic": "捷克",
        "捷克": "捷克",
        "Bosnia and Herzegovina": "波士尼亞與赫塞哥維納",
        "Bosnia & Herzegovina": "波士尼亞與赫塞哥維納",
        "Bosnia-Herzegovina": "波士尼亞與赫塞哥維納",
        "Bosnia": "波士尼亞與赫塞哥維納",
        "Paraguay": "巴拉圭",
        "Algeria": "阿爾及利亞",
        "Morocco": "摩洛哥",
        "Switzerland": "瑞士",
        "Scotland": "蘇格蘭",
        "Wales": "威爾斯",
        "Poland": "波蘭",
        "Denmark": "丹麥",
        "Serbia": "塞爾維亞",
        "Chile": "智利",
        "Peru": "秘魯",
        "Venezuela": "委內瑞拉",
        "Bolivia": "玻利維亞",
        "Costa Rica": "哥斯大黎加",
        "Honduras": "宏都拉斯",
        "Jamaica": "牙買加",
        "Panama": "巴拿馬",
        "El Salvador": "薩爾瓦多",
        "Guatemala": "瓜地馬拉",
        "Haiti": "海地",
        "Trinidad and Tobago": "千里達及托巴哥",
        "Trinidad & Tobago": "千里達及托巴哥",
        "Curacao": "庫拉索",
        "Curaçao": "庫拉索",
        "Dominican Republic": "多明尼加共和國",
        "New Zealand": "紐西蘭",
        "Australia": "澳洲",
        "Saudi Arabia": "沙烏地阿拉伯",
        "Qatar": "卡達",
        "United Arab Emirates": "阿拉伯聯合大公國",
        "UAE": "阿拉伯聯合大公國",
        "Iraq": "伊拉克",
        "Jordan": "約旦",
        "Oman": "阿曼",
        "Uzbekistan": "烏茲別克",
        "Iran": "伊朗",
        "Japan": "日本",
        "China PR": "中國",
        "China": "中國",
        "North Korea": "北韓",
        "Cameroon": "喀麥隆",
        "Ghana": "迦納",
        "Ivory Coast": "象牙海岸",
        "Cote d'Ivoire": "象牙海岸",
        "Côte d’Ivoire": "象牙海岸",
        "Senegal": "塞內加爾",
        "Nigeria": "奈及利亞",
        "Tunisia": "突尼西亞",
        "Egypt": "埃及",
        "Mali": "馬利",
        "DR Congo": "剛果民主共和國",
        "Congo DR": "剛果民主共和國",
        "South Africa": "南非",
    }
)
|
||||
|
||||
# Short codes and alternate spellings that cannot be matched to a TEAM_NAMES
# key by normalization alone.
# Keys are written in the form _normalize_lookup_key() produces (lowercase,
# punctuation replaced by single spaces, e.g. "U.S.A." -> "u s a").
# Values are English team names that _lookup() resolves through TEAM_NAMES.
TEAM_NAME_ALIASES = {
    "usmnt": "United States",
    "usa": "United States",
    "u s a": "United States",
    "united states of america": "United States",
    "mex": "Mexico",
    "can": "Canada",
    "kor": "South Korea",
    "korea republic": "South Korea",
    "republic of korea": "South Korea",
    "czechia": "Czechia",
    "czech republic": "Czechia",
    "bosnia and herzegovina": "Bosnia and Herzegovina",
    "bosnia herzegovina": "Bosnia and Herzegovina",
    "bosnia": "Bosnia and Herzegovina",
    "paraguay": "Paraguay",
}
|
||||
|
||||
|
||||
def _normalize_lookup_key(value: Any) -> str:
    """Reduce a team label to a spelling-insensitive comparison key.

    The cleaned value is NFKD-decomposed and stripped of combining marks
    (so "Curaçao" compares equal to "Curacao"), "&" is spelled out as
    "and", every run of characters other than ASCII letters, digits and
    CJK ideographs in U+4E00-U+9FFF becomes a single space, and the result
    is trimmed and lowercased.

    Args:
        value: Any label; it is passed through ``_clean`` first.

    Returns:
        The normalized key, which may be an empty string.
    """
    text = unicodedata.normalize("NFKD", _clean(value))
    # Drop the accent marks that NFKD split off from their base letters.
    text = "".join(ch for ch in text if not unicodedata.combining(ch))
    # Expand "&" before punctuation is blanked so that "A & B" and
    # "A and B" produce the same key.
    text = text.replace("&", " and ")
    text = re.sub(r"[^0-9A-Za-z\u4e00-\u9fff]+", " ", text)
    return re.sub(r"\s+", " ", text).strip().lower()
|
||||
|
||||
|
||||
def _lookup(value: Any, mapping: dict[str, str]) -> str:
    """Translate *value* through *mapping*, tolerating spelling variations.

    Resolution order:
      1. exact key, then the lowercased key;
      2. for ``TEAM_NAMES`` only, a ``TEAM_NAME_ALIASES`` entry for the
         normalized key, resolved back through *mapping*;
      3. any mapping key whose normalized form equals the normalized input.

    Args:
        value: Raw label; it is passed through ``_clean`` first.
        mapping: Translation table to search.

    Returns:
        The translated string, ``'待確認'`` when the cleaned value is empty,
        or the cleaned value itself when nothing matches.
    """
    text = _clean(value)
    if not text:
        return '待確認'

    # No early "return mapping.get(...)" here: the previous body returned at
    # this point, which made every fallback below unreachable.
    direct = mapping.get(text) or mapping.get(text.lower())
    if direct:
        return direct

    normalized = _normalize_lookup_key(text)
    # Aliases apply only when the caller is translating team names.
    if mapping is TEAM_NAMES:
        alias = TEAM_NAME_ALIASES.get(normalized)
        if alias:
            alias_value = mapping.get(alias) or mapping.get(alias.lower())
            if alias_value:
                return alias_value

    # Last resort: compare normalized forms so differences in case, accents
    # or punctuation still find the entry.
    for key, translated in mapping.items():
        if _normalize_lookup_key(key) == normalized:
            return translated

    return text
|
||||
|
||||
|
||||
def localize_team_name(value: Any) -> str:
|
||||
|
||||
@@ -1283,11 +1283,20 @@ async def _query_match_list(limit: int = 200) -> list[dict[str, Any]]:
|
||||
'_raw_status': str(status.value if hasattr(status, 'value') else status),
|
||||
'_result_synced_at': result_synced_at,
|
||||
}
|
||||
key = (
|
||||
home_label.strip().lower(),
|
||||
away_label.strip().lower(),
|
||||
kickoff_utc.isoformat() if hasattr(kickoff_utc, 'isoformat') else str(kickoff_utc),
|
||||
team_pair_key = tuple(
|
||||
sorted(
|
||||
(
|
||||
" ".join(home_label.strip().lower().split()),
|
||||
" ".join(away_label.strip().lower().split()),
|
||||
)
|
||||
)
|
||||
)
|
||||
kickoff_key = (
|
||||
kickoff_utc.replace(minute=0, second=0, microsecond=0).isoformat()
|
||||
if hasattr(kickoff_utc, 'isoformat') and hasattr(kickoff_utc, 'minute')
|
||||
else str(kickoff_utc)[:13]
|
||||
)
|
||||
key = (*team_pair_key, kickoff_key)
|
||||
current = deduped.get(key)
|
||||
if current is None or quality_rank(payload) > quality_rank(current):
|
||||
deduped[key] = payload
|
||||
|
||||
Reference in New Issue
Block a user