"""Utilities for MOMO product URL normalization and fallback.""" import re from typing import Optional from urllib.parse import parse_qs, urlparse, urlunparse MOMO_BASE_DOMAINS = { 'www.momoshop.com.tw', 'm.momoshop.com.tw', } ERR404_PATH = '/ecm/js/err404/ec404.html' MOMO_ICODE_FALLBACK_MIN_LEN = 4 MOMO_ICODE_RE = re.compile(r'^[A-Za-z0-9_-]+$') def is_probable_momo_icode(i_code: Optional[object]) -> bool: """判斷值是否像是合理的 MOMO 商品代碼。""" cleaned = str(i_code or '').strip() if not cleaned: return False lowered = cleaned.lower() if lowered in {'nan', 'none', 'null', 'undefined'}: return False if lowered.startswith(('momo_', 'manual_', 'pchome_')): return False if len(cleaned) < MOMO_ICODE_FALLBACK_MIN_LEN: return False return bool(MOMO_ICODE_RE.fullmatch(cleaned)) def build_momo_product_url(i_code: Optional[object]) -> Optional[str]: """Build fallback MOMO product detail URL from i_code.""" if not is_probable_momo_icode(i_code): return None return f"https://www.momoshop.com.tw/goods/GoodsDetail.jsp?i_code={str(i_code).strip()}" def extract_momo_i_code(url: Optional[object]) -> Optional[str]: """從 URL 萃取 i_code。""" if not url: return None raw = str(url).strip() if not raw: return None # URL 格式:直接解析 try: normalized = raw if raw.startswith(('http://', 'https://')) else ( f'https:{raw}' if raw.startswith('//') else raw ) parsed = urlparse(normalized) if parsed.scheme in ('http', 'https'): query = parse_qs(parsed.query or '') i_code = (query.get('i_code') or [''])[0] if i_code: return i_code.strip() match = re.search(r'/goodsdetail/([^/?#]+)', parsed.path or '', re.I) if match: return match.group(1).strip() except Exception: pass # 備援匹配 match = re.search(r'[?&]i_code=([^&#]+)', raw, re.I) if match: return match.group(1).strip() return None def _normalize_quoted_url(url: str) -> str: """Normalize scheme-relative and path-relative URLs.""" cleaned = (url or '').strip() if cleaned.startswith('//'): return f'https:{cleaned}' if cleaned.startswith('/'): return f'https://www.momoshop.com.tw{cleaned}' return cleaned def is_valid_momo_product_url(url: str) -> bool: """Return whether URL looks like a valid MOMO product page.""" if not url: return False parsed = urlparse(url) if parsed.scheme not in ('http', 'https'): return False if (parsed.hostname or '').lower() not in MOMO_BASE_DOMAINS: return False path = (parsed.path or '').lower() if ERR404_PATH in path: return False # 商品頁通常會有 GoodsDetail.jsp 或 goodsDetail/xxx if 'goodsdetail' in path: if 'i_code' not in parse_qs(parsed.query or '') and not re.search(r'/goodsdetail/[^/]+', path): return False query = parse_qs(parsed.query or '') if 'i_code' in query: return True # /goodsDetail/ 不一定有 query return bool(re.search(r'/goodsdetail/[^/]+', path)) return False def normalize_momo_product_url(url: Optional[object], i_code: Optional[object]) -> Optional[str]: """ Normalize a MOMO URL and fall back to i_code product detail URL when invalid. Args: url: Original link. i_code: Product code for fallback URL. """ fallback_code = extract_momo_i_code(url) or (str(i_code).strip() if is_probable_momo_icode(i_code) else None) fallback = build_momo_product_url(fallback_code) if not url: return fallback normalized = _normalize_quoted_url(str(url).strip()) if not normalized: return fallback lower = normalized.lower() if lower.startswith('javascript:') or lower.startswith('void('): return fallback if is_valid_momo_product_url(normalized): return normalized # 兜底:若網址可解析且 host 仍是 MOMO,但不是預期路徑,仍可視為損壞資料 parsed = urlparse(normalized) if parsed.scheme in ('http', 'https'): return fallback return fallback