Files
ewoooc/utils/momo_url_utils.py
OoO 75de76ac12
Some checks failed
CD Pipeline / deploy (push) Has been cancelled
fix(momo): block EC404 auto-open with end-to-end URL guard
- normalize URLs at write time (scheduler crawlers, routes) to drop
  javascript:/EC404/placeholder i_code (momo_/manual_/pchome_)
- add global click+auxclick guard in base.html and ewoooc_base.html
  that intercepts blocked MOMO URLs and redirects to safe i_code URL
- per-page dashboards reuse the same isLikelyMomoIcode validation
- /api/track_momo_link records blocked events for diagnosis
- ship sanitize_momo_urls.py to clean existing polluted DB rows

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 12:00:34 +08:00

147 lines
4.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Utilities for MOMO product URL normalization and fallback."""
import re
from typing import Optional
from urllib.parse import parse_qs, urlparse, urlunparse
# Hostnames accepted as MOMO when validating product URLs
# (presumably the desktop and mobile storefronts -- confirm).
MOMO_BASE_DOMAINS = {
'www.momoshop.com.tw',
'm.momoshop.com.tw',
}
# Path of MOMO's EC404 error page; URLs whose lower-cased path contains it
# are rejected by is_valid_momo_product_url.
ERR404_PATH = '/ecm/js/err404/ec404.html'
# Codes shorter than this are rejected by is_probable_momo_icode.
MOMO_ICODE_FALLBACK_MIN_LEN = 4
# A product code may contain only ASCII letters, digits, underscore and hyphen.
MOMO_ICODE_RE = re.compile(r'^[A-Za-z0-9_-]+$')
def is_probable_momo_icode(i_code: Optional[object]) -> bool:
    """Return whether ``i_code`` looks like a plausible MOMO product code.

    The value is stringified and stripped first. It is rejected when it is
    empty, when it is a null-like sentinel (``nan``/``none``/``null``/
    ``undefined``, case-insensitive), when it starts with a ``momo_``,
    ``manual_`` or ``pchome_`` prefix, when it is shorter than
    ``MOMO_ICODE_FALLBACK_MIN_LEN``, or when it contains characters outside
    ``MOMO_ICODE_RE``.
    """
    candidate = str(i_code or '').strip()
    folded = candidate.lower()
    null_like = ('nan', 'none', 'null', 'undefined')
    rejected_prefixes = ('momo_', 'manual_', 'pchome_')

    # Cheap textual rejections first; the length and charset checks come last.
    if not candidate or folded in null_like or folded.startswith(rejected_prefixes):
        return False
    if len(candidate) < MOMO_ICODE_FALLBACK_MIN_LEN:
        return False
    return MOMO_ICODE_RE.fullmatch(candidate) is not None
def build_momo_product_url(i_code: Optional[object]) -> Optional[str]:
    """Build the MOMO product-detail URL for ``i_code``.

    Returns ``None`` when ``i_code`` does not pass
    ``is_probable_momo_icode``; otherwise returns the ``GoodsDetail.jsp``
    URL carrying the stripped code as its ``i_code`` query parameter.
    """
    if is_probable_momo_icode(i_code):
        code = str(i_code).strip()
        return f"https://www.momoshop.com.tw/goods/GoodsDetail.jsp?i_code={code}"
    return None
def extract_momo_i_code(url: Optional[object]) -> Optional[str]:
    """Extract the ``i_code`` product code embedded in a URL.

    Lookup order:

    1. For http(s) URLs (scheme-relative ``//host/...`` is treated as
       https): the ``i_code`` query parameter, then a
       ``/goodsDetail/<code>`` path segment (case-insensitive).
    2. For anything else, or when URL parsing fails: a case-insensitive
       regex search for ``?i_code=...`` / ``&i_code=...`` in the raw text.

    Args:
        url: Any value; it is stringified and stripped before inspection.

    Returns:
        The stripped code, or ``None`` when no non-empty code is found.
        The returned value is not validated; callers that need a plausible
        code should check it with ``is_probable_momo_icode``.
    """
    if not url:
        return None
    raw = str(url).strip()
    if not raw:
        return None

    # Give scheme-relative URLs a scheme so urlparse can split them properly.
    candidate = f'https:{raw}' if raw.startswith('//') else raw
    try:
        parsed = urlparse(candidate)
        if parsed.scheme in ('http', 'https'):
            query_code = (parse_qs(parsed.query or '').get('i_code') or [''])[0]
            if query_code:
                # A whitespace-only value must not leak out as ''.
                return query_code.strip() or None
            path_match = re.search(r'/goodsdetail/([^/?#]+)', parsed.path or '', re.I)
            if path_match:
                return path_match.group(1).strip() or None
    except ValueError:
        # urlparse/parse_qs reject some malformed inputs (e.g. an unterminated
        # IPv6 bracket). Extraction is best-effort, so fall through to the
        # plain regex search instead of failing.
        pass

    # Fallback: plain-text match for relative or otherwise unparsable URLs.
    fallback_match = re.search(r'[?&]i_code=([^&#]+)', raw, re.I)
    if fallback_match:
        return fallback_match.group(1).strip() or None
    return None
def _normalize_quoted_url(url: str) -> str:
"""Normalize scheme-relative and path-relative URLs."""
cleaned = (url or '').strip()
if cleaned.startswith('//'):
return f'https:{cleaned}'
if cleaned.startswith('/'):
return f'https://www.momoshop.com.tw{cleaned}'
return cleaned
def is_valid_momo_product_url(url: str) -> bool:
    """Return whether ``url`` looks like a valid MOMO product page.

    A URL is accepted only when all of the following hold:

    * it parses and uses the ``http`` or ``https`` scheme;
    * its hostname is one of ``MOMO_BASE_DOMAINS``;
    * its path does not contain the EC404 error page (``ERR404_PATH``);
    * its path contains ``goodsdetail`` (case-insensitive) and the product
      is identified either by a non-blank ``i_code`` query parameter or by
      a ``/goodsDetail/<code>`` path segment.

    Malformed URLs that ``urlparse`` refuses are reported as invalid
    instead of raising.
    """
    if not url:
        return False
    try:
        parsed = urlparse(url)
    except ValueError:
        # e.g. "http://[broken" -> "Invalid IPv6 URL"; a validator should
        # answer False for such input, not crash the caller.
        return False
    if parsed.scheme not in ('http', 'https'):
        return False
    if (parsed.hostname or '').lower() not in MOMO_BASE_DOMAINS:
        return False

    path = (parsed.path or '').lower()
    if ERR404_PATH in path or 'goodsdetail' not in path:
        return False

    # Product pages are either GoodsDetail.jsp?i_code=<code> or
    # /goodsDetail/<code>; the latter does not necessarily carry a query.
    if 'i_code' in parse_qs(parsed.query or ''):
        return True
    return bool(re.search(r'/goodsdetail/[^/]+', path))
def normalize_momo_product_url(url: Optional[object], i_code: Optional[object]) -> Optional[str]:
    """Normalize a MOMO URL, falling back to an ``i_code`` detail URL.

    The fallback URL is built from the first plausible product code among
    the code embedded in ``url`` and the ``i_code`` argument. An implausible
    code embedded in ``url`` (for example ``undefined`` or a ``momo_``
    placeholder) no longer hides a valid ``i_code`` argument.

    Args:
        url: Original link; may be empty, scheme-relative, root-relative,
            a ``javascript:`` pseudo-link, or otherwise broken.
        i_code: Product code used for the fallback URL.

    Returns:
        The normalized ``url`` when it is a valid MOMO product URL;
        otherwise the fallback product URL, or ``None`` when no plausible
        product code is available.
    """
    embedded_code = extract_momo_i_code(url)
    if is_probable_momo_icode(embedded_code):
        fallback_code = embedded_code
    elif is_probable_momo_icode(i_code):
        fallback_code = str(i_code).strip()
    else:
        fallback_code = None
    fallback = build_momo_product_url(fallback_code)

    if not url:
        return fallback
    normalized = _normalize_quoted_url(str(url).strip())
    if not normalized:
        return fallback

    # Script pseudo-links can never be a product page.
    if normalized.lower().startswith(('javascript:', 'void(')):
        return fallback

    try:
        if is_valid_momo_product_url(normalized):
            return normalized
    except ValueError:
        # A URL that cannot even be parsed is treated like any other
        # damaged link: use the fallback.
        pass

    # Anything else (wrong host, unexpected path, EC404 page, non-http
    # scheme) is considered damaged data.
    return fallback