"""市場情報 HTML 診斷解析工具。 只萃取頁面標題、連結候選與內容指紋,不建立正式 campaign/product。 """ import hashlib import re from dataclasses import asdict, dataclass from html.parser import HTMLParser from urllib.parse import urljoin, urlparse @dataclass(frozen=True) class LinkCandidate: """活動頁中可疑連結候選。""" href: str text: str is_same_host: bool score: int generic_score: int platform_score: int confidence_band: str confidence_reason: str def to_dict(self): return asdict(self) @dataclass(frozen=True) class HtmlDiagnostics: """HTML 診斷摘要。""" content_length: int page_hash: str title: str link_count: int same_host_link_count: int campaign_link_candidates: list def to_dict(self): return asdict(self) class _DiagnosticHtmlParser(HTMLParser): def __init__(self): super().__init__(convert_charrefs=True) self.title_parts = [] self.links = [] self._in_title = False self._active_href = None self._active_text = [] def handle_starttag(self, tag, attrs): attrs_map = dict(attrs or []) if tag.lower() == "title": self._in_title = True if tag.lower() == "a": self._active_href = attrs_map.get("href") self._active_text = [] def handle_data(self, data): if self._in_title: self.title_parts.append(data) if self._active_href is not None: self._active_text.append(data) def handle_endtag(self, tag): tag = tag.lower() if tag == "title": self._in_title = False if tag == "a" and self._active_href is not None: self.links.append((self._active_href, " ".join(self._active_text))) self._active_href = None self._active_text = [] def _clean_text(value, limit=160): text = re.sub(r"\s+", " ", value or "").strip() return text[:limit] def _score_link(href, text): haystack = f"{href} {text}".lower() score = 0 for keyword in ("edm", "event", "promo", "campaign", "sale", "activity"): if keyword in haystack: score += 2 for keyword in ("活動", "優惠", "折扣", "檔期", "品牌日", "限時", "促銷"): if keyword in haystack: score += 3 return score def _confidence_band(score, *, is_same_host, platform_score, generic_score): """把診斷分數轉成人工審核用信心帶。""" reasons = [] reasons.append("same_host" if is_same_host else "external_host") if platform_score > 0: reasons.append(f"platform_score={platform_score}") if generic_score > 0: reasons.append(f"generic_score={generic_score}") if score >= 12 and is_same_host: return "high", ", ".join(reasons) if score >= 6: return "medium", ", ".join(reasons) return "low", ", ".join(reasons) def parse_html_diagnostics(html, base_url="", candidate_limit=12, score_link=None): """解析 HTML 診斷資訊,不做商業資料入庫。""" html = html or "" parser = _DiagnosticHtmlParser() parser.feed(html) base_host = urlparse(base_url or "").netloc candidates = [] same_host_count = 0 for raw_href, raw_text in parser.links: if not raw_href: continue href = urljoin(base_url, raw_href) parsed = urlparse(href) if parsed.scheme not in {"http", "https"}: continue is_same_host = bool(base_host and parsed.netloc == base_host) if is_same_host: same_host_count += 1 text = _clean_text(raw_text) generic_score = _score_link(href, text) platform_score = int(score_link(href, text)) if score_link else 0 score = generic_score + platform_score if score <= 0: continue confidence_band, confidence_reason = _confidence_band( score, is_same_host=is_same_host, platform_score=platform_score, generic_score=generic_score, ) candidates.append( LinkCandidate( href=href, text=text, is_same_host=is_same_host, score=score, generic_score=generic_score, platform_score=platform_score, confidence_band=confidence_band, confidence_reason=confidence_reason, ) ) band_rank = {"high": 3, "medium": 2, "low": 1} candidates = sorted( candidates, key=lambda item: (band_rank.get(item.confidence_band, 0), item.score), reverse=True, )[:candidate_limit] page_hash = hashlib.sha256(html.encode("utf-8", errors="ignore")).hexdigest() if html else "" title = _clean_text(" ".join(parser.title_parts), limit=200) return HtmlDiagnostics( content_length=len(html), page_hash=page_hash, title=title, link_count=len(parser.links), same_host_link_count=same_host_count, campaign_link_candidates=[candidate.to_dict() for candidate in candidates], )