Files
ewoooc/services/market_intel/html_diagnostics.py

173 lines
5.1 KiB
Python

"""市場情報 HTML 診斷解析工具。
只萃取頁面標題、連結候選與內容指紋,不建立正式 campaign/product。
"""
import hashlib
import re
from dataclasses import asdict, dataclass
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse
@dataclass(frozen=True)
class LinkCandidate:
    """A link from a campaign page that scored as a possible campaign link.

    Instances are immutable; use :meth:`to_dict` to get a plain-dict copy.
    """

    # URL of the link.
    href: str
    # Anchor text of the link.
    text: str
    # Whether the link's host equals the host of the page being analysed.
    is_same_host: bool
    # Combined score (generic + platform).
    score: int
    # Score contributed by the built-in keyword heuristics.
    generic_score: int
    # Score contributed by an optional platform-specific scorer.
    platform_score: int
    # Review band label derived from the score.
    confidence_band: str
    # Human-readable explanation of how the band was reached.
    confidence_reason: str

    def to_dict(self) -> dict:
        """Return all fields as a plain ``dict`` keyed by field name."""
        as_mapping = asdict(self)
        return as_mapping
@dataclass(frozen=True)
class HtmlDiagnostics:
    """Immutable summary of one HTML page produced for diagnostics."""

    # Length of the HTML text that was analysed.
    content_length: int
    # Fingerprint of the page content.
    page_hash: str
    # Page title text.
    title: str
    # Number of links found on the page.
    link_count: int
    # Number of links whose host matches the page's own host.
    same_host_link_count: int
    # Candidate campaign links.
    campaign_link_candidates: list

    def to_dict(self) -> dict:
        """Return all fields as a plain ``dict`` (containers are copied)."""
        as_mapping = asdict(self)
        return as_mapping
class _DiagnosticHtmlParser(HTMLParser):
def __init__(self):
super().__init__(convert_charrefs=True)
self.title_parts = []
self.links = []
self._in_title = False
self._active_href = None
self._active_text = []
def handle_starttag(self, tag, attrs):
attrs_map = dict(attrs or [])
if tag.lower() == "title":
self._in_title = True
if tag.lower() == "a":
self._active_href = attrs_map.get("href")
self._active_text = []
def handle_data(self, data):
if self._in_title:
self.title_parts.append(data)
if self._active_href is not None:
self._active_text.append(data)
def handle_endtag(self, tag):
tag = tag.lower()
if tag == "title":
self._in_title = False
if tag == "a" and self._active_href is not None:
self.links.append((self._active_href, " ".join(self._active_text)))
self._active_href = None
self._active_text = []
def _clean_text(value, limit=160):
text = re.sub(r"\s+", " ", value or "").strip()
return text[:limit]
def _score_link(href, text):
haystack = f"{href} {text}".lower()
score = 0
for keyword in ("edm", "event", "promo", "campaign", "sale", "activity"):
if keyword in haystack:
score += 2
for keyword in ("活動", "優惠", "折扣", "檔期", "品牌日", "限時", "促銷"):
if keyword in haystack:
score += 3
return score
def _confidence_band(score, *, is_same_host, platform_score, generic_score):
"""把診斷分數轉成人工審核用信心帶。"""
reasons = []
reasons.append("same_host" if is_same_host else "external_host")
if platform_score > 0:
reasons.append(f"platform_score={platform_score}")
if generic_score > 0:
reasons.append(f"generic_score={generic_score}")
if score >= 12 and is_same_host:
return "high", ", ".join(reasons)
if score >= 6:
return "medium", ", ".join(reasons)
return "low", ", ".join(reasons)
def parse_html_diagnostics(html, base_url="", candidate_limit=12, score_link=None):
    """Parse HTML into a diagnostic summary; nothing is persisted.

    Args:
        html: HTML text of the page. ``None`` or ``""`` is treated as an
            empty document.
        base_url: URL of the page, used to resolve relative hrefs and to
            decide whether a link stays on the same host.
        candidate_limit: Maximum number of candidate links to return.
        score_link: Optional callable ``(href, text) -> number`` giving a
            platform-specific score that is added to the generic keyword
            score.

    Returns:
        HtmlDiagnostics: content length, SHA-256 page hash (empty string for
        an empty document), cleaned title, link counts, and the
        top-ranked candidate links as plain dicts.
    """
    html = html or ""
    parser = _DiagnosticHtmlParser()
    parser.feed(html)
    # feed() may hold back trailing text or an incomplete construct until it
    # is told the input has ended; close() forces that data to be processed.
    parser.close()

    # Host names are case-insensitive, so compare them lower-cased.
    base_host = urlparse(base_url or "").netloc.lower()
    candidates = []
    same_host_count = 0
    for raw_href, raw_text in parser.links:
        # href values may be padded with whitespace in real-world markup.
        raw_href = (raw_href or "").strip()
        if not raw_href:
            continue
        try:
            href = urljoin(base_url or "", raw_href)
            parsed = urlparse(href)
        except ValueError:
            # Malformed URL (e.g. unmatched "[" in the host). Skip this one
            # link rather than abort the diagnostics for the whole page.
            continue
        if parsed.scheme not in {"http", "https"}:
            continue
        is_same_host = bool(base_host and parsed.netloc.lower() == base_host)
        if is_same_host:
            same_host_count += 1
        text = _clean_text(raw_text)
        generic_score = _score_link(href, text)
        platform_score = int(score_link(href, text)) if score_link else 0
        score = generic_score + platform_score
        if score <= 0:
            continue
        confidence_band, confidence_reason = _confidence_band(
            score,
            is_same_host=is_same_host,
            platform_score=platform_score,
            generic_score=generic_score,
        )
        candidates.append(
            LinkCandidate(
                href=href,
                text=text,
                is_same_host=is_same_host,
                score=score,
                generic_score=generic_score,
                platform_score=platform_score,
                confidence_band=confidence_band,
                confidence_reason=confidence_reason,
            )
        )

    # Rank by band first, then by score; the sort is stable, so ties keep
    # their document order.
    band_rank = {"high": 3, "medium": 2, "low": 1}
    candidates = sorted(
        candidates,
        key=lambda item: (band_rank.get(item.confidence_band, 0), item.score),
        reverse=True,
    )[:candidate_limit]

    page_hash = hashlib.sha256(html.encode("utf-8", errors="ignore")).hexdigest() if html else ""
    title = _clean_text(" ".join(parser.title_parts), limit=200)
    return HtmlDiagnostics(
        content_length=len(html),
        page_hash=page_hash,
        title=title,
        link_count=len(parser.links),
        same_host_link_count=same_host_count,
        campaign_link_candidates=[candidate.to_dict() for candidate in candidates],
    )