173 lines
5.1 KiB
Python
173 lines
5.1 KiB
Python
"""市場情報 HTML 診斷解析工具。
|
|
|
|
只萃取頁面標題、連結候選與內容指紋,不建立正式 campaign/product。
|
|
"""
|
|
|
|
import hashlib
|
|
import re
|
|
from dataclasses import asdict, dataclass
|
|
from html.parser import HTMLParser
|
|
from urllib.parse import urljoin, urlparse
|
|
|
|
|
|
@dataclass(frozen=True)
class LinkCandidate:
    """A link on a campaign page that looks like it may point to a promotion.

    Instances are immutable; use :meth:`to_dict` to obtain a plain-dict copy.
    """

    # Link target as stored by the code that builds the candidate.
    href: str
    # Anchor text associated with the link.
    text: str
    # Whether the link was judged to be on the same host as the page.
    is_same_host: bool
    # Overall score for the link.
    score: int
    # Score component from the generic keyword heuristic.
    generic_score: int
    # Score component from a platform-specific scorer.
    platform_score: int
    # Confidence label for manual review.
    confidence_band: str
    # Human-readable explanation of how the band was reached.
    confidence_reason: str

    def to_dict(self) -> dict:
        """Return the candidate's fields as a plain dictionary."""
        as_mapping = asdict(self)
        return as_mapping
|
|
|
|
|
|
@dataclass(frozen=True)
class HtmlDiagnostics:
    """Summary of the diagnostic facts extracted from one HTML page.

    Instances are immutable; use :meth:`to_dict` to obtain a plain-dict copy.
    """

    # Length of the HTML input.
    content_length: int
    # Fingerprint of the page content.
    page_hash: str
    # Page title text.
    title: str
    # Number of links found on the page.
    link_count: int
    # Number of links judged to be on the same host as the page.
    same_host_link_count: int
    # Ranked campaign link candidates.
    campaign_link_candidates: list

    def to_dict(self) -> dict:
        """Return the summary's fields as a plain dictionary."""
        as_mapping = asdict(self)
        return as_mapping
|
|
|
|
|
|
class _DiagnosticHtmlParser(HTMLParser):
|
|
def __init__(self):
|
|
super().__init__(convert_charrefs=True)
|
|
self.title_parts = []
|
|
self.links = []
|
|
self._in_title = False
|
|
self._active_href = None
|
|
self._active_text = []
|
|
|
|
def handle_starttag(self, tag, attrs):
|
|
attrs_map = dict(attrs or [])
|
|
if tag.lower() == "title":
|
|
self._in_title = True
|
|
if tag.lower() == "a":
|
|
self._active_href = attrs_map.get("href")
|
|
self._active_text = []
|
|
|
|
def handle_data(self, data):
|
|
if self._in_title:
|
|
self.title_parts.append(data)
|
|
if self._active_href is not None:
|
|
self._active_text.append(data)
|
|
|
|
def handle_endtag(self, tag):
|
|
tag = tag.lower()
|
|
if tag == "title":
|
|
self._in_title = False
|
|
if tag == "a" and self._active_href is not None:
|
|
self.links.append((self._active_href, " ".join(self._active_text)))
|
|
self._active_href = None
|
|
self._active_text = []
|
|
|
|
|
|
def _clean_text(value, limit=160):
|
|
text = re.sub(r"\s+", " ", value or "").strip()
|
|
return text[:limit]
|
|
|
|
|
|
def _score_link(href, text):
|
|
haystack = f"{href} {text}".lower()
|
|
score = 0
|
|
for keyword in ("edm", "event", "promo", "campaign", "sale", "activity"):
|
|
if keyword in haystack:
|
|
score += 2
|
|
for keyword in ("活動", "優惠", "折扣", "檔期", "品牌日", "限時", "促銷"):
|
|
if keyword in haystack:
|
|
score += 3
|
|
return score
|
|
|
|
|
|
def _confidence_band(score, *, is_same_host, platform_score, generic_score):
|
|
"""把診斷分數轉成人工審核用信心帶。"""
|
|
reasons = []
|
|
reasons.append("same_host" if is_same_host else "external_host")
|
|
if platform_score > 0:
|
|
reasons.append(f"platform_score={platform_score}")
|
|
if generic_score > 0:
|
|
reasons.append(f"generic_score={generic_score}")
|
|
|
|
if score >= 12 and is_same_host:
|
|
return "high", ", ".join(reasons)
|
|
if score >= 6:
|
|
return "medium", ", ".join(reasons)
|
|
return "low", ", ".join(reasons)
|
|
|
|
|
|
def parse_html_diagnostics(html, base_url="", candidate_limit=12, score_link=None):
    """Parse diagnostic information from HTML without persisting business data.

    Args:
        html: HTML text to inspect; a falsy value is treated as empty.
        base_url: URL of the page, used to resolve relative hrefs and to
            decide whether a link is on the same host.
        candidate_limit: Maximum number of candidates returned. ``None``
            returns all of them; negative values are treated as 0.
        score_link: Optional ``callable(href, text)`` returning a
            platform-specific score that is added to the generic score.

    Returns:
        An ``HtmlDiagnostics`` holding the content length, a SHA-256 page
        hash, the title, link counts and the ranked candidates as dicts.

    Raises:
        ValueError: If ``base_url`` itself cannot be parsed.
    """
    html = html or ""
    parser = _DiagnosticHtmlParser()
    parser.feed(html)
    # feed() may hold back a trailing partial entity or tag; close() makes
    # the parser process whatever is still buffered.
    parser.close()

    base_url = base_url or ""
    # Host names are case-insensitive, so compare them lowercased.
    base_host = urlparse(base_url).netloc.lower()
    candidates = []
    same_host_count = 0

    for raw_href, raw_text in parser.links:
        if not raw_href:
            continue
        try:
            href = urljoin(base_url, raw_href)
            parsed = urlparse(href)
        except ValueError:
            # Malformed hrefs (e.g. an unterminated IPv6 literal) come from
            # untrusted HTML; skip the link instead of aborting the page.
            continue
        if parsed.scheme not in {"http", "https"}:
            continue
        is_same_host = bool(base_host and parsed.netloc.lower() == base_host)
        if is_same_host:
            same_host_count += 1
        text = _clean_text(raw_text)
        generic_score = _score_link(href, text)
        platform_score = int(score_link(href, text)) if score_link else 0
        score = generic_score + platform_score
        if score <= 0:
            continue
        confidence_band, confidence_reason = _confidence_band(
            score,
            is_same_host=is_same_host,
            platform_score=platform_score,
            generic_score=generic_score,
        )
        candidates.append(
            LinkCandidate(
                href=href,
                text=text,
                is_same_host=is_same_host,
                score=score,
                generic_score=generic_score,
                platform_score=platform_score,
                confidence_band=confidence_band,
                confidence_reason=confidence_reason,
            )
        )

    # A negative limit would slice from the tail and silently drop the
    # lowest-ranked candidates; treat it as "no candidates" instead.
    if candidate_limit is not None and candidate_limit < 0:
        candidate_limit = 0

    band_rank = {"high": 3, "medium": 2, "low": 1}
    # Rank by confidence band first, then by score, best first.
    candidates = sorted(
        candidates,
        key=lambda item: (band_rank.get(item.confidence_band, 0), item.score),
        reverse=True,
    )[:candidate_limit]
    page_hash = hashlib.sha256(html.encode("utf-8", errors="ignore")).hexdigest() if html else ""
    title = _clean_text(" ".join(parser.title_parts), limit=200)

    return HtmlDiagnostics(
        content_length=len(html),
        page_hash=page_hash,
        title=title,
        link_count=len(parser.links),
        same_host_link_count=same_host_count,
        campaign_link_candidates=[candidate.to_dict() for candidate in candidates],
    )
|