"""市場情報 HTML 診斷解析工具。

只萃取頁面標題、連結候選與內容指紋，不建立正式 campaign/product。
"""

import hashlib
import re
from dataclasses import asdict, dataclass
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse


@dataclass(frozen=True)
class LinkCandidate:
    """A link found on a page that looks like it may point to a campaign.

    Instances are immutable. The per-field notes describe how
    ``parse_html_diagnostics`` fills them in.
    """

    # Link URL after being resolved against the page's base URL.
    href: str
    # Anchor text with whitespace collapsed and length capped.
    text: str
    # Whether the link's host matches the host of the base URL.
    is_same_host: bool
    # Combined score: generic_score + platform_score.
    score: int
    # Score from the built-in keyword heuristic.
    generic_score: int
    # Score from the optional caller-supplied scorer; 0 when none is given.
    platform_score: int
    # Review band derived from the score: "high", "medium" or "low".
    confidence_band: str
    # Comma-separated tags: host relation plus any positive sub-scores.
    confidence_reason: str

    def to_dict(self) -> dict:
        """Return the fields as a plain dict via ``dataclasses.asdict``."""
        return asdict(self)


@dataclass(frozen=True)
class HtmlDiagnostics:
    """Summary of the diagnostics extracted from one HTML document.

    The per-field notes describe how ``parse_html_diagnostics`` fills them in.
    """

    # Length of the HTML string in characters (not bytes).
    content_length: int
    # SHA-256 hex digest of the UTF-8 encoded HTML; "" for an empty document.
    page_hash: str
    # Page title with whitespace collapsed and length capped.
    title: str
    # Number of anchor links collected by the parser, before any filtering.
    link_count: int
    # Number of http(s) links whose host matches the base URL's host.
    same_host_link_count: int
    # Ranked, size-limited list of LinkCandidate dicts (see LinkCandidate.to_dict).
    campaign_link_candidates: list

    def to_dict(self) -> dict:
        """Return the fields as a plain dict via ``dataclasses.asdict``."""
        return asdict(self)


class _DiagnosticHtmlParser(HTMLParser):
    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.title_parts = []
        self.links = []
        self._in_title = False
        self._active_href = None
        self._active_text = []

    def handle_starttag(self, tag, attrs):
        attrs_map = dict(attrs or [])
        if tag.lower() == "title":
            self._in_title = True
        if tag.lower() == "a":
            self._active_href = attrs_map.get("href")
            self._active_text = []

    def handle_data(self, data):
        if self._in_title:
            self.title_parts.append(data)
        if self._active_href is not None:
            self._active_text.append(data)

    def handle_endtag(self, tag):
        tag = tag.lower()
        if tag == "title":
            self._in_title = False
        if tag == "a" and self._active_href is not None:
            self.links.append((self._active_href, " ".join(self._active_text)))
            self._active_href = None
            self._active_text = []


def _clean_text(value, limit=160):
    text = re.sub(r"\s+", " ", value or "").strip()
    return text[:limit]


def _score_link(href, text):
    haystack = f"{href} {text}".lower()
    score = 0
    for keyword in ("edm", "event", "promo", "campaign", "sale", "activity"):
        if keyword in haystack:
            score += 2
    for keyword in ("活動", "優惠", "折扣", "檔期", "品牌日", "限時", "促銷"):
        if keyword in haystack:
            score += 3
    return score


def _confidence_band(score, *, is_same_host, platform_score, generic_score):
    """把診斷分數轉成人工審核用信心帶。"""
    reasons = []
    reasons.append("same_host" if is_same_host else "external_host")
    if platform_score > 0:
        reasons.append(f"platform_score={platform_score}")
    if generic_score > 0:
        reasons.append(f"generic_score={generic_score}")

    if score >= 12 and is_same_host:
        return "high", ", ".join(reasons)
    if score >= 6:
        return "medium", ", ".join(reasons)
    return "low", ", ".join(reasons)


def parse_html_diagnostics(html, base_url="", candidate_limit=12, score_link=None):
    """Parse HTML into a diagnostic summary; nothing is persisted.

    Args:
        html: Page markup as ``str``. ``None`` or "" is treated as an empty
            page.
        base_url: URL of the page, used to resolve relative hrefs and to
            decide whether a link is on the same host.
        candidate_limit: Maximum number of ranked candidates to return.
            ``None`` means no limit; negative values are treated as 0.
        score_link: Optional callable ``(href, text) -> number`` that adds a
            platform-specific score; its result is converted with ``int``.

    Returns:
        An ``HtmlDiagnostics`` holding the content length, SHA-256 page hash,
        cleaned title, link counts, and the ranked candidate links as dicts.
        Candidates are ordered by confidence band, then by score, both
        descending. Only links with a positive score are candidates.
    """
    html = html or ""
    base_url = base_url or ""

    parser = _DiagnosticHtmlParser()
    parser.feed(html)
    # feed() may keep incomplete trailing input buffered; close() forces the
    # parser to process it as end-of-file.
    parser.close()

    # Host names are case-insensitive, so compare them lower-cased.
    base_host = urlparse(base_url).netloc.lower()
    candidates = []
    same_host_count = 0

    for raw_href, raw_text in parser.links:
        # HTML permits whitespace around the URL inside an href attribute.
        raw_href = (raw_href or "").strip()
        if not raw_href:
            continue
        try:
            href = urljoin(base_url, raw_href)
            parsed = urlparse(href)
        except ValueError:
            # Malformed URL in untrusted markup (e.g. unmatched "[" in the
            # host): skip this link rather than abort the whole diagnostic.
            continue
        if parsed.scheme not in {"http", "https"}:
            continue
        is_same_host = bool(base_host and parsed.netloc.lower() == base_host)
        if is_same_host:
            same_host_count += 1
        text = _clean_text(raw_text)
        generic_score = _score_link(href, text)
        platform_score = int(score_link(href, text)) if score_link else 0
        score = generic_score + platform_score
        if score <= 0:
            continue
        confidence_band, confidence_reason = _confidence_band(
            score,
            is_same_host=is_same_host,
            platform_score=platform_score,
            generic_score=generic_score,
        )
        candidates.append(
            LinkCandidate(
                href=href,
                text=text,
                is_same_host=is_same_host,
                score=score,
                generic_score=generic_score,
                platform_score=platform_score,
                confidence_band=confidence_band,
                confidence_reason=confidence_reason,
            )
        )

    band_rank = {"high": 3, "medium": 2, "low": 1}
    # sorted() is stable even with reverse=True, so ties keep document order.
    candidates = sorted(
        candidates,
        key=lambda item: (band_rank.get(item.confidence_band, 0), item.score),
        reverse=True,
    )
    if candidate_limit is not None:
        # Clamp so a negative limit yields no candidates instead of slicing
        # from the tail of the list.
        candidates = candidates[: max(candidate_limit, 0)]

    page_hash = hashlib.sha256(html.encode("utf-8", errors="ignore")).hexdigest() if html else ""
    title = _clean_text(" ".join(parser.title_parts), limit=200)

    return HtmlDiagnostics(
        content_length=len(html),
        page_hash=page_hash,
        title=title,
        link_count=len(parser.links),
        same_host_link_count=same_host_count,
        campaign_link_candidates=[candidate.to_dict() for candidate in candidates],
    )