強化 PChome 商品身份比對防錯配

2026-05-20 11:07:05 +08:00
parent f7772cc465
commit 93f1a558a5
6 changed files with 233 additions and 28 deletions
--- a/TODO_NEXT_STEPS.txt
+++ b/TODO_NEXT_STEPS.txt
@@ -4,6 +4,7 @@
 ================================================================================

 【已完成】
+   - V10.312 強化 PChome 商品身份比對防錯配：matcher 開始解析 mg/mcg 劑量、件組套組與多規格集合，60ml+150ml vs 60ml+20ml、10mg vs 20mg、10片 vs 10盒、精華 vs 化妝水都會進硬否決或單位價覆核，不再靠單一規格重疊放行；覆核診斷同步新增「劑量差異」標籤，降低核心比價錯配污染 daily/growth/PPT/AI 分析。
   - V10.311 統一競品價差語意：`fetch_competitor_comparison_results()`、competitor PPT 與 OpenClaw competitor prompt 全部改用 `MOMO - PChome`，正值代表 MOMO 較貴 / PChome 低價壓力，負值代表 MOMO 價格優勢；避免 daily/growth 顯示價格壓力但 PPT/AI 反向解讀。
   - V10.310 強化 MOMO/PChome 核心比價閉環：PChome feeder 搜尋候選只有強同款 `0.90` 才提前停止，避免第一個 0.76 次佳候選卡掉後續精準搜尋詞；人工否決的候選會被跳過並改挑下一個候選，不再讓已否決商品長期阻塞同 SKU。人工 `reject_identity`、`unit_price_required`、`needs_research` 會立即讓同候選正式 `competitor_prices` 過期，Dashboard 即使尚有舊價也不再顯示正式總價差；手機版比價覆核欄位標籤、覆核按鈕冒泡與候選證據顯示同步修正。
   - V10.308 修正商品列表 PChome 比價閉環狀態：`manual_rejected`、`manual_unit_price_required`、`manual_needs_research` 不再掉回籠統「待比對」，改顯示「人工已否決 / 人工標記單位價 / 人工要求補搜尋」與後續 feeder 行為說明，避免人工覆核後 UI 看起來像沒有處理。
--- a/config.py
+++ b/config.py
@@ -320,7 +320,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '')
 # ==========================================
 # 系統版本與路徑
 # ==========================================
-SYSTEM_VERSION = "V10.311"
+SYSTEM_VERSION = "V10.312"
 LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log')
 public_url = PUBLIC_URL  # 用於模板顯示

--- a/docs/AI_INTELLIGENCE_MODULE_SOT.md
+++ b/docs/AI_INTELLIGENCE_MODULE_SOT.md
@@ -605,3 +605,4 @@ POSTGRES_HOST=momo-db
 | 2026-04-17 | 188 容器無 volume mount，`docker cp` 臨時解 | 重建 image（`COPY . .` bake 進新代碼）；port 5001 衝突記錄為技術債 |
 | 2026-04-17 | 188 .env Telegram token 不正確（split-brain）| 修正為 `8610496165`，188→Telegram message_id=282 確認 |
 | 2026-04-17 | NIM Tool Calling E2E | 真實 NVIDIA_API_KEY 驗證：dispatched=3, errors=[] |
+| 2026-05-20 | PChome 商品身份比對仍可能因單一規格重疊誤放行 | V10.312 起 matcher 解析 mg/mcg 劑量、件組套組、多規格集合與同數字不同單位；劑量/容量/重量/件數/品類衝突會硬否決或導向單位價覆核，避免錯配污染 Dashboard、daily/growth、PPT 與 AI 競價分析 |
--- a/services/competitor_intel_repository.py
+++ b/services/competitor_intel_repository.py
@@ -79,6 +79,7 @@ MATCH_DIAGNOSTIC_REASON_LABELS = {
    "type_conflict": "品類不符",
    "volume_conflict": "容量差異",
    "weight_conflict": "重量差異",
+    "dosage_conflict": "劑量差異",
    "count_conflict": "件數差異",
    "component_count_conflict": "入數差異",
    "multi_component_conflict": "組合差異",
--- a/services/marketplace_product_matcher.py
+++ b/services/marketplace_product_matcher.py
@@ -85,8 +85,10 @@ GENERIC_TOKENS = {
    "片",
    "支",
    "條",
+    "件",
    "ml",
    "g",
+    "mg",
    "la",
    "paris",
 }
@@ -107,9 +109,50 @@ PRODUCT_TYPES = {
    "保健": ("錠", "膠囊", "粉", "飲", "包", "健康食品"),
 }

-COUNT_UNITS = {"入", "組", "瓶", "支", "條", "盒", "包", "片", "顆", "錠", "枚"}
+COUNT_UNITS = {"入", "組", "瓶", "支", "條", "盒", "包", "片", "顆", "錠", "枚", "件"}
 PIECE_UNITS = {"包", "片", "顆", "錠", "枚"}
 CONTAINER_UNITS = {"入", "組", "盒"}
+BUNDLE_OFFER_PHRASES = (
+    "囤貨組",
+    "超值組",
+    "特惠組",
+    "優惠組",
+    "優惠套組",
+    "禮盒組",
+    "加大組",
+    "加量組",
+    "分享組",
+    "明星組",
+    "套組",
+    "組合",
+    "組合包",
+    "雙件組",
+    "二件組",
+    "2件組",
+    "家庭組",
+    "多入組",
+)
+NON_BRAND_BRACKET_PHRASES = (
+    "保濕組",
+    "熱銷款",
+    "限定",
+    "特惠",
+    "優惠",
+    "超值",
+    "囤貨",
+    "組合",
+    "套組",
+    "禮盒",
+    "分享",
+    "雙件",
+    "二件",
+    "2件",
+    "家庭",
+    "多入",
+    "任選",
+    "買",
+    "母親節",
+)
 CHINESE_COUNT = {
    "一": 1,
    "二": 2,
@@ -137,6 +180,7 @@ class ProductIdentity:
    core_tokens: frozenset[str]
    volumes_ml: tuple[float, ...]
    weights_g: tuple[float, ...]
+    dosages_mg: tuple[float, ...]
    counts: tuple[tuple[int, str], ...]
    total_piece_count: Optional[int]

@@ -261,9 +305,10 @@ def _leading_brand_tokens(original: str, normalized: str) -> set[str]:
    bracket_match = re.match(r"\s*[【\[]([^】\]]{2,40})[】\]]", original or "")
    if bracket_match:
        content = normalize_product_text(bracket_match.group(1))
-        for token in _tokenize(_strip_noise(content)):
-            if token not in GENERIC_TOKENS:
-                tokens.add(token)
+        if not any(phrase in content for phrase in NON_BRAND_BRACKET_PHRASES):
+            for token in _tokenize(_strip_noise(content)):
+                if token not in GENERIC_TOKENS:
+                    tokens.add(token)

    leading = normalized[:48]
    for token in _tokenize(leading):
@@ -293,28 +338,37 @@ def _convert_volume(value: str, unit: str) -> Optional[tuple[str, float]]:
        return ("g", number)
    if unit == "kg":
        return ("g", number * 1000)
+    if unit in {"mg", "毫克"}:
+        return ("mg", number)
+    if unit in {"mcg", "μg", "ug", "微克"}:
+        return ("mg", number / 1000)
    return None


-def _extract_specs(text: str) -> tuple[tuple[float, ...], tuple[float, ...], tuple[tuple[int, str], ...], Optional[int]]:
+def _extract_specs(
+    text: str,
+) -> tuple[tuple[float, ...], tuple[float, ...], tuple[float, ...], tuple[tuple[int, str], ...], Optional[int]]:
    volumes_ml: list[float] = []
    weights_g: list[float] = []
-    for match in re.finditer(r"(\d+(?:\.\d+)?)\s*(ml|毫升|l|g|公克|kg)", text, re.I):
+    dosages_mg: list[float] = []
+    for match in re.finditer(r"(\d+(?:\.\d+)?)\s*(ml|毫升|l|g|公克|kg|mg|毫克|mcg|μg|ug|微克)", text, re.I):
        converted = _convert_volume(match.group(1), match.group(2))
        if not converted:
            continue
        unit, number = converted
        if unit == "ml":
            volumes_ml.append(number)
-        else:
+        elif unit == "g":
            weights_g.append(number)
+        else:
+            dosages_mg.append(number)

    counts: list[tuple[int, str]] = []
-    for match in re.finditer(r"(\d+)\s*([入組瓶支條盒包片顆錠枚])", text):
+    for match in re.finditer(r"(\d+)\s*([入組瓶支條盒包片顆錠枚件])", text):
        counts.append((int(match.group(1)), match.group(2)))
-    for match in re.finditer(r"([一二兩雙三四五六七八九十])\s*([入組瓶支條盒包片顆錠枚])", text):
+    for match in re.finditer(r"([一二兩雙三四五六七八九十])\s*([入組瓶支條盒包片顆錠枚件])", text):
        counts.append((CHINESE_COUNT[match.group(1)], match.group(2)))
-    for match in re.finditer(r"(?:x|乘)\s*(\d+)\s*([入組瓶支條盒包片顆錠枚])?", text, re.I):
+    for match in re.finditer(r"(?:x|乘)\s*(\d+)\s*([入組瓶支條盒包片顆錠枚件])?", text, re.I):
        unit = match.group(2) or "入"
        counts.append((int(match.group(1)), unit))

@@ -334,6 +388,7 @@ def _extract_specs(text: str) -> tuple[tuple[float, ...], tuple[float, ...], tup
    return (
        tuple(sorted(set(volumes_ml))),
        tuple(sorted(set(weights_g))),
+        tuple(sorted(set(dosages_mg))),
        unique_counts,
        total_piece_count,
    )
@@ -351,11 +406,11 @@ def parse_product_identity(name: str) -> ProductIdentity:
        for token in tokens
        if token not in GENERIC_TOKENS
        and not token.isdigit()
-        and not re.fullmatch(r"\d+(ml|g|kg|l)?", token)
+        and not re.fullmatch(r"\d+(ml|g|kg|l|mg|mcg|ug)?", token)
    }
    core_tokens -= brand_tokens

-    volumes_ml, weights_g, counts, total_piece_count = _extract_specs(searchable)
+    volumes_ml, weights_g, dosages_mg, counts, total_piece_count = _extract_specs(searchable)
    return ProductIdentity(
        original_name=name or "",
        normalized_name=normalized,
@@ -366,6 +421,7 @@ def parse_product_identity(name: str) -> ProductIdentity:
        core_tokens=frozenset(core_tokens),
        volumes_ml=volumes_ml,
        weights_g=weights_g,
+        dosages_mg=dosages_mg,
        counts=counts,
        total_piece_count=total_piece_count,
    )
@@ -408,18 +464,58 @@ def _close_number(left: float, right: float, tolerance: float = 0.04) -> bool:


 def _spec_component(left_values: Iterable[float], right_values: Iterable[float]) -> tuple[float, bool]:
-    left_tuple = tuple(left_values)
-    right_tuple = tuple(right_values)
+    left_tuple = tuple(sorted(set(left_values)))
+    right_tuple = tuple(sorted(set(right_values)))
    if not left_tuple and not right_tuple:
        return 0.55, False
    if not left_tuple or not right_tuple:
        return 0.45, False
+    if len(left_tuple) > 1 or len(right_tuple) > 1:
+        if len(left_tuple) != len(right_tuple):
+            return 0.0, True
+        unmatched = list(right_tuple)
+        for left_value in left_tuple:
+            match_index = next(
+                (
+                    index
+                    for index, right_value in enumerate(unmatched)
+                    if _close_number(left_value, right_value)
+                ),
+                None,
+            )
+            if match_index is None:
+                return 0.0, True
+            unmatched.pop(match_index)
+        return 1.0, False
    for left_value in left_tuple:
        if any(_close_number(left_value, right_value) for right_value in right_tuple):
            return 1.0, False
    return 0.0, True


+def _has_hard_count_unit_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
+    if not left.counts or not right.counts:
+        return False
+    left_by_count: dict[int, set[str]] = {}
+    right_by_count: dict[int, set[str]] = {}
+    for count, unit in left.counts:
+        left_by_count.setdefault(count, set()).add(unit)
+    for count, unit in right.counts:
+        right_by_count.setdefault(count, set()).add(unit)
+
+    for count in set(left_by_count) & set(right_by_count):
+        left_units = left_by_count[count]
+        right_units = right_by_count[count]
+        if left_units & right_units:
+            continue
+        if (
+            (left_units & PIECE_UNITS and right_units & CONTAINER_UNITS)
+            or (right_units & PIECE_UNITS and left_units & CONTAINER_UNITS)
+        ):
+            return True
+    return False
+
+
 def _count_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, bool]:
    left_counts = [count for count, _unit in left.counts]
    right_counts = [count for count, _unit in right.counts]
@@ -431,6 +527,8 @@ def _count_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float,
    if left.counts and right.counts:
        if set(left.counts) & set(right.counts):
            return 0.85, False
+        if _has_hard_count_unit_conflict(left, right):
+            return 0.0, True
        if left_counts and right_counts:
            ratio = max(max(left_counts), max(right_counts)) / max(min(max(left_counts), max(right_counts)), 1)
            if ratio >= 1.5:
@@ -444,6 +542,7 @@ def _count_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float,
 def _spec_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, bool, tuple[str, ...]]:
    volume_score, volume_conflict = _spec_component(left.volumes_ml, right.volumes_ml)
    weight_score, weight_conflict = _spec_component(left.weights_g, right.weights_g)
+    dosage_score, dosage_conflict = _spec_component(left.dosages_mg, right.dosages_mg)
    count_score, count_conflict = _count_score(left, right)

    available = []
@@ -451,6 +550,8 @@ def _spec_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, b
        available.append(volume_score)
    if left.weights_g or right.weights_g:
        available.append(weight_score)
+    if left.dosages_mg or right.dosages_mg:
+        available.append(dosage_score)
    if left.counts or right.counts:
        available.append(count_score)
    if not available:
@@ -462,6 +563,8 @@ def _spec_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, b
        conflicts.append("volume_conflict")
    if weight_conflict:
        conflicts.append("weight_conflict")
+    if dosage_conflict:
+        conflicts.append("dosage_conflict")
    if count_conflict:
        conflicts.append("count_conflict")
    return score, bool(conflicts), tuple(conflicts)
@@ -473,16 +576,7 @@ def _has_bundle_offer(identity: ProductIdentity) -> bool:
        re.search(r"買\s*\d+\s*送\s*\d+", text)
        or re.search(r"買\s*[一二兩雙三四五六七八九十]\s*送\s*[一二兩雙三四五六七八九十]", text)
        or "買一送一" in text
-        or "囤貨組" in text
-        or "超值組" in text
-        or "特惠組" in text
-        or "優惠套組" in text
-        or "禮盒組" in text
-        or "加大組" in text
-        or "加量組" in text
-        or "分享組" in text
-        or "明星組" in text
-        or "套組" in text
+        or any(phrase in text for phrase in BUNDLE_OFFER_PHRASES)
    )


@@ -491,7 +585,7 @@ def _has_multi_component(identity: ProductIdentity) -> bool:
    return bool(
        "+" in text
        or "＋" in text
-        or re.search(r"\d+\s*(?:ml|g|毫升|公克)\s*x\s*\d+", text, re.I)
+        or re.search(r"\d+\s*(?:ml|g|mg|毫升|公克|毫克)\s*x\s*\d+", text, re.I)
    )


@@ -507,7 +601,13 @@ def _has_refill_pack(identity: ProductIdentity) -> bool:


 def _spec_mention_count(identity: ProductIdentity) -> int:
-    return len(re.findall(r"\d+(?:\.\d+)?\s*(?:ml|毫升|l|g|公克|kg)", identity.normalized_name, re.I))
+    return len(
+        re.findall(
+            r"\d+(?:\.\d+)?\s*(?:ml|毫升|l|g|公克|kg|mg|毫克|mcg|μg|ug|微克)",
+            identity.normalized_name,
+            re.I,
+        )
+    )


 def _count_text_value(value: str) -> Optional[int]:
@@ -526,6 +626,11 @@ def _pack_multiplier(identity: ProductIdentity) -> int:
            return left + right
    if "買一送一" in text or "買1送1" in text:
        return 2
+    piece_pack = re.search(r"(\d+|[一二兩雙三四五六七八九十])\s*件\s*組", text)
+    if piece_pack:
+        count = _count_text_value(piece_pack.group(1)) or 0
+        if count > 1:
+            return count

    multipliers = [count for count, unit in identity.counts if unit in COUNT_UNITS and count > 1]
    if multipliers:
@@ -764,7 +869,7 @@ def score_marketplace_match(
        hard_veto = True
    if chinese_name_score < 0.16 and token_score < 0.72:
        hard_veto = True
-    if left.product_type and right.product_type and left.product_type != right.product_type and token_score < 0.55:
+    if left.product_type and right.product_type and left.product_type != right.product_type:
        hard_veto = True

    comparison_mode = "exact_identity"
@@ -845,6 +950,10 @@ def build_search_terms(name: str, max_terms: int = 3) -> list[str]:
        specs.append(f"{int(identity.volumes_ml[0])}ml")
    if identity.weights_g:
        specs.append(f"{int(identity.weights_g[0])}g")
+    if identity.dosages_mg:
+        dosage = identity.dosages_mg[0]
+        dosage_label = f"{int(dosage)}mg" if dosage.is_integer() else f"{dosage:g}mg"
+        specs.append(dosage_label)
    if identity.total_piece_count:
        specs.append(f"{identity.total_piece_count}包")

--- a/tests/test_marketplace_product_matcher.py
+++ b/tests/test_marketplace_product_matcher.py
@@ -167,6 +167,99 @@ def test_marketplace_matcher_does_not_promote_wide_price_refill_candidate():
    assert "strong_product_line_match" not in diagnostics.reasons


+def test_marketplace_matcher_rejects_partial_overlap_in_multi_spec_set():
+    from services.marketplace_product_matcher import score_marketplace_match
+
+    diagnostics = score_marketplace_match(
+        "【蘭蔻】玫瑰霜60ml+玫瑰精露150ml",
+        "【蘭蔻】玫瑰霜60ml+玫瑰精露20ml",
+        momo_price=18765,
+        competitor_price=7999,
+    )
+
+    assert diagnostics.score < 0.76
+    assert diagnostics.hard_veto is True
+    assert "volume_conflict" in diagnostics.reasons
+    assert diagnostics.comparison_mode == "not_comparable"
+
+
+def test_marketplace_matcher_rejects_dosage_conflict():
+    from services.marketplace_product_matcher import score_marketplace_match
+
+    diagnostics = score_marketplace_match(
+        "品牌 葉黃素 10mg 60錠",
+        "品牌 葉黃素 20mg 60錠",
+        momo_price=990,
+        competitor_price=890,
+    )
+
+    assert diagnostics.score < 0.76
+    assert diagnostics.hard_veto is True
+    assert "dosage_conflict" in diagnostics.reasons
+
+
+def test_marketplace_matcher_rejects_product_type_conflict_even_when_line_matches():
+    from services.marketplace_product_matcher import score_marketplace_match
+
+    diagnostics = score_marketplace_match(
+        "理膚寶水 MelaB3 淡斑精華 30ml",
+        "理膚寶水 MelaB3 淡斑化妝水 30ml",
+        momo_price=1280,
+        competitor_price=1180,
+    )
+
+    assert diagnostics.score < 0.76
+    assert diagnostics.hard_veto is True
+    assert "type_conflict" in diagnostics.reasons
+    assert diagnostics.comparison_mode == "not_comparable"
+
+
+def test_marketplace_matcher_rejects_same_count_different_unit_family():
+    from services.marketplace_product_matcher import score_marketplace_match
+
+    diagnostics = score_marketplace_match(
+        "品牌 保濕面膜10片",
+        "品牌 保濕面膜10盒",
+        momo_price=399,
+        competitor_price=1990,
+    )
+
+    assert diagnostics.score < 0.76
+    assert diagnostics.hard_veto is True
+    assert "count_conflict" in diagnostics.reasons
+
+
+def test_marketplace_matcher_marks_generic_bundle_words_as_unit_comparable():
+    from services.marketplace_product_matcher import score_marketplace_match
+
+    diagnostics = score_marketplace_match(
+        "【NARS】裸光蜜粉餅 雙件組 10g",
+        "【NARS】裸光蜜粉餅 10g",
+        momo_price=1999,
+        competitor_price=1099,
+    )
+
+    assert diagnostics.score < 0.76
+    assert diagnostics.hard_veto is True
+    assert "bundle_offer_conflict" in diagnostics.reasons
+    assert diagnostics.comparison_mode == "unit_comparable"
+
+
+def test_marketplace_matcher_ignores_non_brand_bracket_copy():
+    from services.marketplace_product_matcher import score_marketplace_match
+
+    diagnostics = score_marketplace_match(
+        "【保濕組】理膚寶水 B5 修復霜 40ml",
+        "理膚寶水 B5 修復霜 40ml",
+        momo_price=699,
+        competitor_price=679,
+    )
+
+    assert diagnostics.score >= 0.76
+    assert diagnostics.hard_veto is False
+    assert "brand_conflict" not in diagnostics.reasons
+
+
 def test_batch_compare_top_uses_latest_momo_price_not_revenue(monkeypatch):
    from services import pchome_crawler