[V10.343] 強化 PChome 商品搜尋召回

2026-05-20 16:21:19 +08:00
parent 48e46e35c0
commit 193b6e53c5
8 changed files with 356 additions and 29 deletions
--- a/config.py
+++ b/config.py
@@ -320,7 +320,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '')
 # ==========================================
 # 系統版本與路徑
 # ==========================================
-SYSTEM_VERSION = "V10.342"
+SYSTEM_VERSION = "V10.343"
 LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log')
 public_url = PUBLIC_URL  # 用於模板顯示

--- a/docs/memory/code_modularization_inventory_20260430.md
+++ b/docs/memory/code_modularization_inventory_20260430.md
@@ -26,6 +26,7 @@
 - 2026-05-20 追記：同步背景 PChome identity / price direction 更新後的 `services/marketplace_product_matcher.py` 行數；此處只更新 inventory，不變更商品比對行為。
 - 2026-05-20 追記：同步背景 PChome crawler 搜尋韌性擴充後的 `services/pchome_crawler.py` 行數；此處只更新 inventory，不變更 PChome crawler 行為。
 - 2026-05-20 追記：同步 PChome 近門檻候選重評與 matcher 系列/刀片數防錯配更新後的 `services/marketplace_product_matcher.py`、`services/competitor_price_feeder.py` 行數；此處只更新 inventory，不變更比價行為。
+- 2026-05-20 追記：同步 PChome 搜尋詞品質層、候選召回與 hard-veto 狀態分流更新後的 `services/marketplace_product_matcher.py`、`services/competitor_price_feeder.py` 行數；並補列背景市場情報 deployment readiness 大檔，僅更新 inventory。

 ## 達到或超過 800 行檔案清單

@@ -52,16 +53,17 @@
 | 940 | `services/import_service.py` | P2 import service | validators / import writers / report builders |
 | 933 | `services/telegram_templates.py` | P2 Telegram templates | alert template groups / channel-specific formatting / reusable render helpers |
 | 867 | `services/token_report_service.py` | P2 token report service | query / aggregation / chart payload / notification formatting |
-| 1128 | `services/marketplace_product_matcher.py` | P2 marketplace matcher | identity parsing / unit-comparable scoring / persistence normalization |
+| 1356 | `services/marketplace_product_matcher.py` | P2 marketplace matcher | identity parsing / unit-comparable scoring / search term quality / persistence normalization |
 | 865 | `routes/daily_sales_routes.py` | P2 Daily Sales Blueprint | route glue / export helpers / daily query and formatting service |
 | 844 | `services/ollama_service.py` | P2 Ollama client | host health / request client / fallback policy / response parsing |
 | 849 | `services/pchome_crawler.py` | P2 PChome crawler | search fetch / parsing / fallback source handling / rate limit policy |
 | 1042 | `services/code_review_pipeline_service.py` | P2 Code review pipeline service | scan orchestration / finding normalization / persistence adapter |
 | 953 | `routes/export_routes.py` | P2 Export flow | export command/router glue / file path / download orchestration |
 | 816 | `services/ppt_vision_service.py` | P2 PPT vision QA service | runtime state / queue status / model probe / audit execution 分離 |
-| 1592 | `services/competitor_price_feeder.py` | P2 competitor price feeder | crawler scheduling / price normalization / cache strategy |
+| 1602 | `services/competitor_price_feeder.py` | P2 competitor price feeder | crawler scheduling / price normalization / cache strategy |
 | 1120 | `services/competitor_intel_repository.py` | P2 competitor intel repository | review queue query / cache shaping / formatting helpers |
 | 805 | `routes/bot_api_routes.py` | P2 Bot API Blueprint | route glue / bot action service |
+| 811 | `services/market_intel/deployment_readiness.py` | P2 market intel deployment readiness | preflight gates / readiness payload / route contract helpers |

 ## 市場情報開發前置禁區

--- a/services/competitor_price_feeder.py
+++ b/services/competitor_price_feeder.py
@@ -144,7 +144,13 @@ def _build_search_keywords(momo_name: str) -> list:
        cleaned = _clean_search_text(momo_name)
        terms = [cleaned[:36], cleaned[:24]]

-    return _dedupe_terms(terms)
+    primary_terms = _dedupe_terms(terms[: max(1, MAX_SEARCH_TERMS - 1)])
+    original_terms = _dedupe_terms([momo_name])
+    for term in original_terms:
+        if term.lower() not in {existing.lower() for existing in primary_terms}:
+            primary_terms.append(term)
+            break
+    return _dedupe_terms(primary_terms)


 def _format_match_diagnostics(diagnostics) -> str:
@@ -252,8 +258,9 @@ def _search_pchome_candidates(crawler, momo_name: str, keywords: list = None, mo
    """以多組搜尋詞擴大 PChome 候選池，只在強同款時提前停止。"""
    candidates = []
    seen_ids = set()
+    search_limit = SEARCH_LIMIT * max(1, SEARCH_MAX_PAGES)
    for keyword in keywords or _build_search_keywords(momo_name):
-        ok, _, products = crawler.search_products(keyword, limit=SEARCH_LIMIT, max_pages=SEARCH_MAX_PAGES)
+        ok, _, products = crawler.search_products(keyword, limit=search_limit, max_pages=SEARCH_MAX_PAGES)
        if not ok or not products:
            continue
        for product in products:
@@ -1154,6 +1161,7 @@ class CompetitorPriceFeeder:
                    continue

                if score < MIN_MATCH_SCORE and not manual_accept_override:
+                    attempt_status = "identity_veto" if getattr(diagnostics, "hard_veto", False) else "low_score"
                    logger.debug(
                        f"[Feeder] {sku} 比對分數過低 ({score:.3f} < {MIN_MATCH_SCORE})，"
                        f"{_format_match_diagnostics(diagnostics)}"
@@ -1165,7 +1173,7 @@ class CompetitorPriceFeeder:
                        momo_price=momo_price,
                        search_terms=search_terms,
                        candidate_count=len(products),
-                        attempt_status="low_score",
+                        attempt_status=attempt_status,
                        best_product=best_product,
                        best_score=score,
                        diagnostics=diagnostics,
@@ -1382,6 +1390,7 @@ class CompetitorPriceFeeder:
                    continue

                if score < MIN_MATCH_SCORE:
+                    attempt_status = "identity_veto" if getattr(diagnostics, "hard_veto", False) else "refresh_low_score"
                    self._record_match_attempt(
                        sku,
                        momo_name,
@@ -1389,7 +1398,7 @@ class CompetitorPriceFeeder:
                        momo_price=momo_price,
                        search_terms=search_terms,
                        candidate_count=1,
-                        attempt_status="refresh_low_score",
+                        attempt_status=attempt_status,
                        best_product=best_product,
                        best_score=score,
                        diagnostics=diagnostics,
--- a/services/marketplace_product_matcher.py
+++ b/services/marketplace_product_matcher.py
@@ -111,6 +111,96 @@ GENERIC_TOKENS = {
    "美國",
 }

+SEARCH_NOISE_PHRASES = (  # Marketing/option substrings that _clean_search_phrase blanks out of a search phrase.
+    "新品上市",
+    "全新上市",
+    "任選一款",
+    "任選1款",
+    "任選一色",
+    "任選1色",
+    "多款任選",
+    "多款可選",
+    "色號可選",
+    "香味可選",
+    "口味可選",
+    "送精美紙袋",
+    "精美紙袋",
+    "交換禮物",
+    "聖誕禮物",
+    "母親節",
+    "父親節",
+    "情人節",
+    "外出清潔",
+    "卸除髒汙",
+    "卸除防曬",
+    "卸防曬",
+    "韓國彩妝",
+    "水光感",
+    "官方直營",
+    "官方",
+)
+
+SEARCH_NOISE_TOKENS = {  # Whole tokens dropped by _clean_search_phrase; also used as a substring penalty in _search_core_score.
+    "一款",
+    "1款",
+    "一色",
+    "1色",
+    "上市",
+    "全新",
+    "新品",
+    "香味",
+    "口味",
+    "味道",
+    "顏色",
+    "色號",
+    "紙袋",
+    "禮物",
+    "清潔",
+    "髒汙",
+    "防曬",
+    "彩妝",
+    "水光感",
+}
+
+SEARCH_IDENTITY_ANCHORS = (  # Product-type nouns scanned in tuple order by _extract_anchor_phrases; more specific anchors precede the shorter anchors they contain.
+    "免用水潔淨液",
+    "身體按摩精油",
+    "按摩精油",
+    "擴香補充瓶",
+    "擴香瓶",
+    "全面修復霜",
+    "修復霜",
+    "護膚膏",
+    "屁屁噴",
+    "身體乳",
+    "緊實乳",
+    "潔膚露",
+    "潔淨液",
+    "護甲油",
+    "指甲油",
+    "美甲片",
+    "唇凍",
+    "唇釉",
+    "唇膏",
+    "粉底棒",
+    "遮瑕棒",
+    "化妝水",
+    "精華液",
+    "精華",
+    "面膜",
+    "乳液",
+    "乳霜",
+    "面霜",
+    "精油",
+    "水氧機",
+    "香氛機",
+)
+
+SEARCH_AMBIGUOUS_PRODUCT_TERMS = {  # Terms demoted by _search_core_score when another token has an anchor, and skipped by _ranked_search_core_phrases once a phrase is chosen.
+    "保護膜",
+    "保護貼",
+}
+
 BRAND_ALIAS_OVERRIDES = {
    "clarins": ("克蘭詩", "clarins"),
    "nars": ("nars",),
@@ -1099,6 +1189,123 @@ def score_marketplace_match(
    )


+def _clean_search_phrase(value: str) -> str:  # Normalize a phrase, then strip noise phrases, punctuation, and noise/generic tokens.
+    text = normalize_product_text(value)  # helper defined outside this hunk; presumably lower-cases, since noise phrases are matched lower-cased below -- TODO confirm
+    for phrase in sorted(SEARCH_NOISE_PHRASES, key=len, reverse=True):  # longest first, so "送精美紙袋" is removed before its substring "精美紙袋"
+        text = text.replace(phrase.lower(), " ")
+    text = re.sub(r"[^\w\u4e00-\u9fff]+", " ", text)  # each run of characters outside \w / CJK collapses to one space
+    text = " ".join(
+        token for token in text.split()
+        if token not in SEARCH_NOISE_TOKENS and token not in GENERIC_TOKENS  # only whole whitespace-delimited tokens are dropped
+    )
+    text = re.sub(r"\s+", " ", text).strip()
+    return text
+
+
+def _search_spec_terms(identity: ProductIdentity) -> list[str]:  # Build spec keywords (ml, g, mg, 包) from the parsed identity, in that order.
+    specs: list[str] = []
+    if identity.volumes_ml:
+        volume = identity.volumes_ml[0]  # only the first parsed value of each spec list is used
+        specs.append(f"{volume:g}ml")  # ":g" renders 100.0 as "100"
+    if identity.weights_g:
+        weight = identity.weights_g[0]
+        specs.append(f"{weight:g}g")
+    if identity.dosages_mg:
+        dosage = identity.dosages_mg[0]
+        specs.append(f"{dosage:g}mg")
+    if identity.total_piece_count:  # falsy (None or 0) piece counts add no term
+        specs.append(f"{identity.total_piece_count}包")
+    return specs
+
+
+def _extract_anchor_phrases(token: str) -> list[str]:  # Return the identity-anchor phrases found in a token, in SEARCH_IDENTITY_ANCHORS order.
+    cleaned = _clean_search_phrase(token)
+    if not cleaned:
+        return []
+
+    phrases: list[str] = []
+    for anchor in SEARCH_IDENTITY_ANCHORS:
+        if anchor not in cleaned:
+            continue
+        if re.search(r"[\u4e00-\u9fff]", anchor):  # anchors containing CJK may absorb leading CJK modifiers
+            prefix_width = 0 if len(anchor) >= 5 else (4 if len(anchor) >= 3 else 6)  # 5+ chars: anchor only; 3-4 chars: up to 4 leading chars; 2 chars: up to 6
+            match = re.search(rf"([\u4e00-\u9fff]{{0,{prefix_width}}}{re.escape(anchor)})", cleaned)
+            phrase = match.group(1) if match else anchor
+        else:
+            phrase = anchor  # anchors without CJK characters are used verbatim
+        phrase = _clean_search_phrase(phrase)
+        if any(existing in phrase and existing != phrase for existing in phrases):  # skip a phrase that merely extends one already collected
+            continue
+        if len(phrase) >= 2 and phrase not in phrases:
+            phrases.append(phrase)
+    return phrases
+
+
+def _search_core_score(token: str, all_tokens: set[str]) -> tuple[int, int, str]:  # Return a ranking key (score, -compact length, cleaned text) for one core token.
+    cleaned = _clean_search_phrase(token)
+    if not cleaned:
+        return (-999, 0, cleaned)  # nothing left after cleaning; below the -100 cutoff used by _ranked_search_core_phrases
+    compact = cleaned.replace(" ", "")
+    if compact in SEARCH_NOISE_TOKENS or compact in GENERIC_TOKENS:
+        return (-900, 0, cleaned)  # pure noise/generic token; also below the -100 cutoff
+
+    score = 0
+    if re.search(r"[a-z][a-z0-9-]{2,}", cleaned):  # latin word or model-like run of 3+ characters
+        score += 30
+    if re.search(r"\d", cleaned):
+        score += 12
+
+    anchors = _extract_anchor_phrases(cleaned)
+    if anchors:
+        score += 90  # carrying an identity anchor is the largest single bonus
+        if anchors[0] == compact:  # extra bonus when the token is exactly its first anchor phrase
+            score += 8
+    else:
+        score += max(0, 24 - len(compact))  # no anchor: shorter tokens score higher
+
+    if len(compact) <= 8:
+        score += 14
+    elif len(compact) >= 12:
+        score -= 12
+
+    has_better_anchor = any(  # true when any other token yields an anchor phrase
+        other != token and _extract_anchor_phrases(other)
+        for other in all_tokens
+    )
+    if has_better_anchor and any(term in compact for term in SEARCH_AMBIGUOUS_PRODUCT_TERMS):  # demote ambiguous terms such as "保護膜"
+        score -= 80
+    if any(noise in compact for noise in SEARCH_NOISE_TOKENS):  # substring match, unlike the whole-token check above
+        score -= 18
+
+    return (score, -len(compact), cleaned)
+
+
+def _ranked_search_core_phrases(identity: ProductIdentity, limit: int = 4) -> list[str]:  # Return up to `limit` cleaned phrases, highest-scoring core tokens first.
+    tokens = {token for token in identity.core_tokens if token not in GENERIC_TOKENS}
+    ranked_tokens = sorted(
+        tokens,
+        key=lambda token: _search_core_score(token, tokens),
+        reverse=True,  # larger (score, -length, text) keys come first
+    )
+
+    phrases: list[str] = []
+    for token in ranked_tokens:
+        if _search_core_score(token, tokens)[0] < -100:  # drops the -999 / -900 sentinel scores
+            continue
+        candidates = _extract_anchor_phrases(token) or [_clean_search_phrase(token)]  # prefer anchor phrases, else the cleaned token
+        for phrase in candidates:
+            compact = phrase.replace(" ", "")
+            if len(compact) < 2 or compact in SEARCH_NOISE_TOKENS:
+                continue
+            if any(term in compact for term in SEARCH_AMBIGUOUS_PRODUCT_TERMS) and len(phrases) > 0:  # an ambiguous term is accepted only as the very first phrase
+                continue
+            if phrase not in phrases:
+                phrases.append(phrase)
+            if len(phrases) >= limit:
+                return phrases
+    return phrases
+
+
 def build_search_terms(name: str, max_terms: int = 3) -> list[str]:
    identity = parse_product_identity(name)
    terms: list[str] = []
@@ -1120,30 +1327,27 @@ def build_search_terms(name: str, max_terms: int = 3) -> list[str]:
        return latin[0] if latin else ""

    brand_part = primary_brand_phrase()
-    core = " ".join(sorted(identity.core_tokens, key=lambda token: (-len(token), token))[:4])
-    specs = []
-    if identity.volumes_ml:
-        specs.append(f"{int(identity.volumes_ml[0])}ml")
-    if identity.weights_g:
-        specs.append(f"{int(identity.weights_g[0])}g")
-    if identity.dosages_mg:
-        dosage = identity.dosages_mg[0]
-        dosage_label = f"{int(dosage)}mg" if dosage.is_integer() else f"{dosage:g}mg"
-        specs.append(dosage_label)
-    if identity.total_piece_count:
-        specs.append(f"{identity.total_piece_count}包")
-
-    spec_part = " ".join(specs)
-    core_tokens = sorted(identity.core_tokens, key=lambda token: (-len(token), token))
-    core_short = " ".join(core_tokens[:2])
+    spec_part = " ".join(_search_spec_terms(identity))
+    core_phrases = _ranked_search_core_phrases(identity, limit=4)
+    core_short = " ".join(core_phrases[:2])
+    core_primary = core_phrases[0] if core_phrases else ""
+    model_phrases = [
+        phrase
+        for phrase in core_phrases[1:]
+        if re.fullmatch(r"[a-z]*\d+[a-z0-9-]*", phrase)
+        or re.fullmatch(r"[a-z][a-z0-9-]{2,}", phrase)
+    ]
+    primary_with_model = " ".join(
+        part for part in (core_primary, model_phrases[0] if model_phrases else "") if part
+    )
    for value in (
+        " ".join(part for part in (brand_part, primary_with_model, spec_part) if part),
        " ".join(part for part in (brand_part, core_short, spec_part) if part),
        " ".join(part for part in (brand_part, core_short) if part),
-        " ".join(part for part in (core_short, spec_part) if part),
+        " ".join(part for part in (core_primary, spec_part) if part),
        identity.searchable_name,
    ):
-        cleaned = re.sub(r"[^\w\u4e00-\u9fff]+", " ", value)
-        cleaned = re.sub(r"\s+", " ", cleaned).strip()
+        cleaned = _clean_search_phrase(value)
        if cleaned and cleaned not in terms:
            terms.append(cleaned[:42])
        if len(terms) >= max_terms:
--- a/tests/test_competitor_match_attempts_persistence.py
+++ b/tests/test_competitor_match_attempts_persistence.py
@@ -18,7 +18,8 @@ def test_competitor_feeder_persists_all_match_attempt_outcomes():
    assert "INSERT INTO competitor_match_attempts" in source
    assert "CAST(:search_terms AS jsonb)" in source
    assert 'attempt_status="matched"' in source
-    assert 'attempt_status="low_score"' in source
+    assert '"low_score"' in source
+    assert '"identity_veto"' in source
    assert 'attempt_status="no_result"' in source
    assert 'attempt_status="no_match"' in source
    assert 'attempt_status="error"' in source
@@ -342,6 +343,71 @@ def test_competitor_feeder_skips_rejected_candidate_and_uses_next_best(monkeypat
    assert attempts[0]["best_product"].product_id == "DDAB01-ACCEPTABLE"


+def test_competitor_feeder_splits_hard_veto_from_low_score(monkeypatch):  # A below-threshold match flagged hard_veto must be recorded as "identity_veto".
+    from services.competitor_price_feeder import CompetitorPriceFeeder
+    from services.pchome_crawler import PChomeProduct
+
+    product = PChomeProduct(
+        product_id="DDAB01-WRONG",
+        name="iPhone 16 Pro 保護膜",
+        price=399,
+        original_price=499,
+        discount=20,
+        image_url="",
+        product_url="https://24h.pchome.com.tw/prod/DDAB01-WRONG",
+        stock=20,
+        store="24h",
+        rating=4.7,
+        review_count=8,
+        is_on_sale=True,
+        crawled_at=datetime.now(),
+    )
+
+    class FakeCrawler:  # stand-in crawler that returns the single wrong product for every search
+        def __init__(self, *_args, **_kwargs):
+            pass
+
+        def search_products(self, *_args, **_kwargs):
+            return True, "ok", [product]
+
+    def fake_score(*_args, **_kwargs):  # fixed low-score diagnostics with the hard-veto flag set
+        return SimpleNamespace(
+            score=0.31,
+            brand_score=0.0,
+            token_score=0.1,
+            spec_score=0.55,
+            sequence_score=0.1,
+            type_score=0.55,
+            price_penalty=0.0,
+            hard_veto=True,
+            reasons=("brand_conflict", "product_line_conflict"),
+            comparison_mode="not_comparable",
+            tags=["identity_v2", "identity_veto"],
+        )
+
+    monkeypatch.setattr("services.pchome_crawler.PChomeCrawler", FakeCrawler)
+    monkeypatch.setattr("services.marketplace_product_matcher.score_marketplace_match", fake_score)
+    feeder = CompetitorPriceFeeder(engine=object())
+    attempts = []
+    monkeypatch.setattr(
+        feeder,
+        "_record_match_attempt",
+        lambda *args, **kwargs: attempts.append(kwargs),  # capture keyword arguments instead of persisting
+    )
+
+    result = feeder._run_sku_items([{
+        "sku": "A006",
+        "name": "【TAICEND 泰陞】寶貝液體保護膜 屁屁噴 100ml",
+        "product_id": 6,
+        "momo_price": 399,
+    }])
+
+    assert result.matched == 0
+    assert result.skipped_low_score == 1  # the vetoed candidate is still counted under skipped_low_score
+    assert attempts[0]["attempt_status"] == "identity_veto"
+    assert attempts[0]["diagnostics"].hard_veto is True
+
+
 def test_search_candidates_does_not_stop_on_merely_acceptable_match(monkeypatch):
    from services.competitor_price_feeder import _search_pchome_candidates
    from services.pchome_crawler import PChomeProduct
@@ -421,6 +487,18 @@ def test_competitor_feeder_logs_keyword_parser_fallback(monkeypatch, caplog):
    assert "fallback to cleaned product name" in caplog.text


+def test_competitor_feeder_keeps_original_name_as_search_fallback():  # The last search keyword must be derived from the original product name.
+    from services import competitor_price_feeder
+
+    terms = competitor_price_feeder._build_search_keywords(
+        "【Mustela 慕之恬廊】慕之幼 免用水潔淨液 300ml(外出清潔 卸除髒汙 卸除防曬 卸防曬)"
+    )
+
+    assert len(terms) == competitor_price_feeder.MAX_SEARCH_TERMS
+    assert terms[-1].startswith("Mustela 慕之恬廊 慕之幼 免用水潔淨液")  # brackets are expected to be stripped from the fallback term
+    assert any("免用水潔淨液 300ml" in term for term in terms[:4])
+
+
 def test_competitor_feeder_refreshes_expired_identity_by_known_product_id(monkeypatch):
    from services.competitor_price_feeder import CompetitorPriceFeeder
    from services.pchome_crawler import PChomeProduct
--- a/tests/test_frontend_v2_assets.py
+++ b/tests/test_frontend_v2_assets.py
@@ -419,7 +419,8 @@ def test_ai_product_pick_agent_uses_real_competitor_data_and_dashboard_action():
    assert "MAX_SEARCH_TERMS" in feeder_source
    assert "_build_search_keywords" in feeder_source
    assert "_search_pchome_candidates" in feeder_source
-    assert "crawler.search_products(keyword, limit=SEARCH_LIMIT, max_pages=SEARCH_MAX_PAGES)" in feeder_source
+    assert "search_limit = SEARCH_LIMIT * max(1, SEARCH_MAX_PAGES)" in feeder_source
+    assert "crawler.search_products(keyword, limit=search_limit, max_pages=SEARCH_MAX_PAGES)" in feeder_source
    assert "_fetch_unmatched_priority_skus" in feeder_source
    assert "_fetch_expired_identity_skus" in feeder_source
    assert "run_expired_identity_refresh" in feeder_source
--- a/tests/test_marketplace_product_matcher.py
+++ b/tests/test_marketplace_product_matcher.py
@@ -435,6 +435,39 @@ def test_marketplace_search_terms_prefer_readable_brand_core_spec():
    assert not any(term.endswith(" l") for term in terms)


+def test_marketplace_search_terms_prioritize_identity_phrase_over_ambiguous_copy():  # The anchor "屁屁噴" must lead the first term instead of the ambiguous "保護膜".
+    from services.marketplace_product_matcher import build_search_terms
+
+    terms = build_search_terms("【TAICEND 泰陞】寶貝液體保護膜 屁屁噴 100ml", max_terms=5)
+
+    assert terms[0] == "泰陞 屁屁噴 100ml"
+    assert "保護膜" not in terms[0]
+    assert "屁屁噴" in " ".join(terms[:3])
+
+
+def test_marketplace_search_terms_drop_option_and_marketing_noise():  # Option/marketing copy ("任選一款", "新品上市") must not reach any search term.
+    from services.marketplace_product_matcher import build_search_terms
+
+    terms = build_search_terms("【YSL】情挑誘光嫩唇凍6ml(任選一款/新品上市)", max_terms=5)
+
+    assert terms[0] == "ysl 情挑誘光嫩唇凍 6ml"
+    assert not any("一款" in term or "上市" in term for term in terms)
+
+
+def test_marketplace_search_terms_keep_professional_product_phrase():  # Long identity anchors must survive intact while usage-copy noise is removed.
+    from services.marketplace_product_matcher import build_search_terms
+
+    abysse_terms = build_search_terms("【Abysse】天然植萃身體按摩精油550ml", max_terms=5)
+    mustela_terms = build_search_terms(
+        "【Mustela 慕之恬廊】慕之幼 免用水潔淨液 300ml(外出清潔 卸除髒汙 卸除防曬 卸防曬)",
+        max_terms=5,
+    )
+
+    assert abysse_terms[0] == "abysse 身體按摩精油 550ml"  # the leading modifier "天然植萃" is not part of the expected term
+    assert mustela_terms[0] == "慕之恬廊 免用水潔淨液 300ml"
+    assert not any("卸除防曬" in term or "外出清潔" in term for term in mustela_terms)
+
+
 def test_batch_compare_top_uses_latest_momo_price_not_revenue(monkeypatch):
    from services import pchome_crawler

--- a/tests/test_pchome_crawler_search.py
+++ b/tests/test_pchome_crawler_search.py
@@ -179,5 +179,5 @@ def test_feeder_search_candidate_passes_page_cap(monkeypatch):
    )

    assert candidates == [product]
-    assert calls[0][1]["limit"] == 20
+    assert calls[0][1]["limit"] == 40
    assert calls[0][1]["max_pages"] == 2