diff --git a/.env.example b/.env.example index 48a7ecc..d59eeb9 100644 --- a/.env.example +++ b/.env.example @@ -512,8 +512,9 @@ COMPETITOR_INTEL_CACHE_TTL_SECONDS=21600 PCHOME_FEEDER_TIMEOUT=12 PCHOME_FEEDER_RATE_DELAY=1.0 PCHOME_FEEDER_SEARCH_LIMIT=20 -PCHOME_FEEDER_MAX_SEARCH_TERMS=5 +PCHOME_FEEDER_MAX_SEARCH_TERMS=6 PCHOME_FEEDER_SEARCH_MAX_PAGES=2 +PCHOME_FEEDER_SEARCH_COVERAGE_RESCUE_ENABLED=true # browse.sh 只作低信心/無結果的診斷計畫;正式排程預設不自動開瀏覽器。 PCHOME_FEEDER_BROWSE_SH_DIAGNOSTIC_ENABLED=true PCHOME_FEEDER_BROWSE_SH_EXECUTE_ENABLED=false diff --git a/TODO_NEXT_STEPS.txt b/TODO_NEXT_STEPS.txt index e74a65e..438b253 100644 --- a/TODO_NEXT_STEPS.txt +++ b/TODO_NEXT_STEPS.txt @@ -4,6 +4,7 @@ ================================================================================ 【已完成】 + - V10.571 提升 PChome pending 覆蓋率搜尋召回:`PCHOME_FEEDER_MAX_SEARCH_TERMS` 預設由 5 提升到 6,新增 `PCHOME_FEEDER_SEARCH_COVERAGE_RESCUE_ENABLED`,在主要搜尋詞與原始名稱 fallback 之間插入狹義 coverage rescue terms。搜尋詞會保留 `5.5g`、`2.4g` 等小數規格,不再變成 `5 5g` / `2 4g`;同時排除外出清潔、卸除髒汙、卸防曬等非身份核心噪音。正式 pilot 顯示 CeraVe / TUNEMAKERS / Embryolisse / Neogence / NIVEA 這類雙語品牌商品常卡在 PChome 搜尋召回,因此補上「英文品牌 + 中文品牌 + 核心身份 + 規格」窄搜尋詞;「品牌 + 品類 + 規格」仍只開給安全品類,避免為了拉 pending 覆蓋率引入假陽性。 - V10.570 補 PChome 身份 / 報價證據契約:matcher 的 `match_diagnostic_json` 新增 `identity_evidence`、`offer_evidence`,把品牌、品類、identity anchor、型號、規格、入數與 variant guardrail 拆成結構化證據;覆核隊列與 decision envelope 新增 `difference_highlights`,可直接指出容量、入數、色號、香味、款式、補充包、檔期組合等差異。價格明確標記為 offer evidence,不再被誤當身份證據,Dashboard / PPT / OpenClaw / Webcrumbs 能共用同一份比對證據。 - 外部專業 benchmark 固定節奏:已建立每週一 09:30 自動檢視,並新增 `docs/guides/external_professional_benchmark.md`,把 Google Merchant Center、Google Product structured data、Schema.org Product/Offer/AggregateOffer 與 Baymard 電商 UX 做法轉成可落地準則:identity evidence、fresh offer、review 差異高亮、PPT/AI evidence 分層。 - V10.565 補 PChome 覆蓋率操作建議:`/api/ai/pchome-match/backfill/status` 會把低覆蓋率拆成 `operation_backlog`,分別列出刷新舊 identity、重評近門檻、補抓未配對、人工覆核、單位價覆核與過期搜尋救援預覽;同時回傳 `recommended_next_action`,Dashboard 狀態摘要會顯示「建議執行比價補強 / 刷新過期 identity / 處理覆核」等下一步,讓覆蓋率 KPI 直接連到可執行行動。 diff --git a/config.py b/config.py index 79b5c87..e513abf 100644 --- a/config.py +++ b/config.py @@ -402,7 +402,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '') # ========================================== # 系統版本與路徑 # ========================================== -SYSTEM_VERSION = "V10.570" +SYSTEM_VERSION = "V10.571" LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log') public_url = PUBLIC_URL # 用於模板顯示 diff --git a/docs/memory/history_logs.md b/docs/memory/history_logs.md index 0b6b420..21da05c 100644 --- a/docs/memory/history_logs.md +++ b/docs/memory/history_logs.md @@ -13,6 +13,7 @@ ## 📅 詳細更新日誌 (考古存檔) ### 2026-06-01:PChome 比價新鮮度操作閉環 +- **V10.571 PChome pending 覆蓋率搜尋召回**: `competitor_price_feeder` 預設每個商品最多搜尋詞由 5 組提升為 6 組,並新增 `PCHOME_FEEDER_SEARCH_COVERAGE_RESCUE_ENABLED`。補抓流程會在主要 matcher 搜尋詞與原始名稱 fallback 之間加入狹義 coverage rescue terms,保留 `5.5g` / `2.4g` 等小數規格,並過濾外出清潔、卸除髒汙、卸防曬等非身份核心噪音。正式 pilot 顯示 CeraVe / TUNEMAKERS / Embryolisse / Neogence / NIVEA 這類雙語品牌商品常卡在 PChome 搜尋召回,因此補上「英文品牌 + 中文品牌 + 核心身份 + 規格」窄搜尋詞;`品牌 + 品類 + 規格` 仍只對安全品類開放,目標是提升 pending/no_result 候選取得率,同時維持 matcher hard veto 與 `MIN_MATCH_SCORE` 不變。 - **V10.570 PChome 身份 / 報價證據契約**: `score_marketplace_match()` 現在會在 `match_diagnostic_json` 內輸出 `identity_evidence` 與 `offer_evidence`,把品牌、品類、identity anchor、型號、規格、入數、variant guardrail 與價格 offer 拆層保存。`competitor_intel_repository` 會把這些證據轉成 `difference_highlights` 與 decision envelope 的 identity / offer evidence,讓覆核頁、PPT、OpenClaw、Webcrumbs 與 Telegram 摘要都能理解「為何同款 / 為何不同 / 價格只是報價證據不是身份證據」。 - **V10.569 Webcrumbs 比價信封摘要串接**: `build_webcrumbs_marketplace_host_data()` 讀取 `fetch_competitor_review_queue()` 後統一走 `summarize_review_decision_envelopes()`,在 host data payload 輸出 `reviewDecisionBrief`,並於 metadata 增加 `review_queue_count`、`hitl_count`、`auto_execute_blocked_count` 與 `decision_envelope_source`。Webcrumbs / Shared UI 現在和 Telegram、OpenClaw、PPT 共用同一份 PChome 覆核信封摘要,仍維持只讀、不呼叫 LLM、不抓外站、不寫 DB;同版收錄 `docs/guides/external_professional_benchmark.md` 作為外部專業做法週巡檢落地準則入口。 - **V10.568 價格類決策信封專業 brief**: `decision_envelope` 的價格 / PChome 覆核事件在 Telegram EventRouter 直送時,改以「標的、價格證據、比對證據、人工下一步」四段式排版呈現,保留 `momo:eig:` 忽略按鈕且不進 L1/L2 AI 重摘要。`competitor_intel_repository` 同步在 review queue 信封 subject 補上 `momo_price` / `competitor_price`,讓 Telegram、PPT、Webcrumbs 與 AI 摘要可共用同一份價格證據,不再各自補查或重組。 diff --git a/services/competitor_price_feeder.py b/services/competitor_price_feeder.py index dd9765a..98f53ef 100644 --- a/services/competitor_price_feeder.py +++ b/services/competitor_price_feeder.py @@ -42,12 +42,16 @@ MIN_MATCH_SCORE = 0.76 # 低於此分數不寫入;核心比價寧可待審 REPLACE_DIFFERENT_PRODUCT_SCORE = 0.84 # 已有不同 PChome 商品時,需超高信心才覆蓋 EARLY_STOP_MATCH_SCORE = 0.90 # 搜尋候選池只有強同款才提前停止,避免次佳候選卡住後續精準搜尋詞 SEARCH_LIMIT = int(os.getenv("PCHOME_FEEDER_SEARCH_LIMIT", "20")) # 每個搜尋詞取 PChome 前 N 筆 -MAX_SEARCH_TERMS = int(os.getenv("PCHOME_FEEDER_MAX_SEARCH_TERMS", "5")) # 每個 MOMO 商品最多嘗試幾組搜尋詞 +MAX_SEARCH_TERMS = int(os.getenv("PCHOME_FEEDER_MAX_SEARCH_TERMS", "6")) # 每個 MOMO 商品最多嘗試幾組搜尋詞 SEARCH_MAX_PAGES = int(os.getenv("PCHOME_FEEDER_SEARCH_MAX_PAGES", "2")) # 每個搜尋詞最多掃描 PChome 搜尋頁數 BATCH_SIZE = 30 # 每批 DB 寫入筆數 RATE_DELAY = float(os.getenv("PCHOME_FEEDER_RATE_DELAY", "1.0")) # 每次 PChome 請求間隔(秒) TTL_HOURS = int(os.getenv("PCHOME_FEEDER_TTL_HOURS", "48")) # competitor_prices 價格新鮮度有效期 REQUEST_TIMEOUT = float(os.getenv("PCHOME_FEEDER_TIMEOUT", "12")) # 避免外部搜尋 API 長時間卡住排程 +SEARCH_COVERAGE_RESCUE_ENABLED = os.getenv( + "PCHOME_FEEDER_SEARCH_COVERAGE_RESCUE_ENABLED", + "true", +).lower() in {"1", "true", "yes", "on"} VARIANT_RECALL_SORTS = ("sale/dc", "new/dc") RECOVERABLE_LOW_SCORE_FLOOR = max(MIN_MATCH_SCORE - 0.03, 0.72) GENERIC_RECALL_SAFE_PRODUCT_TYPES = { @@ -96,6 +100,16 @@ GENERIC_RECALL_BLOCK_TERMS = ( "防曬", "護手霜", ) +COVERAGE_RESCUE_NOISE_TERMS = ( + "外出清潔", + "卸除髒汙", + "卸除防曬", + "卸防曬", + "交換禮物", + "送禮", + "節日", + "推薦", +) GENERIC_RECALL_BLOCK_NAME_PATTERN = "|".join(re.escape(term) for term in GENERIC_RECALL_BLOCK_TERMS) BROWSE_SH_DIAGNOSTIC_ENABLED = os.getenv("PCHOME_FEEDER_BROWSE_SH_DIAGNOSTIC_ENABLED", "true").lower() in {"1", "true", "yes", "on"} BROWSE_SH_EXECUTE_ENABLED = os.getenv("PCHOME_FEEDER_BROWSE_SH_EXECUTE_ENABLED", "false").lower() in {"1", "true", "yes", "on"} @@ -393,7 +407,9 @@ def _extend_match_tags(tags: list, diagnostics, extra: list[str] = None) -> list def _clean_search_text(value: str) -> str: value = re.sub(r'[()()]', ' ', value or '') value = re.sub(r'[【】\[\]]', ' ', value) + value = re.sub(r"(?<=\d)\.(?=\d)", "DECIMALPOINT", value) value = re.sub(r'[^\w\u4e00-\u9fff]+', ' ', value) + value = value.replace("DECIMALPOINT", ".").replace("decimalpoint", ".") return re.sub(r'\s+', ' ', value).strip() @@ -414,6 +430,98 @@ def _dedupe_terms(terms: list) -> list: return result +def _identity_spec_terms(identity) -> list[str]: + terms: list[str] = [] + for value in getattr(identity, "volumes_ml", ()) or (): + terms.append(f"{float(value):g}ml") + for value in getattr(identity, "weights_g", ()) or (): + terms.append(f"{float(value):g}g") + for value in getattr(identity, "dosages_mg", ()) or (): + terms.append(f"{float(value):g}mg") + total_piece_count = getattr(identity, "total_piece_count", None) + if total_piece_count: + terms.append(f"{int(total_piece_count)}入") + return list(dict.fromkeys(terms)) + + +def _identity_core_rescue_phrases(identity, limit: int = 3) -> list[str]: + product_type = getattr(identity, "product_type", "") or "" + phrases: list[str] = [] + for token in sorted( + (getattr(identity, "core_tokens", set()) or set()), + key=lambda value: ( + product_type not in str(value), + not bool(re.search(r"[\u4e00-\u9fff]", str(value))), + -len(str(value)), + str(value), + ), + ): + phrase = _clean_search_text(str(token)) + compact = phrase.replace(" ", "") + if len(compact) < 2: + continue + if product_type and compact == str(product_type).replace(" ", ""): + continue + if compact.isdigit() or re.fullmatch(r"\d+(?:\.\d+)?(?:ml|g|mg|kg|l|入|片|支|瓶|盒|包)?", compact, re.I): + continue + if ( + any(term in compact for term in GENERIC_RECALL_BLOCK_TERMS) + and not _is_generic_recall_safe(identity) + ): + continue + if any(term in compact for term in COVERAGE_RESCUE_NOISE_TERMS): + continue + if phrase.lower() in {existing.lower() for existing in phrases}: + continue + phrases.append(phrase) + if len(phrases) >= limit: + break + return phrases + + +def _build_coverage_rescue_keywords(momo_name: str) -> list[str]: + """Add narrow identity fallback terms for pending/no-result coverage recovery.""" + if not SEARCH_COVERAGE_RESCUE_ENABLED: + return [] + try: + from services.marketplace_product_matcher import parse_product_identity + + identity = parse_product_identity(momo_name) + except Exception: + return [] + + brand_phrases = _coverage_brand_phrases(identity) + product_type = getattr(identity, "product_type", "") or "" + spec_part = " ".join(_identity_spec_terms(identity)) + core_phrases = _identity_core_rescue_phrases(identity, limit=3) + primary_core = core_phrases[0] if core_phrases else product_type + secondary_core = core_phrases[1] if len(core_phrases) > 1 else "" + + terms: list[str] = [] + for brand in brand_phrases: + terms.extend([ + " ".join(part for part in (brand, primary_core, spec_part) if part), + " ".join(part for part in (brand, primary_core, secondary_core, spec_part) if part), + " ".join(part for part in (brand, product_type, spec_part) if part) + if _is_generic_recall_safe(identity) + else "", + ]) + terms.append(" ".join(part for part in (primary_core, spec_part) if part)) + return _dedupe_terms(terms) + + +def _append_search_term(target: list[str], term: str, *, max_terms: int) -> None: + if len(target) >= max_terms: + return + cleaned_terms = _dedupe_terms([term]) + if not cleaned_terms: + return + cleaned = cleaned_terms[0] + if cleaned.lower() in {existing.lower() for existing in target}: + return + target.append(cleaned) + + def _build_search_keywords(momo_name: str) -> list: """ 用多組商品身份線索搜尋 PChome,提高命中率,但仍交給身份比對門檻把關。 @@ -431,13 +539,22 @@ def _build_search_keywords(momo_name: str) -> list: cleaned = _clean_search_text(momo_name) terms = [cleaned[:36], cleaned[:24]] - primary_terms = _dedupe_terms(terms[: max(1, MAX_SEARCH_TERMS - 1)]) + max_terms = max(1, MAX_SEARCH_TERMS) + primary_terms = _dedupe_terms(terms[: max(1, max_terms - 2)]) + rescue_terms = _build_coverage_rescue_keywords(momo_name) original_terms = _dedupe_terms([momo_name]) + selected_terms: list[str] = [] + for term in primary_terms: + _append_search_term(selected_terms, term, max_terms=max(1, max_terms - 1)) + for term in rescue_terms: + _append_search_term(selected_terms, term, max_terms=max(1, max_terms - 1)) for term in original_terms: - if term.lower() not in {existing.lower() for existing in primary_terms}: - primary_terms.append(term) + if term.lower() not in {existing.lower() for existing in selected_terms}: + if len(selected_terms) >= max_terms: + selected_terms = selected_terms[: max(0, max_terms - 1)] + selected_terms.append(term) break - return _dedupe_terms(primary_terms) + return _dedupe_terms(selected_terms) def _primary_brand_phrase(identity) -> str: @@ -483,6 +600,44 @@ def _primary_brand_phrase(identity) -> str: return short_latin[0].lower() if short_latin else "" +def _coverage_brand_phrases(identity) -> list[str]: + brand_tokens = {str(token).lower() for token in getattr(identity, "brand_tokens", set())} + chinese = sorted( + ( + token for token in getattr(identity, "brand_tokens", set()) + if re.search(r"[\u4e00-\u9fff]", str(token)) + ), + key=lambda token: (-len(str(token)), str(token)), + ) + latin = sorted( + ( + token for token in getattr(identity, "brand_tokens", set()) + if re.search(r"[a-z]", str(token), re.I) and len(str(token)) >= 2 + ), + key=lambda token: (-len(str(token)), str(token)), + ) + phrases: list[str] = [] + if latin and chinese: + phrases.append(f"{str(latin[0]).lower()} {chinese[0]}") + primary = _primary_brand_phrase(identity) + if primary: + phrases.append(primary) + for token in ( + "cerave", + "embryolisse", + "neogence", + "tunemakers", + "nivea", + "romand", + "lactacyd", + "pavaruni", + "solone", + ): + if token in brand_tokens: + phrases.append(token) + return _dedupe_terms(phrases) + + def _is_generic_recall_safe(identity) -> bool: product_type = getattr(identity, "product_type", None) if product_type not in GENERIC_RECALL_SAFE_PRODUCT_TYPES: diff --git a/tests/test_competitor_match_attempts_persistence.py b/tests/test_competitor_match_attempts_persistence.py index 706b41a..c231946 100644 --- a/tests/test_competitor_match_attempts_persistence.py +++ b/tests/test_competitor_match_attempts_persistence.py @@ -1856,9 +1856,44 @@ def test_competitor_feeder_keeps_original_name_as_search_fallback(): "【Mustela 慕之恬廊】慕之幼 免用水潔淨液 300ml(外出清潔 卸除髒汙 卸除防曬 卸防曬)" ) - assert len(terms) == competitor_price_feeder.MAX_SEARCH_TERMS + assert len(terms) <= competitor_price_feeder.MAX_SEARCH_TERMS assert terms[-1].startswith("Mustela 慕之恬廊 慕之幼 免用水潔淨液") assert any("免用水潔淨液 300ml" in term for term in terms[:4]) + assert not any("防曬 300ml" in term for term in terms[:-1]) + assert not any("卸除髒汙" in term for term in terms[:-1]) + + +def test_competitor_feeder_coverage_rescue_terms_preserve_decimal_specs(): + from services import competitor_price_feeder + + romand_terms = competitor_price_feeder._build_search_keywords("【rom&nd】果汁唇釉 12 5.5g") + lip_terms = competitor_price_feeder._build_search_keywords("曼秀雷敦 頂級濃潤柔霜潤唇膏 2.4g") + nivea_terms = competitor_price_feeder._build_search_keywords("【NIVEA 妮維雅】止汗爽身噴霧 150ml") + + assert any("5.5g" in term for term in romand_terms) + assert not any("5 5g" in term for term in romand_terms) + assert any("2.4g" in term for term in lip_terms) + assert not any("2 4g" in term for term in lip_terms) + assert any(term == "nivea 妮維雅 止汗噴霧 150ml" for term in nivea_terms) + assert len(nivea_terms) <= competitor_price_feeder.MAX_SEARCH_TERMS + + +def test_competitor_feeder_coverage_rescue_adds_bilingual_brand_terms(): + from services import competitor_price_feeder + + cerave_terms = competitor_price_feeder._build_search_keywords( + "【CeraVe 適樂膚】安敏補水★全效極潤修護精華水 200ml_A" + ) + tunemakers_terms = competitor_price_feeder._build_search_keywords( + "【TUNEMAKERS 渡美】神經醯胺修護凍膜70g(7天細彈滑/面膜/臉部保養)" + ) + + assert any(term == "cerave 適樂膚 全效極潤修護精華水 200ml" for term in cerave_terms) + assert any( + term == "tunemakers 渡美 神經醯胺修護凍膜 70g" + for term in tunemakers_terms + ) + assert not any(term == "tunemakers 渡美 面膜 70g" for term in tunemakers_terms) def test_competitor_feeder_refreshes_expired_identity_by_known_product_id(monkeypatch):