From 8edd8a86046d393642d132b8d5f55abbc4a215ef Mon Sep 17 00:00:00 2001 From: OoO Date: Wed, 20 May 2026 19:34:21 +0800 Subject: [PATCH] =?UTF-8?q?[V10.346]=20=E8=A3=9C=E5=BC=B7=20PChome=20ident?= =?UTF-8?q?ity=20anchor=20scorer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config.py | 2 +- .../code_modularization_inventory_20260430.md | 4 +- services/marketplace_product_matcher.py | 60 +++++++++++++++++++ tests/test_marketplace_product_matcher.py | 22 +++++++ 4 files changed, 86 insertions(+), 2 deletions(-) diff --git a/config.py b/config.py index 51cd96c..2e1ceb5 100644 --- a/config.py +++ b/config.py @@ -320,7 +320,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '') # ========================================== # 系統版本與路徑 # ========================================== -SYSTEM_VERSION = "V10.345" +SYSTEM_VERSION = "V10.346" LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log') public_url = PUBLIC_URL # 用於模板顯示 diff --git a/docs/memory/code_modularization_inventory_20260430.md b/docs/memory/code_modularization_inventory_20260430.md index 73b2df3..1e4811f 100644 --- a/docs/memory/code_modularization_inventory_20260430.md +++ b/docs/memory/code_modularization_inventory_20260430.md @@ -28,6 +28,7 @@ - 2026-05-20 追記:同步 PChome 近門檻候選重評與 matcher 系列/刀片數防錯配更新後的 `services/marketplace_product_matcher.py`、`services/competitor_price_feeder.py` 行數;此處只更新 inventory,不變更比價行為。 - 2026-05-20 追記:同步 PChome 搜尋詞品質層、候選召回與 hard-veto 狀態分流更新後的 `services/marketplace_product_matcher.py`、`services/competitor_price_feeder.py` 行數;並補列背景市場情報 deployment readiness 大檔,僅更新 inventory。 - 2026-05-20 追記:同步 PChome 搜尋詞特定品線優先級更新後的 `services/marketplace_product_matcher.py` 行數;此處只更新 inventory,不變更模組化決策。 +- 2026-05-20 追記:同步 PChome 共享 identity anchor scorer 與市場情報 review report route 進入大檔門檻後的行數;此處只更新 inventory,不變更功能。 ## 達到或超過 800 行檔案清單 @@ -54,7 +55,7 @@ | 940 | `services/import_service.py` | P2 import service | validators / import writers / report builders | | 933 | `services/telegram_templates.py` | P2 Telegram templates | alert template groups / channel-specific formatting / reusable render helpers | | 867 | `services/token_report_service.py` | P2 token report service | query / aggregation / chart payload / notification formatting | -| 1387 | `services/marketplace_product_matcher.py` | P2 marketplace matcher | identity parsing / unit-comparable scoring / search term quality / persistence normalization | +| 1447 | `services/marketplace_product_matcher.py` | P2 marketplace matcher | identity parsing / unit-comparable scoring / search term quality / persistence normalization | | 865 | `routes/daily_sales_routes.py` | P2 Daily Sales Blueprint | route glue / export helpers / daily query and formatting service | | 844 | `services/ollama_service.py` | P2 Ollama client | host health / request client / fallback policy / response parsing | | 849 | `services/pchome_crawler.py` | P2 PChome crawler | search fetch / parsing / fallback source handling / rate limit policy | @@ -64,6 +65,7 @@ | 1602 | `services/competitor_price_feeder.py` | P2 competitor price feeder | crawler scheduling / price normalization / cache strategy | | 1120 | `services/competitor_intel_repository.py` | P2 competitor intel repository | review queue query / cache shaping / formatting helpers | | 805 | `routes/bot_api_routes.py` | P2 Bot API Blueprint | route glue / bot action service | +| 804 | `routes/market_intel_review_report_routes.py` | P2 market intel review report Blueprint | review report route glue / export payload / phase handoff orchestration | | 811 | `services/market_intel/deployment_readiness.py` | P2 market intel deployment readiness | preflight gates / readiness payload / route contract helpers | ## 市場情報開發前置禁區 diff --git a/services/marketplace_product_matcher.py b/services/marketplace_product_matcher.py index 930c32c..bcf374f 100644 --- a/services/marketplace_product_matcher.py +++ b/services/marketplace_product_matcher.py @@ -114,6 +114,8 @@ GENERIC_TOKENS = { SEARCH_NOISE_PHRASES = ( "新品上市", "全新上市", + "國際航空版", + "超取免運", "任選一款", "任選1款", "任選一色", @@ -134,6 +136,10 @@ SEARCH_NOISE_PHRASES = ( "卸除髒汙", "卸除防曬", "卸防曬", + "防水眼線", + "寶寶牙刷", + "紗布牙刷", + "調節亮度", "韓國彩妝", "水光感", "官方直營", @@ -161,6 +167,14 @@ SEARCH_NOISE_TOKENS = { "防曬", "彩妝", "水光感", + "超取", + "免運", + "航空版", + "國際版", + "附燈泡", + "定時", + "眼妝", + "滅菌", "保濕", "抗老", "超品日", @@ -178,6 +192,17 @@ SEARCH_IDENTITY_ANCHORS = ( "青春敷面膜", "長效潤膚霜", "小黑瓶", + "私密處護潔露", + "私密護潔露", + "口腔清潔棒", + "含氟防蛀修護牙膏", + "自然遮瑕素顏霜", + "超持久細滑眼線筆", + "香氛融蠟燈", + "水晶香氛能量寶盒禮盒組", + "零粉感超持久柔焦蜜粉餅", + "私密肌潔淨露", + "身體除毛器", "免用水潔淨液", "身體按摩精油", "按摩精油", @@ -487,6 +512,11 @@ def _leading_brand_tokens(original: str, normalized: str) -> set[str]: tokens.add(token) leading = normalized[:48] + leading_tokens = _tokenize(leading) + if leading_tokens: + first_token = leading_tokens[0] + if re.fullmatch(r"[\u4e00-\u9fff]{2,6}", first_token) and first_token not in GENERIC_TOKENS: + tokens.add(first_token) for token in _tokenize(leading): if re.fullmatch(r"[a-z][a-z0-9\-']{2,}", token): tokens.add(token) @@ -1183,6 +1213,17 @@ def score_marketplace_match( ): score += 0.025 reasons.append("strong_exact_spec_match") + shared_anchor = _shared_identity_anchor(left, right) + if ( + shared_anchor + and brand_score >= 0.95 + and not hard_veto + and price_penalty == 0 + and spec_score >= 0.85 + and (token_score >= 0.43 or sequence_score >= 0.58) + ): + score += 0.03 + reasons.append("shared_identity_anchor") if ( brand_score >= 0.95 and not hard_veto @@ -1267,6 +1308,25 @@ def _extract_anchor_phrases(token: str) -> list[str]: return phrases +def _shared_identity_anchor(left: ProductIdentity, right: ProductIdentity) -> str: + left_anchors: set[str] = set() + right_anchors: set[str] = set() + for token in left.core_tokens: + left_anchors.update(_extract_anchor_phrases(token)) + for token in right.core_tokens: + right_anchors.update(_extract_anchor_phrases(token)) + + shared = sorted( + { + anchor + for anchor in left_anchors & right_anchors + if len(anchor.replace(" ", "")) >= 5 and anchor not in SEARCH_BROAD_ANCHORS + }, + key=lambda anchor: (-len(anchor.replace(" ", "")), anchor), + ) + return shared[0] if shared else "" + + def _search_core_score(token: str, all_tokens: set[str]) -> tuple[int, int, str]: cleaned = _clean_search_phrase(token) if not cleaned: diff --git a/tests/test_marketplace_product_matcher.py b/tests/test_marketplace_product_matcher.py index db66b63..3477a16 100644 --- a/tests/test_marketplace_product_matcher.py +++ b/tests/test_marketplace_product_matcher.py @@ -426,6 +426,28 @@ def test_marketplace_matcher_does_not_promote_different_option_without_spec(): assert "strong_exact_spec_match" not in diagnostics.reasons +def test_marketplace_matcher_promotes_shared_identity_anchor_near_threshold(): + from services.marketplace_product_matcher import score_marketplace_match + + obge = score_marketplace_match( + "【OBgE】自然遮瑕素顏霜 50g", + "OBgE/自然遮瑕素顏霜50g", + momo_price=699, + competitor_price=699, + ) + unicat = score_marketplace_match( + "【UNICAT 變臉貓】超持久細滑眼線筆1.5ml", + "【UNICAT】超持久細滑眼線筆 1.5ml 新品搶先優惠", + momo_price=399, + competitor_price=399, + ) + + for diagnostics in (obge, unicat): + assert diagnostics.score >= 0.76 + assert diagnostics.hard_veto is False + assert "shared_identity_anchor" in diagnostics.reasons + + def test_marketplace_search_terms_prefer_readable_brand_core_spec(): from services.marketplace_product_matcher import build_search_terms