diff --git a/config.py b/config.py index d77d810..ead65ea 100644 --- a/config.py +++ b/config.py @@ -320,7 +320,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '') # ========================================== # 系統版本與路徑 # ========================================== -SYSTEM_VERSION = "V10.342" +SYSTEM_VERSION = "V10.343" LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log') public_url = PUBLIC_URL # 用於模板顯示 diff --git a/docs/memory/code_modularization_inventory_20260430.md b/docs/memory/code_modularization_inventory_20260430.md index 9f347bf..b4d18fa 100644 --- a/docs/memory/code_modularization_inventory_20260430.md +++ b/docs/memory/code_modularization_inventory_20260430.md @@ -26,6 +26,7 @@ - 2026-05-20 追記:同步背景 PChome identity / price direction 更新後的 `services/marketplace_product_matcher.py` 行數;此處只更新 inventory,不變更商品比對行為。 - 2026-05-20 追記:同步背景 PChome crawler 搜尋韌性擴充後的 `services/pchome_crawler.py` 行數;此處只更新 inventory,不變更 PChome crawler 行為。 - 2026-05-20 追記:同步 PChome 近門檻候選重評與 matcher 系列/刀片數防錯配更新後的 `services/marketplace_product_matcher.py`、`services/competitor_price_feeder.py` 行數;此處只更新 inventory,不變更比價行為。 +- 2026-05-20 追記:同步 PChome 搜尋詞品質層、候選召回與 hard-veto 狀態分流更新後的 `services/marketplace_product_matcher.py`、`services/competitor_price_feeder.py` 行數;並補列背景市場情報 deployment readiness 大檔,僅更新 inventory。 ## 達到或超過 800 行檔案清單 @@ -52,16 +53,17 @@ | 940 | `services/import_service.py` | P2 import service | validators / import writers / report builders | | 933 | `services/telegram_templates.py` | P2 Telegram templates | alert template groups / channel-specific formatting / reusable render helpers | | 867 | `services/token_report_service.py` | P2 token report service | query / aggregation / chart payload / notification formatting | -| 1128 | `services/marketplace_product_matcher.py` | P2 marketplace matcher | identity parsing / unit-comparable scoring / persistence normalization | +| 1356 | `services/marketplace_product_matcher.py` | P2 marketplace matcher | identity parsing / unit-comparable scoring / search term quality / persistence normalization | | 865 | `routes/daily_sales_routes.py` | P2 Daily Sales Blueprint | route glue / export helpers / daily query and formatting service | | 844 | `services/ollama_service.py` | P2 Ollama client | host health / request client / fallback policy / response parsing | | 849 | `services/pchome_crawler.py` | P2 PChome crawler | search fetch / parsing / fallback source handling / rate limit policy | | 1042 | `services/code_review_pipeline_service.py` | P2 Code review pipeline service | scan orchestration / finding normalization / persistence adapter | | 953 | `routes/export_routes.py` | P2 Export flow | export command/router glue / file path / download orchestration | | 816 | `services/ppt_vision_service.py` | P2 PPT vision QA service | runtime state / queue status / model probe / audit execution 分離 | -| 1592 | `services/competitor_price_feeder.py` | P2 competitor price feeder | crawler scheduling / price normalization / cache strategy | +| 1602 | `services/competitor_price_feeder.py` | P2 competitor price feeder | crawler scheduling / price normalization / cache strategy | | 1120 | `services/competitor_intel_repository.py` | P2 competitor intel repository | review queue query / cache shaping / formatting helpers | | 805 | `routes/bot_api_routes.py` | P2 Bot API Blueprint | route glue / bot action service | +| 811 | `services/market_intel/deployment_readiness.py` | P2 market intel deployment readiness | preflight gates / readiness payload / route contract helpers | ## 市場情報開發前置禁區 diff --git a/services/competitor_price_feeder.py b/services/competitor_price_feeder.py index d82bf65..cf20092 100644 --- a/services/competitor_price_feeder.py +++ b/services/competitor_price_feeder.py @@ -144,7 +144,13 @@ def _build_search_keywords(momo_name: str) -> list: cleaned = _clean_search_text(momo_name) terms = [cleaned[:36], cleaned[:24]] - return _dedupe_terms(terms) + primary_terms = _dedupe_terms(terms[: max(1, MAX_SEARCH_TERMS - 1)]) + original_terms = _dedupe_terms([momo_name]) + for term in original_terms: + if term.lower() not in {existing.lower() for existing in primary_terms}: + primary_terms.append(term) + break + return _dedupe_terms(primary_terms) def _format_match_diagnostics(diagnostics) -> str: @@ -252,8 +258,9 @@ def _search_pchome_candidates(crawler, momo_name: str, keywords: list = None, mo """以多組搜尋詞擴大 PChome 候選池,只在強同款時提前停止。""" candidates = [] seen_ids = set() + search_limit = SEARCH_LIMIT * max(1, SEARCH_MAX_PAGES) for keyword in keywords or _build_search_keywords(momo_name): - ok, _, products = crawler.search_products(keyword, limit=SEARCH_LIMIT, max_pages=SEARCH_MAX_PAGES) + ok, _, products = crawler.search_products(keyword, limit=search_limit, max_pages=SEARCH_MAX_PAGES) if not ok or not products: continue for product in products: @@ -1154,6 +1161,7 @@ class CompetitorPriceFeeder: continue if score < MIN_MATCH_SCORE and not manual_accept_override: + attempt_status = "identity_veto" if getattr(diagnostics, "hard_veto", False) else "low_score" logger.debug( f"[Feeder] {sku} 比對分數過低 ({score:.3f} < {MIN_MATCH_SCORE})," f"{_format_match_diagnostics(diagnostics)}" @@ -1165,7 +1173,7 @@ class CompetitorPriceFeeder: momo_price=momo_price, search_terms=search_terms, candidate_count=len(products), - attempt_status="low_score", + attempt_status=attempt_status, best_product=best_product, best_score=score, diagnostics=diagnostics, @@ -1382,6 +1390,7 @@ class CompetitorPriceFeeder: continue if score < MIN_MATCH_SCORE: + attempt_status = "identity_veto" if getattr(diagnostics, "hard_veto", False) else "refresh_low_score" self._record_match_attempt( sku, momo_name, @@ -1389,7 +1398,7 @@ class CompetitorPriceFeeder: momo_price=momo_price, search_terms=search_terms, candidate_count=1, - attempt_status="refresh_low_score", + attempt_status=attempt_status, best_product=best_product, best_score=score, diagnostics=diagnostics, diff --git a/services/marketplace_product_matcher.py b/services/marketplace_product_matcher.py index 9a3ad22..7915db2 100644 --- a/services/marketplace_product_matcher.py +++ b/services/marketplace_product_matcher.py @@ -111,6 +111,96 @@ GENERIC_TOKENS = { "美國", } +SEARCH_NOISE_PHRASES = ( + "新品上市", + "全新上市", + "任選一款", + "任選1款", + "任選一色", + "任選1色", + "多款任選", + "多款可選", + "色號可選", + "香味可選", + "口味可選", + "送精美紙袋", + "精美紙袋", + "交換禮物", + "聖誕禮物", + "母親節", + "父親節", + "情人節", + "外出清潔", + "卸除髒汙", + "卸除防曬", + "卸防曬", + "韓國彩妝", + "水光感", + "官方直營", + "官方", +) + +SEARCH_NOISE_TOKENS = { + "一款", + "1款", + "一色", + "1色", + "上市", + "全新", + "新品", + "香味", + "口味", + "味道", + "顏色", + "色號", + "紙袋", + "禮物", + "清潔", + "髒汙", + "防曬", + "彩妝", + "水光感", +} + +SEARCH_IDENTITY_ANCHORS = ( + "免用水潔淨液", + "身體按摩精油", + "按摩精油", + "擴香補充瓶", + "擴香瓶", + "全面修復霜", + "修復霜", + "護膚膏", + "屁屁噴", + "身體乳", + "緊實乳", + "潔膚露", + "潔淨液", + "護甲油", + "指甲油", + "美甲片", + "唇凍", + "唇釉", + "唇膏", + "粉底棒", + "遮瑕棒", + "化妝水", + "精華液", + "精華", + "面膜", + "乳液", + "乳霜", + "面霜", + "精油", + "水氧機", + "香氛機", +) + +SEARCH_AMBIGUOUS_PRODUCT_TERMS = { + "保護膜", + "保護貼", +} + BRAND_ALIAS_OVERRIDES = { "clarins": ("克蘭詩", "clarins"), "nars": ("nars",), @@ -1099,6 +1189,123 @@ def score_marketplace_match( ) +def _clean_search_phrase(value: str) -> str: + text = normalize_product_text(value) + for phrase in sorted(SEARCH_NOISE_PHRASES, key=len, reverse=True): + text = text.replace(phrase.lower(), " ") + text = re.sub(r"[^\w\u4e00-\u9fff]+", " ", text) + text = " ".join( + token for token in text.split() + if token not in SEARCH_NOISE_TOKENS and token not in GENERIC_TOKENS + ) + text = re.sub(r"\s+", " ", text).strip() + return text + + +def _search_spec_terms(identity: ProductIdentity) -> list[str]: + specs: list[str] = [] + if identity.volumes_ml: + volume = identity.volumes_ml[0] + specs.append(f"{volume:g}ml") + if identity.weights_g: + weight = identity.weights_g[0] + specs.append(f"{weight:g}g") + if identity.dosages_mg: + dosage = identity.dosages_mg[0] + specs.append(f"{dosage:g}mg") + if identity.total_piece_count: + specs.append(f"{identity.total_piece_count}包") + return specs + + +def _extract_anchor_phrases(token: str) -> list[str]: + cleaned = _clean_search_phrase(token) + if not cleaned: + return [] + + phrases: list[str] = [] + for anchor in SEARCH_IDENTITY_ANCHORS: + if anchor not in cleaned: + continue + if re.search(r"[\u4e00-\u9fff]", anchor): + prefix_width = 0 if len(anchor) >= 5 else (4 if len(anchor) >= 3 else 6) + match = re.search(rf"([\u4e00-\u9fff]{{0,{prefix_width}}}{re.escape(anchor)})", cleaned) + phrase = match.group(1) if match else anchor + else: + phrase = anchor + phrase = _clean_search_phrase(phrase) + if any(existing in phrase and existing != phrase for existing in phrases): + continue + if len(phrase) >= 2 and phrase not in phrases: + phrases.append(phrase) + return phrases + + +def _search_core_score(token: str, all_tokens: set[str]) -> tuple[int, int, str]: + cleaned = _clean_search_phrase(token) + if not cleaned: + return (-999, 0, cleaned) + compact = cleaned.replace(" ", "") + if compact in SEARCH_NOISE_TOKENS or compact in GENERIC_TOKENS: + return (-900, 0, cleaned) + + score = 0 + if re.search(r"[a-z][a-z0-9-]{2,}", cleaned): + score += 30 + if re.search(r"\d", cleaned): + score += 12 + + anchors = _extract_anchor_phrases(cleaned) + if anchors: + score += 90 + if anchors[0] == compact: + score += 8 + else: + score += max(0, 24 - len(compact)) + + if len(compact) <= 8: + score += 14 + elif len(compact) >= 12: + score -= 12 + + has_better_anchor = any( + other != token and _extract_anchor_phrases(other) + for other in all_tokens + ) + if has_better_anchor and any(term in compact for term in SEARCH_AMBIGUOUS_PRODUCT_TERMS): + score -= 80 + if any(noise in compact for noise in SEARCH_NOISE_TOKENS): + score -= 18 + + return (score, -len(compact), cleaned) + + +def _ranked_search_core_phrases(identity: ProductIdentity, limit: int = 4) -> list[str]: + tokens = {token for token in identity.core_tokens if token not in GENERIC_TOKENS} + ranked_tokens = sorted( + tokens, + key=lambda token: _search_core_score(token, tokens), + reverse=True, + ) + + phrases: list[str] = [] + for token in ranked_tokens: + if _search_core_score(token, tokens)[0] < -100: + continue + candidates = _extract_anchor_phrases(token) or [_clean_search_phrase(token)] + for phrase in candidates: + compact = phrase.replace(" ", "") + if len(compact) < 2 or compact in SEARCH_NOISE_TOKENS: + continue + if any(term in compact for term in SEARCH_AMBIGUOUS_PRODUCT_TERMS) and len(phrases) > 0: + continue + if phrase not in phrases: + phrases.append(phrase) + if len(phrases) >= limit: + return phrases + return phrases + + def build_search_terms(name: str, max_terms: int = 3) -> list[str]: identity = parse_product_identity(name) terms: list[str] = [] @@ -1120,30 +1327,27 @@ def build_search_terms(name: str, max_terms: int = 3) -> list[str]: return latin[0] if latin else "" brand_part = primary_brand_phrase() - core = " ".join(sorted(identity.core_tokens, key=lambda token: (-len(token), token))[:4]) - specs = [] - if identity.volumes_ml: - specs.append(f"{int(identity.volumes_ml[0])}ml") - if identity.weights_g: - specs.append(f"{int(identity.weights_g[0])}g") - if identity.dosages_mg: - dosage = identity.dosages_mg[0] - dosage_label = f"{int(dosage)}mg" if dosage.is_integer() else f"{dosage:g}mg" - specs.append(dosage_label) - if identity.total_piece_count: - specs.append(f"{identity.total_piece_count}包") - - spec_part = " ".join(specs) - core_tokens = sorted(identity.core_tokens, key=lambda token: (-len(token), token)) - core_short = " ".join(core_tokens[:2]) + spec_part = " ".join(_search_spec_terms(identity)) + core_phrases = _ranked_search_core_phrases(identity, limit=4) + core_short = " ".join(core_phrases[:2]) + core_primary = core_phrases[0] if core_phrases else "" + model_phrases = [ + phrase + for phrase in core_phrases[1:] + if re.fullmatch(r"[a-z]*\d+[a-z0-9-]*", phrase) + or re.fullmatch(r"[a-z][a-z0-9-]{2,}", phrase) + ] + primary_with_model = " ".join( + part for part in (core_primary, model_phrases[0] if model_phrases else "") if part + ) for value in ( + " ".join(part for part in (brand_part, primary_with_model, spec_part) if part), " ".join(part for part in (brand_part, core_short, spec_part) if part), " ".join(part for part in (brand_part, core_short) if part), - " ".join(part for part in (core_short, spec_part) if part), + " ".join(part for part in (core_primary, spec_part) if part), identity.searchable_name, ): - cleaned = re.sub(r"[^\w\u4e00-\u9fff]+", " ", value) - cleaned = re.sub(r"\s+", " ", cleaned).strip() + cleaned = _clean_search_phrase(value) if cleaned and cleaned not in terms: terms.append(cleaned[:42]) if len(terms) >= max_terms: diff --git a/tests/test_competitor_match_attempts_persistence.py b/tests/test_competitor_match_attempts_persistence.py index 0e721c9..20fd7d8 100644 --- a/tests/test_competitor_match_attempts_persistence.py +++ b/tests/test_competitor_match_attempts_persistence.py @@ -18,7 +18,8 @@ def test_competitor_feeder_persists_all_match_attempt_outcomes(): assert "INSERT INTO competitor_match_attempts" in source assert "CAST(:search_terms AS jsonb)" in source assert 'attempt_status="matched"' in source - assert 'attempt_status="low_score"' in source + assert '"low_score"' in source + assert '"identity_veto"' in source assert 'attempt_status="no_result"' in source assert 'attempt_status="no_match"' in source assert 'attempt_status="error"' in source @@ -342,6 +343,71 @@ def test_competitor_feeder_skips_rejected_candidate_and_uses_next_best(monkeypat assert attempts[0]["best_product"].product_id == "DDAB01-ACCEPTABLE" +def test_competitor_feeder_splits_hard_veto_from_low_score(monkeypatch): + from services.competitor_price_feeder import CompetitorPriceFeeder + from services.pchome_crawler import PChomeProduct + + product = PChomeProduct( + product_id="DDAB01-WRONG", + name="iPhone 16 Pro 保護膜", + price=399, + original_price=499, + discount=20, + image_url="", + product_url="https://24h.pchome.com.tw/prod/DDAB01-WRONG", + stock=20, + store="24h", + rating=4.7, + review_count=8, + is_on_sale=True, + crawled_at=datetime.now(), + ) + + class FakeCrawler: + def __init__(self, *_args, **_kwargs): + pass + + def search_products(self, *_args, **_kwargs): + return True, "ok", [product] + + def fake_score(*_args, **_kwargs): + return SimpleNamespace( + score=0.31, + brand_score=0.0, + token_score=0.1, + spec_score=0.55, + sequence_score=0.1, + type_score=0.55, + price_penalty=0.0, + hard_veto=True, + reasons=("brand_conflict", "product_line_conflict"), + comparison_mode="not_comparable", + tags=["identity_v2", "identity_veto"], + ) + + monkeypatch.setattr("services.pchome_crawler.PChomeCrawler", FakeCrawler) + monkeypatch.setattr("services.marketplace_product_matcher.score_marketplace_match", fake_score) + feeder = CompetitorPriceFeeder(engine=object()) + attempts = [] + monkeypatch.setattr( + feeder, + "_record_match_attempt", + lambda *args, **kwargs: attempts.append(kwargs), + ) + + result = feeder._run_sku_items([{ + "sku": "A006", + "name": "【TAICEND 泰陞】寶貝液體保護膜 屁屁噴 100ml", + "product_id": 6, + "momo_price": 399, + }]) + + assert result.matched == 0 + assert result.skipped_low_score == 1 + assert attempts[0]["attempt_status"] == "identity_veto" + assert attempts[0]["diagnostics"].hard_veto is True + + def test_search_candidates_does_not_stop_on_merely_acceptable_match(monkeypatch): from services.competitor_price_feeder import _search_pchome_candidates from services.pchome_crawler import PChomeProduct @@ -421,6 +487,18 @@ def test_competitor_feeder_logs_keyword_parser_fallback(monkeypatch, caplog): assert "fallback to cleaned product name" in caplog.text +def test_competitor_feeder_keeps_original_name_as_search_fallback(): + from services import competitor_price_feeder + + terms = competitor_price_feeder._build_search_keywords( + "【Mustela 慕之恬廊】慕之幼 免用水潔淨液 300ml(外出清潔 卸除髒汙 卸除防曬 卸防曬)" + ) + + assert len(terms) == competitor_price_feeder.MAX_SEARCH_TERMS + assert terms[-1].startswith("Mustela 慕之恬廊 慕之幼 免用水潔淨液") + assert any("免用水潔淨液 300ml" in term for term in terms[:4]) + + def test_competitor_feeder_refreshes_expired_identity_by_known_product_id(monkeypatch): from services.competitor_price_feeder import CompetitorPriceFeeder from services.pchome_crawler import PChomeProduct diff --git a/tests/test_frontend_v2_assets.py b/tests/test_frontend_v2_assets.py index 7be4d77..3a61465 100644 --- a/tests/test_frontend_v2_assets.py +++ b/tests/test_frontend_v2_assets.py @@ -419,7 +419,8 @@ def test_ai_product_pick_agent_uses_real_competitor_data_and_dashboard_action(): assert "MAX_SEARCH_TERMS" in feeder_source assert "_build_search_keywords" in feeder_source assert "_search_pchome_candidates" in feeder_source - assert "crawler.search_products(keyword, limit=SEARCH_LIMIT, max_pages=SEARCH_MAX_PAGES)" in feeder_source + assert "search_limit = SEARCH_LIMIT * max(1, SEARCH_MAX_PAGES)" in feeder_source + assert "crawler.search_products(keyword, limit=search_limit, max_pages=SEARCH_MAX_PAGES)" in feeder_source assert "_fetch_unmatched_priority_skus" in feeder_source assert "_fetch_expired_identity_skus" in feeder_source assert "run_expired_identity_refresh" in feeder_source diff --git a/tests/test_marketplace_product_matcher.py b/tests/test_marketplace_product_matcher.py index 5d32df1..3138a94 100644 --- a/tests/test_marketplace_product_matcher.py +++ b/tests/test_marketplace_product_matcher.py @@ -435,6 +435,39 @@ def test_marketplace_search_terms_prefer_readable_brand_core_spec(): assert not any(term.endswith(" l") for term in terms) +def test_marketplace_search_terms_prioritize_identity_phrase_over_ambiguous_copy(): + from services.marketplace_product_matcher import build_search_terms + + terms = build_search_terms("【TAICEND 泰陞】寶貝液體保護膜 屁屁噴 100ml", max_terms=5) + + assert terms[0] == "泰陞 屁屁噴 100ml" + assert "保護膜" not in terms[0] + assert "屁屁噴" in " ".join(terms[:3]) + + +def test_marketplace_search_terms_drop_option_and_marketing_noise(): + from services.marketplace_product_matcher import build_search_terms + + terms = build_search_terms("【YSL】情挑誘光嫩唇凍6ml(任選一款/新品上市)", max_terms=5) + + assert terms[0] == "ysl 情挑誘光嫩唇凍 6ml" + assert not any("一款" in term or "上市" in term for term in terms) + + +def test_marketplace_search_terms_keep_professional_product_phrase(): + from services.marketplace_product_matcher import build_search_terms + + abysse_terms = build_search_terms("【Abysse】天然植萃身體按摩精油550ml", max_terms=5) + mustela_terms = build_search_terms( + "【Mustela 慕之恬廊】慕之幼 免用水潔淨液 300ml(外出清潔 卸除髒汙 卸除防曬 卸防曬)", + max_terms=5, + ) + + assert abysse_terms[0] == "abysse 身體按摩精油 550ml" + assert mustela_terms[0] == "慕之恬廊 免用水潔淨液 300ml" + assert not any("卸除防曬" in term or "外出清潔" in term for term in mustela_terms) + + def test_batch_compare_top_uses_latest_momo_price_not_revenue(monkeypatch): from services import pchome_crawler diff --git a/tests/test_pchome_crawler_search.py b/tests/test_pchome_crawler_search.py index f948c65..ed73e86 100644 --- a/tests/test_pchome_crawler_search.py +++ b/tests/test_pchome_crawler_search.py @@ -179,5 +179,5 @@ def test_feeder_search_candidate_passes_page_cap(monkeypatch): ) assert candidates == [product] - assert calls[0][1]["limit"] == 20 + assert calls[0][1]["limit"] == 40 assert calls[0][1]["max_pages"] == 2