diff --git a/TODO_NEXT_STEPS.txt b/TODO_NEXT_STEPS.txt index 6aef90f..2de575b 100644 --- a/TODO_NEXT_STEPS.txt +++ b/TODO_NEXT_STEPS.txt @@ -4,6 +4,7 @@ ================================================================================ 【已完成】 + - V10.312 強化 PChome 商品身份比對防錯配:matcher 開始解析 mg/mcg 劑量、件組套組與多規格集合,60ml+150ml vs 60ml+20ml、10mg vs 20mg、10片 vs 10盒、精華 vs 化妝水都會進硬否決或單位價覆核,不再靠單一規格重疊放行;覆核診斷同步新增「劑量差異」標籤,降低核心比價錯配污染 daily/growth/PPT/AI 分析。 - V10.311 統一競品價差語意:`fetch_competitor_comparison_results()`、competitor PPT 與 OpenClaw competitor prompt 全部改用 `MOMO - PChome`,正值代表 MOMO 較貴 / PChome 低價壓力,負值代表 MOMO 價格優勢;避免 daily/growth 顯示價格壓力但 PPT/AI 反向解讀。 - V10.310 強化 MOMO/PChome 核心比價閉環:PChome feeder 搜尋候選只有強同款 `0.90` 才提前停止,避免第一個 0.76 次佳候選卡掉後續精準搜尋詞;人工否決的候選會被跳過並改挑下一個候選,不再讓已否決商品長期阻塞同 SKU。人工 `reject_identity`、`unit_price_required`、`needs_research` 會立即讓同候選正式 `competitor_prices` 過期,Dashboard 即使尚有舊價也不再顯示正式總價差;手機版比價覆核欄位標籤、覆核按鈕冒泡與候選證據顯示同步修正。 - V10.308 修正商品列表 PChome 比價閉環狀態:`manual_rejected`、`manual_unit_price_required`、`manual_needs_research` 不再掉回籠統「待比對」,改顯示「人工已否決 / 人工標記單位價 / 人工要求補搜尋」與後續 feeder 行為說明,避免人工覆核後 UI 看起來像沒有處理。 diff --git a/config.py b/config.py index 471574d..086759a 100644 --- a/config.py +++ b/config.py @@ -320,7 +320,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '') # ========================================== # 系統版本與路徑 # ========================================== -SYSTEM_VERSION = "V10.311" +SYSTEM_VERSION = "V10.312" LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log') public_url = PUBLIC_URL # 用於模板顯示 diff --git a/docs/AI_INTELLIGENCE_MODULE_SOT.md b/docs/AI_INTELLIGENCE_MODULE_SOT.md index 89bee70..6038d8f 100644 --- a/docs/AI_INTELLIGENCE_MODULE_SOT.md +++ b/docs/AI_INTELLIGENCE_MODULE_SOT.md @@ -605,3 +605,4 @@ POSTGRES_HOST=momo-db | 2026-04-17 | 188 容器無 volume mount,`docker cp` 臨時解 | 重建 image(`COPY . .` bake 進新代碼);port 5001 衝突記錄為技術債 | | 2026-04-17 | 188 .env Telegram token 不正確(split-brain)| 修正為 `8610496165`,188→Telegram message_id=282 確認 | | 2026-04-17 | NIM Tool Calling E2E | 真實 NVIDIA_API_KEY 驗證:dispatched=3, errors=[] | +| 2026-05-20 | PChome 商品身份比對仍可能因單一規格重疊誤放行 | V10.312 起 matcher 解析 mg/mcg 劑量、件組套組、多規格集合與同數字不同單位;劑量/容量/重量/件數/品類衝突會硬否決或導向單位價覆核,避免錯配污染 Dashboard、daily/growth、PPT 與 AI 競價分析 | diff --git a/services/competitor_intel_repository.py b/services/competitor_intel_repository.py index 1e6d744..01d6f4d 100644 --- a/services/competitor_intel_repository.py +++ b/services/competitor_intel_repository.py @@ -79,6 +79,7 @@ MATCH_DIAGNOSTIC_REASON_LABELS = { "type_conflict": "品類不符", "volume_conflict": "容量差異", "weight_conflict": "重量差異", + "dosage_conflict": "劑量差異", "count_conflict": "件數差異", "component_count_conflict": "入數差異", "multi_component_conflict": "組合差異", diff --git a/services/marketplace_product_matcher.py b/services/marketplace_product_matcher.py index 3358e80..44a5afb 100644 --- a/services/marketplace_product_matcher.py +++ b/services/marketplace_product_matcher.py @@ -85,8 +85,10 @@ GENERIC_TOKENS = { "片", "支", "條", + "件", "ml", "g", + "mg", "la", "paris", } @@ -107,9 +109,50 @@ PRODUCT_TYPES = { "保健": ("錠", "膠囊", "粉", "飲", "包", "健康食品"), } -COUNT_UNITS = {"入", "組", "瓶", "支", "條", "盒", "包", "片", "顆", "錠", "枚"} +COUNT_UNITS = {"入", "組", "瓶", "支", "條", "盒", "包", "片", "顆", "錠", "枚", "件"} PIECE_UNITS = {"包", "片", "顆", "錠", "枚"} CONTAINER_UNITS = {"入", "組", "盒"} +BUNDLE_OFFER_PHRASES = ( + "囤貨組", + "超值組", + "特惠組", + "優惠組", + "優惠套組", + "禮盒組", + "加大組", + "加量組", + "分享組", + "明星組", + "套組", + "組合", + "組合包", + "雙件組", + "二件組", + "2件組", + "家庭組", + "多入組", +) +NON_BRAND_BRACKET_PHRASES = ( + "保濕組", + "熱銷款", + "限定", + "特惠", + "優惠", + "超值", + "囤貨", + "組合", + "套組", + "禮盒", + "分享", + "雙件", + "二件", + "2件", + "家庭", + "多入", + "任選", + "買", + "母親節", +) CHINESE_COUNT = { "一": 1, "二": 2, @@ -137,6 +180,7 @@ class ProductIdentity: core_tokens: frozenset[str] volumes_ml: tuple[float, ...] weights_g: tuple[float, ...] + dosages_mg: tuple[float, ...] counts: tuple[tuple[int, str], ...] total_piece_count: Optional[int] @@ -261,9 +305,10 @@ def _leading_brand_tokens(original: str, normalized: str) -> set[str]: bracket_match = re.match(r"\s*[【\[]([^】\]]{2,40})[】\]]", original or "") if bracket_match: content = normalize_product_text(bracket_match.group(1)) - for token in _tokenize(_strip_noise(content)): - if token not in GENERIC_TOKENS: - tokens.add(token) + if not any(phrase in content for phrase in NON_BRAND_BRACKET_PHRASES): + for token in _tokenize(_strip_noise(content)): + if token not in GENERIC_TOKENS: + tokens.add(token) leading = normalized[:48] for token in _tokenize(leading): @@ -293,28 +338,37 @@ def _convert_volume(value: str, unit: str) -> Optional[tuple[str, float]]: return ("g", number) if unit == "kg": return ("g", number * 1000) + if unit in {"mg", "毫克"}: + return ("mg", number) + if unit in {"mcg", "μg", "ug", "微克"}: + return ("mg", number / 1000) return None -def _extract_specs(text: str) -> tuple[tuple[float, ...], tuple[float, ...], tuple[tuple[int, str], ...], Optional[int]]: +def _extract_specs( + text: str, +) -> tuple[tuple[float, ...], tuple[float, ...], tuple[float, ...], tuple[tuple[int, str], ...], Optional[int]]: volumes_ml: list[float] = [] weights_g: list[float] = [] - for match in re.finditer(r"(\d+(?:\.\d+)?)\s*(ml|毫升|l|g|公克|kg)", text, re.I): + dosages_mg: list[float] = [] + for match in re.finditer(r"(\d+(?:\.\d+)?)\s*(ml|毫升|l|g|公克|kg|mg|毫克|mcg|μg|ug|微克)", text, re.I): converted = _convert_volume(match.group(1), match.group(2)) if not converted: continue unit, number = converted if unit == "ml": volumes_ml.append(number) - else: + elif unit == "g": weights_g.append(number) + else: + dosages_mg.append(number) counts: list[tuple[int, str]] = [] - for match in re.finditer(r"(\d+)\s*([入組瓶支條盒包片顆錠枚])", text): + for match in re.finditer(r"(\d+)\s*([入組瓶支條盒包片顆錠枚件])", text): counts.append((int(match.group(1)), match.group(2))) - for match in re.finditer(r"([一二兩雙三四五六七八九十])\s*([入組瓶支條盒包片顆錠枚])", text): + for match in re.finditer(r"([一二兩雙三四五六七八九十])\s*([入組瓶支條盒包片顆錠枚件])", text): counts.append((CHINESE_COUNT[match.group(1)], match.group(2))) - for match in re.finditer(r"(?:x|乘)\s*(\d+)\s*([入組瓶支條盒包片顆錠枚])?", text, re.I): + for match in re.finditer(r"(?:x|乘)\s*(\d+)\s*([入組瓶支條盒包片顆錠枚件])?", text, re.I): unit = match.group(2) or "入" counts.append((int(match.group(1)), unit)) @@ -334,6 +388,7 @@ def _extract_specs(text: str) -> tuple[tuple[float, ...], tuple[float, ...], tup return ( tuple(sorted(set(volumes_ml))), tuple(sorted(set(weights_g))), + tuple(sorted(set(dosages_mg))), unique_counts, total_piece_count, ) @@ -351,11 +406,11 @@ def parse_product_identity(name: str) -> ProductIdentity: for token in tokens if token not in GENERIC_TOKENS and not token.isdigit() - and not re.fullmatch(r"\d+(ml|g|kg|l)?", token) + and not re.fullmatch(r"\d+(ml|g|kg|l|mg|mcg|ug)?", token) } core_tokens -= brand_tokens - volumes_ml, weights_g, counts, total_piece_count = _extract_specs(searchable) + volumes_ml, weights_g, dosages_mg, counts, total_piece_count = _extract_specs(searchable) return ProductIdentity( original_name=name or "", normalized_name=normalized, @@ -366,6 +421,7 @@ def parse_product_identity(name: str) -> ProductIdentity: core_tokens=frozenset(core_tokens), volumes_ml=volumes_ml, weights_g=weights_g, + dosages_mg=dosages_mg, counts=counts, total_piece_count=total_piece_count, ) @@ -408,18 +464,58 @@ def _close_number(left: float, right: float, tolerance: float = 0.04) -> bool: def _spec_component(left_values: Iterable[float], right_values: Iterable[float]) -> tuple[float, bool]: - left_tuple = tuple(left_values) - right_tuple = tuple(right_values) + left_tuple = tuple(sorted(set(left_values))) + right_tuple = tuple(sorted(set(right_values))) if not left_tuple and not right_tuple: return 0.55, False if not left_tuple or not right_tuple: return 0.45, False + if len(left_tuple) > 1 or len(right_tuple) > 1: + if len(left_tuple) != len(right_tuple): + return 0.0, True + unmatched = list(right_tuple) + for left_value in left_tuple: + match_index = next( + ( + index + for index, right_value in enumerate(unmatched) + if _close_number(left_value, right_value) + ), + None, + ) + if match_index is None: + return 0.0, True + unmatched.pop(match_index) + return 1.0, False for left_value in left_tuple: if any(_close_number(left_value, right_value) for right_value in right_tuple): return 1.0, False return 0.0, True +def _has_hard_count_unit_conflict(left: ProductIdentity, right: ProductIdentity) -> bool: + if not left.counts or not right.counts: + return False + left_by_count: dict[int, set[str]] = {} + right_by_count: dict[int, set[str]] = {} + for count, unit in left.counts: + left_by_count.setdefault(count, set()).add(unit) + for count, unit in right.counts: + right_by_count.setdefault(count, set()).add(unit) + + for count in set(left_by_count) & set(right_by_count): + left_units = left_by_count[count] + right_units = right_by_count[count] + if left_units & right_units: + continue + if ( + (left_units & PIECE_UNITS and right_units & CONTAINER_UNITS) + or (right_units & PIECE_UNITS and left_units & CONTAINER_UNITS) + ): + return True + return False + + def _count_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, bool]: left_counts = [count for count, _unit in left.counts] right_counts = [count for count, _unit in right.counts] @@ -431,6 +527,8 @@ def _count_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, if left.counts and right.counts: if set(left.counts) & set(right.counts): return 0.85, False + if _has_hard_count_unit_conflict(left, right): + return 0.0, True if left_counts and right_counts: ratio = max(max(left_counts), max(right_counts)) / max(min(max(left_counts), max(right_counts)), 1) if ratio >= 1.5: @@ -444,6 +542,7 @@ def _count_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, def _spec_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, bool, tuple[str, ...]]: volume_score, volume_conflict = _spec_component(left.volumes_ml, right.volumes_ml) weight_score, weight_conflict = _spec_component(left.weights_g, right.weights_g) + dosage_score, dosage_conflict = _spec_component(left.dosages_mg, right.dosages_mg) count_score, count_conflict = _count_score(left, right) available = [] @@ -451,6 +550,8 @@ def _spec_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, b available.append(volume_score) if left.weights_g or right.weights_g: available.append(weight_score) + if left.dosages_mg or right.dosages_mg: + available.append(dosage_score) if left.counts or right.counts: available.append(count_score) if not available: @@ -462,6 +563,8 @@ def _spec_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, b conflicts.append("volume_conflict") if weight_conflict: conflicts.append("weight_conflict") + if dosage_conflict: + conflicts.append("dosage_conflict") if count_conflict: conflicts.append("count_conflict") return score, bool(conflicts), tuple(conflicts) @@ -473,16 +576,7 @@ def _has_bundle_offer(identity: ProductIdentity) -> bool: re.search(r"買\s*\d+\s*送\s*\d+", text) or re.search(r"買\s*[一二兩雙三四五六七八九十]\s*送\s*[一二兩雙三四五六七八九十]", text) or "買一送一" in text - or "囤貨組" in text - or "超值組" in text - or "特惠組" in text - or "優惠套組" in text - or "禮盒組" in text - or "加大組" in text - or "加量組" in text - or "分享組" in text - or "明星組" in text - or "套組" in text + or any(phrase in text for phrase in BUNDLE_OFFER_PHRASES) ) @@ -491,7 +585,7 @@ def _has_multi_component(identity: ProductIdentity) -> bool: return bool( "+" in text or "+" in text - or re.search(r"\d+\s*(?:ml|g|毫升|公克)\s*x\s*\d+", text, re.I) + or re.search(r"\d+\s*(?:ml|g|mg|毫升|公克|毫克)\s*x\s*\d+", text, re.I) ) @@ -507,7 +601,13 @@ def _has_refill_pack(identity: ProductIdentity) -> bool: def _spec_mention_count(identity: ProductIdentity) -> int: - return len(re.findall(r"\d+(?:\.\d+)?\s*(?:ml|毫升|l|g|公克|kg)", identity.normalized_name, re.I)) + return len( + re.findall( + r"\d+(?:\.\d+)?\s*(?:ml|毫升|l|g|公克|kg|mg|毫克|mcg|μg|ug|微克)", + identity.normalized_name, + re.I, + ) + ) def _count_text_value(value: str) -> Optional[int]: @@ -526,6 +626,11 @@ def _pack_multiplier(identity: ProductIdentity) -> int: return left + right if "買一送一" in text or "買1送1" in text: return 2 + piece_pack = re.search(r"(\d+|[一二兩雙三四五六七八九十])\s*件\s*組", text) + if piece_pack: + count = _count_text_value(piece_pack.group(1)) or 0 + if count > 1: + return count multipliers = [count for count, unit in identity.counts if unit in COUNT_UNITS and count > 1] if multipliers: @@ -764,7 +869,7 @@ def score_marketplace_match( hard_veto = True if chinese_name_score < 0.16 and token_score < 0.72: hard_veto = True - if left.product_type and right.product_type and left.product_type != right.product_type and token_score < 0.55: + if left.product_type and right.product_type and left.product_type != right.product_type: hard_veto = True comparison_mode = "exact_identity" @@ -845,6 +950,10 @@ def build_search_terms(name: str, max_terms: int = 3) -> list[str]: specs.append(f"{int(identity.volumes_ml[0])}ml") if identity.weights_g: specs.append(f"{int(identity.weights_g[0])}g") + if identity.dosages_mg: + dosage = identity.dosages_mg[0] + dosage_label = f"{int(dosage)}mg" if dosage.is_integer() else f"{dosage:g}mg" + specs.append(dosage_label) if identity.total_piece_count: specs.append(f"{identity.total_piece_count}包") diff --git a/tests/test_marketplace_product_matcher.py b/tests/test_marketplace_product_matcher.py index 7a01e79..3a26b99 100644 --- a/tests/test_marketplace_product_matcher.py +++ b/tests/test_marketplace_product_matcher.py @@ -167,6 +167,99 @@ def test_marketplace_matcher_does_not_promote_wide_price_refill_candidate(): assert "strong_product_line_match" not in diagnostics.reasons +def test_marketplace_matcher_rejects_partial_overlap_in_multi_spec_set(): + from services.marketplace_product_matcher import score_marketplace_match + + diagnostics = score_marketplace_match( + "【蘭蔻】玫瑰霜60ml+玫瑰精露150ml", + "【蘭蔻】玫瑰霜60ml+玫瑰精露20ml", + momo_price=18765, + competitor_price=7999, + ) + + assert diagnostics.score < 0.76 + assert diagnostics.hard_veto is True + assert "volume_conflict" in diagnostics.reasons + assert diagnostics.comparison_mode == "not_comparable" + + +def test_marketplace_matcher_rejects_dosage_conflict(): + from services.marketplace_product_matcher import score_marketplace_match + + diagnostics = score_marketplace_match( + "品牌 葉黃素 10mg 60錠", + "品牌 葉黃素 20mg 60錠", + momo_price=990, + competitor_price=890, + ) + + assert diagnostics.score < 0.76 + assert diagnostics.hard_veto is True + assert "dosage_conflict" in diagnostics.reasons + + +def test_marketplace_matcher_rejects_product_type_conflict_even_when_line_matches(): + from services.marketplace_product_matcher import score_marketplace_match + + diagnostics = score_marketplace_match( + "理膚寶水 MelaB3 淡斑精華 30ml", + "理膚寶水 MelaB3 淡斑化妝水 30ml", + momo_price=1280, + competitor_price=1180, + ) + + assert diagnostics.score < 0.76 + assert diagnostics.hard_veto is True + assert "type_conflict" in diagnostics.reasons + assert diagnostics.comparison_mode == "not_comparable" + + +def test_marketplace_matcher_rejects_same_count_different_unit_family(): + from services.marketplace_product_matcher import score_marketplace_match + + diagnostics = score_marketplace_match( + "品牌 保濕面膜10片", + "品牌 保濕面膜10盒", + momo_price=399, + competitor_price=1990, + ) + + assert diagnostics.score < 0.76 + assert diagnostics.hard_veto is True + assert "count_conflict" in diagnostics.reasons + + +def test_marketplace_matcher_marks_generic_bundle_words_as_unit_comparable(): + from services.marketplace_product_matcher import score_marketplace_match + + diagnostics = score_marketplace_match( + "【NARS】裸光蜜粉餅 雙件組 10g", + "【NARS】裸光蜜粉餅 10g", + momo_price=1999, + competitor_price=1099, + ) + + assert diagnostics.score < 0.76 + assert diagnostics.hard_veto is True + assert "bundle_offer_conflict" in diagnostics.reasons + assert diagnostics.comparison_mode == "unit_comparable" + + +def test_marketplace_matcher_ignores_non_brand_bracket_copy(): + from services.marketplace_product_matcher import score_marketplace_match + + diagnostics = score_marketplace_match( + "【保濕組】理膚寶水 B5 修復霜 40ml", + "理膚寶水 B5 修復霜 40ml", + momo_price=699, + competitor_price=679, + ) + + assert diagnostics.score >= 0.76 + assert diagnostics.hard_veto is False + assert "brand_conflict" not in diagnostics.reasons + + def test_batch_compare_top_uses_latest_momo_price_not_revenue(monkeypatch): from services import pchome_crawler