Files
ewoooc/services/marketplace_product_matcher.py
OoO 9d84cbfd43
All checks were successful
CD Pipeline / deploy (push) Successful in 1m8s
feat: deepen pchome momo backfill guardrails
2026-06-19 00:41:20 +08:00

5866 lines
214 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""跨電商商品身份比對工具。
這裡處理「是否為同一個商品」;價格只作為 sanity check,不能主導配對。
"""
from __future__ import annotations
import re
import unicodedata
from dataclasses import dataclass, field
from difflib import SequenceMatcher
from typing import Iterable, Optional
# Marketing / merchandising phrases that `_strip_noise` removes from a
# normalized product name (longest phrase first) before tokenization.
NOISE_PHRASES = (
    "momo",
    "pchome",
    "24h",
    "官方直營",
    "官方",
    "公司貨",
    "台灣公司貨",
    "專櫃公司貨",
    "正貨",
    "原廠",
    "限時",
    "特惠",
    "優惠",
    "超值",
    "加贈",
    "贈品",
    "送禮",
    # NOTE(review): empty literal — looks like a single character lost when
    # this file was extracted; confirm the real value against the repository.
    "",
    "買一送一",
    "買1送1",
    "限定版",
    "璀璨奢金限定版",
    "單入任選",
    "單入",
    "全肌防護",
    "經典防護王",
    "賦活美學",
    "弱酸性",
    "植萃複方",
    "溫和潤澤護理",
    "ph值平衡",
    "淨味沐浴乳",
    "香氛凝膠",
    "絲絨甜點新色",
    "鎖吻棒",
    "水光持色",
    "贈精油一瓶",
    "贈送薰衣草精油",
    "超聲波大噴霧",
    "天然陶瓷",
    "女大電視節目推薦",
    "醫師好辣推薦",
    "嬰兒界萬用霜",
    "48h抑味爽身",
    "10度c順降",
    "vit b5",
    "vitb5",
    "任選",
    "即期品",
    "福利品",
    "預購",
    "免運",
    "熱銷",
    "人氣",
    "必買",
    "推薦",
    "新品",
    "升級版",
    "經典",
    "獨家",
    "囤貨組",
    "超值組",
    "優惠組",
    "分享包",
    "組合",
    "多款可選",
    "多款任選",
    "任選多款",
    "多色可選",
    "色號可選",
    "平行輸入",
    "大容量",
    "附燈泡",
    "贈燈泡",
    "定時",
    "調節亮度",
    "可調光",
    "聖誕禮物",
    "聖誕節禮物",
    "懶人霜",
    "打造素顏女神",
    "第三代經典版白",
)
# Tokens that carry no identity signal. They are filtered out of brand tokens
# (`_known_brand_tokens`, `_leading_brand_tokens`) and out of the core tokens
# built in `parse_product_identity`.
GENERIC_TOKENS = {
    "官方",
    "直營",
    "公司貨",
    "專櫃",
    "正貨",
    "原廠",
    "限時",
    "特惠",
    "優惠",
    "超值",
    "加贈",
    "贈品",
    "送禮",
    "即期品",
    "新品",
    "升級版",
    "經典",
    "人氣",
    "熱銷",
    "必買",
    "推薦",
    # NOTE(review): the nine empty literals below look like single-character
    # entries lost during extraction; confirm against the repository.
    "",
    "",
    "",
    "",
    "",
    "",
    "",
    "",
    "",
    "ml",
    "g",
    "mg",
    "la",
    "paris",
    "多款",
    "可選",
    "任選",
    "平行輸入",
    "大容量",
    "日本",
    "韓國",
    "澳洲",
    "法國",
    "英國",
    "美國",
}
# Phrase-level noise vocabulary for search text.
# NOTE(review): not referenced in the part of the file visible here;
# presumably stripped when building marketplace search queries — confirm.
SEARCH_NOISE_PHRASES = (
    "新品上市",
    "全新上市",
    "國際航空版",
    "超取免運",
    "任選一款",
    "任選1款",
    "任選一色",
    "任選1色",
    "多款任選",
    "多款可選",
    "色號可選",
    "香味可選",
    "口味可選",
    "送精美紙袋",
    "精美紙袋",
    "交換禮物",
    "聖誕禮物",
    "限定版",
    "璀璨奢金限定版",
    "單入任選",
    "全肌防護",
    "經典防護王",
    "賦活美學",
    "弱酸性",
    "植萃複方",
    "溫和潤澤護理",
    "ph值平衡",
    "淨味沐浴乳",
    "香氛凝膠",
    "絲絨甜點新色",
    "鎖吻棒",
    "水光持色",
    "贈精油一瓶",
    "贈送薰衣草精油",
    "超聲波大噴霧",
    "天然陶瓷",
    "女大電視節目推薦",
    "醫師好辣推薦",
    "嬰兒界萬用霜",
    "48h抑味爽身",
    "10度c順降",
    "vit b5",
    "vitb5",
    "母親節",
    "父親節",
    "情人節",
    "外出清潔",
    "卸除髒汙",
    "卸除防曬",
    "卸防曬",
    "防水眼線",
    "寶寶牙刷",
    "紗布牙刷",
    "調節亮度",
    "韓國彩妝",
    "水光感",
    "官方直營",
    "官方",
    "經典款",
    "校色",
    "控油",
    "好氣色",
    "懶人霜",
    "打造素顏女神",
    "我愛修膚",
    "第三代經典版白",
    "溫和不乾澀",
    "寶寶共和國",
    "任選三款",
    "三款",
    "枚入",
    "類光繚指甲油專用亮油",
    "小銀蓋",
    "如膠似漆",
    "第三代",
    "經典版",
    "櫻花輕盈版",
    "兩入組",
    "超值兩入組",
    "任選色號",
    "多色任選",
    "多色可選",
    "多色",
    "德國妮維雅",
    "無印止汗滾珠",
    "眉彩刷",
    "眉餅盒分開販售",
    "極細筆芯",
    "防水抗暈",
    "兒童化妝品",
    "無毒防曬霜",
    "天然彩妝",
    "內贈芳香劑",
    "衛浴精油擴香瓶棒組",
    "衛浴精油擴香瓶",
    "三色選一",
    "贈複方",
)
# Token-level noise vocabulary for search text.
# NOTE(review): not referenced in the part of the file visible here;
# presumably dropped token-by-token from search queries — confirm.
SEARCH_NOISE_TOKENS = {
    "一款",
    "1款",
    "一色",
    "1色",
    "上市",
    "全新",
    "新品",
    "香味",
    "口味",
    "味道",
    "顏色",
    "色號",
    "紙袋",
    "禮物",
    "清潔",
    "髒汙",
    "防曬",
    "彩妝",
    "水光感",
    "超取",
    "免運",
    "航空版",
    "國際版",
    "附燈泡",
    "定時",
    "眼妝",
    "滅菌",
    "保濕",
    "抗老",
    "超品日",
    "經典款",
    # NOTE(review): empty literal — looks like a single character lost during
    # extraction; confirm against the repository.
    "",
    "pdrn",
    "校色",
    "控油",
    "好氣色",
    "懶人霜",
    "我愛修膚",
    "第三代",
    "經典版",
    "版白",
    "限量聯名款",
    "play",
    "boy",
    "小虎",
    "啾啾妹",
    "煎妮花",
    "涼感潔淨",
    "私密處清潔",
    "溫和不乾澀",
    "寶寶共和國",
    "三款",
    "枚入",
    "小銀蓋",
    "如膠似漆",
    "美甲",
    "3d",
    "多色",
    "提亮",
    "兩入組",
    "櫻花輕盈版",
}
# Product-name anchors (specific product lines followed by generic product
# nouns such as "面膜" / "精油").
# NOTE(review): not referenced in the part of the file visible here; the tuple
# order may be significant to its consumer — confirm before reordering.
SEARCH_IDENTITY_ANCHORS = (
    "控油清爽防曬棒",
    "蔓越莓私密清潔慕斯",
    "金縷梅私密清潔慕斯",
    "光采奪目眼部飾底乳",
    "男性私密沐浴露",
    "私密沐浴露",
    "hydsto 車載香薰",
    "電動刮鬍刀 s101",
    "磁吸控油定妝噴霧",
    "修容打亮棒",
    "私密潔淨凝露",
    "柔霧裸唇膏",
    "潤浸保濕清爽身體乳液",
    "閃亮珍珠眼影棒",
    "智能光感應無線自動除臭芳香噴霧機",
    "usb精油薰香機",
    "超音波水氧機",
    "類光繚指甲油",
    "多效提亮防曬霜",
    "速描眼線膠筆",
    "經典旋轉眉筆",
    "3d造型眉彩餅補充芯",
    "止汗爽身乳液",
    "慕之幼爽身潤膚乳",
    "精油芬香護手霜",
    "持久植物香氛精油",
    "口袋雙色修容打亮盤",
    "經典乳霜",
    "蜂王玫瑰外泌微臻霜",
    "微分子肌底原生露",
    "小浪智能感應自動噴香機",
    "智能感應自動噴香機",
    "深層全效面膜",
    "私密防護舒緩噴霧",
    "櫻之花身體噴霧",
    "光透立體眼線筆",
    "葳兒柔賦活凝膠",
    "兒童指甲油",
    "麗駐蘭修復舒緩面膜",
    "濕度數顯智能加濕器",
    "新艷陽夏水離子熱防禦隔離露",
    "3d極細防水眼線膠筆",
    "恆久完美透膚煙染腮紅",
    "私密植萃美白緊緻凝露",
    "學習刷牙漱口水",
    "羅馬柱智慧居家車用香氛機",
    "經典菲菲染唇液",
    "染唇液",
    "晨霧純精油擴香儀",
    "天然植物香氛精油",
    "爆水擦澡濕巾",
    "嬰兒潤膚乳",
    "可撕式水性兒童指甲油",
    "aroma lava 解憂放鬆緩緩燈",
    "經典款香氛蠟燭暖燈",
    "我愛超磁妝定妝噴霧",
    "全天候超完美定妝噴霧",
    "怪獸級持色唇膏",
    "焦糖楓葉香氛擴香花禮盒",
    "香氛蠟燭20種香味",
    "tokyo車用夾式消臭芳香劑",
    "北歐簡樸融蠟燈桌面氣氛夜燈",
    "大地有機植萃護膚油",
    "3d立體持色眉彩盤",
    "細芯睛彩雙頭眉筆",
    "雙頭旋轉極細眉筆",
    "武士刀眉筆",
    "自動武士刀眉筆筆蕊",
    "無極限保濕防曬妝前乳",
    "水凝光透 妝前防護乳",
    "水凝光透妝前防護乳",
    "經典素顏霜",
    "閃耀保色護甲油",
    "溫和洗手慕斯",
    "足足稱奇足膜",
    "時尚潮流美甲片",
    "止汗爽身噴霧",
    "止汗爽身乳膏pro",
    "零粉感超持久粉底棒",
    "超持久水光鎖吻唇釉",
    "裸光蜜粉餅",
    "私密潔膚露",
    "私密肌潔膚露",
    "男性私密醒肌抑菌噴霧",
    "男性私密激淨凝露",
    "私密抑菌噴霧",
    "天然陶瓷精油香薰機",
    "裸光幻閃亮采餅",
    "絕對持久定妝噴霧",
    "兒童防曬氣墊粉餅",
    "勝過眼皮十色眼影盤",
    "提提亮膚打亮液",
    "甜甜嫩頰腮紅液",
    "自動武士刀眉筆",
    "超進化光感輕潤遮瑕棒",
    "4合1微臻全能氣墊粉餅",
    "唯我玫瑰裸光潤唇膏",
    "晨曦冷香儀",
    "舒恬良修護霜",
    "頂級濃潤柔霜潤唇膏",
    "絕對完美永生玫瑰逆齡乳霜",
    "永生玫瑰逆齡乳霜",
    "永生玫瑰霜",
    "玫瑰精露",
    "玫瑰霜",
    "青春敷面膜",
    "長效潤膚霜",
    "小黑瓶",
    "私密處護潔露",
    "私密護潔露",
    "口腔清潔棒",
    "含氟防蛀修護牙膏",
    "自然遮瑕素顏霜",
    "超持久細滑眼線筆",
    "香氛融蠟燈",
    "水晶香氛能量寶盒禮盒組",
    "零粉感超持久柔焦蜜粉餅",
    "私密肌潔淨露",
    "私密潔浴露",
    "身體除毛器",
    "免用水潔淨液",
    "身體按摩精油",
    "按摩精油",
    "擴香補充瓶",
    "擴香瓶",
    "全面修復霜",
    "修復霜",
    "護膚膏",
    "屁屁噴",
    "身體乳",
    "緊實乳",
    "妝前防護乳",
    "妝前乳",
    "素顏霜",
    "潔膚露",
    "浴潔露",
    "潔淨液",
    "護甲油",
    "指甲油",
    "美甲片",
    "唇凍",
    "唇釉",
    "唇膏",
    "粉底棒",
    "遮瑕棒",
    "化妝水",
    "精華液",
    "精華",
    "面膜",
    "乳液",
    "乳霜",
    "面霜",
    "精油",
    "水氧機",
    "香氛機",
)
# Reason codes (string identifiers) grouped as "review only".
# NOTE(review): the code that emits and consumes these codes is outside the
# visible part of the file; the exact policy they trigger should be confirmed.
FOCUSED_IDENTITY_REVIEW_ONLY_REASONS = {
    "muji_aroma_hand_cream_brandless",
    "johnsons_baby_lotion_variant_catalog",
    "im_meme_fixx_cool_setting_spray",
    "so_natural_fixx_setting_spray_catalog",
    "kate_powder_case_catalog",
    "kate_monster_lipstick_catalog",
    "opi_gel_polish_series_catalog",
    "romand_juicy_lip_tint_2_catalog",
    "recipe_box_peelable_child_polish_catalog",
    "solone_longlasting_eyeliner",
    "shu_auto_hard_formula_refill_catalog",
    "summer_eve_full_skin_wash_2pack",
    "the_forest_maple_diffuser_flower_brandless",
}
# Reason codes grouped as "variant review bypass".
# NOTE(review): consumer is outside the visible part of the file; presumably
# these matches skip the variant-review step — confirm.
FOCUSED_IDENTITY_VARIANT_REVIEW_BYPASS_REASONS = {
    "artmis_cranberry_private_mousse_250ml",
    "artmis_witch_hazel_private_mousse_250ml",
    "arden_eight_hour_lip_spf15_3_7g_3pack",
    "baan_baby_lip_original_strawberry_catalog",
    "dhc_olive_lip_1_5g",
    "flortte_fruit_salad_eyeliner_0_5ml_catalog",
    "frudia_honey_blueberry_lip_10g",
    "hh_private_cleanser_laundry_wash_set",
    "kanebo_allie_bright_uv_milk_60g_catalog",
    "laroche_posay_lip_balm_4_7ml",
    "laroche_posay_repair_lip_balm_7_5ml",
    "lush_sakura_body_spray",
    "neutrogena_hand_cream_56g_scent_catalog",
    "natures_care_sheep_oil_exact_pack",
    "opi_gel_polish_exact_model",
    "sebamed_baby_lip_4_8g_2pack",
    "sebamed_ph38_private_wash_200ml_2pack",
    "so_natural_fixx_setting_spray_120ml_plain",
    "sk_ii_essence_330ml_2pack",
    "amiino_whitening_repair_cream_30ml",
    "tomoon_nail_clipper_luxury_size",
    "yes_cuticle_scissors_9cm",
    "yes_curved_scissors_9cm",
    "yes_foot_nail_scissors_10_5cm",
    "yes_nail_tool_exact_model_size",
    "cetaphil_long_lotion_237ml",
    "cetaphil_long_lotion_473ml",
    "cetaphil_long_moisturizing_cream_250g",
    "cetaphil_ad_repair_cream_227g",
    "clarins_double_serum_eye_20ml",
    "lab52_paw_patrol_baby_toothbrush_2pack",
    "derma_baby_wash_150ml",
    "derma_baby_wash_500ml",
    "physiogel_ai_ice_essence_200ml_2pack",
    "playjoy_powerman_male_care_30ml",
    "ts6_private_elastic_gel_40g",
    "beauty_foot_mask_exact_pack",
    "kameria_centella_foot_mask_17ml_2pc",
    "ts6_lubricant_100g_3pack",
    "ts6_peach_bright_gel_45g_3pack",
    "ts6_white_wash_peach_gel_kit",
    "ts6_cooling_clean_mousse_100g",
    "vaseline_baby_jelly_368g_3pack",
}
# Reason codes for matches where one listing carries no brand ("brandless").
# NOTE(review): consumer is outside the visible part of the file — confirm the
# review policy these codes trigger.
FOCUSED_IDENTITY_BRANDLESS_REVIEW_REASONS = {
    "herbacin_classic_hand_cream_20ml_brandless",
    "muji_aroma_hand_cream_brandless",
    "the_forest_maple_diffuser_flower_brandless",
}
# Brandless reason codes that are also grouped under "total price".
# NOTE(review): consumer is outside the visible part of the file — confirm.
FOCUSED_IDENTITY_BRANDLESS_TOTAL_PRICE_REASONS = {
    "herbacin_classic_hand_cream_20ml_brandless",
}
# Reason codes grouped under "total price".
# NOTE(review): consumer is outside the visible part of the file; presumably
# these matches are compared on total price rather than unit price — confirm.
FOCUSED_IDENTITY_TOTAL_PRICE_REASONS = {
    "3w_clinic_collagen_foundation_50ml_2pack",
    "hanamisui_moisture_original_gel_1_7g_3pack",
    "hanamisui_inclear_private_gel_1_7g_3pack",
    "hanamisui_relax_lavender_gel_1_7g_3pack",
    "the_ordinary_caffeine_egcg_30ml",
    "herbacin_classic_hand_cream_20ml_brandless",
    "sab_private_spray",
    "st_clare_private_mousse_150ml_2pack",
    "st_clare_private_mousse_spray_set",
    "biopeutic_plus_aha_lotion_20_150ml",
    "taisu_baby_bath_shampoo_3pc",
    "arden_eight_hour_lip_spf15_3_7g_3pack",
    "flortte_fruit_salad_eyeliner_0_5ml_catalog",
    "hh_private_cleanser_laundry_wash_set",
    "kanebo_allie_bright_uv_milk_60g_catalog",
    "laroche_posay_repair_lip_balm_7_5ml",
    "neutrogena_hand_cream_56g_scent_catalog",
    "natures_care_sheep_oil_exact_pack",
    "opi_gel_polish_exact_model",
    "sebamed_ph38_private_wash_200ml_2pack",
    "sk_ii_essence_330ml_2pack",
    "amiino_whitening_repair_cream_30ml",
    "kussen_baby_butt_cream_50ml_3pack",
    "tomoon_nail_clipper_luxury_size",
    "yes_cuticle_scissors_9cm",
    "yes_curved_scissors_9cm",
    "yes_foot_nail_scissors_10_5cm",
    "yes_nail_tool_exact_model_size",
    "bone_diffuser_gift_3pack",
    "selection1990_half_dome_wax_lamp_white",
    "selection1990_bendable_wax_lamp_white",
    "canmake_tear_bag_palette",
    "gdesign_aroma_lava_lamp_2",
    "hooome_classic_white_wax_lamp_bulbs_giftbox",
    "herb24_mist_diffuser_black",
    "pavaruni_40_scent_oil",
    "pavaruni_20_scent_candle",
    "artmis_cranberry_private_mousse_250ml",
    "artmis_witch_hazel_private_mousse_250ml",
    "baan_baby_lip_original_strawberry_catalog",
    "dhc_olive_lip_1_5g",
    "derma_eco_skin_oil",
    "frudia_honey_blueberry_lip_10g",
    "laroche_posay_lip_balm_4_7ml",
    "lush_sakura_body_spray",
    "sebamed_baby_lip_4_8g_2pack",
    "so_natural_fixx_setting_spray_120ml_plain",
    "cetaphil_long_lotion_237ml",
    "cetaphil_long_lotion_473ml",
    "cetaphil_long_moisturizing_cream_250g",
    "cetaphil_ad_repair_cream_227g",
    "clarins_double_serum_eye_20ml",
    "lab52_paw_patrol_baby_toothbrush_2pack",
    "derma_baby_wash_150ml",
    "derma_baby_wash_500ml",
    "physiogel_ai_ice_essence_200ml_2pack",
    "playjoy_powerman_male_care_30ml",
    "ts6_private_elastic_gel_40g",
    "beauty_foot_mask_exact_pack",
    "kameria_centella_foot_mask_17ml_2pc",
    "ts6_lubricant_100g_3pack",
    "ts6_peach_bright_gel_45g_3pack",
    "ts6_white_wash_peach_gel_kit",
    "ts6_cooling_clean_mousse_100g",
    "vaseline_baby_jelly_368g_3pack",
    "nivea_creme_100ml",
    "schick_womens_sensitive_blade_3pack",
}
# Generic product nouns flagged as "broad" anchors.
# NOTE(review): consumer is outside the visible part of the file; presumably
# these anchors are too generic to identify a product on their own — confirm.
SEARCH_BROAD_ANCHORS = {
    "乳霜",
    "面霜",
    "面膜",
    "精華",
    "乳液",
    "精油",
    "香氛融蠟燈",
}
# Product keywords for categories sold in several scents / shades / variants.
# NOTE(review): consumer is outside the visible part of the file — confirm how
# a hit on one of these keywords changes matching.
VARIANT_SENSITIVE_KEYWORDS = {
    "香氛蠟燭",
    "芬香蠟燭",
    "蠟燭",
    "擴香",
    "融蠟燈",
    "車用香氛",
    "香味",
    "私密清潔慕斯",
    "私密清潔凝露",
    "私密潔淨凝露",
    "私密淨白清潔凝露",
    "私密防護慕絲",
    "慕絲",
    "定妝噴霧",
    "妝前防護乳",
    "妝前乳",
    "素顏霜",
    "粉底",
    "美甲片",
    "指甲油",
    "指彩",
    "眼影盤",
    "唇釉",
    "唇膏",
    "唇凍",
    "潤唇膏",
    "眉筆",
    "眼線筆",
    "腮紅液",
    "打亮液",
    "蜜粉餅",
    "粉餅盒",
    "粉底棒",
    "遮瑕棒",
    "遮瑕蜜",
    "護手霜",
    "滋養霜",
    "修護乳",
    "修容打亮棒",
    "防曬",
    "防曬乳",
    "防曬霜",
    "防曬噴霧",
    "防曬棒",
}
# Scent, flavour, finish and colour words that name a specific variant option.
# NOTE(review): consumer is outside the visible part of the file — confirm.
VARIANT_OPTION_COLOR_WORDS = {
    "茉莉花",
    "梔子花",
    "白茶蘭花",
    "白茶",
    "白麝香",
    "黑麝香",
    "清新花園",
    "寶貝粉香",
    "青檸羅勒",
    "炭木香",
    "無花果",
    "鼠尾草",
    "海鹽",
    "檸檬草",
    "茶樹",
    "英國梨",
    "小蒼蘭",
    "英國梨小蒼蘭",
    "櫻花",
    "繡球花",
    "魔髮奇緣",
    "清甜柚香",
    "杏仁牛奶",
    "杏仁",
    "薄荷",
    "橙花",
    "完熟白桃",
    "琥珀橙",
    "干邑棕",
    "賽車綠",
    "原味",
    "草莓",
    "蔓越莓",
    "金縷梅",
    "柔焦霧面",
    "水光亮面",
    "菸鹼醯胺",
    "胺基酸",
    "黑色",
    "棕色",
    "咖啡色",
    "灰色",
    "rose",
    "白色",
    "紅色",
    "粉色",
    "粉紅",
    "桃紅",
    "玫瑰",
    "玫瑰色",
    "珊瑚",
    "珊瑚色",
    "橘色",
    "橙色",
    "裸色",
    "奶茶色",
    "豆沙色",
    "紫色",
    "絕絕紫",
    "薰衣草",
    "藍色",
    "綠色",
    "膚色",
    "自然色",
    "明亮色",
    "透明色",
    "清爽型",
    "滋潤型",
    "橡棕",
    "暗灰",
    "灰棕",
    "淺玫粉",
    "極光之藍",
    "月光銀影",
}
# Descriptor phrases that look like variant text but do not name a variant.
# NOTE(review): consumer is outside the visible part of the file — confirm.
VARIANT_DESCRIPTOR_NOISE_KEYWORDS = {
    "平輸航空版",
    "多色任選",
    "色號任選",
    "任選色號",
    "極細筆頭",
    "筆頭",
    "官方直營",
    "入組",
    "盒組",
}
# Phrases signalling that one listing covers several selectable variants
# ("pick any", "multiple colours available", ...).
# NOTE(review): consumer is outside the visible part of the file — confirm.
MULTI_VARIANT_LISTING_PHRASES = (
    "多款任選",
    "多款可選",
    "多色任選",
    "多色可選",
    "多種香味",
    "多種香氣",
    "香味任選",
    "香味可選",
    "味道可選",
    "任選",
    "可選",
)
# Product terms treated as ambiguous in search (both mean "protective film").
# NOTE(review): consumer is outside the visible part of the file — confirm.
SEARCH_AMBIGUOUS_PRODUCT_TERMS = {
    "保護膜",
    "保護貼",
}
# Canonical brand -> aliases. `_known_brand_tokens` merges these on top of the
# alias tables imported from `services.price_comparison`; every alias (and the
# canonical key itself) maps back to the canonical brand.
BRAND_ALIAS_OVERRIDES = {
    "clarins": ("克蘭詩", "clarins"),
    "nars": ("nars",),
    "relove": ("relove",),
    "stadler form": ("stadler form", "stadlerform"),
    "cetaphil": ("舒特膚", "cetaphil"),
    "sisley": ("希思黎", "sisley"),
    "gennies": ("奇妮", "gennies"),
    "uruhimemomoko": ("潤姬桃子", "uruhimemomoko", "uruhime momoko"),
    "arau baby": ("arau baby", "arau", "愛樂寶", "saraya"),
    "sebamed": ("sebamed", "施巴"),
    "shu uemura": ("shu uemura", "shuuemura", "植村秀"),
    "johnsons": ("johnsons", "johnson's", "johnson", "嬌生"),
    "gillette": ("gillette", "吉列"),
    "schick": ("schick", "舒適牌"),
    "obge": ("obge",),
    "vaseline": ("vaseline", "凡士林"),
    "eaoron": ("eaoron",),
    "kameria": ("kameria", "凱蜜菈"),
    "cocodor": ("cocodor",),
    "peripera": ("peripera",),
    "solone": ("solone",),
    # NOTE(review): the first and third aliases are identical here; the third
    # presumably used a different apostrophe/space character that was lost in
    # extraction — confirm against the repository.
    "im meme": ("im meme", "i'm meme", "im meme"),
    "dr.wu": ("dr.wu", "dr wu", "drwu", "達爾膚"),
    "febreze": ("febreze", "風倍清"),
    "jo malone": ("jo malone",),
    "prada": ("prada", "普拉達"),
    "za": ("za",),
    "hh": ("hh", "草本新淨界"),
    "小浪": ("小浪",),
    "xiaomi": ("小米有品", "小米", "xiaomi"),
    "mac": ("m.a.c", "mac", "m a c"),
    "opi": ("o.p.i", "opi", "o p i"),
    "curel": ("curel", "珂潤"),
    "karadium": ("karadium",),
    "st雞仔牌": ("日本雞仔牌st", "日本st雞仔牌", "st雞仔牌", "雞仔牌st", "雞仔牌"),
}
# Product type -> substrings that identify it. `_extract_product_type` walks
# this dict in insertion order and returns the first type with a matching
# alias, so more specific types must stay ahead of generic ones.
PRODUCT_TYPES = {
    "止汗噴霧": ("止汗爽身噴霧", "爽身噴霧", "止汗噴霧"),
    "潔膚露": ("潔膚露", "浴潔露", "護潔露", "沐浴露", "wash", "私密潔浴露"),
    "私密噴霧": ("私密噴霧", "抑菌噴霧", "醒肌抑菌噴霧"),
    "私密凝露": ("凝露", "激淨凝露", "緊實凝露", "亮白凝露"),
    "護甲油": ("護甲油", "亮油", "top coat"),
    "定妝噴霧": ("定妝噴霧", "setting spray"),
    "修容打亮棒": ("修容打亮棒", "修容棒", "打亮棒"),
    "刮鬍刀": ("刮鬍刀", "刮胡刀", "shaver", "razor"),
    "體香膏": ("體香膏", "體香劑", "deodorant"),
    "電動牙刷": ("電動牙刷", "聲波電動牙刷", "electric toothbrush"),
    "洗手慕斯": ("洗手慕斯", "洗手泡泡", "hand wash foam"),
    "私密慕斯": ("私密清潔慕斯", "私密防護慕絲", "私密慕斯"),
    "足膜": ("足膜", "足部膜", "足部去角質"),
    "妝前乳": ("妝前乳", "妝前防護乳", "妝前隔離", "primer"),
    "素顏霜": ("素顏霜", "tone up cream"),
    "氣墊粉餅": ("氣墊粉餅", "cushion"),
    "眼影盤": ("眼影盤",),
    "打亮液": ("打亮液",),
    "腮紅液": ("腮紅液",),
    "護唇膏": ("護唇膏", "潤唇膏"),
    "唇釉": ("唇釉", "唇彩", "lip tint", "lip glaze"),
    "粉底棒": ("粉底棒", "foundation stick"),
    "精華": ("精華", "精華液", "essence", "serum", "安瓶"),
    "化妝水": ("化妝水", "機能水", "toner", "lotion"),
    "乳液": ("乳液", "按摩乳", "潤膚乳", "身體乳", "嬰兒乳液", "寶寶乳液", "emulsion", "milk"),
    # NOTE(review): the empty alias looks like a single character lost in
    # extraction. As written, "" is a substring of every string, so this entry
    # would match any text that reaches it — confirm against the repository.
    "面霜": ("面霜", "乳霜", "", "cream"),
    "防曬": ("防曬", "spf", "uv", "sunscreen"),
    "洗面乳": ("洗面乳", "洗顏", "潔面", "cleanser", "foam"),
    "面膜": ("面膜", "mask"),
    "眼霜": ("眼霜", "眼部", "眼膜", "eye"),
    "卸妝": ("卸妝", "cleansing", "remover"),
    "粉底": ("粉底", "粉霜", "粉凝露", "foundation"),
    "蜜粉": ("蜜粉", "powder"),
    "精油": ("精油", "香氛", "擴香"),
    # NOTE(review): four aliases here are empty literals, presumably
    # single-character dosage-form words lost in extraction — confirm.
    "保健": ("", "膠囊", "", "", "", "健康食品"),
}
# Count units recognised after a number (e.g. "3入", "2瓶").
# The single-character members had all degraded to "" (so every set collapsed
# to {""}). They are restored here from COUNT_UNIT_PATTERN, which lists the
# same 21 units in the same order.
COUNT_UNITS = {
    "入", "組", "瓶", "支", "條", "盒", "包", "袋", "片", "顆", "粒", "錠",
    "枚", "件", "罐", "杯", "本", "刀把", "刀片", "刀頭", "蕊",
}
COUNT_UNIT_PATTERN = r"(?:刀把|刀片|刀頭|入|組|瓶|支|條|盒|包|袋|片|顆|粒|錠|枚|件|罐|杯|本|蕊)"
# Units that count individual pieces; same seven characters as the
# explicit-total character class used in `_extract_specs`.
PIECE_UNITS = {"包", "袋", "片", "顆", "粒", "錠", "枚"}
# Units that count outer containers / packs.
# NOTE(review): restored by elimination (the ten single-character units that
# are neither piece units nor "蕊"); confirm against the repository.
CONTAINER_UNITS = {"入", "組", "瓶", "支", "條", "盒", "件", "罐", "杯", "本"}
# Units that should compare as equal when counting the same kind of thing.
# NOTE(review): the "refill" key is restored as "蕊", the only refill-like
# unit in COUNT_UNIT_PATTERN; confirm against the repository.
COUNT_UNIT_FAMILIES = {
    "刀片": "blade",
    "刀頭": "blade",
    "蕊": "refill",
}
ENGLISH_COUNT_UNIT_RE = r"(?:pcs?|pieces?|capsules?|caps?|tablets?|tabs?|packs?|sachets?|bottles?|boxes?)"
# Phrases that mark a listing as a bundle / set offer.
# NOTE(review): consumer is outside the visible part of the file — confirm.
BUNDLE_OFFER_PHRASES = (
    "囤貨組",
    "超值組",
    "特惠組",
    "優惠組",
    "優惠套組",
    "入門組",
    "禮盒組",
    "加大組",
    "加量組",
    "分享組",
    "明星組",
    "套組",
    "組合",
    "組合包",
    "雙件組",
    "二件組",
    "2件組",
    "家庭組",
    "多入組",
)
# If the leading "【...】" / "[...]" segment of a title contains any of these
# phrases, `_leading_brand_tokens` treats it as promotion text, not a brand.
NON_BRAND_BRACKET_PHRASES = (
    "保濕組",
    "熱銷款",
    "限定",
    "特惠",
    "優惠",
    "超值",
    "囤貨",
    "組合",
    "套組",
    "禮盒",
    "分享",
    "雙件",
    "二件",
    "2件",
    "家庭",
    "多入",
    "任選",
    # NOTE(review): empty literal — looks like a single character lost in
    # extraction. As written, "" is contained in every string, so a plain
    # `phrase in content` test would always succeed — confirm the real value.
    "",
    "母親節",
)
# Chinese numeral -> integer value, used by `_extract_specs` for counts such
# as "兩入" or "三瓶". All twelve keys had degraded to "", which collapsed the
# dict to a single entry and made every lookup raise KeyError. They are
# restored from the character class `[一二兩雙三四五六七八九十]` used by the
# lookup, in the same order as the values.
CHINESE_COUNT = {
    "一": 1,
    "二": 2,
    "兩": 2,
    "雙": 2,
    "三": 3,
    "四": 4,
    "五": 5,
    "六": 6,
    "七": 7,
    "八": 8,
    "九": 9,
    "十": 10,
}
@dataclass(frozen=True)
class ProductIdentity:
    """Immutable parsed view of one product title, built by `parse_product_identity`."""

    # Raw title as supplied by the caller ("" when the caller passed a falsy value).
    original_name: str
    # Output of `normalize_product_text` for the title.
    normalized_name: str
    # Normalized title after `_strip_noise` removed marketing phrases.
    searchable_name: str
    # Brand tokens: known-brand hits, else tokens guessed from the title's lead.
    brand_tokens: frozenset[str]
    # Key of PRODUCT_TYPES matched in the searchable name, or None.
    product_type: Optional[str]
    # Every token of the searchable name.
    tokens: frozenset[str]
    # Tokens minus generic / numeric / brand tokens, plus extracted model tokens.
    core_tokens: frozenset[str]
    # Distinct volumes found in the title, in millilitres, ascending.
    volumes_ml: tuple[float, ...]
    # Distinct weights found in the title, in grams, ascending.
    weights_g: tuple[float, ...]
    # Distinct dosages found in the title, in milligrams, ascending.
    dosages_mg: tuple[float, ...]
    # Distinct (count, unit) pairs; unit is "" when no count unit was captured.
    counts: tuple[tuple[int, str], ...]
    # Total piece count derived by `_extract_specs`, or None when unknown.
    total_piece_count: Optional[int]
@dataclass(frozen=True)
class MatchDiagnostics:
    """Score breakdown and classification labels for one candidate match."""

    score: float
    brand_score: float
    token_score: float
    spec_score: float
    sequence_score: float
    type_score: float
    price_penalty: float
    hard_veto: bool
    reasons: tuple[str, ...]
    comparison_mode: str = "exact_identity"
    match_type: str = "exact"
    price_basis: str = "total_price"
    alert_tier: str = "price_alert_exact"
    evidence_flags: tuple[str, ...] = ()
    identity_evidence: dict[str, object] = field(default_factory=dict)
    offer_evidence: dict[str, object] = field(default_factory=dict)

    @property
    def tags(self) -> list[str]:
        """Return the flat tag list describing this match.

        Order: the fixed "identity_v2" marker, one tag per non-empty
        classification label, threshold tags for brand / spec / veto, then
        one "evidence_*" tag per evidence flag.
        """
        labelled = (
            ("comparison", self.comparison_mode),
            ("match_type", self.match_type),
            ("price_basis", self.price_basis),
            ("alert_tier", self.alert_tier),
        )
        result: list[str] = ["identity_v2"]
        # Empty labels produce no tag at all.
        result.extend(f"{prefix}_{value}" for prefix, value in labelled if value)
        if self.brand_score >= 0.95:
            result.append("brand_match")
        if self.spec_score >= 0.85:
            result.append("spec_match")
        if self.hard_veto:
            result.append("identity_veto")
        result.extend(f"evidence_{flag}" for flag in self.evidence_flags)
        return result
@dataclass(frozen=True)
class UnitPriceComparison:
    """Result of comparing the unit price of a MOMO offer with a competitor offer."""

    comparable: bool
    reason: str
    unit_label: str = ""
    momo_total_quantity: Optional[float] = None
    competitor_total_quantity: Optional[float] = None
    momo_unit_price: Optional[float] = None
    competitor_unit_price: Optional[float] = None
    unit_gap_amount: Optional[float] = None
    unit_gap_pct: Optional[float] = None
    summary: str = ""

    def as_dict(self) -> dict:
        """Return every field as a plain dict, keyed by field name."""
        exported = (
            "comparable",
            "reason",
            "unit_label",
            "momo_total_quantity",
            "competitor_total_quantity",
            "momo_unit_price",
            "competitor_unit_price",
            "unit_gap_amount",
            "unit_gap_pct",
            "summary",
        )
        return {key: getattr(self, key) for key in exported}
def normalize_product_text(value: str) -> str:
    """Return a canonical lower-case form of a product title.

    Steps: NFKC compatibility folding, accent stripping (NFKD minus combining
    marks), multiplication signs folded to "x", lower-casing, and question
    marks and bracket characters turned into spaces. Whitespace is then
    collapsed and trimmed. A falsy ``value`` yields "".
    """
    text = unicodedata.normalize("NFKC", value or "")
    text = "".join(
        char for char in unicodedata.normalize("NFKD", text)
        if not unicodedata.combining(char)
    )
    # Several `.replace("", ...)` calls used to sit here. An empty search
    # string makes str.replace insert the replacement between every character,
    # which corrupted every title. The needles were evidently full-width /
    # look-alike characters lost from the literals. NFKC above already folds
    # full-width "/", "&", "?", "(" and ")" to ASCII, so only the
    # multiplication-sign folding needs to be spelled out.
    # NOTE(review): the exact look-alike characters originally listed could not
    # be recovered; confirm this set against the repository.
    for multiply_sign in ("×", "✕", "✖", "*"):
        text = text.replace(multiply_sign, "x")
    text = re.sub(r"[\u3000\r\n\t]+", " ", text)
    text = text.lower()
    text = re.sub(r"[?]+", " ", text)
    text = re.sub(r"[【】\[\]{}「」『』]", " ", text)
    text = re.sub(r"[()]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text
def _strip_noise(value: str) -> str:
    """Remove every NOISE_PHRASES entry from ``value`` and tidy whitespace.

    Phrases are applied longest first so that a long phrase is removed whole
    before any shorter phrase it contains. ``value`` is expected to be already
    normalized (lower-case); each removed phrase is replaced by a space.
    """
    text = value
    for phrase in sorted(NOISE_PHRASES, key=len, reverse=True):
        if not phrase:
            # An empty phrase would make str.replace insert a space between
            # every character of the name; skip it.
            continue
        text = text.replace(phrase.lower(), " ")
    return re.sub(r"\s+", " ", text).strip()
def _tokenize(value: str) -> list[str]:
raw_tokens = re.findall(r"[a-z0-9]+|[\u4e00-\u9fff]+", value)
tokens: list[str] = []
for token in raw_tokens:
if len(token) <= 1 and not token.isdigit():
continue
tokens.append(token)
return tokens
def _extract_model_tokens(text: str) -> set[str]:
    """Collect model-number-like tokens (letters followed by 2+ digits).

    Hyphens are dropped from each hit (e.g. "ab-123" -> "ab123"). Spec-like
    tokens such as SPF / PA ratings are ignored, and a hit must be at least
    four characters long and contain both a letter and a digit.
    """
    model_pattern = r"(?<![a-z0-9])([a-z]{1,4}-?[a-z]{0,3}\d{2,}[a-z0-9-]*)(?![a-z0-9])"
    found: set[str] = set()
    for hit in re.finditer(model_pattern, text, re.I):
        candidate = re.sub(r"[^a-z0-9]", "", hit.group(1).lower())
        if _is_spec_like_latin_token(candidate):
            continue
        if len(candidate) < 4:
            continue
        if re.search(r"[a-z]", candidate) and re.search(r"\d", candidate):
            found.add(candidate)
    return found
def _is_spec_like_latin_token(token: str) -> bool:
return bool(
re.fullmatch(r"spf\d{1,3}[a-z]?", token)
or re.fullmatch(r"pa\d*", token)
or token in {"uva", "uvb", "uv", "spf"}
)
def _brand_alias_present(text: str, alias_norm: str, text_tokens: set[str]) -> bool:
if not alias_norm:
return False
if re.search(r"[\u4e00-\u9fff]", alias_norm):
return alias_norm in text
if " " not in alias_norm and alias_norm in text_tokens:
return True
if re.fullmatch(r"[a-z0-9][a-z0-9\s'&.-]*", alias_norm):
pattern = r"(?<![a-z0-9])" + re.escape(alias_norm).replace(r"\ ", r"\s+") + r"(?![a-z0-9])"
return bool(re.search(pattern, text))
return alias_norm in text
def _known_brand_tokens(text: str) -> set[str]:
    """Return brand tokens for every known brand alias found in ``text``.

    Alias tables come from ``services.price_comparison`` when it can be
    imported, overlaid with BRAND_ALIAS_OVERRIDES. For each alias present in
    the text, the result gains the canonical brand, the alias's own tokens and
    the tokens of every alias in the same group. One- and two-letter Latin
    tokens are dropped from the alias-derived tokens because they collide with
    ordinary words. Members of GENERIC_TOKENS are removed from the result.
    """

    def _alias_words(alias_text: str) -> set[str]:
        # Tokens of an alias, minus 1-2 letter Latin fragments.
        return {
            word for word in _tokenize(alias_text)
            if not re.fullmatch(r"[a-z]{1,2}", word)
        }

    tokens: set[str] = set()
    try:
        from services.price_comparison import BRAND_ALIASES, BRAND_NORMALIZE_MAP
    except Exception:
        # Deliberate best effort: without the shared tables we still match
        # against the local overrides.
        BRAND_ALIASES = {}
        BRAND_NORMALIZE_MAP = {}
    alias_map = dict(BRAND_NORMALIZE_MAP)
    alias_groups = {canonical: list(aliases) for canonical, aliases in BRAND_ALIASES.items()}
    for canonical, aliases in BRAND_ALIAS_OVERRIDES.items():
        alias_groups.setdefault(canonical, []).extend(aliases)
        alias_map[canonical.lower()] = canonical
        for alias in aliases:
            alias_map[alias.lower()] = canonical
    # A set gives O(1) membership for every alias probed below and matches the
    # `set[str]` parameter of `_brand_alias_present` (this used to be a list).
    text_tokens = set(_tokenize(text))
    for alias, canonical in alias_map.items():
        alias_norm = normalize_product_text(alias)
        if not _brand_alias_present(text, alias_norm, text_tokens):
            continue
        tokens.add(canonical)
        tokens.update(_alias_words(alias_norm))
        for related in alias_groups.get(canonical, []):
            tokens.update(_alias_words(normalize_product_text(related)))
    return {token for token in tokens if token and token not in GENERIC_TOKENS}
def _leading_brand_tokens(original: str, normalized: str) -> set[str]:
    """Guess brand tokens from the start of a title when no known brand matched.

    Sources, all unioned:
    * a leading "【...】" / "[...]" segment of the original title, unless it
      contains a NON_BRAND_BRACKET_PHRASES entry (promotion text);
    * the first token of the normalized title when it is 2-6 CJK characters;
    * every Latin token of 3+ characters within the first 48 characters of the
      normalized title that is not an SPF / PA / UV style spec.
    """
    tokens: set[str] = set()
    bracket_match = re.match(r"\s*[【\[]([^】\]]{2,40})[】\]]", original or "")
    if bracket_match:
        content = normalize_product_text(bracket_match.group(1))
        # Skip empty phrases: "" is a substring of every string and would veto
        # every bracket segment.
        is_promo = any(phrase and phrase in content for phrase in NON_BRAND_BRACKET_PHRASES)
        if not is_promo:
            tokens.update(
                token for token in _tokenize(_strip_noise(content))
                if token not in GENERIC_TOKENS
            )
    # Tokenize the leading window once and reuse it for both checks below.
    leading_tokens = _tokenize(normalized[:48])
    if leading_tokens:
        first_token = leading_tokens[0]
        if re.fullmatch(r"[\u4e00-\u9fff]{2,6}", first_token) and first_token not in GENERIC_TOKENS:
            tokens.add(first_token)
    for token in leading_tokens:
        if re.fullmatch(r"[a-z][a-z0-9\-']{2,}", token) and not _is_spec_like_latin_token(token):
            tokens.add(token)
    return tokens
def _extract_product_type(text: str) -> Optional[str]:
    """Return the first PRODUCT_TYPES key with an alias occurring in ``text``.

    Types are tried in the dict's insertion order; ``None`` means no alias
    matched. ``text`` is expected to be normalized (lower-case).
    """
    for product_type, aliases in PRODUCT_TYPES.items():
        # Ignore empty aliases: "" is a substring of every string and would
        # make this type match unconditionally.
        if any(alias and alias.lower() in text for alias in aliases):
            return product_type
    return None
def _convert_volume(value: str, unit: str) -> Optional[tuple[str, float]]:
try:
number = float(value)
except (TypeError, ValueError):
return None
unit = unit.lower()
if unit in {"ml", "毫升"}:
return ("ml", number)
if unit == "l":
return ("ml", number * 1000)
if unit in {"g", "公克"}:
return ("g", number)
if unit == "kg":
return ("g", number * 1000)
if unit in {"mg", "毫克"}:
return ("mg", number)
if unit in {"mcg", "μg", "ug", "微克"}:
return ("mg", number / 1000)
return None
def _count_unit_family(unit: str) -> str:
    """Map a count unit to its family name; units without a family map to themselves."""
    if unit in COUNT_UNIT_FAMILIES:
        return COUNT_UNIT_FAMILIES[unit]
    return unit
def _extract_specs(
    text: str,
) -> tuple[tuple[float, ...], tuple[float, ...], tuple[float, ...], tuple[tuple[int, str], ...], Optional[int]]:
    """Extract quantity specs from a normalized product title.

    Returns ``(volumes_ml, weights_g, dosages_mg, counts, total_piece_count)``:
    the first three are ascending tuples of distinct amounts in base units,
    ``counts`` is a sorted tuple of distinct ``(count, unit)`` pairs (unit is
    "" when none was captured), and ``total_piece_count`` is ``None`` when it
    cannot be derived.
    """
    volumes_ml: list[float] = []
    weights_g: list[float] = []
    dosages_mg: list[float] = []
    # Pass 1: "<number><unit>" measurements, bucketed by their base unit.
    for match in re.finditer(r"(\d+(?:\.\d+)?)\s*(ml|毫升|l|g|公克|kg|mg|毫克|mcg|μg|ug|微克)", text, re.I):
        converted = _convert_volume(match.group(1), match.group(2))
        if not converted:
            continue
        unit, number = converted
        if unit == "ml":
            volumes_ml.append(number)
        elif unit == "g":
            weights_g.append(number)
        else:
            dosages_mg.append(number)
    counts: list[tuple[int, str]] = []
    # Pass 2: Arabic-digit counts followed by a count unit ("3入").
    for match in re.finditer(rf"(\d+)\s*({COUNT_UNIT_PATTERN})", text):
        counts.append((int(match.group(1)), match.group(2)))
    # Pass 3: Chinese-numeral counts followed by a count unit ("兩入").
    for match in re.finditer(rf"([一二兩雙三四五六七八九十])\s*({COUNT_UNIT_PATTERN})", text):
        counts.append((CHINESE_COUNT[match.group(1)], match.group(2)))
    # Pass 4: multiplier notation ("x3", "乘3"), with or without a count unit.
    for match in re.finditer(rf"(?:x|乘)\s*(\d+)\s*({COUNT_UNIT_PATTERN})?", text, re.I):
        unit = match.group(2) or ""
        if not match.group(2):
            # A bare "x<digits>" is ambiguous, so look at the neighbouring text.
            prefix = text[max(0, match.start() - 4):match.start()].strip().lower()
            suffix = text[match.end():match.end() + 1]
            spec_prefixed = bool(re.search(r"(?:ml|毫升|g|公克|kg|mg|oz)$", prefix))
            # The "x" directly follows a Latin letter that is not a
            # measurement unit, so it is treated as part of a word.
            if re.search(r"[a-z]$", prefix) and not spec_prefixed:
                continue
            # A CJK character right after the digits (and no measurement
            # before the "x") is not treated as a pack multiplier.
            if suffix and re.match(r"[\u4e00-\u9fff]", suffix) and not spec_prefixed:
                continue
        counts.append((int(match.group(1)), unit))
    # Pass 5: English count units ("3 pcs"); recorded with an empty unit.
    for match in re.finditer(rf"(\d+)\s*{ENGLISH_COUNT_UNIT_RE}", text, re.I):
        counts.append((int(match.group(1)), ""))
    # "N 送 M" (get M free) offers count as N + M items.
    # NOTE(review): this pattern begins with a bare `\s*`; it looks like a
    # leading character (presumably "買") was lost in extraction — confirm.
    buy_get = re.search(r"\s*(\d+|[一二兩雙三四五六七八九十])\s*送\s*(\d+|[一二兩雙三四五六七八九十])", text)
    if buy_get:
        # `_count_text_value` is defined outside the visible part of the file.
        total_count = (_count_text_value(buy_get.group(1)) or 0) + (_count_text_value(buy_get.group(2)) or 0)
        if total_count > 1:
            counts.append((total_count, ""))
    if "買一送一" in text or "買1送1" in text:
        counts.append((2, ""))
    total_piece_count = None
    # NOTE(review): this pattern also begins with a bare `\s*`; given the
    # variable name, a leading "total" marker (presumably "共") may have been
    # lost in extraction. As written it matches the first "<digits><piece
    # unit>" anywhere in the text — confirm.
    explicit_total = re.search(r"\s*(\d+)\s*([包袋片顆粒錠枚])", text)
    if explicit_total:
        total_piece_count = int(explicit_total.group(1))
    else:
        piece_counts = [count for count, unit in counts if unit in PIECE_UNITS]
        container_counts = [count for count, unit in counts if unit in CONTAINER_UNITS]
        if piece_counts and container_counts:
            # Largest per-container piece count times the largest container count.
            total_piece_count = max(piece_counts) * max(container_counts)
        elif piece_counts:
            total_piece_count = max(piece_counts)
    unique_counts = tuple(sorted(set(counts)))
    return (
        tuple(sorted(set(volumes_ml))),
        tuple(sorted(set(weights_g))),
        tuple(sorted(set(dosages_mg))),
        unique_counts,
        total_piece_count,
    )
def parse_product_identity(name: str) -> ProductIdentity:
    """Parse a raw product title into a `ProductIdentity`.

    Brand tokens come from the known-brand tables when any alias matches, and
    otherwise from a guess based on the start of the title. Core tokens are
    the searchable tokens minus generic words, pure numbers, number+unit
    tokens and brand tokens, plus any model-number tokens. Specs are read from
    the normalized title, before noise stripping.
    """
    normalized = normalize_product_text(name)
    searchable = _strip_noise(normalized)
    tokens = set(_tokenize(searchable))
    product_type = _extract_product_type(searchable)

    brand_tokens = _known_brand_tokens(searchable)
    if not brand_tokens:
        brand_tokens = _leading_brand_tokens(name, normalized)

    core_tokens: set[str] = set()
    for token in tokens:
        if token in GENERIC_TOKENS or token.isdigit():
            continue
        if re.fullmatch(r"\d+(ml|g|kg|l|mg|mcg|ug)?", token):
            continue
        core_tokens.add(token)
    core_tokens.difference_update(brand_tokens)
    core_tokens |= _extract_model_tokens(searchable)

    volumes_ml, weights_g, dosages_mg, counts, total_piece_count = _extract_specs(normalized)
    return ProductIdentity(
        original_name=name or "",
        normalized_name=normalized,
        searchable_name=searchable,
        brand_tokens=frozenset(brand_tokens),
        product_type=product_type,
        tokens=frozenset(tokens),
        core_tokens=frozenset(core_tokens),
        volumes_ml=volumes_ml,
        weights_g=weights_g,
        dosages_mg=dosages_mg,
        counts=counts,
        total_piece_count=total_piece_count,
    )
def _weighted_token_score(left: ProductIdentity, right: ProductIdentity) -> float:
def expand_tokens(identity: ProductIdentity) -> set[str]:
tokens = set(identity.brand_tokens | identity.core_tokens)
for token in identity.core_tokens:
chinese = "".join(char for char in token if "\u4e00" <= char <= "\u9fff")
if len(chinese) >= 3:
tokens.update(f"zh:{chinese[i:i + 2]}" for i in range(len(chinese) - 1))
return tokens
left_tokens = expand_tokens(left)
right_tokens = expand_tokens(right)
if not left_tokens or not right_tokens:
return SequenceMatcher(None, left.searchable_name, right.searchable_name).ratio() * 0.6
def weight(token: str) -> float:
if token in left.brand_tokens or token in right.brand_tokens:
return 1.4
if token.startswith("zh:"):
return 0.55
if re.search(r"\d", token):
return 1.2
if len(token) >= 4:
return 1.25
return 1.0
overlap = left_tokens & right_tokens
overlap_weight = sum(weight(token) for token in overlap)
total_weight = sum(weight(token) for token in left_tokens) + sum(weight(token) for token in right_tokens)
dice = (2 * overlap_weight / total_weight) if total_weight else 0
sequence = SequenceMatcher(None, " ".join(sorted(left_tokens)), " ".join(sorted(right_tokens))).ratio()
return min(1.0, dice * 0.72 + sequence * 0.28)
def _brand_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, bool, str | None]:
if not left.brand_tokens or not right.brand_tokens:
return 0.55, False, None
if left.brand_tokens & right.brand_tokens:
return 1.0, False, None
maquillage_anchor = "心機星魅蜜光圈潤唇膏"
left_has_shiseido = bool({"shiseido", "資生堂"} & left.brand_tokens)
right_has_shiseido = bool({"shiseido", "資生堂"} & right.brand_tokens)
left_has_maquillage = bool({"maquillage", "心機彩妝"} & left.brand_tokens)
right_has_maquillage = bool({"maquillage", "心機彩妝"} & right.brand_tokens)
if (
maquillage_anchor in left.normalized_name
and maquillage_anchor in right.normalized_name
and ((left_has_shiseido and right_has_maquillage) or (right_has_shiseido and left_has_maquillage))
):
return 1.0, False, None
return 0.0, True, "brand_conflict"
def _close_number(left: float, right: float, tolerance: float = 0.04) -> bool:
denominator = max(abs(left), abs(right), 1.0)
return abs(left - right) / denominator <= tolerance
def _spec_component(left_values: Iterable[float], right_values: Iterable[float]) -> tuple[float, bool]:
    """Score one spec dimension (volume, weight or dosage); return ``(score, conflict)``.

    * both sides empty       -> ``(0.55, False)``
    * exactly one side empty -> ``(0.45, False)``
    * one value on each side -> ``(1.0, False)`` if close, else ``(0.0, True)``
    * otherwise the two sides must hold the same number of values and pair up
      one-to-one within tolerance, else ``(0.0, True)``
    """
    lhs = sorted(set(left_values))
    rhs = sorted(set(right_values))
    if not lhs and not rhs:
        return 0.55, False
    if not lhs or not rhs:
        return 0.45, False

    if len(lhs) == 1 and len(rhs) == 1:
        if _close_number(lhs[0], rhs[0]):
            return 1.0, False
        return 0.0, True

    if len(lhs) != len(rhs):
        return 0.0, True
    # Greedy one-to-one pairing: each left value consumes the first remaining
    # right value within tolerance.
    remaining = list(rhs)
    for value in lhs:
        for position, candidate in enumerate(remaining):
            if _close_number(value, candidate):
                del remaining[position]
                break
        else:
            return 0.0, True
    return 1.0, False
def _has_hard_count_unit_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect the same number counted in incompatible unit kinds.

    True when both sides carry the same count number, the unit families for
    that number are disjoint, and one side uses a piece unit while the other
    uses a container unit.
    """
    if not (left.counts and right.counts):
        return False

    def _families_by_count(counts: tuple[tuple[int, str], ...]) -> dict[int, set[str]]:
        grouped: dict[int, set[str]] = {}
        for quantity, unit in counts:
            grouped.setdefault(quantity, set()).add(_count_unit_family(unit))
        return grouped

    lhs = _families_by_count(left.counts)
    rhs = _families_by_count(right.counts)
    for quantity in set(lhs) & set(rhs):
        lhs_units = lhs[quantity]
        rhs_units = rhs[quantity]
        if lhs_units & rhs_units:
            continue
        piece_vs_container = bool(lhs_units & PIECE_UNITS) and bool(rhs_units & CONTAINER_UNITS)
        container_vs_piece = bool(rhs_units & PIECE_UNITS) and bool(lhs_units & CONTAINER_UNITS)
        if piece_vs_container or container_vs_piece:
            return True
    return False
def _allow_catalog_count_omission(left: ProductIdentity, right: ProductIdentity) -> bool:
"""Allow catalog-side piece counts for Dashing Diva nail lines when MOMO omits pack count."""
left_has_counts = bool(left.counts)
right_has_counts = bool(right.counts)
if left_has_counts == right_has_counts:
return False
shared_brand_tokens = {token.lower() for token in left.brand_tokens} & {
token.lower() for token in right.brand_tokens
}
if not ({"dashing", "diva"} <= shared_brand_tokens):
return False
searchable_pair = f"{left.searchable_name} {right.searchable_name}"
if "美甲片" not in searchable_pair:
return False
counted = left if left_has_counts else right
omitted = right if left_has_counts else left
if omitted.counts:
return False
if (counted.total_piece_count or 0) < 20:
return False
return any(
anchor in searchable_pair
for anchor in ("時尚潮流美甲片", "頂級璀燦美甲片", "薄型經典美甲片")
)
def _count_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, bool]:
    """Score pack-count agreement; return ``(score, conflict)``.

    The checks run in strict priority order and the first one that applies
    decides the result, so their order must not be changed.
    """
    left_counts = [count for count, _unit in left.counts]
    right_counts = [count for count, _unit in right.counts]
    # Group counts by unit family so equivalent units compare together.
    left_by_unit: dict[str, set[int]] = {}
    right_by_unit: dict[str, set[int]] = {}
    for count, unit in left.counts:
        left_by_unit.setdefault(_count_unit_family(unit), set()).add(count)
    for count, unit in right.counts:
        right_by_unit.setdefault(_count_unit_family(unit), set()).add(count)
    # 1) Both totals known: equal is a full match; a ratio of 1.5x or more is
    #    a conflict; anything in between is a weak score.
    if left.total_piece_count and right.total_piece_count:
        if left.total_piece_count == right.total_piece_count:
            return 1.0, False
        ratio = max(left.total_piece_count, right.total_piece_count) / max(min(left.total_piece_count, right.total_piece_count), 1)
        return (0.0, True) if ratio >= 1.5 else (0.45, False)
    # 2) A unit family present on both sides must carry identical count sets.
    for unit in set(left_by_unit) & set(right_by_unit):
        if left_by_unit[unit] != right_by_unit[unit]:
            return 0.0, True
    # 3) Both sides have counts.
    if left.counts and right.counts:
        if set(left.counts) & set(right.counts):
            return 0.85, False
        if _has_hard_count_unit_conflict(left, right):
            return 0.0, True
        if left_counts and right_counts:
            # Compare the largest count on each side.
            ratio = max(max(left_counts), max(right_counts)) / max(min(max(left_counts), max(right_counts)), 1)
            if ratio >= 1.5:
                return 0.0, True
        return 0.35, False
    # 4) Only one side has counts: tolerated for the special-cased catalog
    #    lines, otherwise any count above 1 is a conflict.
    if _allow_catalog_count_omission(left, right):
        return 0.55, False
    if (left_counts and max(left_counts) > 1) or (right_counts and max(right_counts) > 1):
        return 0.0, True
    # 5) No multi-pack evidence on either side.
    return 0.5, False
def _has_exact_count_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
if not left.counts or not right.counts:
return False
left_counts = sorted(count for count, _ in left.counts)
right_counts = sorted(count for count, _ in right.counts)
return left_counts == right_counts
def _has_pack_quantity_difference(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Tell whether the two listings differ in pack quantity.

    False when either side has no counts or the count numbers align exactly.
    When both total piece counts are known they decide the answer; otherwise
    any shared count-unit family with different count sets is a difference.
    """
    if not left.counts or not right.counts:
        return False
    if _has_exact_count_alignment(left, right):
        return False
    if left.total_piece_count and right.total_piece_count:
        return left.total_piece_count != right.total_piece_count

    def _counts_by_family(identity: ProductIdentity) -> dict[str, set[int]]:
        # Only recognised count units take part; others (e.g. "") are ignored.
        grouped: dict[str, set[int]] = {}
        for quantity, unit in identity.counts:
            family = _count_unit_family(unit)
            if family in COUNT_UNITS or unit in COUNT_UNITS:
                grouped.setdefault(family, set()).add(quantity)
        return grouped

    lhs = _counts_by_family(left)
    rhs = _counts_by_family(right)
    return any(lhs[family] != rhs[family] for family in set(lhs) & set(rhs))
# Groups of named bundle components. Within a group,
# `_has_named_component_quantity_conflict` compares the per-component
# quantities of two listings that both name at least two of the components.
NAMED_COMPONENT_QUANTITY_GROUPS = (
    ("嬰兒沐浴精", "嬰幼童洗髮精"),
    ("魅惑麋香", "湛藍海洋", "花妍巧語", "絲絨玫瑰"),
)
def _named_component_quantity_map(identity: ProductIdentity, terms: Iterable[str]) -> dict[str, int]:
    """Map each named component found in the listing name to its quantity.

    Returns an empty dict unless at least two of ``terms`` occur in
    ``identity.searchable_name``. For every occurring term, the 28 characters
    following its first occurrence are scanned for an ``x N`` / ``乘 N``
    multiplier (optionally preceded by a volume/weight spec). If every term has
    an explicit multiplier, that mapping is returned. If none has one, but the
    largest pack count equals the number of terms and the name contains a
    component separator, each term is assumed to appear once.
    """
    name = identity.searchable_name
    found = tuple(term for term in terms if term in name)
    if len(found) < 2:
        return {}

    explicit: dict[str, int] = {}
    for term in found:
        start = name.find(term)
        if start < 0:
            continue
        window_start = start + len(term)
        window = name[window_start:window_start + 28]
        hit = re.search(
            r"(?:\d+(?:\.\d+)?\s*(?:ml|g|mg|毫升|公克|毫克))?\s*(?:x|乘)\s*(\d+)",
            window,
            flags=re.I,
        )
        if hit is not None:
            explicit[term] = int(hit.group(1))
    if len(explicit) == len(found):
        return explicit

    pack_sizes = [
        amount
        for amount, unit in identity.counts
        if _count_unit_family(unit) in COUNT_UNITS or unit in COUNT_UNITS
    ]
    one_of_each = (
        not explicit
        and pack_sizes
        and max(pack_sizes) == len(found)
        and re.search(r"[+//、]", name)
    )
    if one_of_each:
        return dict.fromkeys(found, 1)
    return {}
def _has_named_component_quantity_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Bundles with the same named components but swapped quantities are not the same priced item."""
    for group in NAMED_COMPONENT_QUANTITY_GROUPS:
        left_map = _named_component_quantity_map(left, group)
        right_map = _named_component_quantity_map(right, group)
        common = left_map.keys() & right_map.keys()
        # A conflict needs at least two shared components to compare.
        if len(common) >= 2 and any(left_map[term] != right_map[term] for term in common):
            return True
    return False
def _spec_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, bool, tuple[str, ...]]:
    """Combine volume, weight, dosage and count agreement into one spec score.

    Returns ``(score, has_conflict, conflict_labels)``. The score is the mean of
    the component scores for every dimension that at least one side specifies;
    when no dimension is specified at all the result is ``(0.55, False, ())``.
    """
    volume_score, volume_conflict = _spec_component(left.volumes_ml, right.volumes_ml)
    weight_score, weight_conflict = _spec_component(left.weights_g, right.weights_g)
    dosage_score, dosage_conflict = _spec_component(left.dosages_mg, right.dosages_mg)
    count_score, count_conflict = _count_score(left, right)

    # (dimension present on either side, component score, conflict flag, label)
    dimensions = (
        (left.volumes_ml or right.volumes_ml, volume_score, volume_conflict, "volume_conflict"),
        (left.weights_g or right.weights_g, weight_score, weight_conflict, "weight_conflict"),
        (left.dosages_mg or right.dosages_mg, dosage_score, dosage_conflict, "dosage_conflict"),
        (left.counts or right.counts, count_score, count_conflict, "count_conflict"),
    )
    applicable = [component for present, component, _, _ in dimensions if present]
    if not applicable:
        return 0.55, False, ()

    labels = tuple(label for _, _, conflicted, label in dimensions if conflicted)
    return sum(applicable) / len(applicable), bool(labels), labels
def _has_bundle_offer(identity: ProductIdentity) -> bool:
    """Return True when the normalized name advertises a "N 送 M" style offer or a known bundle phrase."""
    name = identity.normalized_name
    if re.search(r"\s*\d+\s*送\s*\d+", name):
        return True
    if re.search(r"\s*[一二兩雙三四五六七八九十]\s*送\s*[一二兩雙三四五六七八九十]", name):
        return True
    if "買一送一" in name:
        return True
    return any(phrase in name for phrase in BUNDLE_OFFER_PHRASES)
def _has_multi_component(identity: ProductIdentity) -> bool:
    """Return True when the name reads as several components or a ``<spec> x N`` pack.

    The check runs on the cleaned text from ``_component_separator_text`` so
    that pluses belonging to SPF/PA ratings, adjacent specs or model codes are
    not counted as component separators.

    Fix: the previous test ``"" in text`` is true for every string, which made
    this function return True unconditionally. It now tests for the fullwidth
    plus sign instead.
    """
    text = _component_separator_text(identity)
    return bool(
        "+" in text
        or "＋" in text
        or re.search(r"\d+\s*(?:ml|g|mg|毫升|公克|毫克)\s*x\s*\d+", text, re.I)
    )


def _component_separator_text(identity: ProductIdentity) -> str:
    """Return the normalized name with non-separator plus signs removed.

    Plus signs (ASCII or fullwidth) are stripped or replaced by a space when
    they belong to an ``spf50+`` rating, a ``pa+++`` rating, two adjacent
    volume/weight specs (``50ml+30ml``), or two adjacent short model codes
    (``b5+b6``). Any plus that remains is treated by callers as a component
    separator.
    """
    text = identity.normalized_name
    text = re.sub(r"spf\s*(\d+)\s*[+＋]+", r"spf\1", text, flags=re.I)
    text = re.sub(r"pa\s*[+＋]{1,5}", "pa", text, flags=re.I)
    text = re.sub(
        r"(\d+(?:\.\d+)?\s*(?:ml|g|mg|毫升|公克|毫克))\s*[+＋]\s*"
        r"(\d+(?:\.\d+)?\s*(?:ml|g|mg|毫升|公克|毫克))",
        r"\1 \2",
        text,
        flags=re.I,
    )
    text = re.sub(
        r"\b[a-z]{1,6}\d{1,6}\s*[+＋]\s*[a-z]{1,6}\d{1,6}\b",
        lambda match: re.sub(r"[+＋]", " ", match.group(0)),
        text,
        flags=re.I,
    )
    return text


def _multi_component_count(identity: ProductIdentity) -> int:
    """Count plus-separated components in the cleaned name; 1 when there is no real split.

    Segments that are empty or consist only of digits, whitespace and hyphens
    are not counted as components.
    """
    text = _component_separator_text(identity)
    if "+" not in text and "＋" not in text:
        return 1
    parts = [
        part.strip()
        for part in re.split(r"[+＋]", text)
        if part.strip() and not re.fullmatch(r"[\s\d-]+", part.strip())
    ]
    return len(parts) if len(parts) > 1 else 1
def _repeated_single_spec_count(identity: ProductIdentity) -> Optional[int]:
    """Return N for a name containing exactly one ``<spec> x N`` mention with N > 1, else None."""
    multipliers = re.findall(
        r"\d+(?:\.\d+)?\s*(?:ml|g|mg|毫升|公克|毫克)\s*x\s*(\d+)",
        _component_separator_text(identity),
        flags=re.I,
    )
    # Zero or several "spec x N" mentions are ambiguous; only a single one counts.
    if len(multipliers) != 1:
        return None
    (raw,) = multipliers
    try:
        repeat = int(raw)
    except (TypeError, ValueError):
        return None
    if repeat > 1:
        return repeat
    return None
def _refill_piece_count(identity: ProductIdentity) -> Optional[int]:
    """Return the largest count whose unit belongs to the "refill" family, or None if there is none."""
    return max(
        (amount for amount, unit in identity.counts if _count_unit_family(unit) == "refill"),
        default=None,
    )
def _has_cushion_refill_pack_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Align cushion compact refill wording such as `一盒兩蕊` with `15g x2`.

    Only applies when both sides are typed as cushion compacts. One side must
    state a refill count and the other a single repeated spec (``<spec> x N``)
    with the same number, checked in both directions.
    """
    cushion = "氣墊粉餅"
    if not (left.product_type == cushion and right.product_type == cushion):
        return False

    def matches(refill_side: ProductIdentity, spec_side: ProductIdentity) -> bool:
        refills = _refill_piece_count(refill_side)
        repeats = _repeated_single_spec_count(spec_side)
        if not refills or not repeats:
            return False
        if refills != repeats:
            return False
        # NOTE(review): the unit set holds only empty strings here; the intended
        # unit characters may have been lost upstream - confirm against the repo.
        has_multi_box = any(
            unit in {"", "", ""} and amount > 1
            for amount, unit in refill_side.counts
        )
        return not has_multi_box

    return matches(left, right) or matches(right, left)
def _has_paulas_choice_body_lotion_210ml_2pack_alignment(
    left: ProductIdentity,
    right: ProductIdentity,
) -> bool:
    """Align PChome Nick `210ml x2` with MOMO `210ml二入` for the same body lotion.

    Requires a Paula's Choice brand token on either side, a shared 210 ml
    volume, both names mentioning the salicylic-acid body lotion, and a two-pack
    marker on both sides.
    """
    combined_brands = left.brand_tokens | right.brand_tokens
    if not ({"寶拉珍選", "paulas", "choice"} & combined_brands):
        return False
    if not _has_shared_volume(left, right, 210):
        return False
    for item in (left, right):
        name = item.searchable_name
        if "水楊酸" not in name or "身體乳" not in name:
            return False

    def is_two_pack(identity: ProductIdentity) -> bool:
        wording = re.search(
            r"(?:x\s*2|2\s*入|二\s*入|兩\s*入|雙\s*入|雙入組|二入組|兩入組)",
            identity.searchable_name,
            re.I,
        )
        if wording:
            return True
        return (2, "") in identity.counts

    return is_two_pack(left) and is_two_pack(right)
def _has_nivea_creme_100ml_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Return True for two NIVEA Creme travel-size listings that share a 100 ml volume.

    Requires a NIVEA brand token on either side, a shared 100 ml volume, and
    both names containing the product phrase and the travel-size phrase.
    """
    combined_brands = left.brand_tokens | right.brand_tokens
    if not ({"nivea", "妮維雅"} & combined_brands):
        return False
    if not _has_shared_volume(left, right, 100):
        return False
    for item in (left, right):
        name = item.searchable_name
        if "妮維雅霜" not in name or "隨身版" not in name:
            return False
    return True
def _has_cetaphil_moisturizer_type_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Treat Cetaphil moisturizer wording variants as the same type only on exact named lines.

    Both sides must share a Cetaphil brand token, the two product types must be
    exactly one lotion and one cream, and both names must contain the same named
    line together with that line's expected shared weight.
    """
    shared_brands = left.brand_tokens & right.brand_tokens
    if not ({"cetaphil", "舒特膚"} & shared_brands):
        return False
    if {left.product_type, right.product_type} != {"乳液", "面霜"}:
        return False

    left_name = left.searchable_name
    right_name = right.searchable_name
    # (named product line, weight in grams both sides must share)
    named_lines = (
        ("長效潤膚霜", 250),
        ("益膚康修護舒敏乳霜", 227),
    )
    for phrase, grams in named_lines:
        if phrase in left_name and phrase in right_name:
            return _has_shared_weight(left, right, grams)
    return False
def _has_refill_pack(identity: ProductIdentity) -> bool:
    """Return True when the normalized name contains any refill / replacement-core keyword."""
    name = identity.normalized_name
    refill_markers = (
        "補充瓶",
        "補充包",
        "補充芯",
        "補充蕊",
        "替換蕊",
        "替換芯",
        "refill",
    )
    return any(marker in name for marker in refill_markers)
def _has_accessory_case(identity: ProductIdentity) -> bool:
    """Return True when the normalized name contains one of the case / empty-box keywords."""
    name = identity.normalized_name
    case_markers = ("眉彩餅盒", "盒一入款", "盒三入款", "盒單入", "空盒")
    return any(marker in name for marker in case_markers)
def _spec_mention_count(identity: ProductIdentity) -> int:
    """Count how many number-plus-unit mentions (volume, weight or dosage) the normalized name contains."""
    mentions = re.finditer(
        r"\d+(?:\.\d+)?\s*(?:ml|毫升|l|g|公克|kg|mg|毫克|mcg|μg|ug|微克)",
        identity.normalized_name,
        re.I,
    )
    return sum(1 for _ in mentions)
def _count_text_value(value: str) -> Optional[int]:
    """Convert a count written as decimal digits or as a Chinese numeral to an int.

    Returns None when ``value`` is neither a decimal number nor a key of
    ``CHINESE_COUNT``.

    Fix: ``str.isdigit()`` also accepts characters such as superscript digits
    ("²") that ``int()`` rejects with ``ValueError``. ``str.isdecimal()`` accepts
    exactly the characters ``int()`` can parse, so such input now falls through
    to the lookup instead of raising.
    """
    if value.isdecimal():
        return int(value)
    return CHINESE_COUNT.get(value)
def _pack_multiplier(identity: ProductIdentity) -> int:
    """Return how many units the listing bundles together (1 when nothing indicates a pack).

    Checked in order: an "N 送 M" offer (N + M), the literal buy-one-get-one
    phrases (2), an "N 件組" set, then the largest parsed count above 1 whose
    unit is a known count unit.
    """
    name = identity.normalized_name

    offer = re.search(r"\s*(\d+|[一二兩雙三四五六七八九十])\s*送\s*(\d+|[一二兩雙三四五六七八九十])", name)
    if offer is not None:
        bought = _count_text_value(offer.group(1)) or 0
        gifted = _count_text_value(offer.group(2)) or 0
        if bought + gifted > 1:
            return bought + gifted

    if "買一送一" in name or "買1送1" in name:
        return 2

    piece_set = re.search(r"(\d+|[一二兩雙三四五六七八九十])\s*件\s*組", name)
    if piece_set is not None:
        pieces = _count_text_value(piece_set.group(1)) or 0
        if pieces > 1:
            return pieces

    return max(
        (amount for amount, unit in identity.counts if unit in COUNT_UNITS and amount > 1),
        default=1,
    )
def _has_overlapping_base_spec(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Return True when both sides state exactly one base spec and the two values are close.

    Volume is checked first; weight is only consulted when neither side states
    a volume. A dimension stated on one side only, or with more than one
    distinct value on either side, never overlaps.
    """
    dimensions = (
        (left.volumes_ml, right.volumes_ml),
        (left.weights_g, right.weights_g),
    )
    for left_values, right_values in dimensions:
        left_distinct = sorted(set(left_values))
        right_distinct = sorted(set(right_values))
        if not left_distinct and not right_distinct:
            # Neither side states this dimension; try the next one.
            continue
        if len(left_distinct) != 1 or len(right_distinct) != 1:
            return False
        return _close_number(left_distinct[0], right_distinct[0])
    return False
def _single_unit_total(identity: ProductIdentity) -> tuple[Optional[str], Optional[float], str]:
    """Return ``(unit_label, total_quantity, status)`` for unit-price normalization.

    ``status`` is ``"ok"`` when a total could be derived; otherwise the unit and
    total are None and ``status`` names the reason: both volume and weight are
    present, more than one distinct spec value, or no usable quantity at all.
    """
    distinct_volumes = sorted(set(identity.volumes_ml))
    distinct_weights = sorted(set(identity.weights_g))

    if distinct_volumes and distinct_weights:
        return None, None, "mixed_volume_weight"
    if max(len(distinct_volumes), len(distinct_weights)) > 1:
        return None, None, "multi_spec_component"

    if distinct_volumes:
        return "ml", distinct_volumes[0] * _pack_multiplier(identity), "ok"
    if distinct_weights:
        # Weight totals prefer the parsed total piece count over the pack multiplier.
        pieces = identity.total_piece_count or _pack_multiplier(identity)
        return "g", distinct_weights[0] * pieces, "ok"
    if identity.total_piece_count:
        return "", float(identity.total_piece_count), "ok"
    return None, None, "missing_single_unit"
def build_unit_price_comparison(
    momo_name: str,
    competitor_name: str,
    momo_price: Optional[float],
    competitor_price: Optional[float],
) -> dict:
    """Build deterministic unit-price evidence for unit-comparable candidates.

    Returns the ``as_dict()`` form of a ``UnitPriceComparison``. The result is
    marked not comparable, with a reason string, when the pair is not in the
    ``unit_comparable`` lane, when a single-unit total cannot be derived for
    either side, when the units differ, or when a price is missing, non-numeric
    or not positive.
    """

    def rejected(reason: str) -> dict:
        return UnitPriceComparison(False, reason).as_dict()

    diagnostics = score_marketplace_match(
        momo_name,
        competitor_name,
        momo_price=momo_price,
        competitor_price=competitor_price,
    )
    if diagnostics.comparison_mode != "unit_comparable":
        return rejected(diagnostics.comparison_mode)

    momo_identity = parse_product_identity(momo_name)
    competitor_identity = parse_product_identity(competitor_name)
    momo_unit, momo_quantity, momo_status = _single_unit_total(momo_identity)
    competitor_unit, competitor_quantity, competitor_status = _single_unit_total(competitor_identity)
    if momo_status != "ok" or competitor_status != "ok":
        return rejected(f"{momo_status}:{competitor_status}")
    if momo_unit != competitor_unit or not momo_quantity or not competitor_quantity:
        return rejected("unit_mismatch")

    try:
        momo_amount = float(momo_price or 0)
        competitor_amount = float(competitor_price or 0)
    except (TypeError, ValueError):
        return rejected("invalid_price")
    if momo_amount <= 0 or competitor_amount <= 0:
        return rejected("invalid_price")

    momo_per_unit = momo_amount / momo_quantity
    competitor_per_unit = competitor_amount / competitor_quantity
    gap = momo_per_unit - competitor_per_unit
    if competitor_per_unit:
        gap_pct = gap / competitor_per_unit * 100
    else:
        gap_pct = 0

    summary = (
        f"MOMO ${momo_per_unit:.2f}/{momo_unit} vs "
        f"PChome ${competitor_per_unit:.2f}/{momo_unit} "
        f"({gap_pct:+.1f}%)"
    )
    return UnitPriceComparison(
        comparable=True,
        reason="unit_comparable",
        unit_label=momo_unit,
        momo_total_quantity=round(momo_quantity, 3),
        competitor_total_quantity=round(competitor_quantity, 3),
        momo_unit_price=round(momo_per_unit, 4),
        competitor_unit_price=round(competitor_per_unit, 4),
        unit_gap_amount=round(gap, 4),
        unit_gap_pct=round(gap_pct, 2),
        summary=summary,
    ).as_dict()
def _is_unit_comparable_candidate(
    left: ProductIdentity,
    right: ProductIdentity,
    token_score: float,
    chinese_name_score: float,
    brand_conflict: bool,
    type_score: float,
    reasons: Iterable[str],
) -> bool:
    """Identify the same core product sold in different packs.

    Such pairs are not safe exact matches. They may only enter a normalized
    unit-price review lane; otherwise a bundle price could be compared against
    a single-item price.
    """
    observed = set(reasons)
    pack_reasons = {
        "bundle_offer_conflict",
        "multi_component_conflict",
        "count_conflict",
        "component_count_conflict",
        "pack_quantity_difference",
    }
    # Without a pack-level difference there is nothing to normalize.
    if observed.isdisjoint(pack_reasons):
        return False

    disqualifying = {"brand_conflict", "refill_pack_conflict", "type_conflict"}
    if brand_conflict or type_score == 0.0 or observed & disqualifying:
        return False
    if not _has_overlapping_base_spec(left, right):
        return False

    weak_name_evidence = token_score < 0.45 and chinese_name_score < 0.28
    if weak_name_evidence:
        return False
    line_conflict_unresolved = "product_line_conflict" in observed and token_score < 0.72
    return not line_conflict_unresolved
def _chinese_bigram_score(left: ProductIdentity, right: ProductIdentity) -> float:
    """Dice coefficient over CJK character bigrams of the two names.

    Brand tokens, ASCII letters/digits and a fixed list of marketing phrases
    are removed before bigrams are formed. Returns 0.55 when either side has no
    bigrams left.
    """

    def bigrams(identity: ProductIdentity) -> set[str]:
        stripped = identity.searchable_name
        # Remove longer brand tokens first so shorter ones cannot split them.
        for brand in sorted(identity.brand_tokens, key=len, reverse=True):
            stripped = stripped.replace(brand, " ")
        stripped = re.sub(r"[a-z0-9]+", " ", stripped)
        stripped = "".join(char for char in stripped if "\u4e00" <= char <= "\u9fff")
        for phrase in (
            "官方", "直營", "公司貨", "專櫃", "正貨", "原廠", "限定", "獨家",
            "期間", "超值", "特惠", "優惠", "新品", "經典", "人氣", "熱銷",
            "必買", "推薦", "任選", "禮盒", "母親節", "超品日", "多款",
            "", "", "", "", "", "", "", "",
        ):
            stripped = stripped.replace(phrase, "")
        return {first + second for first, second in zip(stripped, stripped[1:])}

    left_bigrams = bigrams(left)
    right_bigrams = bigrams(right)
    if not (left_bigrams and right_bigrams):
        return 0.55
    overlap = len(left_bigrams & right_bigrams)
    return 2 * overlap / (len(left_bigrams) + len(right_bigrams))
def _has_strong_product_line_signal(
    left: ProductIdentity,
    right: ProductIdentity,
    token_score: float,
    chinese_name_score: float,
) -> bool:
    """Return True when shared non-brand core tokens give strong product-line evidence.

    A shared Latin word or model-like token with ``token_score >= 0.50`` is
    enough. Otherwise any shared core token needs ``token_score >= 0.56`` and
    ``chinese_name_score >= 0.45``.
    """
    common = (left.core_tokens & right.core_tokens) - left.brand_tokens - right.brand_tokens

    def looks_like_word_or_model(token: str) -> bool:
        return bool(
            re.fullmatch(r"[a-z][a-z0-9-]{3,}", token)
            or re.fullmatch(r"[a-z]{2,}-?\d+[a-z0-9-]*", token)
        )

    if token_score >= 0.50 and any(looks_like_word_or_model(token) for token in common):
        return True
    return bool(common) and token_score >= 0.56 and chinese_name_score >= 0.45
def _has_safe_exact_spec_signal(
    left: ProductIdentity,
    right: ProductIdentity,
    token_score: float,
    sequence_score: float,
    type_score: float,
) -> bool:
    """Return True when a single shared base spec plus name similarity supports an exact match.

    Requires ``type_score >= 0.55``, at most one spec mention per side, an
    overlapping base spec, and either ``token_score >= 0.42`` or
    ``sequence_score >= 0.50``.
    """
    if type_score < 0.55:
        return False
    # More than one spec mention on either side makes the base spec ambiguous.
    if max(_spec_mention_count(left), 0) > 1 or _spec_mention_count(right) > 1:
        return False
    if not _has_overlapping_base_spec(left, right):
        return False
    names_agree = token_score >= 0.42 or sequence_score >= 0.50
    return names_agree
def _model_line_tokens(identity: ProductIdentity) -> set[str]:
    """Collect tokens that name a model or product line.

    Generic tokens and spec-like Latin tokens are skipped. A remaining token
    contributes itself when it is a Latin word of three or more characters, and
    contributes the CJK text preceding "系列" for every such occurrence (unless
    that text is itself generic).
    """
    collected: set[str] = set()
    for token in identity.core_tokens:
        if token in GENERIC_TOKENS or _is_spec_like_latin_token(token):
            continue
        if re.fullmatch(r"[a-z][a-z0-9-]{2,}", token):
            collected.add(token)
        series_names = (
            found.group(1)
            for found in re.finditer(r"([\u4e00-\u9fff]{2,})(?:系列)", token)
        )
        collected.update(name for name in series_names if name not in GENERIC_TOKENS)
    return collected
def _has_model_line_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Return True when both sides have model/line tokens and share none of them."""
    left_models = _model_line_tokens(left)
    right_models = _model_line_tokens(right)
    if left_models and right_models:
        return left_models.isdisjoint(right_models)
    return False
def _nail_polish_model_codes(identity: ProductIdentity) -> set[str]:
    """Extract normalized nail-polish model codes from a nail product's names.

    Returns an empty set unless the searchable name contains a nail keyword.
    Codes are searched in the lower-cased original and searchable names, and
    dots, underscores and hyphens are stripped from each match.
    """
    nail_keywords = ("指甲油", "指彩", "美甲")
    if not any(keyword in identity.searchable_name for keyword in nail_keywords):
        return set()

    haystack = f"{identity.original_name} {identity.searchable_name}".lower()
    code_patterns = (
        r"(?<![a-z0-9])a10[\._-]?\d{2,3}(?:[\._-]?\d{2,3})?(?![a-z0-9])",
        r"(?<![a-z0-9])isl[a-z0-9]*\d{2,3}(?![a-z0-9])",
    )
    return {
        re.sub(r"[\._-]+", "", found.group(0))
        for pattern in code_patterns
        for found in re.finditer(pattern, haystack)
    }
def _has_nail_polish_model_code_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Return True when both nail products carry model codes and no code is shared."""
    combined_names = f"{left.searchable_name} {right.searchable_name}"
    nail_keywords = ("指甲油", "指彩", "美甲")
    if not any(keyword in combined_names for keyword in nail_keywords):
        return False
    left_codes = _nail_polish_model_codes(left)
    right_codes = _nail_polish_model_codes(right)
    if not left_codes or not right_codes:
        return False
    return left_codes.isdisjoint(right_codes)
def _dedupe_tuple(values: Iterable[str]) -> tuple[str, ...]:
    """Return the truthy values as a tuple, keeping first-seen order and dropping repeats."""
    # dict keys keep insertion order, so this preserves the first occurrence of each value.
    return tuple(dict.fromkeys(value for value in values if value))
def _build_evidence_flags(
    *,
    brand_score: float,
    token_score: float,
    spec_score: float,
    sequence_score: float,
    type_score: float,
    shared_anchor: str,
    shared_models: set[str],
    reasons: Iterable[str],
    catalog_count_omission: bool,
) -> tuple[str, ...]:
    """Summarize which evidence supports or qualifies a match as an ordered tuple of flag names.

    Score-based flags come first, in a fixed order, followed by a fixed
    selection of reason codes that are present in ``reasons``.
    """
    observed = set(reasons)

    # (condition, flag emitted when the condition holds) - order is the output order.
    score_checks = (
        (brand_score >= 0.95, "brand"),
        (spec_score >= 0.85, "spec"),
        (token_score >= 0.72, "tokens"),
        (sequence_score >= 0.70, "name_sequence"),
        (type_score >= 0.95, "product_type"),
        (shared_anchor, "identity_anchor"),
        (shared_models, "model_token"),
        (catalog_count_omission, "catalog_count_omission"),
    )
    flags = [flag for passed, flag in score_checks if passed]

    surfaced_reasons = (
        "unit_comparable",
        "variant_selection_review",
        "variant_option_conflict",
        "variant_descriptor_conflict",
        "pack_quantity_difference",
        "count_conflict",
        "bundle_offer_conflict",
        "multi_component_conflict",
        "multi_component_count_conflict",
        "accessory_case_conflict",
        "refill_pack_conflict",
        "price_ratio_extreme",
        "price_ratio_wide",
    )
    flags.extend(reason for reason in surfaced_reasons if reason in observed)
    return _dedupe_tuple(flags)
def _number_values(values: Iterable[float]) -> list[float | int]:
    """Convert values to JSON-friendly numbers, silently skipping anything non-numeric.

    Whole numbers become ``int``; everything else is rounded to three decimals.
    A falsy ``values`` (including None) yields an empty list.
    """
    numbers: list[float | int] = []
    for raw in values or ():
        try:
            parsed = float(raw)
        except (TypeError, ValueError):
            continue
        if parsed.is_integer():
            numbers.append(int(parsed))
        else:
            numbers.append(round(parsed, 3))
    return numbers
def _count_values(values: Iterable[tuple[int, str]]) -> list[str]:
    """Render distinct ``(count, unit)`` pairs as sorted ``"<count><unit>"`` strings."""
    distinct_pairs = set(values or ())
    rendered = []
    for amount, unit in sorted(distinct_pairs):
        rendered.append(f"{amount}{unit}")
    return rendered
def _identity_spec_payload(identity: ProductIdentity) -> dict[str, object]:
    """Return the parsed specs of one listing as a plain dict for evidence payloads."""
    payload: dict[str, object] = {}
    payload["volumes_ml"] = _number_values(identity.volumes_ml)
    payload["weights_g"] = _number_values(identity.weights_g)
    payload["dosages_mg"] = _number_values(identity.dosages_mg)
    payload["counts"] = _count_values(identity.counts)
    payload["total_piece_count"] = identity.total_piece_count
    return payload
def _spec_mismatch_payload(left: ProductIdentity, right: ProductIdentity) -> list[dict[str, object]]:
    """List spec dimensions where the two listings disagree or only one side states a value.

    A dimension stated on both sides is reported when the two value sets share
    nothing. A dimension stated on one side only is reported with a one-sided
    label and ``needs_review`` set.
    """
    dimensions = (
        ("volume_ml", "容量", _number_values(left.volumes_ml), _number_values(right.volumes_ml)),
        ("weight_g", "重量", _number_values(left.weights_g), _number_values(right.weights_g)),
        ("dosage_mg", "劑量", _number_values(left.dosages_mg), _number_values(right.dosages_mg)),
        ("count", "入數/件數", _count_values(left.counts), _count_values(right.counts)),
    )
    report: list[dict[str, object]] = []
    for field_name, label, momo_values, competitor_values in dimensions:
        if momo_values and competitor_values:
            if set(momo_values).isdisjoint(set(competitor_values)):
                report.append({
                    "field": field_name,
                    "label": label,
                    "momo": momo_values,
                    "competitor": competitor_values,
                })
        elif momo_values or competitor_values:
            # Exactly one side states this dimension.
            report.append({
                "field": field_name,
                "label": f"{label}單側缺漏",
                "momo": momo_values,
                "competitor": competitor_values,
                "needs_review": True,
            })
    return report
def _identity_evidence_payload(
    left: ProductIdentity,
    right: ProductIdentity,
    *,
    brand_score: float,
    token_score: float,
    spec_score: float,
    sequence_score: float,
    type_score: float,
    hard_veto: bool,
    comparison_mode: str,
    match_type: str,
    price_basis: str,
    alert_tier: str,
    shared_anchor: str,
    shared_models: set[str],
    reasons: Iterable[str],
    catalog_count_omission: bool,
) -> dict[str, object]:
    """Assemble the ``identity_evidence_v1`` payload describing why two listings match or not.

    The payload groups the lane decision, rounded component scores, brand and
    product-type comparison, shared tokens (at most 20 core tokens), parsed
    specs with mismatches, and the guardrail reasons that apply.
    """
    observed = set(reasons or ())
    review_reasons = {
        "variant_selection_review",
        "catalog_count_omission",
        "pack_quantity_difference",
        "unit_comparable",
    }
    guardrail_reasons = sorted(
        reason for reason in observed
        if "conflict" in reason or reason in review_reasons
    )

    common_brand = sorted(left.brand_tokens & right.brand_tokens)
    common_core = sorted(
        (left.core_tokens & right.core_tokens) - left.brand_tokens - right.brand_tokens
    )[:20]

    lane = {
        "comparison_mode": comparison_mode,
        "match_type": match_type,
        "price_basis": price_basis,
        "alert_tier": alert_tier,
    }
    confidence_components = {
        "brand_score": round(brand_score, 3),
        "token_score": round(token_score, 3),
        "spec_score": round(spec_score, 3),
        "sequence_score": round(sequence_score, 3),
        "type_score": round(type_score, 3),
    }
    brand = {
        "momo": sorted(left.brand_tokens),
        "competitor": sorted(right.brand_tokens),
        "shared": common_brand,
    }
    types_match = bool(
        left.product_type and right.product_type and left.product_type == right.product_type
    )
    product_type = {
        "momo": left.product_type or "",
        "competitor": right.product_type or "",
        "matched": types_match,
    }
    specs = {
        "momo": _identity_spec_payload(left),
        "competitor": _identity_spec_payload(right),
        "mismatches": _spec_mismatch_payload(left, right),
    }
    variant_guardrails = {
        "hard_veto": bool(hard_veto),
        "conflict_reasons": guardrail_reasons,
        "catalog_count_omission": bool(catalog_count_omission),
    }
    return {
        "version": "identity_evidence_v1",
        "lane": lane,
        "confidence_components": confidence_components,
        "brand": brand,
        "product_type": product_type,
        "identity_anchor": shared_anchor or "",
        "shared_model_tokens": sorted(shared_models),
        "shared_core_tokens": common_core,
        "specs": specs,
        "variant_guardrails": variant_guardrails,
    }
def _offer_evidence_payload(
    momo_price: Optional[float],
    competitor_price: Optional[float],
    *,
    price_penalty: float,
    price_basis: str,
    alert_tier: str,
) -> dict[str, object]:
    """Assemble the ``offer_evidence_v1`` payload describing the price side of a match.

    Prices are optional. Each price that parses to a finite number is included,
    rounded to two decimals. ``gap_amount`` and ``gap_pct`` are added only when
    both prices are usable and the competitor price is positive.

    Fix: the two prices used to be parsed inside one ``try`` block, so a single
    unparsable price discarded the other, valid one. They are now parsed
    independently, and NaN / infinite values are treated as missing so they
    cannot leak into the payload.
    """
    import math  # local import: used only by this helper

    def parse_price(raw: object) -> Optional[float]:
        """Return ``raw`` as a finite float, or None when it is missing or unusable."""
        if raw is None:
            return None
        try:
            value = float(raw)
        except (TypeError, ValueError):
            return None
        return value if math.isfinite(value) else None

    payload: dict[str, object] = {
        "version": "offer_evidence_v1",
        "price_basis": price_basis,
        "alert_tier": alert_tier,
        # Price is recorded as offer evidence only; it never proves identity.
        "price_is_identity_evidence": False,
        "price_penalty": round(price_penalty, 3),
    }
    momo_value = parse_price(momo_price)
    competitor_value = parse_price(competitor_price)
    if momo_value is not None:
        payload["momo_price"] = round(momo_value, 2)
    if competitor_value is not None:
        payload["competitor_price"] = round(competitor_value, 2)
    if momo_value is not None and competitor_value and competitor_value > 0:
        gap = momo_value - competitor_value
        payload["gap_amount"] = round(gap, 2)
        # The denominator is floored at 1, as before, so sub-unit prices do not
        # inflate the percentage.
        payload["gap_pct"] = round(gap / max(competitor_value, 1) * 100, 2)
    return payload
def _has_safe_multi_component_exact_total_price(
    left: ProductIdentity,
    right: ProductIdentity,
    *,
    brand_score: float,
    token_score: float,
    spec_score: float,
    sequence_score: float,
    type_score: float,
    hard_veto: bool,
    variant_descriptor_conflict: bool,
    reasons: Iterable[str],
) -> bool:
    """Allow exact total-price writes for same-component sets, not mixed bundles.

    Both listings must be multi-component, free of every blocking reason, and
    carry strong brand and spec scores. The pair then qualifies either through
    exact count alignment with strong name evidence, or, with a near-perfect
    type score, through looser name evidence backed by a supporting reason.
    """
    if hard_veto or variant_descriptor_conflict:
        return False
    if not _has_multi_component(left) or not _has_multi_component(right):
        return False

    observed = set(reasons)
    blocking = {
        "variant_selection_review",
        "variant_option_conflict",
        "variant_descriptor_conflict",
        "pack_quantity_difference",
        "count_conflict",
        "bundle_offer_conflict",
        "multi_component_conflict",
        "multi_component_count_conflict",
        "commercial_condition_gap",
        "refill_pack_conflict",
        "unit_comparable",
        "price_ratio_extreme",
        "price_ratio_wide",
    }
    if not observed.isdisjoint(blocking):
        return False
    if brand_score < 0.95 or spec_score < 0.85:
        return False

    counts_aligned = _has_exact_count_alignment(left, right)
    base_spec_shared = _has_overlapping_base_spec(left, right)

    if counts_aligned and type_score >= 0.55 and token_score >= 0.80 and sequence_score >= 0.75:
        near_identical_names = (
            token_score >= 0.90
            and sequence_score >= 0.90
            and "strong_product_line_match" in observed
        )
        if base_spec_shared or near_identical_names:
            return True

    if type_score < 0.95:
        return False
    if counts_aligned:
        return token_score >= 0.50 and sequence_score >= 0.50
    supporting = {"strong_exact_spec_match", "shared_model_token", "spec_name_alignment"}
    return (
        token_score >= 0.62
        and sequence_score >= 0.62
        and not observed.isdisjoint(supporting)
    )
def _classify_match_quality(
    *,
    score: float,
    brand_score: float,
    token_score: float,
    spec_score: float,
    sequence_score: float,
    type_score: float,
    hard_veto: bool,
    comparison_mode: str,
    reasons: Iterable[str],
    shared_anchor: str,
    shared_models: set[str],
    catalog_count_omission: bool,
    multi_component_pair: bool,
) -> tuple[str, str, str]:
    """Map raw matcher scores into operator-facing price comparison lanes.

    Returns ``(match_type, price_basis, alert_tier)``.
    """
    observed = set(reasons)
    no_match = ("no_match", "none", "suppress")

    if comparison_mode == "unit_comparable":
        return "same_product_different_pack", "unit_price", "unit_price_review"

    if hard_veto or comparison_mode == "not_comparable":
        has_variant_conflict = not observed.isdisjoint(
            {"variant_option_conflict", "variant_descriptor_conflict"}
        )
        same_line = bool(shared_anchor and brand_score >= 0.95 and type_score >= 0.55)
        if has_variant_conflict and same_line:
            return "same_line_variant", "manual_review", "suppress"
        return no_match

    focused_safe = "focused_exact_total_price_safe" in observed
    has_direct_spec = spec_score >= 0.85 or bool(shared_models)
    anchored_names = shared_anchor and token_score >= 0.62 and sequence_score >= 0.58
    branded_evidence = brand_score >= 0.95 and (has_direct_spec or anchored_names)
    # Both evidence routes require the same type and overall score floor.
    strong_identity = (
        type_score >= 0.55
        and score >= 0.86
        and (branded_evidence or focused_safe)
    )

    if strong_identity and not catalog_count_omission:
        needs_variant_review = "variant_selection_review" in observed
        unsafe_multi_component = (
            multi_component_pair
            and "safe_multi_component_exact_total_price" not in observed
        )
        if needs_variant_review or (not focused_safe and unsafe_multi_component):
            return "exact", "manual_review", "identity_review"
        return "exact", "total_price", "price_alert_exact"

    if score >= 0.76:
        if catalog_count_omission:
            return "same_product_different_pack", "manual_review", "unit_price_review"
        return "comparable", "manual_review", "identity_review"
    return no_match
def score_marketplace_match(
    momo_name: str,
    competitor_name: str,
    momo_price: Optional[float] = None,
    competitor_price: Optional[float] = None,
) -> MatchDiagnostics:
    """Score whether two marketplace listings describe the same product.

    Both names are parsed with ``parse_product_identity``; brand, token, spec,
    sequence and type sub-scores are computed; conflict/gap reasons are
    collected; and any hard conflict sets ``hard_veto``. The final score is a
    weighted sum of the sub-scores (brand 0.20, token 0.36, spec 0.25,
    sequence 0.12, type 0.07) minus an optional price penalty, plus a series of
    additive boosts, and is clamped to ``[0.0, 1.0]``. A hard veto caps the
    score at 0.32 (0.74 when the pair is unit comparable).

    Args:
        momo_name: Product title on the momo side (parsed as ``left``).
        competitor_name: Product title on the competitor side (parsed as ``right``).
        momo_price: Optional momo price; only used for the price-ratio check.
        competitor_price: Optional competitor price; only used for the
            price-ratio check.

    Returns:
        A ``MatchDiagnostics`` carrying the rounded scores, the veto flag, the
        de-duplicated reasons, the comparison mode, the classification
        (match type / price basis / alert tier) and the evidence payloads.
    """
    # --- Parse both titles and compute the raw similarity components. ---
    left = parse_product_identity(momo_name)
    right = parse_product_identity(competitor_name)
    brand_score, brand_conflict, brand_reason = _brand_score(left, right)
    token_score = _weighted_token_score(left, right)
    spec_score, spec_conflict, spec_reasons = _spec_score(left, right)
    sequence_score = SequenceMatcher(None, left.searchable_name, right.searchable_name).ratio()
    chinese_name_score = _chinese_bigram_score(left, right)
    # Two brand-specific alignments may treat differing product types as aligned.
    nivea_creme_100ml_alignment = _has_nivea_creme_100ml_alignment(left, right)
    cetaphil_moisturizer_type_alignment = _has_cetaphil_moisturizer_type_alignment(left, right)
    type_aligned = (
        left.product_type == right.product_type
        or nivea_creme_100ml_alignment
        or cetaphil_moisturizer_type_alignment
    )
    # When either side has no detected type, fall back to a neutral 0.55.
    if left.product_type and right.product_type:
        type_score = 1.0 if type_aligned else 0.0
    else:
        type_score = 0.55
    # --- Collect reasons; their insertion order is kept in the final tuple. ---
    reasons = []
    if brand_reason:
        reasons.append(brand_reason)
    reasons.extend(spec_reasons)
    if left.product_type and right.product_type and left.product_type != right.product_type and not type_aligned:
        reasons.append("type_conflict")
    if nivea_creme_100ml_alignment:
        reasons.append("nivea_creme_100ml_type_alignment")
    if cetaphil_moisturizer_type_alignment:
        reasons.append("cetaphil_moisturizer_type_alignment")
    model_line_conflict = _has_model_line_conflict(left, right)
    if model_line_conflict:
        reasons.append("model_line_conflict")
    nail_polish_model_code_conflict = _has_nail_polish_model_code_conflict(left, right)
    if nail_polish_model_code_conflict:
        reasons.append("nail_polish_model_code_conflict")
    # A bundle offer on only one side is a conflict unless both sides report
    # the same non-zero total piece count.
    bundle_offer_conflict = (
        _has_bundle_offer(left) != _has_bundle_offer(right)
        and not (
            left.total_piece_count
            and right.total_piece_count
            and left.total_piece_count == right.total_piece_count
        )
    )
    if bundle_offer_conflict:
        reasons.append("bundle_offer_conflict")
    cushion_refill_pack_alignment = _has_cushion_refill_pack_alignment(left, right)
    paulas_choice_body_lotion_2pack_alignment = _has_paulas_choice_body_lotion_210ml_2pack_alignment(left, right)
    # Multi-component mismatch is waived by either of the two pack alignments.
    if (
        _has_multi_component(left) != _has_multi_component(right)
        and not cushion_refill_pack_alignment
        and not paulas_choice_body_lotion_2pack_alignment
    ):
        reasons.append("multi_component_conflict")
    if cushion_refill_pack_alignment:
        reasons.append("cushion_refill_pack_alignment")
    if paulas_choice_body_lotion_2pack_alignment:
        reasons.append("paulas_choice_body_lotion_210ml_2pack_alignment")
    multi_component_count_conflict = (
        _has_multi_component(left)
        and _has_multi_component(right)
        and _multi_component_count(left) != _multi_component_count(right)
    )
    if multi_component_count_conflict:
        reasons.append("multi_component_count_conflict")
    if _has_refill_pack(left) != _has_refill_pack(right):
        reasons.append("refill_pack_conflict")
    accessory_case_conflict = _has_accessory_case(left) != _has_accessory_case(right)
    if accessory_case_conflict:
        reasons.append("accessory_case_conflict")
    left_spec_mentions = _spec_mention_count(left)
    right_spec_mentions = _spec_mention_count(right)
    if left_spec_mentions and right_spec_mentions and left_spec_mentions != right_spec_mentions:
        reasons.append("component_count_conflict")
    # The reason is recorded on the bigram score alone; the veto further down
    # additionally requires token_score < 0.72.
    if chinese_name_score < 0.16:
        reasons.append("product_line_conflict")
    shared_anchor = _shared_identity_anchor(left, right)
    catalog_count_omission = _allow_catalog_count_omission(left, right)
    if catalog_count_omission:
        reasons.append("catalog_count_omission")
    if _has_pack_quantity_difference(left, right):
        reasons.append("pack_quantity_difference")
    named_component_quantity_conflict = _has_named_component_quantity_conflict(left, right)
    if named_component_quantity_conflict:
        reasons.append("named_component_quantity_conflict")
    # --- Variant / product-line conflicts (each one vetoes further down). ---
    variant_descriptor_conflict = _has_variant_descriptor_conflict(left, right, shared_anchor)
    # For sun protection ("防曬") a descriptor conflict without a shared anchor
    # is escalated to a line conflict.
    sun_protection_line_conflict = (
        variant_descriptor_conflict
        and left.product_type == right.product_type == "防曬"
        and not shared_anchor
    )
    if sun_protection_line_conflict:
        reasons.append("variant_descriptor_conflict")
        reasons.append("sun_protection_line_conflict")
    variant_option_conflict = _has_explicit_variant_option_conflict(left, right, shared_anchor)
    if variant_option_conflict:
        reasons.append("variant_option_conflict")
    saugella_variant_conflict = _has_saugella_private_wash_variant_conflict(left, right)
    if saugella_variant_conflict:
        reasons.append("saugella_variant_conflict")
    lactacyd_variant_conflict = _has_lactacyd_private_wash_variant_conflict(left, right)
    if lactacyd_variant_conflict:
        reasons.append("lactacyd_variant_conflict")
    makeup_usage_conflict = _has_makeup_usage_conflict(left, right)
    if makeup_usage_conflict:
        reasons.append("makeup_usage_conflict")
    makeup_finish_conflict = _has_makeup_finish_conflict(left, right)
    if makeup_finish_conflict:
        reasons.append("makeup_finish_conflict")
    sun_protection_spf_conflict = _has_sun_protection_spf_conflict(left, right)
    if sun_protection_spf_conflict:
        reasons.append("spf_value_conflict")
    makeup_spray_line_conflict = _has_makeup_spray_line_conflict(left, right)
    if makeup_spray_line_conflict:
        reasons.append("makeup_spray_line_conflict")
    romand_lip_line_conflict = _has_romand_lip_line_conflict(left, right)
    if romand_lip_line_conflict:
        reasons.append("romand_lip_line_conflict")
    nail_tool_function_conflict = _has_nail_tool_function_conflict(left, right)
    if nail_tool_function_conflict:
        reasons.append("nail_tool_function_conflict")
    schick_razor_line_conflict = _has_schick_razor_line_conflict(left, right)
    if schick_razor_line_conflict:
        reasons.append("schick_razor_line_conflict")
    lancome_line_conflict = _has_lancome_ultra_line_conflict(left, right)
    if lancome_line_conflict:
        reasons.append("lancome_line_conflict")
    dr_hsieh_line_conflict = _has_dr_hsieh_labsmart_serum_line_conflict(left, right)
    if dr_hsieh_line_conflict:
        reasons.append("dr_hsieh_labsmart_line_conflict")
    cotton_swab_variant_conflict = _has_cotton_swab_variant_conflict(left, right)
    if cotton_swab_variant_conflict:
        reasons.append("cotton_swab_variant_conflict")
    kanebo_milano_type_conflict = _has_kanebo_milano_powder_perfume_conflict(left, right)
    if kanebo_milano_type_conflict:
        reasons.append("kanebo_milano_type_conflict")
    hoi_candle_line_conflict = _has_hoi_candle_line_conflict(left, right)
    if hoi_candle_line_conflict:
        reasons.append("hoi_candle_line_conflict")
    aroma_scent_variant_conflict = _has_aroma_scent_variant_conflict(left, right)
    if aroma_scent_variant_conflict:
        reasons.append("aroma_scent_variant_conflict")
    unknown_scent_variant_conflict = _has_unknown_scent_variant_conflict(left, right)
    if unknown_scent_variant_conflict:
        reasons.append("unknown_scent_variant_conflict")
    nail_polish_color_name_conflict = _has_nail_polish_color_name_conflict(left, right)
    if nail_polish_color_name_conflict:
        reasons.append("nail_polish_color_name_conflict")
    ingredient_line_conflict = _has_core_ingredient_line_conflict(left, right)
    if ingredient_line_conflict:
        reasons.append("core_ingredient_line_conflict")
    clarins_body_oil_line_conflict = _has_clarins_body_oil_line_conflict(left, right)
    if clarins_body_oil_line_conflict:
        reasons.append("clarins_body_oil_line_conflict")
    branded_powder_line_conflict = _has_branded_powder_line_conflict(left, right)
    if branded_powder_line_conflict:
        reasons.append("branded_powder_line_conflict")
    cleanser_lotion_line_conflict = _has_cleanser_lotion_line_conflict(left, right)
    if cleanser_lotion_line_conflict:
        reasons.append("cleanser_lotion_line_conflict")
    selection1990_wax_lamp_design_conflict = _has_selection1990_wax_lamp_design_conflict(left, right)
    if selection1990_wax_lamp_design_conflict:
        reasons.append("selection1990_wax_lamp_design_conflict")
    # --- "Gap" signals: recorded as reasons and folded into
    # variant_selection_review below; they do not set hard_veto. ---
    aroma_lamp_style_selection_gap = _has_aroma_lamp_style_selection_gap(left, right)
    if aroma_lamp_style_selection_gap:
        reasons.append("aroma_lamp_style_selection_gap")
    hooome_wax_lamp_design_gap = _has_hooome_wax_lamp_design_gap(left, right)
    if hooome_wax_lamp_design_gap:
        reasons.append("hooome_wax_lamp_design_gap")
    wax_lamp_size_letter_conflict = _has_wax_lamp_size_letter_conflict(left, right)
    if wax_lamp_size_letter_conflict:
        reasons.append("size_letter_variant_conflict")
    nitori_diffuser_model_conflict = _has_nitori_diffuser_model_conflict(left, right)
    if nitori_diffuser_model_conflict:
        reasons.append("nitori_diffuser_model_conflict")
    commercial_condition_gap = _has_commercial_condition_gap(left, right)
    if commercial_condition_gap:
        reasons.append("commercial_condition_gap")
    relove_private_cleanser_variant_gap = _has_relove_private_cleanser_variant_gap(left, right)
    if relove_private_cleanser_variant_gap:
        reasons.append("relove_private_cleanser_variant_gap")
    candle_catalog_selection_gap = _has_candle_catalog_selection_gap(left, right)
    if candle_catalog_selection_gap:
        reasons.append("candle_catalog_selection_gap")
    bath_additive_variant_gap = _has_bath_additive_variant_gap(left, right)
    if bath_additive_variant_gap:
        reasons.append("bath_additive_variant_gap")
    makeup_catalog_selection_gap = _has_makeup_catalog_selection_gap(left, right)
    if makeup_catalog_selection_gap:
        reasons.append("makeup_catalog_selection_gap")
    loreal_serum_variant_gap = _has_loreal_serum_variant_gap(left, right)
    if loreal_serum_variant_gap:
        reasons.append("loreal_serum_variant_gap")
    sebamed_shampoo_variant_catalog_gap = _has_sebamed_shampoo_variant_catalog_gap(left, right)
    if sebamed_shampoo_variant_catalog_gap:
        reasons.append("sebamed_shampoo_variant_catalog_gap")
    schick_2in1_model_gap = _has_schick_2in1_model_gap(left, right)
    if schick_2in1_model_gap:
        reasons.append("schick_2in1_model_gap")
    taicend_protection_form_gap = _has_taicend_protection_form_gap(left, right)
    if taicend_protection_form_gap:
        reasons.append("taicend_protection_form_gap")
    variant_selection_review = (
        _has_named_variant_selection_review(left, right, shared_anchor)
        or commercial_condition_gap
        or relove_private_cleanser_variant_gap
        or candle_catalog_selection_gap
        or bath_additive_variant_gap
        or aroma_lamp_style_selection_gap
        or hooome_wax_lamp_design_gap
        or makeup_catalog_selection_gap
        or loreal_serum_variant_gap
        or sebamed_shampoo_variant_catalog_gap
        or schick_2in1_model_gap
        or taicend_protection_form_gap
    )
    if variant_selection_review:
        reasons.append("variant_selection_review")
    # --- Hard veto: any single conflict below sets it. ---
    hard_veto = brand_conflict or spec_conflict
    if bundle_offer_conflict:
        hard_veto = True
    if (
        _has_multi_component(left) != _has_multi_component(right)
        and not cushion_refill_pack_alignment
        and not paulas_choice_body_lotion_2pack_alignment
    ):
        hard_veto = True
    if multi_component_count_conflict:
        hard_veto = True
    if named_component_quantity_conflict:
        hard_veto = True
    if _has_refill_pack(left) != _has_refill_pack(right):
        hard_veto = True
    if accessory_case_conflict:
        hard_veto = True
    if model_line_conflict:
        hard_veto = True
    if nail_polish_model_code_conflict:
        hard_veto = True
    if left_spec_mentions and right_spec_mentions and left_spec_mentions != right_spec_mentions:
        hard_veto = True
    # Low Chinese-bigram overlap only vetoes when token overlap is also weak.
    if chinese_name_score < 0.16 and token_score < 0.72:
        hard_veto = True
    if left.product_type and right.product_type and left.product_type != right.product_type and not type_aligned:
        hard_veto = True
    if sun_protection_line_conflict:
        hard_veto = True
    if variant_option_conflict:
        hard_veto = True
    if saugella_variant_conflict:
        hard_veto = True
    if lactacyd_variant_conflict:
        hard_veto = True
    if makeup_usage_conflict:
        hard_veto = True
    if makeup_finish_conflict:
        hard_veto = True
    if sun_protection_spf_conflict:
        hard_veto = True
    if makeup_spray_line_conflict:
        hard_veto = True
    if romand_lip_line_conflict:
        hard_veto = True
    if nail_tool_function_conflict:
        hard_veto = True
    if schick_razor_line_conflict:
        hard_veto = True
    if lancome_line_conflict:
        hard_veto = True
    if dr_hsieh_line_conflict:
        hard_veto = True
    if cotton_swab_variant_conflict:
        hard_veto = True
    if kanebo_milano_type_conflict:
        hard_veto = True
    if hoi_candle_line_conflict:
        hard_veto = True
    if aroma_scent_variant_conflict:
        hard_veto = True
    if unknown_scent_variant_conflict:
        hard_veto = True
    if nail_polish_color_name_conflict:
        hard_veto = True
    if ingredient_line_conflict:
        hard_veto = True
    if clarins_body_oil_line_conflict:
        hard_veto = True
    if branded_powder_line_conflict:
        hard_veto = True
    if cleanser_lotion_line_conflict:
        hard_veto = True
    if selection1990_wax_lamp_design_conflict:
        hard_veto = True
    if wax_lamp_size_letter_conflict:
        hard_veto = True
    if nitori_diffuser_model_conflict:
        hard_veto = True
    # --- Focused exact-identity lines: curated reasons that can force a
    # review, bypass a variant review, or unlock total-price comparison. ---
    focused_exact_line_reason = _has_focused_low_score_exact_identity_line(left, right)
    if focused_exact_line_reason in FOCUSED_IDENTITY_REVIEW_ONLY_REASONS:
        reasons.append("variant_selection_review")
    if (
        focused_exact_line_reason in FOCUSED_IDENTITY_VARIANT_REVIEW_BYPASS_REASONS
        and not commercial_condition_gap
    ):
        # Bypass: drop every "variant_selection_review" entry collected so far.
        reasons = [reason for reason in reasons if reason != "variant_selection_review"]
        variant_selection_review = False
    focused_exact_price_safe = (
        focused_exact_line_reason
        and brand_score >= 0.95
        and not hard_veto
        and spec_score >= 0.45
        and token_score >= 0.30
        and sequence_score >= 0.40
        and not variant_descriptor_conflict
    )
    # The review boost also accepts a "brandless" pair: exactly one side has
    # brand tokens, brand_score is 0.55, and the other scores are stricter.
    focused_exact_review_boost_safe = (
        focused_exact_line_reason
        and not hard_veto
        and spec_score >= 0.45
        and token_score >= 0.30
        and sequence_score >= 0.40
        and not variant_descriptor_conflict
        and (
            brand_score >= 0.95
            or (
                focused_exact_line_reason in FOCUSED_IDENTITY_BRANDLESS_REVIEW_REASONS
                and brand_score == 0.55
                and bool(left.brand_tokens) != bool(right.brand_tokens)
                and spec_score >= 0.85
                and token_score >= 0.55
                and sequence_score >= 0.50
            )
        )
    )
    focused_total_price_brand_safe = (
        brand_score >= 0.95
        or (
            focused_exact_line_reason in FOCUSED_IDENTITY_BRANDLESS_TOTAL_PRICE_REASONS
            and brand_score == 0.55
            and bool(left.brand_tokens) != bool(right.brand_tokens)
            and spec_score >= 0.85
            and token_score >= 0.70
            and sequence_score >= 0.55
        )
    )
    focused_exact_total_price_safe = (
        focused_exact_line_reason in FOCUSED_IDENTITY_TOTAL_PRICE_REASONS
        and focused_total_price_brand_safe
        and not hard_veto
        and spec_score >= 0.45
        and token_score >= 0.30
        and sequence_score >= 0.40
        and (
            not variant_descriptor_conflict
            or focused_exact_line_reason == "hanamisui_inclear_private_gel_1_7g_3pack"
        )
        and "variant_selection_review" not in reasons
        and "commercial_condition_gap" not in reasons
    )
    if focused_exact_total_price_safe:
        reasons.append("focused_exact_total_price_safe")
        reasons.append(f"focused_exact_identity_{focused_exact_line_reason}")
    # --- Comparison mode: unit-comparable is checked before the veto. ---
    comparison_mode = "exact_identity"
    if _is_unit_comparable_candidate(
        left,
        right,
        token_score,
        chinese_name_score,
        brand_conflict,
        type_score,
        reasons,
    ):
        comparison_mode = "unit_comparable"
        reasons.append("unit_comparable")
    elif hard_veto:
        comparison_mode = "not_comparable"
    # --- Price sanity check: at most a 0.06 / 0.12 penalty, and it can be
    # suppressed when the identity evidence is strong. ---
    price_penalty = 0.0
    try:
        if momo_price and competitor_price:
            # The denominator is floored at 1.0.
            ratio = float(competitor_price) / max(float(momo_price), 1.0)
            lip_care_exact_identity = (
                shared_anchor
                and "唇膏" in shared_anchor
                and brand_score >= 0.95
                and not hard_veto
                and spec_score >= 0.99
                and token_score >= 0.50
                and sequence_score >= 0.50
                and not variant_descriptor_conflict
            )
            allow_price_penalty_suppression = (
                shared_anchor
                and len(shared_anchor.replace(" ", "")) >= 7
                and brand_score >= 0.95
                and not hard_veto
                and type_score >= 0.55
                and spec_score >= 0.99
                and token_score >= 0.68
                and sequence_score >= 0.72
            )
            allow_wide_price_penalty_suppression = (
                (
                    shared_anchor
                    and len(shared_anchor.replace(" ", "")) >= 5
                    and brand_score >= 0.95
                    and not hard_veto
                    and type_score >= 0.55
                    and spec_score >= 0.99
                    and token_score >= 0.50
                    and (sequence_score >= 0.55 or lip_care_exact_identity)
                )
                or focused_exact_price_safe
            )
            # Extreme band is tested first; the wide band applies otherwise.
            if (ratio < 0.3 or ratio > 3.2) and token_score < 0.78:
                if allow_price_penalty_suppression:
                    reasons.append("price_penalty_suppressed_exact_identity")
                else:
                    price_penalty = 0.12
                    reasons.append("price_ratio_extreme")
            elif (ratio < 0.48 or ratio > 2.2) and token_score < 0.68:
                if allow_wide_price_penalty_suppression:
                    reasons.append("price_penalty_suppressed_wide_exact_identity")
                else:
                    price_penalty = 0.06
                    reasons.append("price_ratio_wide")
    except (TypeError, ValueError, ZeroDivisionError):
        # Prices that cannot be converted to float are ignored (no penalty).
        price_penalty = 0.0
    # --- Base weighted score. ---
    score = (
        brand_score * 0.20
        + token_score * 0.36
        + spec_score * 0.25
        + sequence_score * 0.12
        + type_score * 0.07
        - price_penalty
    )
    # --- Additive boosts. Apart from the first one, each is gated on
    # `not hard_veto` and `price_penalty == 0`. ---
    if token_score >= 0.72 and spec_score >= 0.82 and not brand_conflict:
        score += 0.08
    if (
        brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.55
        and spec_score >= 0.55
        and not variant_descriptor_conflict
        and _has_strong_product_line_signal(left, right, token_score, chinese_name_score)
    ):
        score += 0.07
        reasons.append("strong_product_line_match")
    if (
        brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and _has_safe_exact_spec_signal(left, right, token_score, sequence_score, type_score)
    ):
        score += 0.025
        reasons.append("strong_exact_spec_match")
    if (
        cushion_refill_pack_alignment
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.95
        and token_score >= 0.65
        and sequence_score >= 0.65
        and not variant_descriptor_conflict
    ):
        score += 0.04
        reasons.append("cushion_refill_pack_alignment_score")
    if (
        focused_exact_review_boost_safe
        and price_penalty == 0
    ):
        score += 0.16
        reasons.append(f"focused_exact_identity_{focused_exact_line_reason}")
    # Generic shared-anchor boosts (several may apply to the same pair).
    if (
        shared_anchor
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and spec_score >= 0.85
        and (token_score >= 0.43 or sequence_score >= 0.58)
    ):
        score += 0.08
        reasons.append("shared_identity_anchor")
    if (
        shared_anchor
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.95
        and spec_score >= 0.55
        and token_score >= 0.70
        and sequence_score >= 0.62
        and not variant_descriptor_conflict
    ):
        score += 0.03
        reasons.append("shared_identity_anchor_no_spec")
    if (
        shared_anchor
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.55
        and spec_score >= 0.45
        and token_score >= 0.56
        and sequence_score >= 0.60
        and not variant_descriptor_conflict
    ):
        score += 0.02
        reasons.append("shared_identity_anchor_packaging_variant")
    if (
        shared_anchor
        and len(shared_anchor.replace(" ", "")) >= 8
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.95
        and spec_score >= 0.45
        and token_score >= 0.60
        and sequence_score >= 0.68
        and not variant_descriptor_conflict
    ):
        score += 0.03
        reasons.append("shared_identity_anchor_marketing_variant")
    if (
        shared_anchor
        and len(shared_anchor.replace(" ", "")) >= 5
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.55
        and spec_score >= 0.45
        and token_score >= 0.88
        and not variant_descriptor_conflict
    ):
        score += 0.02
        reasons.append("shared_identity_anchor_core_line")
    if (
        shared_anchor
        and len(shared_anchor.replace(" ", "")) >= 6
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.55
        and spec_score >= 0.45
        and token_score >= 0.86
        and sequence_score >= 0.75
        and not variant_descriptor_conflict
    ):
        score += 0.07
        reasons.append("shared_identity_anchor_exact_line")
    # Brand/line-specific anchor boosts: the anchor phrase plus a brand token
    # on either side, each with its own thresholds.
    if (
        "無印乾爽止汗爽身乳液" in shared_anchor
        and {"nivea", "妮維雅"} & (left.brand_tokens | right.brand_tokens)
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.95
        and spec_score >= 0.45
        and token_score >= 0.55
        and sequence_score >= 0.62
        and not variant_descriptor_conflict
    ):
        score += 0.08
        reasons.append("shared_identity_anchor_nivea_dry_lotion")
    if (
        "多效提亮防曬霜" in shared_anchor
        and {"recipe", "box"} <= (left.brand_tokens | right.brand_tokens)
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.95
        and spec_score >= 0.55
        and token_score >= 0.54
        and sequence_score >= 0.50
        and not variant_descriptor_conflict
    ):
        score += 0.09
        reasons.append("shared_identity_anchor_recipe_box_line")
    if (
        "私密潔浴露" in shared_anchor
        and {"lactacyd", "立朵舒"} & (left.brand_tokens | right.brand_tokens)
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.95
        and spec_score >= 0.70
        and token_score >= 0.35
        and sequence_score >= 0.50
        and not variant_descriptor_conflict
    ):
        score += 0.10
        reasons.append("shared_identity_anchor_lactacyd_wash")
    if (
        "私密潔膚露" in shared_anchor
        and {"femfresh", "芳芯"} & (left.brand_tokens | right.brand_tokens)
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.95
        and spec_score >= 0.85
        and token_score >= 0.30
        and sequence_score >= 0.45
        and not variant_descriptor_conflict
    ):
        score += 0.06
        reasons.append("shared_identity_anchor_femfresh_wash")
    if (
        "私密沐浴露" in shared_anchor
        and {"vigill", "婦潔"} & (left.brand_tokens | right.brand_tokens)
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.95
        and spec_score >= 0.70
        and token_score >= 0.45
        and sequence_score >= 0.55
        and not variant_descriptor_conflict
    ):
        score += 0.06
        reasons.append("shared_identity_anchor_vigill_private_wash")
    if (
        "私密潔淨凝露" in shared_anchor
        and {"relove"} <= (left.brand_tokens | right.brand_tokens)
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.95
        and spec_score >= 0.85
        and token_score >= 0.30
        and sequence_score >= 0.40
        and not variant_descriptor_conflict
    ):
        score += 0.11
        reasons.append("shared_identity_anchor_relove_cleanser")
    if (
        "柔霧裸唇膏" in shared_anchor
        and {"kate", "凱婷"} & (left.brand_tokens | right.brand_tokens)
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.55
        and spec_score >= 0.45
        and token_score >= 0.50
        and sequence_score >= 0.50
        and not variant_descriptor_conflict
    ):
        score += 0.15
        reasons.append("shared_identity_anchor_kate_bare_lip")
    if (
        "閃亮珍珠眼影棒" in shared_anchor
        and {"karadium"} <= (left.brand_tokens | right.brand_tokens)
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.55
        and spec_score >= 0.55
        and token_score >= 0.50
        and sequence_score >= 0.60
        and not variant_descriptor_conflict
    ):
        score += 0.12
        reasons.append("shared_identity_anchor_karadium_eye_stick")
    # Boosts driven by dedicated alignment helpers.
    if (
        _has_seche_vite_top_coat_alignment(left, right)
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.95
        and token_score >= 0.70
        and sequence_score >= 0.70
        and not variant_descriptor_conflict
    ):
        score += 0.04
        reasons.append("shared_identity_anchor_seche_vite_top_coat")
    if (
        _has_xiaomi_s101_shaver_alignment(left, right)
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and token_score >= 0.60
        and not variant_descriptor_conflict
    ):
        score += 0.04
        reasons.append("shared_model_token_xiaomi_s101_shaver")
    if (
        _has_hinoki_roller_oil_alignment(left, right)
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.95
        and spec_score >= 0.85
        and sequence_score >= 0.50
        and not variant_descriptor_conflict
    ):
        score += 0.04
        reasons.append("shared_identity_anchor_hinoki_roller_oil")
    if (
        _has_brush_baby_wildones_toothbrush_alignment(left, right)
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.95
        and token_score >= 0.78
        and sequence_score >= 0.90
        and not variant_descriptor_conflict
    ):
        score += 0.04
        reasons.append("shared_model_token_brush_baby_wildones")
    if (
        _has_pshine_beauty_foot_file_alignment(left, right)
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and token_score >= 0.60
        and sequence_score >= 0.78
        and not variant_descriptor_conflict
    ):
        score += 0.05
        reasons.append("shared_model_token_pshine_beauty_foot_file")
    if (
        _has_catalog_variant_listing_alignment(left, right)
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and spec_score >= 0.85
        and type_score >= 0.95
        and sequence_score >= 0.50
        and not variant_descriptor_conflict
    ):
        score += 0.06
        reasons.append("catalog_variant_listing_alignment")
    if (
        _has_baan_baby_lip_catalog_alignment(left, right)
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.95
        and token_score >= 0.70
        and sequence_score >= 0.45
        and not variant_descriptor_conflict
    ):
        score += 0.05
        reasons.append("catalog_variant_listing_alignment_baan_lip")
    if (
        shared_anchor
        and len(shared_anchor.replace(" ", "")) >= 5
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.55
        and spec_score >= 0.45
        and token_score >= 0.74
        and sequence_score >= 0.60
        and _shared_variant_descriptors(left, right)
        and not variant_descriptor_conflict
    ):
        score += 0.05
        reasons.append("shared_variant_descriptor_alignment")
    # Brandless pair: exactly one side carries brand tokens (brand_score 0.55),
    # so the remaining evidence must be strong.
    if (
        shared_anchor
        and len(shared_anchor.replace(" ", "")) >= 8
        and not hard_veto
        and price_penalty == 0
        and brand_score == 0.55
        and bool(left.brand_tokens) != bool(right.brand_tokens)
        and type_score >= 0.55
        and spec_score >= 0.55
        and token_score >= 0.80
        and sequence_score >= 0.80
        and chinese_name_score >= 0.42
        and not variant_descriptor_conflict
    ):
        score += 0.09
        reasons.append("brandless_exact_identity")
    if (
        shared_anchor
        and len(shared_anchor.replace(" ", "")) >= 6
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.95
        and spec_score >= 0.85
        and token_score >= 0.30
        and sequence_score >= 0.50
        and not variant_descriptor_conflict
    ):
        score += 0.06
        reasons.append("shared_identity_anchor_reordered_line")
    if (
        shared_anchor
        and len(shared_anchor.replace(" ", "")) >= 4
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.95
        and spec_score >= 0.65
        and token_score >= 0.50
        and sequence_score >= 0.50
        and _has_exact_count_alignment(left, right)
        and not variant_descriptor_conflict
    ):
        score += 0.05
        reasons.append("shared_identity_anchor_bundle_equivalent")
    if (
        shared_anchor
        and len(shared_anchor.replace(" ", "")) >= 6
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and type_score >= 0.55
        and spec_score >= 0.45
        and token_score >= 0.58
        and sequence_score >= 0.50
        and not variant_descriptor_conflict
    ):
        score += 0.025
        reasons.append("shared_identity_anchor_variant_safe")
    if (
        brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and spec_score >= 0.99
        and token_score >= 0.44
        and sequence_score >= 0.60
        and type_score >= 0.55
    ):
        score += 0.025
        reasons.append("spec_name_alignment")
    shared_models = _shared_model_tokens(left, right)
    if (
        shared_models
        and brand_score >= 0.95
        and not hard_veto
        and price_penalty == 0
        and token_score >= 0.50
        and sequence_score >= 0.62
    ):
        score += 0.04
        reasons.append("shared_model_token")
    # A descriptor conflict without near-exact specs costs 0.05.
    if variant_descriptor_conflict and spec_score < 0.85:
        score -= 0.05
        reasons.append("variant_descriptor_conflict")
    # Only fires when no reason at all has been recorded so far.
    if (
        brand_score >= 0.95
        and not hard_veto
        and not reasons
        and price_penalty == 0
        and type_score >= 0.95
        and token_score >= 0.82
        and spec_score >= 0.40
        and chinese_name_score >= 0.65
    ):
        score += 0.04
        reasons.append("strong_component_line_match")
    # --- Cap vetoed pairs, then clamp to [0, 1]. ---
    if hard_veto:
        score = min(score, 0.74 if comparison_mode == "unit_comparable" else 0.32)
    score = max(0.0, min(1.0, score))
    if _has_safe_multi_component_exact_total_price(
        left,
        right,
        brand_score=brand_score,
        token_score=token_score,
        spec_score=spec_score,
        sequence_score=sequence_score,
        type_score=type_score,
        hard_veto=hard_veto,
        variant_descriptor_conflict=variant_descriptor_conflict,
        reasons=reasons,
    ):
        reasons.append("safe_multi_component_exact_total_price")
    # Some reasons can be appended more than once above; de-duplicate before
    # classification and reporting.
    reason_tuple = _dedupe_tuple(reasons)
    match_type, price_basis, alert_tier = _classify_match_quality(
        score=score,
        brand_score=brand_score,
        token_score=token_score,
        spec_score=spec_score,
        sequence_score=sequence_score,
        type_score=type_score,
        hard_veto=hard_veto,
        comparison_mode=comparison_mode,
        reasons=reason_tuple,
        shared_anchor=shared_anchor,
        shared_models=shared_models,
        catalog_count_omission=catalog_count_omission,
        multi_component_pair=_has_multi_component(left) and _has_multi_component(right),
    )
    # --- Build the evidence payloads and the diagnostics result. ---
    evidence_flags = _build_evidence_flags(
        brand_score=brand_score,
        token_score=token_score,
        spec_score=spec_score,
        sequence_score=sequence_score,
        type_score=type_score,
        shared_anchor=shared_anchor,
        shared_models=shared_models,
        reasons=reason_tuple,
        catalog_count_omission=catalog_count_omission,
    )
    identity_evidence = _identity_evidence_payload(
        left,
        right,
        brand_score=brand_score,
        token_score=token_score,
        spec_score=spec_score,
        sequence_score=sequence_score,
        type_score=type_score,
        hard_veto=hard_veto,
        comparison_mode=comparison_mode,
        match_type=match_type,
        price_basis=price_basis,
        alert_tier=alert_tier,
        shared_anchor=shared_anchor,
        shared_models=shared_models,
        reasons=reason_tuple,
        catalog_count_omission=catalog_count_omission,
    )
    offer_evidence = _offer_evidence_payload(
        momo_price,
        competitor_price,
        price_penalty=price_penalty,
        price_basis=price_basis,
        alert_tier=alert_tier,
    )
    # Numeric fields are rounded to three decimals for reporting.
    return MatchDiagnostics(
        score=round(score, 3),
        brand_score=round(brand_score, 3),
        token_score=round(token_score, 3),
        spec_score=round(spec_score, 3),
        sequence_score=round(sequence_score, 3),
        type_score=round(type_score, 3),
        price_penalty=round(price_penalty, 3),
        hard_veto=hard_veto,
        reasons=reason_tuple,
        comparison_mode=comparison_mode,
        match_type=match_type,
        price_basis=price_basis,
        alert_tier=alert_tier,
        evidence_flags=evidence_flags,
        identity_evidence=identity_evidence,
        offer_evidence=offer_evidence,
    )
def _clean_search_phrase(value: str) -> str:
    """Return *value* normalized and stripped of search noise.

    The text is normalized with ``normalize_product_text``, every phrase in
    ``SEARCH_NOISE_PHRASES`` is blanked out (longest phrases first),
    punctuation is collapsed to spaces while decimal points between digits
    are kept, and words listed in ``SEARCH_NOISE_TOKENS`` or
    ``GENERIC_TOKENS`` are dropped.
    """
    cleaned = normalize_product_text(value)

    noise_longest_first = sorted(SEARCH_NOISE_PHRASES, key=len, reverse=True)
    for noise in noise_longest_first:
        cleaned = cleaned.replace(noise.lower(), " ")

    # Shield "digit.digit" with a placeholder so the punctuation sweep below
    # does not split decimals; both spellings of the placeholder are restored.
    cleaned = re.sub(r"(?<=\d)\.(?=\d)", "DECIMALPOINT", cleaned)
    cleaned = re.sub(r"[^\w\u4e00-\u9fff]+", " ", cleaned)
    cleaned = cleaned.replace("DECIMALPOINT", ".").replace("decimalpoint", ".")

    kept_words = []
    for word in cleaned.split():
        if word in SEARCH_NOISE_TOKENS or word in GENERIC_TOKENS:
            continue
        kept_words.append(word)

    return re.sub(r"\s+", " ", " ".join(kept_words)).strip()
def _search_spec_terms(identity: ProductIdentity) -> list[str]:
    """Build search terms from the first volume, weight and dosage, plus piece count.

    Terms are emitted in the order ml, g, mg, piece count; a measurement is
    skipped when its list is empty and the piece count is skipped when falsy.
    """
    measurements = (
        (identity.volumes_ml, "ml"),
        (identity.weights_g, "g"),
        (identity.dosages_mg, "mg"),
    )
    # Only the first value of each measurement is used; ":g" drops a trailing ".0".
    terms = [f"{values[0]:g}{unit}" for values, unit in measurements if values]
    if identity.total_piece_count:
        # NOTE(review): the piece-count term carries no unit suffix as seen
        # here, unlike the ml/g/mg terms — confirm that is intended.
        terms.append(f"{identity.total_piece_count}")
    return terms
def _extract_anchor_phrases(token: str) -> list[str]:
    """Return the identity anchor phrases found in *token*, in discovery order.

    A handful of hard-coded product-line phrases are checked first, then every
    entry of ``SEARCH_IDENTITY_ANCHORS``. Checks run against the cleaned text
    (``_clean_search_phrase``), except the "經典乳霜" family, which is looked up
    in the merely normalized text.
    """
    normalized_text = normalize_product_text(token)
    search_text = _clean_search_phrase(token)

    if not search_text:
        # Cleaning removed everything; only the "經典乳霜" anchor can still be
        # recovered from the normalized text.
        return ["經典乳霜"] if "經典乳霜" in normalized_text else []

    found: list[str] = []

    # --- Hard-coded product-line anchors. ---
    if "經典旋轉眉筆" in search_text:
        found.append("經典旋轉眉筆")
    if all(part in search_text for part in ("無印乾爽", "止汗爽身乳液")):
        found.append("無印乾爽止汗爽身乳液")
    if all(part in search_text for part in ("智能光感應", "無線自動除臭芳香噴霧機")):
        found.append("智能光感應無線自動除臭芳香噴霧機")
    if all(part in normalized_text for part in ("悠斯晶", "經典乳霜")):
        found.append("悠斯晶經典乳霜")
    if "經典乳霜" in normalized_text:
        found.append("經典乳霜")
    if "蜂王玫瑰" in search_text:
        cream_names = ("外泌微臻霜", "微泌新生霜", "瑰泌霜")
        if any(name in search_text for name in cream_names):
            found.append("蜂王玫瑰瑰泌霜")
    if all(part in search_text for part in ("瞬效", "b5", "玻尿酸", "精華")):
        found.append("瞬效b5玻尿酸精華")
    if all(part in search_text for part in ("慕之幼", "爽身潤膚乳")):
        found.append("慕之幼爽身潤膚乳")

    # --- Configured anchors. ---
    for anchor in SEARCH_IDENTITY_ANCHORS:
        base = _clean_search_phrase(anchor)
        if not base or base not in search_text:
            continue

        candidate = base
        if re.search(r"[\u4e00-\u9fff]", base):
            # Shorter CJK anchors pull in more preceding CJK characters:
            # length >= 5 takes none, length 3-4 up to four, otherwise up to six.
            if len(base) >= 5:
                prefix_width = 0
            elif len(base) >= 3:
                prefix_width = 4
            else:
                prefix_width = 6
            hit = re.search(rf"([\u4e00-\u9fff]{{0,{prefix_width}}}{re.escape(base)})", search_text)
            if hit:
                candidate = hit.group(1)

        candidate = _clean_search_phrase(candidate)
        # NOTE(review): the prefix literal below is empty as seen here, so the
        # check is always true and the first character of any phrase longer
        # than two characters is dropped — confirm the intended prefix character.
        if candidate.startswith("") and len(candidate) > 2:
            candidate = candidate[1:]

        # Skip a phrase that merely extends one already collected.
        if any(existing in candidate and existing != candidate for existing in found):
            continue
        if len(candidate) >= 2 and candidate not in found:
            found.append(candidate)

    return found
def _shared_identity_anchor(left: ProductIdentity, right: ProductIdentity) -> str:
    """Return the longest specific anchor phrase the two identities share.

    Anchors are gathered from each identity's core tokens, normalized name and
    searchable name. Two anchors pair up when their space-stripped forms are
    equal, or when one that is at least five characters long is contained in
    the other. Candidates shorter than five characters (ignoring spaces) or
    listed in ``SEARCH_BROAD_ANCHORS`` are dropped.

    Returns:
        The best candidate, preferring longer anchors and then lexicographic
        order, or ``""`` when nothing qualifies.
    """

    def gather(identity: ProductIdentity) -> set[str]:
        # Every text source of one identity goes through the same extractor.
        found: set[str] = set()
        sources = (*identity.core_tokens, identity.normalized_name, identity.searchable_name)
        for text in sources:
            found.update(_extract_anchor_phrases(text))
        return found

    def squeeze(anchor: str) -> str:
        return anchor.replace(" ", "")

    left_pool = gather(left)
    right_pool = gather(right)

    candidates: set[str] = set()
    for l_anchor in left_pool:
        l_key = squeeze(l_anchor)
        for r_anchor in right_pool:
            r_key = squeeze(r_anchor)
            if l_key == r_key or (len(l_key) >= 5 and l_key in r_key):
                candidates.add(l_anchor)
            elif len(r_key) >= 5 and r_key in l_key:
                candidates.add(r_anchor)

    eligible = [
        anchor
        for anchor in candidates
        if len(squeeze(anchor)) >= 5 and anchor not in SEARCH_BROAD_ANCHORS
    ]
    if not eligible:
        return ""
    return min(eligible, key=lambda anchor: (-len(squeeze(anchor)), anchor))
def _shared_model_tokens(left: ProductIdentity, right: ProductIdentity) -> set[str]:
    """Return core tokens common to both identities that look like model codes.

    A token qualifies when it has at least four characters, contains both a
    lowercase ASCII letter and a digit, and ``_is_spec_like_latin_token`` does
    not classify it as a spec.
    """

    def looks_like_model(token: str) -> bool:
        if len(token) < 4:
            return False
        if not re.search(r"[a-z]", token):
            return False
        if not re.search(r"\d", token):
            return False
        return not _is_spec_like_latin_token(token)

    common_tokens = left.core_tokens & right.core_tokens
    return {token for token in common_tokens if looks_like_model(token)}
def _variant_descriptors(identity: ProductIdentity) -> set[str]:
    """Collect the leftover descriptor text of each core token.

    For every token in ``identity.core_tokens`` the anchor phrases reported by
    ``_extract_anchor_phrases`` are blanked out; what remains is cleaned,
    stripped of spaces and kept unless one of the filters below rejects it.

    Returns:
        The set of non-empty compact descriptors.
    """
    descriptors: set[str] = set()
    # Brand names without spaces, comparable with the compacted text below.
    brand_compacts = {brand.replace(" ", "") for brand in identity.brand_tokens}
    for token in identity.core_tokens:
        value = token
        # Longest anchors are replaced first.
        for anchor in sorted(_extract_anchor_phrases(token), key=len, reverse=True):
            value = value.replace(anchor, " ")
        value = _clean_search_phrase(value)
        compact = value.replace(" ", "")
        if len(compact) < 2:
            continue
        if compact in brand_compacts:
            continue
        if compact in SEARCH_NOISE_TOKENS or compact in SEARCH_BROAD_ANCHORS:
            continue
        if any(keyword in compact for keyword in VARIANT_DESCRIPTOR_NOISE_KEYWORDS):
            continue
        # Strings made only of lowercase ASCII letters, digits and hyphens are
        # skipped.
        if re.fullmatch(r"[a-z0-9-]+", compact):
            continue
        # NOTE(review): the suffix literal is empty in this view, making the
        # call a no-op; a short suffix string was probably lost in extraction.
        # Confirm against the repository copy.
        descriptors.add(compact.removesuffix(""))
    # Drop anything that ended up empty after suffix removal.
    return {token for token in descriptors if token}
def _shared_variant_descriptors(left: ProductIdentity, right: ProductIdentity) -> set[str]:
    """Return variant descriptors that the two identities have in common.

    Descriptors match when they are equal or when one (of length two or more)
    is a substring of the other; in the substring case the shorter, contained
    descriptor is the one recorded.
    """
    left_pool = _variant_descriptors(left)
    right_pool = _variant_descriptors(right)
    matched: set[str] = set()
    for mine in left_pool:
        for theirs in right_pool:
            if mine == theirs or (len(mine) >= 2 and mine in theirs):
                matched.add(mine)
            elif len(theirs) >= 2 and theirs in mine:
                matched.add(theirs)
    return matched
def _has_serum_formulation_conflict(left: ProductIdentity, right: ProductIdentity, shared_anchor: str) -> bool:
    """Detect two serum listings whose formulation words differ.

    Only applies when ``shared_anchor`` contains "精華". For each normalized
    name the first of 精華乳 / 精華霜 / 精華液 that occurs is taken; a conflict
    is reported when both sides have one and they are not the same.
    """
    if "精華" not in shared_anchor:
        return False

    def first_formulation(name: str):
        for candidate in ("精華乳", "精華霜", "精華液"):
            if candidate in name:
                return candidate
        return None

    left_form = first_formulation(left.normalized_name)
    right_form = first_formulation(right.normalized_name)
    if left_form is None or right_form is None:
        return False
    return left_form != right_form
def _has_saugella_private_wash_variant_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect two Saugella listings naming non-overlapping variants.

    Both searchable names must mention the brand ("saugella" or "賽吉兒").
    A conflict is reported when each side names at least one known variant and
    the two variant sets are disjoint.
    """
    names = (left.searchable_name, right.searchable_name)
    brand_markers = ("saugella", "賽吉兒")
    if not all(any(marker in name for marker in brand_markers) for name in names):
        return False

    variants = ("日用", "日用型", "加強", "潤澤", "黃金女郎型")
    left_found, right_found = (
        {variant for variant in variants if variant in name} for name in names
    )
    if not left_found or not right_found:
        return False
    return left_found.isdisjoint(right_found)
def _has_lactacyd_private_wash_variant_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect two Lactacyd listings naming non-overlapping variants.

    Both searchable names must mention the brand ("lactacyd" or "立朵舒").
    A conflict is reported when each side names at least one known variant and
    the two variant sets are disjoint.
    """
    names = (left.searchable_name, right.searchable_name)
    brand_markers = ("lactacyd", "立朵舒")
    for name in names:
        if not any(marker in name for marker in brand_markers):
            return False

    variants = (
        "清新舒涼",
        "生理呵護",
        "滋潤緊緻",
        "加倍修護",
        "柔軟滋潤",
        "亮肌柔滑",
        "全日清爽",
    )
    left_found, right_found = (
        {variant for variant in variants if variant in name} for name in names
    )
    if not (left_found and right_found):
        return False
    return left_found.isdisjoint(right_found)
def _has_makeup_usage_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect a cheek product paired with an eye product.

    Returns True when one searchable name contains a cheek term and the other
    contains an eye term.
    """

    def usage_flags(name: str) -> tuple[bool, bool]:
        is_cheek = any(term in name for term in ("頰彩", "腮紅", "blush"))
        is_eye = any(term in name for term in ("眼彩", "眼影", "eyeshadow"))
        return is_cheek, is_eye

    left_cheek, left_eye = usage_flags(left.searchable_name)
    right_cheek, right_eye = usage_flags(right.searchable_name)
    if left_cheek and right_eye:
        return True
    return left_eye and right_cheek
def _has_makeup_finish_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect a matte versus satin mismatch between two MAC "macximal" lipsticks.

    Both identities must carry the "mac" brand token and both searchable names
    must contain "macximal" and "唇膏". A conflict is reported when one side
    names a matte finish and the other a satin finish.
    """
    if "mac" not in (left.brand_tokens & right.brand_tokens):
        return False

    names = (left.searchable_name, right.searchable_name)
    if not all("macximal" in name and "唇膏" in name for name in names):
        return False

    def finish_flags(name: str) -> tuple[bool, bool]:
        matte = any(term in name for term in ("柔霧", "霧面", "matte"))
        satin = any(term in name for term in ("緞光", "satin"))
        return matte, satin

    left_matte, left_satin = finish_flags(names[0])
    right_matte, right_satin = finish_flags(names[1])
    if left_matte and right_satin:
        return True
    return left_satin and right_matte
def _spf_values(identity: ProductIdentity) -> set[int]:
    """Return every SPF number found in the identity's normalized name.

    Matching is case-insensitive and allows whitespace between "spf" and a
    one- to three-digit number.
    """
    spf_pattern = re.compile(r"spf\s*(\d{1,3})", re.I)
    return {int(digits) for digits in spf_pattern.findall(identity.normalized_name)}
def _has_sun_protection_spf_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect two sun-care listings that state different SPF values.

    Applies only when the combined searchable names mention 防曬, 素顏霜 or
    "spf". A conflict is reported when both sides state at least one SPF value
    and they share none.
    """
    combined = f"{left.searchable_name} {right.searchable_name}"
    sun_care_markers = ("防曬", "素顏霜", "spf")
    if all(marker not in combined for marker in sun_care_markers):
        return False

    left_values = _spf_values(left)
    right_values = _spf_values(right)
    if not left_values or not right_values:
        return False
    return left_values.isdisjoint(right_values)
def _makeup_spray_line_groups(identity: ProductIdentity) -> set[str]:
    """Map keywords in the searchable name onto makeup-spray line groups.

    Returns:
        Any subset of ``setting_spray``, ``activating_water``,
        ``serum_variant`` and ``oil_control``; empty when no keyword occurs.
    """
    name = identity.searchable_name
    keyword_table = (
        ("setting_spray", ("fix+", "定妝噴霧", "超持妝")),
        ("activating_water", ("活氧水", "激活版")),
        ("serum_variant", ("精華版",)),
        ("oil_control", ("控油", "黑特霧")),
    )
    return {
        group
        for group, keywords in keyword_table
        if any(keyword in name for keyword in keywords)
    }
def _has_makeup_spray_line_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect a setting spray paired with an activating water.

    Applies only when the combined searchable names contain one of the spray
    keywords. A conflict is reported when one identity resolves to the
    ``setting_spray`` group and the other to ``activating_water``.
    """
    combined = f"{left.searchable_name} {right.searchable_name}"
    gate_terms = ("定妝噴霧", "活氧水", "fix+", "超光肌", "超持妝")
    if all(term not in combined for term in gate_terms):
        return False

    left_lines = _makeup_spray_line_groups(left)
    right_lines = _makeup_spray_line_groups(right)
    if not (left_lines and right_lines):
        return False

    if "setting_spray" in left_lines and "activating_water" in right_lines:
        return True
    return "activating_water" in left_lines and "setting_spray" in right_lines
def _has_makeup_spray_variant_selection_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Report differing spray line groups that are not an outright line conflict.

    Returns False when either side has no line group or when
    ``_has_makeup_spray_line_conflict`` already flags the pair; otherwise
    returns whether the two group sets differ.
    """
    left_lines = _makeup_spray_line_groups(left)
    right_lines = _makeup_spray_line_groups(right)
    if not left_lines:
        return False
    if not right_lines:
        return False
    if _has_makeup_spray_line_conflict(left, right):
        return False
    return left_lines != right_lines
def _romand_lip_line_groups(identity: ProductIdentity) -> set[str]:
    """Map keywords in the searchable name onto lip product line groups.

    Returns:
        Any subset of ``juicy``, ``zero_velvet``, ``glasting`` and
        ``water_gloss``; empty when no keyword occurs.
    """
    name = identity.searchable_name
    keyword_table = (
        ("juicy", ("果汁唇釉", "juicy")),
        ("zero_velvet", ("零絲絨", "zero velvet", "霧面唇釉")),
        ("glasting", ("果凍唇釉", "glasting", "唇凍")),
        ("water_gloss", ("水感唇釉",)),
    )
    return {
        group
        for group, keywords in keyword_table
        if any(keyword in name for keyword in keywords)
    }
def _has_romand_lip_line_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect two rom&nd lip listings that belong to different product lines.

    Returns True when the brand gate passes, both names pass the keyword
    check, and ``_romand_lip_line_groups`` yields non-empty, disjoint group
    sets for the two identities.
    """
    pair_text = f"{left.searchable_name} {right.searchable_name}"
    # Brand gate: satisfied by a brand token on either side, or by the brand
    # spelling appearing anywhere in the combined text.
    if not (
        {"rom", "romand"} & (left.brand_tokens | right.brand_tokens)
        or "rom&nd" in pair_text
        or "romand" in pair_text
    ):
        return False
    # NOTE(review): the literal below is empty in this view, so the check can
    # never fail; a short keyword was probably lost in extraction. Confirm
    # against the repository copy.
    if "" not in left.searchable_name or "" not in right.searchable_name:
        return False
    left_groups = _romand_lip_line_groups(left)
    right_groups = _romand_lip_line_groups(right)
    return bool(left_groups and right_groups and left_groups.isdisjoint(right_groups))
def _has_nail_tool_function_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect an "erbe" nail cleaning stick paired with a cuticle plane.

    Both identities must carry the "erbe" brand token and both searchable
    names must contain 指甲. A conflict is reported when one side names a
    cleaning tool and the other a plane tool.
    """
    if "erbe" not in (left.brand_tokens & right.brand_tokens):
        return False

    names = (left.searchable_name, right.searchable_name)
    if any("指甲" not in name for name in names):
        return False

    def tool_flags(name: str) -> tuple[bool, bool]:
        cleaning = any(term in name for term in ("清垢棒", "清潔棒"))
        plane = any(term in name for term in ("指甲緣刨刀", "刨刀"))
        return cleaning, plane

    left_cleaning, left_plane = tool_flags(names[0])
    right_cleaning, right_plane = tool_flags(names[1])
    if left_cleaning and right_plane:
        return True
    return left_plane and right_cleaning
def _has_yes_nail_tool_exact_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Check whether two "yes" / 德悅氏 nail-tool listings line up exactly.

    Returns True when both names contain the same known product line together
    with a shared size (and a shared option where the line requires one), or
    when both are plain nail clippers sharing a size and a finish.
    """
    if not ({"yes", "德悅氏"} & (left.brand_tokens & right.brand_tokens)):
        return False
    left_text = left.searchable_name
    right_text = right.searchable_name
    pair_text = f"{left_text} {right_text}"
    if not any(term in pair_text for term in ("指甲剪", "銼刀", "腳皮銼", "拋光棒")):
        return False
    # Each entry: (line name, accepted sizes, options of which one must be
    # shared). An empty options tuple means no option is required.
    exact_lines = (
        ("指甲剪附除垢銼刀", ("8cm",), ("亮面", "霧面")),
        ("腳皮銼腳板", ("23.5cm",), ()),
        ("藍寶石銼刀", ("9cm",), ("可收納",)),
        ("指甲拋光棒", ("17.5cm",), ("三面",)),
    )
    for line, sizes, required_options in exact_lines:
        if line not in left_text or line not in right_text:
            continue
        if not any(size in left_text and size in right_text for size in sizes):
            continue
        if required_options and not any(option in left_text and option in right_text for option in required_options):
            continue
        return True
    # Fallback for plain nail clippers.
    if "指甲剪" in left_text and "指甲剪" in right_text:
        # NOTE(review): the second entry is empty in this view, which makes
        # the exclusion always fire; a short term was probably lost in
        # extraction. Confirm against the repository copy.
        excluded = ("附除垢", "", "硬皮", "鋒利窄弧型")
        if any(term in left_text or term in right_text for term in excluded):
            return False
        if not any(size in left_text and size in right_text for size in ("6cm", "8cm")):
            return False
        finishes = ("亮面", "霧面", "不掉屑")
        return any(finish in left_text and finish in right_text for finish in finishes)
    return False
def _has_schick_razor_line_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect a Schick 舒芙 razor paired with a 舒綺 razor.

    Both identities must share a Schick brand token, and the combined
    searchable names must contain both 除毛刀 and 仕女. A conflict is reported
    when one name contains 舒芙 and the other 舒綺.
    """
    common_brands = left.brand_tokens & right.brand_tokens
    if not ({"schick", "舒適牌"} & common_brands):
        return False

    left_name = left.searchable_name
    right_name = right.searchable_name
    combined = f"{left_name} {right_name}"
    for required in ("除毛刀", "仕女"):
        if required not in combined:
            return False

    left_shufu, left_shuqi = "舒芙" in left_name, "舒綺" in left_name
    right_shufu, right_shuqi = "舒芙" in right_name, "舒綺" in right_name
    if left_shufu and right_shuqi:
        return True
    return left_shuqi and right_shufu
def _has_lancome_ultra_line_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect two Lancome listings drawn from two different keyword families.

    Both searchable names must mention "lancome" or 蘭蔻. A conflict is
    reported when one name contains a term from the first family (超極光 …)
    and the other a term from the second (超極限 …).
    """
    names = (left.searchable_name, right.searchable_name)
    if not all("lancome" in name or "蘭蔻" in name for name in names):
        return False

    def family_flags(name: str) -> tuple[bool, bool]:
        glow = any(term in name for term in ("超極光", "極光水", "晶露", "活粹晶露", "四重酸"))
        genifique = any(term in name for term in ("超極限", "肌因", "小黑瓶", "賦活露", "肌因精華"))
        return glow, genifique

    left_glow, left_genifique = family_flags(names[0])
    right_glow, right_genifique = family_flags(names[1])
    if left_glow and right_genifique:
        return True
    return left_genifique and right_glow
def _has_dr_hsieh_labsmart_serum_line_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect a LabSmart-line serum paired with a multi-repair serum.

    Both searchable names must contain "dr" or 達特醫 and both identities must
    have product type 精華. A conflict is reported when one side mentions a
    LabSmart term while the other mentions a repair term without also
    mentioning a LabSmart term.
    """
    left_name = left.searchable_name
    right_name = right.searchable_name
    for name in (left_name, right_name):
        if "dr" not in name and "達特醫" not in name:
            return False
    if not (left.product_type == "精華" and right.product_type == "精華"):
        return False

    def line_flags(name: str) -> tuple[bool, bool]:
        labsmart = any(term in name for term in ("labsmart", "hi tech", "hi-tech", "classic"))
        repair = any(
            term in name
            for term in ("神經醯胺多重修復", "多重修復保濕精華", "多重修復保濕精華液")
        )
        return labsmart, repair

    left_labsmart, left_repair = line_flags(left_name)
    right_labsmart, right_repair = line_flags(right_name)
    if left_labsmart and right_repair and not right_labsmart:
        return True
    return right_labsmart and left_repair and not left_labsmart
def _has_cotton_swab_variant_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect two cotton-swab listings naming different variants.

    Both searchable names must contain 棉棒. A conflict is reported when each
    side names at least one of the known variants (細軸, 黑色) and the two
    sets are disjoint.
    """
    names = (left.searchable_name, right.searchable_name)
    if any("棉棒" not in name for name in names):
        return False

    left_found, right_found = (
        {variant for variant in ("細軸", "黑色") if variant in name} for name in names
    )
    if not left_found or not right_found:
        return False
    return left_found.isdisjoint(right_found)
def _has_kanebo_milano_powder_perfume_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect a Kanebo Milano Collection powder paired with a fragrance.

    Both identities must share a Kanebo brand token and both searchable names
    must mention "milano", 米蘭 or "collection". A conflict is reported when
    one name contains a powder term and the other a fragrance term.
    """
    common_brands = left.brand_tokens & right.brand_tokens
    if not ({"kanebo", "佳麗寶"} & common_brands):
        return False

    names = (left.searchable_name, right.searchable_name)
    collection_markers = ("milano", "米蘭", "collection")
    if not all(any(marker in name for marker in collection_markers) for name in names):
        return False

    def category_flags(name: str) -> tuple[bool, bool]:
        powder = any(term in name for term in ("蜜粉", "粉餅"))
        fragrance = any(term in name for term in ("香水", "淡香精", "淡香水", "perfume"))
        return powder, fragrance

    left_powder, left_fragrance = category_flags(names[0])
    right_powder, right_fragrance = category_flags(names[1])
    if left_powder and right_fragrance:
        return True
    return right_powder and left_fragrance
def _has_hoi_candle_line_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect two "hoi" candles that belong to different named lines.

    Both identities must carry the "hoi" brand token and both searchable names
    must contain 蠟燭. A conflict is reported when one name contains 日京山風
    and the other a lab-line term.
    """
    if "hoi" not in (left.brand_tokens & right.brand_tokens):
        return False

    names = (left.searchable_name, right.searchable_name)
    if any("蠟燭" not in name for name in names):
        return False

    def line_flags(name: str) -> tuple[bool, bool]:
        day_mountain = any(term in name for term in ("日京山風",))
        lab = any(term in name for term in ("hoi!lab", "hoilab", "實驗室香氛", "經典篇"))
        return day_mountain, lab

    left_day, left_lab = line_flags(names[0])
    right_day, right_lab = line_flags(names[1])
    if left_day and right_lab:
        return True
    return right_day and left_lab
def _has_aroma_scent_variant_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect two aroma product listings that refer to different scents.

    The check is skipped (False) when the combined names mention hand cream or
    wax-lamp terms, when they mention none of the aroma product keywords, or
    when either side is a multi-variant catalog listing.

    When both sides expose explicit variant options, the result is whether
    those options fail to overlap. Otherwise the names are scanned for known
    scent words: a conflict is reported when only one side specifies any
    variant or scent, or when both name scents and share none.
    """
    left_name = left.searchable_name
    right_name = right.searchable_name
    combined = f"{left_name} {right_name}"

    for excluded in ("護手霜", "融蠟燈", "蠟燭暖燈"):
        if excluded in combined:
            return False

    aroma_keywords = (
        "香氛固體凝膠",
        "香氛凝膠",
        "空氣芳香劑",
        "車用香氛",
        "車用擴香",
        "擴香蕊",
        "擴香罐",
        "香薰蠟燭",
        "香氛蠟燭",
        "蠟燭",
        "滾珠精油",
        "香氛精油",
        "植物精油",
    )
    if all(keyword not in combined for keyword in aroma_keywords):
        return False

    if _is_multi_variant_catalog_listing(left) or _is_multi_variant_catalog_listing(right):
        return False

    left_options = _explicit_variant_option_tokens(left)
    right_options = _explicit_variant_option_tokens(right)
    if left_options and right_options:
        # Explicit options on both sides decide the outcome on their own.
        return not bool(left_options & right_options)

    known_scents = (
        "藤蔓果園",
        "清新花園",
        "白麝香",
        "黑麝香",
        "寶貝粉香",
        "青檸羅勒",
        "炭木香",
        "無花果",
        "白茶蘭花",
        "白茶",
        "檸檬草",
        "茶樹",
        "鼠尾草",
        "海鹽",
        "橙花",
        "薄荷",
        "杏仁",
        "薰衣草",
        "茉莉",
        "櫻花",
        "繡球花",
        "玫瑰",
        "雪松",
        "檀香",
    )
    left_scents = {scent for scent in known_scents if scent in left_name}
    right_scents = {scent for scent in known_scents if scent in right_name}

    left_is_specific = bool(left_options or left_scents)
    right_is_specific = bool(right_options or right_scents)
    if left_is_specific != right_is_specific:
        return True
    return bool(left_scents and right_scents and left_scents.isdisjoint(right_scents))
def _has_unknown_scent_variant_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect differing free-text variant descriptors on scented products.

    Applies to hand creams whose combined names mention a fragrance word, and
    to anything mentioning 擴香瓶. Lamp, car, freshener and diffuser-machine
    listings are excluded, as are multi-variant catalog listings.

    After removing shared descriptors, only unshared descriptors of 2-6
    characters that contain no category noise are compared. A conflict is
    reported when both sides keep at least one such descriptor and no pair of
    them is equal or contained in one another.
    """
    combined = f"{left.searchable_name} {right.searchable_name}"

    excluded_terms = (
        ("暖燈", "融蠟燈", "融燭燈", "香氛燈"),
        ("香氛誘霜", "車用", "芳香劑", "香氛機", "擴香儀", "噴香機"),
    )
    for group in excluded_terms:
        if any(term in combined for term in group):
            return False

    scented_hand_cream = "護手霜" in combined and any(
        term in combined for term in ("芬香", "香味", "香氣", "精油")
    )
    if not (scented_hand_cream or "擴香瓶" in combined):
        return False

    if _is_multi_variant_catalog_listing(left) or _is_multi_variant_catalog_listing(right):
        return False

    left_all = _variant_descriptors(left)
    right_all = _variant_descriptors(right)
    if not (left_all and right_all):
        return False

    common = _shared_variant_descriptors(left, right)
    noise_words = ("護手霜", "擴香瓶", "精油芬香", "經典擴香")

    def meaningful(pool: set[str]) -> set[str]:
        # Keep short descriptors that are not category words.
        return {
            item
            for item in pool
            if 2 <= len(item) <= 6 and not any(noise in item for noise in noise_words)
        }

    left_only = meaningful(left_all - common)
    right_only = meaningful(right_all - common)
    if not left_only or not right_only:
        return False

    overlapping = any(
        mine == theirs or mine in theirs or theirs in mine
        for mine in left_only
        for theirs in right_only
    )
    return not overlapping
def _has_nail_polish_color_name_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect two nail-polish listings that name different colours.

    Applies only when the combined names mention a nail-polish keyword and the
    identities share no model token. When both sides carry nail-polish model
    codes, the result is whether those codes are disjoint. Otherwise, unless
    either side is a multi-variant catalog listing, the unshared variant
    descriptors are compared: a conflict is reported when both sides have some
    and none is a substring of one on the other side.
    """
    combined = f"{left.searchable_name} {right.searchable_name}"
    if all(term not in combined for term in ("指甲油", "指彩", "美甲彩繪")):
        return False
    if _shared_model_tokens(left, right):
        return False

    left_codes = _nail_polish_model_codes(left)
    right_codes = _nail_polish_model_codes(right)
    if left_codes and right_codes:
        return left_codes.isdisjoint(right_codes)

    if _is_multi_variant_catalog_listing(left) or _is_multi_variant_catalog_listing(right):
        return False

    left_all = _variant_descriptors(left)
    right_all = _variant_descriptors(right)
    if not (left_all and right_all):
        return False

    common = _shared_variant_descriptors(left, right)
    left_only = left_all - common
    right_only = right_all - common
    if not (left_only and right_only):
        return False

    overlapping = any(
        mine in theirs or theirs in mine
        for mine in left_only
        for theirs in right_only
    )
    return not overlapping
def _has_aroma_lamp_style_selection_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Report differing lamp styles between two Les Nez wax-lamp listings.

    Both identities must share one of the brand tokens and both searchable
    names must contain a wax-lamp keyword. Each name is mapped onto the known
    style names through their aliases; the result is True whenever the two
    style sets differ (including when only one side names a style).
    """
    common_brands = left.brand_tokens & right.brand_tokens
    if not ({"les", "nez", "香鼻子"} & common_brands):
        return False

    lamp_terms = ("融蠟燈", "融燭燈", "蠟燭暖燈", "香氛燈")
    names = (left.searchable_name, right.searchable_name)
    # Requiring a lamp term on each side also guarantees one in the pair text.
    for name in names:
        if not any(term in name for term in lamp_terms):
            return False

    style_aliases = {
        "流金歲月": ("流金歲月",),
        "暮光琥珀": ("暮光琥珀",),
        "閃耀琥珀": ("閃耀琥珀",),
        "星夜": ("星夜款", "星夜"),
        "流光玫瑰金": ("流光玫瑰金", "玫瑰金"),
        "土耳其風": ("土耳其風",),
        "手工拼貼玻璃": ("手工拼貼玻璃",),
        "手工玻璃": ("手工玻璃",),
        "北歐": ("北歐",),
        "水晶燈": ("水晶燈",),
    }

    def styles_in(name: str) -> set[str]:
        return {
            style
            for style, aliases in style_aliases.items()
            if any(alias in name for alias in aliases)
        }

    left_styles = styles_in(names[0])
    right_styles = styles_in(names[1])
    # A non-empty symmetric difference means some style is named on one side
    # only; it is empty both when the sets are equal and when both are empty.
    return bool(left_styles ^ right_styles)
def _has_core_ingredient_line_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect two oil / balm / cream listings built on different core ingredients.

    Each searchable name is mapped onto ingredient groups by keyword. A
    conflict is reported when both sides match at least one group and no group
    is shared.
    """
    pair_text = f"{left.searchable_name} {right.searchable_name}"
    # Category gate on the combined text.
    # NOTE(review): one entry of this tuple is empty in this view, so the gate
    # can never reject; a short keyword was probably lost in extraction.
    # Confirm against the repository copy.
    if not any(term in pair_text for term in ("油膏", "護膚油", "身體油", "精油", "基礎油", "按摩油", "甜杏仁油", "酪梨油", "", "乳霜")):
        return False
    # Group name -> keywords (Chinese and English spellings) that select it.
    ingredient_groups = {
        "coconut_oil": ("椰子油", "coconut"),
        "shea_butter": ("乳木果油", "shea"),
        "sweet_almond_oil": ("甜杏仁油", "sweet almond"),
        "apricot_kernel_oil": ("杏桃核仁油", "杏核仁油", "apricot kernel"),
        "avocado_oil": ("酪梨油", "avocado"),
    }
    left_groups = {
        group
        for group, terms in ingredient_groups.items()
        if any(term in left.searchable_name for term in terms)
    }
    right_groups = {
        group
        for group, terms in ingredient_groups.items()
        if any(term in right.searchable_name for term in terms)
    }
    return bool(left_groups and right_groups and not (left_groups & right_groups))
def _has_clarins_body_oil_line_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect two Clarins body oils that belong to different lines.

    Both identities must share a Clarins brand token and the combined names
    must mention a body-oil keyword. A conflict is reported when both names
    map onto a line group (``contour_lightweight`` / ``tonic_body``) and the
    two group sets are disjoint.
    """
    common_brands = left.brand_tokens & right.brand_tokens
    if not ({"clarins", "克蘭詩"} & common_brands):
        return False

    left_name = left.searchable_name
    right_name = right.searchable_name
    combined = f"{left_name} {right_name}"
    if all(term not in combined for term in ("護理油", "身體油", "美體油", "調和護理油")):
        return False

    line_keywords = {
        "contour_lightweight": ("輕盈美體", "美體護理油", "contour"),
        "tonic_body": ("身體調和", "調和護理油", "孕期身體調和", "tonic"),
    }

    def lines_in(name: str) -> set[str]:
        return {
            line
            for line, keywords in line_keywords.items()
            if any(keyword in name for keyword in keywords)
        }

    left_lines = lines_in(left_name)
    right_lines = lines_in(right_name)
    if not left_lines or not right_lines:
        return False
    return left_lines.isdisjoint(right_lines)
def _has_branded_powder_line_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect two 港香蘭 body powders that belong to different named lines.

    Both identities must share the 港香蘭 brand token and both searchable
    names must contain 爽身粉. A conflict is reported when each side names one
    of the known lines (漢本, 艾魔菈) and they share none.
    """
    common_brands = left.brand_tokens & right.brand_tokens
    if not ({"港香蘭"} & common_brands):
        return False

    names = (left.searchable_name, right.searchable_name)
    if any("爽身粉" not in name for name in names):
        return False

    left_lines, right_lines = (
        {line for line in ("漢本", "艾魔菈") if line in name} for name in names
    )
    if not left_lines or not right_lines:
        return False
    return left_lines.isdisjoint(right_lines)
def _has_cleanser_lotion_line_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect a cleanser paired with a lotion within the same brand.

    Requires a shared brand token and a positive ``_has_overlapping_base_spec``
    result. A conflict is reported when one searchable name contains a
    cleanser term and the other a lotion term.
    """
    if not (left.brand_tokens & right.brand_tokens):
        return False
    if not _has_overlapping_base_spec(left, right):
        return False

    def category_flags(name: str) -> tuple[bool, bool]:
        cleanser = any(
            term in name
            for term in ("潔膚露", "潔膚", "潔淨露", "潔面", "洗面乳", "cleanser")
        )
        lotion = any(
            term in name
            for term in ("修護乳", "乳液", "身體乳", "潤膚乳", "lotion")
        )
        return cleanser, lotion

    left_cleanser, left_lotion = category_flags(left.searchable_name)
    right_cleanser, right_lotion = category_flags(right.searchable_name)
    if left_cleanser and right_lotion:
        return True
    return right_cleanser and left_lotion
def _selection1990_wax_lamp_design_groups(identity: ProductIdentity) -> set[str]:
    """Map keywords in the searchable name onto wax-lamp design groups.

    Returns:
        Any subset of ``half_dome``, ``bendable``, ``wood_base`` and
        ``nordic``; empty when no keyword occurs.
    """
    name = identity.searchable_name
    keyword_table = (
        ("half_dome", ("現代簡約半圓罩融燭燈", "半圓罩")),
        ("bendable", ("歐式可彎融燭燈", "可彎融燭燈")),
        ("wood_base", ("韓風原木底座融燭燈", "原木底座融燭燈")),
        ("nordic", ("北歐簡樸融蠟燈", "北歐簡樸")),
    )
    return {
        group
        for group, keywords in keyword_table
        if any(keyword in name for keyword in keywords)
    }
def _has_selection1990_wax_lamp_design_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect two "1990 選物" wax lamps of different designs.

    Both identities must carry both brand tokens ("1990" and 選物) and the
    combined names must mention a wax-lamp keyword. A conflict is reported
    when both sides resolve to a design group and the groups are disjoint.
    """
    common_brands = left.brand_tokens & right.brand_tokens
    if not {"1990", "選物"}.issubset(common_brands):
        return False

    combined = f"{left.searchable_name} {right.searchable_name}"
    if all(term not in combined for term in ("融燭燈", "蠟燭暖燈", "融蠟燈")):
        return False

    left_designs = _selection1990_wax_lamp_design_groups(left)
    right_designs = _selection1990_wax_lamp_design_groups(right)
    if not left_designs or not right_designs:
        return False
    return left_designs.isdisjoint(right_designs)
def _has_hooome_wax_lamp_design_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Report differing material/design words between two "hooome" wax lamps.

    Both identities must carry the "hooome" brand token and the combined names
    must mention a wax-lamp keyword. The result is True whenever the sets of
    design words found in the two names differ.
    """
    if "hooome" not in (left.brand_tokens & right.brand_tokens):
        return False

    left_name = left.searchable_name
    right_name = right.searchable_name
    combined = f"{left_name} {right_name}"
    if all(term not in combined for term in ("香氛蠟燭暖燈", "蠟燭暖燈", "融蠟燈")):
        return False

    design_words = ("大理石", "雲石", "原木", "半圓罩", "陶瓷", "玻璃", "水晶", "金屬", "鐵藝")
    left_designs = {word for word in design_words if word in left_name}
    right_designs = {word for word in design_words if word in right_name}
    # Unequal sets imply at least one side is non-empty.
    return left_designs != right_designs
def _standalone_size_letter_tokens(identity: ProductIdentity) -> set[str]:
    """Return the standalone size letters (s, m, l) found in the searchable name.

    A letter counts only when it is not directly adjacent to another ASCII
    letter or digit. Matching is case-insensitive; results are lowercased.
    """
    size_letter = re.compile(r"(?<![a-z0-9])([sml])(?![a-z0-9])", re.I)
    return {letter.lower() for letter in size_letter.findall(identity.searchable_name)}
def _has_wax_lamp_size_letter_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect two wax-lamp listings that state different size letters.

    Applies only when the combined names mention a wax-lamp keyword. A
    conflict is reported when both sides state at least one standalone size
    letter and they share none.
    """
    combined = f"{left.searchable_name} {right.searchable_name}"
    if all(term not in combined for term in ("香氛蠟燭暖燈", "蠟燭暖燈", "融蠟燈")):
        return False

    left_letters = _standalone_size_letter_tokens(left)
    right_letters = _standalone_size_letter_tokens(right)
    if not left_letters or not right_letters:
        return False
    return not (left_letters & right_letters)
def _has_nitori_diffuser_model_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect two Nitori aroma diffusers that carry different model codes.

    Both identities must share a Nitori brand token and both searchable names
    must contain 香氛噴霧器. Model codes are the union of
    ``_extract_model_tokens``, standalone 3-5 digit numbers, and
    letter-then-digit codes that are not spec-like. A conflict is reported
    when both sides have codes and share none.
    """
    common_brands = left.brand_tokens & right.brand_tokens
    if not ({"nitori", "宜得利家居"} & common_brands):
        return False
    for name in (left.searchable_name, right.searchable_name):
        if "香氛噴霧器" not in name:
            return False

    def collect_models(identity: ProductIdentity) -> set[str]:
        name = identity.searchable_name
        digit_codes = set(re.findall(r"(?<![a-z0-9])\d{3,5}(?![a-z0-9])", name))
        alnum_codes: set[str] = set()
        for hit in re.finditer(r"(?<![a-z0-9])([a-z]{1,4}\d{2,}[a-z0-9-]*)(?![a-z0-9])", name, re.I):
            code = hit.group(1).lower()
            if not _is_spec_like_latin_token(code):
                alnum_codes.add(code)
        return _extract_model_tokens(name) | digit_codes | alnum_codes

    left_codes = collect_models(left)
    right_codes = collect_models(right)
    if not left_codes or not right_codes:
        return False
    return not (left_codes & right_codes)
def _makeup_shade_tokens(identity: ProductIdentity) -> set[str]:
    """Collect shade identifiers from the identity's searchable name.

    Starts from ``_explicit_variant_option_tokens`` and adds:

    * number + English shade name pairs (e.g. "21 ivory"), contributing both
      the number and the name with spaces turned into underscores;
    * short alphanumeric codes directly followed by at least two CJK
      characters, unless the code looks like a quantity with a unit.
    """
    name = identity.searchable_name
    shades = set(_explicit_variant_option_tokens(identity))

    shade_pattern = (
        r"(?<![a-z0-9])(?:#|no\.?|色號|號色)?\s*(\d{1,3})\s+"
        r"(rosy ivory|ivory|beige|sand|fair|light|medium|porcelain|rose)(?![a-z0-9])"
    )
    for number, shade_name in re.findall(shade_pattern, name, re.I):
        shades.add(number.lower())
        shades.add(shade_name.lower().replace(" ", "_"))

    code_pattern = r"(?<![a-z0-9])([a-z]?\d{1,3}[a-z]?)(?=\s*[\u4e00-\u9fff]{2,})"
    for raw_code in re.findall(code_pattern, name, re.I):
        code = re.sub(r"[^a-z0-9]", "", raw_code.lower())
        # Skip empty codes and quantity-like values such as "30g" or "5ml".
        if not code or re.fullmatch(r"\d+(?:g|m|l|ml|mg)", code):
            continue
        shades.add(code)
    return shades
def _has_makeup_shade_selection_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Report when exactly one of two makeup listings specifies a shade.

    Applies only when the combined names mention a shade-sensitive makeup
    keyword.
    """
    combined = f"{left.searchable_name} {right.searchable_name}"
    shade_sensitive = (
        "氣墊粉霜", "粉底", "粉霜", "蜜粉", "唇釉", "唇膏", "唇蜜",
        "染眉膏", "眉筆", "眉膏", "眉彩", "眼線", "遮瑕",
    )
    if all(term not in combined for term in shade_sensitive):
        return False

    left_has_shade = bool(_makeup_shade_tokens(left))
    right_has_shade = bool(_makeup_shade_tokens(right))
    return left_has_shade != right_has_shade
def _commercial_condition_terms(identity: ProductIdentity) -> set[str]:
    """Classify sale-condition keywords found in the normalized name.

    Returns:
        Any subset of ``expiry_sensitive``, ``box_damage`` and
        ``clearance_condition``; empty when no keyword occurs.
    """
    name = identity.normalized_name
    keyword_table = (
        ("expiry_sensitive", ("即期品", "臨期", "短效", "短效期", "效期", "保存期限", "有效期限")),
        ("box_damage", ("盒損", "盒損品", "外盒損", "外盒瑕疵")),
        ("clearance_condition", ("福利品", "瑕疵品")),
    )
    return {
        label
        for label, keywords in keyword_table
        if any(keyword in name for keyword in keywords)
    }
def _has_commercial_condition_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Report when the two identities carry different sale-condition labels."""
    left_conditions = _commercial_condition_terms(left)
    right_conditions = _commercial_condition_terms(right)
    # Unequal sets imply at least one side has a condition label.
    return left_conditions != right_conditions
def _has_relove_private_cleanser_variant_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Report when only one of two matching cleanser gels is a brightening variant.

    Applies only when ``_is_relove_cleanser_gel_like`` accepts the pair.
    """
    if not _is_relove_cleanser_gel_like(left, right):
        return False

    def is_brightening(name: str) -> bool:
        keywords = ("傳明酸", "淨白", "美白", "亮白", "菸鹼醯胺", "niacinamide")
        return any(keyword in name for keyword in keywords)

    return is_brightening(left.searchable_name) != is_brightening(right.searchable_name)
def _has_makeup_catalog_selection_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Report a shade-selection gap when a makeup catalog listing is involved.

    Applies only when the combined names mention a shade-sensitive makeup
    keyword and at least one side is a catalog or delimited variant listing.
    The gap is cleared only when both sides carry shade tokens that
    ``_variant_options_overlap`` accepts as overlapping.
    """
    combined = f"{left.searchable_name} {right.searchable_name}"
    shade_sensitive = (
        "遮瑕蜜",
        "遮瑕",
        "粉底",
        "粉霜",
        "氣墊",
        "蜜粉",
        "腮紅",
        "眼線",
        "眉筆",
        "染眉膏",
        "唇膏",
        "唇釉",
        "唇蜜",
    )
    if all(term not in combined for term in shade_sensitive):
        return False

    involves_catalog = (
        _is_catalog_or_delimited_variant_listing(left)
        or _is_catalog_or_delimited_variant_listing(right)
    )
    if not involves_catalog:
        return False

    left_shades = _makeup_shade_tokens(left)
    right_shades = _makeup_shade_tokens(right)
    return not (left_shades and right_shades and _variant_options_overlap(left_shades, right_shades))
def _is_candle_scent_catalog_listing(identity: ProductIdentity) -> bool:
    """Tell whether a listing offers several candle scents to choose from.

    True for multi-variant catalog listings, and for names containing a
    "<number> 種香味 / 香氣 / 味道" phrase.
    """
    name = identity.searchable_name
    if _is_multi_variant_catalog_listing(identity):
        return True
    return re.search(r"\d+\s*種(?:香味|香氣|味道)", name) is not None
def _has_candle_catalog_selection_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Report when only one of two candle listings is a scent catalog.

    Applies only when the combined names mention a candle keyword and no
    wax-lamp keyword.
    """
    combined = f"{left.searchable_name} {right.searchable_name}"
    if all(term not in combined for term in ("香氛蠟燭", "大豆蠟燭", "蠟燭")):
        return False
    if any(term in combined for term in ("融蠟燈", "融燭燈", "蠟燭燈")):
        return False

    left_is_catalog = _is_candle_scent_catalog_listing(left)
    right_is_catalog = _is_candle_scent_catalog_listing(right)
    return left_is_catalog != right_is_catalog
def _has_loreal_serum_variant_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Report a variant gap between two L'Oreal hyaluronic serum listings.

    A L'Oreal brand token on either side is enough, and the combined names
    must contain 玻尿酸瞬效保濕. When at least one side names a known variant,
    the result is True if the variant sets differ or if only one side is a
    catalog or delimited variant listing.
    """
    any_brand = left.brand_tokens | right.brand_tokens
    if not ({"loreal", "巴黎萊雅"} & any_brand):
        return False

    left_name = left.searchable_name
    right_name = right.searchable_name
    if "玻尿酸瞬效保濕" not in f"{left_name} {right_name}":
        return False

    variants = ("啵啵精華水", "液態紫熨斗", "水光精華", "修護晶露", "保濕水光")
    left_variants = {variant for variant in variants if variant in left_name}
    right_variants = {variant for variant in variants if variant in right_name}
    if not left_variants and not right_variants:
        return False

    if left_variants != right_variants:
        return True
    return _is_catalog_or_delimited_variant_listing(left) != _is_catalog_or_delimited_variant_listing(right)
def _has_sebamed_shampoo_variant_catalog_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Report a variant or catalog gap between two Sebamed shampoo listings.

    A Sebamed brand token on either side is enough, and both searchable names
    must contain 洗髮乳. The result is True when only one side is a catalog or
    delimited variant listing, or when the sets of variant words differ.
    """
    any_brand = left.brand_tokens | right.brand_tokens
    if not ({"sebamed", "施巴"} & any_brand):
        return False

    left_name = left.searchable_name
    right_name = right.searchable_name
    if "洗髮乳" not in left_name or "洗髮乳" not in right_name:
        return False

    if _is_catalog_or_delimited_variant_listing(left) != _is_catalog_or_delimited_variant_listing(right):
        return True

    variants = ("溫和", "油性抗屑", "抗屑", "乾性", "敏感")
    left_variants = {variant for variant in variants if variant in left_name}
    right_variants = {variant for variant in variants if variant in right_name}
    # Unequal sets imply at least one side names a variant.
    return left_variants != right_variants
def _has_schick_2in1_model_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Report when only one of two Schick 舒綺 美型刀 listings matches the 2-in-1 pattern.

    Both identities must share a Schick brand token, and the combined names
    must contain both 舒綺 and 美型刀.
    """
    common_brands = left.brand_tokens & right.brand_tokens
    if not ({"schick", "舒適牌"} & common_brands):
        return False

    combined = f"{left.searchable_name} {right.searchable_name}"
    if not ("舒綺" in combined and "美型刀" in combined):
        return False

    two_in_one = re.compile(r"2\s*(?:-?in-?|合)?\s*1", re.I)
    left_flag = two_in_one.search(left.searchable_name) is not None
    right_flag = two_in_one.search(right.searchable_name) is not None
    return left_flag != right_flag
def _has_taicend_protection_form_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Report when two Taicend protection products name different product forms.

    Both identities must share a Taicend brand token and the combined names
    must mention a protective film or spray keyword. Pairs where both names
    contain 屁屁噴 are never flagged. Otherwise the result is True when the
    sets of form terms found in the two names differ.
    """
    common_brands = left.brand_tokens & right.brand_tokens
    if not ({"taicend", "泰陞"} & common_brands):
        return False

    left_name = left.searchable_name
    right_name = right.searchable_name
    combined = f"{left_name} {right_name}"
    if not any(term in combined for term in ("保護膜", "保護噴霧", "液態皮膚保護膜")):
        return False
    if "屁屁噴" in left_name and "屁屁噴" in right_name:
        return False

    form_terms = ("寶貝液體保護膜", "液態皮膚保護膜", "皮膚保護噴霧", "保護噴霧")
    left_forms = {term for term in form_terms if term in left_name}
    right_forms = {term for term in form_terms if term in right_name}
    # Unequal sets imply at least one side names a form.
    return left_forms != right_forms
def _has_catalog_specific_variant_selection_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Report a catalog-vs-single-variant mismatch for selected categories.

    The check only runs when the combined searchable names mention one of the
    category keywords below. It then reports a gap when exactly one side is a
    catalog/delimited variant listing.
    """
    gated_categories = (
        "身體去角質",
        "美體主張",
        "私密潔浴露",
        "私密潔浴",
        "私密防護慕絲",
        "私密慕絲",
        "慕絲",
        "嬰兒潤膚乳",
        "定妝噴霧",
        "染眉膏",
        "眼線膠筆",
        "粉餅盒",
        "遮瑕蜜",
        "護手霜",
        "護唇膏",
        "護唇棒",
        "唇釉",
        "唇膏",
        "蜜粉",
        "防曬素顏霜",
        "車用香氛",
        "車用擴香",
        "車用擴香蕊",
        "香氛擴香罐",
        "擴香瓶",
        "擴香罐",
        "擴香蕊",
        "水性指甲油",
        "指甲油",
        "足膜",
        "泡澡入浴劑",
        "入浴劑",
        "融蠟小夜燈",
        "融蠟燈",
        "滋養霜",
    )
    combined = f"{left.searchable_name} {right.searchable_name}"
    for category in gated_categories:
        if category in combined:
            break
    else:
        # None of the gated categories is mentioned by either listing.
        return False
    return _is_catalog_or_delimited_variant_listing(left) != _is_catalog_or_delimited_variant_listing(right)
def _has_bath_additive_variant_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Report whether two same-brand bath additives name disjoint variants.

    Needs a bath keyword in the combined names and at least one brand token
    shared by both sides. A gap exists only when each side names at least one
    variant keyword and the two sides have none in common.
    """
    variant_terms = ("馨香", "懷舊", "橘盒", "綠盒", "粉盒", "藍盒")
    left_name, right_name = left.searchable_name, right.searchable_name
    combined = f"{left_name} {right_name}"
    mentions_bath = False
    for keyword in ("入浴劑", "泡澡錠", "泡澡包", "泡澡"):
        if keyword in combined:
            mentions_bath = True
            break
    if not mentions_bath:
        return False
    if not (left.brand_tokens & right.brand_tokens):
        return False
    left_variants = {term for term in variant_terms if term in left_name}
    right_variants = {term for term in variant_terms if term in right_name}
    if not left_variants or not right_variants:
        return False
    return left_variants.isdisjoint(right_variants)
def _has_taicend_baby_spray_equivalence(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Treat two Taicend baby-spray listings as equivalent when specs overlap.

    Both sides must carry a Taicend brand token and the baby-spray keyword;
    the final verdict is whatever ``_has_overlapping_base_spec`` returns.
    """
    taicend_aliases = {"taicend", "泰陞"}
    for identity in (left, right):
        if not (identity.brand_tokens & taicend_aliases):
            return False
    for identity in (left, right):
        if "屁屁噴" not in identity.searchable_name:
            return False
    return _has_overlapping_base_spec(left, right)
def _has_seche_vite_top_coat_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Check that both listings are the Seche Vite quick-dry top coat.

    Between them the two sides must cover both brand words, each side must
    carry at least one of the words, and both names must contain the
    quick-dry top coat keyword.
    """
    brand_words = {"seche", "vite"}
    if not brand_words <= (left.brand_tokens | right.brand_tokens):
        return False
    for identity in (left, right):
        if brand_words.isdisjoint(identity.brand_tokens):
            return False
        if "快乾亮油" not in identity.searchable_name:
            return False
    return True
def _has_xiaomi_s101_shaver_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Check that both listings are the Xiaomi S101 electric shaver.

    Each side needs a Xiaomi brand alias, and both names must contain the
    model code and the electric-shaver keyword.
    """
    xiaomi_aliases = {"xiaomi", "小米", "小米有品"}
    for identity in (left, right):
        if xiaomi_aliases.isdisjoint(identity.brand_tokens):
            return False
    return all(
        keyword in identity.searchable_name
        for keyword in ("s101", "電動刮鬍刀")
        for identity in (left, right)
    )
def _has_hinoki_roller_oil_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Check that both listings are the same-brand cypress oil roller bottle.

    Each side must carry the brand token and both product keywords; the final
    verdict is whatever ``_has_overlapping_base_spec`` returns.
    """
    for identity in (left, right):
        if "檜山坊" not in identity.brand_tokens:
            return False
    for keyword in ("檜木精油", "滾珠瓶"):
        for identity in (left, right):
            if keyword not in identity.searchable_name:
                return False
    return _has_overlapping_base_spec(left, right)
def _has_brush_baby_wildones_toothbrush_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Check that both listings are the Brush-Baby WildOnes electric toothbrush.

    Each side must carry all three brand tokens, the electric-toothbrush
    keyword and the "0-10y" age marker.
    """
    required_brand = {"brush", "baby", "wildones"}

    def qualifies(identity: ProductIdentity) -> bool:
        name = identity.searchable_name
        return required_brand <= identity.brand_tokens and "電動牙刷" in name and "0-10y" in name

    return qualifies(left) and qualifies(right)
def _has_pshine_beauty_foot_file_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Check that both listings are the Beauty Foot double-sided foot file.

    Each side must carry all three brand tokens, the double-sided keyword, a
    callus/exfoliation keyword and a file/scrubber keyword.
    """
    required_brand = {"beauty", "shine", "foot"}

    def qualifies(identity: ProductIdentity) -> bool:
        name = identity.searchable_name
        return (
            required_brand <= identity.brand_tokens
            and "雙面" in name
            # NOTE(review): an empty-string membership test is always true
            # for a str; this literal may have lost its character.
            and "" in name
            and ("硬皮" in name or "去角質" in name)
            and ("磨砂棒" in name or "足搓棒" in name)
        )

    return qualifies(left) and qualifies(right)
def _has_baan_baby_lip_catalog_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Check that both listings are the Baan baby lip balm two-flavour catalog.

    Each side must carry a Baan brand alias and the lip balm product phrase,
    and the explicit variant options extracted from each side must include
    both flavours.
    """
    left_options = _explicit_variant_option_tokens(left)
    right_options = _explicit_variant_option_tokens(right)
    baan_aliases = {"baan", "貝恩"}
    flavours = {"原味", "草莓"}
    if not (left.brand_tokens & baan_aliases) or not (right.brand_tokens & baan_aliases):
        return False
    if "嬰兒修護唇膏" not in left.searchable_name or "嬰兒修護唇膏" not in right.searchable_name:
        return False
    return flavours <= left_options and flavours <= right_options
def _has_recipe_box_child_sunscreen_cushion_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Check that both listings are the Recipe Box children's sunscreen cushion.

    The two brand words only need to be covered by the union of both sides'
    brand tokens; the product phrase must appear in both names.
    """
    if not {"recipe", "box"} <= (left.brand_tokens | right.brand_tokens):
        return False
    return all("兒童防曬氣墊粉餅" in identity.searchable_name for identity in (left, right))
def _has_pavaruni_40_scent_oil_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Check that both listings are the Pavaruni 10 ml, 40-scent essential oil.

    Both sides must share the brand token, contain the essential-oil keyword
    and a 40-scent marker, and share a 10 ml volume. The natural-plant phrase
    only needs to appear in one of the two names.
    """
    names = (left.searchable_name, right.searchable_name)
    if "pavaruni" not in (left.brand_tokens & right.brand_tokens):
        return False
    if "天然植物" not in " ".join(names):
        return False
    if not all("精油" in name for name in names):
        return False
    if not _has_shared_volume(left, right, 10):
        return False
    return all("40香味" in name or "40種香味" in name for name in names)
def _has_pavaruni_20_scent_candle_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Check that both listings are the Pavaruni 450 g, 20-scent candle.

    Both sides must share the brand token, contain the scented-candle keyword
    and a 20-scent marker, and share a 450 g weight.
    """
    names = (left.searchable_name, right.searchable_name)
    if "pavaruni" not in (left.brand_tokens & right.brand_tokens):
        return False
    if not all("香氛蠟燭" in name for name in names):
        return False
    if not _has_shared_weight(left, right, 450):
        return False
    return all("20香味" in name or "20種香味" in name for name in names)
def _has_laundrin_tokyo_car_freshener_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Check that both listings are the single-pack Laundrin Tokyo car freshener.

    Both sides must share a Laundrin brand alias, contain the "tokyo",
    car-use and air-freshener keywords, and share a count of 1 with the unit
    string passed to ``_has_shared_count``.

    Returns:
        ``True`` when every condition holds, otherwise ``False``.
    """
    left_text = left.searchable_name
    right_text = right.searchable_name
    return (
        # bool() keeps the declared return type: a bare set intersection
        # would leak an empty set to the caller when the brand is absent.
        bool({"laundrin", "朗德林"} & (left.brand_tokens & right.brand_tokens))
        and "tokyo" in left_text
        and "tokyo" in right_text
        and "車用" in left_text
        and "車用" in right_text
        and "芳香劑" in left_text
        and "芳香劑" in right_text
        and _has_shared_count(left, right, 1, "")
    )
def _has_shared_count(left: ProductIdentity, right: ProductIdentity, count: int, unit: str) -> bool:
    """Return True when both identities list the exact ``(count, unit)`` pair."""
    wanted = (count, unit)
    for identity in (left, right):
        if wanted not in set(identity.counts):
            return False
    return True
def _has_shared_volume(left: ProductIdentity, right: ProductIdentity, volume_ml: float) -> bool:
    """Return True when each identity has a volume ``_close_number`` accepts for ``volume_ml``."""
    for identity in (left, right):
        if not any(_close_number(candidate, volume_ml) for candidate in identity.volumes_ml):
            return False
    return True
def _has_shared_weight(left: ProductIdentity, right: ProductIdentity, weight_g: float) -> bool:
    """Return True when each identity has a weight ``_close_number`` accepts for ``weight_g``."""
    for identity in (left, right):
        if not any(_close_number(candidate, weight_g) for candidate in identity.weights_g):
            return False
    return True
def _has_focused_low_score_exact_identity_line(left: ProductIdentity, right: ProductIdentity) -> str:
    """Return the label of the first hand-written product-line rule both listings satisfy.

    The rules are evaluated top to bottom and the first match wins, so the
    order of the ``if`` blocks is significant (a specific rule must precede
    any broader rule that would also accept the same pair). Most rules
    combine a brand-token test, product-line phrases that must appear in both
    searchable names, and a shared size and/or exact count alignment.

    Args:
        left: Identity of the first listing.
        right: Identity of the second listing.

    Returns:
        The rule label, or ``""`` when no rule matches.
    """
    left_text = left.searchable_name
    right_text = right.searchable_name
    # Concatenation of both names, for phrases that only one side needs to carry.
    pair_text = f"{left_text} {right_text}"
    # Lower-cased original names; some phrases are tested against these rather
    # than searchable_name (presumably because name cleanup removes them —
    # TODO confirm against the searchable_name builder).
    left_raw = left.original_name.lower()
    right_raw = right.original_name.lower()
    # Union of both sides' brand tokens, for rules where one side may lack them.
    brand_tokens = left.brand_tokens | right.brand_tokens
    if (
        "biodance" in (left.brand_tokens & right.brand_tokens)
        and "深層全效面膜" in left_text
        and "深層全效面膜" in right_text
        and "膠原蛋白" in pair_text
        and _has_shared_count(left, right, 4, "")
    ):
        return "biodance_deep_mask"
    # "Brandless" rules: exactly one side has any brand tokens at all.
    if (
        {"muji", "無印良品"} & brand_tokens
        and "精油芬香護手霜" in left_text
        and "精油芬香護手霜" in right_text
        and _has_shared_weight(left, right, 50)
        and bool(left.brand_tokens) != bool(right.brand_tokens)
    ):
        return "muji_aroma_hand_cream_brandless"
    if (
        {"herbacin", "德國小甘菊"} & brand_tokens
        and "小甘菊" in left_text
        and "小甘菊" in right_text
        and "護手霜" in left_text
        and "護手霜" in right_text
        and _has_shared_volume(left, right, 20)
        and bool(left.brand_tokens) != bool(right.brand_tokens)
    ):
        return "herbacin_classic_hand_cream_20ml_brandless"
    if (
        {"sab", "初淨肌"} & (left.brand_tokens & right.brand_tokens)
        and "私密防護舒緩噴霧" in left_text
        and "私密防護舒緩噴霧" in right_text
        and _has_shared_volume(left, right, 30)
    ):
        return "sab_private_spray"
    if (
        "lush" in (left.brand_tokens & right.brand_tokens)
        and "櫻之花身體噴霧" in left_text
        and "櫻之花身體噴霧" in right_text
        and _has_shared_volume(left, right, 200)
    ):
        return "lush_sakura_body_spray"
    if (
        {"kanebo", "佳麗寶"} & (left.brand_tokens & right.brand_tokens)
        and "coffret" in left_text
        and "coffret" in right_text
        and "光透立體眼線筆" in left_text
        and "光透立體眼線筆" in right_text
    ):
        return "kanebo_coffret_eyeliner"
    if (
        "artmis" in brand_tokens
        and "葳兒柔" in left_text
        and "葳兒柔" in right_text
        and "賦活凝膠" in left_text
        and "賦活凝膠" in right_text
        and _has_shared_volume(left, right, 35)
    ):
        return "artmis_virile_gel"
    if (
        "artmis" in brand_tokens
        and "私密清潔慕斯" in left_text
        and "私密清潔慕斯" in right_text
        and "金縷梅" in left_text
        and "金縷梅" in right_text
        and _has_shared_volume(left, right, 250)
    ):
        return "artmis_witch_hazel_private_mousse_250ml"
    if (
        "artmis" in brand_tokens
        and "私密清潔慕斯" in left_text
        and "私密清潔慕斯" in right_text
        and "蔓越莓" in left_text
        and "蔓越莓" in right_text
        and _has_shared_volume(left, right, 250)
    ):
        return "artmis_cranberry_private_mousse_250ml"
    # No brand-token test here: the brand word is looked up in the combined names.
    if (
        "powerman" in pair_text
        and "男性私密養護液" in left_text
        and "男性私密養護液" in right_text
        and _has_shared_volume(left, right, 30)
    ):
        return "playjoy_powerman_male_care_30ml"
    if (
        {"physiogel", "潔美淨"} & (left.brand_tokens & right.brand_tokens)
        and "ai冰鎮精華露" in left_text
        and "ai冰鎮精華露" in right_text
        and _has_shared_volume(left, right, 200)
        and _has_exact_count_alignment(left, right)
    ):
        return "physiogel_ai_ice_essence_200ml_2pack"
    if (
        {"ts6", "護一生"} & (left.brand_tokens & right.brand_tokens)
        and "緊彈水嫩凝膠" in left_text
        and "緊彈水嫩凝膠" in right_text
        and _has_shared_weight(left, right, 40)
    ):
        return "ts6_private_elastic_gel_40g"
    # The size 120 is accepted whether it was recorded as a weight or a volume.
    if (
        {"ts6", "護一生"} & (left.brand_tokens & right.brand_tokens)
        and "超美" in left_text
        and "超美" in right_text
        and "香氛誘霜" in left_text
        and "香氛誘霜" in right_text
        and (120.0 in set(left.weights_g) or 120.0 in set(left.volumes_ml))
        and (120.0 in set(right.weights_g) or 120.0 in set(right.volumes_ml))
    ):
        return "ts6_private_white_fragrance_cream_120"
    if (
        {"ts6", "護一生"} & (left.brand_tokens & right.brand_tokens)
        and "淨白植感慕斯" in left_text
        and "淨白植感慕斯" in right_text
        and _has_shared_weight(left, right, 180)
        and _has_exact_count_alignment(left, right)
    ):
        return "ts6_white_mousse_180g_3pack"
    if (
        {"ts6", "護一生"} & (left.brand_tokens & right.brand_tokens)
        and "沁涼潔淨慕斯" in left_text
        and "沁涼潔淨慕斯" in right_text
        and _has_shared_weight(left, right, 100)
        and _has_exact_count_alignment(left, right)
    ):
        return "ts6_cooling_clean_mousse_100g"
    if (
        {"ts6", "護一生"} & (left.brand_tokens & right.brand_tokens)
        and "蜜愛潤滑液" in left_text
        and "蜜愛潤滑液" in right_text
        and _has_shared_weight(left, right, 100)
        and _has_exact_count_alignment(left, right)
    ):
        return "ts6_lubricant_100g_3pack"
    # A pair carrying both kit components is labelled by this single-product
    # rule first; the kit rule below is only reached when this one fails.
    if (
        {"ts6", "護一生"} & (left.brand_tokens & right.brand_tokens)
        and "蜜桃煥白凝膠" in left_text
        and "蜜桃煥白凝膠" in right_text
        and _has_shared_weight(left, right, 45)
        and _has_exact_count_alignment(left, right)
    ):
        return "ts6_peach_bright_gel_45g_3pack"
    if (
        {"ts6", "護一生"} & (left.brand_tokens & right.brand_tokens)
        and "極淨白私密潔膚露" in left_text
        and "極淨白私密潔膚露" in right_text
        and "蜜桃煥白凝膠" in left_text
        and "蜜桃煥白凝膠" in right_text
        and _has_shared_weight(left, right, 250)
        and _has_shared_weight(left, right, 45)
    ):
        return "ts6_white_wash_peach_gel_kit"
    # Foot-mask rules reject multi-variant catalog listings on either side.
    if (
        {"beauty", "foot"} <= (left.brand_tokens & right.brand_tokens)
        and "足膜" in left_text
        and "足膜" in right_text
        and any(_has_shared_volume(left, right, volume) for volume in (25, 30))
        and _has_exact_count_alignment(left, right)
        and not _is_multi_variant_catalog_listing(left)
        and not _is_multi_variant_catalog_listing(right)
    ):
        return "beauty_foot_mask_exact_pack"
    if (
        {"kameria", "凱蜜菈"} & (left.brand_tokens & right.brand_tokens)
        and "足足稱奇" in left_text
        and "足足稱奇" in right_text
        and "積雪草" in left_text
        and "積雪草" in right_text
        and "足膜" in left_text
        and "足膜" in right_text
        and _has_shared_volume(left, right, 17)
        and _has_exact_count_alignment(left, right)
        and not _is_multi_variant_catalog_listing(left)
        and not _is_multi_variant_catalog_listing(right)
    ):
        return "kameria_centella_foot_mask_17ml_2pc"
    if (
        {"vaseline", "凡士林"} & (left.brand_tokens & right.brand_tokens)
        and "嬰兒高純修護凝膠" in left_text
        and "嬰兒高純修護凝膠" in right_text
        and _has_shared_weight(left, right, 368)
        and _has_exact_count_alignment(left, right)
    ):
        return "vaseline_baby_jelly_368g_3pack"
    if (
        {"derma", "丹麥德瑪"} & (left.brand_tokens & right.brand_tokens)
        and "寶寶" in left_text
        and "寶寶" in right_text
        and "洗髮沐浴露" in left_text
        and "洗髮沐浴露" in right_text
        and _has_shared_volume(left, right, 150)
    ):
        return "derma_baby_wash_150ml"
    if (
        {"derma", "丹麥德瑪"} & (left.brand_tokens & right.brand_tokens)
        and "寶寶" in left_text
        and "寶寶" in right_text
        and "洗髮沐浴露" in left_text
        and "洗髮沐浴露" in right_text
        and _has_shared_volume(left, right, 500)
    ):
        return "derma_baby_wash_500ml"
    if (
        {"clarins", "克蘭詩"} & (left.brand_tokens & right.brand_tokens)
        and "黃金亮眼萃" in left_text
        and "黃金亮眼萃" in right_text
        and _has_shared_volume(left, right, 20)
    ):
        return "clarins_double_serum_eye_20ml"
    if (
        {"cetaphil", "舒特膚"} & (left.brand_tokens & right.brand_tokens)
        and "長效潤膚乳" in left_text
        and "長效潤膚乳" in right_text
        and _has_shared_volume(left, right, 237)
    ):
        return "cetaphil_long_lotion_237ml"
    if (
        {"cetaphil", "舒特膚"} & (left.brand_tokens & right.brand_tokens)
        and "長效潤膚乳" in left_text
        and "長效潤膚乳" in right_text
        and _has_shared_volume(left, right, 473)
    ):
        return "cetaphil_long_lotion_473ml"
    if (
        {"cetaphil", "舒特膚"} & (left.brand_tokens & right.brand_tokens)
        and "長效潤膚霜" in left_text
        and "長效潤膚霜" in right_text
        and _has_shared_weight(left, right, 250)
    ):
        return "cetaphil_long_moisturizing_cream_250g"
    if (
        {"cetaphil", "舒特膚"} & (left.brand_tokens & right.brand_tokens)
        and "益膚康修護舒敏乳霜" in left_text
        and "益膚康修護舒敏乳霜" in right_text
        and _has_shared_weight(left, right, 227)
    ):
        return "cetaphil_ad_repair_cream_227g"
    if (
        {"nivea", "妮維雅"} & (left.brand_tokens & right.brand_tokens)
        and "妮維雅霜" in left_text
        and "妮維雅霜" in right_text
        and "隨身版" in left_text
        and "隨身版" in right_text
        and _has_shared_volume(left, right, 100)
    ):
        return "nivea_creme_100ml"
    if (
        "nailmatic" in (left.brand_tokens & right.brand_tokens)
        and "小精靈" in left_text
        and "小精靈" in right_text
        and "指甲油" in left_text
        and "指甲油" in right_text
    ):
        return "nailmatic_casper_polish"
    if (
        "小浪" in (left.brand_tokens & right.brand_tokens)
        and "智能感應自動噴香機" in left_text
        and "智能感應自動噴香機" in right_text
        and "補充液" in left_text
        and "補充液" in right_text
        and _has_shared_count(left, right, 3, "")
    ):
        return "xiaolang_spray_machine_refill_set"
    if (
        {"yunmi", "j10"} <= (left.brand_tokens & right.brand_tokens)
        and "濕度數顯智能加濕器" in left_text
        and "濕度數顯智能加濕器" in right_text
    ):
        return "yunmi_j10_humidifier"
    # Unlike the foot-mask rules, this one requires both sides to be catalogs.
    if (
        "aquiesse" in (left.brand_tokens & right.brand_tokens)
        and "香氛蠟燭" in left_text
        and "香氛蠟燭" in right_text
        and "5oz" in left_text
        and "5oz" in right_text
        and _is_multi_variant_catalog_listing(left)
        and _is_multi_variant_catalog_listing(right)
    ):
        return "aquiesse_5oz_candle_catalog"
    if (
        {"rejuran", "麗珠蘭"} & (left.brand_tokens & right.brand_tokens)
        and "麗駐蘭修復舒緩面膜" in left_text
        and "麗駐蘭修復舒緩面膜" in right_text
        and "5p" in left_text
        and "5p" in right_text
    ):
        return "rejuran_repair_mask_5p"
    if (
        {"shiseido", "資生堂"} & (left.brand_tokens & right.brand_tokens)
        and "新艷陽" in left_text
        and "新艷陽" in right_text
        and "水離子熱防禦" in left_text
        and "水離子熱防禦" in right_text
        and "隔離露" in left_text
        and "隔離露" in right_text
    ):
        return "shiseido_blue_sunscreen"
    if (
        "dhc" in pair_text
        and "純欖護唇膏" in left_text
        and "純欖護唇膏" in right_text
        and _has_shared_weight(left, right, 1.5)
    ):
        return "dhc_olive_lip_1_5g"
    if (
        "frudia" in pair_text
        and "蜂蜜藍莓" in left_text
        and "蜂蜜藍莓" in right_text
        and "護唇膏" in left_text
        and "護唇膏" in right_text
        and _has_shared_weight(left, right, 10)
    ):
        return "frudia_honey_blueberry_lip_10g"
    if (
        {"sebamed", "施巴"} & brand_tokens
        and "嬰兒護唇膏" in left_text
        and "嬰兒護唇膏" in right_text
        and _has_shared_weight(left, right, 4.8)
        and _has_exact_count_alignment(left, right)
    ):
        return "sebamed_baby_lip_4_8g_2pack"
    if (
        "理膚寶水" in pair_text
        and "滋養修護潤唇膏" in left_text
        and "滋養修護潤唇膏" in right_text
        and _has_shared_volume(left, right, 4.7)
    ):
        return "laroche_posay_lip_balm_4_7ml"
    # The two-flavour Baan rule must come before the base Baan rule, which
    # accepts every pair the two-flavour rule accepts.
    if (
        {"baan", "貝恩"} & (left.brand_tokens & right.brand_tokens)
        and "嬰兒修護唇膏" in left_text
        and "嬰兒修護唇膏" in right_text
        and left.product_type == right.product_type == "護唇膏"
        and "原味" in left_text
        and "原味" in right_text
        and "草莓" in left_text
        and "草莓" in right_text
    ):
        return "baan_baby_lip_original_strawberry_catalog"
    if (
        {"baan", "貝恩"} & (left.brand_tokens & right.brand_tokens)
        and "嬰兒修護唇膏" in left_text
        and "嬰兒修護唇膏" in right_text
        and left.product_type == right.product_type == "護唇膏"
    ):
        return "baan_baby_lip_base_catalog"
    if (
        {"shu uemura", "植村秀"} & (left.brand_tokens & right.brand_tokens)
        and "3d極細防水眼線膠筆" in left_text
        and "3d極細防水眼線膠筆" in right_text
    ):
        return "shu_3d_eyeliner"
    if (
        {"ysl", "聖羅蘭"} & (left.brand_tokens & right.brand_tokens)
        and "恆久完美透膚煙染腮紅" in left_text
        and "恆久完美透膚煙染腮紅" in right_text
    ):
        return "ysl_blush_catalog"
    if (
        {"hh", "草本新淨界"} & (left.brand_tokens & right.brand_tokens)
        and "私密植萃美白緊緻凝露" in left_text
        and "私密植萃美白緊緻凝露" in right_text
        and _has_shared_volume(left, right, 30)
    ):
        return "hh_private_gel"
    if (
        {"lab52", "齒妍堂"} & (left.brand_tokens & right.brand_tokens)
        and "學習刷牙漱口水" in left_text
        and "學習刷牙漱口水" in right_text
        and _has_overlapping_base_spec(left, right)
    ):
        return "lab52_mouthwash"
    # Brand may be present on only one side here (union of brand tokens).
    if (
        {"lab52", "齒妍堂"} & (left.brand_tokens | right.brand_tokens)
        and "牙刷" in left_text
        and "牙刷" in right_text
        and any(term in left_text for term in ("嬰幼兒", "幼兒", "汪汪隊"))
        and any(term in right_text for term in ("嬰幼兒", "幼兒", "汪汪隊"))
        and _has_shared_count(left, right, 2, "")
    ):
        return "lab52_paw_patrol_baby_toothbrush_2pack"
    if (
        "benefit" in (left.brand_tokens & right.brand_tokens)
        and "染唇液" in left_text
        and "染唇液" in right_text
        and "唇頰兩用" in pair_text
    ):
        return "benefit_lip_tint"
    if (
        {"schick", "舒適牌"} & (left.brand_tokens & right.brand_tokens)
        and "舒綺" in left_text
        and "舒綺" in right_text
        and "除毛刀片" in left_text
        and "除毛刀片" in right_text
        and "敏感肌" in left_text
        and "敏感肌" in right_text
        and _has_shared_count(left, right, 3, "")
    ):
        return "schick_womens_sensitive_blade_3pack"
    # Matches only when one side says "霧黑" and the other says "黑色".
    if (
        {"herb24", "草本"} & (left.brand_tokens & right.brand_tokens)
        and "晨霧純精油擴香儀ii" in left_text
        and "晨霧純精油擴香儀ii" in right_text
        and (("霧黑" in left_text and "黑色" in right_text) or ("霧黑" in right_text and "黑色" in left_text))
    ):
        return "herb24_mist_diffuser_black"
    if _has_pavaruni_40_scent_oil_alignment(left, right):
        return "pavaruni_40_scent_oil"
    if _has_pavaruni_20_scent_candle_alignment(left, right):
        return "pavaruni_20_scent_candle"
    if _has_laundrin_tokyo_car_freshener_alignment(left, right):
        return "laundrin_tokyo_car_freshener"
    if (
        "好物良品" in (left.brand_tokens & right.brand_tokens)
        and "北歐簡樸融蠟燈桌面氣氛夜燈" in left_text
        and "北歐簡樸融蠟燈桌面氣氛夜燈" in right_text
    ):
        return "goodgoods_nordic_wax_lamp"
    if (
        {"derma", "丹麥德瑪"} & (left.brand_tokens & right.brand_tokens)
        and "有機植萃" in left_text
        and "有機植萃" in right_text
        and "護膚油" in left_text
        and "護膚油" in right_text
        and _has_shared_volume(left, right, 150)
    ):
        return "derma_eco_skin_oil"
    if (
        {"derma", "丹麥德瑪"} & (left.brand_tokens & right.brand_tokens)
        and "大地" in left_text
        and "大地" in right_text
        and "植萃" in left_text
        and "植萃" in right_text
        and "護膚油" in left_text
        and "護膚油" in right_text
        and _has_exact_count_alignment(left, right)
    ):
        return "derma_eco_skin_oil_2pack_review"
    if (
        {"修護保養"} & (left.brand_tokens & right.brand_tokens)
        and "蝸牛特潤修護面膜" in left_text
        and "蝸牛特潤修護面膜" in right_text
        and _has_shared_count(left, right, 6, "")
    ):
        return "w_repair_snail_mask_6pcs_review"
    if (
        {"yuskin", "悠斯晶"} & (left.brand_tokens & right.brand_tokens)
        and "乳霜" in left_text
        and "乳霜" in right_text
        and _has_shared_weight(left, right, 30)
        and _has_exact_count_alignment(left, right)
    ):
        return "yuskin_classic_cream_30g_6pack"
    # Count alignment is only enforced when both sides have parsed counts;
    # at least one variant keyword must appear in both names.
    if (
        {"johnson", "johnsons", "嬌生"} & (left.brand_tokens & right.brand_tokens)
        and "嬰兒" in left_text
        and "嬰兒" in right_text
        and "潤膚乳" in left_text
        and "潤膚乳" in right_text
        and _has_shared_volume(left, right, 500)
        and (not left.counts or not right.counts or _has_exact_count_alignment(left, right))
        and any(option in left_text and option in right_text for option in ("牛奶", "純淨", "甜夢", "溫和", "棉柔"))
    ):
        return "johnsons_baby_lotion_variant_catalog"
    if (
        {"im meme", "meme"} & (left.brand_tokens & right.brand_tokens)
        and "我愛超磁妝定妝噴霧" in left_text
        and "我愛超磁妝定妝噴霧" in right_text
        and "涼感" in left_text
        and "涼感" in right_text
    ):
        return "im_meme_fixx_cool_setting_spray"
    # The "plain" 120 ml rule (no variant word on either side) is tried
    # before the catch-all catalog rule for the same product line.
    if (
        {"so", "natural", "fixx"} <= (left.brand_tokens & right.brand_tokens)
        and "全天候超完美定妝噴霧" in left_text
        and "全天候超完美定妝噴霧" in right_text
        and _has_shared_volume(left, right, 120)
        and not any(term in pair_text for term in ("經典款", "光澤款", "霧面款", "夏日款", "涼感", "一般"))
    ):
        return "so_natural_fixx_setting_spray_120ml_plain"
    if (
        {"so", "natural", "fixx"} <= (left.brand_tokens & right.brand_tokens)
        and "全天候超完美定妝噴霧" in left_text
        and "全天候超完美定妝噴霧" in right_text
    ):
        return "so_natural_fixx_setting_spray_catalog"
    if (
        {"kate", "凱婷"} & (left.brand_tokens & right.brand_tokens)
        and "粉餅盒" in left_text
        and "粉餅盒" in right_text
    ):
        return "kate_powder_case_catalog"
    if (
        {"kate", "凱婷"} & (left.brand_tokens & right.brand_tokens)
        and "怪獸級持色唇膏" in left_text
        and "怪獸級持色唇膏" in right_text
    ):
        return "kate_monster_lipstick_catalog"
    # Exact-model OPI rule (needs shared model tokens) precedes the series rule.
    if (
        "opi" in (left.brand_tokens & right.brand_tokens)
        and _shared_model_tokens(left, right)
        and "類光繚" in left_text
        and "類光繚" in right_text
        and ("指甲油" in left_text or "指彩" in left_text)
        and ("指甲油" in right_text or "指彩" in right_text)
        and ("如膠似漆" in left_text or "如膠似漆" in right_text)
    ):
        return "opi_gel_polish_exact_model"
    if (
        "opi" in (left.brand_tokens & right.brand_tokens)
        and "類光繚指甲油" in left_text
        and "類光繚指甲油" in right_text
        and any(series in left_text and series in right_text for series in ("白日夢遊", "驕傲果凍"))
    ):
        return "opi_gel_polish_series_catalog"
    if (
        ("rom" in (left.brand_tokens & right.brand_tokens) or "romand" in (left.brand_tokens & right.brand_tokens))
        and "果汁唇釉" in left_text
        and "果汁唇釉" in right_text
        and "2.0" in left_text
        and "2.0" in right_text
    ):
        return "romand_juicy_lip_tint_2_catalog"
    if (
        "solone" in (left.brand_tokens & right.brand_tokens)
        and "持久眼線筆" in left_text
        and "持久眼線筆" in right_text
    ):
        return "solone_longlasting_eyeliner"
    if (
        {"shu uemura", "植村秀"} & (left.brand_tokens & right.brand_tokens)
        and "自動武士刀眉筆" in left_text
        and "自動武士刀眉筆" in right_text
        and "筆蕊" in left_text
        and "筆蕊" in right_text
    ):
        return "shu_auto_hard_formula_refill_catalog"
    if (
        {"summer", "eve", "舒摩兒"} & (left.brand_tokens & right.brand_tokens)
        and "浴潔露" in left_text
        and "浴潔露" in right_text
        and "全肌防護" in left_raw
        and "全肌防護" in right_raw
        and _has_exact_count_alignment(left, right)
    ):
        return "summer_eve_full_skin_wash_2pack"
    # No brand word is required; exactly one side must have brand tokens.
    if (
        "焦糖楓葉香氛擴香花禮盒" in left_text
        and "焦糖楓葉香氛擴香花禮盒" in right_text
        and _has_shared_volume(left, right, 30)
        and bool(left.brand_tokens) != bool(right.brand_tokens)
    ):
        return "the_forest_maple_diffuser_flower_brandless"
    if (
        "gatsby" in (left.brand_tokens & right.brand_tokens)
        and "爆水擦澡濕巾" in left_text
        and "爆水擦澡濕巾" in right_text
        and "24張入" in left_text
        and "24張入" in right_text
    ):
        return "gatsby_body_wipes_24"
    if (
        {"3w", "clinic"} <= (left.brand_tokens & right.brand_tokens)
        and "膠原蛋白粉底液" in left_text
        and "膠原蛋白粉底液" in right_text
        and _has_shared_volume(left, right, 50)
        and _has_exact_count_alignment(left, right)
    ):
        return "3w_clinic_collagen_foundation_50ml_2pack"
    if (
        "花美水" in (left.brand_tokens & right.brand_tokens)
        and "moisture" in (left.brand_tokens & right.brand_tokens)
        and "保濕修護" in left_text
        and "保濕修護" in right_text
        and "精華凝膠" in left_text
        and "精華凝膠" in right_text
        and ("原黃金" in left_text and "原黃金" in right_text)
        and _has_shared_weight(left, right, 1.7)
        and _has_exact_count_alignment(left, right)
    ):
        return "hanamisui_moisture_original_gel_1_7g_3pack"
    if (
        "花美水" in (left.brand_tokens & right.brand_tokens)
        and "inclear" in (left.brand_tokens & right.brand_tokens)
        and ("櫻克麗兒" in left_text and "櫻克麗兒" in right_text)
        and ("私密淨化凝膠" in left_text and "私密淨化凝膠" in right_text)
        and _has_shared_weight(left, right, 1.7)
        and _has_exact_count_alignment(left, right)
    ):
        return "hanamisui_inclear_private_gel_1_7g_3pack"
    if (
        "花美水" in (left.brand_tokens & right.brand_tokens)
        and "relax" in left_raw
        and "relax" in right_raw
        and "薰衣草" in left_text
        and "薰衣草" in right_text
        and "潤滑凝膠" in left_text
        and "潤滑凝膠" in right_text
        and _has_shared_weight(left, right, 1.7)
        and _has_exact_count_alignment(left, right)
    ):
        return "hanamisui_relax_lavender_gel_1_7g_3pack"
    # The following rules look for the brand word in the names themselves
    # rather than in brand_tokens.
    if (
        ("聖克萊爾" in left_text and "聖克萊爾" in right_text)
        and "私密呼呼溫和潔淨慕斯" in left_text
        and "私密呼呼溫和潔淨慕斯" in right_text
        and _has_shared_volume(left, right, 150)
        and _has_exact_count_alignment(left, right)
    ):
        return "st_clare_private_mousse_150ml_2pack"
    if (
        ("聖克萊爾" in left_text and "聖克萊爾" in right_text)
        and "私密呼呼溫和潔淨慕斯" in left_text
        and "私密呼呼溫和潔淨慕斯" in right_text
        and "私密呼呼舒緩護理噴霧" in left_text
        and "私密呼呼舒緩護理噴霧" in right_text
        and _has_shared_volume(left, right, 150)
        and _has_shared_volume(left, right, 50)
    ):
        return "st_clare_private_mousse_spray_set"
    if (
        ("biopeutic" in (left.brand_tokens & right.brand_tokens) or ("葆療美" in left_text and "葆療美" in right_text))
        and "果酸煥膚水凝乳" in left_text
        and "果酸煥膚水凝乳" in right_text
        and "20%" in left_raw
        and "20%" in right_raw
        and _has_shared_volume(left, right, 150)
    ):
        return "biopeutic_plus_aha_lotion_20_150ml"
    if (
        "台塑生醫" in left_text
        and "台塑生醫" in right_text
        and "嬰兒沐浴洗髮" in left_text
        and "嬰兒沐浴洗髮" in right_text
        and "3件組" in left_text
        and "3件組" in right_text
        and "嬰兒沐浴精" in left_text
        and "嬰兒沐浴精" in right_text
        and "嬰幼童洗髮精" in left_text
        and "嬰幼童洗髮精" in right_text
    ):
        return "taisu_baby_bath_shampoo_3pc"
    if (
        "雅頓" in left_text
        and "雅頓" in right_text
        and "八小時潤澤護唇膏" in left_text
        and "八小時潤澤護唇膏" in right_text
        and "spf15" in left_raw
        and "spf15" in right_raw
        and _has_shared_weight(left, right, 3.7)
        and _has_exact_count_alignment(left, right)
    ):
        return "arden_eight_hour_lip_spf15_3_7g_3pack"
    if (
        "理膚寶水" in left_text
        and "理膚寶水" in right_text
        and "全面修復潤唇膏" in left_text
        and "全面修復潤唇膏" in right_text
        and _has_shared_volume(left, right, 7.5)
    ):
        return "laroche_posay_repair_lip_balm_7_5ml"
    if (
        {"flortte", "花洛莉亞"} & (left.brand_tokens & right.brand_tokens)
        and "水果沙拉系列彩色防水眼線液筆" in left_text
        and "水果沙拉系列彩色防水眼線液筆" in right_text
        and "色號" in left_text
        and "色號" in right_text
        and "任選" in left_raw
        and "任選" in right_raw
        and _has_shared_volume(left, right, 0.5)
    ):
        return "flortte_fruit_salad_eyeliner_0_5ml_catalog"
    # Both names must mention both the unscented and the scented option.
    if (
        {"neutrogena", "露得清"} & (left.brand_tokens & right.brand_tokens)
        and "護手霜" in left_text
        and "護手霜" in right_text
        and "無香" in left_text
        and "無香" in right_text
        and "有香" in left_text
        and "有香" in right_text
        and _has_shared_weight(left, right, 56)
    ):
        return "neutrogena_hand_cream_56g_scent_catalog"
    if (
        {"kanebo", "佳麗寶"} & (left.brand_tokens & right.brand_tokens)
        and "allie" in left_raw
        and "allie" in right_raw
        and "持采亮化uv防曬水凝乳" in left_text
        and "持采亮化uv防曬水凝乳" in right_text
        and "任選" in left_raw
        and "任選" in right_raw
        and _has_shared_weight(left, right, 60)
    ):
        return "kanebo_allie_bright_uv_milk_60g_catalog"
    # Volume: either both sides share 30 ml, or one side lists exactly 30.0
    # and the other has no parsed volume at all.
    if (
        "ordinary" in (left.brand_tokens & right.brand_tokens)
        and "咖啡因" in left_text
        and "咖啡因" in right_text
        and "egcg" in left_raw
        and "egcg" in right_raw
        and "兒茶眼部配方" in left_text
        and "兒茶眼部配方" in right_text
        and (
            _has_shared_volume(left, right, 30)
            or (30.0 in left.volumes_ml and not right.volumes_ml)
            or (30.0 in right.volumes_ml and not left.volumes_ml)
        )
    ):
        return "the_ordinary_caffeine_egcg_30ml"
    if (
        {"sk-ii", "skii", "sk2"} & (left.brand_tokens & right.brand_tokens)
        and "青春露" in left_text
        and "青春露" in right_text
        and _has_shared_volume(left, right, 330)
        and _has_shared_count(left, right, 2, "")
    ):
        return "sk_ii_essence_330ml_2pack"
    if (
        {"amiino", "安美諾"} & (left.brand_tokens | right.brand_tokens)
        and "美白修護霜" in left_text
        and "美白修護霜" in right_text
        and _has_shared_volume(left, right, 30)
    ):
        return "amiino_whitening_repair_cream_30ml"
    # Volume: shared 125 ml, or one side lists exactly 125.0 while the other
    # has no parsed volume but its name contains the text "125m".
    if (
        {"natures", "care"} <= (left.brand_tokens & right.brand_tokens)
        and "綿羊油" in left_text
        and "綿羊油" in right_text
        and _has_exact_count_alignment(left, right)
        and (
            _has_shared_volume(left, right, 125)
            or (125.0 in left.volumes_ml and not right.volumes_ml and "125m" in right_text)
            or (125.0 in right.volumes_ml and not left.volumes_ml and "125m" in left_text)
        )
    ):
        return "natures_care_sheep_oil_exact_pack"
    # Both names must carry the same size marker (either "l號" or "s號").
    if (
        "tomoon" in (left.brand_tokens & right.brand_tokens)
        and "德國奔月" in left_text
        and "德國奔月" in right_text
        and "豪華套裝組" in left_text
        and "豪華套裝組" in right_text
        and ("指甲剪" in left_text or "指甲刀" in left_text)
        and ("指甲剪" in right_text or "指甲刀" in right_text)
        and any(size in left_text and size in right_text for size in ("l號", "s號"))
    ):
        return "tomoon_nail_clipper_luxury_size"
    if (
        {"hh", "草本新淨界"} & (left.brand_tokens & right.brand_tokens)
        and "私密植萃抗菌潔淨露" in left_text
        and "私密植萃抗菌潔淨露" in right_text
        and "私密衣物抗菌手洗精" in left_text
        and "私密衣物抗菌手洗精" in right_text
        and _has_shared_volume(left, right, 200)
    ):
        return "hh_private_cleanser_laundry_wash_set"
    if (
        {"sebamed", "施巴"} & (left.brand_tokens & right.brand_tokens)
        and "護潔露" in left_text
        and "護潔露" in right_text
        and _has_shared_volume(left, right, 200)
        and _has_exact_count_alignment(left, right)
    ):
        return "sebamed_ph38_private_wash_200ml_2pack"
    if (
        {"yes", "德悅氏"} & (left.brand_tokens & right.brand_tokens)
        and "鋒利窄弧型剪刀" in left_text
        and "鋒利窄弧型剪刀" in right_text
        and "9cm" in left_text
        and "9cm" in right_text
    ):
        return "yes_curved_scissors_9cm"
    if (
        {"yes", "德悅氏"} & (left.brand_tokens & right.brand_tokens)
        and "腳指甲剪刀" in left_text
        and "腳指甲剪刀" in right_text
        and "10.5cm" in left_text
        and "10.5cm" in right_text
    ):
        return "yes_foot_nail_scissors_10_5cm"
    if (
        {"yes", "德悅氏"} & (left.brand_tokens & right.brand_tokens)
        and "極細指甲緣硬皮剪刀" in left_text
        and "極細指甲緣硬皮剪刀" in right_text
        and "9cm" in left_text
        and "9cm" in right_text
    ):
        return "yes_cuticle_scissors_9cm"
    if _has_yes_nail_tool_exact_alignment(left, right):
        return "yes_nail_tool_exact_model_size"
    if (
        {"kussen", "葵森"} & (left.brand_tokens & right.brand_tokens)
        and "寶寶益菌屁屁膏" in left_text
        and "寶寶益菌屁屁膏" in right_text
        and _has_shared_volume(left, right, 50)
        and _has_exact_count_alignment(left, right)
    ):
        return "kussen_baby_butt_cream_50ml_3pack"
    # Every listed gift-box component must be named by both sides.
    if (
        "bone" in (left.brand_tokens & right.brand_tokens)
        and "擴香禮盒三入組" in left_text
        and "擴香禮盒三入組" in right_text
        and all(component in left_text and component in right_text for component in ("原木麋鹿", "搖搖貓頭鷹", "薰衣草精油"))
        and _has_exact_count_alignment(left, right)
    ):
        return "bone_diffuser_gift_3pack"
    if (
        {"1990", "選物"} <= (left.brand_tokens & right.brand_tokens)
        and "現代簡約半圓罩融燭燈" in left_text
        and "現代簡約半圓罩融燭燈" in right_text
        and "白色款" in left_text
        and "白色款" in right_text
    ):
        return "selection1990_half_dome_wax_lamp_white"
    if (
        {"1990", "選物"} <= (left.brand_tokens & right.brand_tokens)
        and "歐式可彎融燭燈" in left_text
        and "歐式可彎融燭燈" in right_text
        and "白色款" in left_text
        and "白色款" in right_text
    ):
        return "selection1990_bendable_wax_lamp_white"
    if (
        "canmake" in (left.brand_tokens & right.brand_tokens)
        and "淚袋專用盤" in left_text
        and "淚袋專用盤" in right_text
        and "淚袋眼影盤" in left_text
        and "淚袋眼影盤" in right_text
    ):
        return "canmake_tear_bag_palette"
    if (
        {"recipe", "box"} <= brand_tokens
        and "可撕式水性兒童指甲油" in left_text
        and "可撕式水性兒童指甲油" in right_text
    ):
        return "recipe_box_peelable_child_polish_catalog"
    if (
        "gdesign" in (left.brand_tokens & right.brand_tokens)
        and "aroma" in left_text
        and "aroma" in right_text
        and "lava" in left_text
        and "lava" in right_text
        and "解憂放鬆緩緩燈2.0" in left_text
        and "解憂放鬆緩緩燈2.0" in right_text
        and "熔岩燈" in left_text
        and "熔岩燈" in right_text
        and "精油擴香" in left_text
        and "精油擴香" in right_text
    ):
        return "gdesign_aroma_lava_lamp_2"
    if (
        "hooome" in (left.brand_tokens & right.brand_tokens)
        and "白色" in left_text
        and "白色" in right_text
        and "香氛蠟燭暖燈" in left_text
        and "香氛蠟燭暖燈" in right_text
        and "兩顆燈泡" in left_text
        and "兩顆燈泡" in right_text
        and "禮盒" in left_text
        and "禮盒" in right_text
    ):
        return "hooome_classic_white_wax_lamp_bulbs_giftbox"
    # No rule matched.
    return ""
def _is_relove_private_cleanser_line(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Check that both listings belong to the Relove intimate gel line.

    The brand token may come from either side; both names must contain the
    intimate-care keyword and the gel keyword.
    """
    if "relove" not in (left.brand_tokens | right.brand_tokens):
        return False
    return all(
        keyword in identity.searchable_name
        for keyword in ("私密", "凝露")
        for identity in (left, right)
    )
def _is_relove_cleanser_gel_like(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Return True when both listings look like a Relove cleansing "凝露".

    Requires the ``relove`` brand token on at least one side, "凝露" in both
    searchable names, and at least one of "私密" / "潔淨" / "清潔" in each name.
    """
    brand_pool = left.brand_tokens | right.brand_tokens
    if "relove" not in brand_pool:
        return False
    names = (left.searchable_name, right.searchable_name)
    if any("凝露" not in name for name in names):
        return False
    cleanser_terms = ("私密", "潔淨", "清潔")
    return all(any(term in name for term in cleanser_terms) for name in names)
def _is_multi_variant_catalog_listing(identity: ProductIdentity) -> bool:
    """Return True when the normalized name contains any multi-variant listing phrase."""
    normalized = identity.normalized_name
    for phrase in MULTI_VARIANT_LISTING_PHRASES:
        if phrase in normalized:
            return True
    return False
def _normalize_variant_option(value: str) -> set[str]:
    """Lower-case ``value`` and keep only ``a-z`` / ``0-9`` characters.

    Returns a one-element set holding the compacted token, or an empty set
    when nothing is left (including for ``None`` or empty input).
    """
    lowered = (value or "").lower()
    compact = re.sub(r"[^a-z0-9]", "", lowered)
    return {compact} if compact else set()
def _variant_option_compare_key(option: str) -> str:
    """Return a comparison key where all-digit options ignore leading zeros.

    Non-numeric options are returned unchanged; an all-zero option becomes "0".
    """
    if not option.isdigit():
        return option
    without_zeros = option.lstrip("0")
    return without_zeros if without_zeros else "0"
def _variant_options_overlap(left_options: set[str], right_options: set[str]) -> bool:
    """Return True when the two option sets share a token.

    Tokens are first compared verbatim and then through
    ``_variant_option_compare_key`` so that e.g. "01" and "1" count as equal.
    """
    if not left_options.isdisjoint(right_options):
        return True
    left_keys = {_variant_option_compare_key(option) for option in left_options}
    return any(_variant_option_compare_key(option) in left_keys for option in right_options)
def _is_catalog_or_delimited_variant_listing(identity: ProductIdentity) -> bool:
    """Return True when the listing appears to enumerate several variants.

    A listing qualifies when it is a multi-variant catalog listing, when its
    searchable name contains an option range (two short alphanumeric codes
    joined by a separator), or when it contains a list delimiter together
    with either two or more explicit option tokens or one of a fixed set of
    product terms.
    """
    if _is_multi_variant_catalog_listing(identity):
        return True
    name = identity.searchable_name
    # NOTE(review): the separator group contains an empty alternative, so the
    # separator is effectively optional — confirm this is intended.
    range_pattern = r"(?<![a-z0-9])([a-z]?\d{1,3}[a-z]?)\s*(?:~||至|-)\s*([a-z]?\d{1,3}[a-z]?)(?![a-z0-9])"
    if re.search(range_pattern, name, re.I) is not None:
        return True
    option_count = len(_explicit_variant_option_tokens(identity))
    # Without a list delimiter neither remaining rule can apply.
    if re.search(r"[//、,.&]", name) is None:
        return False
    if option_count >= 2:
        return True
    delimited_product_terms = ("粉餅盒", "眼線膠筆", "眉筆", "唇膏", "唇釉", "遮瑕蜜", "車用擴香", "車用香氛")
    return any(term in name for term in delimited_product_terms)
def _has_catalog_variant_listing_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Return True when two multi-variant catalog listings describe the same catalog.

    Both sides must be multi-variant catalog listings of the same product
    type ("精油" or "護唇膏") with an overlapping base spec. They then align
    when they share a core token or a common catalog term in their names.
    """
    both_catalog = _is_multi_variant_catalog_listing(left) and _is_multi_variant_catalog_listing(right)
    if not both_catalog:
        return False
    product_type = left.product_type
    if product_type != right.product_type or product_type not in {"精油", "護唇膏"}:
        return False
    if not _has_overlapping_base_spec(left, right):
        return False
    if left.core_tokens & right.core_tokens:
        return True
    left_name = left.searchable_name
    right_name = right.searchable_name
    for term in ("香氛擴香罐", "香氛蠟燭", "蠟燭", "擴香罐", "修護唇膏"):
        if term in left_name and term in right_name:
            return True
    return False
def _is_variant_sensitive_identity(
    left: ProductIdentity,
    right: ProductIdentity,
    shared_anchor: str,
) -> bool:
    """Return True when a variant-sensitive keyword occurs anywhere in the pair.

    The shared anchor, both product types and both searchable names are
    scanned for any substring from ``VARIANT_SENSITIVE_KEYWORDS``; empty or
    missing texts are skipped.
    """
    candidate_texts = (
        shared_anchor,
        left.product_type or "",
        right.product_type or "",
        left.searchable_name,
        right.searchable_name,
    )
    for text in candidate_texts:
        if not text:
            continue
        for keyword in VARIANT_SENSITIVE_KEYWORDS:
            if keyword in text:
                return True
    return False
def _has_variant_descriptor_conflict(left: ProductIdentity, right: ProductIdentity, shared_anchor: str) -> bool:
    """Return True when the variant descriptors of the two listings disagree.

    A serum formulation conflict is reported immediately. After that, a
    series of product-specific alignment checks, a specific non-sensitive
    shared anchor, or shared model tokens each clear the pair. Otherwise the
    descriptors conflict only when both sides have some and no descriptor on
    one side equals, contains, or is contained in one on the other side.
    """
    if _has_serum_formulation_conflict(left, right, shared_anchor):
        return True

    # Product-specific equivalences, evaluated lazily in the original order.
    known_alignments = (
        _has_taicend_baby_spray_equivalence,
        _has_brush_baby_wildones_toothbrush_alignment,
        _has_baan_baby_lip_catalog_alignment,
        _has_recipe_box_child_sunscreen_cushion_alignment,
        _has_pavaruni_40_scent_oil_alignment,
        _has_pavaruni_20_scent_candle_alignment,
        _has_laundrin_tokyo_car_freshener_alignment,
        _is_relove_private_cleanser_line,
    )
    if any(check(left, right) for check in known_alignments):
        return False

    anchor_is_specific = bool(shared_anchor) and shared_anchor not in SEARCH_BROAD_ANCHORS
    if anchor_is_specific and not _is_variant_sensitive_identity(left, right, shared_anchor):
        return False
    if _shared_model_tokens(left, right):
        return False

    left_descriptors = _variant_descriptors(left)
    right_descriptors = _variant_descriptors(right)
    if not (left_descriptors and right_descriptors):
        return False
    if left_descriptors & right_descriptors:
        return False
    # Substring containment in either direction counts as agreement.
    return not any(
        mine in theirs or theirs in mine
        for mine in left_descriptors
        for theirs in right_descriptors
    )
def _explicit_variant_option_tokens(identity: ProductIdentity) -> set[str]:
    """Collect the explicit variant option tokens found in the searchable name.

    Tokens come from four passes over ``identity.searchable_name``: option
    ranges, options introduced by a marker such as ``#`` / ``no.`` / "色號",
    one-to-two digit groups directly followed by a CJK character, and any
    word from ``VARIANT_OPTION_COLOR_WORDS`` present in the text. Regex hits
    are normalized through ``_normalize_variant_option``; color words are
    added verbatim.
    """
    text = identity.searchable_name
    options: set[str] = set()
    # Pass 1: both endpoints of a range of short alphanumeric codes.
    # NOTE(review): the separator group contains an empty alternative, so the
    # separator is effectively optional — confirm this is intended.
    for match in re.finditer(r"(?<![a-z0-9])([a-z]?\d{1,3}[a-z]?)\s*(?:~||至|-)\s*([a-z]?\d{1,3}[a-z]?)(?![a-z0-9])", text, re.I):
        for group in (match.group(1), match.group(2)):
            options.update(_normalize_variant_option(group))
    # Pass 2: a code that follows an explicit option marker.
    for match in re.finditer(r"(?:#|no\.?|色號|號色)\s*([a-z]?\d{1,3}[a-z]?)(?![a-z0-9])", text, re.I):
        options.update(_normalize_variant_option(match.group(1)))
    # Pass 3: short digit groups immediately followed by a CJK character.
    for match in re.finditer(r"(?<![a-z0-9])((?:0?\d){1,2})(?=[\u4e00-\u9fff])", text, re.I):
        # Skip digits whose next four characters are "號護唇膏" / "號護脣膏".
        if text[match.end(1):match.end(1) + 4] in {"號護唇膏", "號護脣膏"}:
            continue
        options.update(_normalize_variant_option(match.group(1)))
    # Pass 4: color words are kept as-is, without normalization.
    for color_word in VARIANT_OPTION_COLOR_WORDS:
        if color_word in text:
            options.add(color_word)
    return options
def _has_variant_option_selection_gap(identity: ProductIdentity, options: set[str]) -> bool:
    """Return True when a listing offers several named options to choose from.

    At least two non-numeric options are required; the listing must then be
    a multi-variant catalog listing or contain a list delimiter in its
    searchable name.
    """
    named_count = sum(1 for option in options if not option.isdigit())
    if named_count < 2:
        return False
    if _is_multi_variant_catalog_listing(identity):
        return True
    return re.search(r"[//、,]", identity.searchable_name) is not None
def _has_catalog_options_against_generic_count_alignment(
    left: ProductIdentity,
    right: ProductIdentity,
    left_options: set[str],
    right_options: set[str],
) -> bool:
    """Return True when a named-option catalog lines up with a count-only listing.

    The pair must have an overlapping base spec and no product-type
    mismatch. Each side is then tried as the "catalog": it needs at least
    two non-numeric options and must be a multi-variant catalog listing,
    while the other side has only numeric options and shares at least one
    numeric option with it.
    """
    if not _has_overlapping_base_spec(left, right):
        return False
    left_type, right_type = left.product_type, right.product_type
    if left_type and right_type and left_type != right_type:
        return False

    def _named(options: set[str]) -> set[str]:
        # Options that are not purely numeric.
        return {option for option in options if not option.isdigit()}

    orientations = (
        (left, right, left_options, right_options),
        (right, left, right_options, left_options),
    )
    for catalog_identity, generic_identity, catalog_options, generic_options in orientations:
        if len(_named(catalog_options)) < 2:
            continue
        if _named(generic_options):
            continue
        if not any(option.isdigit() for option in catalog_options & generic_options):
            continue
        if not _is_multi_variant_catalog_listing(catalog_identity):
            continue
        # NOTE(review): the needle below is an empty string, which every str
        # contains — confirm the intended marker text.
        if "" in generic_identity.searchable_name:
            return True
    return False
def _has_explicit_variant_option_conflict(
    left: ProductIdentity,
    right: ProductIdentity,
    shared_anchor: str,
) -> bool:
    """Return True when both listings name explicit variant options that disagree.

    Only variant-sensitive pairs are examined. When the option sets overlap,
    the pair conflicts only if the side with more options shows a named
    option selection gap, and no catalog exemption applies. When they do not
    overlap, the pair conflicts unless some option on one side is a
    substring of an option on the other.
    """
    if not _is_variant_sensitive_identity(left, right, shared_anchor):
        return False
    left_options = _explicit_variant_option_tokens(left)
    right_options = _explicit_variant_option_tokens(right)
    # A side without explicit options cannot conflict.
    if not left_options or not right_options:
        return False
    if left_options == right_options:
        return False
    if _variant_options_overlap(left_options, right_options):
        # Exemption: a named-option catalog against a count-only listing.
        if _has_catalog_options_against_generic_count_alignment(left, right, left_options, right_options):
            return False
        pair_text = f"{left.searchable_name} {right.searchable_name}"
        # Exemption: for these makeup terms, the side with more options being
        # a catalog / delimited listing is not treated as a conflict here.
        if any(term in pair_text for term in ("眉筆", "眼線膠筆", "唇膏", "唇釉", "粉餅盒", "遮瑕蜜")) and (
            (
                len(left_options) > len(right_options)
                and _is_catalog_or_delimited_variant_listing(left)
            )
            or (
                len(right_options) > len(left_options)
                and _is_catalog_or_delimited_variant_listing(right)
            )
        ):
            return False
        # Conflict: the side with more options offers several named choices.
        if (
            len(left_options) > len(right_options)
            and _has_variant_option_selection_gap(left, left_options)
        ) or (
            len(right_options) > len(left_options)
            and _has_variant_option_selection_gap(right, right_options)
        ):
            return True
        return False
    # No overlap: substring containment in either direction counts as agreement.
    for left_option in left_options:
        for right_option in right_options:
            if left_option in right_option or right_option in left_option:
                return False
    return True
def _has_named_variant_selection_review(
    left: ProductIdentity,
    right: ProductIdentity,
    shared_anchor: str,
) -> bool:
    """Return True when one listing offers a variant choice the other does not pin down.

    NOTE(review): the name suggests a True result routes the pair to a
    variant-selection review — confirm against the caller.

    True is returned when any dedicated selection-gap helper fires, when both
    sides carry options and a catalog / delimited side has more options than
    the overlapping other side, or when exactly one side carries options and
    one of the rules in the branches below applies.
    """
    if (
        _has_makeup_shade_selection_gap(left, right)
        or _has_makeup_spray_variant_selection_gap(left, right)
        or _has_catalog_specific_variant_selection_gap(left, right)
    ):
        return True
    left_options = _explicit_variant_option_tokens(left)
    right_options = _explicit_variant_option_tokens(right)
    if left_options and right_options:
        # Try each side as the catalog listing against the more specific side.
        for catalog_identity, catalog_options, specific_options in (
            (left, left_options, right_options),
            (right, right_options, left_options),
        ):
            if (
                _is_catalog_or_delimited_variant_listing(catalog_identity)
                and len(catalog_options) > len(specific_options)
                and _variant_options_overlap(catalog_options, specific_options)
                and _is_variant_sensitive_identity(left, right, shared_anchor)
            ):
                return True
    if bool(left_options) != bool(right_options):
        # Exactly one side has options: that side is the "option" listing,
        # the other one is tested as a catalog listing.
        option_identity = left if left_options else right
        catalog_identity = right if left_options else left
        if (
            _is_variant_sensitive_identity(left, right, shared_anchor)
            and _is_catalog_or_delimited_variant_listing(catalog_identity)
            and _explicit_variant_option_tokens(option_identity)
        ):
            return True
        if (
            _is_variant_sensitive_identity(left, right, shared_anchor)
            and _has_overlapping_base_spec(left, right)
            and _explicit_variant_option_tokens(option_identity)
            and any(
                term in f"{left.searchable_name} {right.searchable_name}"
                for term in ("粉餅盒", "護手霜", "護唇膏", "護唇棒", "滋養霜", "眼線膠筆", "遮瑕蜜")
            )
        ):
            return True
    # Both sides with options, or neither: nothing further to check.
    if bool(left_options) == bool(right_options):
        return False
    option_identity = left if left_options else right
    # Only non-numeric options count as named choices.
    named_options = {option for option in (left_options or right_options) if not option.isdigit()}
    if len(named_options) < 2:
        return False
    text = option_identity.searchable_name
    return _is_multi_variant_catalog_listing(option_identity) or bool(re.search(r"[//、,&]", text))
def _search_core_score(token: str, all_tokens: set[str]) -> tuple[int, int, str]:
    """Score a core token for use as a search phrase.

    Returns ``(score, -len(compact), cleaned)`` where ``cleaned`` is the
    token after ``_clean_search_phrase`` and ``compact`` is ``cleaned``
    without spaces. Tokens that clean to nothing score -999; noise tokens,
    generic tokens and pure "<amount><unit>x<count>" specs score -900.
    """
    cleaned = _clean_search_phrase(token)
    if not cleaned:
        return (-999, 0, cleaned)
    compact = cleaned.replace(" ", "")
    if compact in SEARCH_NOISE_TOKENS or compact in GENERIC_TOKENS:
        return (-900, 0, cleaned)
    # Pack specs such as "30mlx2" are rejected as well.
    if re.fullmatch(r"\d+(?:\.\d+)?(?:ml|g|mg|kg|l)x\d+", compact, re.I):
        return (-900, 0, cleaned)
    score = 0
    # Bonus for a latin word of three or more characters.
    if re.search(r"[a-z][a-z0-9-]{2,}", cleaned):
        score += 30
    # Bonus for containing a digit.
    if re.search(r"\d", cleaned):
        score += 12
    anchors = _extract_anchor_phrases(cleaned)
    if anchors:
        score += 90
        # Longer first anchors earn more, capped at 24.
        score += min(24, len(anchors[0]) * 3)
        if anchors[0] == compact:
            score += 8
        if compact in SEARCH_BROAD_ANCHORS:
            score -= 28
    else:
        # Without an anchor, shorter tokens earn more (nothing from 24 chars on).
        score += max(0, 24 - len(compact))
    if len(compact) <= 8:
        score += 14
    elif len(compact) >= 12:
        score -= 12
    # True when some other token yields an anchor phrase.
    has_better_anchor = any(
        other != token and _extract_anchor_phrases(other)
        for other in all_tokens
    )
    if has_better_anchor and any(term in compact for term in SEARCH_AMBIGUOUS_PRODUCT_TERMS):
        score -= 80
    # Penalty when a noise token is embedded in the compact form.
    if any(noise in compact for noise in SEARCH_NOISE_TOKENS):
        score -= 18
    return (score, -len(compact), cleaned)
def _ranked_search_core_phrases(identity: ProductIdentity, limit: int = 4) -> list[str]:
    """Return up to ``limit`` distinct search phrases from the identity's core tokens.

    Non-generic core tokens are ordered by ``_search_core_score`` (highest
    first). Each token contributes its anchor phrases, or its cleaned form
    when it has none.
    """
    tokens = {token for token in identity.core_tokens if token not in GENERIC_TOKENS}
    ranked_tokens = sorted(
        tokens,
        key=lambda token: _search_core_score(token, tokens),
        reverse=True,
    )
    phrases: list[str] = []
    for token in ranked_tokens:
        # Drops the -999 / -900 rejections and any other score below -100.
        if _search_core_score(token, tokens)[0] < -100:
            continue
        candidates = _extract_anchor_phrases(token) or [_clean_search_phrase(token)]
        for phrase in candidates:
            compact = phrase.replace(" ", "")
            if len(compact) < 2 or compact in SEARCH_NOISE_TOKENS:
                continue
            # An ambiguous product term is only accepted as the first phrase.
            if any(term in compact for term in SEARCH_AMBIGUOUS_PRODUCT_TERMS) and len(phrases) > 0:
                continue
            if phrase not in phrases:
                phrases.append(phrase)
            if len(phrases) >= limit:
                return phrases
    return phrases
def _variant_primary_phrase(identity: ProductIdentity) -> str:
    """Return the main variant phrase of a listing, or "" when there is none.

    For the listed "美甲片" anchors, the 2-8 CJK characters that follow the
    anchor (after optional ``-``, ``_`` or spaces) are used, provided the
    cleaned phrase is non-empty and not a search noise token. Otherwise the
    shortest variant descriptor is returned, ties broken alphabetically.
    """
    name = identity.searchable_name
    nail_anchors = ("時尚潮流美甲片", "頂級璀燦美甲片", "薄型經典美甲片", "足部時尚潮流美甲片")
    for anchor in nail_anchors:
        found = re.search(rf"{re.escape(anchor)}[-_ ]*([\u4e00-\u9fff]{{2,8}})", name)
        if found is None:
            continue
        phrase = _clean_search_phrase(found.group(1))
        compact = phrase.replace(" ", "")
        if not compact or compact in SEARCH_NOISE_TOKENS:
            continue
        return phrase
    return min(
        _variant_descriptors(identity),
        key=lambda token: (len(token), token),
        default="",
    )
def build_search_terms(name: str, max_terms: int = 3) -> list[str]:
    """Build search phrases for a product name.

    The name is parsed into a product identity, from which a brand phrase,
    spec terms, ranked core phrases, variant phrases and model-like phrases
    are derived. Candidate phrases are then assembled in the fixed order of
    the loop at the end of the function, cleaned, truncated to 42 characters
    and collected without duplicates.

    Args:
        name: Raw product name.
        max_terms: Collection stops once this many phrases were gathered.

    Returns:
        The collected search phrases, in candidate order.
    """
    identity = parse_product_identity(name)
    terms: list[str] = []
    is_dashing_diva_nail_line = {"dashing", "diva"} <= identity.brand_tokens and "美甲片" in identity.searchable_name

    def primary_brand_phrase() -> str:
        # Known multi-token brands map to a fixed search spelling.
        if {"dashing", "diva"} <= identity.brand_tokens:
            return "dashing diva"
        if {"rom", "nd"} <= identity.brand_tokens:
            return "romand"
        if {"im", "meme"} <= identity.brand_tokens:
            return "im meme"
        if {"recipe", "box"} <= identity.brand_tokens:
            return "recipe box"
        # Otherwise prefer the longest brand token containing CJK characters.
        chinese = sorted(
            (token for token in identity.brand_tokens if re.search(r"[\u4e00-\u9fff]", token)),
            key=lambda token: (-len(token), token),
        )
        if chinese:
            return chinese[0]
        # Then latin tokens of 3+ characters; tokens containing a space or
        # hyphen sort first, then longer tokens.
        latin = sorted(
            (
                token for token in identity.brand_tokens
                if re.search(r"[a-z]", token) and len(token) >= 3 and token not in GENERIC_TOKENS
            ),
            key=lambda token: (" " not in token and "-" not in token, -len(token), token),
        )
        if latin:
            return latin[0]
        # Last resort: latin tokens of 2+ characters.
        short_latin = sorted(
            (
                token for token in identity.brand_tokens
                if re.search(r"[a-z]", token) and len(token) >= 2 and token not in GENERIC_TOKENS
            ),
            key=lambda token: (" " not in token and "-" not in token, -len(token), token),
        )
        return short_latin[0] if short_latin else ""

    brand_part = primary_brand_phrase()
    spec_terms = _search_spec_terms(identity)
    spec_part = " ".join(spec_terms)
    core_phrases = _ranked_search_core_phrases(identity, limit=4)
    # Anchor phrases found in the full name take precedence over ranked tokens.
    full_name_anchor_phrases = _extract_anchor_phrases(name)
    if full_name_anchor_phrases:
        core_phrases = list(dict.fromkeys(full_name_anchor_phrases + core_phrases))
    core_short = " ".join(core_phrases[:2])
    core_primary = core_phrases[0] if core_phrases else ""
    product_type_aliases = set(PRODUCT_TYPES.get(identity.product_type or "", ()))
    # CJK phrases that add detail beyond the primary phrase and product type.
    chinese_detail_phrases = [
        phrase
        for phrase in core_phrases[1:]
        if re.search(r"[\u4e00-\u9fff]", phrase)
        and phrase != core_primary
        and phrase != (identity.product_type or "")
        and phrase not in SEARCH_BROAD_ANCHORS
        and not any(phrase == alias or phrase in alias or alias in phrase for alias in product_type_aliases)
    ]
    modifier_with_primary = " ".join(
        part for part in (chinese_detail_phrases[0] if chinese_detail_phrases else "", core_primary) if part
    )
    variant_primary = _variant_primary_phrase(identity)
    variant_options = sorted(
        (token for token in _explicit_variant_option_tokens(identity) if token != "0"),
        key=lambda token: (len(token), token),
    )
    variant_option_part = " ".join(variant_options[:2])
    # Secondary phrases that look like model codes or latin words.
    model_phrases = [
        phrase
        for phrase in core_phrases[1:]
        if re.fullmatch(r"[a-z]*\d+[a-z0-9-]*", phrase)
        or re.fullmatch(r"[a-z][a-z0-9-]{2,}", phrase)
    ]
    if "護甲油" in identity.searchable_name:
        model_phrases = [
            phrase for phrase in model_phrases
            if phrase.lower() not in {"top", "coat"} and not re.fullmatch(r"ist\d+", phrase, re.I)
        ]
    primary_with_model = " ".join(
        part for part in (core_primary, model_phrases[0] if model_phrases else "") if part
    )
    variant_sensitive = any(keyword in identity.searchable_name for keyword in VARIANT_SENSITIVE_KEYWORDS)
    # A spec term mixing letters and digits that is not a plain amount+unit.
    model_like_spec = any(
        re.search(r"[a-z]", term)
        and re.search(r"\d", term)
        and not re.fullmatch(r"\d+(?:\.\d+)?(?:ml|g|mg|kg|l)", term, re.I)
        for term in spec_terms
    )
    prefer_variant_search = (
        variant_sensitive
        and bool(variant_primary)
        and not model_phrases
        and not model_like_spec
        and not variant_options
        and "護甲油" not in identity.searchable_name
        and any(
            term in identity.searchable_name
            for term in ("護手霜", "芬香", "香氛", "香味", "擴香", "精油", "指甲油", "指彩")
        )
    )
    # Candidates in priority order; conditional slots collapse to "" and are
    # skipped when their condition does not hold.
    for value in (
        " ".join(part for part in (brand_part, core_primary, variant_primary, spec_part) if part)
        if is_dashing_diva_nail_line and variant_sensitive and variant_primary
        else "",
        " ".join(part for part in (brand_part, core_primary, variant_option_part, spec_part) if part)
        if variant_sensitive and variant_option_part and not model_phrases and not model_like_spec
        else "",
        " ".join(part for part in (brand_part, core_primary, variant_primary, spec_part) if part)
        if prefer_variant_search
        else "",
        " ".join(part for part in (brand_part, primary_with_model, spec_part) if part)
        if primary_with_model and model_phrases
        else "",
        " ".join(part for part in (brand_part, modifier_with_primary, spec_part) if part)
        if modifier_with_primary and identity.product_type and identity.product_type in core_primary
        else "",
        " ".join(part for part in (brand_part, core_primary, spec_part) if part)
        if variant_sensitive and core_primary and not variant_options
        else "",
        " ".join(part for part in (brand_part, core_primary, variant_primary, spec_part) if part)
        if variant_sensitive and variant_primary and variant_options and not model_phrases and not model_like_spec
        else "",
        " ".join(part for part in (brand_part, primary_with_model, spec_part) if part),
        " ".join(part for part in (brand_part, core_short, spec_part) if part),
        " ".join(part for part in (brand_part, core_short) if part),
        " ".join(part for part in (core_primary, spec_part) if part),
        identity.searchable_name,
    ):
        cleaned = _clean_search_phrase(value)
        # Dedupe on the truncated form that is actually stored; comparing the
        # untruncated phrase would let long repeated candidates in twice.
        truncated = cleaned[:42] if cleaned else ""
        if truncated and truncated not in terms:
            terms.append(truncated)
        if len(terms) >= max_terms:
            break
    return terms