#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""跨電商商品身份比對工具。
|
||
|
||
這裡處理「是否為同一個商品」;價格只作為 sanity check,不能主導配對。
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
import unicodedata
|
||
from dataclasses import dataclass, field
|
||
from difflib import SequenceMatcher
|
||
from typing import Iterable, Optional
|
||
|
||
|
||
NOISE_PHRASES = (
|
||
"momo",
|
||
"pchome",
|
||
"24h",
|
||
"官方直營",
|
||
"官方",
|
||
"公司貨",
|
||
"台灣公司貨",
|
||
"專櫃公司貨",
|
||
"正貨",
|
||
"原廠",
|
||
"限時",
|
||
"特惠",
|
||
"優惠",
|
||
"超值",
|
||
"加贈",
|
||
"贈品",
|
||
"送禮",
|
||
"送",
|
||
"買一送一",
|
||
"買1送1",
|
||
"限定版",
|
||
"璀璨奢金限定版",
|
||
"單入任選",
|
||
"單入",
|
||
"全肌防護",
|
||
"經典防護王",
|
||
"賦活美學",
|
||
"弱酸性",
|
||
"植萃複方",
|
||
"溫和潤澤護理",
|
||
"ph值平衡",
|
||
"淨味沐浴乳",
|
||
"香氛凝膠",
|
||
"絲絨甜點新色",
|
||
"鎖吻棒",
|
||
"水光持色",
|
||
"贈精油一瓶",
|
||
"贈送薰衣草精油",
|
||
"超聲波大噴霧",
|
||
"天然陶瓷",
|
||
"女大電視節目推薦",
|
||
"醫師好辣推薦",
|
||
"嬰兒界萬用霜",
|
||
"48h抑味爽身",
|
||
"10度c順降",
|
||
"vit b5",
|
||
"vitb5",
|
||
"任選",
|
||
"即期品",
|
||
"福利品",
|
||
"預購",
|
||
"免運",
|
||
"熱銷",
|
||
"人氣",
|
||
"必買",
|
||
"推薦",
|
||
"新品",
|
||
"升級版",
|
||
"經典",
|
||
"獨家",
|
||
"囤貨組",
|
||
"超值組",
|
||
"優惠組",
|
||
"分享包",
|
||
"組合",
|
||
"多款可選",
|
||
"多款任選",
|
||
"任選多款",
|
||
"多色可選",
|
||
"色號可選",
|
||
"平行輸入",
|
||
"大容量",
|
||
"附燈泡",
|
||
"贈燈泡",
|
||
"定時",
|
||
"調節亮度",
|
||
"可調光",
|
||
"聖誕禮物",
|
||
"聖誕節禮物",
|
||
"懶人霜",
|
||
"打造素顏女神",
|
||
"第三代經典版白",
|
||
)
|
||
|
||
GENERIC_TOKENS = {
|
||
"官方",
|
||
"直營",
|
||
"公司貨",
|
||
"專櫃",
|
||
"正貨",
|
||
"原廠",
|
||
"限時",
|
||
"特惠",
|
||
"優惠",
|
||
"超值",
|
||
"加贈",
|
||
"贈品",
|
||
"送禮",
|
||
"即期品",
|
||
"新品",
|
||
"升級版",
|
||
"經典",
|
||
"人氣",
|
||
"熱銷",
|
||
"必買",
|
||
"推薦",
|
||
"組",
|
||
"入",
|
||
"瓶",
|
||
"盒",
|
||
"包",
|
||
"片",
|
||
"支",
|
||
"條",
|
||
"件",
|
||
"ml",
|
||
"g",
|
||
"mg",
|
||
"la",
|
||
"paris",
|
||
"多款",
|
||
"可選",
|
||
"任選",
|
||
"平行輸入",
|
||
"大容量",
|
||
"日本",
|
||
"韓國",
|
||
"澳洲",
|
||
"法國",
|
||
"英國",
|
||
"美國",
|
||
}
|
||
|
||
SEARCH_NOISE_PHRASES = (
|
||
"新品上市",
|
||
"全新上市",
|
||
"國際航空版",
|
||
"超取免運",
|
||
"任選一款",
|
||
"任選1款",
|
||
"任選一色",
|
||
"任選1色",
|
||
"多款任選",
|
||
"多款可選",
|
||
"色號可選",
|
||
"香味可選",
|
||
"口味可選",
|
||
"送精美紙袋",
|
||
"精美紙袋",
|
||
"交換禮物",
|
||
"聖誕禮物",
|
||
"限定版",
|
||
"璀璨奢金限定版",
|
||
"單入任選",
|
||
"全肌防護",
|
||
"經典防護王",
|
||
"賦活美學",
|
||
"弱酸性",
|
||
"植萃複方",
|
||
"溫和潤澤護理",
|
||
"ph值平衡",
|
||
"淨味沐浴乳",
|
||
"香氛凝膠",
|
||
"絲絨甜點新色",
|
||
"鎖吻棒",
|
||
"水光持色",
|
||
"贈精油一瓶",
|
||
"贈送薰衣草精油",
|
||
"超聲波大噴霧",
|
||
"天然陶瓷",
|
||
"女大電視節目推薦",
|
||
"醫師好辣推薦",
|
||
"嬰兒界萬用霜",
|
||
"48h抑味爽身",
|
||
"10度c順降",
|
||
"vit b5",
|
||
"vitb5",
|
||
"母親節",
|
||
"父親節",
|
||
"情人節",
|
||
"外出清潔",
|
||
"卸除髒汙",
|
||
"卸除防曬",
|
||
"卸防曬",
|
||
"防水眼線",
|
||
"寶寶牙刷",
|
||
"紗布牙刷",
|
||
"調節亮度",
|
||
"韓國彩妝",
|
||
"水光感",
|
||
"官方直營",
|
||
"官方",
|
||
"經典款",
|
||
"校色",
|
||
"控油",
|
||
"好氣色",
|
||
"懶人霜",
|
||
"打造素顏女神",
|
||
"我愛修膚",
|
||
"第三代經典版白",
|
||
"溫和不乾澀",
|
||
"寶寶共和國",
|
||
"任選三款",
|
||
"三款",
|
||
"枚入",
|
||
"類光繚指甲油專用亮油",
|
||
"小銀蓋",
|
||
"如膠似漆",
|
||
"第三代",
|
||
"經典版",
|
||
"櫻花輕盈版",
|
||
"兩入組",
|
||
"超值兩入組",
|
||
"任選色號",
|
||
"多色任選",
|
||
"多色可選",
|
||
"多色",
|
||
"德國妮維雅",
|
||
"無印止汗滾珠",
|
||
"眉彩刷",
|
||
"眉餅盒分開販售",
|
||
"極細筆芯",
|
||
"防水抗暈",
|
||
"兒童化妝品",
|
||
"無毒防曬霜",
|
||
"天然彩妝",
|
||
"內贈芳香劑",
|
||
"衛浴精油擴香瓶棒組",
|
||
"衛浴精油擴香瓶",
|
||
"三色選一",
|
||
"贈複方",
|
||
)
|
||
|
||
SEARCH_NOISE_TOKENS = {
|
||
"一款",
|
||
"1款",
|
||
"一色",
|
||
"1色",
|
||
"上市",
|
||
"全新",
|
||
"新品",
|
||
"香味",
|
||
"口味",
|
||
"味道",
|
||
"顏色",
|
||
"色號",
|
||
"紙袋",
|
||
"禮物",
|
||
"清潔",
|
||
"髒汙",
|
||
"防曬",
|
||
"彩妝",
|
||
"水光感",
|
||
"超取",
|
||
"免運",
|
||
"航空版",
|
||
"國際版",
|
||
"附燈泡",
|
||
"定時",
|
||
"眼妝",
|
||
"滅菌",
|
||
"保濕",
|
||
"抗老",
|
||
"超品日",
|
||
"經典款",
|
||
"款",
|
||
"pdrn",
|
||
"校色",
|
||
"控油",
|
||
"好氣色",
|
||
"懶人霜",
|
||
"我愛修膚",
|
||
"第三代",
|
||
"經典版",
|
||
"版白",
|
||
"限量聯名款",
|
||
"play",
|
||
"boy",
|
||
"小虎",
|
||
"啾啾妹",
|
||
"煎妮花",
|
||
"涼感潔淨",
|
||
"私密處清潔",
|
||
"溫和不乾澀",
|
||
"寶寶共和國",
|
||
"三款",
|
||
"枚入",
|
||
"小銀蓋",
|
||
"如膠似漆",
|
||
"美甲",
|
||
"3d",
|
||
"多色",
|
||
"提亮",
|
||
"兩入組",
|
||
"櫻花輕盈版",
|
||
}
|
||
|
||
SEARCH_IDENTITY_ANCHORS = (
|
||
"控油清爽防曬棒",
|
||
"蔓越莓私密清潔慕斯",
|
||
"金縷梅私密清潔慕斯",
|
||
"光采奪目眼部飾底乳",
|
||
"男性私密沐浴露",
|
||
"私密沐浴露",
|
||
"hydsto 車載香薰",
|
||
"電動刮鬍刀 s101",
|
||
"磁吸控油定妝噴霧",
|
||
"修容打亮棒",
|
||
"私密潔淨凝露",
|
||
"柔霧裸唇膏",
|
||
"潤浸保濕清爽身體乳液",
|
||
"閃亮珍珠眼影棒",
|
||
"智能光感應無線自動除臭芳香噴霧機",
|
||
"usb精油薰香機",
|
||
"超音波水氧機",
|
||
"類光繚指甲油",
|
||
"多效提亮防曬霜",
|
||
"速描眼線膠筆",
|
||
"經典旋轉眉筆",
|
||
"3d造型眉彩餅補充芯",
|
||
"止汗爽身乳液",
|
||
"慕之幼爽身潤膚乳",
|
||
"精油芬香護手霜",
|
||
"持久植物香氛精油",
|
||
"口袋雙色修容打亮盤",
|
||
"經典乳霜",
|
||
"蜂王玫瑰外泌微臻霜",
|
||
"微分子肌底原生露",
|
||
"小浪智能感應自動噴香機",
|
||
"智能感應自動噴香機",
|
||
"深層全效面膜",
|
||
"私密防護舒緩噴霧",
|
||
"櫻之花身體噴霧",
|
||
"光透立體眼線筆",
|
||
"葳兒柔賦活凝膠",
|
||
"兒童指甲油",
|
||
"麗駐蘭修復舒緩面膜",
|
||
"濕度數顯智能加濕器",
|
||
"新艷陽夏水離子熱防禦隔離露",
|
||
"3d極細防水眼線膠筆",
|
||
"恆久完美透膚煙染腮紅",
|
||
"私密植萃美白緊緻凝露",
|
||
"學習刷牙漱口水",
|
||
"羅馬柱智慧居家車用香氛機",
|
||
"經典菲菲染唇液",
|
||
"染唇液",
|
||
"晨霧純精油擴香儀",
|
||
"天然植物香氛精油",
|
||
"爆水擦澡濕巾",
|
||
"嬰兒潤膚乳",
|
||
"可撕式水性兒童指甲油",
|
||
"aroma lava 解憂放鬆緩緩燈",
|
||
"經典款香氛蠟燭暖燈",
|
||
"我愛超磁妝定妝噴霧",
|
||
"全天候超完美定妝噴霧",
|
||
"怪獸級持色唇膏",
|
||
"焦糖楓葉香氛擴香花禮盒",
|
||
"香氛蠟燭20種香味",
|
||
"tokyo車用夾式消臭芳香劑",
|
||
"北歐簡樸融蠟燈桌面氣氛夜燈",
|
||
"大地有機植萃護膚油",
|
||
"3d立體持色眉彩盤",
|
||
"細芯睛彩雙頭眉筆",
|
||
"雙頭旋轉極細眉筆",
|
||
"武士刀眉筆",
|
||
"自動武士刀眉筆筆蕊",
|
||
"無極限保濕防曬妝前乳",
|
||
"水凝光透 妝前防護乳",
|
||
"水凝光透妝前防護乳",
|
||
"經典素顏霜",
|
||
"閃耀保色護甲油",
|
||
"溫和洗手慕斯",
|
||
"足足稱奇足膜",
|
||
"時尚潮流美甲片",
|
||
"止汗爽身噴霧",
|
||
"止汗爽身乳膏pro",
|
||
"零粉感超持久粉底棒",
|
||
"超持久水光鎖吻唇釉",
|
||
"裸光蜜粉餅",
|
||
"私密潔膚露",
|
||
"私密肌潔膚露",
|
||
"男性私密醒肌抑菌噴霧",
|
||
"男性私密激淨凝露",
|
||
"私密抑菌噴霧",
|
||
"天然陶瓷精油香薰機",
|
||
"裸光幻閃亮采餅",
|
||
"絕對持久定妝噴霧",
|
||
"兒童防曬氣墊粉餅",
|
||
"勝過眼皮十色眼影盤",
|
||
"提提亮膚打亮液",
|
||
"甜甜嫩頰腮紅液",
|
||
"自動武士刀眉筆",
|
||
"超進化光感輕潤遮瑕棒",
|
||
"4合1微臻全能氣墊粉餅",
|
||
"唯我玫瑰裸光潤唇膏",
|
||
"晨曦冷香儀",
|
||
"舒恬良修護霜",
|
||
"頂級濃潤柔霜潤唇膏",
|
||
"絕對完美永生玫瑰逆齡乳霜",
|
||
"永生玫瑰逆齡乳霜",
|
||
"永生玫瑰霜",
|
||
"玫瑰精露",
|
||
"玫瑰霜",
|
||
"青春敷面膜",
|
||
"長效潤膚霜",
|
||
"小黑瓶",
|
||
"私密處護潔露",
|
||
"私密護潔露",
|
||
"口腔清潔棒",
|
||
"含氟防蛀修護牙膏",
|
||
"自然遮瑕素顏霜",
|
||
"超持久細滑眼線筆",
|
||
"香氛融蠟燈",
|
||
"水晶香氛能量寶盒禮盒組",
|
||
"零粉感超持久柔焦蜜粉餅",
|
||
"私密肌潔淨露",
|
||
"私密潔浴露",
|
||
"身體除毛器",
|
||
"免用水潔淨液",
|
||
"身體按摩精油",
|
||
"按摩精油",
|
||
"擴香補充瓶",
|
||
"擴香瓶",
|
||
"全面修復霜",
|
||
"修復霜",
|
||
"護膚膏",
|
||
"屁屁噴",
|
||
"身體乳",
|
||
"緊實乳",
|
||
"妝前防護乳",
|
||
"妝前乳",
|
||
"素顏霜",
|
||
"潔膚露",
|
||
"浴潔露",
|
||
"潔淨液",
|
||
"護甲油",
|
||
"指甲油",
|
||
"美甲片",
|
||
"唇凍",
|
||
"唇釉",
|
||
"唇膏",
|
||
"粉底棒",
|
||
"遮瑕棒",
|
||
"化妝水",
|
||
"精華液",
|
||
"精華",
|
||
"面膜",
|
||
"乳液",
|
||
"乳霜",
|
||
"面霜",
|
||
"精油",
|
||
"水氧機",
|
||
"香氛機",
|
||
)
|
||
|
||
FOCUSED_IDENTITY_REVIEW_ONLY_REASONS = {
|
||
"muji_aroma_hand_cream_brandless",
|
||
"johnsons_baby_lotion_variant_catalog",
|
||
"im_meme_fixx_cool_setting_spray",
|
||
"so_natural_fixx_setting_spray_catalog",
|
||
"kate_powder_case_catalog",
|
||
"kate_monster_lipstick_catalog",
|
||
"opi_gel_polish_series_catalog",
|
||
"romand_juicy_lip_tint_2_catalog",
|
||
"recipe_box_peelable_child_polish_catalog",
|
||
"solone_longlasting_eyeliner",
|
||
"shu_auto_hard_formula_refill_catalog",
|
||
"summer_eve_full_skin_wash_2pack",
|
||
"the_forest_maple_diffuser_flower_brandless",
|
||
}
|
||
|
||
FOCUSED_IDENTITY_VARIANT_REVIEW_BYPASS_REASONS = {
|
||
"artmis_cranberry_private_mousse_250ml",
|
||
"artmis_witch_hazel_private_mousse_250ml",
|
||
"arden_eight_hour_lip_spf15_3_7g_3pack",
|
||
"baan_baby_lip_original_strawberry_catalog",
|
||
"dhc_olive_lip_1_5g",
|
||
"flortte_fruit_salad_eyeliner_0_5ml_catalog",
|
||
"frudia_honey_blueberry_lip_10g",
|
||
"hh_private_cleanser_laundry_wash_set",
|
||
"kanebo_allie_bright_uv_milk_60g_catalog",
|
||
"laroche_posay_lip_balm_4_7ml",
|
||
"laroche_posay_repair_lip_balm_7_5ml",
|
||
"lush_sakura_body_spray",
|
||
"neutrogena_hand_cream_56g_scent_catalog",
|
||
"natures_care_sheep_oil_exact_pack",
|
||
"opi_gel_polish_exact_model",
|
||
"sebamed_baby_lip_4_8g_2pack",
|
||
"sebamed_ph38_private_wash_200ml_2pack",
|
||
"so_natural_fixx_setting_spray_120ml_plain",
|
||
"sk_ii_essence_330ml_2pack",
|
||
"amiino_whitening_repair_cream_30ml",
|
||
"tomoon_nail_clipper_luxury_size",
|
||
"yes_cuticle_scissors_9cm",
|
||
"yes_curved_scissors_9cm",
|
||
"yes_foot_nail_scissors_10_5cm",
|
||
"yes_nail_tool_exact_model_size",
|
||
"cetaphil_long_lotion_237ml",
|
||
"cetaphil_long_lotion_473ml",
|
||
"cetaphil_long_moisturizing_cream_250g",
|
||
"cetaphil_ad_repair_cream_227g",
|
||
"clarins_double_serum_eye_20ml",
|
||
"lab52_paw_patrol_baby_toothbrush_2pack",
|
||
"derma_baby_wash_150ml",
|
||
"derma_baby_wash_500ml",
|
||
"physiogel_ai_ice_essence_200ml_2pack",
|
||
"playjoy_powerman_male_care_30ml",
|
||
"ts6_private_elastic_gel_40g",
|
||
"beauty_foot_mask_exact_pack",
|
||
"kameria_centella_foot_mask_17ml_2pc",
|
||
"ts6_lubricant_100g_3pack",
|
||
"ts6_peach_bright_gel_45g_3pack",
|
||
"ts6_white_wash_peach_gel_kit",
|
||
"ts6_cooling_clean_mousse_100g",
|
||
"vaseline_baby_jelly_368g_3pack",
|
||
}
|
||
|
||
FOCUSED_IDENTITY_BRANDLESS_REVIEW_REASONS = {
|
||
"herbacin_classic_hand_cream_20ml_brandless",
|
||
"muji_aroma_hand_cream_brandless",
|
||
"the_forest_maple_diffuser_flower_brandless",
|
||
}
|
||
|
||
FOCUSED_IDENTITY_BRANDLESS_TOTAL_PRICE_REASONS = {
|
||
"herbacin_classic_hand_cream_20ml_brandless",
|
||
}
|
||
|
||
FOCUSED_IDENTITY_TOTAL_PRICE_REASONS = {
|
||
"3w_clinic_collagen_foundation_50ml_2pack",
|
||
"hanamisui_moisture_original_gel_1_7g_3pack",
|
||
"hanamisui_inclear_private_gel_1_7g_3pack",
|
||
"hanamisui_relax_lavender_gel_1_7g_3pack",
|
||
"the_ordinary_caffeine_egcg_30ml",
|
||
"herbacin_classic_hand_cream_20ml_brandless",
|
||
"sab_private_spray",
|
||
"st_clare_private_mousse_150ml_2pack",
|
||
"st_clare_private_mousse_spray_set",
|
||
"biopeutic_plus_aha_lotion_20_150ml",
|
||
"taisu_baby_bath_shampoo_3pc",
|
||
"arden_eight_hour_lip_spf15_3_7g_3pack",
|
||
"flortte_fruit_salad_eyeliner_0_5ml_catalog",
|
||
"hh_private_cleanser_laundry_wash_set",
|
||
"kanebo_allie_bright_uv_milk_60g_catalog",
|
||
"laroche_posay_repair_lip_balm_7_5ml",
|
||
"neutrogena_hand_cream_56g_scent_catalog",
|
||
"natures_care_sheep_oil_exact_pack",
|
||
"opi_gel_polish_exact_model",
|
||
"sebamed_ph38_private_wash_200ml_2pack",
|
||
"sk_ii_essence_330ml_2pack",
|
||
"amiino_whitening_repair_cream_30ml",
|
||
"kussen_baby_butt_cream_50ml_3pack",
|
||
"tomoon_nail_clipper_luxury_size",
|
||
"yes_cuticle_scissors_9cm",
|
||
"yes_curved_scissors_9cm",
|
||
"yes_foot_nail_scissors_10_5cm",
|
||
"yes_nail_tool_exact_model_size",
|
||
"bone_diffuser_gift_3pack",
|
||
"selection1990_half_dome_wax_lamp_white",
|
||
"selection1990_bendable_wax_lamp_white",
|
||
"canmake_tear_bag_palette",
|
||
"gdesign_aroma_lava_lamp_2",
|
||
"hooome_classic_white_wax_lamp_bulbs_giftbox",
|
||
"herb24_mist_diffuser_black",
|
||
"pavaruni_40_scent_oil",
|
||
"pavaruni_20_scent_candle",
|
||
"artmis_cranberry_private_mousse_250ml",
|
||
"artmis_witch_hazel_private_mousse_250ml",
|
||
"baan_baby_lip_original_strawberry_catalog",
|
||
"dhc_olive_lip_1_5g",
|
||
"derma_eco_skin_oil",
|
||
"frudia_honey_blueberry_lip_10g",
|
||
"laroche_posay_lip_balm_4_7ml",
|
||
"lush_sakura_body_spray",
|
||
"sebamed_baby_lip_4_8g_2pack",
|
||
"so_natural_fixx_setting_spray_120ml_plain",
|
||
"cetaphil_long_lotion_237ml",
|
||
"cetaphil_long_lotion_473ml",
|
||
"cetaphil_long_moisturizing_cream_250g",
|
||
"cetaphil_ad_repair_cream_227g",
|
||
"clarins_double_serum_eye_20ml",
|
||
"lab52_paw_patrol_baby_toothbrush_2pack",
|
||
"derma_baby_wash_150ml",
|
||
"derma_baby_wash_500ml",
|
||
"physiogel_ai_ice_essence_200ml_2pack",
|
||
"playjoy_powerman_male_care_30ml",
|
||
"ts6_private_elastic_gel_40g",
|
||
"beauty_foot_mask_exact_pack",
|
||
"kameria_centella_foot_mask_17ml_2pc",
|
||
"ts6_lubricant_100g_3pack",
|
||
"ts6_peach_bright_gel_45g_3pack",
|
||
"ts6_white_wash_peach_gel_kit",
|
||
"ts6_cooling_clean_mousse_100g",
|
||
"vaseline_baby_jelly_368g_3pack",
|
||
"nivea_creme_100ml",
|
||
"schick_womens_sensitive_blade_3pack",
|
||
}
|
||
|
||
SEARCH_BROAD_ANCHORS = {
|
||
"乳霜",
|
||
"面霜",
|
||
"面膜",
|
||
"精華",
|
||
"乳液",
|
||
"精油",
|
||
"香氛融蠟燈",
|
||
}
|
||
|
||
VARIANT_SENSITIVE_KEYWORDS = {
|
||
"香氛蠟燭",
|
||
"芬香蠟燭",
|
||
"蠟燭",
|
||
"擴香",
|
||
"融蠟燈",
|
||
"車用香氛",
|
||
"香味",
|
||
"私密清潔慕斯",
|
||
"私密清潔凝露",
|
||
"私密潔淨凝露",
|
||
"私密淨白清潔凝露",
|
||
"私密防護慕絲",
|
||
"慕絲",
|
||
"定妝噴霧",
|
||
"妝前防護乳",
|
||
"妝前乳",
|
||
"素顏霜",
|
||
"粉底",
|
||
"美甲片",
|
||
"指甲油",
|
||
"指彩",
|
||
"眼影盤",
|
||
"唇釉",
|
||
"唇膏",
|
||
"唇凍",
|
||
"潤唇膏",
|
||
"眉筆",
|
||
"眼線筆",
|
||
"腮紅液",
|
||
"打亮液",
|
||
"蜜粉餅",
|
||
"粉餅盒",
|
||
"粉底棒",
|
||
"遮瑕棒",
|
||
"遮瑕蜜",
|
||
"護手霜",
|
||
"滋養霜",
|
||
"修護乳",
|
||
"修容打亮棒",
|
||
"防曬",
|
||
"防曬乳",
|
||
"防曬霜",
|
||
"防曬噴霧",
|
||
"防曬棒",
|
||
}
|
||
|
||
VARIANT_OPTION_COLOR_WORDS = {
|
||
"茉莉花",
|
||
"梔子花",
|
||
"白茶蘭花",
|
||
"白茶",
|
||
"白麝香",
|
||
"黑麝香",
|
||
"清新花園",
|
||
"寶貝粉香",
|
||
"青檸羅勒",
|
||
"炭木香",
|
||
"無花果",
|
||
"鼠尾草",
|
||
"海鹽",
|
||
"檸檬草",
|
||
"茶樹",
|
||
"英國梨",
|
||
"小蒼蘭",
|
||
"英國梨小蒼蘭",
|
||
"櫻花",
|
||
"繡球花",
|
||
"魔髮奇緣",
|
||
"清甜柚香",
|
||
"杏仁牛奶",
|
||
"杏仁",
|
||
"薄荷",
|
||
"橙花",
|
||
"完熟白桃",
|
||
"琥珀橙",
|
||
"干邑棕",
|
||
"賽車綠",
|
||
"原味",
|
||
"草莓",
|
||
"蔓越莓",
|
||
"金縷梅",
|
||
"柔焦霧面",
|
||
"水光亮面",
|
||
"菸鹼醯胺",
|
||
"胺基酸",
|
||
"黑色",
|
||
"棕色",
|
||
"咖啡色",
|
||
"灰色",
|
||
"rose",
|
||
"白色",
|
||
"紅色",
|
||
"粉色",
|
||
"粉紅",
|
||
"桃紅",
|
||
"玫瑰",
|
||
"玫瑰色",
|
||
"珊瑚",
|
||
"珊瑚色",
|
||
"橘色",
|
||
"橙色",
|
||
"裸色",
|
||
"奶茶色",
|
||
"豆沙色",
|
||
"紫色",
|
||
"絕絕紫",
|
||
"薰衣草",
|
||
"藍色",
|
||
"綠色",
|
||
"膚色",
|
||
"自然色",
|
||
"明亮色",
|
||
"透明色",
|
||
"清爽型",
|
||
"滋潤型",
|
||
"橡棕",
|
||
"暗灰",
|
||
"灰棕",
|
||
"淺玫粉",
|
||
"極光之藍",
|
||
"月光銀影",
|
||
}
|
||
|
||
VARIANT_DESCRIPTOR_NOISE_KEYWORDS = {
|
||
"平輸航空版",
|
||
"多色任選",
|
||
"色號任選",
|
||
"任選色號",
|
||
"極細筆頭",
|
||
"筆頭",
|
||
"官方直營",
|
||
"入組",
|
||
"盒組",
|
||
}
|
||
|
||
MULTI_VARIANT_LISTING_PHRASES = (
|
||
"多款任選",
|
||
"多款可選",
|
||
"多色任選",
|
||
"多色可選",
|
||
"多種香味",
|
||
"多種香氣",
|
||
"香味任選",
|
||
"香味可選",
|
||
"味道可選",
|
||
"任選",
|
||
"可選",
|
||
)
|
||
|
||
SEARCH_AMBIGUOUS_PRODUCT_TERMS = {
|
||
"保護膜",
|
||
"保護貼",
|
||
}
|
||
|
||
BRAND_ALIAS_OVERRIDES = {
|
||
"clarins": ("克蘭詩", "clarins"),
|
||
"nars": ("nars",),
|
||
"relove": ("relove",),
|
||
"stadler form": ("stadler form", "stadlerform"),
|
||
"cetaphil": ("舒特膚", "cetaphil"),
|
||
"sisley": ("希思黎", "sisley"),
|
||
"gennies": ("奇妮", "gennies"),
|
||
"uruhimemomoko": ("潤姬桃子", "uruhimemomoko", "uruhime momoko"),
|
||
"arau baby": ("arau baby", "arau", "愛樂寶", "saraya"),
|
||
"sebamed": ("sebamed", "施巴"),
|
||
"shu uemura": ("shu uemura", "shuuemura", "植村秀"),
|
||
"johnsons": ("johnsons", "johnson's", "johnson", "嬌生"),
|
||
"gillette": ("gillette", "吉列"),
|
||
"schick": ("schick", "舒適牌"),
|
||
"obge": ("obge",),
|
||
"vaseline": ("vaseline", "凡士林"),
|
||
"eaoron": ("eaoron",),
|
||
"kameria": ("kameria", "凱蜜菈"),
|
||
"cocodor": ("cocodor",),
|
||
"peripera": ("peripera",),
|
||
"solone": ("solone",),
|
||
"im meme": ("im meme", "i'm meme", "i’m meme"),
|
||
"dr.wu": ("dr.wu", "dr wu", "drwu", "達爾膚"),
|
||
"febreze": ("febreze", "風倍清"),
|
||
"jo malone": ("jo malone",),
|
||
"prada": ("prada", "普拉達"),
|
||
"za": ("za",),
|
||
"hh": ("hh", "草本新淨界"),
|
||
"小浪": ("小浪",),
|
||
"xiaomi": ("小米有品", "小米", "xiaomi"),
|
||
"mac": ("m.a.c", "mac", "m a c"),
|
||
"opi": ("o.p.i", "opi", "o p i"),
|
||
"curel": ("curel", "珂潤"),
|
||
"karadium": ("karadium",),
|
||
"st雞仔牌": ("日本雞仔牌st", "日本st雞仔牌", "st雞仔牌", "雞仔牌st", "雞仔牌"),
|
||
}
|
||
|
||
PRODUCT_TYPES = {
|
||
"止汗噴霧": ("止汗爽身噴霧", "爽身噴霧", "止汗噴霧"),
|
||
"潔膚露": ("潔膚露", "浴潔露", "護潔露", "沐浴露", "wash", "私密潔浴露"),
|
||
"私密噴霧": ("私密噴霧", "抑菌噴霧", "醒肌抑菌噴霧"),
|
||
"私密凝露": ("凝露", "激淨凝露", "緊實凝露", "亮白凝露"),
|
||
"護甲油": ("護甲油", "亮油", "top coat"),
|
||
"定妝噴霧": ("定妝噴霧", "setting spray"),
|
||
"修容打亮棒": ("修容打亮棒", "修容棒", "打亮棒"),
|
||
"刮鬍刀": ("刮鬍刀", "刮胡刀", "shaver", "razor"),
|
||
"體香膏": ("體香膏", "體香劑", "deodorant"),
|
||
"電動牙刷": ("電動牙刷", "聲波電動牙刷", "electric toothbrush"),
|
||
"洗手慕斯": ("洗手慕斯", "洗手泡泡", "hand wash foam"),
|
||
"私密慕斯": ("私密清潔慕斯", "私密防護慕絲", "私密慕斯"),
|
||
"足膜": ("足膜", "足部膜", "足部去角質"),
|
||
"妝前乳": ("妝前乳", "妝前防護乳", "妝前隔離", "primer"),
|
||
"素顏霜": ("素顏霜", "tone up cream"),
|
||
"氣墊粉餅": ("氣墊粉餅", "cushion"),
|
||
"眼影盤": ("眼影盤",),
|
||
"打亮液": ("打亮液",),
|
||
"腮紅液": ("腮紅液",),
|
||
"護唇膏": ("護唇膏", "潤唇膏"),
|
||
"唇釉": ("唇釉", "唇彩", "lip tint", "lip glaze"),
|
||
"粉底棒": ("粉底棒", "foundation stick"),
|
||
"精華": ("精華", "精華液", "essence", "serum", "安瓶"),
|
||
"化妝水": ("化妝水", "機能水", "toner", "lotion"),
|
||
"乳液": ("乳液", "按摩乳", "潤膚乳", "身體乳", "嬰兒乳液", "寶寶乳液", "emulsion", "milk"),
|
||
"面霜": ("面霜", "乳霜", "霜", "cream"),
|
||
"防曬": ("防曬", "spf", "uv", "sunscreen"),
|
||
"洗面乳": ("洗面乳", "洗顏", "潔面", "cleanser", "foam"),
|
||
"面膜": ("面膜", "mask"),
|
||
"眼霜": ("眼霜", "眼部", "眼膜", "eye"),
|
||
"卸妝": ("卸妝", "cleansing", "remover"),
|
||
"粉底": ("粉底", "粉霜", "粉凝露", "foundation"),
|
||
"蜜粉": ("蜜粉", "powder"),
|
||
"精油": ("精油", "香氛", "擴香"),
|
||
"保健": ("錠", "膠囊", "粉", "飲", "包", "健康食品"),
|
||
}
|
||
|
||
COUNT_UNITS = {"入", "組", "瓶", "支", "條", "盒", "包", "袋", "片", "顆", "粒", "錠", "枚", "件", "罐", "杯", "本", "刀把", "刀片", "刀頭", "蕊"}
|
||
COUNT_UNIT_PATTERN = r"(?:刀把|刀片|刀頭|入|組|瓶|支|條|盒|包|袋|片|顆|粒|錠|枚|件|罐|杯|本|蕊)"
|
||
PIECE_UNITS = {"包", "袋", "片", "顆", "粒", "錠", "枚"}
|
||
CONTAINER_UNITS = {"入", "組", "盒", "罐", "杯", "本", "瓶", "支", "條", "件"}
|
||
COUNT_UNIT_FAMILIES = {
|
||
"刀片": "blade",
|
||
"刀頭": "blade",
|
||
"蕊": "refill",
|
||
}
|
||
ENGLISH_COUNT_UNIT_RE = r"(?:pcs?|pieces?|capsules?|caps?|tablets?|tabs?|packs?|sachets?|bottles?|boxes?)"
|
||
BUNDLE_OFFER_PHRASES = (
|
||
"囤貨組",
|
||
"超值組",
|
||
"特惠組",
|
||
"優惠組",
|
||
"優惠套組",
|
||
"入門組",
|
||
"禮盒組",
|
||
"加大組",
|
||
"加量組",
|
||
"分享組",
|
||
"明星組",
|
||
"套組",
|
||
"組合",
|
||
"組合包",
|
||
"雙件組",
|
||
"二件組",
|
||
"2件組",
|
||
"家庭組",
|
||
"多入組",
|
||
)
|
||
NON_BRAND_BRACKET_PHRASES = (
|
||
"保濕組",
|
||
"熱銷款",
|
||
"限定",
|
||
"特惠",
|
||
"優惠",
|
||
"超值",
|
||
"囤貨",
|
||
"組合",
|
||
"套組",
|
||
"禮盒",
|
||
"分享",
|
||
"雙件",
|
||
"二件",
|
||
"2件",
|
||
"家庭",
|
||
"多入",
|
||
"任選",
|
||
"買",
|
||
"母親節",
|
||
)
|
||
CHINESE_COUNT = {
|
||
"一": 1,
|
||
"二": 2,
|
||
"兩": 2,
|
||
"雙": 2,
|
||
"三": 3,
|
||
"四": 4,
|
||
"五": 5,
|
||
"六": 6,
|
||
"七": 7,
|
||
"八": 8,
|
||
"九": 9,
|
||
"十": 10,
|
||
}
|
||
|
||
|
||
@dataclass(frozen=True)
class ProductIdentity:
    """Immutable, parsed view of one product title.

    Instances are built by ``parse_product_identity``; the comments on each
    field describe how that function fills it in.
    """

    # The title exactly as supplied (``""`` when the caller passed a falsy value).
    original_name: str
    # Output of ``normalize_product_text`` for the title.
    normalized_name: str
    # ``normalized_name`` after ``_strip_noise`` removed the noise phrases.
    searchable_name: str
    # Known-brand tokens found in the searchable name, otherwise the
    # leading-position guesses from ``_leading_brand_tokens``.
    brand_tokens: frozenset[str]
    # Key of ``PRODUCT_TYPES`` whose alias appears in the searchable name, if any.
    product_type: Optional[str]
    # Every token produced by ``_tokenize`` on the searchable name.
    tokens: frozenset[str]
    # ``tokens`` without generic, numeric/spec-like and brand tokens, plus
    # the model codes returned by ``_extract_model_tokens``.
    core_tokens: frozenset[str]
    # Measurements parsed from the normalized name, already converted to
    # millilitres, grams and milligrams respectively (sorted, de-duplicated).
    volumes_ml: tuple[float, ...]
    weights_g: tuple[float, ...]
    dosages_mg: tuple[float, ...]
    # Sorted, de-duplicated ``(quantity, unit)`` pairs, e.g. ``(3, "入")``.
    counts: tuple[tuple[int, str], ...]
    # Total number of individual pieces when it could be derived, else ``None``.
    total_piece_count: Optional[int]
|
||
|
||
|
||
@dataclass(frozen=True)
class MatchDiagnostics:
    """Score breakdown and classification labels for one compared product pair.

    The numeric fields carry the individual score components; the string
    fields and ``evidence_flags`` are surfaced as tags through ``tags``.
    """

    score: float
    brand_score: float
    token_score: float
    spec_score: float
    sequence_score: float
    type_score: float
    price_penalty: float
    hard_veto: bool
    reasons: tuple[str, ...]
    comparison_mode: str = "exact_identity"
    match_type: str = "exact"
    price_basis: str = "total_price"
    alert_tier: str = "price_alert_exact"
    evidence_flags: tuple[str, ...] = ()
    identity_evidence: dict[str, object] = field(default_factory=dict)
    offer_evidence: dict[str, object] = field(default_factory=dict)

    @property
    def tags(self) -> list[str]:
        """Return the flat tag list describing this result.

        Order is fixed: the ``identity_v2`` marker, one prefixed tag per
        non-empty classification field, the threshold/veto markers, then one
        ``evidence_*`` tag per evidence flag.
        """
        collected: list[str] = ["identity_v2"]

        # Classification fields contribute "<prefix>_<value>" when non-empty.
        prefixed = (
            ("comparison", self.comparison_mode),
            ("match_type", self.match_type),
            ("price_basis", self.price_basis),
            ("alert_tier", self.alert_tier),
        )
        collected.extend(f"{prefix}_{value}" for prefix, value in prefixed if value)

        # Fixed markers, each guarded by its own condition.
        markers = (
            ("brand_match", self.brand_score >= 0.95),
            ("spec_match", self.spec_score >= 0.85),
            ("identity_veto", self.hard_veto),
        )
        collected.extend(marker for marker, enabled in markers if enabled)

        collected.extend(f"evidence_{flag}" for flag in self.evidence_flags)
        return collected
|
||
|
||
|
||
@dataclass(frozen=True)
class UnitPriceComparison:
    """Outcome of a per-unit price comparison between the momo and competitor offers.

    ``comparable`` and ``reason`` are always set; every other field has a
    neutral default (``""`` or ``None``).
    """

    comparable: bool
    reason: str
    unit_label: str = ""
    momo_total_quantity: Optional[float] = None
    competitor_total_quantity: Optional[float] = None
    momo_unit_price: Optional[float] = None
    competitor_unit_price: Optional[float] = None
    unit_gap_amount: Optional[float] = None
    unit_gap_pct: Optional[float] = None
    summary: str = ""

    def as_dict(self) -> dict:
        """Return the fields as a plain dict, keyed by field name in declaration order."""
        exported = (
            "comparable",
            "reason",
            "unit_label",
            "momo_total_quantity",
            "competitor_total_quantity",
            "momo_unit_price",
            "competitor_unit_price",
            "unit_gap_amount",
            "unit_gap_pct",
            "summary",
        )
        return {key: getattr(self, key) for key in exported}
|
||
|
||
|
||
def normalize_product_text(value: str) -> str:
    """Return a canonical, lower-cased form of a product title for matching.

    Processing order:

    1. NFKC-fold the text, so full-width letters, digits and punctuation
       become their ASCII counterparts.
    2. NFKD-decompose and drop every combining mark (accents are removed).
    3. Rewrite the multiplier signs ``×`` and ``*`` as ``x``.
    4. Rewrite the variant character 粧 as 妝.
    5. Lower-case, then replace question marks and all bracket characters
       with spaces.
    6. Collapse whitespace runs to one space and trim both ends.

    A falsy *value* (``None`` or ``""``) yields ``""``.
    """
    text = unicodedata.normalize("NFKC", value or "")
    text = "".join(
        char
        for char in unicodedata.normalize("NFKD", text)
        if not unicodedata.combining(char)
    )
    # NFKC has already folded the full-width forms of * / & ? ( ), so only
    # the ASCII characters can still occur; one replacement each is enough.
    text = text.replace("×", "x").replace("*", "x")
    text = text.replace("粧", "妝")
    text = re.sub(r"[\u3000\r\n\t]+", " ", text)
    text = text.lower()
    text = re.sub(r"\?+", " ", text)
    text = re.sub(r"[【】\[\]{}「」『』()]", " ", text)
    return re.sub(r"\s+", " ", text).strip()
|
||
|
||
|
||
def _strip_noise(value: str) -> str:
    """Blank out every phrase in ``NOISE_PHRASES`` and tidy the whitespace.

    Phrases are applied longest first so that a short phrase cannot split a
    longer phrase that contains it. Each hit is replaced by a space; the
    result has whitespace runs collapsed and both ends trimmed.
    """
    longest_first = sorted(NOISE_PHRASES, key=len, reverse=True)
    cleaned = value
    for noise in longest_first:
        cleaned = cleaned.replace(noise.lower(), " ")
    return re.sub(r"\s+", " ", cleaned).strip()
|
||
|
||
|
||
def _tokenize(value: str) -> list[str]:
|
||
raw_tokens = re.findall(r"[a-z0-9]+|[\u4e00-\u9fff]+", value)
|
||
tokens: list[str] = []
|
||
for token in raw_tokens:
|
||
if len(token) <= 1 and not token.isdigit():
|
||
continue
|
||
tokens.append(token)
|
||
return tokens
|
||
|
||
|
||
def _extract_model_tokens(text: str) -> set[str]:
    """Return the model-number-like codes found in *text*.

    A candidate is a stand-alone run of 1-4 letters, an optional hyphen, up
    to 3 more letters, at least two digits and any trailing letters, digits
    or hyphens. Candidates are compacted to lower-case alphanumerics and kept
    only when they are not spec-like (SPF/PA/UV), are at least 4 characters
    long, and contain both a letter and a digit.
    """
    candidate_re = r"(?<![a-z0-9])([a-z]{1,4}-?[a-z]{0,3}\d{2,}[a-z0-9-]*)(?![a-z0-9])"
    compacted = (
        re.sub(r"[^a-z0-9]", "", found.lower())
        for found in re.findall(candidate_re, text, re.I)
    )
    return {
        code
        for code in compacted
        if not _is_spec_like_latin_token(code)
        and len(code) >= 4
        and re.search(r"[a-z]", code)
        and re.search(r"\d", code)
    }
|
||
|
||
|
||
def _is_spec_like_latin_token(token: str) -> bool:
|
||
return bool(
|
||
re.fullmatch(r"spf\d{1,3}[a-z]?", token)
|
||
or re.fullmatch(r"pa\d*", token)
|
||
or token in {"uva", "uvb", "uv", "spf"}
|
||
)
|
||
|
||
|
||
def _brand_alias_present(text: str, alias_norm: str, text_tokens: set[str]) -> bool:
|
||
if not alias_norm:
|
||
return False
|
||
if re.search(r"[\u4e00-\u9fff]", alias_norm):
|
||
return alias_norm in text
|
||
if " " not in alias_norm and alias_norm in text_tokens:
|
||
return True
|
||
if re.fullmatch(r"[a-z0-9][a-z0-9\s'&.-]*", alias_norm):
|
||
pattern = r"(?<![a-z0-9])" + re.escape(alias_norm).replace(r"\ ", r"\s+") + r"(?![a-z0-9])"
|
||
return bool(re.search(pattern, text))
|
||
return alias_norm in text
|
||
|
||
|
||
def _known_brand_tokens(text: str) -> set[str]:
    """Collect brand tokens for every known brand alias that occurs in *text*.

    The alias table is the union of ``BRAND_NORMALIZE_MAP`` (iterated here as
    alias -> canonical name) and ``BRAND_ALIAS_OVERRIDES``; related aliases
    come from ``BRAND_ALIASES`` plus the overrides. For each alias present in
    *text* the result receives the canonical name, the tokens of the alias
    itself, and the tokens of every related alias of that brand. One- and
    two-letter Latin tokens from aliases are ignored, and anything listed in
    ``GENERIC_TOKENS`` is removed at the end.

    Returns an empty set when no alias matches.
    """
    try:
        from services.price_comparison import BRAND_ALIASES, BRAND_NORMALIZE_MAP
    except Exception:
        # Deliberately best-effort: when the shared brand tables cannot be
        # loaded, only the local overrides are used.
        BRAND_ALIASES = {}
        BRAND_NORMALIZE_MAP = {}

    alias_map = dict(BRAND_NORMALIZE_MAP)
    alias_groups = {canonical: list(aliases) for canonical, aliases in BRAND_ALIASES.items()}
    for canonical, aliases in BRAND_ALIAS_OVERRIDES.items():
        alias_groups.setdefault(canonical, []).extend(aliases)
        alias_map[canonical.lower()] = canonical
        for alias in aliases:
            alias_map[alias.lower()] = canonical

    def brand_words(alias_text: str) -> list[str]:
        # Very short Latin fragments ("dr", "za") are too ambiguous to serve
        # as brand tokens on their own.
        return [
            token
            for token in _tokenize(alias_text)
            if not re.fullmatch(r"[a-z]{1,2}", token)
        ]

    # A set, as `_brand_alias_present` is annotated to receive: it is probed
    # once per alias, so this avoids a linear scan of a list every time.
    text_tokens = set(_tokenize(text))

    tokens: set[str] = set()
    for alias, canonical in alias_map.items():
        alias_norm = normalize_product_text(alias)
        if not _brand_alias_present(text, alias_norm, text_tokens):
            continue
        tokens.add(canonical)
        tokens.update(brand_words(alias_norm))
        for related in alias_groups.get(canonical, []):
            tokens.update(brand_words(normalize_product_text(related)))

    return {token for token in tokens if token and token not in GENERIC_TOKENS}
|
||
|
||
|
||
def _leading_brand_tokens(original: str, normalized: str) -> set[str]:
    """Guess brand tokens from the start of a title when no known brand matched.

    Three sources are combined:

    * the tokens of a leading ``【...】`` / ``[...]`` group of 2-40 characters
      in *original*, unless the group contains a phrase from
      ``NON_BRAND_BRACKET_PHRASES``;
    * the first token of the first 48 characters of *normalized*, when it is
      2-6 CJK ideographs and not generic;
    * every Latin token of 3+ characters in those 48 characters that is not
      spec-like (SPF/PA/UV).
    """
    guessed: set[str] = set()

    bracketed = re.match(r"\s*[【\[]([^】\]]{2,40})[】\]]", original or "")
    if bracketed:
        inner = normalize_product_text(bracketed.group(1))
        if all(phrase not in inner for phrase in NON_BRAND_BRACKET_PHRASES):
            guessed.update(
                token
                for token in _tokenize(_strip_noise(inner))
                if token not in GENERIC_TOKENS
            )

    head_tokens = _tokenize(normalized[:48])
    if head_tokens:
        opener = head_tokens[0]
        if opener not in GENERIC_TOKENS and re.fullmatch(r"[\u4e00-\u9fff]{2,6}", opener):
            guessed.add(opener)

    guessed.update(
        token
        for token in head_tokens
        if re.fullmatch(r"[a-z][a-z0-9\-']{2,}", token)
        and not _is_spec_like_latin_token(token)
    )
    return guessed
|
||
|
||
|
||
def _extract_product_type(text: str) -> Optional[str]:
    """Return the first ``PRODUCT_TYPES`` key with an alias contained in *text*.

    Types are tried in the dictionary's declaration order, so earlier (more
    specific) entries win over later ones. Returns ``None`` when nothing matches.
    """
    matching_types = (
        type_name
        for type_name, aliases in PRODUCT_TYPES.items()
        if any(alias.lower() in text for alias in aliases)
    )
    return next(matching_types, None)
|
||
|
||
|
||
def _convert_volume(value: str, unit: str) -> Optional[tuple[str, float]]:
|
||
try:
|
||
number = float(value)
|
||
except (TypeError, ValueError):
|
||
return None
|
||
unit = unit.lower()
|
||
if unit in {"ml", "毫升"}:
|
||
return ("ml", number)
|
||
if unit == "l":
|
||
return ("ml", number * 1000)
|
||
if unit in {"g", "公克"}:
|
||
return ("g", number)
|
||
if unit == "kg":
|
||
return ("g", number * 1000)
|
||
if unit in {"mg", "毫克"}:
|
||
return ("mg", number)
|
||
if unit in {"mcg", "μg", "ug", "微克"}:
|
||
return ("mg", number / 1000)
|
||
return None
|
||
|
||
|
||
def _count_unit_family(unit: str) -> str:
    """Map a count unit to its family name from ``COUNT_UNIT_FAMILIES``.

    Units without an entry are their own family and are returned unchanged.
    """
    if unit in COUNT_UNIT_FAMILIES:
        return COUNT_UNIT_FAMILIES[unit]
    return unit
|
||
|
||
|
||
def _extract_specs(
    text: str,
) -> tuple[tuple[float, ...], tuple[float, ...], tuple[float, ...], tuple[tuple[int, str], ...], Optional[int]]:
    """Parse measurements and pack counts out of a normalized product title.

    Args:
        text: Title as produced by ``normalize_product_text`` (lower-case,
            multiplier signs already rewritten to ``x``).

    Returns:
        ``(volumes_ml, weights_g, dosages_mg, counts, total_piece_count)``:

        * the three measurement tuples are sorted and de-duplicated, in
          millilitres, grams and milligrams;
        * ``counts`` is a sorted, de-duplicated tuple of ``(quantity, unit)``
          pairs; counts without an explicit unit use ``"入"``;
        * ``total_piece_count`` is the explicit "共 N <piece unit>" total when
          present; otherwise the largest piece count, multiplied by the
          largest container count when both kinds occur; otherwise ``None``.
    """
    volumes_ml: list[float] = []
    weights_g: list[float] = []
    dosages_mg: list[float] = []
    # A Latin unit must not be the start of a longer word: without the guard
    # "spf50 lotion" is read as 50 l and "3 in 1 gel" as 1 g. The letter "x"
    # stays allowed so multiplier forms such as "30mlx2" keep parsing.
    measurement_re = (
        r"(\d+(?:\.\d+)?)\s*"
        r"(毫升|公克|毫克|微克|(?:ml|l|g|kg|mg|mcg|μg|ug)(?![a-wyz]))"
    )
    for match in re.finditer(measurement_re, text, re.I):
        converted = _convert_volume(match.group(1), match.group(2))
        if not converted:
            continue
        unit, number = converted
        if unit == "ml":
            volumes_ml.append(number)
        elif unit == "g":
            weights_g.append(number)
        else:
            dosages_mg.append(number)

    counts: list[tuple[int, str]] = []
    # Arabic digits followed by a count unit, e.g. "3入".
    for match in re.finditer(rf"(\d+)\s*({COUNT_UNIT_PATTERN})", text):
        counts.append((int(match.group(1)), match.group(2)))
    # Chinese numerals followed by a count unit, e.g. "兩瓶".
    for match in re.finditer(rf"([一二兩雙三四五六七八九十])\s*({COUNT_UNIT_PATTERN})", text):
        counts.append((CHINESE_COUNT[match.group(1)], match.group(2)))
    # Multiplier forms, e.g. "x2" or "乘3瓶".
    for match in re.finditer(rf"(?:x|乘)\s*(\d+)\s*({COUNT_UNIT_PATTERN})?", text, re.I):
        unit = match.group(2) or "入"
        if not match.group(2):
            prefix = text[max(0, match.start() - 4):match.start()].strip().lower()
            suffix = text[match.end():match.end() + 1]
            spec_prefixed = bool(re.search(r"(?:ml|毫升|g|公克|kg|mg|oz)$", prefix))
            # An "x" glued to a Latin word ("max2") is part of a name, not a
            # multiplier, unless that word is a measurement unit.
            if re.search(r"[a-z]$", prefix) and not spec_prefixed:
                continue
            # A bare number running straight into Chinese text is not taken
            # as a pack count unless it multiplies a measurement.
            if suffix and re.match(r"[\u4e00-\u9fff]", suffix) and not spec_prefixed:
                continue
        counts.append((int(match.group(1)), unit))
    # English count words ("3 pcs", "2 bottles") are recorded with the unit "入".
    for match in re.finditer(rf"(\d+)\s*{ENGLISH_COUNT_UNIT_RE}", text, re.I):
        counts.append((int(match.group(1)), "入"))

    # "買 N 送 M" offers count as N + M items.
    # NOTE(review): `_count_text_value` is defined outside this view; it
    # presumably maps digit or Chinese-numeral text to an int or None.
    buy_get = re.search(r"買\s*(\d+|[一二兩雙三四五六七八九十])\s*送\s*(\d+|[一二兩雙三四五六七八九十])", text)
    if buy_get:
        total_count = (_count_text_value(buy_get.group(1)) or 0) + (_count_text_value(buy_get.group(2)) or 0)
        if total_count > 1:
            counts.append((total_count, "入"))
    if "買一送一" in text or "買1送1" in text:
        counts.append((2, "入"))

    total_piece_count = None
    explicit_total = re.search(r"共\s*(\d+)\s*([包袋片顆粒錠枚])", text)
    if explicit_total:
        total_piece_count = int(explicit_total.group(1))
    else:
        piece_counts = [count for count, unit in counts if unit in PIECE_UNITS]
        container_counts = [count for count, unit in counts if unit in CONTAINER_UNITS]
        if piece_counts and container_counts:
            total_piece_count = max(piece_counts) * max(container_counts)
        elif piece_counts:
            total_piece_count = max(piece_counts)

    unique_counts = tuple(sorted(set(counts)))
    return (
        tuple(sorted(set(volumes_ml))),
        tuple(sorted(set(weights_g))),
        tuple(sorted(set(dosages_mg))),
        unique_counts,
        total_piece_count,
    )
|
||
|
||
|
||
def parse_product_identity(name: str) -> ProductIdentity:
    """Parse a raw product title into a ``ProductIdentity``.

    Brand tokens come from the known-brand tables when any alias matches, and
    otherwise from leading-position heuristics. Core tokens are the
    searchable tokens minus generic words, pure numbers / number+unit tokens
    and brand tokens, plus any model codes. Specs are read from the
    normalized (not noise-stripped) title.
    """
    normalized = normalize_product_text(name)
    searchable = _strip_noise(normalized)
    all_tokens = set(_tokenize(searchable))

    brands = _known_brand_tokens(searchable)
    if not brands:
        brands = _leading_brand_tokens(name, normalized)

    core: set[str] = set()
    for candidate in all_tokens:
        if candidate in GENERIC_TOKENS or candidate.isdigit():
            continue
        if re.fullmatch(r"\d+(ml|g|kg|l|mg|mcg|ug)?", candidate):
            continue
        core.add(candidate)
    core -= brands
    core |= _extract_model_tokens(searchable)

    specs = _extract_specs(normalized)
    volumes_ml, weights_g, dosages_mg, counts, total_piece_count = specs

    return ProductIdentity(
        original_name=name or "",
        normalized_name=normalized,
        searchable_name=searchable,
        brand_tokens=frozenset(brands),
        product_type=_extract_product_type(searchable),
        tokens=frozenset(all_tokens),
        core_tokens=frozenset(core),
        volumes_ml=volumes_ml,
        weights_g=weights_g,
        dosages_mg=dosages_mg,
        counts=counts,
        total_piece_count=total_piece_count,
    )
|
||
|
||
|
||
def _weighted_token_score(left: ProductIdentity, right: ProductIdentity) -> float:
|
||
def expand_tokens(identity: ProductIdentity) -> set[str]:
|
||
tokens = set(identity.brand_tokens | identity.core_tokens)
|
||
for token in identity.core_tokens:
|
||
chinese = "".join(char for char in token if "\u4e00" <= char <= "\u9fff")
|
||
if len(chinese) >= 3:
|
||
tokens.update(f"zh:{chinese[i:i + 2]}" for i in range(len(chinese) - 1))
|
||
return tokens
|
||
|
||
left_tokens = expand_tokens(left)
|
||
right_tokens = expand_tokens(right)
|
||
if not left_tokens or not right_tokens:
|
||
return SequenceMatcher(None, left.searchable_name, right.searchable_name).ratio() * 0.6
|
||
|
||
def weight(token: str) -> float:
|
||
if token in left.brand_tokens or token in right.brand_tokens:
|
||
return 1.4
|
||
if token.startswith("zh:"):
|
||
return 0.55
|
||
if re.search(r"\d", token):
|
||
return 1.2
|
||
if len(token) >= 4:
|
||
return 1.25
|
||
return 1.0
|
||
|
||
overlap = left_tokens & right_tokens
|
||
overlap_weight = sum(weight(token) for token in overlap)
|
||
total_weight = sum(weight(token) for token in left_tokens) + sum(weight(token) for token in right_tokens)
|
||
dice = (2 * overlap_weight / total_weight) if total_weight else 0
|
||
sequence = SequenceMatcher(None, " ".join(sorted(left_tokens)), " ".join(sorted(right_tokens))).ratio()
|
||
return min(1.0, dice * 0.72 + sequence * 0.28)
|
||
|
||
|
||
def _brand_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, bool, str | None]:
|
||
if not left.brand_tokens or not right.brand_tokens:
|
||
return 0.55, False, None
|
||
if left.brand_tokens & right.brand_tokens:
|
||
return 1.0, False, None
|
||
maquillage_anchor = "心機星魅蜜光圈潤唇膏"
|
||
left_has_shiseido = bool({"shiseido", "資生堂"} & left.brand_tokens)
|
||
right_has_shiseido = bool({"shiseido", "資生堂"} & right.brand_tokens)
|
||
left_has_maquillage = bool({"maquillage", "心機彩妝"} & left.brand_tokens)
|
||
right_has_maquillage = bool({"maquillage", "心機彩妝"} & right.brand_tokens)
|
||
if (
|
||
maquillage_anchor in left.normalized_name
|
||
and maquillage_anchor in right.normalized_name
|
||
and ((left_has_shiseido and right_has_maquillage) or (right_has_shiseido and left_has_maquillage))
|
||
):
|
||
return 1.0, False, None
|
||
return 0.0, True, "brand_conflict"
|
||
|
||
|
||
def _close_number(left: float, right: float, tolerance: float = 0.04) -> bool:
|
||
denominator = max(abs(left), abs(right), 1.0)
|
||
return abs(left - right) / denominator <= tolerance
|
||
|
||
|
||
def _spec_component(left_values: Iterable[float], right_values: Iterable[float]) -> tuple[float, bool]:
|
||
left_tuple = tuple(sorted(set(left_values)))
|
||
right_tuple = tuple(sorted(set(right_values)))
|
||
if not left_tuple and not right_tuple:
|
||
return 0.55, False
|
||
if not left_tuple or not right_tuple:
|
||
return 0.45, False
|
||
if len(left_tuple) > 1 or len(right_tuple) > 1:
|
||
if len(left_tuple) != len(right_tuple):
|
||
return 0.0, True
|
||
unmatched = list(right_tuple)
|
||
for left_value in left_tuple:
|
||
match_index = next(
|
||
(
|
||
index
|
||
for index, right_value in enumerate(unmatched)
|
||
if _close_number(left_value, right_value)
|
||
),
|
||
None,
|
||
)
|
||
if match_index is None:
|
||
return 0.0, True
|
||
unmatched.pop(match_index)
|
||
return 1.0, False
|
||
for left_value in left_tuple:
|
||
if any(_close_number(left_value, right_value) for right_value in right_tuple):
|
||
return 1.0, False
|
||
return 0.0, True
|
||
|
||
|
||
def _has_hard_count_unit_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
|
||
if not left.counts or not right.counts:
|
||
return False
|
||
left_by_count: dict[int, set[str]] = {}
|
||
right_by_count: dict[int, set[str]] = {}
|
||
for count, unit in left.counts:
|
||
left_by_count.setdefault(count, set()).add(_count_unit_family(unit))
|
||
for count, unit in right.counts:
|
||
right_by_count.setdefault(count, set()).add(_count_unit_family(unit))
|
||
|
||
for count in set(left_by_count) & set(right_by_count):
|
||
left_units = left_by_count[count]
|
||
right_units = right_by_count[count]
|
||
if left_units & right_units:
|
||
continue
|
||
if (
|
||
(left_units & PIECE_UNITS and right_units & CONTAINER_UNITS)
|
||
or (right_units & PIECE_UNITS and left_units & CONTAINER_UNITS)
|
||
):
|
||
return True
|
||
return False
|
||
|
||
|
||
def _allow_catalog_count_omission(left: ProductIdentity, right: ProductIdentity) -> bool:
|
||
"""Allow catalog-side piece counts for Dashing Diva nail lines when MOMO omits pack count."""
|
||
left_has_counts = bool(left.counts)
|
||
right_has_counts = bool(right.counts)
|
||
if left_has_counts == right_has_counts:
|
||
return False
|
||
|
||
shared_brand_tokens = {token.lower() for token in left.brand_tokens} & {
|
||
token.lower() for token in right.brand_tokens
|
||
}
|
||
if not ({"dashing", "diva"} <= shared_brand_tokens):
|
||
return False
|
||
|
||
searchable_pair = f"{left.searchable_name} {right.searchable_name}"
|
||
if "美甲片" not in searchable_pair:
|
||
return False
|
||
|
||
counted = left if left_has_counts else right
|
||
omitted = right if left_has_counts else left
|
||
if omitted.counts:
|
||
return False
|
||
if (counted.total_piece_count or 0) < 20:
|
||
return False
|
||
|
||
return any(
|
||
anchor in searchable_pair
|
||
for anchor in ("時尚潮流美甲片", "頂級璀燦美甲片", "薄型經典美甲片")
|
||
)
|
||
|
||
|
||
def _count_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, bool]:
|
||
left_counts = [count for count, _unit in left.counts]
|
||
right_counts = [count for count, _unit in right.counts]
|
||
left_by_unit: dict[str, set[int]] = {}
|
||
right_by_unit: dict[str, set[int]] = {}
|
||
for count, unit in left.counts:
|
||
left_by_unit.setdefault(_count_unit_family(unit), set()).add(count)
|
||
for count, unit in right.counts:
|
||
right_by_unit.setdefault(_count_unit_family(unit), set()).add(count)
|
||
|
||
if left.total_piece_count and right.total_piece_count:
|
||
if left.total_piece_count == right.total_piece_count:
|
||
return 1.0, False
|
||
ratio = max(left.total_piece_count, right.total_piece_count) / max(min(left.total_piece_count, right.total_piece_count), 1)
|
||
return (0.0, True) if ratio >= 1.5 else (0.45, False)
|
||
|
||
for unit in set(left_by_unit) & set(right_by_unit):
|
||
if left_by_unit[unit] != right_by_unit[unit]:
|
||
return 0.0, True
|
||
if left.counts and right.counts:
|
||
if set(left.counts) & set(right.counts):
|
||
return 0.85, False
|
||
if _has_hard_count_unit_conflict(left, right):
|
||
return 0.0, True
|
||
if left_counts and right_counts:
|
||
ratio = max(max(left_counts), max(right_counts)) / max(min(max(left_counts), max(right_counts)), 1)
|
||
if ratio >= 1.5:
|
||
return 0.0, True
|
||
return 0.35, False
|
||
if _allow_catalog_count_omission(left, right):
|
||
return 0.55, False
|
||
if (left_counts and max(left_counts) > 1) or (right_counts and max(right_counts) > 1):
|
||
return 0.0, True
|
||
return 0.5, False
|
||
|
||
|
||
def _has_exact_count_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
|
||
if not left.counts or not right.counts:
|
||
return False
|
||
left_counts = sorted(count for count, _ in left.counts)
|
||
right_counts = sorted(count for count, _ in right.counts)
|
||
return left_counts == right_counts
|
||
|
||
|
||
def _has_pack_quantity_difference(left: ProductIdentity, right: ProductIdentity) -> bool:
|
||
if not left.counts or not right.counts or _has_exact_count_alignment(left, right):
|
||
return False
|
||
|
||
if left.total_piece_count and right.total_piece_count:
|
||
return left.total_piece_count != right.total_piece_count
|
||
|
||
left_by_unit: dict[str, set[int]] = {}
|
||
right_by_unit: dict[str, set[int]] = {}
|
||
for count, unit in left.counts:
|
||
family = _count_unit_family(unit)
|
||
if family in COUNT_UNITS or unit in COUNT_UNITS:
|
||
left_by_unit.setdefault(family, set()).add(count)
|
||
for count, unit in right.counts:
|
||
family = _count_unit_family(unit)
|
||
if family in COUNT_UNITS or unit in COUNT_UNITS:
|
||
right_by_unit.setdefault(family, set()).add(count)
|
||
|
||
for unit in set(left_by_unit) & set(right_by_unit):
|
||
if left_by_unit[unit] != right_by_unit[unit]:
|
||
return True
|
||
return False
|
||
|
||
|
||
# Groups of component names that can appear together in one bundle listing.
# _has_named_component_quantity_conflict() checks each group for components
# that both listings name but in different per-component quantities.
NAMED_COMPONENT_QUANTITY_GROUPS = (
    ("嬰兒沐浴精", "嬰幼童洗髮精"),
    ("魅惑麋香", "湛藍海洋", "花妍巧語", "絲絨玫瑰"),
)
|
||
|
||
|
||
def _named_component_quantity_map(identity: ProductIdentity, terms: Iterable[str]) -> dict[str, int]:
|
||
text = identity.searchable_name
|
||
present_terms = tuple(term for term in terms if term in text)
|
||
if len(present_terms) < 2:
|
||
return {}
|
||
|
||
quantities: dict[str, int] = {}
|
||
for term in present_terms:
|
||
term_index = text.find(term)
|
||
if term_index < 0:
|
||
continue
|
||
suffix = text[term_index + len(term):term_index + len(term) + 28]
|
||
explicit_count = re.search(
|
||
r"(?:\d+(?:\.\d+)?\s*(?:ml|g|mg|毫升|公克|毫克))?\s*(?:x|乘)\s*(\d+)",
|
||
suffix,
|
||
flags=re.I,
|
||
)
|
||
if explicit_count:
|
||
quantities[term] = int(explicit_count.group(1))
|
||
|
||
if len(quantities) == len(present_terms):
|
||
return quantities
|
||
|
||
pack_counts = [
|
||
count
|
||
for count, unit in identity.counts
|
||
if _count_unit_family(unit) in COUNT_UNITS or unit in COUNT_UNITS
|
||
]
|
||
if not quantities and pack_counts and max(pack_counts) == len(present_terms) and re.search(r"[++//、]", text):
|
||
return {term: 1 for term in present_terms}
|
||
return {}
|
||
|
||
|
||
def _has_named_component_quantity_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect bundles that name the same components in different quantities.

    Two listings with the same named components but swapped or otherwise
    different per-component quantities must not be treated as the same priced
    item. A group is compared only when both sides yield quantities for at
    least two shared components.
    """
    for terms in NAMED_COMPONENT_QUANTITY_GROUPS:
        left_quantities = _named_component_quantity_map(left, terms)
        right_quantities = _named_component_quantity_map(right, terms)
        shared_terms = left_quantities.keys() & right_quantities.keys()
        if len(shared_terms) >= 2 and any(
            left_quantities[term] != right_quantities[term] for term in shared_terms
        ):
            return True
    return False
|
||
|
||
|
||
def _spec_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, bool, tuple[str, ...]]:
    """Combine volume, weight, dosage and count comparisons.

    Returns ``(score, has_conflict, conflict_names)``. The score is the mean
    of the component scores for every spec kind that at least one side
    states; with no spec on either side the result is ``(0.55, False, ())``.
    Conflict names are listed in volume, weight, dosage, count order.
    """
    components = (
        ("volume_conflict", _spec_component(left.volumes_ml, right.volumes_ml), left.volumes_ml or right.volumes_ml),
        ("weight_conflict", _spec_component(left.weights_g, right.weights_g), left.weights_g or right.weights_g),
        ("dosage_conflict", _spec_component(left.dosages_mg, right.dosages_mg), left.dosages_mg or right.dosages_mg),
        ("count_conflict", _count_score(left, right), left.counts or right.counts),
    )

    scores = [result[0] for _name, result, stated in components if stated]
    if not scores:
        return 0.55, False, ()

    conflicts = tuple(name for name, result, _stated in components if result[1])
    return sum(scores) / len(scores), bool(conflicts), conflicts
|
||
|
||
|
||
def _has_bundle_offer(identity: ProductIdentity) -> bool:
|
||
text = identity.normalized_name
|
||
return bool(
|
||
re.search(r"買\s*\d+\s*送\s*\d+", text)
|
||
or re.search(r"買\s*[一二兩雙三四五六七八九十]\s*送\s*[一二兩雙三四五六七八九十]", text)
|
||
or "買一送一" in text
|
||
or any(phrase in text for phrase in BUNDLE_OFFER_PHRASES)
|
||
)
|
||
|
||
|
||
def _has_multi_component(identity: ProductIdentity) -> bool:
    """Return True when the name looks like a set of several components.

    The name is first passed through ``_component_separator_text``. A set is
    then recognized by a remaining plus sign (ASCII or fullwidth) or by a
    ``<amount><unit> x <n>`` repetition such as ``15g x2``.
    """
    text = _component_separator_text(identity)
    # The fullwidth plus is written as an escape so it stays distinct from
    # "+"; otherwise both membership tests would check the same character.
    if "+" in text or "\uff0b" in text:
        return True
    return bool(re.search(r"\d+\s*(?:ml|g|mg|毫升|公克|毫克)\s*x\s*\d+", text, re.I))
|
||
|
||
|
||
def _component_separator_text(identity: ProductIdentity) -> str:
|
||
text = identity.normalized_name
|
||
text = re.sub(r"spf\s*(\d+)\s*[++]+", r"spf\1", text, flags=re.I)
|
||
text = re.sub(r"pa\s*[++]{1,5}", "pa", text, flags=re.I)
|
||
text = re.sub(
|
||
r"(\d+(?:\.\d+)?\s*(?:ml|g|mg|毫升|公克|毫克))\s*[++]\s*"
|
||
r"(\d+(?:\.\d+)?\s*(?:ml|g|mg|毫升|公克|毫克))",
|
||
r"\1 \2",
|
||
text,
|
||
flags=re.I,
|
||
)
|
||
text = re.sub(
|
||
r"\b[a-z]{1,6}\d{1,6}\s*[++]\s*[a-z]{1,6}\d{1,6}\b",
|
||
lambda match: re.sub(r"[++]", " ", match.group(0)),
|
||
text,
|
||
flags=re.I,
|
||
)
|
||
return text
|
||
|
||
|
||
def _multi_component_count(identity: ProductIdentity) -> int:
    """Count the components separated by plus signs in the product name.

    The name is first cleaned by ``_component_separator_text``. Segments that
    are empty or consist only of whitespace, digits and hyphens are ignored.
    Returns 1 when there is no plus sign or fewer than two real segments.
    """
    text = _component_separator_text(identity)
    # Fullwidth plus (U+FF0B) is written as an escape so it stays distinct
    # from the ASCII plus in both the membership test and the split class.
    if "+" not in text and "\uff0b" not in text:
        return 1
    segments = (segment.strip() for segment in re.split(r"[+\uff0b]", text))
    parts = [
        segment
        for segment in segments
        if segment and not re.fullmatch(r"[\s\d-]+", segment)
    ]
    return len(parts) if len(parts) > 1 else 1
|
||
|
||
|
||
def _repeated_single_spec_count(identity: ProductIdentity) -> Optional[int]:
    """Return N for a name containing exactly one ``<amount><unit> x N`` pattern.

    Returns None when the pattern is absent, occurs more than once, or N is
    not greater than 1.
    """
    cleaned = _component_separator_text(identity)
    repeats = re.findall(
        r"\d+(?:\.\d+)?\s*(?:ml|g|mg|毫升|公克|毫克)\s*x\s*(\d+)",
        cleaned,
        flags=re.I,
    )
    if len(repeats) != 1:
        return None
    try:
        repeat_count = int(repeats[0])
    except (TypeError, ValueError):
        return None
    if repeat_count > 1:
        return repeat_count
    return None
|
||
|
||
|
||
def _refill_piece_count(identity: ProductIdentity) -> Optional[int]:
|
||
refill_counts = [
|
||
count
|
||
for count, unit in identity.counts
|
||
if _count_unit_family(unit) == "refill"
|
||
]
|
||
return max(refill_counts) if refill_counts else None
|
||
|
||
|
||
def _has_cushion_refill_pack_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
|
||
"""Align cushion compact refill language such as `一盒兩蕊` with `15g x2`."""
|
||
if left.product_type != "氣墊粉餅" or right.product_type != "氣墊粉餅":
|
||
return False
|
||
|
||
def aligned(refill_side: ProductIdentity, spec_side: ProductIdentity) -> bool:
|
||
refill_count = _refill_piece_count(refill_side)
|
||
spec_count = _repeated_single_spec_count(spec_side)
|
||
if not refill_count or not spec_count or refill_count != spec_count:
|
||
return False
|
||
box_counts = [
|
||
count
|
||
for count, unit in refill_side.counts
|
||
if unit in {"盒", "組", "入"} and count > 1
|
||
]
|
||
return not box_counts
|
||
|
||
return aligned(left, right) or aligned(right, left)
|
||
|
||
|
||
def _has_paulas_choice_body_lotion_210ml_2pack_alignment(
|
||
left: ProductIdentity,
|
||
right: ProductIdentity,
|
||
) -> bool:
|
||
"""Align PChome Nick `210ml x2` with MOMO `210ml二入` for the same body lotion."""
|
||
brand_tokens = left.brand_tokens | right.brand_tokens
|
||
if not ({"寶拉珍選", "paulas", "choice"} & brand_tokens):
|
||
return False
|
||
if not _has_shared_volume(left, right, 210):
|
||
return False
|
||
if not all("水楊酸" in item.searchable_name and "身體乳" in item.searchable_name for item in (left, right)):
|
||
return False
|
||
|
||
def has_two_pack(identity: ProductIdentity) -> bool:
|
||
text = identity.searchable_name
|
||
return bool(
|
||
re.search(r"(?:x\s*2|2\s*入|二\s*入|兩\s*入|雙\s*入|雙入組|二入組|兩入組)", text, re.I)
|
||
or (2, "入") in identity.counts
|
||
)
|
||
|
||
return has_two_pack(left) and has_two_pack(right)
|
||
|
||
|
||
def _has_nivea_creme_100ml_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
|
||
brand_tokens = left.brand_tokens | right.brand_tokens
|
||
if not ({"nivea", "妮維雅"} & brand_tokens):
|
||
return False
|
||
if not _has_shared_volume(left, right, 100):
|
||
return False
|
||
return all("妮維雅霜" in item.searchable_name and "隨身版" in item.searchable_name for item in (left, right))
|
||
|
||
|
||
def _has_cetaphil_moisturizer_type_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
|
||
"""Treat Cetaphil moisturizer wording variants as the same type only on exact named lines."""
|
||
if not ({"cetaphil", "舒特膚"} & (left.brand_tokens & right.brand_tokens)):
|
||
return False
|
||
if {left.product_type, right.product_type} != {"乳液", "面霜"}:
|
||
return False
|
||
|
||
left_text = left.searchable_name
|
||
right_text = right.searchable_name
|
||
if all("長效潤膚霜" in item for item in (left_text, right_text)):
|
||
return _has_shared_weight(left, right, 250)
|
||
if all("益膚康修護舒敏乳霜" in item for item in (left_text, right_text)):
|
||
return _has_shared_weight(left, right, 227)
|
||
return False
|
||
|
||
|
||
def _has_refill_pack(identity: ProductIdentity) -> bool:
|
||
text = identity.normalized_name
|
||
return bool(
|
||
"補充瓶" in text
|
||
or "補充包" in text
|
||
or "補充芯" in text
|
||
or "補充蕊" in text
|
||
or "替換蕊" in text
|
||
or "替換芯" in text
|
||
or "refill" in text
|
||
)
|
||
|
||
|
||
def _has_accessory_case(identity: ProductIdentity) -> bool:
|
||
text = identity.normalized_name
|
||
return bool(
|
||
"眉彩餅盒" in text
|
||
or "盒一入款" in text
|
||
or "盒三入款" in text
|
||
or "盒單入" in text
|
||
or "空盒" in text
|
||
)
|
||
|
||
|
||
def _spec_mention_count(identity: ProductIdentity) -> int:
|
||
return len(
|
||
re.findall(
|
||
r"\d+(?:\.\d+)?\s*(?:ml|毫升|l|g|公克|kg|mg|毫克|mcg|μg|ug|微克)",
|
||
identity.normalized_name,
|
||
re.I,
|
||
)
|
||
)
|
||
|
||
|
||
def _count_text_value(value: str) -> Optional[int]:
|
||
if value.isdigit():
|
||
return int(value)
|
||
return CHINESE_COUNT.get(value)
|
||
|
||
|
||
def _pack_multiplier(identity: ProductIdentity) -> int:
|
||
text = identity.normalized_name
|
||
buy_get = re.search(r"買\s*(\d+|[一二兩雙三四五六七八九十])\s*送\s*(\d+|[一二兩雙三四五六七八九十])", text)
|
||
if buy_get:
|
||
left = _count_text_value(buy_get.group(1)) or 0
|
||
right = _count_text_value(buy_get.group(2)) or 0
|
||
if left + right > 1:
|
||
return left + right
|
||
if "買一送一" in text or "買1送1" in text:
|
||
return 2
|
||
piece_pack = re.search(r"(\d+|[一二兩雙三四五六七八九十])\s*件\s*組", text)
|
||
if piece_pack:
|
||
count = _count_text_value(piece_pack.group(1)) or 0
|
||
if count > 1:
|
||
return count
|
||
|
||
multipliers = [count for count, unit in identity.counts if unit in COUNT_UNITS and count > 1]
|
||
if multipliers:
|
||
return max(multipliers)
|
||
return 1
|
||
|
||
|
||
def _has_overlapping_base_spec(left: ProductIdentity, right: ProductIdentity) -> bool:
|
||
left_volumes = tuple(sorted(set(left.volumes_ml)))
|
||
right_volumes = tuple(sorted(set(right.volumes_ml)))
|
||
if left_volumes or right_volumes:
|
||
if not left_volumes or not right_volumes:
|
||
return False
|
||
if len(left_volumes) > 1 or len(right_volumes) > 1:
|
||
return False
|
||
return _close_number(left_volumes[0], right_volumes[0])
|
||
|
||
left_weights = tuple(sorted(set(left.weights_g)))
|
||
right_weights = tuple(sorted(set(right.weights_g)))
|
||
if left_weights or right_weights:
|
||
if not left_weights or not right_weights:
|
||
return False
|
||
if len(left_weights) > 1 or len(right_weights) > 1:
|
||
return False
|
||
return _close_number(left_weights[0], right_weights[0])
|
||
|
||
return False
|
||
|
||
|
||
def _single_unit_total(identity: ProductIdentity) -> tuple[Optional[str], Optional[float], str]:
|
||
volumes = tuple(sorted(set(identity.volumes_ml)))
|
||
weights = tuple(sorted(set(identity.weights_g)))
|
||
if volumes and weights:
|
||
return None, None, "mixed_volume_weight"
|
||
if len(volumes) > 1 or len(weights) > 1:
|
||
return None, None, "multi_spec_component"
|
||
if volumes:
|
||
return "ml", volumes[0] * _pack_multiplier(identity), "ok"
|
||
if weights:
|
||
multiplier = identity.total_piece_count or _pack_multiplier(identity)
|
||
return "g", weights[0] * multiplier, "ok"
|
||
if identity.total_piece_count:
|
||
return "入", float(identity.total_piece_count), "ok"
|
||
return None, None, "missing_single_unit"
|
||
|
||
|
||
def build_unit_price_comparison(
    momo_name: str,
    competitor_name: str,
    momo_price: Optional[float],
    competitor_price: Optional[float],
) -> dict:
    """Build deterministic unit-price evidence for unit-comparable candidates.

    The pair is first scored with ``score_marketplace_match``. Unless its
    comparison mode is ``"unit_comparable"``, a non-comparable result carrying
    that mode as the reason is returned. Otherwise both names are parsed, a
    single-unit total is derived for each side, and the per-unit prices and
    their gap are reported.

    Returns:
        The ``as_dict()`` form of a ``UnitPriceComparison``. Non-comparable
        reasons are the comparison mode, ``"<left_reason>:<right_reason>"``,
        ``"unit_mismatch"`` or ``"invalid_price"``.
    """

    def not_comparable(reason: str) -> dict:
        return UnitPriceComparison(False, reason).as_dict()

    diagnostics = score_marketplace_match(
        momo_name,
        competitor_name,
        momo_price=momo_price,
        competitor_price=competitor_price,
    )
    if diagnostics.comparison_mode != "unit_comparable":
        return not_comparable(diagnostics.comparison_mode)

    left = parse_product_identity(momo_name)
    right = parse_product_identity(competitor_name)
    left_unit, left_total, left_reason = _single_unit_total(left)
    right_unit, right_total, right_reason = _single_unit_total(right)
    if not (left_reason == "ok" and right_reason == "ok"):
        return not_comparable(f"{left_reason}:{right_reason}")
    if left_unit != right_unit or not left_total or not right_total:
        return not_comparable("unit_mismatch")

    try:
        momo_price_num = float(momo_price or 0)
        competitor_price_num = float(competitor_price or 0)
    except (TypeError, ValueError):
        return not_comparable("invalid_price")
    if momo_price_num <= 0 or competitor_price_num <= 0:
        return not_comparable("invalid_price")

    momo_unit_price = momo_price_num / left_total
    competitor_unit_price = competitor_price_num / right_total
    unit_gap_amount = momo_unit_price - competitor_unit_price
    if competitor_unit_price:
        unit_gap_pct = unit_gap_amount / competitor_unit_price * 100
    else:
        unit_gap_pct = 0
    summary = (
        f"MOMO ${momo_unit_price:.2f}/{left_unit} vs "
        f"PChome ${competitor_unit_price:.2f}/{left_unit} "
        f"({unit_gap_pct:+.1f}%)"
    )
    comparison = UnitPriceComparison(
        comparable=True,
        reason="unit_comparable",
        unit_label=left_unit,
        momo_total_quantity=round(left_total, 3),
        competitor_total_quantity=round(right_total, 3),
        momo_unit_price=round(momo_unit_price, 4),
        competitor_unit_price=round(competitor_unit_price, 4),
        unit_gap_amount=round(unit_gap_amount, 4),
        unit_gap_pct=round(unit_gap_pct, 2),
        summary=summary,
    )
    return comparison.as_dict()
|
||
|
||
|
||
def _is_unit_comparable_candidate(
|
||
left: ProductIdentity,
|
||
right: ProductIdentity,
|
||
token_score: float,
|
||
chinese_name_score: float,
|
||
brand_conflict: bool,
|
||
type_score: float,
|
||
reasons: Iterable[str],
|
||
) -> bool:
|
||
"""Identify same core product sold in different packs.
|
||
|
||
These are not safe exact matches. They can only enter a normalized unit-price
|
||
review lane, otherwise a bundle price may be incorrectly compared with a
|
||
single-item price.
|
||
"""
|
||
reason_set = set(reasons)
|
||
pack_difference = bool(reason_set & {
|
||
"bundle_offer_conflict",
|
||
"multi_component_conflict",
|
||
"count_conflict",
|
||
"component_count_conflict",
|
||
"pack_quantity_difference",
|
||
})
|
||
if not pack_difference:
|
||
return False
|
||
if brand_conflict or "brand_conflict" in reason_set:
|
||
return False
|
||
if "refill_pack_conflict" in reason_set:
|
||
return False
|
||
if type_score == 0.0 or "type_conflict" in reason_set:
|
||
return False
|
||
if not _has_overlapping_base_spec(left, right):
|
||
return False
|
||
if token_score < 0.45 and chinese_name_score < 0.28:
|
||
return False
|
||
if "product_line_conflict" in reason_set and token_score < 0.72:
|
||
return False
|
||
return True
|
||
|
||
|
||
def _chinese_bigram_score(left: ProductIdentity, right: ProductIdentity) -> float:
|
||
def signature(identity: ProductIdentity) -> set[str]:
|
||
text = identity.searchable_name
|
||
for token in sorted(identity.brand_tokens, key=len, reverse=True):
|
||
text = text.replace(token, " ")
|
||
text = re.sub(r"[a-z0-9]+", " ", text)
|
||
text = "".join(char for char in text if "\u4e00" <= char <= "\u9fff")
|
||
for phrase in (
|
||
"官方", "直營", "公司貨", "專櫃", "正貨", "原廠", "限定", "獨家",
|
||
"期間", "超值", "特惠", "優惠", "新品", "經典", "人氣", "熱銷",
|
||
"必買", "推薦", "任選", "禮盒", "母親節", "超品日", "多款",
|
||
"組", "入", "瓶", "盒", "包", "片", "支", "條",
|
||
):
|
||
text = text.replace(phrase, "")
|
||
return {text[i:i + 2] for i in range(max(0, len(text) - 1))}
|
||
|
||
left_signature = signature(left)
|
||
right_signature = signature(right)
|
||
if not left_signature or not right_signature:
|
||
return 0.55
|
||
return 2 * len(left_signature & right_signature) / (len(left_signature) + len(right_signature))
|
||
|
||
|
||
def _has_strong_product_line_signal(
|
||
left: ProductIdentity,
|
||
right: ProductIdentity,
|
||
token_score: float,
|
||
chinese_name_score: float,
|
||
) -> bool:
|
||
shared_core = (left.core_tokens & right.core_tokens) - left.brand_tokens - right.brand_tokens
|
||
shared_latin_or_model = {
|
||
token for token in shared_core
|
||
if re.fullmatch(r"[a-z][a-z0-9-]{3,}", token)
|
||
or re.fullmatch(r"[a-z]{2,}-?\d+[a-z0-9-]*", token)
|
||
}
|
||
if shared_latin_or_model and token_score >= 0.50:
|
||
return True
|
||
return bool(shared_core) and token_score >= 0.56 and chinese_name_score >= 0.45
|
||
|
||
|
||
def _has_safe_exact_spec_signal(
|
||
left: ProductIdentity,
|
||
right: ProductIdentity,
|
||
token_score: float,
|
||
sequence_score: float,
|
||
type_score: float,
|
||
) -> bool:
|
||
if type_score < 0.55:
|
||
return False
|
||
if _spec_mention_count(left) > 1 or _spec_mention_count(right) > 1:
|
||
return False
|
||
if not _has_overlapping_base_spec(left, right):
|
||
return False
|
||
return token_score >= 0.42 or sequence_score >= 0.50
|
||
|
||
|
||
def _model_line_tokens(identity: ProductIdentity) -> set[str]:
|
||
tokens: set[str] = set()
|
||
for token in identity.core_tokens:
|
||
if token in GENERIC_TOKENS:
|
||
continue
|
||
if _is_spec_like_latin_token(token):
|
||
continue
|
||
if re.fullmatch(r"[a-z][a-z0-9-]{2,}", token):
|
||
tokens.add(token)
|
||
for match in re.finditer(r"([\u4e00-\u9fff]{2,})(?:系列)", token):
|
||
value = match.group(1)
|
||
if value not in GENERIC_TOKENS:
|
||
tokens.add(value)
|
||
return tokens
|
||
|
||
|
||
def _has_model_line_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Return True when both sides have model/line tokens and share none."""
    left_lines = _model_line_tokens(left)
    right_lines = _model_line_tokens(right)
    if left_lines and right_lines:
        return left_lines.isdisjoint(right_lines)
    return False
|
||
|
||
|
||
def _nail_polish_model_codes(identity: ProductIdentity) -> set[str]:
|
||
if not any(term in identity.searchable_name for term in ("指甲油", "指彩", "美甲")):
|
||
return set()
|
||
text = f"{identity.original_name} {identity.searchable_name}".lower()
|
||
codes: set[str] = set()
|
||
for pattern in (
|
||
r"(?<![a-z0-9])a10[\._-]?\d{2,3}(?:[\._-]?\d{2,3})?(?![a-z0-9])",
|
||
r"(?<![a-z0-9])isl[a-z0-9]*\d{2,3}(?![a-z0-9])",
|
||
):
|
||
for match in re.finditer(pattern, text):
|
||
codes.add(re.sub(r"[\._-]+", "", match.group(0)))
|
||
return codes
|
||
|
||
|
||
def _has_nail_polish_model_code_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
|
||
pair_text = f"{left.searchable_name} {right.searchable_name}"
|
||
if not any(term in pair_text for term in ("指甲油", "指彩", "美甲")):
|
||
return False
|
||
left_codes = _nail_polish_model_codes(left)
|
||
right_codes = _nail_polish_model_codes(right)
|
||
return bool(left_codes and right_codes and left_codes.isdisjoint(right_codes))
|
||
|
||
|
||
def _dedupe_tuple(values: Iterable[str]) -> tuple[str, ...]:
|
||
result: list[str] = []
|
||
seen: set[str] = set()
|
||
for value in values:
|
||
if not value or value in seen:
|
||
continue
|
||
seen.add(value)
|
||
result.append(value)
|
||
return tuple(result)
|
||
|
||
|
||
def _build_evidence_flags(
    *,
    brand_score: float,
    token_score: float,
    spec_score: float,
    sequence_score: float,
    type_score: float,
    shared_anchor: str,
    shared_models: set[str],
    reasons: Iterable[str],
    catalog_count_omission: bool,
) -> tuple[str, ...]:
    """Summarize the match evidence as an ordered, de-duplicated tuple of flags.

    Score-based flags come first, in a fixed order, each added when its score
    reaches its threshold or its value is truthy. They are followed by every
    surfaced reason present in ``reasons``, also in a fixed order.
    """
    reason_set = set(reasons)

    signal_checks = (
        (brand_score >= 0.95, "brand"),
        (spec_score >= 0.85, "spec"),
        (token_score >= 0.72, "tokens"),
        (sequence_score >= 0.70, "name_sequence"),
        (type_score >= 0.95, "product_type"),
        (bool(shared_anchor), "identity_anchor"),
        (bool(shared_models), "model_token"),
        (bool(catalog_count_omission), "catalog_count_omission"),
    )
    surfaced_reasons = (
        "unit_comparable",
        "variant_selection_review",
        "variant_option_conflict",
        "variant_descriptor_conflict",
        "pack_quantity_difference",
        "count_conflict",
        "bundle_offer_conflict",
        "multi_component_conflict",
        "multi_component_count_conflict",
        "accessory_case_conflict",
        "refill_pack_conflict",
        "price_ratio_extreme",
        "price_ratio_wide",
    )

    flags = [flag for passed, flag in signal_checks if passed]
    flags.extend(reason for reason in surfaced_reasons if reason in reason_set)
    return _dedupe_tuple(flags)
|
||
|
||
|
||
def _number_values(values: Iterable[float]) -> list[float | int]:
|
||
result: list[float | int] = []
|
||
for value in values or ():
|
||
try:
|
||
number = float(value)
|
||
except (TypeError, ValueError):
|
||
continue
|
||
result.append(int(number) if number.is_integer() else round(number, 3))
|
||
return result
|
||
|
||
|
||
def _count_values(values: Iterable[tuple[int, str]]) -> list[str]:
|
||
return [f"{count}{unit}" for count, unit in sorted(set(values or ()))]
|
||
|
||
|
||
def _identity_spec_payload(identity: ProductIdentity) -> dict[str, object]:
    """Return the identity's parsed specs as a plain dict for evidence output."""
    payload: dict[str, object] = {
        field_name: _number_values(getattr(identity, field_name))
        for field_name in ("volumes_ml", "weights_g", "dosages_mg")
    }
    payload["counts"] = _count_values(identity.counts)
    payload["total_piece_count"] = identity.total_piece_count
    return payload
|
||
|
||
|
||
def _spec_mismatch_payload(left: ProductIdentity, right: ProductIdentity) -> list[dict[str, object]]:
    """List the spec fields on which the MOMO and competitor sides disagree.

    A field is reported when both sides state values with none in common, or
    when only one side states a value; the latter entry gets a label suffix
    and ``needs_review: True``. ``left`` is reported under ``"momo"`` and
    ``right`` under ``"competitor"``.
    """
    comparisons = (
        ("volume_ml", "容量", _number_values(left.volumes_ml), _number_values(right.volumes_ml)),
        ("weight_g", "重量", _number_values(left.weights_g), _number_values(right.weights_g)),
        ("dosage_mg", "劑量", _number_values(left.dosages_mg), _number_values(right.dosages_mg)),
        ("count", "入數/件數", _count_values(left.counts), _count_values(right.counts)),
    )

    mismatches: list[dict[str, object]] = []
    for field_name, label, momo_values, competitor_values in comparisons:
        momo_stated = bool(momo_values)
        competitor_stated = bool(competitor_values)
        if momo_stated and competitor_stated and set(momo_values).isdisjoint(competitor_values):
            entry: dict[str, object] = {
                "field": field_name,
                "label": label,
                "momo": momo_values,
                "competitor": competitor_values,
            }
        elif momo_stated != competitor_stated:
            entry = {
                "field": field_name,
                "label": f"{label}單側缺漏",
                "momo": momo_values,
                "competitor": competitor_values,
                "needs_review": True,
            }
        else:
            continue
        mismatches.append(entry)
    return mismatches
|
||
|
||
|
||
def _identity_evidence_payload(
    left: ProductIdentity,
    right: ProductIdentity,
    *,
    brand_score: float,
    token_score: float,
    spec_score: float,
    sequence_score: float,
    type_score: float,
    hard_veto: bool,
    comparison_mode: str,
    match_type: str,
    price_basis: str,
    alert_tier: str,
    shared_anchor: str,
    shared_models: set[str],
    reasons: Iterable[str],
    catalog_count_omission: bool,
) -> dict[str, object]:
    """Assemble the ``identity_evidence_v1`` payload describing a match decision.

    ``left`` is reported as the MOMO side and ``right`` as the competitor
    side. Scores are rounded to three decimals, at most 20 shared non-brand
    core tokens are listed, and guardrail reasons are those containing
    ``"conflict"`` plus a fixed set of review reasons, sorted alphabetically.
    """
    review_reasons = {
        "variant_selection_review",
        "catalog_count_omission",
        "pack_quantity_difference",
        "unit_comparable",
    }
    guardrail_reasons = sorted(
        reason
        for reason in set(reasons or ())
        if "conflict" in reason or reason in review_reasons
    )

    shared_brand = sorted(left.brand_tokens & right.brand_tokens)
    shared_core = sorted((left.core_tokens & right.core_tokens) - left.brand_tokens - right.brand_tokens)[:20]
    same_type = bool(left.product_type and right.product_type and left.product_type == right.product_type)

    lane = {
        "comparison_mode": comparison_mode,
        "match_type": match_type,
        "price_basis": price_basis,
        "alert_tier": alert_tier,
    }
    confidence_components = {
        "brand_score": round(brand_score, 3),
        "token_score": round(token_score, 3),
        "spec_score": round(spec_score, 3),
        "sequence_score": round(sequence_score, 3),
        "type_score": round(type_score, 3),
    }
    brand = {
        "momo": sorted(left.brand_tokens),
        "competitor": sorted(right.brand_tokens),
        "shared": shared_brand,
    }
    product_type = {
        "momo": left.product_type or "",
        "competitor": right.product_type or "",
        "matched": same_type,
    }
    specs = {
        "momo": _identity_spec_payload(left),
        "competitor": _identity_spec_payload(right),
        "mismatches": _spec_mismatch_payload(left, right),
    }
    variant_guardrails = {
        "hard_veto": bool(hard_veto),
        "conflict_reasons": guardrail_reasons,
        "catalog_count_omission": bool(catalog_count_omission),
    }

    return {
        "version": "identity_evidence_v1",
        "lane": lane,
        "confidence_components": confidence_components,
        "brand": brand,
        "product_type": product_type,
        "identity_anchor": shared_anchor or "",
        "shared_model_tokens": sorted(shared_models),
        "shared_core_tokens": shared_core,
        "specs": specs,
        "variant_guardrails": variant_guardrails,
    }
|
||
|
||
|
||
def _offer_evidence_payload(
|
||
momo_price: Optional[float],
|
||
competitor_price: Optional[float],
|
||
*,
|
||
price_penalty: float,
|
||
price_basis: str,
|
||
alert_tier: str,
|
||
) -> dict[str, object]:
|
||
payload: dict[str, object] = {
|
||
"version": "offer_evidence_v1",
|
||
"price_basis": price_basis,
|
||
"alert_tier": alert_tier,
|
||
"price_is_identity_evidence": False,
|
||
"price_penalty": round(price_penalty, 3),
|
||
}
|
||
try:
|
||
momo_value = float(momo_price) if momo_price is not None else None
|
||
competitor_value = float(competitor_price) if competitor_price is not None else None
|
||
except (TypeError, ValueError):
|
||
momo_value = None
|
||
competitor_value = None
|
||
if momo_value is not None:
|
||
payload["momo_price"] = round(momo_value, 2)
|
||
if competitor_value is not None:
|
||
payload["competitor_price"] = round(competitor_value, 2)
|
||
if momo_value is not None and competitor_value and competitor_value > 0:
|
||
payload["gap_amount"] = round(momo_value - competitor_value, 2)
|
||
payload["gap_pct"] = round((momo_value - competitor_value) / max(competitor_value, 1) * 100, 2)
|
||
return payload
|
||
|
||
|
||
def _has_safe_multi_component_exact_total_price(
|
||
left: ProductIdentity,
|
||
right: ProductIdentity,
|
||
*,
|
||
brand_score: float,
|
||
token_score: float,
|
||
spec_score: float,
|
||
sequence_score: float,
|
||
type_score: float,
|
||
hard_veto: bool,
|
||
variant_descriptor_conflict: bool,
|
||
reasons: Iterable[str],
|
||
) -> bool:
|
||
"""Allow exact total-price writes for same-component sets, not mixed bundles."""
|
||
if hard_veto or variant_descriptor_conflict:
|
||
return False
|
||
if not (_has_multi_component(left) and _has_multi_component(right)):
|
||
return False
|
||
reason_set = set(reasons)
|
||
blocked = {
|
||
"variant_selection_review",
|
||
"variant_option_conflict",
|
||
"variant_descriptor_conflict",
|
||
"pack_quantity_difference",
|
||
"count_conflict",
|
||
"bundle_offer_conflict",
|
||
"multi_component_conflict",
|
||
"multi_component_count_conflict",
|
||
"commercial_condition_gap",
|
||
"refill_pack_conflict",
|
||
"unit_comparable",
|
||
"price_ratio_extreme",
|
||
"price_ratio_wide",
|
||
}
|
||
if reason_set & blocked:
|
||
return False
|
||
if brand_score < 0.95 or spec_score < 0.85:
|
||
return False
|
||
exact_count_alignment = _has_exact_count_alignment(left, right)
|
||
same_base_spec = _has_overlapping_base_spec(left, right)
|
||
if (
|
||
exact_count_alignment
|
||
and type_score >= 0.55
|
||
and token_score >= 0.80
|
||
and sequence_score >= 0.75
|
||
and (
|
||
same_base_spec
|
||
or (
|
||
token_score >= 0.90
|
||
and sequence_score >= 0.90
|
||
and "strong_product_line_match" in reason_set
|
||
)
|
||
)
|
||
):
|
||
return True
|
||
if type_score < 0.95:
|
||
return False
|
||
if exact_count_alignment:
|
||
return token_score >= 0.50 and sequence_score >= 0.50
|
||
return (
|
||
token_score >= 0.62
|
||
and sequence_score >= 0.62
|
||
and bool(reason_set & {"strong_exact_spec_match", "shared_model_token", "spec_name_alignment"})
|
||
)
|
||
|
||
|
||
def _classify_match_quality(
|
||
*,
|
||
score: float,
|
||
brand_score: float,
|
||
token_score: float,
|
||
spec_score: float,
|
||
sequence_score: float,
|
||
type_score: float,
|
||
hard_veto: bool,
|
||
comparison_mode: str,
|
||
reasons: Iterable[str],
|
||
shared_anchor: str,
|
||
shared_models: set[str],
|
||
catalog_count_omission: bool,
|
||
multi_component_pair: bool,
|
||
) -> tuple[str, str, str]:
|
||
"""Map raw matcher scores into operator-facing price comparison lanes."""
|
||
reason_set = set(reasons)
|
||
if comparison_mode == "unit_comparable":
|
||
return "same_product_different_pack", "unit_price", "unit_price_review"
|
||
|
||
if hard_veto or comparison_mode == "not_comparable":
|
||
variant_conflict = bool(reason_set & {"variant_option_conflict", "variant_descriptor_conflict"})
|
||
same_line_signal = bool(shared_anchor and brand_score >= 0.95 and type_score >= 0.55)
|
||
if variant_conflict and same_line_signal:
|
||
return "same_line_variant", "manual_review", "suppress"
|
||
return "no_match", "none", "suppress"
|
||
|
||
direct_spec_evidence = spec_score >= 0.85 or bool(shared_models)
|
||
focused_total_price_safe = "focused_exact_total_price_safe" in reason_set
|
||
strong_identity_evidence = (
|
||
(
|
||
brand_score >= 0.95
|
||
and type_score >= 0.55
|
||
and score >= 0.86
|
||
and (direct_spec_evidence or (shared_anchor and token_score >= 0.62 and sequence_score >= 0.58))
|
||
)
|
||
or (
|
||
focused_total_price_safe
|
||
and type_score >= 0.55
|
||
and score >= 0.86
|
||
)
|
||
)
|
||
if strong_identity_evidence and not catalog_count_omission:
|
||
if focused_total_price_safe and "variant_selection_review" not in reason_set:
|
||
return "exact", "total_price", "price_alert_exact"
|
||
safe_multi_component_total_price = "safe_multi_component_exact_total_price" in reason_set
|
||
if "variant_selection_review" in reason_set:
|
||
return "exact", "manual_review", "identity_review"
|
||
if multi_component_pair and not safe_multi_component_total_price:
|
||
return "exact", "manual_review", "identity_review"
|
||
return "exact", "total_price", "price_alert_exact"
|
||
|
||
if score >= 0.76:
|
||
if catalog_count_omission:
|
||
return "same_product_different_pack", "manual_review", "unit_price_review"
|
||
return "comparable", "manual_review", "identity_review"
|
||
|
||
return "no_match", "none", "suppress"
|
||
|
||
|
||
def score_marketplace_match(
|
||
momo_name: str,
|
||
competitor_name: str,
|
||
momo_price: Optional[float] = None,
|
||
competitor_price: Optional[float] = None,
|
||
) -> MatchDiagnostics:
|
||
left = parse_product_identity(momo_name)
|
||
right = parse_product_identity(competitor_name)
|
||
|
||
brand_score, brand_conflict, brand_reason = _brand_score(left, right)
|
||
token_score = _weighted_token_score(left, right)
|
||
spec_score, spec_conflict, spec_reasons = _spec_score(left, right)
|
||
sequence_score = SequenceMatcher(None, left.searchable_name, right.searchable_name).ratio()
|
||
chinese_name_score = _chinese_bigram_score(left, right)
|
||
nivea_creme_100ml_alignment = _has_nivea_creme_100ml_alignment(left, right)
|
||
cetaphil_moisturizer_type_alignment = _has_cetaphil_moisturizer_type_alignment(left, right)
|
||
type_aligned = (
|
||
left.product_type == right.product_type
|
||
or nivea_creme_100ml_alignment
|
||
or cetaphil_moisturizer_type_alignment
|
||
)
|
||
if left.product_type and right.product_type:
|
||
type_score = 1.0 if type_aligned else 0.0
|
||
else:
|
||
type_score = 0.55
|
||
|
||
reasons = []
|
||
if brand_reason:
|
||
reasons.append(brand_reason)
|
||
reasons.extend(spec_reasons)
|
||
if left.product_type and right.product_type and left.product_type != right.product_type and not type_aligned:
|
||
reasons.append("type_conflict")
|
||
if nivea_creme_100ml_alignment:
|
||
reasons.append("nivea_creme_100ml_type_alignment")
|
||
if cetaphil_moisturizer_type_alignment:
|
||
reasons.append("cetaphil_moisturizer_type_alignment")
|
||
model_line_conflict = _has_model_line_conflict(left, right)
|
||
if model_line_conflict:
|
||
reasons.append("model_line_conflict")
|
||
nail_polish_model_code_conflict = _has_nail_polish_model_code_conflict(left, right)
|
||
if nail_polish_model_code_conflict:
|
||
reasons.append("nail_polish_model_code_conflict")
|
||
bundle_offer_conflict = (
|
||
_has_bundle_offer(left) != _has_bundle_offer(right)
|
||
and not (
|
||
left.total_piece_count
|
||
and right.total_piece_count
|
||
and left.total_piece_count == right.total_piece_count
|
||
)
|
||
)
|
||
if bundle_offer_conflict:
|
||
reasons.append("bundle_offer_conflict")
|
||
cushion_refill_pack_alignment = _has_cushion_refill_pack_alignment(left, right)
|
||
paulas_choice_body_lotion_2pack_alignment = _has_paulas_choice_body_lotion_210ml_2pack_alignment(left, right)
|
||
if (
|
||
_has_multi_component(left) != _has_multi_component(right)
|
||
and not cushion_refill_pack_alignment
|
||
and not paulas_choice_body_lotion_2pack_alignment
|
||
):
|
||
reasons.append("multi_component_conflict")
|
||
if cushion_refill_pack_alignment:
|
||
reasons.append("cushion_refill_pack_alignment")
|
||
if paulas_choice_body_lotion_2pack_alignment:
|
||
reasons.append("paulas_choice_body_lotion_210ml_2pack_alignment")
|
||
multi_component_count_conflict = (
|
||
_has_multi_component(left)
|
||
and _has_multi_component(right)
|
||
and _multi_component_count(left) != _multi_component_count(right)
|
||
)
|
||
if multi_component_count_conflict:
|
||
reasons.append("multi_component_count_conflict")
|
||
if _has_refill_pack(left) != _has_refill_pack(right):
|
||
reasons.append("refill_pack_conflict")
|
||
accessory_case_conflict = _has_accessory_case(left) != _has_accessory_case(right)
|
||
if accessory_case_conflict:
|
||
reasons.append("accessory_case_conflict")
|
||
left_spec_mentions = _spec_mention_count(left)
|
||
right_spec_mentions = _spec_mention_count(right)
|
||
if left_spec_mentions and right_spec_mentions and left_spec_mentions != right_spec_mentions:
|
||
reasons.append("component_count_conflict")
|
||
if chinese_name_score < 0.16:
|
||
reasons.append("product_line_conflict")
|
||
shared_anchor = _shared_identity_anchor(left, right)
|
||
catalog_count_omission = _allow_catalog_count_omission(left, right)
|
||
if catalog_count_omission:
|
||
reasons.append("catalog_count_omission")
|
||
if _has_pack_quantity_difference(left, right):
|
||
reasons.append("pack_quantity_difference")
|
||
named_component_quantity_conflict = _has_named_component_quantity_conflict(left, right)
|
||
if named_component_quantity_conflict:
|
||
reasons.append("named_component_quantity_conflict")
|
||
variant_descriptor_conflict = _has_variant_descriptor_conflict(left, right, shared_anchor)
|
||
sun_protection_line_conflict = (
|
||
variant_descriptor_conflict
|
||
and left.product_type == right.product_type == "防曬"
|
||
and not shared_anchor
|
||
)
|
||
if sun_protection_line_conflict:
|
||
reasons.append("variant_descriptor_conflict")
|
||
reasons.append("sun_protection_line_conflict")
|
||
variant_option_conflict = _has_explicit_variant_option_conflict(left, right, shared_anchor)
|
||
if variant_option_conflict:
|
||
reasons.append("variant_option_conflict")
|
||
saugella_variant_conflict = _has_saugella_private_wash_variant_conflict(left, right)
|
||
if saugella_variant_conflict:
|
||
reasons.append("saugella_variant_conflict")
|
||
lactacyd_variant_conflict = _has_lactacyd_private_wash_variant_conflict(left, right)
|
||
if lactacyd_variant_conflict:
|
||
reasons.append("lactacyd_variant_conflict")
|
||
makeup_usage_conflict = _has_makeup_usage_conflict(left, right)
|
||
if makeup_usage_conflict:
|
||
reasons.append("makeup_usage_conflict")
|
||
makeup_finish_conflict = _has_makeup_finish_conflict(left, right)
|
||
if makeup_finish_conflict:
|
||
reasons.append("makeup_finish_conflict")
|
||
sun_protection_spf_conflict = _has_sun_protection_spf_conflict(left, right)
|
||
if sun_protection_spf_conflict:
|
||
reasons.append("spf_value_conflict")
|
||
makeup_spray_line_conflict = _has_makeup_spray_line_conflict(left, right)
|
||
if makeup_spray_line_conflict:
|
||
reasons.append("makeup_spray_line_conflict")
|
||
romand_lip_line_conflict = _has_romand_lip_line_conflict(left, right)
|
||
if romand_lip_line_conflict:
|
||
reasons.append("romand_lip_line_conflict")
|
||
nail_tool_function_conflict = _has_nail_tool_function_conflict(left, right)
|
||
if nail_tool_function_conflict:
|
||
reasons.append("nail_tool_function_conflict")
|
||
schick_razor_line_conflict = _has_schick_razor_line_conflict(left, right)
|
||
if schick_razor_line_conflict:
|
||
reasons.append("schick_razor_line_conflict")
|
||
lancome_line_conflict = _has_lancome_ultra_line_conflict(left, right)
|
||
if lancome_line_conflict:
|
||
reasons.append("lancome_line_conflict")
|
||
dr_hsieh_line_conflict = _has_dr_hsieh_labsmart_serum_line_conflict(left, right)
|
||
if dr_hsieh_line_conflict:
|
||
reasons.append("dr_hsieh_labsmart_line_conflict")
|
||
cotton_swab_variant_conflict = _has_cotton_swab_variant_conflict(left, right)
|
||
if cotton_swab_variant_conflict:
|
||
reasons.append("cotton_swab_variant_conflict")
|
||
kanebo_milano_type_conflict = _has_kanebo_milano_powder_perfume_conflict(left, right)
|
||
if kanebo_milano_type_conflict:
|
||
reasons.append("kanebo_milano_type_conflict")
|
||
hoi_candle_line_conflict = _has_hoi_candle_line_conflict(left, right)
|
||
if hoi_candle_line_conflict:
|
||
reasons.append("hoi_candle_line_conflict")
|
||
aroma_scent_variant_conflict = _has_aroma_scent_variant_conflict(left, right)
|
||
if aroma_scent_variant_conflict:
|
||
reasons.append("aroma_scent_variant_conflict")
|
||
unknown_scent_variant_conflict = _has_unknown_scent_variant_conflict(left, right)
|
||
if unknown_scent_variant_conflict:
|
||
reasons.append("unknown_scent_variant_conflict")
|
||
nail_polish_color_name_conflict = _has_nail_polish_color_name_conflict(left, right)
|
||
if nail_polish_color_name_conflict:
|
||
reasons.append("nail_polish_color_name_conflict")
|
||
ingredient_line_conflict = _has_core_ingredient_line_conflict(left, right)
|
||
if ingredient_line_conflict:
|
||
reasons.append("core_ingredient_line_conflict")
|
||
clarins_body_oil_line_conflict = _has_clarins_body_oil_line_conflict(left, right)
|
||
if clarins_body_oil_line_conflict:
|
||
reasons.append("clarins_body_oil_line_conflict")
|
||
branded_powder_line_conflict = _has_branded_powder_line_conflict(left, right)
|
||
if branded_powder_line_conflict:
|
||
reasons.append("branded_powder_line_conflict")
|
||
cleanser_lotion_line_conflict = _has_cleanser_lotion_line_conflict(left, right)
|
||
if cleanser_lotion_line_conflict:
|
||
reasons.append("cleanser_lotion_line_conflict")
|
||
selection1990_wax_lamp_design_conflict = _has_selection1990_wax_lamp_design_conflict(left, right)
|
||
if selection1990_wax_lamp_design_conflict:
|
||
reasons.append("selection1990_wax_lamp_design_conflict")
|
||
aroma_lamp_style_selection_gap = _has_aroma_lamp_style_selection_gap(left, right)
|
||
if aroma_lamp_style_selection_gap:
|
||
reasons.append("aroma_lamp_style_selection_gap")
|
||
hooome_wax_lamp_design_gap = _has_hooome_wax_lamp_design_gap(left, right)
|
||
if hooome_wax_lamp_design_gap:
|
||
reasons.append("hooome_wax_lamp_design_gap")
|
||
wax_lamp_size_letter_conflict = _has_wax_lamp_size_letter_conflict(left, right)
|
||
if wax_lamp_size_letter_conflict:
|
||
reasons.append("size_letter_variant_conflict")
|
||
nitori_diffuser_model_conflict = _has_nitori_diffuser_model_conflict(left, right)
|
||
if nitori_diffuser_model_conflict:
|
||
reasons.append("nitori_diffuser_model_conflict")
|
||
commercial_condition_gap = _has_commercial_condition_gap(left, right)
|
||
if commercial_condition_gap:
|
||
reasons.append("commercial_condition_gap")
|
||
relove_private_cleanser_variant_gap = _has_relove_private_cleanser_variant_gap(left, right)
|
||
if relove_private_cleanser_variant_gap:
|
||
reasons.append("relove_private_cleanser_variant_gap")
|
||
candle_catalog_selection_gap = _has_candle_catalog_selection_gap(left, right)
|
||
if candle_catalog_selection_gap:
|
||
reasons.append("candle_catalog_selection_gap")
|
||
bath_additive_variant_gap = _has_bath_additive_variant_gap(left, right)
|
||
if bath_additive_variant_gap:
|
||
reasons.append("bath_additive_variant_gap")
|
||
makeup_catalog_selection_gap = _has_makeup_catalog_selection_gap(left, right)
|
||
if makeup_catalog_selection_gap:
|
||
reasons.append("makeup_catalog_selection_gap")
|
||
loreal_serum_variant_gap = _has_loreal_serum_variant_gap(left, right)
|
||
if loreal_serum_variant_gap:
|
||
reasons.append("loreal_serum_variant_gap")
|
||
sebamed_shampoo_variant_catalog_gap = _has_sebamed_shampoo_variant_catalog_gap(left, right)
|
||
if sebamed_shampoo_variant_catalog_gap:
|
||
reasons.append("sebamed_shampoo_variant_catalog_gap")
|
||
schick_2in1_model_gap = _has_schick_2in1_model_gap(left, right)
|
||
if schick_2in1_model_gap:
|
||
reasons.append("schick_2in1_model_gap")
|
||
taicend_protection_form_gap = _has_taicend_protection_form_gap(left, right)
|
||
if taicend_protection_form_gap:
|
||
reasons.append("taicend_protection_form_gap")
|
||
variant_selection_review = (
|
||
_has_named_variant_selection_review(left, right, shared_anchor)
|
||
or commercial_condition_gap
|
||
or relove_private_cleanser_variant_gap
|
||
or candle_catalog_selection_gap
|
||
or bath_additive_variant_gap
|
||
or aroma_lamp_style_selection_gap
|
||
or hooome_wax_lamp_design_gap
|
||
or makeup_catalog_selection_gap
|
||
or loreal_serum_variant_gap
|
||
or sebamed_shampoo_variant_catalog_gap
|
||
or schick_2in1_model_gap
|
||
or taicend_protection_form_gap
|
||
)
|
||
if variant_selection_review:
|
||
reasons.append("variant_selection_review")
|
||
|
||
hard_veto = brand_conflict or spec_conflict
|
||
if bundle_offer_conflict:
|
||
hard_veto = True
|
||
if (
|
||
_has_multi_component(left) != _has_multi_component(right)
|
||
and not cushion_refill_pack_alignment
|
||
and not paulas_choice_body_lotion_2pack_alignment
|
||
):
|
||
hard_veto = True
|
||
if multi_component_count_conflict:
|
||
hard_veto = True
|
||
if named_component_quantity_conflict:
|
||
hard_veto = True
|
||
if _has_refill_pack(left) != _has_refill_pack(right):
|
||
hard_veto = True
|
||
if accessory_case_conflict:
|
||
hard_veto = True
|
||
if model_line_conflict:
|
||
hard_veto = True
|
||
if nail_polish_model_code_conflict:
|
||
hard_veto = True
|
||
if left_spec_mentions and right_spec_mentions and left_spec_mentions != right_spec_mentions:
|
||
hard_veto = True
|
||
if chinese_name_score < 0.16 and token_score < 0.72:
|
||
hard_veto = True
|
||
if left.product_type and right.product_type and left.product_type != right.product_type and not type_aligned:
|
||
hard_veto = True
|
||
if sun_protection_line_conflict:
|
||
hard_veto = True
|
||
if variant_option_conflict:
|
||
hard_veto = True
|
||
if saugella_variant_conflict:
|
||
hard_veto = True
|
||
if lactacyd_variant_conflict:
|
||
hard_veto = True
|
||
if makeup_usage_conflict:
|
||
hard_veto = True
|
||
if makeup_finish_conflict:
|
||
hard_veto = True
|
||
if sun_protection_spf_conflict:
|
||
hard_veto = True
|
||
if makeup_spray_line_conflict:
|
||
hard_veto = True
|
||
if romand_lip_line_conflict:
|
||
hard_veto = True
|
||
if nail_tool_function_conflict:
|
||
hard_veto = True
|
||
if schick_razor_line_conflict:
|
||
hard_veto = True
|
||
if lancome_line_conflict:
|
||
hard_veto = True
|
||
if dr_hsieh_line_conflict:
|
||
hard_veto = True
|
||
if cotton_swab_variant_conflict:
|
||
hard_veto = True
|
||
if kanebo_milano_type_conflict:
|
||
hard_veto = True
|
||
if hoi_candle_line_conflict:
|
||
hard_veto = True
|
||
if aroma_scent_variant_conflict:
|
||
hard_veto = True
|
||
if unknown_scent_variant_conflict:
|
||
hard_veto = True
|
||
if nail_polish_color_name_conflict:
|
||
hard_veto = True
|
||
if ingredient_line_conflict:
|
||
hard_veto = True
|
||
if clarins_body_oil_line_conflict:
|
||
hard_veto = True
|
||
if branded_powder_line_conflict:
|
||
hard_veto = True
|
||
if cleanser_lotion_line_conflict:
|
||
hard_veto = True
|
||
if selection1990_wax_lamp_design_conflict:
|
||
hard_veto = True
|
||
if wax_lamp_size_letter_conflict:
|
||
hard_veto = True
|
||
if nitori_diffuser_model_conflict:
|
||
hard_veto = True
|
||
|
||
focused_exact_line_reason = _has_focused_low_score_exact_identity_line(left, right)
|
||
if focused_exact_line_reason in FOCUSED_IDENTITY_REVIEW_ONLY_REASONS:
|
||
reasons.append("variant_selection_review")
|
||
if (
|
||
focused_exact_line_reason in FOCUSED_IDENTITY_VARIANT_REVIEW_BYPASS_REASONS
|
||
and not commercial_condition_gap
|
||
):
|
||
reasons = [reason for reason in reasons if reason != "variant_selection_review"]
|
||
variant_selection_review = False
|
||
focused_exact_price_safe = (
|
||
focused_exact_line_reason
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and spec_score >= 0.45
|
||
and token_score >= 0.30
|
||
and sequence_score >= 0.40
|
||
and not variant_descriptor_conflict
|
||
)
|
||
focused_exact_review_boost_safe = (
|
||
focused_exact_line_reason
|
||
and not hard_veto
|
||
and spec_score >= 0.45
|
||
and token_score >= 0.30
|
||
and sequence_score >= 0.40
|
||
and not variant_descriptor_conflict
|
||
and (
|
||
brand_score >= 0.95
|
||
or (
|
||
focused_exact_line_reason in FOCUSED_IDENTITY_BRANDLESS_REVIEW_REASONS
|
||
and brand_score == 0.55
|
||
and bool(left.brand_tokens) != bool(right.brand_tokens)
|
||
and spec_score >= 0.85
|
||
and token_score >= 0.55
|
||
and sequence_score >= 0.50
|
||
)
|
||
)
|
||
)
|
||
focused_total_price_brand_safe = (
|
||
brand_score >= 0.95
|
||
or (
|
||
focused_exact_line_reason in FOCUSED_IDENTITY_BRANDLESS_TOTAL_PRICE_REASONS
|
||
and brand_score == 0.55
|
||
and bool(left.brand_tokens) != bool(right.brand_tokens)
|
||
and spec_score >= 0.85
|
||
and token_score >= 0.70
|
||
and sequence_score >= 0.55
|
||
)
|
||
)
|
||
focused_exact_total_price_safe = (
|
||
focused_exact_line_reason in FOCUSED_IDENTITY_TOTAL_PRICE_REASONS
|
||
and focused_total_price_brand_safe
|
||
and not hard_veto
|
||
and spec_score >= 0.45
|
||
and token_score >= 0.30
|
||
and sequence_score >= 0.40
|
||
and (
|
||
not variant_descriptor_conflict
|
||
or focused_exact_line_reason == "hanamisui_inclear_private_gel_1_7g_3pack"
|
||
)
|
||
and "variant_selection_review" not in reasons
|
||
and "commercial_condition_gap" not in reasons
|
||
)
|
||
if focused_exact_total_price_safe:
|
||
reasons.append("focused_exact_total_price_safe")
|
||
reasons.append(f"focused_exact_identity_{focused_exact_line_reason}")
|
||
|
||
comparison_mode = "exact_identity"
|
||
if _is_unit_comparable_candidate(
|
||
left,
|
||
right,
|
||
token_score,
|
||
chinese_name_score,
|
||
brand_conflict,
|
||
type_score,
|
||
reasons,
|
||
):
|
||
comparison_mode = "unit_comparable"
|
||
reasons.append("unit_comparable")
|
||
elif hard_veto:
|
||
comparison_mode = "not_comparable"
|
||
|
||
price_penalty = 0.0
|
||
try:
|
||
if momo_price and competitor_price:
|
||
ratio = float(competitor_price) / max(float(momo_price), 1.0)
|
||
lip_care_exact_identity = (
|
||
shared_anchor
|
||
and "唇膏" in shared_anchor
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and spec_score >= 0.99
|
||
and token_score >= 0.50
|
||
and sequence_score >= 0.50
|
||
and not variant_descriptor_conflict
|
||
)
|
||
allow_price_penalty_suppression = (
|
||
shared_anchor
|
||
and len(shared_anchor.replace(" ", "")) >= 7
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and type_score >= 0.55
|
||
and spec_score >= 0.99
|
||
and token_score >= 0.68
|
||
and sequence_score >= 0.72
|
||
)
|
||
allow_wide_price_penalty_suppression = (
|
||
(
|
||
shared_anchor
|
||
and len(shared_anchor.replace(" ", "")) >= 5
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and type_score >= 0.55
|
||
and spec_score >= 0.99
|
||
and token_score >= 0.50
|
||
and (sequence_score >= 0.55 or lip_care_exact_identity)
|
||
)
|
||
or focused_exact_price_safe
|
||
)
|
||
if (ratio < 0.3 or ratio > 3.2) and token_score < 0.78:
|
||
if allow_price_penalty_suppression:
|
||
reasons.append("price_penalty_suppressed_exact_identity")
|
||
else:
|
||
price_penalty = 0.12
|
||
reasons.append("price_ratio_extreme")
|
||
elif (ratio < 0.48 or ratio > 2.2) and token_score < 0.68:
|
||
if allow_wide_price_penalty_suppression:
|
||
reasons.append("price_penalty_suppressed_wide_exact_identity")
|
||
else:
|
||
price_penalty = 0.06
|
||
reasons.append("price_ratio_wide")
|
||
except (TypeError, ValueError, ZeroDivisionError):
|
||
price_penalty = 0.0
|
||
|
||
score = (
|
||
brand_score * 0.20
|
||
+ token_score * 0.36
|
||
+ spec_score * 0.25
|
||
+ sequence_score * 0.12
|
||
+ type_score * 0.07
|
||
- price_penalty
|
||
)
|
||
|
||
if token_score >= 0.72 and spec_score >= 0.82 and not brand_conflict:
|
||
score += 0.08
|
||
|
||
if (
|
||
brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.55
|
||
and spec_score >= 0.55
|
||
and not variant_descriptor_conflict
|
||
and _has_strong_product_line_signal(left, right, token_score, chinese_name_score)
|
||
):
|
||
score += 0.07
|
||
reasons.append("strong_product_line_match")
|
||
if (
|
||
brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and _has_safe_exact_spec_signal(left, right, token_score, sequence_score, type_score)
|
||
):
|
||
score += 0.025
|
||
reasons.append("strong_exact_spec_match")
|
||
if (
|
||
cushion_refill_pack_alignment
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.95
|
||
and token_score >= 0.65
|
||
and sequence_score >= 0.65
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.04
|
||
reasons.append("cushion_refill_pack_alignment_score")
|
||
if (
|
||
focused_exact_review_boost_safe
|
||
and price_penalty == 0
|
||
):
|
||
score += 0.16
|
||
reasons.append(f"focused_exact_identity_{focused_exact_line_reason}")
|
||
if (
|
||
shared_anchor
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and spec_score >= 0.85
|
||
and (token_score >= 0.43 or sequence_score >= 0.58)
|
||
):
|
||
score += 0.08
|
||
reasons.append("shared_identity_anchor")
|
||
if (
|
||
shared_anchor
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.95
|
||
and spec_score >= 0.55
|
||
and token_score >= 0.70
|
||
and sequence_score >= 0.62
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.03
|
||
reasons.append("shared_identity_anchor_no_spec")
|
||
if (
|
||
shared_anchor
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.55
|
||
and spec_score >= 0.45
|
||
and token_score >= 0.56
|
||
and sequence_score >= 0.60
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.02
|
||
reasons.append("shared_identity_anchor_packaging_variant")
|
||
if (
|
||
shared_anchor
|
||
and len(shared_anchor.replace(" ", "")) >= 8
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.95
|
||
and spec_score >= 0.45
|
||
and token_score >= 0.60
|
||
and sequence_score >= 0.68
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.03
|
||
reasons.append("shared_identity_anchor_marketing_variant")
|
||
if (
|
||
shared_anchor
|
||
and len(shared_anchor.replace(" ", "")) >= 5
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.55
|
||
and spec_score >= 0.45
|
||
and token_score >= 0.88
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.02
|
||
reasons.append("shared_identity_anchor_core_line")
|
||
if (
|
||
shared_anchor
|
||
and len(shared_anchor.replace(" ", "")) >= 6
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.55
|
||
and spec_score >= 0.45
|
||
and token_score >= 0.86
|
||
and sequence_score >= 0.75
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.07
|
||
reasons.append("shared_identity_anchor_exact_line")
|
||
if (
|
||
"無印乾爽止汗爽身乳液" in shared_anchor
|
||
and {"nivea", "妮維雅"} & (left.brand_tokens | right.brand_tokens)
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.95
|
||
and spec_score >= 0.45
|
||
and token_score >= 0.55
|
||
and sequence_score >= 0.62
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.08
|
||
reasons.append("shared_identity_anchor_nivea_dry_lotion")
|
||
if (
|
||
"多效提亮防曬霜" in shared_anchor
|
||
and {"recipe", "box"} <= (left.brand_tokens | right.brand_tokens)
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.95
|
||
and spec_score >= 0.55
|
||
and token_score >= 0.54
|
||
and sequence_score >= 0.50
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.09
|
||
reasons.append("shared_identity_anchor_recipe_box_line")
|
||
if (
|
||
"私密潔浴露" in shared_anchor
|
||
and {"lactacyd", "立朵舒"} & (left.brand_tokens | right.brand_tokens)
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.95
|
||
and spec_score >= 0.70
|
||
and token_score >= 0.35
|
||
and sequence_score >= 0.50
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.10
|
||
reasons.append("shared_identity_anchor_lactacyd_wash")
|
||
if (
|
||
"私密潔膚露" in shared_anchor
|
||
and {"femfresh", "芳芯"} & (left.brand_tokens | right.brand_tokens)
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.95
|
||
and spec_score >= 0.85
|
||
and token_score >= 0.30
|
||
and sequence_score >= 0.45
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.06
|
||
reasons.append("shared_identity_anchor_femfresh_wash")
|
||
if (
|
||
"私密沐浴露" in shared_anchor
|
||
and {"vigill", "婦潔"} & (left.brand_tokens | right.brand_tokens)
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.95
|
||
and spec_score >= 0.70
|
||
and token_score >= 0.45
|
||
and sequence_score >= 0.55
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.06
|
||
reasons.append("shared_identity_anchor_vigill_private_wash")
|
||
if (
|
||
"私密潔淨凝露" in shared_anchor
|
||
and {"relove"} <= (left.brand_tokens | right.brand_tokens)
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.95
|
||
and spec_score >= 0.85
|
||
and token_score >= 0.30
|
||
and sequence_score >= 0.40
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.11
|
||
reasons.append("shared_identity_anchor_relove_cleanser")
|
||
if (
|
||
"柔霧裸唇膏" in shared_anchor
|
||
and {"kate", "凱婷"} & (left.brand_tokens | right.brand_tokens)
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.55
|
||
and spec_score >= 0.45
|
||
and token_score >= 0.50
|
||
and sequence_score >= 0.50
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.15
|
||
reasons.append("shared_identity_anchor_kate_bare_lip")
|
||
if (
|
||
"閃亮珍珠眼影棒" in shared_anchor
|
||
and {"karadium"} <= (left.brand_tokens | right.brand_tokens)
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.55
|
||
and spec_score >= 0.55
|
||
and token_score >= 0.50
|
||
and sequence_score >= 0.60
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.12
|
||
reasons.append("shared_identity_anchor_karadium_eye_stick")
|
||
if (
|
||
_has_seche_vite_top_coat_alignment(left, right)
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.95
|
||
and token_score >= 0.70
|
||
and sequence_score >= 0.70
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.04
|
||
reasons.append("shared_identity_anchor_seche_vite_top_coat")
|
||
if (
|
||
_has_xiaomi_s101_shaver_alignment(left, right)
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and token_score >= 0.60
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.04
|
||
reasons.append("shared_model_token_xiaomi_s101_shaver")
|
||
if (
|
||
_has_hinoki_roller_oil_alignment(left, right)
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.95
|
||
and spec_score >= 0.85
|
||
and sequence_score >= 0.50
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.04
|
||
reasons.append("shared_identity_anchor_hinoki_roller_oil")
|
||
if (
|
||
_has_brush_baby_wildones_toothbrush_alignment(left, right)
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.95
|
||
and token_score >= 0.78
|
||
and sequence_score >= 0.90
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.04
|
||
reasons.append("shared_model_token_brush_baby_wildones")
|
||
if (
|
||
_has_pshine_beauty_foot_file_alignment(left, right)
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and token_score >= 0.60
|
||
and sequence_score >= 0.78
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.05
|
||
reasons.append("shared_model_token_pshine_beauty_foot_file")
|
||
if (
|
||
_has_catalog_variant_listing_alignment(left, right)
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and spec_score >= 0.85
|
||
and type_score >= 0.95
|
||
and sequence_score >= 0.50
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.06
|
||
reasons.append("catalog_variant_listing_alignment")
|
||
if (
|
||
_has_baan_baby_lip_catalog_alignment(left, right)
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.95
|
||
and token_score >= 0.70
|
||
and sequence_score >= 0.45
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.05
|
||
reasons.append("catalog_variant_listing_alignment_baan_lip")
|
||
if (
|
||
shared_anchor
|
||
and len(shared_anchor.replace(" ", "")) >= 5
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.55
|
||
and spec_score >= 0.45
|
||
and token_score >= 0.74
|
||
and sequence_score >= 0.60
|
||
and _shared_variant_descriptors(left, right)
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.05
|
||
reasons.append("shared_variant_descriptor_alignment")
|
||
if (
|
||
shared_anchor
|
||
and len(shared_anchor.replace(" ", "")) >= 8
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and brand_score == 0.55
|
||
and bool(left.brand_tokens) != bool(right.brand_tokens)
|
||
and type_score >= 0.55
|
||
and spec_score >= 0.55
|
||
and token_score >= 0.80
|
||
and sequence_score >= 0.80
|
||
and chinese_name_score >= 0.42
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.09
|
||
reasons.append("brandless_exact_identity")
|
||
if (
|
||
shared_anchor
|
||
and len(shared_anchor.replace(" ", "")) >= 6
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.95
|
||
and spec_score >= 0.85
|
||
and token_score >= 0.30
|
||
and sequence_score >= 0.50
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.06
|
||
reasons.append("shared_identity_anchor_reordered_line")
|
||
if (
|
||
shared_anchor
|
||
and len(shared_anchor.replace(" ", "")) >= 4
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.95
|
||
and spec_score >= 0.65
|
||
and token_score >= 0.50
|
||
and sequence_score >= 0.50
|
||
and _has_exact_count_alignment(left, right)
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.05
|
||
reasons.append("shared_identity_anchor_bundle_equivalent")
|
||
if (
|
||
shared_anchor
|
||
and len(shared_anchor.replace(" ", "")) >= 6
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and type_score >= 0.55
|
||
and spec_score >= 0.45
|
||
and token_score >= 0.58
|
||
and sequence_score >= 0.50
|
||
and not variant_descriptor_conflict
|
||
):
|
||
score += 0.025
|
||
reasons.append("shared_identity_anchor_variant_safe")
|
||
if (
|
||
brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and spec_score >= 0.99
|
||
and token_score >= 0.44
|
||
and sequence_score >= 0.60
|
||
and type_score >= 0.55
|
||
):
|
||
score += 0.025
|
||
reasons.append("spec_name_alignment")
|
||
shared_models = _shared_model_tokens(left, right)
|
||
if (
|
||
shared_models
|
||
and brand_score >= 0.95
|
||
and not hard_veto
|
||
and price_penalty == 0
|
||
and token_score >= 0.50
|
||
and sequence_score >= 0.62
|
||
):
|
||
score += 0.04
|
||
reasons.append("shared_model_token")
|
||
if variant_descriptor_conflict and spec_score < 0.85:
|
||
score -= 0.05
|
||
reasons.append("variant_descriptor_conflict")
|
||
if (
|
||
brand_score >= 0.95
|
||
and not hard_veto
|
||
and not reasons
|
||
and price_penalty == 0
|
||
and type_score >= 0.95
|
||
and token_score >= 0.82
|
||
and spec_score >= 0.40
|
||
and chinese_name_score >= 0.65
|
||
):
|
||
score += 0.04
|
||
reasons.append("strong_component_line_match")
|
||
if hard_veto:
|
||
score = min(score, 0.74 if comparison_mode == "unit_comparable" else 0.32)
|
||
score = max(0.0, min(1.0, score))
|
||
if _has_safe_multi_component_exact_total_price(
|
||
left,
|
||
right,
|
||
brand_score=brand_score,
|
||
token_score=token_score,
|
||
spec_score=spec_score,
|
||
sequence_score=sequence_score,
|
||
type_score=type_score,
|
||
hard_veto=hard_veto,
|
||
variant_descriptor_conflict=variant_descriptor_conflict,
|
||
reasons=reasons,
|
||
):
|
||
reasons.append("safe_multi_component_exact_total_price")
|
||
reason_tuple = _dedupe_tuple(reasons)
|
||
match_type, price_basis, alert_tier = _classify_match_quality(
|
||
score=score,
|
||
brand_score=brand_score,
|
||
token_score=token_score,
|
||
spec_score=spec_score,
|
||
sequence_score=sequence_score,
|
||
type_score=type_score,
|
||
hard_veto=hard_veto,
|
||
comparison_mode=comparison_mode,
|
||
reasons=reason_tuple,
|
||
shared_anchor=shared_anchor,
|
||
shared_models=shared_models,
|
||
catalog_count_omission=catalog_count_omission,
|
||
multi_component_pair=_has_multi_component(left) and _has_multi_component(right),
|
||
)
|
||
evidence_flags = _build_evidence_flags(
|
||
brand_score=brand_score,
|
||
token_score=token_score,
|
||
spec_score=spec_score,
|
||
sequence_score=sequence_score,
|
||
type_score=type_score,
|
||
shared_anchor=shared_anchor,
|
||
shared_models=shared_models,
|
||
reasons=reason_tuple,
|
||
catalog_count_omission=catalog_count_omission,
|
||
)
|
||
identity_evidence = _identity_evidence_payload(
|
||
left,
|
||
right,
|
||
brand_score=brand_score,
|
||
token_score=token_score,
|
||
spec_score=spec_score,
|
||
sequence_score=sequence_score,
|
||
type_score=type_score,
|
||
hard_veto=hard_veto,
|
||
comparison_mode=comparison_mode,
|
||
match_type=match_type,
|
||
price_basis=price_basis,
|
||
alert_tier=alert_tier,
|
||
shared_anchor=shared_anchor,
|
||
shared_models=shared_models,
|
||
reasons=reason_tuple,
|
||
catalog_count_omission=catalog_count_omission,
|
||
)
|
||
offer_evidence = _offer_evidence_payload(
|
||
momo_price,
|
||
competitor_price,
|
||
price_penalty=price_penalty,
|
||
price_basis=price_basis,
|
||
alert_tier=alert_tier,
|
||
)
|
||
|
||
return MatchDiagnostics(
|
||
score=round(score, 3),
|
||
brand_score=round(brand_score, 3),
|
||
token_score=round(token_score, 3),
|
||
spec_score=round(spec_score, 3),
|
||
sequence_score=round(sequence_score, 3),
|
||
type_score=round(type_score, 3),
|
||
price_penalty=round(price_penalty, 3),
|
||
hard_veto=hard_veto,
|
||
reasons=reason_tuple,
|
||
comparison_mode=comparison_mode,
|
||
match_type=match_type,
|
||
price_basis=price_basis,
|
||
alert_tier=alert_tier,
|
||
evidence_flags=evidence_flags,
|
||
identity_evidence=identity_evidence,
|
||
offer_evidence=offer_evidence,
|
||
)
|
||
|
||
|
||
def _clean_search_phrase(value: str) -> str:
    """Reduce a product string to its searchable words.

    The value is normalized with ``normalize_product_text``, every entry of
    ``SEARCH_NOISE_PHRASES`` is blanked out, punctuation is collapsed to
    spaces (a dot between two digits survives), and tokens listed in
    ``SEARCH_NOISE_TOKENS`` or ``GENERIC_TOKENS`` are dropped.

    Returns:
        The surviving tokens joined by single spaces; may be empty.
    """
    cleaned = normalize_product_text(value)
    # Longest phrases go first, so a short phrase never clips a longer
    # phrase that contains it.
    for noise in sorted(SEARCH_NOISE_PHRASES, key=len, reverse=True):
        cleaned = cleaned.replace(noise.lower(), " ")

    # Shield "1.5"-style decimal points from the punctuation sweep below.
    shielded = re.sub(r"(?<=\d)\.(?=\d)", "DECIMALPOINT", cleaned)
    swept = re.sub(r"[^\w\u4e00-\u9fff]+", " ", shielded)
    restored = swept.replace("DECIMALPOINT", ".").replace("decimalpoint", ".")

    survivors = [
        word
        for word in restored.split()
        if not (word in SEARCH_NOISE_TOKENS or word in GENERIC_TOKENS)
    ]
    return re.sub(r"\s+", " ", " ".join(survivors)).strip()
|
||
|
||
|
||
def _search_spec_terms(identity: ProductIdentity) -> list[str]:
    """Render the identity's leading spec values as short search terms.

    Only the first entry of ``volumes_ml``, ``weights_g`` and ``dosages_mg``
    is used (formatted with ``:g``), followed by ``total_piece_count`` with
    the "包" suffix when it is truthy.

    Returns:
        Terms in the fixed order volume, weight, dosage, piece count.
    """
    measured = (
        (identity.volumes_ml, "ml"),
        (identity.weights_g, "g"),
        (identity.dosages_mg, "mg"),
    )
    terms = [f"{values[0]:g}{unit}" for values, unit in measured if values]
    if identity.total_piece_count:
        terms.append(f"{identity.total_piece_count}包")
    return terms
|
||
|
||
|
||
def _extract_anchor_phrases(token: str) -> list[str]:
    """Collect identity anchor phrases found in ``token``.

    Hand-curated product-line rules are checked first, in a fixed order,
    against either the cleaned or the merely normalized text. After that,
    every entry of ``SEARCH_IDENTITY_ANCHORS`` that occurs in the cleaned
    text contributes a phrase; short CJK anchors may pull in a few
    preceding CJK characters.

    Returns:
        Phrases in discovery order, without duplicates among the
        anchor-derived entries.
    """
    normalized = normalize_product_text(token)
    cleaned = _clean_search_phrase(token)
    if not cleaned:
        return ["經典乳霜"] if "經典乳霜" in normalized else []

    # (text to inspect, substrings that must all occur,
    #  substrings of which one must occur, phrase emitted)
    curated_rules = (
        (cleaned, ("經典旋轉眉筆",), (), "經典旋轉眉筆"),
        (cleaned, ("無印乾爽", "止汗爽身乳液"), (), "無印乾爽止汗爽身乳液"),
        (
            cleaned,
            ("智能光感應", "無線自動除臭芳香噴霧機"),
            (),
            "智能光感應無線自動除臭芳香噴霧機",
        ),
        (normalized, ("悠斯晶", "經典乳霜"), (), "悠斯晶經典乳霜"),
        (normalized, ("經典乳霜",), (), "經典乳霜"),
        (cleaned, ("蜂王玫瑰",), ("外泌微臻霜", "微泌新生霜", "瑰泌霜"), "蜂王玫瑰瑰泌霜"),
        (cleaned, ("瞬效", "b5", "玻尿酸", "精華"), (), "瞬效b5玻尿酸精華"),
        (cleaned, ("慕之幼", "爽身潤膚乳"), (), "慕之幼爽身潤膚乳"),
    )
    found: list[str] = [
        emitted
        for haystack, all_of, any_of, emitted in curated_rules
        if all(part in haystack for part in all_of)
        and (not any_of or any(part in haystack for part in any_of))
    ]

    for anchor in SEARCH_IDENTITY_ANCHORS:
        anchor_phrase = _clean_search_phrase(anchor)
        if not anchor_phrase or anchor_phrase not in cleaned:
            continue

        candidate = anchor_phrase
        if re.search(r"[\u4e00-\u9fff]", anchor_phrase):
            # Shorter CJK anchors are allowed a wider run of leading CJK
            # characters; anchors of five or more characters get none.
            if len(anchor_phrase) >= 5:
                prefix_width = 0
            elif len(anchor_phrase) >= 3:
                prefix_width = 4
            else:
                prefix_width = 6
            match = re.search(rf"([\u4e00-\u9fff]{{0,{prefix_width}}}{re.escape(anchor_phrase)})", cleaned)
            if match:
                candidate = match.group(1)

        candidate = _clean_search_phrase(candidate)
        if candidate.startswith("款") and len(candidate) > 2:
            candidate = candidate[1:]
        # Skip candidates that merely extend a phrase already collected.
        if any(kept in candidate and kept != candidate for kept in found):
            continue
        if len(candidate) >= 2 and candidate not in found:
            found.append(candidate)
    return found
|
||
|
||
|
||
def _shared_identity_anchor(left: ProductIdentity, right: ProductIdentity) -> str:
    """Return the longest anchor phrase both identities have in common.

    Anchors are extracted from each identity's core tokens, normalized name
    and searchable name. Two anchors count as shared when they are equal
    once spaces are removed, or when one (at least five characters long
    without spaces) is contained in the other. Results shorter than five
    characters or listed in ``SEARCH_BROAD_ANCHORS`` are discarded.

    Returns:
        The best shared anchor (longest first, then lexicographic), or ``""``.
    """

    def collect(identity: ProductIdentity) -> set[str]:
        gathered: set[str] = set()
        for source in (*identity.core_tokens, identity.normalized_name, identity.searchable_name):
            gathered.update(_extract_anchor_phrases(source))
        return gathered

    def squeeze(anchor: str) -> str:
        return anchor.replace(" ", "")

    left_pool = collect(left)
    right_pool = collect(right)

    candidates: set[str] = set()
    for mine in left_pool:
        mine_key = squeeze(mine)
        for theirs in right_pool:
            theirs_key = squeeze(theirs)
            if mine_key == theirs_key or (len(mine_key) >= 5 and mine_key in theirs_key):
                candidates.add(mine)
            elif len(theirs_key) >= 5 and theirs_key in mine_key:
                candidates.add(theirs)

    eligible = [
        anchor
        for anchor in candidates
        if len(squeeze(anchor)) >= 5 and anchor not in SEARCH_BROAD_ANCHORS
    ]
    return min(eligible, key=lambda anchor: (-len(squeeze(anchor)), anchor), default="")
|
||
|
||
|
||
def _shared_model_tokens(left: ProductIdentity, right: ProductIdentity) -> set[str]:
    """Return core tokens present on both sides that look like model codes.

    A token qualifies when it has at least four characters, contains both a
    lowercase ASCII letter and a digit, and is not classified as spec-like
    by ``_is_spec_like_latin_token``.
    """
    models: set[str] = set()
    for token in left.core_tokens & right.core_tokens:
        if len(token) < 4:
            continue
        if not re.search(r"[a-z]", token) or not re.search(r"\d", token):
            continue
        if _is_spec_like_latin_token(token):
            continue
        models.add(token)
    return models
|
||
|
||
|
||
def _variant_descriptors(identity: ProductIdentity) -> set[str]:
    """Extract the variant-describing leftovers of each core token.

    For every core token the anchor phrases are cut out (longest first), the
    remainder is cleaned and stripped of spaces, and it is kept unless it is
    too short, equals a brand, is a known noise/broad term, contains a
    variant-noise keyword, or consists only of ``[a-z0-9-]``. A trailing
    "款" is removed from the kept value.
    """
    brand_keys = {brand.replace(" ", "") for brand in identity.brand_tokens}

    def is_noise(compact: str) -> bool:
        return (
            len(compact) < 2
            or compact in brand_keys
            or compact in SEARCH_NOISE_TOKENS
            or compact in SEARCH_BROAD_ANCHORS
            or any(keyword in compact for keyword in VARIANT_DESCRIPTOR_NOISE_KEYWORDS)
            or re.fullmatch(r"[a-z0-9-]+", compact) is not None
        )

    found: set[str] = set()
    for token in identity.core_tokens:
        remainder = token
        for anchor in sorted(_extract_anchor_phrases(token), key=len, reverse=True):
            remainder = remainder.replace(anchor, " ")
        compact = _clean_search_phrase(remainder).replace(" ", "")
        if not is_noise(compact):
            found.add(compact.removesuffix("款"))
    found.discard("")
    return found
|
||
|
||
|
||
def _shared_variant_descriptors(left: ProductIdentity, right: ProductIdentity) -> set[str]:
    """Return variant descriptors the two identities have in common.

    Descriptors match when equal, or when one of at least two characters is
    a substring of the other; in the substring case the shorter (contained)
    descriptor is the one reported.
    """

    def overlap(mine: str, theirs: str) -> Optional[str]:
        if mine == theirs:
            return mine
        if len(mine) >= 2 and mine in theirs:
            return mine
        if len(theirs) >= 2 and theirs in mine:
            return theirs
        return None

    right_pool = _variant_descriptors(right)
    common: set[str] = set()
    for mine in _variant_descriptors(left):
        for theirs in right_pool:
            hit = overlap(mine, theirs)
            if hit is not None:
                common.add(hit)
    return common
|
||
|
||
|
||
def _has_serum_formulation_conflict(left: ProductIdentity, right: ProductIdentity, shared_anchor: str) -> bool:
    """Tell whether two "精華" products name different formulations.

    Only applies when ``shared_anchor`` contains "精華". Each normalized name
    is mapped to the first of "精華乳", "精華霜", "精華液" it contains; a
    conflict needs both sides to resolve, and to resolve differently.
    """
    if "精華" not in shared_anchor:
        return False

    def formulation_of(name: str) -> Optional[str]:
        for candidate in ("精華乳", "精華霜", "精華液"):
            if candidate in name:
                return candidate
        return None

    left_form = formulation_of(left.normalized_name)
    right_form = formulation_of(right.normalized_name)
    if not left_form or not right_form:
        return False
    return left_form != right_form
|
||
|
||
|
||
def _has_saugella_private_wash_variant_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag two Saugella listings that name different variants.

    Both searchable names must mention "saugella" or "賽吉兒". A conflict is
    reported when each side hits at least one variant keyword and the two
    sets of hits have nothing in common.
    """
    names = (left.searchable_name, right.searchable_name)
    if not all("saugella" in name or "賽吉兒" in name for name in names):
        return False

    variant_tokens = ("日用", "日用型", "加強", "潤澤", "黃金女郎型")
    left_hits, right_hits = (
        {token for token in variant_tokens if token in name} for name in names
    )
    if not left_hits or not right_hits:
        return False
    return left_hits.isdisjoint(right_hits)
|
||
|
||
|
||
def _has_lactacyd_private_wash_variant_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag two Lactacyd listings that name different variants.

    Both searchable names must mention "lactacyd" or "立朵舒". A conflict is
    reported when each side hits at least one variant keyword and the two
    sets of hits have nothing in common.
    """
    names = (left.searchable_name, right.searchable_name)
    if not all("lactacyd" in name or "立朵舒" in name for name in names):
        return False

    variant_tokens = (
        "清新舒涼",
        "生理呵護",
        "滋潤緊緻",
        "加倍修護",
        "柔軟滋潤",
        "亮肌柔滑",
        "全日清爽",
    )
    left_hits, right_hits = (
        {token for token in variant_tokens if token in name} for name in names
    )
    if not left_hits or not right_hits:
        return False
    return left_hits.isdisjoint(right_hits)
|
||
|
||
|
||
def _has_makeup_usage_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag a cheek product paired with an eye product.

    Each searchable name is mapped to the usage categories it mentions:
    cheek ("頰彩", "腮紅", "blush") and eye ("眼彩", "眼影", "eyeshadow").
    A conflict is reported only when both sides mention a category and the
    two sides share none.

    Requiring disjoint categories keeps a dual-use listing (one that
    mentions both cheek and eye terms) from conflicting with an identical
    copy of itself, and follows the same "both non-empty and disjoint" rule
    used by the other variant-conflict checks in this module.

    Returns:
        ``True`` when the two products are for different usages.
    """
    usage_terms = {
        "cheek": ("頰彩", "腮紅", "blush"),
        "eye": ("眼彩", "眼影", "eyeshadow"),
    }

    def usages(text: str) -> set[str]:
        return {
            usage
            for usage, terms in usage_terms.items()
            if any(term in text for term in terms)
        }

    left_usages = usages(left.searchable_name)
    right_usages = usages(right.searchable_name)
    if not left_usages or not right_usages:
        return False
    return left_usages.isdisjoint(right_usages)
|
||
|
||
|
||
def _has_makeup_finish_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag MAC "macximal" lipsticks whose finishes differ.

    Requires "mac" among the brand tokens of both sides and both searchable
    names to contain "macximal" and "唇膏". A conflict is a matte term on one
    side facing a satin term on the other.
    """
    if "mac" not in (left.brand_tokens & right.brand_tokens):
        return False

    names = (left.searchable_name, right.searchable_name)
    if not all("macximal" in name and "唇膏" in name for name in names):
        return False

    def mentions(name: str, terms: tuple[str, ...]) -> bool:
        return any(term in name for term in terms)

    matte_terms = ("柔霧", "霧面", "matte")
    satin_terms = ("緞光", "satin")
    left_name, right_name = names
    matte_vs_satin = mentions(left_name, matte_terms) and mentions(right_name, satin_terms)
    satin_vs_matte = mentions(left_name, satin_terms) and mentions(right_name, matte_terms)
    return matte_vs_satin or satin_vs_matte
|
||
|
||
|
||
def _spf_values(identity: ProductIdentity) -> set[int]:
    """Return every SPF number written in the identity's normalized name.

    Matches "spf" (any letter case), optional whitespace, then one to three
    digits.
    """
    digits = re.findall(r"spf\s*(\d{1,3})", identity.normalized_name, re.I)
    return set(map(int, digits))
|
||
|
||
|
||
def _has_sun_protection_spf_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag sun-care listings whose stated SPF values do not overlap.

    The check only runs when the combined searchable names mention "防曬",
    "素顏霜" or "spf". Both sides must state at least one SPF value.
    """
    combined = f"{left.searchable_name} {right.searchable_name}"
    sun_care_terms = ("防曬", "素顏霜", "spf")
    if all(term not in combined for term in sun_care_terms):
        return False

    left_values = _spf_values(left)
    right_values = _spf_values(right)
    if not left_values or not right_values:
        return False
    return left_values.isdisjoint(right_values)
|
||
|
||
|
||
def _makeup_spray_line_groups(identity: ProductIdentity) -> set[str]:
    """Classify a listing into makeup-spray line groups.

    A group is assigned when the searchable name contains any of that
    group's trigger keywords; a listing may fall into several groups.
    """
    triggers = {
        "setting_spray": ("fix+", "定妝噴霧", "超持妝"),
        "activating_water": ("活氧水", "激活版"),
        "serum_variant": ("精華版",),
        "oil_control": ("控油", "黑特霧"),
    }
    name = identity.searchable_name
    return {
        group
        for group, keywords in triggers.items()
        if any(keyword in name for keyword in keywords)
    }
|
||
|
||
|
||
def _has_makeup_spray_line_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag a setting spray paired with an activating water.

    Runs only when the combined searchable names mention one of the spray
    line keywords; the conflict is "setting_spray" on one side facing
    "activating_water" on the other.
    """
    combined = f"{left.searchable_name} {right.searchable_name}"
    gate_terms = ("定妝噴霧", "活氧水", "fix+", "超光肌", "超持妝")
    if all(term not in combined for term in gate_terms):
        return False

    left_groups = _makeup_spray_line_groups(left)
    right_groups = _makeup_spray_line_groups(right)
    if not (left_groups and right_groups):
        return False

    for first, second in ((left_groups, right_groups), (right_groups, left_groups)):
        if "setting_spray" in first and "activating_water" in second:
            return True
    return False
|
||
|
||
|
||
def _has_makeup_spray_variant_selection_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Tell whether two spray listings differ in line groups without conflicting.

    Both sides must fall into at least one spray line group and must not
    already be a hard line conflict; the gap is then any difference between
    the two group sets.
    """
    left_groups = _makeup_spray_line_groups(left)
    right_groups = _makeup_spray_line_groups(right)
    both_classified = bool(left_groups) and bool(right_groups)
    if both_classified and not _has_makeup_spray_line_conflict(left, right):
        return left_groups != right_groups
    return False
|
||
|
||
|
||
def _romand_lip_line_groups(identity: ProductIdentity) -> set[str]:
    """Classify a listing into lip product line groups.

    A group is assigned when the searchable name contains any of that
    group's trigger keywords; a listing may fall into several groups.
    """
    triggers = {
        "juicy": ("果汁唇釉", "juicy"),
        "zero_velvet": ("零絲絨", "zero velvet", "霧面唇釉"),
        "glasting": ("果凍唇釉", "glasting", "唇凍"),
        "water_gloss": ("水感唇釉",),
    }
    name = identity.searchable_name
    return {
        group
        for group, keywords in triggers.items()
        if any(keyword in name for keyword in keywords)
    }
|
||
|
||
|
||
def _has_romand_lip_line_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag two rom&nd lip listings that belong to different lines.

    The brand is recognised through the brand tokens of either side or the
    text "rom&nd" / "romand" in the combined names. Both names must contain
    "唇", and both must resolve to lip line groups that do not overlap.
    """
    combined = f"{left.searchable_name} {right.searchable_name}"
    brand_token_hit = bool({"rom", "romand"} & (left.brand_tokens | right.brand_tokens))
    if not brand_token_hit and "rom&nd" not in combined and "romand" not in combined:
        return False

    if not ("唇" in left.searchable_name and "唇" in right.searchable_name):
        return False

    left_groups = _romand_lip_line_groups(left)
    right_groups = _romand_lip_line_groups(right)
    if not left_groups or not right_groups:
        return False
    return left_groups.isdisjoint(right_groups)
|
||
|
||
|
||
def _has_nail_tool_function_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag an erbe nail cleaning stick paired with a nail-edge planer.

    Requires "erbe" among the brand tokens of both sides and "指甲" in both
    searchable names. A conflict is a cleaning term on one side facing a
    planer term on the other.
    """
    if "erbe" not in (left.brand_tokens & right.brand_tokens):
        return False

    left_name = left.searchable_name
    right_name = right.searchable_name
    if not ("指甲" in left_name and "指甲" in right_name):
        return False

    def mentions(name: str, terms: tuple[str, ...]) -> bool:
        return any(term in name for term in terms)

    cleaning_terms = ("清垢棒", "清潔棒")
    plane_terms = ("指甲緣刨刀", "刨刀")
    cleaning_vs_plane = mentions(left_name, cleaning_terms) and mentions(right_name, plane_terms)
    plane_vs_cleaning = mentions(left_name, plane_terms) and mentions(right_name, cleaning_terms)
    return cleaning_vs_plane or plane_vs_cleaning
|
||
|
||
|
||
def _has_yes_nail_tool_exact_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Tell whether two YES (德悅氏) nail-tool listings are the same exact item.

    Both sides must carry the brand token. A match is either one of the
    known product lines with a size (and, where listed, an option) present
    on both sides, or a plain nail clipper ("指甲剪") with a shared size and
    a shared finish and none of the excluded sub-type terms on either side.
    """
    if not ({"yes", "德悅氏"} & (left.brand_tokens & right.brand_tokens)):
        return False

    left_name = left.searchable_name
    right_name = right.searchable_name
    combined = f"{left_name} {right_name}"
    if all(term not in combined for term in ("指甲剪", "銼刀", "腳皮銼", "拋光棒")):
        return False

    def on_both(term: str) -> bool:
        return term in left_name and term in right_name

    def any_on_both(terms: tuple[str, ...]) -> bool:
        return any(on_both(term) for term in terms)

    # (product line, sizes of which one must be shared,
    #  options of which one must be shared when any are listed)
    exact_lines = (
        ("指甲剪附除垢銼刀", ("8cm",), ("亮面", "霧面")),
        ("腳皮銼腳板", ("23.5cm",), ()),
        ("藍寶石銼刀", ("9cm",), ("可收納",)),
        ("指甲拋光棒", ("17.5cm",), ("三面",)),
    )
    for line, sizes, required_options in exact_lines:
        if not on_both(line) or not any_on_both(sizes):
            continue
        if not required_options or any_on_both(required_options):
            return True

    if not on_both("指甲剪"):
        return False
    excluded = ("附除垢", "腳", "硬皮", "鋒利窄弧型")
    if any(term in left_name or term in right_name for term in excluded):
        return False
    if not any_on_both(("6cm", "8cm")):
        return False
    return any_on_both(("亮面", "霧面", "不掉屑"))
|
||
|
||
|
||
def _has_schick_razor_line_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag two Schick (舒適牌) razor listings from different lines.

    Both sides must carry the brand token, and the combined names must
    mention both "仕女" and "除毛刀". A conflict is "舒芙" on one side facing
    "舒綺" on the other.
    """
    if not ({"schick", "舒適牌"} & (left.brand_tokens & right.brand_tokens)):
        return False

    left_name = left.searchable_name
    right_name = right.searchable_name
    combined = f"{left_name} {right_name}"
    # "除毛刀" is part of this tuple, so one test covers both gate checks.
    if any(term not in combined for term in ("仕女", "除毛刀")):
        return False

    first_line, second_line = "舒芙", "舒綺"
    forward = first_line in left_name and second_line in right_name
    backward = second_line in left_name and first_line in right_name
    return forward or backward
|
||
|
||
|
||
def _has_lancome_ultra_line_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag two Lancome listings that come from two different product lines.

    Both searchable names must mention "lancome" or "蘭蔻". A conflict is a
    term from the first keyword group on one side facing a term from the
    second group on the other.
    """
    left_name = left.searchable_name
    right_name = right.searchable_name
    if not all("lancome" in name or "蘭蔻" in name for name in (left_name, right_name)):
        return False

    def mentions(name: str, terms: tuple[str, ...]) -> bool:
        return any(term in name for term in terms)

    glow_terms = ("超極光", "極光水", "晶露", "活粹晶露", "四重酸")
    genifique_terms = ("超極限", "肌因", "小黑瓶", "賦活露", "肌因精華")
    glow_vs_genifique = mentions(left_name, glow_terms) and mentions(right_name, genifique_terms)
    genifique_vs_glow = mentions(left_name, genifique_terms) and mentions(right_name, glow_terms)
    return glow_vs_genifique or genifique_vs_glow
|
||
|
||
|
||
def _has_dr_hsieh_labsmart_serum_line_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag a labsmart-line serum paired with a multi-repair serum.

    Both searchable names must contain "dr" or "達特醫" and both product
    types must be "精華". A conflict is a labsmart term on one side facing a
    repair term on a side that has no labsmart term itself.
    """
    left_name = left.searchable_name
    right_name = right.searchable_name
    # NOTE(review): "dr" is a plain substring test, so it also matches any
    # name that merely contains those two letters; confirm this is intended.
    if not all("dr" in name or "達特醫" in name for name in (left_name, right_name)):
        return False
    if not (left.product_type == "精華" and right.product_type == "精華"):
        return False

    def mentions(name: str, terms: tuple[str, ...]) -> bool:
        return any(term in name for term in terms)

    labsmart_terms = ("labsmart", "hi tech", "hi-tech", "classic")
    repair_terms = ("神經醯胺多重修復", "多重修復保濕精華", "多重修復保濕精華液")
    left_is_labsmart = mentions(left_name, labsmart_terms)
    right_is_labsmart = mentions(right_name, labsmart_terms)

    if left_is_labsmart and not right_is_labsmart and mentions(right_name, repair_terms):
        return True
    return right_is_labsmart and not left_is_labsmart and mentions(left_name, repair_terms)
|
||
|
||
|
||
def _has_cotton_swab_variant_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag two cotton swab ("棉棒") listings that name different variants.

    Both searchable names must contain "棉棒". A conflict is reported when
    each side hits at least one variant keyword and the hits do not overlap.
    """
    names = (left.searchable_name, right.searchable_name)
    if any("棉棒" not in name for name in names):
        return False

    variant_tokens = ("細軸", "黑色")
    left_hits, right_hits = (
        {token for token in variant_tokens if token in name} for name in names
    )
    if not left_hits or not right_hits:
        return False
    return left_hits.isdisjoint(right_hits)
|
||
|
||
|
||
def _has_kanebo_milano_powder_perfume_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag a Kanebo Milano Collection powder paired with a fragrance.

    Both sides must carry the brand token, and both searchable names must
    mention "milano", "米蘭" or "collection". A conflict is a powder term on
    one side facing a fragrance term on the other.
    """
    if not ({"kanebo", "佳麗寶"} & (left.brand_tokens & right.brand_tokens)):
        return False

    def mentions(name: str, terms: tuple[str, ...]) -> bool:
        return any(term in name for term in terms)

    left_name = left.searchable_name
    right_name = right.searchable_name
    collection_terms = ("milano", "米蘭", "collection")
    if not (mentions(left_name, collection_terms) and mentions(right_name, collection_terms)):
        return False

    powder_terms = ("蜜粉", "粉餅")
    fragrance_terms = ("香水", "淡香精", "淡香水", "perfume")
    powder_vs_fragrance = mentions(left_name, powder_terms) and mentions(right_name, fragrance_terms)
    fragrance_vs_powder = mentions(right_name, powder_terms) and mentions(left_name, fragrance_terms)
    return powder_vs_fragrance or fragrance_vs_powder
|
||
|
||
|
||
def _has_hoi_candle_line_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag two hoi candle listings that come from different lines.

    Requires "hoi" among the brand tokens of both sides and "蠟燭" in both
    searchable names. A conflict is a "日京山風" listing on one side facing a
    lab-line listing on the other.
    """
    if "hoi" not in (left.brand_tokens & right.brand_tokens):
        return False

    left_name = left.searchable_name
    right_name = right.searchable_name
    if not ("蠟燭" in left_name and "蠟燭" in right_name):
        return False

    def mentions(name: str, terms: tuple[str, ...]) -> bool:
        return any(term in name for term in terms)

    day_mountain_terms = ("日京山風",)
    lab_terms = ("hoi!lab", "hoilab", "實驗室香氛", "經典篇")
    day_vs_lab = mentions(left_name, day_mountain_terms) and mentions(right_name, lab_terms)
    lab_vs_day = mentions(right_name, day_mountain_terms) and mentions(left_name, lab_terms)
    return day_vs_lab or lab_vs_day
|
||
|
||
|
||
def _has_aroma_scent_variant_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag two fragrance-product listings that name different scents.

    The check is skipped for hand creams and wax-warmer lamps, for pairs
    that mention none of the supported fragrance product terms, and when
    either side is a multi-variant catalog listing. Explicit option tokens
    decide the result when both sides have them; otherwise known scent
    words are compared, and a scent stated on only one side also counts as
    a conflict.
    """
    combined = f"{left.searchable_name} {right.searchable_name}"

    skipped_products = ("護手霜", "融蠟燈", "蠟燭暖燈")
    if any(term in combined for term in skipped_products):
        return False

    fragrance_products = (
        "香氛固體凝膠",
        "香氛凝膠",
        "空氣芳香劑",
        "車用香氛",
        "車用擴香",
        "擴香蕊",
        "擴香罐",
        "香薰蠟燭",
        "香氛蠟燭",
        "蠟燭",
        "滾珠精油",
        "香氛精油",
        "植物精油",
    )
    if all(term not in combined for term in fragrance_products):
        return False
    if _is_multi_variant_catalog_listing(left) or _is_multi_variant_catalog_listing(right):
        return False

    left_options = _explicit_variant_option_tokens(left)
    right_options = _explicit_variant_option_tokens(right)
    if left_options and right_options:
        # Explicit options on both sides settle the question on their own.
        return not (left_options & right_options)

    scent_words = (
        "藤蔓果園",
        "清新花園",
        "白麝香",
        "黑麝香",
        "寶貝粉香",
        "青檸羅勒",
        "炭木香",
        "無花果",
        "白茶蘭花",
        "白茶",
        "檸檬草",
        "茶樹",
        "鼠尾草",
        "海鹽",
        "橙花",
        "薄荷",
        "杏仁",
        "薰衣草",
        "茉莉",
        "櫻花",
        "繡球花",
        "玫瑰",
        "雪松",
        "檀香",
    )
    left_scent = {word for word in scent_words if word in left.searchable_name}
    right_scent = {word for word in scent_words if word in right.searchable_name}

    left_states_variant = bool(left_options or left_scent)
    right_states_variant = bool(right_options or right_scent)
    if left_states_variant != right_states_variant:
        return True
    return bool(left_scent and right_scent and not (left_scent & right_scent))
|
||
|
||
|
||
def _has_unknown_scent_variant_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag scented hand creams or diffuser bottles with unrelated descriptors.

    Applies to pairs mentioning "擴香瓶", or "護手霜" together with a scent
    hint, and is skipped for lamps, several device/car product terms, and
    multi-variant catalog listings. After removing shared descriptors, each
    side must keep at least one short (2-6 character), non-noise descriptor;
    a conflict means no remaining descriptor on one side contains, or is
    contained in, one on the other side.
    """
    combined = f"{left.searchable_name} {right.searchable_name}"

    if any(term in combined for term in ("暖燈", "融蠟燈", "融燭燈", "香氛燈")):
        return False
    if any(term in combined for term in ("香氛誘霜", "車用", "芳香劑", "香氛機", "擴香儀", "噴香機")):
        return False

    scented_hand_cream = "護手霜" in combined and any(
        term in combined for term in ("芬香", "香味", "香氣", "精油")
    )
    if not scented_hand_cream and "擴香瓶" not in combined:
        return False
    if _is_multi_variant_catalog_listing(left) or _is_multi_variant_catalog_listing(right):
        return False

    left_descriptors = _variant_descriptors(left)
    right_descriptors = _variant_descriptors(right)
    if not (left_descriptors and right_descriptors):
        return False

    descriptor_noise = ("護手霜", "擴香瓶", "精油芬香", "經典擴香")

    def is_candidate(descriptor: str) -> bool:
        if not 2 <= len(descriptor) <= 6:
            return False
        return all(noise not in descriptor for noise in descriptor_noise)

    shared = _shared_variant_descriptors(left, right)
    left_only = {d for d in left_descriptors - shared if is_candidate(d)}
    right_only = {d for d in right_descriptors - shared if is_candidate(d)}
    if not left_only or not right_only:
        return False

    # Equal descriptors are covered by the substring test as well.
    related = any(
        mine in theirs or theirs in mine
        for mine in left_only
        for theirs in right_only
    )
    return not related
|
||
|
||
|
||
def _has_nail_polish_color_name_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag two nail polish listings whose colour identifiers disagree.

    Applies only to pairs mentioning a nail polish term, and never when the
    sides share a model token. If both sides expose nail polish model codes,
    those decide the result. Otherwise (and unless either side is a
    multi-variant catalog listing) the descriptors left after removing
    shared ones are compared: a conflict means none on one side contains,
    or is contained in, one on the other.
    """
    combined = f"{left.searchable_name} {right.searchable_name}"
    if all(term not in combined for term in ("指甲油", "指彩", "美甲彩繪")):
        return False
    if _shared_model_tokens(left, right):
        return False

    left_codes = _nail_polish_model_codes(left)
    right_codes = _nail_polish_model_codes(right)
    if left_codes and right_codes:
        return left_codes.isdisjoint(right_codes)

    if _is_multi_variant_catalog_listing(left) or _is_multi_variant_catalog_listing(right):
        return False

    left_descriptors = _variant_descriptors(left)
    right_descriptors = _variant_descriptors(right)
    if not (left_descriptors and right_descriptors):
        return False

    shared = _shared_variant_descriptors(left, right)
    left_only = left_descriptors - shared
    right_only = right_descriptors - shared
    if not left_only or not right_only:
        return False

    related = any(
        mine in theirs or theirs in mine
        for mine in left_only
        for theirs in right_only
    )
    return not related
|
||
|
||
|
||
def _has_aroma_lamp_style_selection_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Detect a style mismatch between two wax-warmer lamp listings.

    Both listings must share one of the brand tokens ``les`` / ``nez`` /
    ``香鼻子`` and each name must mention a lamp term. Every name is mapped to
    the set of known style keys whose aliases it contains; the pair is flagged
    when those two sets differ (including when only one side names a style).
    """
    common_brand = left.brand_tokens & right.brand_tokens
    if not ({"les", "nez", "香鼻子"} & common_brand):
        return False

    lamp_terms = ("融蠟燈", "融燭燈", "蠟燭暖燈", "香氛燈")
    names = (left.searchable_name, right.searchable_name)
    # Requiring a lamp term in each name also implies one in the joined text.
    if not all(any(term in name for term in lamp_terms) for name in names):
        return False

    style_aliases = {
        "流金歲月": ("流金歲月",),
        "暮光琥珀": ("暮光琥珀",),
        "閃耀琥珀": ("閃耀琥珀",),
        "星夜": ("星夜款", "星夜"),
        "流光玫瑰金": ("流光玫瑰金", "玫瑰金"),
        "土耳其風": ("土耳其風",),
        "手工拼貼玻璃": ("手工拼貼玻璃",),
        "手工玻璃": ("手工玻璃",),
        "北歐": ("北歐",),
        "水晶燈": ("水晶燈",),
    }

    def styles_in(name: str) -> set[str]:
        return {
            style
            for style, aliases in style_aliases.items()
            if any(alias in name for alias in aliases)
        }

    # Two unequal sets always leave at least one side-specific style, so the
    # verdict reduces to plain inequality.
    return styles_in(names[0]) != styles_in(names[1])
|
||
|
||
|
||
def _has_core_ingredient_line_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag oil/cream listings built around different core ingredients.

    The combined names must mention one of the oil/cream category terms. Each
    name is then classified into ingredient groups by substring; a conflict
    exists only when both sides name at least one group and share none.
    """
    category_terms = ("油膏", "護膚油", "身體油", "精油", "基礎油", "按摩油", "甜杏仁油", "酪梨油", "霜", "乳霜")
    combined = f"{left.searchable_name} {right.searchable_name}"
    if all(term not in combined for term in category_terms):
        return False

    ingredient_groups = {
        "coconut_oil": ("椰子油", "coconut"),
        "shea_butter": ("乳木果油", "shea"),
        "sweet_almond_oil": ("甜杏仁油", "sweet almond"),
        "apricot_kernel_oil": ("杏桃核仁油", "杏核仁油", "apricot kernel"),
        "avocado_oil": ("酪梨油", "avocado"),
    }

    def groups_in(name: str) -> set[str]:
        return {
            key
            for key, keywords in ingredient_groups.items()
            if any(keyword in name for keyword in keywords)
        }

    found_left = groups_in(left.searchable_name)
    found_right = groups_in(right.searchable_name)
    return bool(found_left) and bool(found_right) and found_left.isdisjoint(found_right)
|
||
|
||
|
||
def _has_clarins_body_oil_line_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag body-oil listings that belong to different named lines.

    Requires a shared ``clarins`` / ``克蘭詩`` brand token and a body-oil term in
    the combined names. Each name is classified into the "contour" and
    "tonic" keyword groups; the result is True only when both sides match at
    least one group and the matched groups do not overlap.
    """
    common_brand = left.brand_tokens & right.brand_tokens
    if not ({"clarins", "克蘭詩"} & common_brand):
        return False

    combined = f"{left.searchable_name} {right.searchable_name}"
    oil_terms = ("護理油", "身體油", "美體油", "調和護理油")
    if all(term not in combined for term in oil_terms):
        return False

    line_groups = {
        "contour_lightweight": ("輕盈美體", "美體護理油", "contour"),
        "tonic_body": ("身體調和", "調和護理油", "孕期身體調和", "tonic"),
    }

    def lines_in(name: str) -> set[str]:
        return {
            key
            for key, keywords in line_groups.items()
            if any(keyword in name for keyword in keywords)
        }

    found_left = lines_in(left.searchable_name)
    found_right = lines_in(right.searchable_name)
    if not found_left or not found_right:
        return False
    return found_left.isdisjoint(found_right)
|
||
|
||
|
||
def _has_branded_powder_line_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag ``港香蘭`` body-powder listings that name different product lines.

    Both listings must share the ``港香蘭`` brand token and both names must
    contain ``爽身粉``. A conflict is reported only when each side mentions at
    least one of the named lines and no line is common to both.
    """
    if "港香蘭" not in (left.brand_tokens & right.brand_tokens):
        return False
    if not all("爽身粉" in item.searchable_name for item in (left, right)):
        return False

    named_lines = ("漢本", "艾魔菈")
    found_left = {name for name in named_lines if name in left.searchable_name}
    found_right = {name for name in named_lines if name in right.searchable_name}
    return bool(found_left) and bool(found_right) and found_left.isdisjoint(found_right)
|
||
|
||
|
||
def _has_cleanser_lotion_line_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag a cleanser listed against a lotion under a shared brand.

    Applies only when the listings share a brand token and
    ``_has_overlapping_base_spec`` accepts the pair. The result is True when
    one name carries a cleanser term while the other carries a lotion term.
    """
    if not (left.brand_tokens & right.brand_tokens):
        return False
    if not _has_overlapping_base_spec(left, right):
        return False

    cleanser_terms = ("潔膚露", "潔膚", "潔淨露", "潔面", "洗面乳", "cleanser")
    lotion_terms = ("修護乳", "乳液", "身體乳", "潤膚乳", "lotion")

    def mentions(identity: ProductIdentity, vocabulary: tuple) -> bool:
        return any(word in identity.searchable_name for word in vocabulary)

    cleanser_vs_lotion = mentions(left, cleanser_terms) and mentions(right, lotion_terms)
    lotion_vs_cleanser = mentions(right, cleanser_terms) and mentions(left, lotion_terms)
    return cleanser_vs_lotion or lotion_vs_cleanser
|
||
|
||
|
||
def _selection1990_wax_lamp_design_groups(identity: ProductIdentity) -> set[str]:
    """Return the wax-lamp design group keys whose marker appears in the name.

    Each group is detected by its shortest marker; the longer product names
    checked alongside it contain that marker, so matching it alone is
    equivalent.
    """
    name = identity.searchable_name
    design_markers = (
        ("half_dome", "半圓罩"),
        ("bendable", "可彎融燭燈"),
        ("wood_base", "原木底座融燭燈"),
        ("nordic", "北歐簡樸"),
    )
    return {group for group, marker in design_markers if marker in name}
|
||
|
||
|
||
def _has_selection1990_wax_lamp_design_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag wax-lamp listings whose design groups do not overlap.

    Both brand tokens ``1990`` and ``選物`` must be shared by the two listings
    and the combined names must mention a wax-lamp term. A conflict requires
    each side to resolve to at least one design group with none in common.
    """
    common_brand = left.brand_tokens & right.brand_tokens
    if not {"1990", "選物"}.issubset(common_brand):
        return False

    combined = f"{left.searchable_name} {right.searchable_name}"
    if all(term not in combined for term in ("融燭燈", "蠟燭暖燈", "融蠟燈")):
        return False

    designs_left = _selection1990_wax_lamp_design_groups(left)
    designs_right = _selection1990_wax_lamp_design_groups(right)
    if not designs_left or not designs_right:
        return False
    return designs_left.isdisjoint(designs_right)
|
||
|
||
|
||
def _has_hooome_wax_lamp_design_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag ``hooome`` wax-lamp listings that mention different design terms.

    Requires the shared brand token ``hooome`` and a wax-lamp term in the
    combined names. The concrete design terms found in each name are collected
    and the pair is flagged whenever the two collections differ.
    """
    if "hooome" not in (left.brand_tokens & right.brand_tokens):
        return False

    combined = f"{left.searchable_name} {right.searchable_name}"
    if all(term not in combined for term in ("香氛蠟燭暖燈", "蠟燭暖燈", "融蠟燈")):
        return False

    concrete_design_terms = ("大理石", "雲石", "原木", "半圓罩", "陶瓷", "玻璃", "水晶", "金屬", "鐵藝")

    def designs_in(name: str) -> set[str]:
        return {term for term in concrete_design_terms if term in name}

    # Unequal sets imply at least one is non-empty, so inequality is enough.
    return designs_in(left.searchable_name) != designs_in(right.searchable_name)
|
||
|
||
|
||
def _standalone_size_letter_tokens(identity: ProductIdentity) -> set[str]:
    """Collect lowercase ``s`` / ``m`` / ``l`` letters that stand alone in the name.

    A letter counts only when it is neither preceded nor followed by an ASCII
    letter or digit (case-insensitive).
    """
    lone_size_letter = re.compile(r"(?<![a-z0-9])([sml])(?![a-z0-9])", re.I)
    return {letter.lower() for letter in lone_size_letter.findall(identity.searchable_name)}
|
||
|
||
|
||
def _has_wax_lamp_size_letter_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag wax-lamp listings whose standalone size letters do not overlap.

    Applies only when the combined names mention a wax-lamp term. Both sides
    must carry at least one standalone size letter and share none of them.
    """
    combined = f"{left.searchable_name} {right.searchable_name}"
    if all(term not in combined for term in ("香氛蠟燭暖燈", "蠟燭暖燈", "融蠟燈")):
        return False

    sizes_left = _standalone_size_letter_tokens(left)
    sizes_right = _standalone_size_letter_tokens(right)
    if not sizes_left or not sizes_right:
        return False
    return sizes_left.isdisjoint(sizes_right)
|
||
|
||
|
||
def _has_nitori_diffuser_model_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag ``nitori`` aroma-diffuser listings that carry different model tokens.

    Requires a shared ``nitori`` / ``宜得利家居`` brand token and ``香氛噴霧器`` in
    both names. Model tokens for each side are the union of
    ``_extract_model_tokens``, standalone 3-5 digit numbers, and compact
    letter+digit codes not rejected by ``_is_spec_like_latin_token``. A
    conflict needs tokens on both sides with no overlap.
    """
    common_brand = left.brand_tokens & right.brand_tokens
    if not ({"nitori", "宜得利家居"} & common_brand):
        return False
    if not all("香氛噴霧器" in item.searchable_name for item in (left, right)):
        return False

    def collect_models(identity: ProductIdentity) -> set[str]:
        name = identity.searchable_name
        digits_only = set(re.findall(r"(?<![a-z0-9])\d{3,5}(?![a-z0-9])", name))
        compact = set()
        for found in re.finditer(r"(?<![a-z0-9])([a-z]{1,4}\d{2,}[a-z0-9-]*)(?![a-z0-9])", name, re.I):
            code = found.group(1).lower()
            if not _is_spec_like_latin_token(code):
                compact.add(code)
        return _extract_model_tokens(name) | digits_only | compact

    models_left = collect_models(left)
    models_right = collect_models(right)
    return bool(models_left) and bool(models_right) and models_left.isdisjoint(models_right)
|
||
|
||
|
||
def _makeup_shade_tokens(identity: ProductIdentity) -> set[str]:
    """Collect shade-identifying tokens for a makeup listing.

    Starts from ``_explicit_variant_option_tokens`` and adds:

    * the number and the English shade word of patterns such as
      ``21 ivory`` (optionally prefixed by ``#``, ``no.``, ``色號`` or ``號色``),
      with spaces inside the shade word replaced by underscores;
    * short alphanumeric codes (optional letter, 1-3 digits, optional letter)
      that are directly followed by at least two CJK characters, excluding
      codes that look like a quantity with a unit.
    """
    name = identity.searchable_name
    shades = set(_explicit_variant_option_tokens(identity))

    numbered_shade = re.compile(
        r"(?<![a-z0-9])(?:#|no\.?|色號|號色)?\s*(\d{1,3})\s+"
        r"(rosy ivory|ivory|beige|sand|fair|light|medium|porcelain|rose)(?![a-z0-9])",
        re.I,
    )
    for number, shade_word in numbered_shade.findall(name):
        shades.update((number.lower(), shade_word.lower().replace(" ", "_")))

    code_before_cjk = re.compile(r"(?<![a-z0-9])([a-z]?\d{1,3}[a-z]?)(?=\s*[\u4e00-\u9fff]{2,})", re.I)
    for raw_code in code_before_cjk.findall(name):
        code = re.sub(r"[^a-z0-9]", "", raw_code.lower())
        looks_like_quantity = re.fullmatch(r"\d+(?:g|m|l|ml|mg)", code) is not None
        if code and not looks_like_quantity:
            shades.add(code)
    return shades
|
||
|
||
|
||
def _has_makeup_shade_selection_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag makeup pairs where exactly one listing specifies a shade.

    Applies only when the combined names mention one of the shade-sensitive
    makeup categories; the result is True when one side yields shade tokens
    and the other yields none.
    """
    combined = f"{left.searchable_name} {right.searchable_name}"
    shade_sensitive_terms = (
        "氣墊粉霜", "粉底", "粉霜", "蜜粉", "唇釉", "唇膏", "唇蜜",
        "染眉膏", "眉筆", "眉膏", "眉彩", "眼線", "遮瑕",
    )
    if all(term not in combined for term in shade_sensitive_terms):
        return False

    left_has_shade = bool(_makeup_shade_tokens(left))
    right_has_shade = bool(_makeup_shade_tokens(right))
    return left_has_shade != right_has_shade
|
||
|
||
|
||
def _commercial_condition_terms(identity: ProductIdentity) -> set[str]:
    """Classify sale-condition wording found in ``normalized_name``.

    Returns any of ``expiry_sensitive``, ``box_damage`` and
    ``clearance_condition`` whose keywords occur in the name.
    """
    name = identity.normalized_name
    condition_keywords = (
        ("expiry_sensitive", ("即期品", "臨期", "短效", "短效期", "效期", "保存期限", "有效期限")),
        ("box_damage", ("盒損", "盒損品", "外盒損", "外盒瑕疵")),
        ("clearance_condition", ("福利品", "瑕疵品")),
    )
    return {
        label
        for label, keywords in condition_keywords
        if any(keyword in name for keyword in keywords)
    }
|
||
|
||
|
||
def _has_commercial_condition_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Return True when the two listings carry different sale-condition labels."""
    conditions_left = _commercial_condition_terms(left)
    conditions_right = _commercial_condition_terms(right)
    # Differing sets cannot both be empty, so no separate emptiness test is needed.
    return conditions_left != conditions_right
|
||
|
||
|
||
def _has_relove_private_cleanser_variant_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag pairs where only one listing mentions a brightening term.

    Applies only to pairs accepted by ``_is_relove_cleanser_gel_like``.
    """
    if not _is_relove_cleanser_gel_like(left, right):
        return False

    brightening_terms = ("傳明酸", "淨白", "美白", "亮白", "菸鹼醯胺", "niacinamide")
    left_flag, right_flag = (
        any(term in item.searchable_name for term in brightening_terms)
        for item in (left, right)
    )
    return left_flag != right_flag
|
||
|
||
|
||
def _has_makeup_catalog_selection_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag shade-sensitive makeup pairs involving a catalog-style listing.

    Applies when the combined names mention a shade-sensitive makeup category
    and at least one side is a catalog or delimited variant listing. The gap
    is waived only when both sides yield shade tokens that
    ``_variant_options_overlap`` accepts as overlapping.
    """
    sensitive_terms = (
        "遮瑕蜜",
        "遮瑕",
        "粉底",
        "粉霜",
        "氣墊",
        "蜜粉",
        "腮紅",
        "眼線",
        "眉筆",
        "染眉膏",
        "唇膏",
        "唇釉",
        "唇蜜",
    )
    combined = f"{left.searchable_name} {right.searchable_name}"
    if all(term not in combined for term in sensitive_terms):
        return False
    if not (_is_catalog_or_delimited_variant_listing(left) or _is_catalog_or_delimited_variant_listing(right)):
        return False

    shades_left = _makeup_shade_tokens(left)
    shades_right = _makeup_shade_tokens(right)
    shades_agree = shades_left and shades_right and _variant_options_overlap(shades_left, shades_right)
    return not shades_agree
|
||
|
||
|
||
def _is_candle_scent_catalog_listing(identity: ProductIdentity) -> bool:
    """Return True for listings that offer a choice of candle scents.

    A listing qualifies when ``_is_multi_variant_catalog_listing`` accepts it
    or when its name contains a count followed by ``種`` and a scent word
    (``香味`` / ``香氣`` / ``味道``).
    """
    name = identity.searchable_name
    if _is_multi_variant_catalog_listing(identity):
        return True
    scent_count = re.compile(r"\d+\s*種(?:香味|香氣|味道)")
    return scent_count.search(name) is not None
|
||
|
||
|
||
def _has_candle_catalog_selection_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag candle pairs where only one listing is a scent catalog.

    Applies when the combined names mention a candle term and none of the
    wax-lamp terms (``融蠟燈`` / ``融燭燈`` / ``蠟燭燈``).
    """
    combined = f"{left.searchable_name} {right.searchable_name}"
    if all(term not in combined for term in ("香氛蠟燭", "大豆蠟燭", "蠟燭")):
        return False
    if any(lamp in combined for lamp in ("融蠟燈", "融燭燈", "蠟燭燈")):
        return False

    left_is_catalog = _is_candle_scent_catalog_listing(left)
    right_is_catalog = _is_candle_scent_catalog_listing(right)
    return left_is_catalog != right_is_catalog
|
||
|
||
|
||
def _has_loreal_serum_variant_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag ``玻尿酸瞬效保濕`` pairs that differ in variant or catalog form.

    Requires a ``loreal`` / ``巴黎萊雅`` brand token on either side and the line
    phrase in the combined names. If neither name carries a variant term the
    pair is not flagged. Otherwise it is flagged when the variant terms
    differ, or when only one side is a catalog or delimited variant listing.
    """
    combined = f"{left.searchable_name} {right.searchable_name}"
    either_brand = left.brand_tokens | right.brand_tokens
    if not ({"loreal", "巴黎萊雅"} & either_brand):
        return False
    if "玻尿酸瞬效保濕" not in combined:
        return False

    variant_terms = ("啵啵精華水", "液態紫熨斗", "水光精華", "修護晶露", "保濕水光")
    found_left = {term for term in variant_terms if term in left.searchable_name}
    found_right = {term for term in variant_terms if term in right.searchable_name}
    if not found_left and not found_right:
        return False
    if found_left != found_right:
        return True
    return _is_catalog_or_delimited_variant_listing(left) != _is_catalog_or_delimited_variant_listing(right)
|
||
|
||
|
||
def _has_sebamed_shampoo_variant_catalog_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag shampoo pairs that differ in catalog form or variant wording.

    Requires a ``sebamed`` / ``施巴`` brand token on either side and ``洗髮乳`` in
    both names. The pair is flagged when only one side is a catalog or
    delimited variant listing, or when the variant terms found in the two
    names differ.
    """
    either_brand = left.brand_tokens | right.brand_tokens
    if not ({"sebamed", "施巴"} & either_brand):
        return False
    if not all("洗髮乳" in item.searchable_name for item in (left, right)):
        return False

    variant_terms = ("溫和", "油性抗屑", "抗屑", "乾性", "敏感")
    found_left = {term for term in variant_terms if term in left.searchable_name}
    found_right = {term for term in variant_terms if term in right.searchable_name}

    if _is_catalog_or_delimited_variant_listing(left) != _is_catalog_or_delimited_variant_listing(right):
        return True
    # Unequal sets imply at least one side named a variant.
    return found_left != found_right
|
||
|
||
|
||
def _has_schick_2in1_model_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag razor pairs where only one listing is the "2 in 1" model.

    Requires a shared ``schick`` / ``舒適牌`` brand token and both ``舒綺`` and
    ``美型刀`` somewhere in the combined names.

    Returns:
        True when exactly one of the two names contains a "2 in 1" marker
        (``2in1``, ``2-in-1``, ``2 in 1`` or ``2合1``), otherwise False.
    """
    if not ({"schick", "舒適牌"} & (left.brand_tokens & right.brand_tokens)):
        return False

    pair_text = f"{left.searchable_name} {right.searchable_name}"
    if "舒綺" not in pair_text or "美型刀" not in pair_text:
        return False

    # The connector is mandatory. With an optional connector, any bare "21"
    # (for example inside "2021" or "21cm") was misread as a "2in1" marker.
    two_in_one = re.compile(r"2\s*(?:-?in-?|合)\s*1", re.I)
    left_2in1 = two_in_one.search(left.searchable_name) is not None
    right_2in1 = two_in_one.search(right.searchable_name) is not None
    return left_2in1 != right_2in1
|
||
|
||
|
||
def _has_taicend_protection_form_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag skin-protection listings that name different product forms.

    Requires a shared ``taicend`` / ``泰陞`` brand token and a protective
    film/spray term in the combined names. Pairs where both names contain
    ``屁屁噴`` are never flagged. Otherwise the form terms found in each name are
    compared and any difference is a gap.
    """
    common_brand = left.brand_tokens & right.brand_tokens
    if not ({"taicend", "泰陞"} & common_brand):
        return False

    combined = f"{left.searchable_name} {right.searchable_name}"
    if all(term not in combined for term in ("保護膜", "保護噴霧", "液態皮膚保護膜")):
        return False
    if all("屁屁噴" in item.searchable_name for item in (left, right)):
        return False

    form_terms = ("寶貝液體保護膜", "液態皮膚保護膜", "皮膚保護噴霧", "保護噴霧")

    def forms_in(name: str) -> set[str]:
        return {term for term in form_terms if term in name}

    # Unequal sets imply at least one side named a form.
    return forms_in(left.searchable_name) != forms_in(right.searchable_name)
|
||
|
||
|
||
def _has_catalog_specific_variant_selection_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag variant-sensitive pairs where only one listing is a catalog.

    Applies when the combined names mention any of the variant-sensitive
    categories below; the result is True when exactly one side is a catalog
    or delimited variant listing.
    """
    variant_sensitive_categories = (
        "身體去角質",
        "美體主張",
        "私密潔浴露",
        "私密潔浴",
        "私密防護慕絲",
        "私密慕絲",
        "慕絲",
        "嬰兒潤膚乳",
        "定妝噴霧",
        "染眉膏",
        "眼線膠筆",
        "粉餅盒",
        "遮瑕蜜",
        "護手霜",
        "護唇膏",
        "護唇棒",
        "唇釉",
        "唇膏",
        "蜜粉",
        "防曬素顏霜",
        "車用香氛",
        "車用擴香",
        "車用擴香蕊",
        "香氛擴香罐",
        "擴香瓶",
        "擴香罐",
        "擴香蕊",
        "水性指甲油",
        "指甲油",
        "足膜",
        "泡澡入浴劑",
        "入浴劑",
        "融蠟小夜燈",
        "融蠟燈",
        "滋養霜",
    )
    combined = f"{left.searchable_name} {right.searchable_name}"
    if all(category not in combined for category in variant_sensitive_categories):
        return False

    left_is_catalog = _is_catalog_or_delimited_variant_listing(left)
    right_is_catalog = _is_catalog_or_delimited_variant_listing(right)
    return left_is_catalog != right_is_catalog
|
||
|
||
|
||
def _has_bath_additive_variant_gap(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Flag same-brand bath-additive listings that name different variants.

    Applies when the combined names mention a bath-additive term and the
    listings share a brand token. A gap requires each name to contain at
    least one variant marker with none in common.
    """
    combined = f"{left.searchable_name} {right.searchable_name}"
    if all(term not in combined for term in ("入浴劑", "泡澡錠", "泡澡包", "泡澡")):
        return False
    if not (left.brand_tokens & right.brand_tokens):
        return False

    variant_markers = ("馨香", "懷舊", "橘盒", "綠盒", "粉盒", "藍盒")
    found_left = {marker for marker in variant_markers if marker in left.searchable_name}
    found_right = {marker for marker in variant_markers if marker in right.searchable_name}
    return bool(found_left) and bool(found_right) and found_left.isdisjoint(found_right)
|
||
|
||
|
||
def _has_taicend_baby_spray_equivalence(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Recognise two ``屁屁噴`` listings of the same brand as equivalent.

    Each side must carry a ``taicend`` / ``泰陞`` brand token and mention
    ``屁屁噴``; the final verdict is that of ``_has_overlapping_base_spec``.
    """
    accepted_brands = {"taicend", "泰陞"}
    for item in (left, right):
        if not (item.brand_tokens & accepted_brands):
            return False
    for item in (left, right):
        if "屁屁噴" not in item.searchable_name:
            return False
    return _has_overlapping_base_spec(left, right)
|
||
|
||
|
||
def _has_seche_vite_top_coat_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Recognise two ``快乾亮油`` listings carrying the ``seche`` / ``vite`` tokens.

    Between them the listings must carry both brand tokens, each listing must
    carry at least one, and both names must contain ``快乾亮油``.
    """
    wanted = {"seche", "vite"}
    if not wanted.issubset(left.brand_tokens | right.brand_tokens):
        return False
    if not (left.brand_tokens & wanted) or not (right.brand_tokens & wanted):
        return False
    return "快乾亮油" in left.searchable_name and "快乾亮油" in right.searchable_name
|
||
|
||
|
||
def _has_xiaomi_s101_shaver_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Recognise two listings of the ``s101`` electric shaver.

    Each side must carry one of the brand tokens ``xiaomi`` / ``小米`` /
    ``小米有品`` and its name must contain both ``s101`` and ``電動刮鬍刀``.
    """
    accepted_brands = {"xiaomi", "小米", "小米有品"}
    if not all(item.brand_tokens & accepted_brands for item in (left, right)):
        return False
    for required in ("s101", "電動刮鬍刀"):
        if required not in left.searchable_name or required not in right.searchable_name:
            return False
    return True
|
||
|
||
|
||
def _has_hinoki_roller_oil_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Recognise two ``檜山坊`` roller-bottle ``檜木精油`` listings.

    Both sides must carry the ``檜山坊`` brand token and mention both ``檜木精油``
    and ``滾珠瓶``; the final verdict is that of ``_has_overlapping_base_spec``.
    """
    if "檜山坊" not in left.brand_tokens or "檜山坊" not in right.brand_tokens:
        return False
    for required in ("檜木精油", "滾珠瓶"):
        if required not in left.searchable_name or required not in right.searchable_name:
            return False
    return _has_overlapping_base_spec(left, right)
|
||
|
||
|
||
def _has_brush_baby_wildones_toothbrush_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Recognise two listings of the same ``0-10y`` electric toothbrush.

    Each side must carry all of the brand tokens ``brush``, ``baby`` and
    ``wildones`` and its name must contain both ``電動牙刷`` and ``0-10y``.
    """
    wanted = {"brush", "baby", "wildones"}
    if not (wanted.issubset(left.brand_tokens) and wanted.issubset(right.brand_tokens)):
        return False
    for required in ("電動牙刷", "0-10y"):
        if required not in left.searchable_name or required not in right.searchable_name:
            return False
    return True
|
||
|
||
|
||
def _has_pshine_beauty_foot_file_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Recognise two listings of the same double-sided foot file.

    Each side must carry all of the brand tokens ``beauty``, ``shine`` and
    ``foot``, and its name must contain ``雙面``, ``足``, one of ``硬皮`` /
    ``去角質`` and one of ``磨砂棒`` / ``足搓棒``.
    """
    wanted = {"beauty", "shine", "foot"}
    if not (wanted.issubset(left.brand_tokens) and wanted.issubset(right.brand_tokens)):
        return False

    # Each inner tuple lists alternatives; one of them must occur in both names.
    required_alternatives = (
        ("雙面",),
        ("足",),
        ("硬皮", "去角質"),
        ("磨砂棒", "足搓棒"),
    )
    names = (left.searchable_name, right.searchable_name)
    return all(
        any(option in name for option in alternatives)
        for alternatives in required_alternatives
        for name in names
    )
|
||
|
||
|
||
def _has_baan_baby_lip_catalog_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Recognise two ``嬰兒修護唇膏`` catalog listings offering the same flavours.

    Each side must carry a ``baan`` / ``貝恩`` brand token, mention
    ``嬰兒修護唇膏``, and list both ``原味`` and ``草莓`` among its explicit variant
    options.
    """
    accepted_brands = {"baan", "貝恩"}
    flavours = {"原味", "草莓"}
    options_left = _explicit_variant_option_tokens(left)
    options_right = _explicit_variant_option_tokens(right)

    if not (left.brand_tokens & accepted_brands) or not (right.brand_tokens & accepted_brands):
        return False
    if "嬰兒修護唇膏" not in left.searchable_name or "嬰兒修護唇膏" not in right.searchable_name:
        return False
    return flavours <= options_left and flavours <= options_right
|
||
|
||
|
||
def _has_recipe_box_child_sunscreen_cushion_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Recognise two ``兒童防曬氣墊粉餅`` listings carrying the ``recipe`` / ``box`` tokens.

    The two brand tokens may come from either listing (their union is
    checked); both names must contain ``兒童防曬氣墊粉餅``.
    """
    either_brand = left.brand_tokens | right.brand_tokens
    if not {"recipe", "box"}.issubset(either_brand):
        return False
    product_phrase = "兒童防曬氣墊粉餅"
    return product_phrase in left.searchable_name and product_phrase in right.searchable_name
|
||
|
||
|
||
def _has_pavaruni_40_scent_oil_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Recognise two ``pavaruni`` 40-scent essential-oil catalog listings.

    Requires the shared brand token ``pavaruni``, ``天然植物`` somewhere in the
    combined names, ``精油`` in both names, a volume close to 10 on both sides
    (per ``_has_shared_volume``), and a ``40香味`` / ``40種香味`` marker in both
    names.
    """
    name_left = left.searchable_name
    name_right = right.searchable_name

    if "pavaruni" not in (left.brand_tokens & right.brand_tokens):
        return False
    if "天然植物" not in f"{name_left} {name_right}":
        return False
    if "精油" not in name_left or "精油" not in name_right:
        return False
    if not _has_shared_volume(left, right, 10):
        return False

    def lists_forty_scents(name: str) -> bool:
        return "40香味" in name or "40種香味" in name

    return lists_forty_scents(name_left) and lists_forty_scents(name_right)
|
||
|
||
|
||
def _has_pavaruni_20_scent_candle_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Recognise two ``pavaruni`` 20-scent candle catalog listings.

    Requires the shared brand token ``pavaruni``, ``香氛蠟燭`` in both names, a
    weight close to 450 on both sides (per ``_has_shared_weight``), and a
    ``20香味`` / ``20種香味`` marker in both names.
    """
    name_left = left.searchable_name
    name_right = right.searchable_name

    if "pavaruni" not in (left.brand_tokens & right.brand_tokens):
        return False
    if "香氛蠟燭" not in name_left or "香氛蠟燭" not in name_right:
        return False
    if not _has_shared_weight(left, right, 450):
        return False

    def lists_twenty_scents(name: str) -> bool:
        return "20香味" in name or "20種香味" in name

    return lists_twenty_scents(name_left) and lists_twenty_scents(name_right)
|
||
|
||
|
||
def _has_laundrin_tokyo_car_freshener_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Recognise two single-pack ``tokyo`` car air-freshener listings.

    Requires a shared ``laundrin`` / ``朗德林`` brand token, the terms ``tokyo``,
    ``車用`` and ``芳香劑`` in both names, and a ``(1, "入")`` count on both sides.

    Returns:
        A real ``bool`` in every case. Using the raw set intersection as the
        first operand of an ``and`` chain would return an empty ``set`` on a
        brand mismatch, which is not the annotated return type.
    """
    left_text = left.searchable_name
    right_text = right.searchable_name

    shared_brand = {"laundrin", "朗德林"} & (left.brand_tokens & right.brand_tokens)
    if not shared_brand:
        return False

    for required in ("tokyo", "車用", "芳香劑"):
        if required not in left_text or required not in right_text:
            return False
    return _has_shared_count(left, right, 1, "入")
|
||
|
||
|
||
def _has_shared_count(left: ProductIdentity, right: ProductIdentity, count: int, unit: str) -> bool:
    """Return True when both listings contain the exact ``(count, unit)`` pair in ``counts``."""
    wanted = (count, unit)
    return all(wanted in set(item.counts) for item in (left, right))
|
||
|
||
|
||
def _has_shared_volume(left: ProductIdentity, right: ProductIdentity, volume_ml: float) -> bool:
    """Return True when each listing has a ``volumes_ml`` entry that ``_close_number`` matches to ``volume_ml``."""
    for item in (left, right):
        if not any(_close_number(candidate, volume_ml) for candidate in item.volumes_ml):
            return False
    return True
|
||
|
||
|
||
def _has_shared_weight(left: ProductIdentity, right: ProductIdentity, weight_g: float) -> bool:
    """Return True when each listing has a ``weights_g`` entry that ``_close_number`` matches to ``weight_g``."""
    for item in (left, right):
        if not any(_close_number(candidate, weight_g) for candidate in item.weights_g):
            return False
    return True
|
||
|
||
|
||
def _has_focused_low_score_exact_identity_line(left: ProductIdentity, right: ProductIdentity) -> str:
|
||
left_text = left.searchable_name
|
||
right_text = right.searchable_name
|
||
pair_text = f"{left_text} {right_text}"
|
||
left_raw = left.original_name.lower()
|
||
right_raw = right.original_name.lower()
|
||
brand_tokens = left.brand_tokens | right.brand_tokens
|
||
|
||
if (
|
||
"biodance" in (left.brand_tokens & right.brand_tokens)
|
||
and "深層全效面膜" in left_text
|
||
and "深層全效面膜" in right_text
|
||
and "膠原蛋白" in pair_text
|
||
and _has_shared_count(left, right, 4, "片")
|
||
):
|
||
return "biodance_deep_mask"
|
||
if (
|
||
{"muji", "無印良品"} & brand_tokens
|
||
and "精油芬香護手霜" in left_text
|
||
and "精油芬香護手霜" in right_text
|
||
and _has_shared_weight(left, right, 50)
|
||
and bool(left.brand_tokens) != bool(right.brand_tokens)
|
||
):
|
||
return "muji_aroma_hand_cream_brandless"
|
||
if (
|
||
{"herbacin", "德國小甘菊"} & brand_tokens
|
||
and "小甘菊" in left_text
|
||
and "小甘菊" in right_text
|
||
and "護手霜" in left_text
|
||
and "護手霜" in right_text
|
||
and _has_shared_volume(left, right, 20)
|
||
and bool(left.brand_tokens) != bool(right.brand_tokens)
|
||
):
|
||
return "herbacin_classic_hand_cream_20ml_brandless"
|
||
if (
|
||
{"sab", "初淨肌"} & (left.brand_tokens & right.brand_tokens)
|
||
and "私密防護舒緩噴霧" in left_text
|
||
and "私密防護舒緩噴霧" in right_text
|
||
and _has_shared_volume(left, right, 30)
|
||
):
|
||
return "sab_private_spray"
|
||
if (
|
||
"lush" in (left.brand_tokens & right.brand_tokens)
|
||
and "櫻之花身體噴霧" in left_text
|
||
and "櫻之花身體噴霧" in right_text
|
||
and _has_shared_volume(left, right, 200)
|
||
):
|
||
return "lush_sakura_body_spray"
|
||
if (
|
||
{"kanebo", "佳麗寶"} & (left.brand_tokens & right.brand_tokens)
|
||
and "coffret" in left_text
|
||
and "coffret" in right_text
|
||
and "光透立體眼線筆" in left_text
|
||
and "光透立體眼線筆" in right_text
|
||
):
|
||
return "kanebo_coffret_eyeliner"
|
||
if (
|
||
"artmis" in brand_tokens
|
||
and "葳兒柔" in left_text
|
||
and "葳兒柔" in right_text
|
||
and "賦活凝膠" in left_text
|
||
and "賦活凝膠" in right_text
|
||
and _has_shared_volume(left, right, 35)
|
||
):
|
||
return "artmis_virile_gel"
|
||
if (
|
||
"artmis" in brand_tokens
|
||
and "私密清潔慕斯" in left_text
|
||
and "私密清潔慕斯" in right_text
|
||
and "金縷梅" in left_text
|
||
and "金縷梅" in right_text
|
||
and _has_shared_volume(left, right, 250)
|
||
):
|
||
return "artmis_witch_hazel_private_mousse_250ml"
|
||
if (
|
||
"artmis" in brand_tokens
|
||
and "私密清潔慕斯" in left_text
|
||
and "私密清潔慕斯" in right_text
|
||
and "蔓越莓" in left_text
|
||
and "蔓越莓" in right_text
|
||
and _has_shared_volume(left, right, 250)
|
||
):
|
||
return "artmis_cranberry_private_mousse_250ml"
|
||
if (
|
||
"powerman" in pair_text
|
||
and "男性私密養護液" in left_text
|
||
and "男性私密養護液" in right_text
|
||
and _has_shared_volume(left, right, 30)
|
||
):
|
||
return "playjoy_powerman_male_care_30ml"
|
||
if (
|
||
{"physiogel", "潔美淨"} & (left.brand_tokens & right.brand_tokens)
|
||
and "ai冰鎮精華露" in left_text
|
||
and "ai冰鎮精華露" in right_text
|
||
and _has_shared_volume(left, right, 200)
|
||
and _has_exact_count_alignment(left, right)
|
||
):
|
||
return "physiogel_ai_ice_essence_200ml_2pack"
|
||
if (
|
||
{"ts6", "護一生"} & (left.brand_tokens & right.brand_tokens)
|
||
and "緊彈水嫩凝膠" in left_text
|
||
and "緊彈水嫩凝膠" in right_text
|
||
and _has_shared_weight(left, right, 40)
|
||
):
|
||
return "ts6_private_elastic_gel_40g"
|
||
if (
|
||
{"ts6", "護一生"} & (left.brand_tokens & right.brand_tokens)
|
||
and "超美" in left_text
|
||
and "超美" in right_text
|
||
and "香氛誘霜" in left_text
|
||
and "香氛誘霜" in right_text
|
||
and (120.0 in set(left.weights_g) or 120.0 in set(left.volumes_ml))
|
||
and (120.0 in set(right.weights_g) or 120.0 in set(right.volumes_ml))
|
||
):
|
||
return "ts6_private_white_fragrance_cream_120"
|
||
if (
|
||
{"ts6", "護一生"} & (left.brand_tokens & right.brand_tokens)
|
||
and "淨白植感慕斯" in left_text
|
||
and "淨白植感慕斯" in right_text
|
||
and _has_shared_weight(left, right, 180)
|
||
and _has_exact_count_alignment(left, right)
|
||
):
|
||
return "ts6_white_mousse_180g_3pack"
|
||
if (
|
||
{"ts6", "護一生"} & (left.brand_tokens & right.brand_tokens)
|
||
and "沁涼潔淨慕斯" in left_text
|
||
and "沁涼潔淨慕斯" in right_text
|
||
and _has_shared_weight(left, right, 100)
|
||
and _has_exact_count_alignment(left, right)
|
||
):
|
||
return "ts6_cooling_clean_mousse_100g"
|
||
if (
|
||
{"ts6", "護一生"} & (left.brand_tokens & right.brand_tokens)
|
||
and "蜜愛潤滑液" in left_text
|
||
and "蜜愛潤滑液" in right_text
|
||
and _has_shared_weight(left, right, 100)
|
||
and _has_exact_count_alignment(left, right)
|
||
):
|
||
return "ts6_lubricant_100g_3pack"
|
||
if (
|
||
{"ts6", "護一生"} & (left.brand_tokens & right.brand_tokens)
|
||
and "蜜桃煥白凝膠" in left_text
|
||
and "蜜桃煥白凝膠" in right_text
|
||
and _has_shared_weight(left, right, 45)
|
||
and _has_exact_count_alignment(left, right)
|
||
):
|
||
return "ts6_peach_bright_gel_45g_3pack"
|
||
if (
|
||
{"ts6", "護一生"} & (left.brand_tokens & right.brand_tokens)
|
||
and "極淨白私密潔膚露" in left_text
|
||
and "極淨白私密潔膚露" in right_text
|
||
and "蜜桃煥白凝膠" in left_text
|
||
and "蜜桃煥白凝膠" in right_text
|
||
and _has_shared_weight(left, right, 250)
|
||
and _has_shared_weight(left, right, 45)
|
||
):
|
||
return "ts6_white_wash_peach_gel_kit"
|
||
if (
|
||
{"beauty", "foot"} <= (left.brand_tokens & right.brand_tokens)
|
||
and "足膜" in left_text
|
||
and "足膜" in right_text
|
||
and any(_has_shared_volume(left, right, volume) for volume in (25, 30))
|
||
and _has_exact_count_alignment(left, right)
|
||
and not _is_multi_variant_catalog_listing(left)
|
||
and not _is_multi_variant_catalog_listing(right)
|
||
):
|
||
return "beauty_foot_mask_exact_pack"
|
||
if (
|
||
{"kameria", "凱蜜菈"} & (left.brand_tokens & right.brand_tokens)
|
||
and "足足稱奇" in left_text
|
||
and "足足稱奇" in right_text
|
||
and "積雪草" in left_text
|
||
and "積雪草" in right_text
|
||
and "足膜" in left_text
|
||
and "足膜" in right_text
|
||
and _has_shared_volume(left, right, 17)
|
||
and _has_exact_count_alignment(left, right)
|
||
and not _is_multi_variant_catalog_listing(left)
|
||
and not _is_multi_variant_catalog_listing(right)
|
||
):
|
||
return "kameria_centella_foot_mask_17ml_2pc"
|
||
if (
|
||
{"vaseline", "凡士林"} & (left.brand_tokens & right.brand_tokens)
|
||
and "嬰兒高純修護凝膠" in left_text
|
||
and "嬰兒高純修護凝膠" in right_text
|
||
and _has_shared_weight(left, right, 368)
|
||
and _has_exact_count_alignment(left, right)
|
||
):
|
||
return "vaseline_baby_jelly_368g_3pack"
|
||
if (
|
||
{"derma", "丹麥德瑪"} & (left.brand_tokens & right.brand_tokens)
|
||
and "寶寶" in left_text
|
||
and "寶寶" in right_text
|
||
and "洗髮沐浴露" in left_text
|
||
and "洗髮沐浴露" in right_text
|
||
and _has_shared_volume(left, right, 150)
|
||
):
|
||
return "derma_baby_wash_150ml"
|
||
if (
|
||
{"derma", "丹麥德瑪"} & (left.brand_tokens & right.brand_tokens)
|
||
and "寶寶" in left_text
|
||
and "寶寶" in right_text
|
||
and "洗髮沐浴露" in left_text
|
||
and "洗髮沐浴露" in right_text
|
||
and _has_shared_volume(left, right, 500)
|
||
):
|
||
return "derma_baby_wash_500ml"
|
||
if (
|
||
{"clarins", "克蘭詩"} & (left.brand_tokens & right.brand_tokens)
|
||
and "黃金亮眼萃" in left_text
|
||
and "黃金亮眼萃" in right_text
|
||
and _has_shared_volume(left, right, 20)
|
||
):
|
||
return "clarins_double_serum_eye_20ml"
|
||
if (
|
||
{"cetaphil", "舒特膚"} & (left.brand_tokens & right.brand_tokens)
|
||
and "長效潤膚乳" in left_text
|
||
and "長效潤膚乳" in right_text
|
||
and _has_shared_volume(left, right, 237)
|
||
):
|
||
return "cetaphil_long_lotion_237ml"
|
||
if (
|
||
{"cetaphil", "舒特膚"} & (left.brand_tokens & right.brand_tokens)
|
||
and "長效潤膚乳" in left_text
|
||
and "長效潤膚乳" in right_text
|
||
and _has_shared_volume(left, right, 473)
|
||
):
|
||
return "cetaphil_long_lotion_473ml"
|
||
if (
|
||
{"cetaphil", "舒特膚"} & (left.brand_tokens & right.brand_tokens)
|
||
and "長效潤膚霜" in left_text
|
||
and "長效潤膚霜" in right_text
|
||
and _has_shared_weight(left, right, 250)
|
||
):
|
||
return "cetaphil_long_moisturizing_cream_250g"
|
||
if (
|
||
{"cetaphil", "舒特膚"} & (left.brand_tokens & right.brand_tokens)
|
||
and "益膚康修護舒敏乳霜" in left_text
|
||
and "益膚康修護舒敏乳霜" in right_text
|
||
and _has_shared_weight(left, right, 227)
|
||
):
|
||
return "cetaphil_ad_repair_cream_227g"
|
||
if (
|
||
{"nivea", "妮維雅"} & (left.brand_tokens & right.brand_tokens)
|
||
and "妮維雅霜" in left_text
|
||
and "妮維雅霜" in right_text
|
||
and "隨身版" in left_text
|
||
and "隨身版" in right_text
|
||
and _has_shared_volume(left, right, 100)
|
||
):
|
||
return "nivea_creme_100ml"
|
||
if (
|
||
"nailmatic" in (left.brand_tokens & right.brand_tokens)
|
||
and "小精靈" in left_text
|
||
and "小精靈" in right_text
|
||
and "指甲油" in left_text
|
||
and "指甲油" in right_text
|
||
):
|
||
return "nailmatic_casper_polish"
|
||
if (
|
||
"小浪" in (left.brand_tokens & right.brand_tokens)
|
||
and "智能感應自動噴香機" in left_text
|
||
and "智能感應自動噴香機" in right_text
|
||
and "補充液" in left_text
|
||
and "補充液" in right_text
|
||
and _has_shared_count(left, right, 3, "入")
|
||
):
|
||
return "xiaolang_spray_machine_refill_set"
|
||
if (
|
||
{"yunmi", "j10"} <= (left.brand_tokens & right.brand_tokens)
|
||
and "濕度數顯智能加濕器" in left_text
|
||
and "濕度數顯智能加濕器" in right_text
|
||
):
|
||
return "yunmi_j10_humidifier"
|
||
if (
|
||
"aquiesse" in (left.brand_tokens & right.brand_tokens)
|
||
and "香氛蠟燭" in left_text
|
||
and "香氛蠟燭" in right_text
|
||
and "5oz" in left_text
|
||
and "5oz" in right_text
|
||
and _is_multi_variant_catalog_listing(left)
|
||
and _is_multi_variant_catalog_listing(right)
|
||
):
|
||
return "aquiesse_5oz_candle_catalog"
|
||
if (
|
||
{"rejuran", "麗珠蘭"} & (left.brand_tokens & right.brand_tokens)
|
||
and "麗駐蘭修復舒緩面膜" in left_text
|
||
and "麗駐蘭修復舒緩面膜" in right_text
|
||
and "5p" in left_text
|
||
and "5p" in right_text
|
||
):
|
||
return "rejuran_repair_mask_5p"
|
||
if (
|
||
{"shiseido", "資生堂"} & (left.brand_tokens & right.brand_tokens)
|
||
and "新艷陽" in left_text
|
||
and "新艷陽" in right_text
|
||
and "水離子熱防禦" in left_text
|
||
and "水離子熱防禦" in right_text
|
||
and "隔離露" in left_text
|
||
and "隔離露" in right_text
|
||
):
|
||
return "shiseido_blue_sunscreen"
|
||
if (
|
||
"dhc" in pair_text
|
||
and "純欖護唇膏" in left_text
|
||
and "純欖護唇膏" in right_text
|
||
and _has_shared_weight(left, right, 1.5)
|
||
):
|
||
return "dhc_olive_lip_1_5g"
|
||
if (
|
||
"frudia" in pair_text
|
||
and "蜂蜜藍莓" in left_text
|
||
and "蜂蜜藍莓" in right_text
|
||
and "護唇膏" in left_text
|
||
and "護唇膏" in right_text
|
||
and _has_shared_weight(left, right, 10)
|
||
):
|
||
return "frudia_honey_blueberry_lip_10g"
|
||
if (
|
||
{"sebamed", "施巴"} & brand_tokens
|
||
and "嬰兒護唇膏" in left_text
|
||
and "嬰兒護唇膏" in right_text
|
||
and _has_shared_weight(left, right, 4.8)
|
||
and _has_exact_count_alignment(left, right)
|
||
):
|
||
return "sebamed_baby_lip_4_8g_2pack"
|
||
if (
|
||
"理膚寶水" in pair_text
|
||
and "滋養修護潤唇膏" in left_text
|
||
and "滋養修護潤唇膏" in right_text
|
||
and _has_shared_volume(left, right, 4.7)
|
||
):
|
||
return "laroche_posay_lip_balm_4_7ml"
|
||
if (
|
||
{"baan", "貝恩"} & (left.brand_tokens & right.brand_tokens)
|
||
and "嬰兒修護唇膏" in left_text
|
||
and "嬰兒修護唇膏" in right_text
|
||
and left.product_type == right.product_type == "護唇膏"
|
||
and "原味" in left_text
|
||
and "原味" in right_text
|
||
and "草莓" in left_text
|
||
and "草莓" in right_text
|
||
):
|
||
return "baan_baby_lip_original_strawberry_catalog"
|
||
if (
|
||
{"baan", "貝恩"} & (left.brand_tokens & right.brand_tokens)
|
||
and "嬰兒修護唇膏" in left_text
|
||
and "嬰兒修護唇膏" in right_text
|
||
and left.product_type == right.product_type == "護唇膏"
|
||
):
|
||
return "baan_baby_lip_base_catalog"
|
||
if (
|
||
{"shu uemura", "植村秀"} & (left.brand_tokens & right.brand_tokens)
|
||
and "3d極細防水眼線膠筆" in left_text
|
||
and "3d極細防水眼線膠筆" in right_text
|
||
):
|
||
return "shu_3d_eyeliner"
|
||
if (
|
||
{"ysl", "聖羅蘭"} & (left.brand_tokens & right.brand_tokens)
|
||
and "恆久完美透膚煙染腮紅" in left_text
|
||
and "恆久完美透膚煙染腮紅" in right_text
|
||
):
|
||
return "ysl_blush_catalog"
|
||
if (
|
||
{"hh", "草本新淨界"} & (left.brand_tokens & right.brand_tokens)
|
||
and "私密植萃美白緊緻凝露" in left_text
|
||
and "私密植萃美白緊緻凝露" in right_text
|
||
and _has_shared_volume(left, right, 30)
|
||
):
|
||
return "hh_private_gel"
|
||
if (
|
||
{"lab52", "齒妍堂"} & (left.brand_tokens & right.brand_tokens)
|
||
and "學習刷牙漱口水" in left_text
|
||
and "學習刷牙漱口水" in right_text
|
||
and _has_overlapping_base_spec(left, right)
|
||
):
|
||
return "lab52_mouthwash"
|
||
if (
|
||
{"lab52", "齒妍堂"} & (left.brand_tokens | right.brand_tokens)
|
||
and "牙刷" in left_text
|
||
and "牙刷" in right_text
|
||
and any(term in left_text for term in ("嬰幼兒", "幼兒", "汪汪隊"))
|
||
and any(term in right_text for term in ("嬰幼兒", "幼兒", "汪汪隊"))
|
||
and _has_shared_count(left, right, 2, "入")
|
||
):
|
||
return "lab52_paw_patrol_baby_toothbrush_2pack"
|
||
if (
|
||
"benefit" in (left.brand_tokens & right.brand_tokens)
|
||
and "染唇液" in left_text
|
||
and "染唇液" in right_text
|
||
and "唇頰兩用" in pair_text
|
||
):
|
||
return "benefit_lip_tint"
|
||
if (
|
||
{"schick", "舒適牌"} & (left.brand_tokens & right.brand_tokens)
|
||
and "舒綺" in left_text
|
||
and "舒綺" in right_text
|
||
and "除毛刀片" in left_text
|
||
and "除毛刀片" in right_text
|
||
and "敏感肌" in left_text
|
||
and "敏感肌" in right_text
|
||
and _has_shared_count(left, right, 3, "入")
|
||
):
|
||
return "schick_womens_sensitive_blade_3pack"
|
||
if (
|
||
{"herb24", "草本"} & (left.brand_tokens & right.brand_tokens)
|
||
and "晨霧純精油擴香儀ii" in left_text
|
||
and "晨霧純精油擴香儀ii" in right_text
|
||
and (("霧黑" in left_text and "黑色" in right_text) or ("霧黑" in right_text and "黑色" in left_text))
|
||
):
|
||
return "herb24_mist_diffuser_black"
|
||
if _has_pavaruni_40_scent_oil_alignment(left, right):
|
||
return "pavaruni_40_scent_oil"
|
||
if _has_pavaruni_20_scent_candle_alignment(left, right):
|
||
return "pavaruni_20_scent_candle"
|
||
if _has_laundrin_tokyo_car_freshener_alignment(left, right):
|
||
return "laundrin_tokyo_car_freshener"
|
||
if (
|
||
"好物良品" in (left.brand_tokens & right.brand_tokens)
|
||
and "北歐簡樸融蠟燈桌面氣氛夜燈" in left_text
|
||
and "北歐簡樸融蠟燈桌面氣氛夜燈" in right_text
|
||
):
|
||
return "goodgoods_nordic_wax_lamp"
|
||
if (
|
||
{"derma", "丹麥德瑪"} & (left.brand_tokens & right.brand_tokens)
|
||
and "有機植萃" in left_text
|
||
and "有機植萃" in right_text
|
||
and "護膚油" in left_text
|
||
and "護膚油" in right_text
|
||
and _has_shared_volume(left, right, 150)
|
||
):
|
||
return "derma_eco_skin_oil"
|
||
if (
|
||
{"derma", "丹麥德瑪"} & (left.brand_tokens & right.brand_tokens)
|
||
and "大地" in left_text
|
||
and "大地" in right_text
|
||
and "植萃" in left_text
|
||
and "植萃" in right_text
|
||
and "護膚油" in left_text
|
||
and "護膚油" in right_text
|
||
and _has_exact_count_alignment(left, right)
|
||
):
|
||
return "derma_eco_skin_oil_2pack_review"
|
||
if (
|
||
{"修護保養"} & (left.brand_tokens & right.brand_tokens)
|
||
and "蝸牛特潤修護面膜" in left_text
|
||
and "蝸牛特潤修護面膜" in right_text
|
||
and _has_shared_count(left, right, 6, "片")
|
||
):
|
||
return "w_repair_snail_mask_6pcs_review"
|
||
if (
|
||
{"yuskin", "悠斯晶"} & (left.brand_tokens & right.brand_tokens)
|
||
and "乳霜" in left_text
|
||
and "乳霜" in right_text
|
||
and _has_shared_weight(left, right, 30)
|
||
and _has_exact_count_alignment(left, right)
|
||
):
|
||
return "yuskin_classic_cream_30g_6pack"
|
||
if (
|
||
{"johnson", "johnsons", "嬌生"} & (left.brand_tokens & right.brand_tokens)
|
||
and "嬰兒" in left_text
|
||
and "嬰兒" in right_text
|
||
and "潤膚乳" in left_text
|
||
and "潤膚乳" in right_text
|
||
and _has_shared_volume(left, right, 500)
|
||
and (not left.counts or not right.counts or _has_exact_count_alignment(left, right))
|
||
and any(option in left_text and option in right_text for option in ("牛奶", "純淨", "甜夢", "溫和", "棉柔"))
|
||
):
|
||
return "johnsons_baby_lotion_variant_catalog"
|
||
if (
|
||
{"im meme", "meme"} & (left.brand_tokens & right.brand_tokens)
|
||
and "我愛超磁妝定妝噴霧" in left_text
|
||
and "我愛超磁妝定妝噴霧" in right_text
|
||
and "涼感" in left_text
|
||
and "涼感" in right_text
|
||
):
|
||
return "im_meme_fixx_cool_setting_spray"
|
||
if (
|
||
{"so", "natural", "fixx"} <= (left.brand_tokens & right.brand_tokens)
|
||
and "全天候超完美定妝噴霧" in left_text
|
||
and "全天候超完美定妝噴霧" in right_text
|
||
and _has_shared_volume(left, right, 120)
|
||
and not any(term in pair_text for term in ("經典款", "光澤款", "霧面款", "夏日款", "涼感", "一般"))
|
||
):
|
||
return "so_natural_fixx_setting_spray_120ml_plain"
|
||
if (
|
||
{"so", "natural", "fixx"} <= (left.brand_tokens & right.brand_tokens)
|
||
and "全天候超完美定妝噴霧" in left_text
|
||
and "全天候超完美定妝噴霧" in right_text
|
||
):
|
||
return "so_natural_fixx_setting_spray_catalog"
|
||
if (
|
||
{"kate", "凱婷"} & (left.brand_tokens & right.brand_tokens)
|
||
and "粉餅盒" in left_text
|
||
and "粉餅盒" in right_text
|
||
):
|
||
return "kate_powder_case_catalog"
|
||
if (
|
||
{"kate", "凱婷"} & (left.brand_tokens & right.brand_tokens)
|
||
and "怪獸級持色唇膏" in left_text
|
||
and "怪獸級持色唇膏" in right_text
|
||
):
|
||
return "kate_monster_lipstick_catalog"
|
||
if (
|
||
"opi" in (left.brand_tokens & right.brand_tokens)
|
||
and _shared_model_tokens(left, right)
|
||
and "類光繚" in left_text
|
||
and "類光繚" in right_text
|
||
and ("指甲油" in left_text or "指彩" in left_text)
|
||
and ("指甲油" in right_text or "指彩" in right_text)
|
||
and ("如膠似漆" in left_text or "如膠似漆" in right_text)
|
||
):
|
||
return "opi_gel_polish_exact_model"
|
||
if (
|
||
"opi" in (left.brand_tokens & right.brand_tokens)
|
||
and "類光繚指甲油" in left_text
|
||
and "類光繚指甲油" in right_text
|
||
and any(series in left_text and series in right_text for series in ("白日夢遊", "驕傲果凍"))
|
||
):
|
||
return "opi_gel_polish_series_catalog"
|
||
if (
|
||
("rom" in (left.brand_tokens & right.brand_tokens) or "romand" in (left.brand_tokens & right.brand_tokens))
|
||
and "果汁唇釉" in left_text
|
||
and "果汁唇釉" in right_text
|
||
and "2.0" in left_text
|
||
and "2.0" in right_text
|
||
):
|
||
return "romand_juicy_lip_tint_2_catalog"
|
||
if (
|
||
"solone" in (left.brand_tokens & right.brand_tokens)
|
||
and "持久眼線筆" in left_text
|
||
and "持久眼線筆" in right_text
|
||
):
|
||
return "solone_longlasting_eyeliner"
|
||
if (
|
||
{"shu uemura", "植村秀"} & (left.brand_tokens & right.brand_tokens)
|
||
and "自動武士刀眉筆" in left_text
|
||
and "自動武士刀眉筆" in right_text
|
||
and "筆蕊" in left_text
|
||
and "筆蕊" in right_text
|
||
):
|
||
return "shu_auto_hard_formula_refill_catalog"
|
||
if (
|
||
{"summer", "eve", "舒摩兒"} & (left.brand_tokens & right.brand_tokens)
|
||
and "浴潔露" in left_text
|
||
and "浴潔露" in right_text
|
||
and "全肌防護" in left_raw
|
||
and "全肌防護" in right_raw
|
||
and _has_exact_count_alignment(left, right)
|
||
):
|
||
return "summer_eve_full_skin_wash_2pack"
|
||
if (
|
||
"焦糖楓葉香氛擴香花禮盒" in left_text
|
||
and "焦糖楓葉香氛擴香花禮盒" in right_text
|
||
and _has_shared_volume(left, right, 30)
|
||
and bool(left.brand_tokens) != bool(right.brand_tokens)
|
||
):
|
||
return "the_forest_maple_diffuser_flower_brandless"
|
||
if (
|
||
"gatsby" in (left.brand_tokens & right.brand_tokens)
|
||
and "爆水擦澡濕巾" in left_text
|
||
and "爆水擦澡濕巾" in right_text
|
||
and "24張入" in left_text
|
||
and "24張入" in right_text
|
||
):
|
||
return "gatsby_body_wipes_24"
|
||
if (
|
||
{"3w", "clinic"} <= (left.brand_tokens & right.brand_tokens)
|
||
and "膠原蛋白粉底液" in left_text
|
||
and "膠原蛋白粉底液" in right_text
|
||
and _has_shared_volume(left, right, 50)
|
||
and _has_exact_count_alignment(left, right)
|
||
):
|
||
return "3w_clinic_collagen_foundation_50ml_2pack"
|
||
if (
|
||
"花美水" in (left.brand_tokens & right.brand_tokens)
|
||
and "moisture" in (left.brand_tokens & right.brand_tokens)
|
||
and "保濕修護" in left_text
|
||
and "保濕修護" in right_text
|
||
and "精華凝膠" in left_text
|
||
and "精華凝膠" in right_text
|
||
and ("原黃金" in left_text and "原黃金" in right_text)
|
||
and _has_shared_weight(left, right, 1.7)
|
||
and _has_exact_count_alignment(left, right)
|
||
):
|
||
return "hanamisui_moisture_original_gel_1_7g_3pack"
|
||
if (
|
||
"花美水" in (left.brand_tokens & right.brand_tokens)
|
||
and "inclear" in (left.brand_tokens & right.brand_tokens)
|
||
and ("櫻克麗兒" in left_text and "櫻克麗兒" in right_text)
|
||
and ("私密淨化凝膠" in left_text and "私密淨化凝膠" in right_text)
|
||
and _has_shared_weight(left, right, 1.7)
|
||
and _has_exact_count_alignment(left, right)
|
||
):
|
||
return "hanamisui_inclear_private_gel_1_7g_3pack"
|
||
if (
|
||
"花美水" in (left.brand_tokens & right.brand_tokens)
|
||
and "relax" in left_raw
|
||
and "relax" in right_raw
|
||
and "薰衣草" in left_text
|
||
and "薰衣草" in right_text
|
||
and "潤滑凝膠" in left_text
|
||
and "潤滑凝膠" in right_text
|
||
and _has_shared_weight(left, right, 1.7)
|
||
and _has_exact_count_alignment(left, right)
|
||
):
|
||
return "hanamisui_relax_lavender_gel_1_7g_3pack"
|
||
if (
|
||
("聖克萊爾" in left_text and "聖克萊爾" in right_text)
|
||
and "私密呼呼溫和潔淨慕斯" in left_text
|
||
and "私密呼呼溫和潔淨慕斯" in right_text
|
||
and _has_shared_volume(left, right, 150)
|
||
and _has_exact_count_alignment(left, right)
|
||
):
|
||
return "st_clare_private_mousse_150ml_2pack"
|
||
if (
|
||
("聖克萊爾" in left_text and "聖克萊爾" in right_text)
|
||
and "私密呼呼溫和潔淨慕斯" in left_text
|
||
and "私密呼呼溫和潔淨慕斯" in right_text
|
||
and "私密呼呼舒緩護理噴霧" in left_text
|
||
and "私密呼呼舒緩護理噴霧" in right_text
|
||
and _has_shared_volume(left, right, 150)
|
||
and _has_shared_volume(left, right, 50)
|
||
):
|
||
return "st_clare_private_mousse_spray_set"
|
||
if (
|
||
("biopeutic" in (left.brand_tokens & right.brand_tokens) or ("葆療美" in left_text and "葆療美" in right_text))
|
||
and "果酸煥膚水凝乳" in left_text
|
||
and "果酸煥膚水凝乳" in right_text
|
||
and "20%" in left_raw
|
||
and "20%" in right_raw
|
||
and _has_shared_volume(left, right, 150)
|
||
):
|
||
return "biopeutic_plus_aha_lotion_20_150ml"
|
||
if (
|
||
"台塑生醫" in left_text
|
||
and "台塑生醫" in right_text
|
||
and "嬰兒沐浴洗髮" in left_text
|
||
and "嬰兒沐浴洗髮" in right_text
|
||
and "3件組" in left_text
|
||
and "3件組" in right_text
|
||
and "嬰兒沐浴精" in left_text
|
||
and "嬰兒沐浴精" in right_text
|
||
and "嬰幼童洗髮精" in left_text
|
||
and "嬰幼童洗髮精" in right_text
|
||
):
|
||
return "taisu_baby_bath_shampoo_3pc"
|
||
if (
|
||
"雅頓" in left_text
|
||
and "雅頓" in right_text
|
||
and "八小時潤澤護唇膏" in left_text
|
||
and "八小時潤澤護唇膏" in right_text
|
||
and "spf15" in left_raw
|
||
and "spf15" in right_raw
|
||
and _has_shared_weight(left, right, 3.7)
|
||
and _has_exact_count_alignment(left, right)
|
||
):
|
||
return "arden_eight_hour_lip_spf15_3_7g_3pack"
|
||
if (
|
||
"理膚寶水" in left_text
|
||
and "理膚寶水" in right_text
|
||
and "全面修復潤唇膏" in left_text
|
||
and "全面修復潤唇膏" in right_text
|
||
and _has_shared_volume(left, right, 7.5)
|
||
):
|
||
return "laroche_posay_repair_lip_balm_7_5ml"
|
||
if (
|
||
{"flortte", "花洛莉亞"} & (left.brand_tokens & right.brand_tokens)
|
||
and "水果沙拉系列彩色防水眼線液筆" in left_text
|
||
and "水果沙拉系列彩色防水眼線液筆" in right_text
|
||
and "色號" in left_text
|
||
and "色號" in right_text
|
||
and "任選" in left_raw
|
||
and "任選" in right_raw
|
||
and _has_shared_volume(left, right, 0.5)
|
||
):
|
||
return "flortte_fruit_salad_eyeliner_0_5ml_catalog"
|
||
if (
|
||
{"neutrogena", "露得清"} & (left.brand_tokens & right.brand_tokens)
|
||
and "護手霜" in left_text
|
||
and "護手霜" in right_text
|
||
and "無香" in left_text
|
||
and "無香" in right_text
|
||
and "有香" in left_text
|
||
and "有香" in right_text
|
||
and _has_shared_weight(left, right, 56)
|
||
):
|
||
return "neutrogena_hand_cream_56g_scent_catalog"
|
||
if (
|
||
{"kanebo", "佳麗寶"} & (left.brand_tokens & right.brand_tokens)
|
||
and "allie" in left_raw
|
||
and "allie" in right_raw
|
||
and "持采亮化uv防曬水凝乳" in left_text
|
||
and "持采亮化uv防曬水凝乳" in right_text
|
||
and "任選" in left_raw
|
||
and "任選" in right_raw
|
||
and _has_shared_weight(left, right, 60)
|
||
):
|
||
return "kanebo_allie_bright_uv_milk_60g_catalog"
|
||
if (
|
||
"ordinary" in (left.brand_tokens & right.brand_tokens)
|
||
and "咖啡因" in left_text
|
||
and "咖啡因" in right_text
|
||
and "egcg" in left_raw
|
||
and "egcg" in right_raw
|
||
and "兒茶眼部配方" in left_text
|
||
and "兒茶眼部配方" in right_text
|
||
and (
|
||
_has_shared_volume(left, right, 30)
|
||
or (30.0 in left.volumes_ml and not right.volumes_ml)
|
||
or (30.0 in right.volumes_ml and not left.volumes_ml)
|
||
)
|
||
):
|
||
return "the_ordinary_caffeine_egcg_30ml"
|
||
if (
|
||
{"sk-ii", "skii", "sk2"} & (left.brand_tokens & right.brand_tokens)
|
||
and "青春露" in left_text
|
||
and "青春露" in right_text
|
||
and _has_shared_volume(left, right, 330)
|
||
and _has_shared_count(left, right, 2, "入")
|
||
):
|
||
return "sk_ii_essence_330ml_2pack"
|
||
if (
|
||
{"amiino", "安美諾"} & (left.brand_tokens | right.brand_tokens)
|
||
and "美白修護霜" in left_text
|
||
and "美白修護霜" in right_text
|
||
and _has_shared_volume(left, right, 30)
|
||
):
|
||
return "amiino_whitening_repair_cream_30ml"
|
||
if (
|
||
{"natures", "care"} <= (left.brand_tokens & right.brand_tokens)
|
||
and "綿羊油" in left_text
|
||
and "綿羊油" in right_text
|
||
and _has_exact_count_alignment(left, right)
|
||
and (
|
||
_has_shared_volume(left, right, 125)
|
||
or (125.0 in left.volumes_ml and not right.volumes_ml and "125m" in right_text)
|
||
or (125.0 in right.volumes_ml and not left.volumes_ml and "125m" in left_text)
|
||
)
|
||
):
|
||
return "natures_care_sheep_oil_exact_pack"
|
||
if (
|
||
"tomoon" in (left.brand_tokens & right.brand_tokens)
|
||
and "德國奔月" in left_text
|
||
and "德國奔月" in right_text
|
||
and "豪華套裝組" in left_text
|
||
and "豪華套裝組" in right_text
|
||
and ("指甲剪" in left_text or "指甲刀" in left_text)
|
||
and ("指甲剪" in right_text or "指甲刀" in right_text)
|
||
and any(size in left_text and size in right_text for size in ("l號", "s號"))
|
||
):
|
||
return "tomoon_nail_clipper_luxury_size"
|
||
if (
|
||
{"hh", "草本新淨界"} & (left.brand_tokens & right.brand_tokens)
|
||
and "私密植萃抗菌潔淨露" in left_text
|
||
and "私密植萃抗菌潔淨露" in right_text
|
||
and "私密衣物抗菌手洗精" in left_text
|
||
and "私密衣物抗菌手洗精" in right_text
|
||
and _has_shared_volume(left, right, 200)
|
||
):
|
||
return "hh_private_cleanser_laundry_wash_set"
|
||
if (
|
||
{"sebamed", "施巴"} & (left.brand_tokens & right.brand_tokens)
|
||
and "護潔露" in left_text
|
||
and "護潔露" in right_text
|
||
and _has_shared_volume(left, right, 200)
|
||
and _has_exact_count_alignment(left, right)
|
||
):
|
||
return "sebamed_ph38_private_wash_200ml_2pack"
|
||
if (
|
||
{"yes", "德悅氏"} & (left.brand_tokens & right.brand_tokens)
|
||
and "鋒利窄弧型剪刀" in left_text
|
||
and "鋒利窄弧型剪刀" in right_text
|
||
and "9cm" in left_text
|
||
and "9cm" in right_text
|
||
):
|
||
return "yes_curved_scissors_9cm"
|
||
if (
|
||
{"yes", "德悅氏"} & (left.brand_tokens & right.brand_tokens)
|
||
and "腳指甲剪刀" in left_text
|
||
and "腳指甲剪刀" in right_text
|
||
and "10.5cm" in left_text
|
||
and "10.5cm" in right_text
|
||
):
|
||
return "yes_foot_nail_scissors_10_5cm"
|
||
if (
|
||
{"yes", "德悅氏"} & (left.brand_tokens & right.brand_tokens)
|
||
and "極細指甲緣硬皮剪刀" in left_text
|
||
and "極細指甲緣硬皮剪刀" in right_text
|
||
and "9cm" in left_text
|
||
and "9cm" in right_text
|
||
):
|
||
return "yes_cuticle_scissors_9cm"
|
||
if _has_yes_nail_tool_exact_alignment(left, right):
|
||
return "yes_nail_tool_exact_model_size"
|
||
if (
|
||
{"kussen", "葵森"} & (left.brand_tokens & right.brand_tokens)
|
||
and "寶寶益菌屁屁膏" in left_text
|
||
and "寶寶益菌屁屁膏" in right_text
|
||
and _has_shared_volume(left, right, 50)
|
||
and _has_exact_count_alignment(left, right)
|
||
):
|
||
return "kussen_baby_butt_cream_50ml_3pack"
|
||
if (
|
||
"bone" in (left.brand_tokens & right.brand_tokens)
|
||
and "擴香禮盒三入組" in left_text
|
||
and "擴香禮盒三入組" in right_text
|
||
and all(component in left_text and component in right_text for component in ("原木麋鹿", "搖搖貓頭鷹", "薰衣草精油"))
|
||
and _has_exact_count_alignment(left, right)
|
||
):
|
||
return "bone_diffuser_gift_3pack"
|
||
if (
|
||
{"1990", "選物"} <= (left.brand_tokens & right.brand_tokens)
|
||
and "現代簡約半圓罩融燭燈" in left_text
|
||
and "現代簡約半圓罩融燭燈" in right_text
|
||
and "白色款" in left_text
|
||
and "白色款" in right_text
|
||
):
|
||
return "selection1990_half_dome_wax_lamp_white"
|
||
if (
|
||
{"1990", "選物"} <= (left.brand_tokens & right.brand_tokens)
|
||
and "歐式可彎融燭燈" in left_text
|
||
and "歐式可彎融燭燈" in right_text
|
||
and "白色款" in left_text
|
||
and "白色款" in right_text
|
||
):
|
||
return "selection1990_bendable_wax_lamp_white"
|
||
if (
|
||
"canmake" in (left.brand_tokens & right.brand_tokens)
|
||
and "淚袋專用盤" in left_text
|
||
and "淚袋專用盤" in right_text
|
||
and "淚袋眼影盤" in left_text
|
||
and "淚袋眼影盤" in right_text
|
||
):
|
||
return "canmake_tear_bag_palette"
|
||
if (
|
||
{"recipe", "box"} <= brand_tokens
|
||
and "可撕式水性兒童指甲油" in left_text
|
||
and "可撕式水性兒童指甲油" in right_text
|
||
):
|
||
return "recipe_box_peelable_child_polish_catalog"
|
||
if (
|
||
"gdesign" in (left.brand_tokens & right.brand_tokens)
|
||
and "aroma" in left_text
|
||
and "aroma" in right_text
|
||
and "lava" in left_text
|
||
and "lava" in right_text
|
||
and "解憂放鬆緩緩燈2.0" in left_text
|
||
and "解憂放鬆緩緩燈2.0" in right_text
|
||
and "熔岩燈" in left_text
|
||
and "熔岩燈" in right_text
|
||
and "精油擴香" in left_text
|
||
and "精油擴香" in right_text
|
||
):
|
||
return "gdesign_aroma_lava_lamp_2"
|
||
if (
|
||
"hooome" in (left.brand_tokens & right.brand_tokens)
|
||
and "白色" in left_text
|
||
and "白色" in right_text
|
||
and "香氛蠟燭暖燈" in left_text
|
||
and "香氛蠟燭暖燈" in right_text
|
||
and "兩顆燈泡" in left_text
|
||
and "兩顆燈泡" in right_text
|
||
and "禮盒" in left_text
|
||
and "禮盒" in right_text
|
||
):
|
||
return "hooome_classic_white_wax_lamp_bulbs_giftbox"
|
||
return ""
|
||
|
||
|
||
def _is_relove_private_cleanser_line(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Return True when both listings look like the Relove intimate gel line.

    The ``relove`` brand token only has to be present on one side (the two
    brand-token sets are unioned), but *both* searchable names must contain
    both ``私密`` and ``凝露``.
    """
    if "relove" not in (left.brand_tokens | right.brand_tokens):
        return False
    names = (left.searchable_name, right.searchable_name)
    return all(term in name for term in ("私密", "凝露") for name in names)
|
||
|
||
|
||
def _is_relove_cleanser_gel_like(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Return True when both listings read as a Relove cleansing gel.

    ``relove`` must be a brand token on at least one side. Each searchable
    name must contain ``凝露`` plus at least one of ``私密`` / ``潔淨`` / ``清潔``.
    """

    def reads_as_cleanser_gel(name: str) -> bool:
        # A gel ("凝露") that is also described with one of the cleanser words.
        return "凝露" in name and any(term in name for term in ("私密", "潔淨", "清潔"))

    return (
        "relove" in (left.brand_tokens | right.brand_tokens)
        and reads_as_cleanser_gel(left.searchable_name)
        and reads_as_cleanser_gel(right.searchable_name)
    )
|
||
|
||
|
||
def _is_multi_variant_catalog_listing(identity: ProductIdentity) -> bool:
    """Return True if the normalized name contains any multi-variant listing phrase.

    The phrases come from the module-level ``MULTI_VARIANT_LISTING_PHRASES``.
    """
    normalized = identity.normalized_name
    for marker in MULTI_VARIANT_LISTING_PHRASES:
        if marker in normalized:
            return True
    return False
|
||
|
||
|
||
def _normalize_variant_option(value: str) -> set[str]:
    """Reduce a raw option label to a lower-case alphanumeric token.

    The value is lower-cased and every character outside ``a-z`` / ``0-9`` is
    removed. The result is returned as a one-element set so callers can
    ``update`` with it directly; a falsy value, or one with nothing left after
    stripping, yields an empty set.
    """
    lowered = (value or "").lower()
    token = re.sub(r"[^a-z0-9]", "", lowered)
    return {token} if token else set()
|
||
|
||
|
||
def _variant_option_compare_key(option: str) -> str:
    """Return the key used to compare two option tokens.

    All-digit options have leading zeros removed (``"007"`` -> ``"7"``; an
    all-zero string becomes ``"0"``). Anything else is returned unchanged.
    """
    if not option.isdigit():
        return option
    without_padding = option.lstrip("0")
    return without_padding if without_padding else "0"
|
||
|
||
|
||
def _variant_options_overlap(left_options: set[str], right_options: set[str]) -> bool:
    """Return True when the two option sets share at least one option.

    Options are first compared verbatim. If that finds nothing, they are
    compared again through ``_variant_option_compare_key`` so that zero-padded
    numbers such as ``"01"`` and ``"1"`` count as the same option.
    """
    if not left_options.isdisjoint(right_options):
        return True
    comparable_left = {_variant_option_compare_key(option) for option in left_options}
    comparable_right = {_variant_option_compare_key(option) for option in right_options}
    return not comparable_left.isdisjoint(comparable_right)
|
||
|
||
|
||
def _is_catalog_or_delimited_variant_listing(identity: ProductIdentity) -> bool:
    """Return True when the listing appears to bundle several selectable variants.

    A listing qualifies when any of the following holds:

    * it is a multi-variant catalog listing;
    * its searchable name contains a code range such as ``01~05`` / ``a1-a9``;
    * its searchable name contains a list delimiter and either carries two or
      more explicit option tokens or mentions one of a fixed set of product
      words (compact case, eyeliner, brow pencil, lipstick, ...).
    """
    if _is_multi_variant_catalog_listing(identity):
        return True

    name = identity.searchable_name
    # NOTE(review): the repeated characters in the two patterns below look like
    # full-width variants that were folded to ASCII somewhere upstream --
    # confirm against the canonical file before relying on them.
    code_range = r"(?<![a-z0-9])([a-z]?\d{1,3}[a-z]?)\s*(?:~|~|至|-)\s*([a-z]?\d{1,3}[a-z]?)(?![a-z0-9])"
    if re.search(code_range, name, re.I):
        return True

    # Every remaining path requires a list delimiter in the name.
    if not re.search(r"[//、,,..&&]", name):
        return False

    if len(_explicit_variant_option_tokens(identity)) >= 2:
        return True
    delimiter_sensitive_products = (
        "粉餅盒",
        "眼線膠筆",
        "眉筆",
        "唇膏",
        "唇釉",
        "遮瑕蜜",
        "車用擴香",
        "車用香氛",
    )
    return any(term in name for term in delimiter_sensitive_products)
|
||
|
||
|
||
def _has_catalog_variant_listing_alignment(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Return True when two multi-variant catalog listings describe the same line.

    Both sides must be multi-variant catalog listings, share a product type of
    ``精油`` or ``護唇膏`` and have an overlapping base spec. Given that, they
    align when they share a core token or both searchable names contain the
    same catalog term.
    """
    both_are_catalogs = _is_multi_variant_catalog_listing(left) and _is_multi_variant_catalog_listing(right)
    if not both_are_catalogs:
        return False

    product_type = left.product_type
    if product_type != right.product_type or product_type not in {"精油", "護唇膏"}:
        return False
    if not _has_overlapping_base_spec(left, right):
        return False
    if left.core_tokens & right.core_tokens:
        return True

    left_name = left.searchable_name
    right_name = right.searchable_name
    for term in ("香氛擴香罐", "香氛蠟燭", "蠟燭", "擴香罐", "修護唇膏"):
        if term in left_name and term in right_name:
            return True
    return False
|
||
|
||
|
||
def _is_variant_sensitive_identity(
    left: ProductIdentity,
    right: ProductIdentity,
    shared_anchor: str,
) -> bool:
    """Return True when any variant-sensitive keyword appears in the pair.

    The texts searched are the shared anchor, both product types (a missing
    type counts as empty) and both searchable names. Empty texts are skipped.
    Keywords come from the module-level ``VARIANT_SENSITIVE_KEYWORDS``.
    """
    searched_texts = (
        shared_anchor,
        left.product_type or "",
        right.product_type or "",
        left.searchable_name,
        right.searchable_name,
    )
    for text in searched_texts:
        if not text:
            continue
        for keyword in VARIANT_SENSITIVE_KEYWORDS:
            if keyword in text:
                return True
    return False
|
||
|
||
|
||
def _has_variant_descriptor_conflict(left: ProductIdentity, right: ProductIdentity, shared_anchor: str) -> bool:
    """Return True when the two listings carry incompatible variant descriptors.

    Checks run in this order:

    1. A serum formulation conflict is always a conflict.
    2. Any of the known product-specific alignments clears the pair.
    3. A non-broad shared anchor on a pair that is not variant-sensitive
       clears the pair.
    4. Shared model tokens clear the pair.
    5. Otherwise the descriptor sets are compared. A conflict is reported only
       when both sides have descriptors, none is shared, and none is a
       substring of one on the other side.
    """
    if _has_serum_formulation_conflict(left, right, shared_anchor):
        return True

    # Product-specific alignments, tried in this order, that override
    # descriptor differences.
    known_alignments = (
        _has_taicend_baby_spray_equivalence,
        _has_brush_baby_wildones_toothbrush_alignment,
        _has_baan_baby_lip_catalog_alignment,
        _has_recipe_box_child_sunscreen_cushion_alignment,
        _has_pavaruni_40_scent_oil_alignment,
        _has_pavaruni_20_scent_candle_alignment,
        _has_laundrin_tokyo_car_freshener_alignment,
        _is_relove_private_cleanser_line,
    )
    if any(is_aligned(left, right) for is_aligned in known_alignments):
        return False

    anchor_is_specific = bool(shared_anchor) and shared_anchor not in SEARCH_BROAD_ANCHORS
    if anchor_is_specific and not _is_variant_sensitive_identity(left, right, shared_anchor):
        return False
    if _shared_model_tokens(left, right):
        return False

    descriptors_left = _variant_descriptors(left)
    descriptors_right = _variant_descriptors(right)
    if not descriptors_left or not descriptors_right:
        return False
    if descriptors_left & descriptors_right:
        return False

    # One descriptor contained in another is treated as the same variant.
    has_containment = any(
        mine in theirs or theirs in mine
        for mine in descriptors_left
        for theirs in descriptors_right
    )
    return not has_containment
|
||
|
||
|
||
def _explicit_variant_option_tokens(identity: ProductIdentity) -> set[str]:
    """Collect the explicit variant options mentioned in the searchable name.

    Four sources feed the result:

    * both ends of a code range (``01~05``, ``a1-a9``, ``1至3``);
    * codes introduced by ``#``, ``no``/``no.``, ``色號`` or ``號色``;
    * one- or two-digit codes immediately followed by a CJK character, except
      when the following four characters are ``號護唇膏`` / ``號護脣膏``;
    * any word from ``VARIANT_OPTION_COLOR_WORDS`` found in the name.

    Numeric/alphanumeric codes go through ``_normalize_variant_option``;
    colour words are added verbatim.
    """
    name = identity.searchable_name
    found: set[str] = set()

    range_pattern = r"(?<![a-z0-9])([a-z]?\d{1,3}[a-z]?)\s*(?:~|~|至|-)\s*([a-z]?\d{1,3}[a-z]?)(?![a-z0-9])"
    for hit in re.finditer(range_pattern, name, re.I):
        found |= _normalize_variant_option(hit.group(1))
        found |= _normalize_variant_option(hit.group(2))

    prefixed_pattern = r"(?:#|no\.?|色號|號色)\s*([a-z]?\d{1,3}[a-z]?)(?![a-z0-9])"
    for hit in re.finditer(prefixed_pattern, name, re.I):
        found |= _normalize_variant_option(hit.group(1))

    digits_before_cjk = r"(?<![a-z0-9])((?:0?\d){1,2})(?=[\u4e00-\u9fff])"
    for hit in re.finditer(digits_before_cjk, name, re.I):
        tail_start = hit.end(1)
        # Skip digits directly followed by one of these two lip-balm phrases.
        if name[tail_start:tail_start + 4] in {"號護唇膏", "號護脣膏"}:
            continue
        found |= _normalize_variant_option(hit.group(1))

    found.update(word for word in VARIANT_OPTION_COLOR_WORDS if word in name)
    return found
|
||
|
||
|
||
def _has_variant_option_selection_gap(identity: ProductIdentity, options: set[str]) -> bool:
    """Return True when a listing offers several named options to pick from.

    At least two non-numeric options are required. Given that, the listing
    must either be a multi-variant catalog listing or contain a list
    delimiter in its searchable name.
    """
    named = {option for option in options if not option.isdigit()}
    if len(named) >= 2:
        name = identity.searchable_name
        return _is_multi_variant_catalog_listing(identity) or bool(re.search(r"[//、,,]", name))
    return False
|
||
|
||
|
||
def _has_catalog_options_against_generic_count_alignment(
    left: ProductIdentity,
    right: ProductIdentity,
    left_options: set[str],
    right_options: set[str],
) -> bool:
    """Return True when a catalog listing lines up with a generic "set" listing.

    The pair must have an overlapping base spec and, when both product types
    are known, the same product type. One side (tried left first, then right)
    must then be a multi-variant catalog listing with at least two named
    options, while the other side has no named option, shares at least one
    all-digit option with it and has ``組`` in its searchable name.
    """

    def catalog_covers_generic(
        catalog_identity: ProductIdentity,
        generic_identity: ProductIdentity,
        catalog_options: set[str],
        generic_options: set[str],
    ) -> bool:
        # Evaluate one orientation: ``catalog_identity`` as the catalog side.
        catalog_named = {option for option in catalog_options if not option.isdigit()}
        generic_named = {option for option in generic_options if not option.isdigit()}
        shared_numeric = {option for option in catalog_options & generic_options if option.isdigit()}
        return bool(
            len(catalog_named) >= 2
            and not generic_named
            and shared_numeric
            and _is_multi_variant_catalog_listing(catalog_identity)
            and "組" in generic_identity.searchable_name
        )

    if not _has_overlapping_base_spec(left, right):
        return False
    types_disagree = bool(left.product_type and right.product_type) and left.product_type != right.product_type
    if types_disagree:
        return False
    return catalog_covers_generic(left, right, left_options, right_options) or catalog_covers_generic(
        right, left, right_options, left_options
    )
|
||
|
||
|
||
def _has_explicit_variant_option_conflict(
    left: ProductIdentity,
    right: ProductIdentity,
    shared_anchor: str,
) -> bool:
    """Return True when explicit option codes show the listings are different variants.

    Only variant-sensitive pairs where both sides carry options, and the
    option sets differ, are examined.

    * Disjoint option sets conflict unless some option on one side is a
      substring of an option on the other.
    * Overlapping option sets conflict only when the side with more options
      has a selection gap, and the pair is not excused by the
      catalog-vs-generic-count alignment or by being a delimited listing of
      one of the listed makeup products.
    """
    if not _is_variant_sensitive_identity(left, right, shared_anchor):
        return False

    options_left = _explicit_variant_option_tokens(left)
    options_right = _explicit_variant_option_tokens(right)
    if not options_left or not options_right or options_left == options_right:
        return False

    if not _variant_options_overlap(options_left, options_right):
        nested = any(
            mine in theirs or theirs in mine
            for mine in options_left
            for theirs in options_right
        )
        return not nested

    if _has_catalog_options_against_generic_count_alignment(left, right, options_left, options_right):
        return False

    # With equal option counts neither side is "the catalog", so no conflict.
    if len(options_left) == len(options_right):
        return False
    if len(options_left) > len(options_right):
        wider_identity, wider_options = left, options_left
    else:
        wider_identity, wider_options = right, options_right

    pair_text = f"{left.searchable_name} {right.searchable_name}"
    mentions_listed_makeup = any(
        term in pair_text for term in ("眉筆", "眼線膠筆", "唇膏", "唇釉", "粉餅盒", "遮瑕蜜")
    )
    if mentions_listed_makeup and _is_catalog_or_delimited_variant_listing(wider_identity):
        return False
    return _has_variant_option_selection_gap(wider_identity, wider_options)
|
||
|
||
|
||
def _has_named_variant_selection_review(
    left: ProductIdentity,
    right: ProductIdentity,
    shared_anchor: str,
) -> bool:
    """Return True when the pair needs review because a variant must still be chosen.

    Decision order:

    1. Any of the makeup-shade, makeup-spray or catalog-specific selection-gap
       checks flags the pair.
    2. Both sides carry explicit options: flag when the pair is
       variant-sensitive and the side with more options is a catalog or
       delimited listing whose options overlap the other side's.
    3. Exactly one side carries explicit options: flag when the pair is
       variant-sensitive and either the option-less side is a catalog or
       delimited listing, or the base specs overlap and the pair mentions one
       of a fixed list of product words. Otherwise flag when the option side
       has at least two named options and is a catalog listing or contains a
       list delimiter.
    4. Neither side carries explicit options: no review.

    Option tokens and the variant-sensitivity verdict are computed once and
    reused; this relies on the helpers being side-effect free.
    """
    if (
        _has_makeup_shade_selection_gap(left, right)
        or _has_makeup_spray_variant_selection_gap(left, right)
        or _has_catalog_specific_variant_selection_gap(left, right)
    ):
        return True

    left_options = _explicit_variant_option_tokens(left)
    right_options = _explicit_variant_option_tokens(right)
    if not left_options and not right_options:
        return False

    is_sensitive = _is_variant_sensitive_identity(left, right, shared_anchor)

    if left_options and right_options:
        if not is_sensitive:
            return False
        for catalog_identity, catalog_options, specific_options in (
            (left, left_options, right_options),
            (right, right_options, left_options),
        ):
            if (
                len(catalog_options) > len(specific_options)
                and _is_catalog_or_delimited_variant_listing(catalog_identity)
                and _variant_options_overlap(catalog_options, specific_options)
            ):
                return True
        return False

    # From here on exactly one side has explicit options, and that side's
    # option set is non-empty by construction.
    if left_options:
        option_identity, plain_identity, options = left, right, left_options
    else:
        option_identity, plain_identity, options = right, left, right_options

    if is_sensitive:
        if _is_catalog_or_delimited_variant_listing(plain_identity):
            return True
        pair_text = f"{left.searchable_name} {right.searchable_name}"
        if _has_overlapping_base_spec(left, right) and any(
            term in pair_text
            for term in ("粉餅盒", "護手霜", "護唇膏", "護唇棒", "滋養霜", "眼線膠筆", "遮瑕蜜")
        ):
            return True

    named_options = {option for option in options if not option.isdigit()}
    if len(named_options) < 2:
        return False
    text = option_identity.searchable_name
    return _is_multi_variant_catalog_listing(option_identity) or bool(re.search(r"[//、,&&]", text))
|
||
|
||
|
||
def _search_core_score(token: str, all_tokens: set[str]) -> tuple[int, int, str]:
    """Score a core token for use as a search phrase (larger tuples rank first).

    Args:
        token: The core token being scored.
        all_tokens: Every candidate token of the same product; used to tell
            whether some *other* token yields an anchor phrase.

    Returns:
        ``(score, -len(compact), cleaned)`` where ``cleaned`` is
        ``_clean_search_phrase(token)`` and ``compact`` is ``cleaned`` without
        spaces. Tokens that clean to nothing score ``-999``. Noise tokens,
        generic tokens and bare ``<amount><unit>x<count>`` specs score ``-900``.
    """
    cleaned = _clean_search_phrase(token)
    if not cleaned:
        return (-999, 0, cleaned)
    compact = cleaned.replace(" ", "")
    if compact in SEARCH_NOISE_TOKENS or compact in GENERIC_TOKENS:
        return (-900, 0, cleaned)
    if re.fullmatch(r"\d+(?:\.\d+)?(?:ml|g|mg|kg|l)x\d+", compact, re.I):
        return (-900, 0, cleaned)

    score = 0
    # Bonus for a letter-led run of three or more characters, and for digits.
    if re.search(r"[a-z][a-z0-9-]{2,}", cleaned):
        score += 30
    if re.search(r"\d", cleaned):
        score += 12

    anchors = _extract_anchor_phrases(cleaned)
    if anchors:
        score += 90
        score += min(24, len(anchors[0]) * 3)
        if anchors[0] == compact:
            score += 8
        if compact in SEARCH_BROAD_ANCHORS:
            score -= 28
    else:
        score += max(0, 24 - len(compact))

    if len(compact) <= 8:
        score += 14
    elif len(compact) >= 12:
        score -= 12

    # Penalise an ambiguous product term when another token supplies an
    # anchor. The cheap substring test runs first so the scan over the other
    # tokens is skipped for the common case where no ambiguous term is present.
    if any(term in compact for term in SEARCH_AMBIGUOUS_PRODUCT_TERMS) and any(
        other != token and _extract_anchor_phrases(other)
        for other in all_tokens
    ):
        score -= 80
    if any(noise in compact for noise in SEARCH_NOISE_TOKENS):
        score -= 18

    return (score, -len(compact), cleaned)
|
||
|
||
|
||
def _ranked_search_core_phrases(identity: ProductIdentity, limit: int = 4) -> list[str]:
    """Return up to ``limit`` search phrases from the identity's core tokens.

    Tokens found in ``GENERIC_TOKENS`` are dropped, the rest are ranked by
    ``_search_core_score`` (best first), and each surviving token contributes
    its anchor phrases, or its cleaned form when it has none.

    Args:
        identity: Parsed product identity; only ``core_tokens`` is read.
        limit: Maximum number of phrases to return. Non-positive values
            yield an empty list.

    Returns:
        Unique phrases in ranking order, at most ``limit`` long.
    """
    if limit <= 0:
        return []

    tokens = {token for token in identity.core_tokens if token not in GENERIC_TOKENS}

    # Score each token exactly once: scoring scans every sibling token, so
    # recomputing it in both the sort key and the loop doubles quadratic work.
    scores = {token: _search_core_score(token, tokens) for token in tokens}

    # ``tokens`` is a set, so equal scores would otherwise be ordered by
    # hash-seed-dependent iteration order; the token itself is the final
    # tiebreaker to keep results reproducible across processes.
    ranked_tokens = sorted(
        tokens,
        key=lambda token: (scores[token], token),
        reverse=True,
    )

    phrases: list[str] = []
    for token in ranked_tokens:
        if scores[token][0] < -100:
            # Ranking is descending by score, so every remaining token is
            # rejected as well.
            break
        candidates = _extract_anchor_phrases(token) or [_clean_search_phrase(token)]
        for phrase in candidates:
            compact = phrase.replace(" ", "")
            if len(compact) < 2 or compact in SEARCH_NOISE_TOKENS:
                continue
            # An ambiguous product word may only ever be the leading phrase.
            if phrases and any(term in compact for term in SEARCH_AMBIGUOUS_PRODUCT_TERMS):
                continue
            if phrase not in phrases:
                phrases.append(phrase)
            if len(phrases) >= limit:
                return phrases
    return phrases
|
||
|
||
|
||
def _variant_primary_phrase(identity: ProductIdentity) -> str:
    """Pick the single phrase that best names the product's variant.

    For the listed nail-strip series names, the 2-8 CJK characters directly
    following the series name (optionally separated by ``-``, ``_`` or
    spaces) are used, provided they are not a search-noise token. Otherwise
    the shortest variant descriptor (ties broken alphabetically) is returned,
    or ``""`` when there is none.
    """
    searchable = identity.searchable_name
    series_names = ("時尚潮流美甲片", "頂級璀燦美甲片", "薄型經典美甲片", "足部時尚潮流美甲片")

    for series in series_names:
        found = re.search(rf"{re.escape(series)}[-_ ]*([\u4e00-\u9fff]{{2,8}})", searchable)
        if found is None:
            continue
        candidate = _clean_search_phrase(found.group(1))
        squashed = candidate.replace(" ", "")
        if squashed and squashed not in SEARCH_NOISE_TOKENS:
            return candidate

    # Fallback: the smallest descriptor by (length, text).
    return min(
        _variant_descriptors(identity),
        key=lambda descriptor: (len(descriptor), descriptor),
        default="",
    )
|
||
|
||
|
||
def build_search_terms(name: str, max_terms: int = 3) -> list[str]:
    """Build an ordered list of search queries for a product name.

    The name is parsed into a product identity, from which a brand phrase,
    spec terms, ranked core phrases, variant phrases and model-like phrases
    are derived. Candidate queries are then tried from most to least
    specific; each one is cleaned, truncated to 42 characters and added
    unless an identical term is already present.

    Args:
        name: Raw product listing name.
        max_terms: Maximum number of queries to return. Non-positive values
            yield an empty list.

    Returns:
        Unique search queries in priority order, at most ``max_terms`` long.
    """
    if max_terms <= 0:
        return []

    max_term_length = 42

    identity = parse_product_identity(name)
    searchable = identity.searchable_name
    brand_tokens = identity.brand_tokens
    is_dashing_diva_nail_line = {"dashing", "diva"} <= brand_tokens and "美甲片" in searchable

    def join_parts(*parts: str) -> str:
        """Join the non-empty parts with single spaces."""
        return " ".join(part for part in parts if part)

    def best_latin_brand(min_length: int) -> str:
        """Return the preferred Latin brand token of at least ``min_length`` chars.

        Tokens containing a space or hyphen sort first, then longer tokens,
        then alphabetical order.
        """
        ranked = sorted(
            (
                token for token in brand_tokens
                if re.search(r"[a-z]", token) and len(token) >= min_length and token not in GENERIC_TOKENS
            ),
            key=lambda token: (" " not in token and "-" not in token, -len(token), token),
        )
        return ranked[0] if ranked else ""

    def primary_brand_phrase() -> str:
        """Choose the one brand phrase to put in front of every query."""
        # Brands that are tokenized into several pieces get a fixed phrase.
        for required_tokens, brand_phrase in (
            ({"dashing", "diva"}, "dashing diva"),
            ({"rom", "nd"}, "romand"),
            ({"im", "meme"}, "im meme"),
            ({"recipe", "box"}, "recipe box"),
        ):
            if required_tokens <= brand_tokens:
                return brand_phrase
        # Otherwise the longest Chinese brand token wins.
        chinese = sorted(
            (token for token in brand_tokens if re.search(r"[\u4e00-\u9fff]", token)),
            key=lambda token: (-len(token), token),
        )
        if chinese:
            return chinese[0]
        # Latin tokens of 3+ characters are preferred over 2-character ones.
        return best_latin_brand(3) or best_latin_brand(2)

    brand_part = primary_brand_phrase()
    spec_terms = _search_spec_terms(identity)
    spec_part = " ".join(spec_terms)

    core_phrases = _ranked_search_core_phrases(identity, limit=4)
    full_name_anchor_phrases = _extract_anchor_phrases(name)
    if full_name_anchor_phrases:
        # Anchors found in the full name lead; dict.fromkeys de-duplicates
        # while preserving order.
        core_phrases = list(dict.fromkeys(full_name_anchor_phrases + core_phrases))
    core_short = " ".join(core_phrases[:2])
    core_primary = core_phrases[0] if core_phrases else ""

    product_type_aliases = set(PRODUCT_TYPES.get(identity.product_type or "", ()))
    # Secondary Chinese phrases that add detail beyond the product type.
    chinese_detail_phrases = [
        phrase
        for phrase in core_phrases[1:]
        if re.search(r"[\u4e00-\u9fff]", phrase)
        and phrase != core_primary
        and phrase != (identity.product_type or "")
        and phrase not in SEARCH_BROAD_ANCHORS
        and not any(phrase == alias or phrase in alias or alias in phrase for alias in product_type_aliases)
    ]
    modifier_with_primary = join_parts(
        chinese_detail_phrases[0] if chinese_detail_phrases else "",
        core_primary,
    )

    variant_primary = _variant_primary_phrase(identity)
    variant_options = sorted(
        (token for token in _explicit_variant_option_tokens(identity) if token != "0"),
        key=lambda token: (len(token), token),
    )
    variant_option_part = " ".join(variant_options[:2])

    # Secondary phrases that look like model numbers or Latin series names.
    model_phrases = [
        phrase
        for phrase in core_phrases[1:]
        if re.fullmatch(r"[a-z]*\d+[a-z0-9-]*", phrase)
        or re.fullmatch(r"[a-z][a-z0-9-]{2,}", phrase)
    ]
    if "護甲油" in searchable:
        # For names mentioning 護甲油, "top"/"coat" and "ist<digits>" are not
        # treated as model identifiers.
        model_phrases = [
            phrase for phrase in model_phrases
            if phrase.lower() not in {"top", "coat"} and not re.fullmatch(r"ist\d+", phrase, re.I)
        ]
    primary_with_model = join_parts(core_primary, model_phrases[0] if model_phrases else "")

    variant_sensitive = any(keyword in searchable for keyword in VARIANT_SENSITIVE_KEYWORDS)
    # A spec term mixing letters and digits that is not a plain quantity.
    model_like_spec = any(
        re.search(r"[a-z]", term)
        and re.search(r"\d", term)
        and not re.fullmatch(r"\d+(?:\.\d+)?(?:ml|g|mg|kg|l)", term, re.I)
        for term in spec_terms
    )
    no_model_hint = not model_phrases and not model_like_spec
    prefer_variant_search = (
        variant_sensitive
        and bool(variant_primary)
        and no_model_hint
        and not variant_options
        and "護甲油" not in searchable
        and any(
            term in searchable
            for term in ("護手霜", "芬香", "香氛", "香味", "擴香", "精油", "指甲油", "指彩")
        )
    )

    brand_variant_query = join_parts(brand_part, core_primary, variant_primary, spec_part)

    # Candidates from most to least specific; "" means "not applicable".
    candidates = (
        brand_variant_query
        if is_dashing_diva_nail_line and variant_sensitive and variant_primary
        else "",
        join_parts(brand_part, core_primary, variant_option_part, spec_part)
        if variant_sensitive and variant_option_part and no_model_hint
        else "",
        brand_variant_query if prefer_variant_search else "",
        join_parts(brand_part, primary_with_model, spec_part)
        if primary_with_model and model_phrases
        else "",
        join_parts(brand_part, modifier_with_primary, spec_part)
        if modifier_with_primary and identity.product_type and identity.product_type in core_primary
        else "",
        join_parts(brand_part, core_primary, spec_part)
        if variant_sensitive and core_primary and not variant_options
        else "",
        brand_variant_query
        if variant_sensitive and variant_primary and variant_options and no_model_hint
        else "",
        join_parts(brand_part, primary_with_model, spec_part),
        join_parts(brand_part, core_short, spec_part),
        join_parts(brand_part, core_short),
        join_parts(core_primary, spec_part),
        searchable,
    )

    terms: list[str] = []
    for value in candidates:
        # Truncate BEFORE the duplicate check: comparing the untruncated text
        # against already-truncated terms let long queries appear twice.
        # rstrip removes a space left dangling by the cut.
        term = _clean_search_phrase(value)[:max_term_length].rstrip()
        if term and term not in terms:
            terms.append(term)
        if len(terms) >= max_terms:
            break

    return terms
|