強化 PChome 商品身份比對防錯配
All checks were successful
CD Pipeline / deploy (push) Successful in 1m4s

This commit is contained in:
OoO
2026-05-20 11:07:05 +08:00
parent f7772cc465
commit 93f1a558a5
6 changed files with 233 additions and 28 deletions

View File

@@ -4,6 +4,7 @@
================================================================================
【已完成】
- V10.312 強化 PChome 商品身份比對防錯配:matcher 開始解析 mg/mcg 劑量、件組套組與多規格集合;60ml+150ml vs 60ml+20ml、10mg vs 20mg、10片 vs 10盒、精華 vs 化妝水都會進硬否決或單位價覆核,不再靠單一規格重疊放行;覆核診斷同步新增「劑量差異」標籤,降低核心比價錯配污染 daily/growth/PPT/AI 分析。
- V10.311 統一競品價差語意:`fetch_competitor_comparison_results()`、competitor PPT 與 OpenClaw competitor prompt 全部改用 `MOMO - PChome`,正值代表 MOMO 較貴 / PChome 低價壓力,負值代表 MOMO 價格優勢;避免 daily/growth 顯示價格壓力但 PPT/AI 反向解讀。
- V10.310 強化 MOMO/PChome 核心比價閉環:PChome feeder 搜尋候選只有強同款 `0.90` 才提前停止,避免第一個 0.76 次佳候選卡掉後續精準搜尋詞;人工否決的候選會被跳過並改挑下一個候選,不再讓已否決商品長期阻塞同 SKU。人工 `reject_identity`、`unit_price_required`、`needs_research` 會立即讓同候選正式 `competitor_prices` 過期;Dashboard 即使尚有舊價也不再顯示正式總價差;手機版比價覆核欄位標籤、覆核按鈕冒泡與候選證據顯示同步修正。
- V10.308 修正商品列表 PChome 比價閉環狀態:`manual_rejected`、`manual_unit_price_required`、`manual_needs_research` 不再掉回籠統「待比對」,改顯示「人工已否決 / 人工標記單位價 / 人工要求補搜尋」與後續 feeder 行為說明,避免人工覆核後 UI 看起來像沒有處理。

View File

@@ -320,7 +320,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '')
# ==========================================
# 系統版本與路徑
# ==========================================
SYSTEM_VERSION = "V10.311"
SYSTEM_VERSION = "V10.312"
LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log')
public_url = PUBLIC_URL # 用於模板顯示

View File

@@ -605,3 +605,4 @@ POSTGRES_HOST=momo-db
| 2026-04-17 | 188 容器無 volume mount,`docker cp` 臨時解 | 重建 image,`COPY . .` bake 進新代碼;port 5001 衝突記錄為技術債 |
| 2026-04-17 | 188 .env Telegram token 不正確(split-brain)| 修正為 `8610496165`;188→Telegram message_id=282 確認 |
| 2026-04-17 | NIM Tool Calling E2E | 真實 NVIDIA_API_KEY 驗證:dispatched=3, errors=[] |
| 2026-05-20 | PChome 商品身份比對仍可能因單一規格重疊誤放行 | V10.312 起 matcher 解析 mg/mcg 劑量、件組套組、多規格集合與同數字不同單位;劑量/容量/重量/件數/品類衝突會硬否決或導向單位價覆核,避免錯配污染 Dashboard、daily/growth、PPT 與 AI 競價分析 |

View File

@@ -79,6 +79,7 @@ MATCH_DIAGNOSTIC_REASON_LABELS = {
"type_conflict": "品類不符",
"volume_conflict": "容量差異",
"weight_conflict": "重量差異",
"dosage_conflict": "劑量差異",
"count_conflict": "件數差異",
"component_count_conflict": "入數差異",
"multi_component_conflict": "組合差異",

View File

@@ -85,8 +85,10 @@ GENERIC_TOKENS = {
"",
"",
"",
"",
"ml",
"g",
"mg",
"la",
"paris",
}
@@ -107,9 +109,50 @@ PRODUCT_TYPES = {
"保健": ("", "膠囊", "", "", "", "健康食品"),
}
COUNT_UNITS = {"", "", "", "", "", "", "", "", "", "", ""}
COUNT_UNITS = {"", "", "", "", "", "", "", "", "", "", "", ""}
PIECE_UNITS = {"", "", "", "", ""}
CONTAINER_UNITS = {"", "", ""}
BUNDLE_OFFER_PHRASES = (
"囤貨組",
"超值組",
"特惠組",
"優惠組",
"優惠套組",
"禮盒組",
"加大組",
"加量組",
"分享組",
"明星組",
"套組",
"組合",
"組合包",
"雙件組",
"二件組",
"2件組",
"家庭組",
"多入組",
)
NON_BRAND_BRACKET_PHRASES = (
"保濕組",
"熱銷款",
"限定",
"特惠",
"優惠",
"超值",
"囤貨",
"組合",
"套組",
"禮盒",
"分享",
"雙件",
"二件",
"2件",
"家庭",
"多入",
"任選",
"",
"母親節",
)
CHINESE_COUNT = {
"": 1,
"": 2,
@@ -137,6 +180,7 @@ class ProductIdentity:
core_tokens: frozenset[str]
volumes_ml: tuple[float, ...]
weights_g: tuple[float, ...]
dosages_mg: tuple[float, ...]
counts: tuple[tuple[int, str], ...]
total_piece_count: Optional[int]
@@ -261,9 +305,10 @@ def _leading_brand_tokens(original: str, normalized: str) -> set[str]:
bracket_match = re.match(r"\s*[【\[]([^】\]]{2,40})[】\]]", original or "")
if bracket_match:
content = normalize_product_text(bracket_match.group(1))
for token in _tokenize(_strip_noise(content)):
if token not in GENERIC_TOKENS:
tokens.add(token)
if not any(phrase in content for phrase in NON_BRAND_BRACKET_PHRASES):
for token in _tokenize(_strip_noise(content)):
if token not in GENERIC_TOKENS:
tokens.add(token)
leading = normalized[:48]
for token in _tokenize(leading):
@@ -293,28 +338,37 @@ def _convert_volume(value: str, unit: str) -> Optional[tuple[str, float]]:
return ("g", number)
if unit == "kg":
return ("g", number * 1000)
if unit in {"mg", "毫克"}:
return ("mg", number)
if unit in {"mcg", "μg", "ug", "微克"}:
return ("mg", number / 1000)
return None
def _extract_specs(text: str) -> tuple[tuple[float, ...], tuple[float, ...], tuple[tuple[int, str], ...], Optional[int]]:
def _extract_specs(
text: str,
) -> tuple[tuple[float, ...], tuple[float, ...], tuple[float, ...], tuple[tuple[int, str], ...], Optional[int]]:
volumes_ml: list[float] = []
weights_g: list[float] = []
for match in re.finditer(r"(\d+(?:\.\d+)?)\s*(ml|毫升|l|g|公克|kg)", text, re.I):
dosages_mg: list[float] = []
for match in re.finditer(r"(\d+(?:\.\d+)?)\s*(ml|毫升|l|g|公克|kg|mg|毫克|mcg|μg|ug|微克)", text, re.I):
converted = _convert_volume(match.group(1), match.group(2))
if not converted:
continue
unit, number = converted
if unit == "ml":
volumes_ml.append(number)
else:
elif unit == "g":
weights_g.append(number)
else:
dosages_mg.append(number)
counts: list[tuple[int, str]] = []
for match in re.finditer(r"(\d+)\s*([入組瓶支條盒包片顆錠枚])", text):
for match in re.finditer(r"(\d+)\s*([入組瓶支條盒包片顆錠枚])", text):
counts.append((int(match.group(1)), match.group(2)))
for match in re.finditer(r"([一二兩雙三四五六七八九十])\s*([入組瓶支條盒包片顆錠枚])", text):
for match in re.finditer(r"([一二兩雙三四五六七八九十])\s*([入組瓶支條盒包片顆錠枚])", text):
counts.append((CHINESE_COUNT[match.group(1)], match.group(2)))
for match in re.finditer(r"(?:x|乘)\s*(\d+)\s*([入組瓶支條盒包片顆錠枚])?", text, re.I):
for match in re.finditer(r"(?:x|乘)\s*(\d+)\s*([入組瓶支條盒包片顆錠枚])?", text, re.I):
unit = match.group(2) or ""
counts.append((int(match.group(1)), unit))
@@ -334,6 +388,7 @@ def _extract_specs(text: str) -> tuple[tuple[float, ...], tuple[float, ...], tup
return (
tuple(sorted(set(volumes_ml))),
tuple(sorted(set(weights_g))),
tuple(sorted(set(dosages_mg))),
unique_counts,
total_piece_count,
)
@@ -351,11 +406,11 @@ def parse_product_identity(name: str) -> ProductIdentity:
for token in tokens
if token not in GENERIC_TOKENS
and not token.isdigit()
and not re.fullmatch(r"\d+(ml|g|kg|l)?", token)
and not re.fullmatch(r"\d+(ml|g|kg|l|mg|mcg|ug)?", token)
}
core_tokens -= brand_tokens
volumes_ml, weights_g, counts, total_piece_count = _extract_specs(searchable)
volumes_ml, weights_g, dosages_mg, counts, total_piece_count = _extract_specs(searchable)
return ProductIdentity(
original_name=name or "",
normalized_name=normalized,
@@ -366,6 +421,7 @@ def parse_product_identity(name: str) -> ProductIdentity:
core_tokens=frozenset(core_tokens),
volumes_ml=volumes_ml,
weights_g=weights_g,
dosages_mg=dosages_mg,
counts=counts,
total_piece_count=total_piece_count,
)
@@ -408,18 +464,58 @@ def _close_number(left: float, right: float, tolerance: float = 0.04) -> bool:
def _spec_component(left_values: Iterable[float], right_values: Iterable[float]) -> tuple[float, bool]:
left_tuple = tuple(left_values)
right_tuple = tuple(right_values)
left_tuple = tuple(sorted(set(left_values)))
right_tuple = tuple(sorted(set(right_values)))
if not left_tuple and not right_tuple:
return 0.55, False
if not left_tuple or not right_tuple:
return 0.45, False
if len(left_tuple) > 1 or len(right_tuple) > 1:
if len(left_tuple) != len(right_tuple):
return 0.0, True
unmatched = list(right_tuple)
for left_value in left_tuple:
match_index = next(
(
index
for index, right_value in enumerate(unmatched)
if _close_number(left_value, right_value)
),
None,
)
if match_index is None:
return 0.0, True
unmatched.pop(match_index)
return 1.0, False
for left_value in left_tuple:
if any(_close_number(left_value, right_value) for right_value in right_tuple):
return 1.0, False
return 0.0, True
def _has_hard_count_unit_conflict(left: ProductIdentity, right: ProductIdentity) -> bool:
    """Report whether both sides state the same count in incompatible unit families.

    For every count number that appears on both sides, the units attached to
    that number are compared. Sharing at least one unit means no conflict for
    that number. Otherwise it is a conflict when one side's units intersect
    ``PIECE_UNITS`` while the other side's units intersect ``CONTAINER_UNITS``.

    Returns ``False`` when either identity has no counts at all.
    """
    if not (left.counts and right.counts):
        return False

    def _units_by_number(pairs) -> dict[int, set[str]]:
        # Group the unit strings under the count number they were attached to.
        grouped: dict[int, set[str]] = {}
        for number, unit in pairs:
            grouped.setdefault(number, set()).add(unit)
        return grouped

    left_units_for = _units_by_number(left.counts)
    right_units_for = _units_by_number(right.counts)

    for number in left_units_for.keys() & right_units_for.keys():
        mine = left_units_for[number]
        theirs = right_units_for[number]
        if mine & theirs:
            # Same number with a shared unit: nothing to veto for this number.
            continue
        piece_vs_container = bool(mine & PIECE_UNITS) and bool(theirs & CONTAINER_UNITS)
        container_vs_piece = bool(theirs & PIECE_UNITS) and bool(mine & CONTAINER_UNITS)
        if piece_vs_container or container_vs_piece:
            return True
    return False
def _count_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, bool]:
left_counts = [count for count, _unit in left.counts]
right_counts = [count for count, _unit in right.counts]
@@ -431,6 +527,8 @@ def _count_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float,
if left.counts and right.counts:
if set(left.counts) & set(right.counts):
return 0.85, False
if _has_hard_count_unit_conflict(left, right):
return 0.0, True
if left_counts and right_counts:
ratio = max(max(left_counts), max(right_counts)) / max(min(max(left_counts), max(right_counts)), 1)
if ratio >= 1.5:
@@ -444,6 +542,7 @@ def _count_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float,
def _spec_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, bool, tuple[str, ...]]:
volume_score, volume_conflict = _spec_component(left.volumes_ml, right.volumes_ml)
weight_score, weight_conflict = _spec_component(left.weights_g, right.weights_g)
dosage_score, dosage_conflict = _spec_component(left.dosages_mg, right.dosages_mg)
count_score, count_conflict = _count_score(left, right)
available = []
@@ -451,6 +550,8 @@ def _spec_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, b
available.append(volume_score)
if left.weights_g or right.weights_g:
available.append(weight_score)
if left.dosages_mg or right.dosages_mg:
available.append(dosage_score)
if left.counts or right.counts:
available.append(count_score)
if not available:
@@ -462,6 +563,8 @@ def _spec_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, b
conflicts.append("volume_conflict")
if weight_conflict:
conflicts.append("weight_conflict")
if dosage_conflict:
conflicts.append("dosage_conflict")
if count_conflict:
conflicts.append("count_conflict")
return score, bool(conflicts), tuple(conflicts)
@@ -473,16 +576,7 @@ def _has_bundle_offer(identity: ProductIdentity) -> bool:
re.search(r"\s*\d+\s*送\s*\d+", text)
or re.search(r"\s*[一二兩雙三四五六七八九十]\s*送\s*[一二兩雙三四五六七八九十]", text)
or "買一送一" in text
or "囤貨組" in text
or "超值組" in text
or "特惠組" in text
or "優惠套組" in text
or "禮盒組" in text
or "加大組" in text
or "加量組" in text
or "分享組" in text
or "明星組" in text
or "套組" in text
or any(phrase in text for phrase in BUNDLE_OFFER_PHRASES)
)
@@ -491,7 +585,7 @@ def _has_multi_component(identity: ProductIdentity) -> bool:
return bool(
"+" in text
or "" in text
or re.search(r"\d+\s*(?:ml|g|毫升|公克)\s*x\s*\d+", text, re.I)
or re.search(r"\d+\s*(?:ml|g|mg|毫升|公克|毫克)\s*x\s*\d+", text, re.I)
)
@@ -507,7 +601,13 @@ def _has_refill_pack(identity: ProductIdentity) -> bool:
def _spec_mention_count(identity: ProductIdentity) -> int:
    """Count the number-plus-unit spec mentions in the normalized product name.

    A mention is a number (optionally with a decimal part) followed by one of
    the volume, weight or dosage units in the pattern below, matched
    case-insensitively. Dosage units (mg / mcg / μg / ug and their Chinese
    spellings) are included in the count.
    """
    # Only the extended pattern is kept: an earlier return using the shorter
    # volume/weight-only pattern would make this one unreachable and leave
    # dosage mentions uncounted.
    return len(
        re.findall(
            r"\d+(?:\.\d+)?\s*(?:ml|毫升|l|g|公克|kg|mg|毫克|mcg|μg|ug|微克)",
            identity.normalized_name,
            re.I,
        )
    )
def _count_text_value(value: str) -> Optional[int]:
@@ -526,6 +626,11 @@ def _pack_multiplier(identity: ProductIdentity) -> int:
return left + right
if "買一送一" in text or "買1送1" in text:
return 2
piece_pack = re.search(r"(\d+|[一二兩雙三四五六七八九十])\s*件\s*組", text)
if piece_pack:
count = _count_text_value(piece_pack.group(1)) or 0
if count > 1:
return count
multipliers = [count for count, unit in identity.counts if unit in COUNT_UNITS and count > 1]
if multipliers:
@@ -764,7 +869,7 @@ def score_marketplace_match(
hard_veto = True
if chinese_name_score < 0.16 and token_score < 0.72:
hard_veto = True
if left.product_type and right.product_type and left.product_type != right.product_type and token_score < 0.55:
if left.product_type and right.product_type and left.product_type != right.product_type:
hard_veto = True
comparison_mode = "exact_identity"
@@ -845,6 +950,10 @@ def build_search_terms(name: str, max_terms: int = 3) -> list[str]:
specs.append(f"{int(identity.volumes_ml[0])}ml")
if identity.weights_g:
specs.append(f"{int(identity.weights_g[0])}g")
if identity.dosages_mg:
dosage = identity.dosages_mg[0]
dosage_label = f"{int(dosage)}mg" if dosage.is_integer() else f"{dosage:g}mg"
specs.append(dosage_label)
if identity.total_piece_count:
specs.append(f"{identity.total_piece_count}")

View File

@@ -167,6 +167,99 @@ def test_marketplace_matcher_does_not_promote_wide_price_refill_candidate():
assert "strong_product_line_match" not in diagnostics.reasons
def test_marketplace_matcher_rejects_partial_overlap_in_multi_spec_set():
    """Two-volume sets that share only one volume must be vetoed as not comparable."""
    from services.marketplace_product_matcher import score_marketplace_match

    momo_name = "【蘭蔻】玫瑰霜60ml+玫瑰精露150ml"
    competitor_name = "【蘭蔻】玫瑰霜60ml+玫瑰精露20ml"

    result = score_marketplace_match(
        momo_name,
        competitor_name,
        momo_price=18765,
        competitor_price=7999,
    )

    assert result.score < 0.76
    assert result.hard_veto is True
    assert "volume_conflict" in result.reasons
    assert result.comparison_mode == "not_comparable"
def test_marketplace_matcher_rejects_dosage_conflict():
    """Names that differ only in dosage (10mg vs 20mg) must be hard-vetoed."""
    from services.marketplace_product_matcher import score_marketplace_match

    momo_name = "品牌 葉黃素 10mg 60錠"
    competitor_name = "品牌 葉黃素 20mg 60錠"

    result = score_marketplace_match(
        momo_name,
        competitor_name,
        momo_price=990,
        competitor_price=890,
    )

    assert result.score < 0.76
    assert result.hard_veto is True
    assert "dosage_conflict" in result.reasons
def test_marketplace_matcher_rejects_product_type_conflict_even_when_line_matches():
    """Same brand, line and volume but a different product type must be vetoed."""
    from services.marketplace_product_matcher import score_marketplace_match

    momo_name = "理膚寶水 MelaB3 淡斑精華 30ml"
    competitor_name = "理膚寶水 MelaB3 淡斑化妝水 30ml"

    result = score_marketplace_match(
        momo_name,
        competitor_name,
        momo_price=1280,
        competitor_price=1180,
    )

    assert result.score < 0.76
    assert result.hard_veto is True
    assert "type_conflict" in result.reasons
    assert result.comparison_mode == "not_comparable"
def test_marketplace_matcher_rejects_same_count_different_unit_family():
    """The same number with different count units (10片 vs 10盒) must be a count conflict."""
    from services.marketplace_product_matcher import score_marketplace_match

    momo_name = "品牌 保濕面膜10片"
    competitor_name = "品牌 保濕面膜10盒"

    result = score_marketplace_match(
        momo_name,
        competitor_name,
        momo_price=399,
        competitor_price=1990,
    )

    assert result.score < 0.76
    assert result.hard_veto is True
    assert "count_conflict" in result.reasons
def test_marketplace_matcher_marks_generic_bundle_words_as_unit_comparable():
    """A bundle wording on one side only must veto the match but stay unit-comparable."""
    from services.marketplace_product_matcher import score_marketplace_match

    momo_name = "【NARS】裸光蜜粉餅 雙件組 10g"
    competitor_name = "【NARS】裸光蜜粉餅 10g"

    result = score_marketplace_match(
        momo_name,
        competitor_name,
        momo_price=1999,
        competitor_price=1099,
    )

    assert result.score < 0.76
    assert result.hard_veto is True
    assert "bundle_offer_conflict" in result.reasons
    assert result.comparison_mode == "unit_comparable"
def test_marketplace_matcher_ignores_non_brand_bracket_copy():
    """A leading bracket holding promo copy, not a brand, must not cause a brand conflict."""
    from services.marketplace_product_matcher import score_marketplace_match

    momo_name = "【保濕組】理膚寶水 B5 修復霜 40ml"
    competitor_name = "理膚寶水 B5 修復霜 40ml"

    result = score_marketplace_match(
        momo_name,
        competitor_name,
        momo_price=699,
        competitor_price=679,
    )

    assert result.score >= 0.76
    assert result.hard_veto is False
    assert "brand_conflict" not in result.reasons
def test_batch_compare_top_uses_latest_momo_price_not_revenue(monkeypatch):
from services import pchome_crawler