[V10.343] 強化 PChome 商品搜尋召回
All checks were successful
CD Pipeline / deploy (push) Successful in 1m5s

This commit is contained in:
OoO
2026-05-20 16:21:19 +08:00
parent 48e46e35c0
commit 193b6e53c5
8 changed files with 356 additions and 29 deletions

View File

@@ -320,7 +320,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '')
# ==========================================
# 系統版本與路徑
# ==========================================
SYSTEM_VERSION = "V10.342"
SYSTEM_VERSION = "V10.343"
LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log')
public_url = PUBLIC_URL # 用於模板顯示

View File

@@ -26,6 +26,7 @@
- 2026-05-20 追記:同步背景 PChome identity / price direction 更新後的 `services/marketplace_product_matcher.py` 行數;此處只更新 inventory,不變更商品比對行為。
- 2026-05-20 追記:同步背景 PChome crawler 搜尋韌性擴充後的 `services/pchome_crawler.py` 行數;此處只更新 inventory,不變更 PChome crawler 行為。
- 2026-05-20 追記:同步 PChome 近門檻候選重評與 matcher 系列/刀片數防錯配更新後的 `services/marketplace_product_matcher.py`、`services/competitor_price_feeder.py` 行數;此處只更新 inventory,不變更比價行為。
- 2026-05-20 追記:同步 PChome 搜尋詞品質層、候選召回與 hard-veto 狀態分流更新後的 `services/marketplace_product_matcher.py`、`services/competitor_price_feeder.py` 行數;並補列背景市場情報 deployment readiness 大檔,僅更新 inventory。
## 達到或超過 800 行檔案清單
@@ -52,16 +53,17 @@
| 940 | `services/import_service.py` | P2 import service | validators / import writers / report builders |
| 933 | `services/telegram_templates.py` | P2 Telegram templates | alert template groups / channel-specific formatting / reusable render helpers |
| 867 | `services/token_report_service.py` | P2 token report service | query / aggregation / chart payload / notification formatting |
| 1128 | `services/marketplace_product_matcher.py` | P2 marketplace matcher | identity parsing / unit-comparable scoring / persistence normalization |
| 1356 | `services/marketplace_product_matcher.py` | P2 marketplace matcher | identity parsing / unit-comparable scoring / search term quality / persistence normalization |
| 865 | `routes/daily_sales_routes.py` | P2 Daily Sales Blueprint | route glue / export helpers / daily query and formatting service |
| 844 | `services/ollama_service.py` | P2 Ollama client | host health / request client / fallback policy / response parsing |
| 849 | `services/pchome_crawler.py` | P2 PChome crawler | search fetch / parsing / fallback source handling / rate limit policy |
| 1042 | `services/code_review_pipeline_service.py` | P2 Code review pipeline service | scan orchestration / finding normalization / persistence adapter |
| 953 | `routes/export_routes.py` | P2 Export flow | export command/router glue / file path / download orchestration |
| 816 | `services/ppt_vision_service.py` | P2 PPT vision QA service | runtime state / queue status / model probe / audit execution 分離 |
| 1592 | `services/competitor_price_feeder.py` | P2 competitor price feeder | crawler scheduling / price normalization / cache strategy |
| 1602 | `services/competitor_price_feeder.py` | P2 competitor price feeder | crawler scheduling / price normalization / cache strategy |
| 1120 | `services/competitor_intel_repository.py` | P2 competitor intel repository | review queue query / cache shaping / formatting helpers |
| 805 | `routes/bot_api_routes.py` | P2 Bot API Blueprint | route glue / bot action service |
| 811 | `services/market_intel/deployment_readiness.py` | P2 market intel deployment readiness | preflight gates / readiness payload / route contract helpers |
## 市場情報開發前置禁區

View File

@@ -144,7 +144,13 @@ def _build_search_keywords(momo_name: str) -> list:
cleaned = _clean_search_text(momo_name)
terms = [cleaned[:36], cleaned[:24]]
return _dedupe_terms(terms)
primary_terms = _dedupe_terms(terms[: max(1, MAX_SEARCH_TERMS - 1)])
original_terms = _dedupe_terms([momo_name])
for term in original_terms:
if term.lower() not in {existing.lower() for existing in primary_terms}:
primary_terms.append(term)
break
return _dedupe_terms(primary_terms)
def _format_match_diagnostics(diagnostics) -> str:
@@ -252,8 +258,9 @@ def _search_pchome_candidates(crawler, momo_name: str, keywords: list = None, mo
"""以多組搜尋詞擴大 PChome 候選池,只在強同款時提前停止。"""
candidates = []
seen_ids = set()
search_limit = SEARCH_LIMIT * max(1, SEARCH_MAX_PAGES)
for keyword in keywords or _build_search_keywords(momo_name):
ok, _, products = crawler.search_products(keyword, limit=SEARCH_LIMIT, max_pages=SEARCH_MAX_PAGES)
ok, _, products = crawler.search_products(keyword, limit=search_limit, max_pages=SEARCH_MAX_PAGES)
if not ok or not products:
continue
for product in products:
@@ -1154,6 +1161,7 @@ class CompetitorPriceFeeder:
continue
if score < MIN_MATCH_SCORE and not manual_accept_override:
attempt_status = "identity_veto" if getattr(diagnostics, "hard_veto", False) else "low_score"
logger.debug(
f"[Feeder] {sku} 比對分數過低 ({score:.3f} < {MIN_MATCH_SCORE})"
f"{_format_match_diagnostics(diagnostics)}"
@@ -1165,7 +1173,7 @@ class CompetitorPriceFeeder:
momo_price=momo_price,
search_terms=search_terms,
candidate_count=len(products),
attempt_status="low_score",
attempt_status=attempt_status,
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
@@ -1382,6 +1390,7 @@ class CompetitorPriceFeeder:
continue
if score < MIN_MATCH_SCORE:
attempt_status = "identity_veto" if getattr(diagnostics, "hard_veto", False) else "refresh_low_score"
self._record_match_attempt(
sku,
momo_name,
@@ -1389,7 +1398,7 @@ class CompetitorPriceFeeder:
momo_price=momo_price,
search_terms=search_terms,
candidate_count=1,
attempt_status="refresh_low_score",
attempt_status=attempt_status,
best_product=best_product,
best_score=score,
diagnostics=diagnostics,

View File

@@ -111,6 +111,96 @@ GENERIC_TOKENS = {
"美國",
}
# Multi-character marketing / option / gift phrases that carry no product identity.
# _clean_search_phrase() replaces every occurrence with a space *before* the text
# is tokenised, so these match as substrings anywhere in the normalised name.
# Declaration order is not significant: the consumer sorts by length (longest
# first), so e.g. "官方直營" is removed before the shorter "官方".
SEARCH_NOISE_PHRASES = (
"新品上市",
"全新上市",
"任選一款",
"任選1款",
"任選一色",
"任選1色",
"多款任選",
"多款可選",
"色號可選",
"香味可選",
"口味可選",
"送精美紙袋",
"精美紙袋",
"交換禮物",
"聖誕禮物",
"母親節",
"父親節",
"情人節",
"外出清潔",
"卸除髒汙",
"卸除防曬",
"卸防曬",
"韓國彩妝",
"水光感",
"官方直營",
"官方",
)
# Whole tokens that are dropped from a search phrase after whitespace tokenisation
# (see _clean_search_phrase). The same set is also used by the scoring helpers:
# a candidate whose compact form equals one of these is rejected outright, and a
# candidate that merely contains one as a substring receives a score penalty.
SEARCH_NOISE_TOKENS = {
"一款",
"1款",
"一色",
"1色",
"上市",
"全新",
"新品",
"香味",
"口味",
"味道",
"顏色",
"色號",
"紙袋",
"禮物",
"清潔",
"髒汙",
"防曬",
"彩妝",
"水光感",
}
# Product-type nouns that identify what an item actually is (e.g. lip jelly,
# massage oil, diffuser refill). _extract_anchor_phrases() scans a cleaned token
# for these in declaration order and builds search phrases around each hit.
# Order matters: that function skips a later phrase when it contains a phrase
# already collected, and the list places longer, more specific anchors before the
# shorter anchors they contain (身體按摩精油 -> 按摩精油 -> 精油, 精華液 -> 精華).
SEARCH_IDENTITY_ANCHORS = (
"免用水潔淨液",
"身體按摩精油",
"按摩精油",
"擴香補充瓶",
"擴香瓶",
"全面修復霜",
"修復霜",
"護膚膏",
"屁屁噴",
"身體乳",
"緊實乳",
"潔膚露",
"潔淨液",
"護甲油",
"指甲油",
"美甲片",
"唇凍",
"唇釉",
"唇膏",
"粉底棒",
"遮瑕棒",
"化妝水",
"精華液",
"精華",
"面膜",
"乳液",
"乳霜",
"面霜",
"精油",
"水氧機",
"香氛機",
)
# Terms that look like a product type but are ambiguous as a search key.
# _search_core_score() subtracts 80 from a token containing one of these when
# another core token yields an identity anchor, and _ranked_search_core_phrases()
# only accepts such a phrase when no other phrase has been selected yet.
SEARCH_AMBIGUOUS_PRODUCT_TERMS = {
"保護膜",
"保護貼",
}
BRAND_ALIAS_OVERRIDES = {
"clarins": ("克蘭詩", "clarins"),
"nars": ("nars",),
@@ -1099,6 +1189,123 @@ def score_marketplace_match(
)
def _clean_search_phrase(value: str) -> str:
    """Strip marketing noise from ``value`` and return a compact search phrase.

    Steps, in order:

    1. Normalise the text with ``normalize_product_text``.
    2. Blank out every ``SEARCH_NOISE_PHRASES`` entry, longest first, so a
       long phrase is removed before any shorter phrase it contains.
    3. Turn every run of characters that is neither a word character nor a
       CJK ideograph into a single space.
    4. Drop whitespace-separated tokens listed in ``SEARCH_NOISE_TOKENS`` or
       ``GENERIC_TOKENS``.

    Returns the surviving tokens joined by single spaces ("" if none remain).
    """
    normalized = normalize_product_text(value)

    noise_by_length = sorted(SEARCH_NOISE_PHRASES, key=len, reverse=True)
    for noise in noise_by_length:
        normalized = normalized.replace(noise.lower(), " ")

    normalized = re.sub(r"[^\w\u4e00-\u9fff]+", " ", normalized)

    kept_words = []
    for word in normalized.split():
        if word in SEARCH_NOISE_TOKENS or word in GENERIC_TOKENS:
            continue
        kept_words.append(word)

    return re.sub(r"\s+", " ", " ".join(kept_words)).strip()
def _search_spec_terms(identity: ProductIdentity) -> list[str]:
    """Build the spec fragments (volume, weight, dosage, piece count) of a search term.

    Only the first entry of each measurement list is used. Measurements are
    rendered with the ``g`` format (``100.0`` -> ``"100ml"``) and suffixed
    with their unit; the total piece count is appended as a bare number.
    Missing or falsy fields contribute nothing.
    """
    labelled_measures = (
        (identity.volumes_ml, "ml"),
        (identity.weights_g, "g"),
        (identity.dosages_mg, "mg"),
    )
    specs: list[str] = [
        f"{values[0]:g}{unit}" for values, unit in labelled_measures if values
    ]

    piece_count = identity.total_piece_count
    if piece_count:
        specs.append(f"{piece_count}")
    return specs
# Extract product-type phrases from `token`, built around SEARCH_IDENTITY_ANCHORS.
#
# The token is cleaned with _clean_search_phrase(); for every anchor found in the
# cleaned text a phrase is produced (the anchor plus, for short anchors, a few
# preceding CJK characters). Returns the phrases in anchor-declaration order, or
# [] when the cleaned token is empty or contains no anchor.
#
# NOTE(review): indentation was lost in this view, so block nesting is inferred —
# in particular whether the re-clean below runs for both branches or only the
# `else` branch should be confirmed against the real file.
def _extract_anchor_phrases(token: str) -> list[str]:
cleaned = _clean_search_phrase(token)
# Nothing left after noise removal -> no anchors to report.
if not cleaned:
return []
phrases: list[str] = []
for anchor in SEARCH_IDENTITY_ANCHORS:
if anchor not in cleaned:
continue
# Anchors containing a CJK ideograph may be widened with leading context.
if re.search(r"[\u4e00-\u9fff]", anchor):
# Leading CJK characters allowed in front of the anchor:
# anchor length >= 5 -> 0, length 3-4 -> up to 4, shorter -> up to 6.
prefix_width = 0 if len(anchor) >= 5 else (4 if len(anchor) >= 3 else 6)
match = re.search(rf"([\u4e00-\u9fff]{{0,{prefix_width}}}{re.escape(anchor)})", cleaned)
# `anchor in cleaned` was checked above, so the bare-anchor fallback is defensive.
phrase = match.group(1) if match else anchor
else:
phrase = anchor
# Re-clean the extracted phrase (the added prefix may itself contain noise).
phrase = _clean_search_phrase(phrase)
# Skip a phrase that merely extends one already collected from an earlier anchor.
if any(existing in phrase and existing != phrase for existing in phrases):
continue
# Keep phrases of at least two characters, without duplicates.
if len(phrase) >= 2 and phrase not in phrases:
phrases.append(phrase)
return phrases
# Score how useful `token` is as the core of a search term.
#
# Returns (score, -len(compact), cleaned), where `cleaned` is the token after
# _clean_search_phrase() and `compact` is `cleaned` without spaces. The tuple is
# used as a sort key by _ranked_search_core_phrases(), which sorts descending.
# `all_tokens` is the full candidate set; it is only consulted to detect whether
# some *other* token yields an identity anchor.
#
# NOTE(review): indentation was lost in this view. It is not certain whether the
# `else:` below pairs with `if anchors:` or with `if anchors[0] == compact:`,
# nor whether the length bonus/penalty is nested — confirm against the real file.
def _search_core_score(token: str, all_tokens: set[str]) -> tuple[int, int, str]:
cleaned = _clean_search_phrase(token)
# Token consists of noise only: lowest possible rank.
if not cleaned:
return (-999, 0, cleaned)
compact = cleaned.replace(" ", "")
# The whole token is a noise or generic word: near-lowest rank.
if compact in SEARCH_NOISE_TOKENS or compact in GENERIC_TOKENS:
return (-900, 0, cleaned)
score = 0
# +30 for a Latin run of 3+ characters starting with a letter.
if re.search(r"[a-z][a-z0-9-]{2,}", cleaned):
score += 30
# +12 when the token contains any digit.
if re.search(r"\d", cleaned):
score += 12
anchors = _extract_anchor_phrases(cleaned)
if anchors:
# +90 for containing an identity anchor, +8 more when the first extracted
# phrase is the entire token.
score += 90
if anchors[0] == compact:
score += 8
else:
# Bonus that shrinks with length and reaches 0 at 24 characters.
score += max(0, 24 - len(compact))
# Length shaping: +14 for <= 8 characters, -12 for >= 12 characters.
if len(compact) <= 8:
score += 14
elif len(compact) >= 12:
score -= 12
# True when any other token in the set yields at least one anchor phrase.
has_better_anchor = any(
other != token and _extract_anchor_phrases(other)
for other in all_tokens
)
# -80 for an ambiguous product term when another token carries an anchor.
if has_better_anchor and any(term in compact for term in SEARCH_AMBIGUOUS_PRODUCT_TERMS):
score -= 80
# -18 when a noise token appears as a substring of the compact form.
if any(noise in compact for noise in SEARCH_NOISE_TOKENS):
score -= 18
return (score, -len(compact), cleaned)
def _ranked_search_core_phrases(identity: ProductIdentity, limit: int = 4) -> list[str]:
    """Return up to ``limit`` search phrases from the identity's core tokens, best first.

    Core tokens (minus ``GENERIC_TOKENS``) are ranked by ``_search_core_score``.
    For each token, its anchor phrases are used when it has any; otherwise the
    cleaned token itself is used. A phrase is skipped when it is shorter than
    two characters, is a noise token, or contains an ambiguous product term
    while another phrase has already been selected.

    Args:
        identity: Parsed product identity; only ``core_tokens`` is read.
        limit: Maximum number of phrases to return.

    Returns:
        Distinct phrases in ranking order, at most ``limit`` long.
    """
    tokens = {token for token in identity.core_tokens if token not in GENERIC_TOKENS}

    # Score every token exactly once. Each scoring call scans the whole token
    # set, so scoring in the sort key and again in the loop doubled the work.
    scores = {token: _search_core_score(token, tokens) for token in tokens}

    # ``tokens`` is a set, whose iteration order for strings changes between
    # processes under hash randomisation. Using the token as the last key
    # component makes ties (equal score tuples) resolve deterministically.
    ranked_tokens = sorted(
        tokens,
        key=lambda token: (scores[token], token),
        reverse=True,
    )

    phrases: list[str] = []
    for token in ranked_tokens:
        # Scores of -900 / -999 mark noise-only tokens.
        if scores[token][0] < -100:
            continue
        candidates = _extract_anchor_phrases(token) or [_clean_search_phrase(token)]
        for phrase in candidates:
            compact = phrase.replace(" ", "")
            if len(compact) < 2 or compact in SEARCH_NOISE_TOKENS:
                continue
            # An ambiguous term may only lead the list, never follow a better phrase.
            if phrases and any(term in compact for term in SEARCH_AMBIGUOUS_PRODUCT_TERMS):
                continue
            if phrase not in phrases:
                phrases.append(phrase)
            if len(phrases) >= limit:
                return phrases
    return phrases
def build_search_terms(name: str, max_terms: int = 3) -> list[str]:
identity = parse_product_identity(name)
terms: list[str] = []
@@ -1120,30 +1327,27 @@ def build_search_terms(name: str, max_terms: int = 3) -> list[str]:
return latin[0] if latin else ""
brand_part = primary_brand_phrase()
core = " ".join(sorted(identity.core_tokens, key=lambda token: (-len(token), token))[:4])
specs = []
if identity.volumes_ml:
specs.append(f"{int(identity.volumes_ml[0])}ml")
if identity.weights_g:
specs.append(f"{int(identity.weights_g[0])}g")
if identity.dosages_mg:
dosage = identity.dosages_mg[0]
dosage_label = f"{int(dosage)}mg" if dosage.is_integer() else f"{dosage:g}mg"
specs.append(dosage_label)
if identity.total_piece_count:
specs.append(f"{identity.total_piece_count}")
spec_part = " ".join(specs)
core_tokens = sorted(identity.core_tokens, key=lambda token: (-len(token), token))
core_short = " ".join(core_tokens[:2])
spec_part = " ".join(_search_spec_terms(identity))
core_phrases = _ranked_search_core_phrases(identity, limit=4)
core_short = " ".join(core_phrases[:2])
core_primary = core_phrases[0] if core_phrases else ""
model_phrases = [
phrase
for phrase in core_phrases[1:]
if re.fullmatch(r"[a-z]*\d+[a-z0-9-]*", phrase)
or re.fullmatch(r"[a-z][a-z0-9-]{2,}", phrase)
]
primary_with_model = " ".join(
part for part in (core_primary, model_phrases[0] if model_phrases else "") if part
)
for value in (
" ".join(part for part in (brand_part, primary_with_model, spec_part) if part),
" ".join(part for part in (brand_part, core_short, spec_part) if part),
" ".join(part for part in (brand_part, core_short) if part),
" ".join(part for part in (core_short, spec_part) if part),
" ".join(part for part in (core_primary, spec_part) if part),
identity.searchable_name,
):
cleaned = re.sub(r"[^\w\u4e00-\u9fff]+", " ", value)
cleaned = re.sub(r"\s+", " ", cleaned).strip()
cleaned = _clean_search_phrase(value)
if cleaned and cleaned not in terms:
terms.append(cleaned[:42])
if len(terms) >= max_terms:

View File

@@ -18,7 +18,8 @@ def test_competitor_feeder_persists_all_match_attempt_outcomes():
assert "INSERT INTO competitor_match_attempts" in source
assert "CAST(:search_terms AS jsonb)" in source
assert 'attempt_status="matched"' in source
assert 'attempt_status="low_score"' in source
assert '"low_score"' in source
assert '"identity_veto"' in source
assert 'attempt_status="no_result"' in source
assert 'attempt_status="no_match"' in source
assert 'attempt_status="error"' in source
@@ -342,6 +343,71 @@ def test_competitor_feeder_skips_rejected_candidate_and_uses_next_best(monkeypat
assert attempts[0]["best_product"].product_id == "DDAB01-ACCEPTABLE"
def test_competitor_feeder_splits_hard_veto_from_low_score(monkeypatch):
    """A below-threshold candidate flagged ``hard_veto`` is recorded as ``identity_veto``."""
    from services.competitor_price_feeder import CompetitorPriceFeeder
    from services.pchome_crawler import PChomeProduct

    wrong_product = PChomeProduct(
        product_id="DDAB01-WRONG",
        name="iPhone 16 Pro 保護膜",
        price=399,
        original_price=499,
        discount=20,
        image_url="",
        product_url="https://24h.pchome.com.tw/prod/DDAB01-WRONG",
        stock=20,
        store="24h",
        rating=4.7,
        review_count=8,
        is_on_sale=True,
        crawled_at=datetime.now(),
    )

    class FakeCrawler:
        """Crawler stub whose every search returns the single wrong product."""

        def __init__(self, *_args, **_kwargs):
            pass

        def search_products(self, *_args, **_kwargs):
            return True, "ok", [wrong_product]

    def fake_score(*_args, **_kwargs):
        # Low score plus hard_veto: the combination under test.
        return SimpleNamespace(
            score=0.31,
            brand_score=0.0,
            token_score=0.1,
            spec_score=0.55,
            sequence_score=0.1,
            type_score=0.55,
            price_penalty=0.0,
            hard_veto=True,
            reasons=("brand_conflict", "product_line_conflict"),
            comparison_mode="not_comparable",
            tags=["identity_v2", "identity_veto"],
        )

    monkeypatch.setattr("services.pchome_crawler.PChomeCrawler", FakeCrawler)
    monkeypatch.setattr("services.marketplace_product_matcher.score_marketplace_match", fake_score)

    feeder = CompetitorPriceFeeder(engine=object())
    recorded = []

    def capture_attempt(*_args, **kwargs):
        recorded.append(kwargs)

    monkeypatch.setattr(feeder, "_record_match_attempt", capture_attempt)

    sku_item = {
        "sku": "A006",
        "name": "【TAICEND 泰陞】寶貝液體保護膜 屁屁噴 100ml",
        "product_id": 6,
        "momo_price": 399,
    }
    result = feeder._run_sku_items([sku_item])

    assert result.matched == 0
    assert result.skipped_low_score == 1
    first_attempt = recorded[0]
    assert first_attempt["attempt_status"] == "identity_veto"
    assert first_attempt["diagnostics"].hard_veto is True
def test_search_candidates_does_not_stop_on_merely_acceptable_match(monkeypatch):
from services.competitor_price_feeder import _search_pchome_candidates
from services.pchome_crawler import PChomeProduct
@@ -421,6 +487,18 @@ def test_competitor_feeder_logs_keyword_parser_fallback(monkeypatch, caplog):
assert "fallback to cleaned product name" in caplog.text
def test_competitor_feeder_keeps_original_name_as_search_fallback():
    """The keyword list is full-length and ends with the original product name."""
    from services import competitor_price_feeder as feeder_module

    momo_name = "【Mustela 慕之恬廊】慕之幼 免用水潔淨液 300ml(外出清潔 卸除髒汙 卸除防曬 卸防曬)"
    terms = feeder_module._build_search_keywords(momo_name)

    assert len(terms) == feeder_module.MAX_SEARCH_TERMS
    fallback_term = terms[-1]
    assert fallback_term.startswith("Mustela 慕之恬廊 慕之幼 免用水潔淨液")
    leading_terms = terms[:4]
    assert any("免用水潔淨液 300ml" in term for term in leading_terms)
def test_competitor_feeder_refreshes_expired_identity_by_known_product_id(monkeypatch):
from services.competitor_price_feeder import CompetitorPriceFeeder
from services.pchome_crawler import PChomeProduct

View File

@@ -419,7 +419,8 @@ def test_ai_product_pick_agent_uses_real_competitor_data_and_dashboard_action():
assert "MAX_SEARCH_TERMS" in feeder_source
assert "_build_search_keywords" in feeder_source
assert "_search_pchome_candidates" in feeder_source
assert "crawler.search_products(keyword, limit=SEARCH_LIMIT, max_pages=SEARCH_MAX_PAGES)" in feeder_source
assert "search_limit = SEARCH_LIMIT * max(1, SEARCH_MAX_PAGES)" in feeder_source
assert "crawler.search_products(keyword, limit=search_limit, max_pages=SEARCH_MAX_PAGES)" in feeder_source
assert "_fetch_unmatched_priority_skus" in feeder_source
assert "_fetch_expired_identity_skus" in feeder_source
assert "run_expired_identity_refresh" in feeder_source

View File

@@ -435,6 +435,39 @@ def test_marketplace_search_terms_prefer_readable_brand_core_spec():
assert not any(term.endswith(" l") for term in terms)
def test_marketplace_search_terms_prioritize_identity_phrase_over_ambiguous_copy():
    """The identity phrase (屁屁噴) leads the first term; the ambiguous 保護膜 does not."""
    from services.marketplace_product_matcher import build_search_terms

    product_name = "【TAICEND 泰陞】寶貝液體保護膜 屁屁噴 100ml"
    terms = build_search_terms(product_name, max_terms=5)

    top_term = terms[0]
    assert top_term == "泰陞 屁屁噴 100ml"
    assert "保護膜" not in top_term
    assert "屁屁噴" in " ".join(terms[:3])
def test_marketplace_search_terms_drop_option_and_marketing_noise():
    """Option ("任選一款") and launch ("新品上市") copy never reaches a search term."""
    from services.marketplace_product_matcher import build_search_terms

    product_name = "【YSL】情挑誘光嫩唇凍6ml(任選一款/新品上市)"
    terms = build_search_terms(product_name, max_terms=5)

    assert terms[0] == "ysl 情挑誘光嫩唇凍 6ml"
    assert all("一款" not in term and "上市" not in term for term in terms)
def test_marketplace_search_terms_keep_professional_product_phrase():
    """Specific product-type phrases survive intact while usage copy is dropped."""
    from services.marketplace_product_matcher import build_search_terms

    abysse_name = "【Abysse】天然植萃身體按摩精油550ml"
    mustela_name = "【Mustela 慕之恬廊】慕之幼 免用水潔淨液 300ml(外出清潔 卸除髒汙 卸除防曬 卸防曬)"

    abysse_terms = build_search_terms(abysse_name, max_terms=5)
    mustela_terms = build_search_terms(mustela_name, max_terms=5)

    assert abysse_terms[0] == "abysse 身體按摩精油 550ml"
    assert mustela_terms[0] == "慕之恬廊 免用水潔淨液 300ml"
    assert all(
        "卸除防曬" not in term and "外出清潔" not in term
        for term in mustela_terms
    )
def test_batch_compare_top_uses_latest_momo_price_not_revenue(monkeypatch):
from services import pchome_crawler

View File

@@ -179,5 +179,5 @@ def test_feeder_search_candidate_passes_page_cap(monkeypatch):
)
assert candidates == [product]
assert calls[0][1]["limit"] == 20
assert calls[0][1]["limit"] == 40
assert calls[0][1]["max_pages"] == 2