Files
ewoooc/services/competitor_price_feeder.py
OoO 93065a58e4
All checks were successful
CD Pipeline / deploy (push) Successful in 1m8s
V10.591 拆分比價型錄覆核狀態
2026-06-05 11:06:47 +08:00

3764 lines
171 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
競品價格補給線 Worker (Competitor Price Feeder)
角色:獨立背景 Worker生產者端
架構位置:
[本 Worker — 每 4 小時跑一次] → competitor_prices DB 表(最新快取)
→ competitor_price_history DB 表(歷史快照)
[AI Pipeline] → fetch_candidates() LEFT JOIN competitor_prices消費者端
設計原則:
- 與 AI Pipeline 完全解耦:本 Worker 掛了不影響核心大腦
- 自帶重試機制,不阻塞主排程
- 語意化標籤 (tags) 讓 Hermes 獲得更豐富的情境
爬取邏輯:
MOMO 商品名稱 → PChome 關鍵字搜尋 → 模糊比對最佳匹配 → 寫入 competitor_prices + competitor_price_history
依賴:
services/pchome_crawler.py — 搜尋 + 批量 API
services/price_comparison.py — ProductNameParser + 模糊比對
"""
import json
import logging
import os
import re
import time
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Optional
from urllib.parse import quote_plus
from services.marketplace_product_matcher import FOCUSED_IDENTITY_TOTAL_PRICE_REASONS
logger = logging.getLogger(__name__)
# ── 比對參數 ─────────────────────────────────────────
MIN_MATCH_SCORE = 0.76 # 低於此分數不寫入;核心比價寧可待審也不能錯配
REPLACE_DIFFERENT_PRODUCT_SCORE = 0.84 # 已有不同 PChome 商品時,需超高信心才覆蓋
EARLY_STOP_MATCH_SCORE = 0.90 # 搜尋候選池只有強同款才提前停止,避免次佳候選卡住後續精準搜尋詞
SEARCH_LIMIT = int(os.getenv("PCHOME_FEEDER_SEARCH_LIMIT", "20")) # 每個搜尋詞取 PChome 前 N 筆
MAX_SEARCH_TERMS = int(os.getenv("PCHOME_FEEDER_MAX_SEARCH_TERMS", "6")) # 每個 MOMO 商品最多嘗試幾組搜尋詞
SEARCH_MAX_PAGES = int(os.getenv("PCHOME_FEEDER_SEARCH_MAX_PAGES", "2")) # 每個搜尋詞最多掃描 PChome 搜尋頁數
BATCH_SIZE = 30 # 每批 DB 寫入筆數
RATE_DELAY = float(os.getenv("PCHOME_FEEDER_RATE_DELAY", "1.0")) # 每次 PChome 請求間隔(秒)
TTL_HOURS = int(os.getenv("PCHOME_FEEDER_TTL_HOURS", "48")) # competitor_prices 價格新鮮度有效期
REQUEST_TIMEOUT = float(os.getenv("PCHOME_FEEDER_TIMEOUT", "12")) # 避免外部搜尋 API 長時間卡住排程
BACKFILL_REQUEST_TIMEOUT = float(os.getenv("PCHOME_FEEDER_BACKFILL_TIMEOUT", str(min(REQUEST_TIMEOUT, 6.0))))
BACKFILL_MAX_SEARCH_TERMS = int(os.getenv("PCHOME_FEEDER_BACKFILL_MAX_SEARCH_TERMS", "2"))
BACKFILL_SEARCH_MAX_PAGES = int(os.getenv("PCHOME_FEEDER_BACKFILL_SEARCH_MAX_PAGES", "1"))
BACKFILL_MAX_SECONDS_PER_SKU = float(os.getenv("PCHOME_FEEDER_BACKFILL_MAX_SECONDS_PER_SKU", "18"))
SEARCH_COVERAGE_RESCUE_ENABLED = os.getenv(
"PCHOME_FEEDER_SEARCH_COVERAGE_RESCUE_ENABLED",
"true",
).lower() in {"1", "true", "yes", "on"}
VARIANT_RECALL_SORTS = ("sale/dc", "new/dc")
RECOVERABLE_LOW_SCORE_FLOOR = max(MIN_MATCH_SCORE - 0.03, 0.72)
GENERIC_RECALL_SAFE_PRODUCT_TYPES = {
"止汗噴霧",
"潔膚露",
"刮鬍刀",
"電動牙刷",
"洗手慕斯",
"足膜",
"化妝水",
"乳液",
"面霜",
"洗面乳",
"面膜",
"眼霜",
"卸妝",
}
GENERIC_RECALL_BLOCK_TERMS = (
"任選",
"多款",
"多色",
"色號",
"顏色",
"款式",
"香味",
"香調",
"香氛",
"精油",
"擴香",
"蠟燭",
"融蠟",
"融燭",
"美甲片",
"指甲油",
"指彩",
"唇釉",
"唇彩",
"唇膏",
"潤唇",
"眼影",
"腮紅",
"粉底",
"遮瑕",
"定妝",
"妝前",
"防曬",
"護手霜",
)
COVERAGE_RESCUE_NOISE_TERMS = (
"外出清潔",
"卸除髒汙",
"卸除防曬",
"卸防曬",
"交換禮物",
"送禮",
"節日",
"推薦",
)
GENERIC_RECALL_BLOCK_NAME_PATTERN = "|".join(re.escape(term) for term in GENERIC_RECALL_BLOCK_TERMS)
BROWSE_SH_DIAGNOSTIC_ENABLED = os.getenv("PCHOME_FEEDER_BROWSE_SH_DIAGNOSTIC_ENABLED", "true").lower() in {"1", "true", "yes", "on"}
BROWSE_SH_EXECUTE_ENABLED = os.getenv("PCHOME_FEEDER_BROWSE_SH_EXECUTE_ENABLED", "false").lower() in {"1", "true", "yes", "on"}
BROWSE_SH_TIMEOUT_SECONDS = int(os.getenv("PCHOME_FEEDER_BROWSE_SH_TIMEOUT", "20"))
BROWSE_SH_MAX_EXECUTIONS_PER_RUN = int(os.getenv("PCHOME_FEEDER_BROWSE_SH_MAX_PER_RUN", "3"))
BROWSE_SH_OUTPUT_PREVIEW_CHARS = int(os.getenv("PCHOME_FEEDER_BROWSE_SH_OUTPUT_PREVIEW_CHARS", "1200"))
RECOVERABLE_DIAGNOSTIC_REASONS = {
"strong_product_line_match",
"shared_identity_anchor",
"shared_identity_anchor_no_spec",
"shared_identity_anchor_packaging_variant",
"shared_identity_anchor_marketing_variant",
"shared_identity_anchor_core_line",
"shared_identity_anchor_variant_safe",
"shared_model_token",
"spec_name_alignment",
}
CATALOG_REVIEW_VARIANT_REASONS = {
"variant_selection_review",
"makeup_catalog_selection_gap",
}
CATALOG_REVIEW_UNIT_REASONS = {
"commercial_condition_gap",
"catalog_count_omission",
}
CATALOG_REVIEW_IDENTITY_REASONS = {
"strong_product_line_match",
"strong_exact_spec_match",
"shared_identity_anchor_exact_line",
"shared_identity_anchor_core_line",
"shared_identity_anchor_variant_safe",
"spec_name_alignment",
"shared_model_token",
}
CATALOG_REVIEW_BLOCK_REASONS = {
"brand_conflict",
"type_conflict",
"product_line_conflict",
"core_ingredient_line_conflict",
"variant_option_conflict",
"variant_descriptor_conflict",
"aroma_scent_variant_conflict",
"bath_additive_variant_gap",
"makeup_finish_conflict",
"makeup_usage_conflict",
"romand_lip_line_conflict",
"count_conflict",
"component_count_conflict",
"multi_component_conflict",
"multi_component_count_conflict",
"bundle_offer_conflict",
"refill_pack_conflict",
"accessory_case_conflict",
"named_component_quantity_conflict",
"price_ratio_extreme",
}
RECOVERABLE_SQL_REASON_LIST = ", ".join(f"'{reason}'" for reason in sorted(RECOVERABLE_DIAGNOSTIC_REASONS))
FOCUSED_TOTAL_PRICE_IDENTITY_DIAGNOSTIC_REASONS = {
f"focused_exact_identity_{reason}"
for reason in FOCUSED_IDENTITY_TOTAL_PRICE_REASONS
}
FOCUSED_TOTAL_PRICE_SAFE_DIAGNOSTIC_REASONS = {
"focused_exact_total_price_safe",
*FOCUSED_TOTAL_PRICE_IDENTITY_DIAGNOSTIC_REASONS,
}
FOCUSED_TOTAL_PRICE_IDENTITY_SQL_REASON_LIST = ", ".join(
f"'{reason}'" for reason in sorted(FOCUSED_TOTAL_PRICE_IDENTITY_DIAGNOSTIC_REASONS)
)
FOCUSED_TOTAL_PRICE_SAFE_SQL_REASON_LIST = ", ".join(
f"'{reason}'" for reason in sorted(FOCUSED_TOTAL_PRICE_SAFE_DIAGNOSTIC_REASONS)
)
REVALIDATABLE_REVIEW_GATE_REASONS = {
"strong_exact_spec_match",
"spec_name_alignment",
"shared_model_token",
}
REVALIDATABLE_REVIEW_BLOCK_REASONS = {
"commercial_condition_gap",
"variant_selection_review",
"variant_option_conflict",
"variant_descriptor_conflict",
"aroma_scent_variant_conflict",
"romand_lip_line_conflict",
"makeup_finish_conflict",
"makeup_usage_conflict",
"nail_tool_function_conflict",
"product_line_conflict",
"count_conflict",
"bundle_offer_conflict",
"multi_component_conflict",
"multi_component_count_conflict",
"refill_pack_conflict",
"accessory_case_conflict",
"named_component_quantity_conflict",
}
FOCUSED_REVALIDATABLE_REVIEW_BLOCK_REASONS = REVALIDATABLE_REVIEW_BLOCK_REASONS - {
"variant_selection_review",
}
REVALIDATABLE_REVIEW_SQL_REASON_LIST = ", ".join(
f"'{reason}'" for reason in sorted(REVALIDATABLE_REVIEW_GATE_REASONS)
)
REVALIDATABLE_REVIEW_BLOCK_SQL_REASON_LIST = ", ".join(
f"'{reason}'" for reason in sorted(REVALIDATABLE_REVIEW_BLOCK_REASONS)
)
FOCUSED_REVALIDATABLE_REVIEW_BLOCK_SQL_REASON_LIST = ", ".join(
f"'{reason}'" for reason in sorted(FOCUSED_REVALIDATABLE_REVIEW_BLOCK_REASONS)
)
STALE_IDENTITY_RECOVERY_BLOCK_REASONS = {
"accessory_case_conflict",
"aroma_lamp_style_selection_gap",
"aroma_scent_variant_conflict",
"bundle_offer_conflict",
"candle_catalog_selection_gap",
"catalog_count_omission",
"commercial_condition_gap",
"count_conflict",
"makeup_catalog_selection_gap",
"makeup_finish_conflict",
"makeup_usage_conflict",
"multi_component_conflict",
"multi_component_count_conflict",
"named_component_quantity_conflict",
"nail_tool_function_conflict",
"price_ratio_extreme",
"price_ratio_wide",
"product_line_conflict",
"refill_pack_conflict",
"romand_lip_line_conflict",
"unit_comparable",
"variant_descriptor_conflict",
"variant_option_conflict",
"variant_selection_review",
}
STALE_IDENTITY_RECOVERY_BLOCK_SQL_REASON_LIST = ", ".join(
f"'{reason}'" for reason in sorted(STALE_IDENTITY_RECOVERY_BLOCK_REASONS)
)
STALE_IDENTITY_RECOVERY_BLOCK_NAME_PATTERN = (
r"(任選|多款|色號|顏色|款式|香味|香調|即期|短效|航空版|"
r"融燭燈|融蠟燈|香氛蠟燭|精油蠟燭|蠟燭|限定|組合任選|"
r"\+|[xX]\s*\d|[*]\s*\d|湛藍|麋香|海洋|玫瑰|薰衣草|"
r"生理呵護|日用型|清爽|潤澤)"
)
def _json_list(value) -> list:
if isinstance(value, list):
return value
if isinstance(value, tuple):
return list(value)
if isinstance(value, str):
try:
parsed = json.loads(value)
return parsed if isinstance(parsed, list) else []
except Exception:
return []
return []
def _has_stale_identity_recovery_block(row: dict) -> bool:
reasons = set(str(reason) for reason in _json_list(row.get("diagnostic_reasons")))
if reasons & STALE_IDENTITY_RECOVERY_BLOCK_REASONS:
return True
haystack = f"{row.get('name') or ''} {row.get('competitor_product_name') or ''}"
return bool(re.search(STALE_IDENTITY_RECOVERY_BLOCK_NAME_PATTERN, haystack, flags=re.IGNORECASE))
# ── Feeder 結果 ───────────────────────────────────────
@dataclass
class FeederResult:
total_skus: int
matched: int
skipped_no_result: int
skipped_low_score: int
errors: int
duration_sec: float
history_written: int = 0
attempts_written: int = 0
def _has_recoverable_identity_signal(diagnostics) -> bool:
if not diagnostics:
return False
reasons = set(getattr(diagnostics, "reasons", ()) or ())
if reasons & RECOVERABLE_DIAGNOSTIC_REASONS:
return True
return (
getattr(diagnostics, "brand_score", 0) >= 0.95
and getattr(diagnostics, "token_score", 0) >= 0.56
and getattr(diagnostics, "sequence_score", 0) >= 0.50
and getattr(diagnostics, "comparison_mode", "exact_identity") == "exact_identity"
)
def _is_multi_variant_listing_name(name: str) -> bool:
return bool(
re.search(
r"(多款任選|多款可選|多色任選|多色可選|多種香味|多種香氣|香味任選|香味可選|味道可選)",
name or "",
)
)
def _classify_low_score_attempt(score: float, diagnostics) -> str:
if getattr(diagnostics, "hard_veto", False):
return "identity_veto"
if "variant_selection_review" in set(getattr(diagnostics, "reasons", ()) or ()):
return "true_low_confidence"
if score >= RECOVERABLE_LOW_SCORE_FLOOR and _has_recoverable_identity_signal(diagnostics):
return "recoverable_low_score"
return "true_low_confidence"
def _is_auto_price_write_safe(diagnostics) -> bool:
"""Only exact, total-price identities may update the formal comparison cache."""
if not diagnostics or getattr(diagnostics, "hard_veto", False):
return False
if getattr(diagnostics, "comparison_mode", "") != "exact_identity":
return False
if getattr(diagnostics, "match_type", "") != "exact":
return False
if getattr(diagnostics, "price_basis", "") != "total_price":
return False
if getattr(diagnostics, "alert_tier", "") != "price_alert_exact":
return False
if "variant_selection_review" in set(getattr(diagnostics, "reasons", ()) or ()):
return False
return True
def _classify_catalog_review_attempt(score: float, diagnostics) -> Optional[str]:
"""Split high-confidence manual gates into operational review lanes."""
if getattr(diagnostics, "hard_veto", False):
return None
if score < MIN_MATCH_SCORE:
return None
reasons = set(getattr(diagnostics, "reasons", ()) or ())
if reasons & CATALOG_REVIEW_BLOCK_REASONS:
return None
has_identity_signal = bool(reasons & CATALOG_REVIEW_IDENTITY_REASONS) or score >= 0.95
if not has_identity_signal:
return None
if reasons & CATALOG_REVIEW_VARIANT_REASONS:
return "catalog_variant_review"
if reasons & CATALOG_REVIEW_UNIT_REASONS:
return "catalog_unit_review"
if (
getattr(diagnostics, "comparison_mode", "") == "exact_identity"
and getattr(diagnostics, "match_type", "") in {"exact", "comparable"}
and getattr(diagnostics, "price_basis", "") in {"manual_review", "total_price"}
and getattr(diagnostics, "alert_tier", "") in {"identity_review", "price_alert_exact"}
):
return "catalog_identity_review"
return None
def _classify_auto_write_block_attempt(score: float, diagnostics) -> str:
if getattr(diagnostics, "hard_veto", False):
return "identity_veto"
catalog_status = _classify_catalog_review_attempt(score, diagnostics)
if catalog_status:
return catalog_status
if score >= MIN_MATCH_SCORE:
return "true_low_confidence"
return _classify_low_score_attempt(score, diagnostics)
def _has_variant_selection_gap(
momo_name: str,
ranked_matches: list[tuple],
best_score: float,
) -> bool:
"""True when source lacks explicit variant selection but top candidates require one."""
try:
from services.marketplace_product_matcher import (
_explicit_variant_option_tokens,
parse_product_identity,
)
except Exception:
return False
source_identity = parse_product_identity(momo_name)
source_options = set(_explicit_variant_option_tokens(source_identity))
source_multi_listing = _is_multi_variant_listing_name(momo_name)
if re.search(r"任選\s*[一二兩三四五六七八九十0-9]+\s*款", momo_name):
source_options -= {str(value) for value in range(1, 11)}
source_options -= {f"{value:02d}" for value in range(1, 11)}
if source_options and not source_multi_listing:
return False
threshold = max(best_score - 0.02, RECOVERABLE_LOW_SCORE_FLOOR)
option_buckets: set[str] = set()
for product, score, diagnostics in ranked_matches[:5]:
if getattr(diagnostics, "hard_veto", False) or score < threshold:
continue
candidate_name = getattr(product, "name", "") or ""
if source_multi_listing != _is_multi_variant_listing_name(candidate_name):
return True
candidate_identity = parse_product_identity(candidate_name)
options = _explicit_variant_option_tokens(candidate_identity)
if len(options) >= 2:
return True
option_buckets.update(options)
if len(option_buckets) >= 2:
return True
return False
def _extract_tags(pchome_product) -> list:
"""
從 PChomeProduct 物件提取語意標籤
標籤設計:
- "on_sale" — is_on_sale = True
- "discount_10pct" — 折扣 10~19%
- "discount_20pct" — 折扣 20~29%
- "discount_30pct" — 折扣 ≥ 30%
- "low_stock" — 庫存 < 10
- "high_rating" — 評分 ≥ 4.5
"""
tags = []
if pchome_product.is_on_sale:
tags.append("on_sale")
try:
disc = int(pchome_product.discount or 0)
except (ValueError, TypeError):
disc = 0
if disc >= 30:
tags.append("discount_30pct")
elif disc >= 20:
tags.append("discount_20pct")
elif disc >= 10:
tags.append("discount_10pct")
try:
stock = int(pchome_product.stock) if pchome_product.stock is not None else None
if stock is not None and 0 < stock < 10:
tags.append("low_stock")
except (ValueError, TypeError):
pass
try:
if pchome_product.rating and float(pchome_product.rating) >= 4.5:
tags.append("high_rating")
except (ValueError, TypeError):
pass
return tags
def _extend_match_tags(tags: list, diagnostics, extra: list[str] = None) -> list:
"""Attach matcher evidence tags in one place so all feeder lanes agree."""
tags = list(tags or [])
tags.extend(getattr(diagnostics, "tags", []) or [])
for reason in getattr(diagnostics, "reasons", ()) or ():
tags.append(f"match_{reason}")
if extra:
tags.extend(extra)
return list(dict.fromkeys(tag for tag in tags if tag))
def _clean_search_text(value: str) -> str:
value = re.sub(r'[()]', ' ', value or '')
value = re.sub(r'[【】\[\]]', ' ', value)
value = re.sub(r"(?<=\d)\.(?=\d)", "DECIMALPOINT", value)
value = re.sub(r'[^\w\u4e00-\u9fff]+', ' ', value)
value = value.replace("DECIMALPOINT", ".").replace("decimalpoint", ".")
return re.sub(r'\s+', ' ', value).strip()
def _dedupe_terms(terms: list) -> list:
result = []
seen = set()
for term in terms:
cleaned = _clean_search_text(term)
if len(cleaned) < 2:
continue
key = cleaned.lower()
if key in seen:
continue
seen.add(key)
result.append(cleaned[:36])
if len(result) >= MAX_SEARCH_TERMS:
break
return result
def _identity_spec_terms(identity) -> list[str]:
terms: list[str] = []
for value in getattr(identity, "volumes_ml", ()) or ():
terms.append(f"{float(value):g}ml")
for value in getattr(identity, "weights_g", ()) or ():
terms.append(f"{float(value):g}g")
for value in getattr(identity, "dosages_mg", ()) or ():
terms.append(f"{float(value):g}mg")
total_piece_count = getattr(identity, "total_piece_count", None)
if total_piece_count:
terms.append(f"{int(total_piece_count)}")
return list(dict.fromkeys(terms))
def _identity_core_rescue_phrases(identity, limit: int = 3) -> list[str]:
product_type = getattr(identity, "product_type", "") or ""
phrases: list[str] = []
for token in sorted(
(getattr(identity, "core_tokens", set()) or set()),
key=lambda value: (
product_type not in str(value),
not bool(re.search(r"[\u4e00-\u9fff]", str(value))),
-len(str(value)),
str(value),
),
):
phrase = _clean_search_text(str(token))
compact = phrase.replace(" ", "")
if len(compact) < 2:
continue
if product_type and compact == str(product_type).replace(" ", ""):
continue
if compact.isdigit() or re.fullmatch(r"\d+(?:\.\d+)?(?:ml|g|mg|kg|l|入|片|支|瓶|盒|包)?", compact, re.I):
continue
if (
any(term in compact for term in GENERIC_RECALL_BLOCK_TERMS)
and not _is_generic_recall_safe(identity)
):
continue
if any(term in compact for term in COVERAGE_RESCUE_NOISE_TERMS):
continue
if phrase.lower() in {existing.lower() for existing in phrases}:
continue
phrases.append(phrase)
if len(phrases) >= limit:
break
return phrases
def _build_coverage_rescue_keywords(momo_name: str) -> list[str]:
"""Add narrow identity fallback terms for pending/no-result coverage recovery."""
if not SEARCH_COVERAGE_RESCUE_ENABLED:
return []
try:
from services.marketplace_product_matcher import parse_product_identity
identity = parse_product_identity(momo_name)
except Exception:
return []
brand_phrases = _coverage_brand_phrases(identity)
product_type = getattr(identity, "product_type", "") or ""
spec_part = " ".join(_identity_spec_terms(identity))
core_phrases = _identity_core_rescue_phrases(identity, limit=3)
primary_core = core_phrases[0] if core_phrases else product_type
secondary_core = core_phrases[1] if len(core_phrases) > 1 else ""
terms: list[str] = []
for brand in brand_phrases:
terms.extend([
" ".join(part for part in (brand, primary_core, spec_part) if part),
" ".join(part for part in (brand, primary_core, secondary_core, spec_part) if part),
" ".join(part for part in (brand, product_type, spec_part) if part)
if _is_generic_recall_safe(identity)
else "",
])
terms.append(" ".join(part for part in (primary_core, spec_part) if part))
return _dedupe_terms(terms)
def _append_search_term(target: list[str], term: str, *, max_terms: int) -> None:
if len(target) >= max_terms:
return
cleaned_terms = _dedupe_terms([term])
if not cleaned_terms:
return
cleaned = cleaned_terms[0]
if cleaned.lower() in {existing.lower() for existing in target}:
return
target.append(cleaned)
def _build_search_keywords(momo_name: str) -> list:
"""
用多組商品身份線索搜尋 PChome提高命中率但仍交給身份比對門檻把關。
"""
try:
from services.marketplace_product_matcher import build_search_terms
terms = build_search_terms(momo_name, max_terms=MAX_SEARCH_TERMS)
terms.append(momo_name)
except Exception:
logger.debug(
"[Feeder] marketplace matcher failed while building search keywords; "
"fallback to cleaned product name",
exc_info=True,
)
cleaned = _clean_search_text(momo_name)
terms = [cleaned[:36], cleaned[:24]]
max_terms = max(1, MAX_SEARCH_TERMS)
primary_terms = _dedupe_terms(terms[: max(1, max_terms - 2)])
rescue_terms = _build_coverage_rescue_keywords(momo_name)
original_terms = _dedupe_terms([momo_name])
selected_terms: list[str] = []
for term in primary_terms:
_append_search_term(selected_terms, term, max_terms=max(1, max_terms - 1))
for term in rescue_terms:
_append_search_term(selected_terms, term, max_terms=max(1, max_terms - 1))
for term in original_terms:
if term.lower() not in {existing.lower() for existing in selected_terms}:
if len(selected_terms) >= max_terms:
selected_terms = selected_terms[: max(0, max_terms - 1)]
selected_terms.append(term)
break
return _dedupe_terms(selected_terms)
def _primary_brand_phrase(identity) -> str:
brand_tokens = {token.lower() for token in getattr(identity, "brand_tokens", set())}
if {"3w", "clinic"} <= brand_tokens:
return "3w clinic"
if {"dashing", "diva"} <= brand_tokens:
return "dashing diva"
if {"rom", "nd"} <= brand_tokens:
return "romand"
if {"im", "meme"} <= brand_tokens:
return "im meme"
if {"recipe", "box"} <= brand_tokens:
return "recipe box"
chinese = sorted(
(
token for token in getattr(identity, "brand_tokens", set())
if re.search(r"[\u4e00-\u9fff]", token)
),
key=lambda token: (-len(token), token),
)
if chinese:
return chinese[0]
latin = sorted(
(
token for token in getattr(identity, "brand_tokens", set())
if re.search(r"[a-z]", token, re.I) and len(token) >= 3
),
key=lambda token: (" " not in token and "-" not in token, -len(token), token),
)
if latin:
return latin[0].lower()
short_latin = sorted(
(
token for token in getattr(identity, "brand_tokens", set())
if re.search(r"[a-z]", token, re.I) and len(token) >= 2
),
key=lambda token: (" " not in token and "-" not in token, -len(token), token),
)
return short_latin[0].lower() if short_latin else ""
def _coverage_brand_phrases(identity) -> list[str]:
brand_tokens = {str(token).lower() for token in getattr(identity, "brand_tokens", set())}
chinese = sorted(
(
token for token in getattr(identity, "brand_tokens", set())
if re.search(r"[\u4e00-\u9fff]", str(token))
),
key=lambda token: (-len(str(token)), str(token)),
)
latin = sorted(
(
token for token in getattr(identity, "brand_tokens", set())
if re.search(r"[a-z]", str(token), re.I) and len(str(token)) >= 2
),
key=lambda token: (-len(str(token)), str(token)),
)
phrases: list[str] = []
if latin and chinese:
phrases.append(f"{str(latin[0]).lower()} {chinese[0]}")
primary = _primary_brand_phrase(identity)
if primary:
phrases.append(primary)
for token in (
"cerave",
"embryolisse",
"neogence",
"tunemakers",
"nivea",
"romand",
"lactacyd",
"pavaruni",
"solone",
):
if token in brand_tokens:
phrases.append(token)
return _dedupe_terms(phrases)
def _is_generic_recall_safe(identity) -> bool:
product_type = getattr(identity, "product_type", None)
if product_type not in GENERIC_RECALL_SAFE_PRODUCT_TYPES:
return False
searchable_name = getattr(identity, "searchable_name", "") or ""
return not any(term in searchable_name for term in GENERIC_RECALL_BLOCK_TERMS)
def _build_generic_recall_keywords(identity) -> list[str]:
if not _is_generic_recall_safe(identity):
return []
brand = _primary_brand_phrase(identity)
product_type = getattr(identity, "product_type", "") or ""
if not brand or not product_type:
return []
return _dedupe_terms([f"{brand} {product_type}"])
def _build_variant_recall_search_plan(momo_name: str, keywords: list[str]) -> list[tuple[str, str | None]]:
plan = [(keyword, None) for keyword in (keywords or [])]
try:
from services.marketplace_product_matcher import parse_product_identity
identity = parse_product_identity(momo_name)
except Exception:
return plan
brand_tokens = {token.lower() for token in getattr(identity, "brand_tokens", set())}
if not ({"dashing", "diva"} <= brand_tokens and "美甲片" in getattr(identity, "searchable_name", "")):
seen = {(keyword.lower(), sort) for keyword, sort in plan}
for keyword in _build_generic_recall_keywords(identity):
key = (keyword.lower(), None)
if key in seen:
continue
seen.add(key)
plan.append((keyword, None))
return plan
searchable_name = getattr(identity, "searchable_name", "")
broad_keywords = []
if "足部時尚潮流美甲片" in searchable_name:
broad_keywords.append("dashing diva 足部時尚潮流美甲片")
elif "頂級璀燦美甲片" in searchable_name:
broad_keywords.append("dashing diva 頂級璀燦美甲片")
elif "時尚潮流美甲片" in searchable_name:
broad_keywords.append("dashing diva 時尚潮流美甲片")
elif "薄型經典美甲片" in searchable_name:
broad_keywords.append("dashing diva 薄型經典美甲片")
broad_keywords.extend((
"dashing diva magicpress",
"dashing diva 美甲片",
))
seen = {(keyword.lower(), sort) for keyword, sort in plan}
def append(keyword: str, sort: str | None = None) -> None:
key = (keyword.lower(), sort)
if key in seen:
return
seen.add(key)
plan.append((keyword, sort))
for broad_keyword in broad_keywords:
append(broad_keyword, None)
if broad_keywords:
for sort in VARIANT_RECALL_SORTS:
append(broad_keywords[0], sort)
return plan
def _format_match_diagnostics(diagnostics) -> str:
if not diagnostics:
return ""
reasons = ",".join(getattr(diagnostics, "reasons", ()) or ())
return (
f"score={diagnostics.score}; brand={diagnostics.brand_score}; "
f"token={diagnostics.token_score}; spec={diagnostics.spec_score}; "
f"seq={diagnostics.sequence_score}; type={diagnostics.type_score}; "
f"penalty={diagnostics.price_penalty}; veto={diagnostics.hard_veto}; "
f"mode={getattr(diagnostics, 'comparison_mode', 'exact_identity')}; "
f"match_type={getattr(diagnostics, 'match_type', '')}; "
f"price_basis={getattr(diagnostics, 'price_basis', '')}; "
f"alert_tier={getattr(diagnostics, 'alert_tier', '')}; "
f"reasons={reasons}"
)
def _match_diagnostics_payload(diagnostics) -> dict:
"""Serialize matcher diagnostics for review/report consumers."""
if not diagnostics:
return {}
return {
"score": getattr(diagnostics, "score", None),
"brand_score": getattr(diagnostics, "brand_score", None),
"token_score": getattr(diagnostics, "token_score", None),
"spec_score": getattr(diagnostics, "spec_score", None),
"sequence_score": getattr(diagnostics, "sequence_score", None),
"type_score": getattr(diagnostics, "type_score", None),
"price_penalty": getattr(diagnostics, "price_penalty", None),
"hard_veto": bool(getattr(diagnostics, "hard_veto", False)),
"comparison_mode": getattr(diagnostics, "comparison_mode", "exact_identity"),
"match_type": getattr(diagnostics, "match_type", None),
"price_basis": getattr(diagnostics, "price_basis", None),
"alert_tier": getattr(diagnostics, "alert_tier", None),
"evidence_flags": list(getattr(diagnostics, "evidence_flags", ()) or ()),
"reasons": list(getattr(diagnostics, "reasons", ()) or ()),
"identity_evidence": getattr(diagnostics, "identity_evidence", None) or {},
"offer_evidence": getattr(diagnostics, "offer_evidence", None) or {},
}
def _pchome_search_url(keyword: str) -> str:
return f"https://ecshweb.pchome.com.tw/search/v3.3/?q={quote_plus(keyword or '')}"
def _build_browse_sh_diagnostic_payload(
momo_name: str,
search_terms: list[str] = None,
reason: str = "unknown",
best_product=None,
best_score: float = None,
diagnostics=None,
candidate_count: int = 0,
) -> dict:
"""Build a read-only browse.sh probe plan for low-confidence PChome cases."""
if not BROWSE_SH_DIAGNOSTIC_ENABLED:
return {}
terms = _dedupe_terms(search_terms or _build_search_keywords(momo_name))[:3]
urls = [_pchome_search_url(term) for term in terms]
product_url = getattr(best_product, "product_url", None)
if product_url:
urls.append(product_url)
urls = list(dict.fromkeys(url for url in urls if url))
primary_url = urls[0] if urls else _pchome_search_url(momo_name)
diagnostic_payload = _match_diagnostics_payload(diagnostics)
return {
"tool": "browse.sh",
"mode": "execute_on_demand" if BROWSE_SH_EXECUTE_ENABLED else "plan_only",
"reason": reason,
"execute_enabled": BROWSE_SH_EXECUTE_ENABLED,
"timeout_seconds": BROWSE_SH_TIMEOUT_SECONDS,
"candidate_count": int(candidate_count or 0),
"momo_name": (momo_name or "")[:300],
"search_terms": terms,
"urls": urls,
"suggested_commands": [
{
"purpose": "static_fetch_first_page",
"args": ["get", primary_url],
},
{
"purpose": "manual_browser_probe",
"args": ["open", primary_url],
},
],
"best_candidate": {
"product_id": getattr(best_product, "product_id", None),
"name": (getattr(best_product, "name", None) or "")[:300] or None,
"price": getattr(best_product, "price", None),
"url": product_url,
"score": best_score,
} if best_product else None,
"diagnostic_codes": diagnostic_payload.get("reasons") or [],
"comparison_mode": diagnostic_payload.get("comparison_mode"),
"hard_veto": diagnostic_payload.get("hard_veto"),
"execution": {"status": "disabled"},
}
def _product_snapshot_payload(product) -> dict:
payload = {
"competitor_product_url": None,
"competitor_image_url": None,
"competitor_stock": None,
}
if not product:
return payload
payload.update({
"competitor_product_url": getattr(product, "product_url", None),
"competitor_image_url": getattr(product, "image_url", None),
"competitor_stock": getattr(product, "stock", None),
})
return payload
def _product_id_key(product_id: str) -> str:
"""Normalize PChome IDs for comparing cached IDs with API-returned IDs."""
return re.sub(r"[^A-Z0-9]", "", str(product_id or "").upper())
def _candidate_match_name(product) -> str:
"""Return the identity-rich PChome text used for scoring, falling back to display name."""
return (getattr(product, "match_name", None) or getattr(product, "name", None) or "").strip()
def _find_best_match_detail(
momo_name: str,
pchome_products: list,
momo_price: float = None,
) -> Optional[tuple]:
"""
從 PChome 搜尋結果中找出與 MOMO 商品名稱最接近的一筆
Args:
momo_name: MOMO 商品名稱
pchome_products: PChomeProduct 列表
Returns:
(PChomeProduct, score, diagnostics) or None
"""
ranked = _rank_match_details(momo_name, pchome_products, momo_price=momo_price)
return ranked[0] if ranked else None
def _rank_match_details(
momo_name: str,
pchome_products: list,
momo_price: float = None,
) -> list[tuple]:
"""Score all PChome candidates and return them from strongest to weakest."""
from services.marketplace_product_matcher import score_marketplace_match
ranked = []
for p in pchome_products:
diagnostics = score_marketplace_match(
momo_name,
_candidate_match_name(p),
momo_price=momo_price,
competitor_price=getattr(p, "price", None),
)
ranked.append((p, diagnostics.score, diagnostics))
return sorted(ranked, key=lambda item: item[1], reverse=True)
def _find_best_match(momo_name: str, pchome_products: list) -> Optional[tuple]:
"""Backward-compatible helper for smoke scripts."""
result = _find_best_match_detail(momo_name, pchome_products)
if not result:
return None
best, score, _diagnostics = result
return best, score
def _search_pchome_candidates(
crawler,
momo_name: str,
keywords: list = None,
momo_price: float = None,
max_terms: int | None = None,
max_pages: int | None = None,
max_seconds: float | None = None,
) -> list:
"""以多組搜尋詞擴大 PChome 候選池,只在強同款時提前停止。"""
candidates = []
seen_ids = set()
page_cap = max(1, int(max_pages or SEARCH_MAX_PAGES))
search_limit = SEARCH_LIMIT * page_cap
active_keywords = keywords or _build_search_keywords(momo_name)
search_plan = _build_variant_recall_search_plan(momo_name, active_keywords)
if max_terms is not None:
search_plan = search_plan[:max(1, int(max_terms))]
deadline = (time.monotonic() + max_seconds) if max_seconds and max_seconds > 0 else None
for keyword, sort in search_plan:
if deadline and time.monotonic() >= deadline:
logger.info("[Feeder] PChome search budget exhausted before keyword=%s", keyword)
break
if sort:
ok, _, products = crawler.search_products(
keyword,
limit=search_limit,
max_pages=page_cap,
sort=sort,
)
else:
ok, _, products = crawler.search_products(keyword, limit=search_limit, max_pages=page_cap)
if not ok or not products:
continue
for product in products:
if product.product_id in seen_ids:
continue
seen_ids.add(product.product_id)
candidates.append(product)
best = _find_best_match_detail(momo_name, candidates, momo_price=momo_price)
if best and best[1] >= EARLY_STOP_MATCH_SCORE:
break
return candidates
def _recover_low_score_with_fresh_search(
crawler,
momo_name: str,
momo_price: float = None,
existing_product_id: str = "",
max_terms: int | None = None,
max_pages: int | None = None,
max_seconds: float | None = None,
) -> tuple[Optional[tuple], list[str], int]:
"""
當 legacy / known-id 候選重評仍低分時,再跑一次 fresh keyword search
嘗試把舊錯配洗成新的真同款。
"""
keywords = _build_search_keywords(momo_name)
candidates = _search_pchome_candidates(
crawler,
momo_name,
keywords=keywords,
momo_price=momo_price,
max_terms=max_terms,
max_pages=max_pages,
max_seconds=max_seconds,
)
if existing_product_id:
existing_key = _product_id_key(existing_product_id)
fresh_candidates = [
candidate
for candidate in candidates
if _product_id_key(getattr(candidate, "product_id", "")) != existing_key
]
if fresh_candidates:
candidates = fresh_candidates
best = _find_best_match_detail(momo_name, candidates, momo_price=momo_price)
return best, keywords, len(candidates)
def _structural_similarity(momo_p, pchome_p) -> float:
"""
結構化相似度計算(品牌 + 規格 + 關鍵字)
權重:品牌匹配 0.4 + 規格匹配 0.3 + 關鍵字相似 0.3
"""
from difflib import SequenceMatcher
score = 0.0
# 品牌比對 (0.4)
if momo_p.brand and pchome_p.brand:
if momo_p.brand == pchome_p.brand:
score += 0.4
elif momo_p.brand in pchome_p.brand or pchome_p.brand in momo_p.brand:
score += 0.2
elif not momo_p.brand and not pchome_p.brand:
score += 0.1 # 都沒有品牌,不扣分
# 規格比對 (0.3) — 容量/克重
momo_specs = momo_p.specs or {}
pchome_specs = pchome_p.specs or {}
if momo_specs and pchome_specs:
matching_specs = sum(
1 for k, v in momo_specs.items()
if pchome_specs.get(k) == v
)
total_specs = max(len(momo_specs), len(pchome_specs), 1)
score += 0.3 * (matching_specs / total_specs)
elif not momo_specs and not pchome_specs:
score += 0.15
# 關鍵字相似度 (0.3)
momo_kws = " ".join(momo_p.keywords or [])
pchome_kws = " ".join(pchome_p.keywords or [])
if momo_kws and pchome_kws:
kw_sim = SequenceMatcher(None, momo_kws.lower(), pchome_kws.lower()).ratio()
score += 0.3 * kw_sim
return round(score, 3)
class CompetitorPriceFeeder:
"""
競品價格補給線 Worker
用法:
feeder = CompetitorPriceFeeder(engine=db_engine)
result = feeder.run(source="pchome")
"""
def __init__(self, engine=None):
self.engine = engine
self._history_table_ready = False
self._attempt_table_ready = False
self._price_table_columns_ready = False
self._browse_sh_executions = 0
def _ensure_table_columns(self, conn, table: str, column_specs: list[tuple[str, str]]) -> None:
"""補齊既有表欄位;避免正式端舊表在新 INSERT 時炸掉。"""
from sqlalchemy import inspect, text
inspector = inspect(conn)
if not inspector.has_table(table):
return
existing = {column["name"] for column in inspector.get_columns(table)}
for column_name, column_type in column_specs:
if column_name in existing:
continue
conn.execute(text(f"ALTER TABLE {table} ADD COLUMN {column_name} {column_type}"))
existing.add(column_name)
def _ensure_competitor_prices_columns(self, conn) -> None:
if self._price_table_columns_ready:
return
self._ensure_table_columns(conn, "competitor_prices", [
("competitor_product_url", "TEXT"),
("competitor_image_url", "TEXT"),
("competitor_stock", "INTEGER"),
("match_diagnostic_json", "JSONB" if conn.dialect.name == "postgresql" else "TEXT"),
("comparison_mode", "VARCHAR(40)"),
("hard_veto", "BOOLEAN"),
("diagnostic_codes", "JSONB" if conn.dialect.name == "postgresql" else "TEXT"),
])
self._price_table_columns_ready = True
def _ensure_competitor_price_history_table(self, conn):
"""確保競品價格歷史表存在;排程可自癒補表,不依賴手動 migration。"""
if self._history_table_ready:
return
from sqlalchemy import text
if conn.dialect.name == "postgresql":
conn.execute(text("""
CREATE TABLE IF NOT EXISTS competitor_price_history (
id BIGSERIAL PRIMARY KEY,
sku VARCHAR(50) NOT NULL,
source VARCHAR(30) NOT NULL DEFAULT 'pchome',
momo_product_id INTEGER,
momo_price NUMERIC(10,2),
price NUMERIC(10,2) NOT NULL,
original_price NUMERIC(10,2),
discount_pct INTEGER,
competitor_product_id VARCHAR(100),
competitor_product_name TEXT,
competitor_product_url TEXT,
competitor_image_url TEXT,
competitor_stock INTEGER,
match_score NUMERIC(4,3),
tags JSONB DEFAULT '[]'::jsonb,
match_diagnostic_json JSONB,
comparison_mode VARCHAR(40),
hard_veto BOOLEAN,
diagnostic_codes JSONB,
crawled_at TIMESTAMP NOT NULL DEFAULT NOW()
)
"""))
conn.execute(text("""
CREATE INDEX IF NOT EXISTS idx_comp_price_history_sku_source_time
ON competitor_price_history (sku, source, crawled_at DESC)
"""))
conn.execute(text("""
CREATE INDEX IF NOT EXISTS idx_comp_price_history_competitor_id
ON competitor_price_history (competitor_product_id)
"""))
else:
conn.execute(text("""
CREATE TABLE IF NOT EXISTS competitor_price_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
sku VARCHAR(50) NOT NULL,
source VARCHAR(30) NOT NULL DEFAULT 'pchome',
momo_product_id INTEGER,
momo_price NUMERIC(10,2),
price NUMERIC(10,2) NOT NULL,
original_price NUMERIC(10,2),
discount_pct INTEGER,
competitor_product_id VARCHAR(100),
competitor_product_name TEXT,
competitor_product_url TEXT,
competitor_image_url TEXT,
competitor_stock INTEGER,
match_score NUMERIC(4,3),
tags TEXT DEFAULT '[]',
match_diagnostic_json TEXT,
comparison_mode VARCHAR(40),
hard_veto BOOLEAN,
diagnostic_codes TEXT,
crawled_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
)
"""))
conn.execute(text("""
CREATE INDEX IF NOT EXISTS idx_comp_price_history_sku_source_time
ON competitor_price_history (sku, source, crawled_at DESC)
"""))
conn.execute(text("""
CREATE INDEX IF NOT EXISTS idx_comp_price_history_competitor_id
ON competitor_price_history (competitor_product_id)
"""))
self._ensure_table_columns(conn, "competitor_price_history", [
("competitor_product_url", "TEXT"),
("competitor_image_url", "TEXT"),
("competitor_stock", "INTEGER"),
("match_diagnostic_json", "JSONB" if conn.dialect.name == "postgresql" else "TEXT"),
("comparison_mode", "VARCHAR(40)"),
("hard_veto", "BOOLEAN"),
("diagnostic_codes", "JSONB" if conn.dialect.name == "postgresql" else "TEXT"),
])
self._history_table_ready = True
def _ensure_competitor_match_attempts_table(self, conn):
"""確保 PChome 比對嘗試表存在;成功、低分、無結果與錯誤都要留痕。"""
if self._attempt_table_ready:
return
from sqlalchemy import text
if conn.dialect.name == "postgresql":
conn.execute(text("""
CREATE TABLE IF NOT EXISTS competitor_match_attempts (
id BIGSERIAL PRIMARY KEY,
sku VARCHAR(50) NOT NULL,
source VARCHAR(30) NOT NULL DEFAULT 'pchome',
momo_product_id INTEGER,
momo_product_name TEXT,
momo_price NUMERIC(10,2),
search_terms JSONB DEFAULT '[]'::jsonb,
candidate_count INTEGER DEFAULT 0,
attempt_status VARCHAR(30) NOT NULL,
best_competitor_product_id VARCHAR(100),
best_competitor_product_name TEXT,
competitor_product_url TEXT,
competitor_image_url TEXT,
competitor_stock INTEGER,
best_competitor_price NUMERIC(10,2),
best_match_score NUMERIC(4,3),
match_diagnostic_json JSONB,
comparison_mode VARCHAR(40),
hard_veto BOOLEAN,
diagnostic_codes JSONB,
browse_diagnostic_json JSONB,
error_message TEXT,
attempted_at TIMESTAMP NOT NULL DEFAULT NOW()
)
"""))
conn.execute(text("""
CREATE INDEX IF NOT EXISTS idx_comp_match_attempts_sku_source_time
ON competitor_match_attempts (sku, source, attempted_at DESC)
"""))
conn.execute(text("""
CREATE INDEX IF NOT EXISTS idx_comp_match_attempts_status_time
ON competitor_match_attempts (attempt_status, attempted_at DESC)
"""))
else:
conn.execute(text("""
CREATE TABLE IF NOT EXISTS competitor_match_attempts (
id INTEGER PRIMARY KEY AUTOINCREMENT,
sku VARCHAR(50) NOT NULL,
source VARCHAR(30) NOT NULL DEFAULT 'pchome',
momo_product_id INTEGER,
momo_product_name TEXT,
momo_price NUMERIC(10,2),
search_terms TEXT DEFAULT '[]',
candidate_count INTEGER DEFAULT 0,
attempt_status VARCHAR(30) NOT NULL,
best_competitor_product_id VARCHAR(100),
best_competitor_product_name TEXT,
competitor_product_url TEXT,
competitor_image_url TEXT,
competitor_stock INTEGER,
best_competitor_price NUMERIC(10,2),
best_match_score NUMERIC(4,3),
match_diagnostic_json TEXT,
comparison_mode VARCHAR(40),
hard_veto BOOLEAN,
diagnostic_codes TEXT,
browse_diagnostic_json TEXT,
error_message TEXT,
attempted_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
)
"""))
conn.execute(text("""
CREATE INDEX IF NOT EXISTS idx_comp_match_attempts_sku_source_time
ON competitor_match_attempts (sku, source, attempted_at DESC)
"""))
conn.execute(text("""
CREATE INDEX IF NOT EXISTS idx_comp_match_attempts_status_time
ON competitor_match_attempts (attempt_status, attempted_at DESC)
"""))
self._ensure_table_columns(conn, "competitor_match_attempts", [
("search_terms", "JSONB" if conn.dialect.name == "postgresql" else "TEXT"),
("competitor_product_url", "TEXT"),
("competitor_image_url", "TEXT"),
("competitor_stock", "INTEGER"),
("match_diagnostic_json", "JSONB" if conn.dialect.name == "postgresql" else "TEXT"),
("comparison_mode", "VARCHAR(40)"),
("hard_veto", "BOOLEAN"),
("diagnostic_codes", "JSONB" if conn.dialect.name == "postgresql" else "TEXT"),
("browse_diagnostic_json", "JSONB" if conn.dialect.name == "postgresql" else "TEXT"),
("error_message", "TEXT"),
])
self._attempt_table_ready = True
def _prepare_browse_diagnostic(
self,
momo_name: str,
search_terms: list = None,
reason: str = "unknown",
best_product=None,
best_score: float = None,
diagnostics=None,
candidate_count: int = 0,
) -> dict:
"""Return browse.sh diagnostic evidence; CLI execution remains opt-in and rate-limited."""
payload = _build_browse_sh_diagnostic_payload(
momo_name,
search_terms=search_terms,
reason=reason,
best_product=best_product,
best_score=best_score,
diagnostics=diagnostics,
candidate_count=candidate_count,
)
if not payload or not BROWSE_SH_EXECUTE_ENABLED:
return payload
if self._browse_sh_executions >= BROWSE_SH_MAX_EXECUTIONS_PER_RUN:
payload["execution"] = {"status": "rate_limited"}
return payload
command_args = tuple((payload.get("suggested_commands") or [{}])[0].get("args") or ())
if not command_args:
payload["execution"] = {"status": "missing_command"}
return payload
try:
from services.browse_sh_tool import BrowseShTool
self._browse_sh_executions += 1
result = BrowseShTool(timeout_seconds=BROWSE_SH_TIMEOUT_SECONDS).run(
command_args,
timeout_seconds=BROWSE_SH_TIMEOUT_SECONDS,
)
payload["execution"] = {
"status": "ok" if result.ok else "failed",
"returncode": result.returncode,
"timed_out": result.timed_out,
"unavailable_reason": result.unavailable_reason,
"stdout_preview": (result.stdout or "")[:BROWSE_SH_OUTPUT_PREVIEW_CHARS],
"stderr_preview": (result.stderr or "")[:BROWSE_SH_OUTPUT_PREVIEW_CHARS],
}
except Exception as exc:
payload["execution"] = {
"status": "error",
"error": str(exc)[:500],
}
return payload
def _record_match_attempt(
self,
sku: str,
momo_name: str,
momo_product_id: int = None,
momo_price: float = None,
search_terms: list = None,
candidate_count: int = 0,
attempt_status: str = "unknown",
best_product=None,
best_score: float = None,
diagnostics=None,
browse_diagnostic: dict = None,
error_message: str = None,
source: str = "pchome",
) -> None:
"""追加一筆 PChome 比對嘗試紀錄,讓待比對/低信心也能回溯。"""
from sqlalchemy import text
with self.engine.begin() as conn:
self._ensure_competitor_match_attempts_table(conn)
search_terms_expr = "CAST(:search_terms AS jsonb)" if conn.dialect.name == "postgresql" else ":search_terms"
json_cast = "CAST(:match_diagnostic_json AS jsonb)" if conn.dialect.name == "postgresql" else ":match_diagnostic_json"
codes_cast = "CAST(:diagnostic_codes AS jsonb)" if conn.dialect.name == "postgresql" else ":diagnostic_codes"
browse_cast = "CAST(:browse_diagnostic_json AS jsonb)" if conn.dialect.name == "postgresql" else ":browse_diagnostic_json"
diagnostic_payload = _match_diagnostics_payload(diagnostics)
diagnostic_codes = diagnostic_payload.get("reasons") or []
product_payload = _product_snapshot_payload(best_product)
browse_diagnostic_json = (
json.dumps(browse_diagnostic, ensure_ascii=False)
if browse_diagnostic
else None
)
conn.execute(text(f"""
INSERT INTO competitor_match_attempts
(sku, source, momo_product_id, momo_product_name, momo_price,
search_terms, candidate_count, attempt_status,
best_competitor_product_id, best_competitor_product_name,
competitor_product_url, competitor_image_url, competitor_stock,
best_competitor_price, best_match_score,
match_diagnostic_json, comparison_mode, hard_veto, diagnostic_codes,
browse_diagnostic_json,
error_message,
attempted_at)
VALUES
(:sku, :source, :momo_product_id, :momo_product_name, :momo_price,
{search_terms_expr}, :candidate_count, :attempt_status,
:best_id, :best_name,
:competitor_product_url, :competitor_image_url, :competitor_stock,
:best_price, :best_score,
{json_cast}, :comparison_mode, :hard_veto, {codes_cast},
{browse_cast},
:error_message,
CURRENT_TIMESTAMP)
"""), {
"sku": sku,
"source": source,
"momo_product_id": momo_product_id,
"momo_product_name": momo_name,
"momo_price": momo_price,
"search_terms": json.dumps(search_terms or [], ensure_ascii=False),
"candidate_count": candidate_count,
"attempt_status": attempt_status,
"best_id": getattr(best_product, "product_id", None),
"best_name": (getattr(best_product, "name", None) or "")[:300] or None,
**product_payload,
"best_price": getattr(best_product, "price", None),
"best_score": best_score,
"match_diagnostic_json": json.dumps(diagnostic_payload, ensure_ascii=False) if diagnostic_payload else None,
"comparison_mode": diagnostic_payload.get("comparison_mode"),
"hard_veto": diagnostic_payload.get("hard_veto"),
"diagnostic_codes": json.dumps(diagnostic_codes, ensure_ascii=False) if diagnostic_codes else None,
"browse_diagnostic_json": browse_diagnostic_json,
"error_message": (error_message or "")[:1000] or None,
})
def _fetch_active_skus(self) -> list:
"""
從 products 表取得待監控的 ACTIVE 商品清單
Returns:
list of {"sku": str, "name": str, "category": str}
"""
if self.engine is None:
raise RuntimeError("需要注入 SQLAlchemy engine")
from sqlalchemy import text
sql = text("""
SELECT
p.id AS product_id,
p.i_code AS sku,
p.name,
p.category,
(
SELECT pr.price
FROM price_records pr
WHERE pr.product_id = p.id
ORDER BY pr.timestamp DESC
LIMIT 1
) AS momo_price
FROM products p
WHERE p.status = 'ACTIVE'
AND EXISTS (
SELECT 1
FROM price_records pr
WHERE pr.product_id = p.id
)
ORDER BY p.i_code
""")
with self.engine.connect() as conn:
rows = conn.execute(sql).fetchall()
return [dict(r._mapping) for r in rows]
def _fetch_unmatched_priority_skus(self, limit: int = 80) -> list:
"""
取得目前沒有有效 PChome 配對的高價 ACTIVE 商品,供補強流程優先處理。
"""
if self.engine is None:
raise RuntimeError("需要注入 SQLAlchemy engine")
from sqlalchemy import text
sql = text("""
WITH latest_attempt AS (
SELECT DISTINCT ON (cma.sku)
cma.sku,
cma.attempt_status,
cma.attempted_at
FROM competitor_match_attempts cma
WHERE cma.source = 'pchome'
ORDER BY cma.sku, cma.attempted_at DESC, cma.id DESC
)
SELECT
p.id AS product_id,
p.i_code AS sku,
p.name,
p.category,
latest_price.price AS momo_price,
COALESCE(la.attempt_status, 'never_attempted') AS latest_attempt_status
FROM products p
JOIN LATERAL (
SELECT pr.price
FROM price_records pr
WHERE pr.product_id = p.id
ORDER BY pr.timestamp DESC, pr.id DESC
LIMIT 1
) latest_price ON TRUE
LEFT JOIN latest_attempt la
ON la.sku = p.i_code
LEFT JOIN competitor_prices cp
ON cp.sku = p.i_code
AND cp.source = 'pchome'
AND cp.expires_at > CURRENT_TIMESTAMP
AND COALESCE(cp.match_score, 0) >= :match_score_floor
AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
WHERE p.status = 'ACTIVE'
AND cp.sku IS NULL
ORDER BY
CASE
WHEN la.attempt_status IN ('no_result', 'refresh_no_result')
AND p.name !~* :generic_recall_block_pattern THEN 0
WHEN la.sku IS NULL THEN 1
WHEN la.attempt_status IN ('no_result', 'refresh_no_result') THEN 2
WHEN la.attempt_status IN ('low_score', 'refresh_low_score', 'recoverable_low_score') THEN 3
ELSE 4
END,
latest_price.price DESC NULLS LAST,
p.i_code
LIMIT :limit
""")
with self.engine.connect() as conn:
rows = conn.execute(
sql,
{
"limit": max(1, min(int(limit), 300)),
"match_score_floor": MIN_MATCH_SCORE,
"generic_recall_block_pattern": GENERIC_RECALL_BLOCK_NAME_PATTERN,
},
).fetchall()
return [dict(r._mapping) for r in rows]
def _fetch_retryable_candidate_skus(self, limit: int = 80, min_score: float = 0.70) -> list:
"""
取得近門檻候選,供 matcher 升級後重新評分。
這條路徑優先用前次留下的 PChome product_id 批次查詢最新商品資料;
若 product_id 過期或重評仍低分,才走受控 fresh search recovery。
自動隊列只收「近門檻、無 hard veto、仍在 exact_identity 軌道、已有同品線證據」
的候選identity_veto、unit_comparable 不在自動回刷主戰場。
true_low_confidence 只允許目前已補 focused exact 規則的品線進入窄門重評,
且必須沒有 commercial / variant / count / bundle 類阻擋理由。
最後仍由現行 matcher 重新判斷,不因舊 attempt_status 自動寫入正式比價。
"""
if self.engine is None:
raise RuntimeError("需要注入 SQLAlchemy engine")
from sqlalchemy import text
sql = text(f"""
WITH latest_attempt AS (
SELECT DISTINCT ON (cma.sku)
cma.sku,
cma.best_competitor_product_id,
cma.best_competitor_product_name,
cma.best_match_score,
cma.attempt_status,
cma.hard_veto,
cma.match_diagnostic_json,
cma.attempted_at
FROM competitor_match_attempts cma
WHERE cma.source = 'pchome'
ORDER BY cma.sku, cma.attempted_at DESC, cma.id DESC
),
legacy_unmasked_attempt AS (
SELECT DISTINCT ON (cma.sku, cma.best_competitor_product_id)
cma.sku,
cma.best_competitor_product_id,
cma.best_competitor_product_name,
cma.best_match_score,
cma.attempt_status,
cma.hard_veto,
cma.match_diagnostic_json,
cma.attempted_at
FROM competitor_match_attempts cma
JOIN latest_attempt current_la
ON current_la.sku = cma.sku
WHERE cma.source = 'pchome'
AND current_la.attempt_status IN (
'refresh_no_result',
'no_result',
'expired_match'
)
AND cma.attempt_status IN (
'low_score',
'refresh_low_score',
'recoverable_low_score',
'true_low_confidence',
'rescore_accepted_current'
)
AND cma.attempted_at < current_la.attempted_at
ORDER BY cma.sku, cma.best_competitor_product_id, cma.attempted_at DESC, cma.id DESC
),
candidate_attempt AS (
SELECT DISTINCT ON (la.sku, la.best_competitor_product_id)
la.*
FROM (
SELECT * FROM latest_attempt
UNION ALL
SELECT * FROM legacy_unmasked_attempt
) la
WHERE la.best_competitor_product_id IS NOT NULL
AND la.best_competitor_product_id <> ''
AND COALESCE(la.best_match_score, 0) >= :min_score
AND COALESCE(la.hard_veto, false) = false
AND COALESCE(la.match_diagnostic_json->>'comparison_mode', 'exact_identity') = 'exact_identity'
ORDER BY la.sku, la.best_competitor_product_id, la.attempted_at DESC
)
SELECT
p.id AS product_id,
p.i_code AS sku,
p.name,
p.category,
latest_price.price AS momo_price,
la.best_competitor_product_id AS competitor_product_id,
la.best_competitor_product_name AS competitor_product_name,
la.best_match_score,
la.attempt_status
FROM candidate_attempt la
JOIN products p
ON p.i_code = la.sku
AND p.status = 'ACTIVE'
JOIN LATERAL (
SELECT pr.price
FROM price_records pr
WHERE pr.product_id = p.id
ORDER BY pr.timestamp DESC, pr.id DESC
LIMIT 1
) latest_price ON TRUE
LEFT JOIN competitor_prices cp
ON cp.sku = p.i_code
AND cp.source = 'pchome'
AND cp.expires_at > CURRENT_TIMESTAMP
AND COALESCE(cp.match_score, 0) >= :match_score_floor
AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
WHERE cp.sku IS NULL
AND (
(
la.attempt_status IN (
'low_score',
'refresh_low_score',
'recoverable_low_score'
)
AND (
la.attempt_status = 'recoverable_low_score'
OR COALESCE(la.match_diagnostic_json->'reasons', '[]'::jsonb) ?| array[{RECOVERABLE_SQL_REASON_LIST}]
)
)
OR (
la.attempt_status IN (
'low_score',
'refresh_low_score',
'true_low_confidence',
'rescore_accepted_current'
)
AND COALESCE(la.best_match_score, 0) >= :min_score
AND COALESCE(la.hard_veto, false) = false
AND COALESCE(la.match_diagnostic_json->>'comparison_mode', 'exact_identity') = 'exact_identity'
AND (
(
COALESCE(la.match_diagnostic_json->'reasons', '[]'::jsonb) ? 'focused_exact_total_price_safe'
AND COALESCE(la.match_diagnostic_json->'reasons', '[]'::jsonb) ?| array[{FOCUSED_TOTAL_PRICE_IDENTITY_SQL_REASON_LIST}]
)
OR (
COALESCE(la.best_match_score, 0) >= :match_score_floor
AND COALESCE(la.match_diagnostic_json->'reasons', '[]'::jsonb) ?| array[{FOCUSED_TOTAL_PRICE_IDENTITY_SQL_REASON_LIST}]
)
)
AND NOT (
COALESCE(la.match_diagnostic_json->'reasons', '[]'::jsonb)
?| array[{REVALIDATABLE_REVIEW_BLOCK_SQL_REASON_LIST}]
)
)
OR (
la.attempt_status = 'true_low_confidence'
AND COALESCE(la.best_match_score, 0) >= 0.95
AND COALESCE(la.match_diagnostic_json->>'match_type', '') = 'exact'
AND COALESCE(la.match_diagnostic_json->>'price_basis', '') = 'manual_review'
AND COALESCE(la.match_diagnostic_json->'reasons', '[]'::jsonb) ? 'strong_product_line_match'
AND NOT (
COALESCE(la.match_diagnostic_json->'reasons', '[]'::jsonb)
?| array[{REVALIDATABLE_REVIEW_BLOCK_SQL_REASON_LIST}]
)
AND (
(
lower(COALESCE(p.name, '')) LIKE '%bioneo%'
AND lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%bioneo%'
AND COALESCE(p.name, '') LIKE '%微煥膚藻晶去角質凝膠%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%微煥膚藻晶去角質凝膠%'
AND lower(COALESCE(p.name, '')) ~* '150\\s*ml\\s*[x*]\\s*2'
AND lower(COALESCE(la.best_competitor_product_name, '')) ~* '150\\s*ml\\s*[x*]\\s*2'
)
OR (
(
lower(COALESCE(p.name, '')) LIKE '%cetaphil%'
OR COALESCE(p.name, '') LIKE '%舒特膚%'
)
AND (
lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%cetaphil%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%舒特膚%'
)
AND COALESCE(p.name, '') LIKE '%舒新雪潤安撫水%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%舒新雪潤安撫水%'
AND lower(COALESCE(p.name, '')) ~* '150\\s*ml\\s*[x*]\\s*2'
AND lower(COALESCE(la.best_competitor_product_name, '')) ~* '150\\s*ml\\s*[x*]\\s*2'
)
OR (
(
lower(COALESCE(p.name, '')) LIKE '%avene%'
OR COALESCE(p.name, '') LIKE '%雅漾%'
)
AND (
lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%avene%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%雅漾%'
)
AND COALESCE(p.name, '') LIKE '%舒護活泉水%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%舒護活泉水%'
AND lower(COALESCE(p.name, '')) ~* '300\\s*ml\\s*[x*]\\s*4'
AND lower(COALESCE(la.best_competitor_product_name, '')) ~* '300\\s*ml\\s*[x*]\\s*4'
)
OR (
(
lower(COALESCE(p.name, '')) LIKE '%schick%'
OR COALESCE(p.name, '') LIKE '%舒適牌%'
)
AND (
lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%schick%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%舒適牌%'
)
AND (
(
COALESCE(p.name, '') LIKE '%超捍輕便刀%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%超捍輕便刀%'
)
OR (
COALESCE(p.name, '') LIKE '%高級防滑輕便刀%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%高級防滑輕便刀%'
AND COALESCE(p.name, '') LIKE '%滋潤型%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%滋潤型%'
)
)
AND lower(COALESCE(p.name, '')) ~* '2\\s*\\+\\s*1\\s*入'
AND lower(COALESCE(la.best_competitor_product_name, '')) ~* '2\\s*\\+\\s*1\\s*入'
)
OR (
(
lower(COALESCE(p.name, '')) LIKE '%kose%'
OR COALESCE(p.name, '') LIKE '%高絲%'
)
AND (
lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%kose%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%高絲%'
)
AND COALESCE(p.name, '') LIKE '%雪肌精限量豪華禮盒%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%雪肌精限量豪華禮盒%'
AND lower(COALESCE(p.name, '')) ~* '500\\s*ml\\s*[x*]\\s*2'
AND lower(COALESCE(la.best_competitor_product_name, '')) ~* '500\\s*ml\\s*[x*]\\s*2'
)
)
)
OR (
la.attempt_status = 'true_low_confidence'
AND COALESCE(la.best_match_score, 0) >= 0.95
AND COALESCE(la.match_diagnostic_json->'reasons', '[]'::jsonb) ? 'strong_exact_spec_match'
AND COALESCE(la.match_diagnostic_json->'reasons', '[]'::jsonb) ?| array[{REVALIDATABLE_REVIEW_SQL_REASON_LIST}]
AND NOT (
COALESCE(la.match_diagnostic_json->'reasons', '[]'::jsonb)
?| array[{REVALIDATABLE_REVIEW_BLOCK_SQL_REASON_LIST}]
)
AND (
(
lower(COALESCE(p.name, '')) LIKE '%beauty foot%'
AND lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%beauty foot%'
AND COALESCE(p.name, '') LIKE '%足膜%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%足膜%'
)
OR (
COALESCE(p.name, '') LIKE '%足足稱奇%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%足足稱奇%'
AND COALESCE(p.name, '') LIKE '%足膜%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%足膜%'
)
OR (
lower(COALESCE(p.name, '')) LIKE '%ts6%'
AND lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%ts6%'
AND (
COALESCE(p.name, '') LIKE '%蜜愛潤滑液%'
OR COALESCE(p.name, '') LIKE '%蜜桃煥白凝膠%'
OR COALESCE(p.name, '') LIKE '%極淨白私密潔膚露%'
)
AND (
COALESCE(la.best_competitor_product_name, '') LIKE '%蜜愛潤滑液%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%蜜桃煥白凝膠%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%極淨白私密潔膚露%'
)
)
OR (
(
lower(COALESCE(p.name, '')) LIKE '%vaseline%'
OR COALESCE(p.name, '') LIKE '%凡士林%'
)
AND (
lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%vaseline%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%凡士林%'
)
AND COALESCE(p.name, '') LIKE '%嬰兒高純修護凝膠%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%嬰兒高純修護凝膠%'
)
)
)
OR (
la.attempt_status IN (
'low_score',
'refresh_low_score',
'true_low_confidence'
)
AND COALESCE(la.best_match_score, 0) >= :min_score
AND COALESCE(la.hard_veto, false) = false
AND COALESCE(la.match_diagnostic_json->>'comparison_mode', 'exact_identity') = 'exact_identity'
AND NOT (
COALESCE(la.match_diagnostic_json->'reasons', '[]'::jsonb)
?| array[{REVALIDATABLE_REVIEW_BLOCK_SQL_REASON_LIST}]
)
AND (
(
(
lower(COALESCE(p.name, '')) LIKE '%herb24%'
OR COALESCE(p.name, '') LIKE '%草本24%'
)
AND (
lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%herb24%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%草本24%'
)
AND COALESCE(p.name, '') LIKE '%晨霧純精油擴香儀%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%晨霧純精油擴香儀%'
AND COALESCE(p.name, '') LIKE '%黑%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%黑%'
)
OR (
lower(COALESCE(p.name, '')) LIKE '%pavaruni%'
AND lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%pavaruni%'
AND COALESCE(p.name, '') LIKE '%天然植物%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%天然植物%'
AND COALESCE(p.name, '') LIKE '%精油%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%精油%'
AND COALESCE(p.name, '') LIKE '%40%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%40%'
AND lower(COALESCE(p.name, '')) ~* '10\\s*ml'
AND lower(COALESCE(la.best_competitor_product_name, '')) ~* '10\\s*ml'
)
OR (
lower(COALESCE(p.name, '')) LIKE '%pavaruni%'
AND lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%pavaruni%'
AND COALESCE(p.name, '') LIKE '%香氛蠟燭%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%香氛蠟燭%'
AND COALESCE(p.name, '') LIKE '%20%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%20%'
AND lower(COALESCE(p.name, '')) ~* '450\\s*g'
AND lower(COALESCE(la.best_competitor_product_name, '')) ~* '450\\s*g'
)
OR (
(
lower(COALESCE(p.name, '')) LIKE '%derma%'
OR COALESCE(p.name, '') LIKE '%德瑪%'
)
AND (
lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%derma%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%德瑪%'
)
AND COALESCE(p.name, '') LIKE '%大地%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%大地%'
AND COALESCE(p.name, '') LIKE '%有機植萃%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%有機植萃%'
AND COALESCE(p.name, '') LIKE '%護膚油%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%護膚油%'
AND lower(COALESCE(p.name, '')) ~* '150\\s*ml'
AND lower(COALESCE(la.best_competitor_product_name, '')) ~* '150\\s*ml'
AND NOT (
COALESCE(p.name, '') ~* '(2\\s*入|2\\s*瓶|x\\s*2|\\*\\s*2|\\s*2)'
OR COALESCE(la.best_competitor_product_name, '') ~* '(2\\s*入|2\\s*瓶|x\\s*2|\\*\\s*2|\\s*2)'
)
)
)
)
OR (
(
(
la.attempt_status = 'true_low_confidence'
AND COALESCE(la.best_match_score, 0) >= 0.95
)
OR (
la.attempt_status = 'rescore_accepted_current'
AND COALESCE(la.best_match_score, 0) >= 0.76
)
)
AND COALESCE(la.hard_veto, false) = false
AND COALESCE(la.match_diagnostic_json->>'comparison_mode', 'exact_identity') = 'exact_identity'
AND NOT (
COALESCE(la.match_diagnostic_json->'reasons', '[]'::jsonb)
?| array[{FOCUSED_REVALIDATABLE_REVIEW_BLOCK_SQL_REASON_LIST}]
)
AND (
(
COALESCE(p.name, '') LIKE '%花美水%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%花美水%'
AND lower(COALESCE(p.name, '')) LIKE '%relax%'
AND lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%relax%'
AND COALESCE(p.name, '') LIKE '%薰衣草%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%薰衣草%'
AND COALESCE(p.name, '') LIKE '%潤滑凝膠%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%潤滑凝膠%'
)
OR (
COALESCE(p.name, '') LIKE '%聖克萊爾%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%聖克萊爾%'
AND COALESCE(p.name, '') LIKE '%私密呼呼溫和潔淨慕斯%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%私密呼呼溫和潔淨慕斯%'
)
OR (
(
lower(COALESCE(p.name, '')) LIKE '%biopeutic%'
OR COALESCE(p.name, '') LIKE '%葆療美%'
)
AND (
lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%biopeutic%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%葆療美%'
)
AND COALESCE(p.name, '') LIKE '%果酸煥膚水凝乳%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%果酸煥膚水凝乳%'
)
OR (
COALESCE(p.name, '') LIKE '%台塑生醫%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%台塑生醫%'
AND COALESCE(p.name, '') LIKE '%嬰兒沐浴洗髮%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%嬰兒沐浴洗髮%'
AND COALESCE(p.name, '') LIKE '%嬰兒沐浴精%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%嬰兒沐浴精%'
AND COALESCE(p.name, '') LIKE '%嬰幼童洗髮精%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%嬰幼童洗髮精%'
)
OR (
COALESCE(p.name, '') LIKE '%雅頓%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%雅頓%'
AND COALESCE(p.name, '') LIKE '%八小時潤澤護唇膏%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%八小時潤澤護唇膏%'
AND lower(COALESCE(p.name, '')) LIKE '%spf15%'
AND lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%spf15%'
)
OR (
COALESCE(p.name, '') LIKE '%理膚寶水%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%理膚寶水%'
AND COALESCE(p.name, '') LIKE '%全面修復潤唇膏%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%全面修復潤唇膏%'
)
OR (
lower(COALESCE(p.name, '')) LIKE '%3w%'
AND lower(COALESCE(p.name, '')) LIKE '%clinic%'
AND lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%3w%'
AND lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%clinic%'
AND COALESCE(p.name, '') LIKE '%膠原蛋白粉底液%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%膠原蛋白粉底液%'
AND COALESCE(p.name, '') ~* '50\\s*ml'
AND COALESCE(la.best_competitor_product_name, '') ~* '50\\s*ml'
)
OR (
COALESCE(p.name, '') LIKE '%花美水%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%花美水%'
AND lower(COALESCE(p.name, '')) LIKE '%moisture%'
AND lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%moisture%'
AND COALESCE(p.name, '') LIKE '%保濕修護%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%保濕修護%'
AND COALESCE(p.name, '') LIKE '%精華凝膠%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%精華凝膠%'
AND COALESCE(p.name, '') LIKE '%原黃金%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%原黃金%'
)
OR (
COALESCE(p.name, '') LIKE '%花美水%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%花美水%'
AND lower(COALESCE(p.name, '')) LIKE '%inclear%'
AND lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%inclear%'
AND COALESCE(p.name, '') LIKE '%櫻克麗兒%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%櫻克麗兒%'
AND COALESCE(p.name, '') LIKE '%私密淨化凝膠%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%私密淨化凝膠%'
)
OR (
(
lower(COALESCE(p.name, '')) LIKE '%kussen%'
OR COALESCE(p.name, '') LIKE '%葵森%'
)
AND (
lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%kussen%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%葵森%'
)
AND COALESCE(p.name, '') LIKE '%寶寶益菌屁屁膏%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%寶寶益菌屁屁膏%'
AND COALESCE(p.name, '') LIKE '%50%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%50%'
)
OR (
(
lower(COALESCE(p.name, '')) LIKE '%lab52%'
OR COALESCE(p.name, '') LIKE '%齒妍堂%'
)
AND (
lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%lab52%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%齒妍堂%'
)
AND COALESCE(p.name, '') LIKE '%牙刷%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%牙刷%'
AND (
COALESCE(p.name, '') LIKE '%嬰幼兒%'
OR COALESCE(p.name, '') LIKE '%幼兒%'
OR COALESCE(p.name, '') LIKE '%汪汪隊%'
)
AND (
COALESCE(la.best_competitor_product_name, '') LIKE '%嬰幼兒%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%幼兒%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%汪汪隊%'
)
)
OR (
(
lower(COALESCE(p.name, '')) LIKE '%flortte%'
OR COALESCE(p.name, '') LIKE '%花洛莉亞%'
)
AND (
lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%flortte%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%花洛莉亞%'
)
AND COALESCE(p.name, '') LIKE '%水果沙拉系列彩色防水眼線液筆%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%水果沙拉系列彩色防水眼線液筆%'
AND COALESCE(p.name, '') LIKE '%色號任選%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%色號任選%'
)
OR (
(
lower(COALESCE(p.name, '')) LIKE '%neutrogena%'
OR COALESCE(p.name, '') LIKE '%露得清%'
)
AND (
lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%neutrogena%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%露得清%'
)
AND COALESCE(p.name, '') LIKE '%護手霜%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%護手霜%'
AND COALESCE(p.name, '') LIKE '%無香%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%無香%'
AND COALESCE(p.name, '') LIKE '%有香%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%有香%'
)
OR (
(
lower(COALESCE(p.name, '')) LIKE '%kanebo%'
OR COALESCE(p.name, '') LIKE '%佳麗寶%'
)
AND (
lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%kanebo%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%佳麗寶%'
)
AND lower(COALESCE(p.name, '')) LIKE '%allie%'
AND lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%allie%'
AND COALESCE(p.name, '') LIKE '%持采亮化UV防曬水凝乳%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%持采亮化UV防曬水凝乳%'
AND COALESCE(p.name, '') LIKE '%任選%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%任選%'
)
OR (
(
lower(COALESCE(p.name, '')) LIKE '%o.p.i%'
OR lower(COALESCE(p.name, '')) LIKE '%opi%'
)
AND (
lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%o.p.i%'
OR lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%opi%'
)
AND COALESCE(p.name, '') LIKE '%類光繚%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%類光繚%'
AND (
COALESCE(p.name, '') LIKE '%指甲油%'
OR COALESCE(p.name, '') LIKE '%指彩%'
)
AND (
COALESCE(la.best_competitor_product_name, '') LIKE '%指甲油%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%指彩%'
)
AND COALESCE(p.name, '') ~* 'isl[a-z0-9]*[0-9]{2,3}'
AND COALESCE(la.best_competitor_product_name, '') ~* 'isl[a-z0-9]*[0-9]{2,3}'
)
OR (
lower(COALESCE(p.name, '')) LIKE '%ordinary%'
AND lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%ordinary%'
AND COALESCE(p.name, '') LIKE '%咖啡因%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%咖啡因%'
AND lower(COALESCE(p.name, '')) LIKE '%egcg%'
AND lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%egcg%'
AND COALESCE(p.name, '') LIKE '%兒茶眼部配方%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%兒茶眼部配方%'
)
OR (
(
lower(COALESCE(p.name, '')) LIKE '%sk-ii%'
OR lower(COALESCE(p.name, '')) LIKE '%skii%'
OR lower(COALESCE(p.name, '')) LIKE '%sk2%'
)
AND (
lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%sk-ii%'
OR lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%skii%'
OR lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%sk2%'
)
AND COALESCE(p.name, '') LIKE '%青春露%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%青春露%'
AND COALESCE(p.name, '') LIKE '%330%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%330%'
)
OR (
(
lower(COALESCE(p.name, '')) LIKE '%amiino%'
OR COALESCE(p.name, '') LIKE '%安美諾%'
)
AND (
lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%amiino%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%安美諾%'
)
AND COALESCE(p.name, '') LIKE '%美白修護霜%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%美白修護霜%'
AND COALESCE(p.name, '') LIKE '%30%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%30%'
)
OR (
lower(COALESCE(p.name, '')) LIKE '%natures%'
AND lower(COALESCE(p.name, '')) LIKE '%care%'
AND lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%natures%'
AND lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%care%'
AND COALESCE(p.name, '') LIKE '%綿羊油%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%綿羊油%'
)
OR (
lower(COALESCE(p.name, '')) LIKE '%tomoon%'
AND lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%tomoon%'
AND COALESCE(p.name, '') LIKE '%德國奔月%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%德國奔月%'
AND COALESCE(p.name, '') LIKE '%豪華套裝組%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%豪華套裝組%'
AND COALESCE(p.name, '') LIKE '%指甲%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%指甲%'
)
OR (
lower(COALESCE(p.name, '')) LIKE '%hh%'
AND lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%hh%'
AND COALESCE(p.name, '') LIKE '%私密植萃抗菌潔淨露%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%私密植萃抗菌潔淨露%'
AND COALESCE(p.name, '') LIKE '%私密衣物抗菌手洗精%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%私密衣物抗菌手洗精%'
)
OR (
(
lower(COALESCE(p.name, '')) LIKE '%sebamed%'
OR COALESCE(p.name, '') LIKE '%施巴%'
)
AND (
lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%sebamed%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%施巴%'
)
AND COALESCE(p.name, '') LIKE '%護潔露%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%護潔露%'
AND COALESCE(p.name, '') LIKE '%200%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%200%'
)
OR (
(
lower(COALESCE(p.name, '')) LIKE '%yes%'
OR COALESCE(p.name, '') LIKE '%德悅氏%'
)
AND (
lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%yes%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%德悅氏%'
)
AND COALESCE(p.name, '') LIKE '%鋒利窄弧型剪刀%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%鋒利窄弧型剪刀%'
AND lower(COALESCE(p.name, '')) LIKE '%9cm%'
AND lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%9cm%'
)
OR (
(
lower(COALESCE(p.name, '')) LIKE '%yes%'
OR COALESCE(p.name, '') LIKE '%德悅氏%'
)
AND (
lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%yes%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%德悅氏%'
)
AND COALESCE(p.name, '') LIKE '%腳指甲剪刀%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%腳指甲剪刀%'
AND lower(COALESCE(p.name, '')) LIKE '%10.5cm%'
AND lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%10.5cm%'
)
OR (
(
lower(COALESCE(p.name, '')) LIKE '%yes%'
OR COALESCE(p.name, '') LIKE '%德悅氏%'
)
AND (
lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%yes%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%德悅氏%'
)
AND COALESCE(p.name, '') LIKE '%極細指甲緣硬皮剪刀%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%極細指甲緣硬皮剪刀%'
AND lower(COALESCE(p.name, '')) LIKE '%9cm%'
AND lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%9cm%'
)
OR (
(
lower(COALESCE(p.name, '')) LIKE '%yes%'
OR COALESCE(p.name, '') LIKE '%德悅氏%'
)
AND (
lower(COALESCE(la.best_competitor_product_name, '')) LIKE '%yes%'
OR COALESCE(la.best_competitor_product_name, '') LIKE '%德悅氏%'
)
AND (
(
COALESCE(p.name, '') LIKE '%指甲剪附除垢銼刀%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%指甲剪附除垢銼刀%'
)
OR (
COALESCE(p.name, '') LIKE '%腳皮銼腳板%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%腳皮銼腳板%'
)
OR (
COALESCE(p.name, '') LIKE '%藍寶石銼刀%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%藍寶石銼刀%'
)
OR (
COALESCE(p.name, '') LIKE '%指甲拋光棒%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%指甲拋光棒%'
)
OR (
COALESCE(p.name, '') LIKE '%指甲剪%'
AND COALESCE(la.best_competitor_product_name, '') LIKE '%指甲剪%'
AND COALESCE(p.name, '') NOT LIKE '%附除垢%'
AND COALESCE(la.best_competitor_product_name, '') NOT LIKE '%附除垢%'
AND COALESCE(p.name, '') NOT LIKE '%腳指甲%'
AND COALESCE(la.best_competitor_product_name, '') NOT LIKE '%腳指甲%'
)
)
)
)
)
)
ORDER BY la.best_match_score DESC NULLS LAST, latest_price.price DESC NULLS LAST, p.i_code
LIMIT :limit
""")
with self.engine.connect() as conn:
rows = conn.execute(
sql,
{
"limit": max(1, min(int(limit), 300)),
"min_score": float(min_score),
"match_score_floor": MIN_MATCH_SCORE,
},
).fetchall()
return [dict(r._mapping) for r in rows]
def preview_retryable_candidate_revalidation(self, limit: int = 80, min_score: float = 0.70) -> dict:
"""
只讀預覽下一批可重評候選。
這個方法刻意復用正式重評的候選 SQL但只回傳受 limit 限制的摘要;
不啟動 PChome 搜尋、不呼叫 LLM、不寫 competitor_match_attempts / competitor_prices。
"""
preview_limit = max(1, min(int(limit), 300))
rows = self._fetch_retryable_candidate_skus(limit=preview_limit, min_score=min_score)
status_counts: dict[str, int] = {}
examples: list[dict] = []
review_gated_count = 0
for row in rows:
status = str(row.get("attempt_status") or "unknown")
status_counts[status] = status_counts.get(status, 0) + 1
if status == "true_low_confidence":
review_gated_count += 1
if len(examples) < 5:
score = row.get("best_match_score")
examples.append({
"sku": row.get("sku"),
"momo_name": row.get("name"),
"competitor_product_id": row.get("competitor_product_id"),
"competitor_product_name": row.get("competitor_product_name"),
"best_match_score": float(score) if score is not None else None,
"attempt_status": status,
})
return {
"available": True,
"preview_limit": preview_limit,
"candidate_count": len(rows),
"has_more": len(rows) >= preview_limit,
"review_gated_count": review_gated_count,
"status_counts": status_counts,
"examples": examples,
"min_score": float(min_score),
"boundary": "read_only_no_crawl_no_llm_no_db_write",
}
def preview_expired_identity_recovery(self, limit: int = 40) -> dict:
"""
只讀預覽過期 identity_v2 搜尋救援候選。
這裡只查本地 DB 中已建立 identity_v2、但價格已過期的 SKU
不查 PChome、不重新比對、不寫 attempts / prices。
"""
preview_limit = max(1, min(int(limit), 120))
rows = self._fetch_expired_identity_recovery_skus(limit=preview_limit)
examples: list[dict] = []
for row in rows[:5]:
examples.append({
"sku": row.get("sku"),
"momo_name": row.get("name"),
"competitor_product_id": row.get("competitor_product_id"),
"competitor_product_name": row.get("competitor_product_name"),
"match_score": float(row.get("match_score")) if row.get("match_score") is not None else None,
"expires_at": str(row.get("expires_at")) if row.get("expires_at") is not None else None,
})
return {
"available": True,
"preview_limit": preview_limit,
"candidate_count": len(rows),
"has_more": len(rows) >= preview_limit,
"examples": examples,
"boundary": "read_only_no_crawl_no_llm_no_db_write",
}
def _fetch_expired_identity_recovery_skus(self, limit: int = 40) -> list:
"""
取得適合 fresh-search recovery 的過期 identity_v2 商品。
這比一般 expired refresh 更窄:只收過去已是 exact / total_price /
price_alert_exact 的正式配對,且排除款式、香味、型態、入數、商業狀態等
高風險診斷或名稱訊號。避免把本來應該人工覆核的 stale pair 送進慢速搜尋。
"""
if self.engine is None:
raise RuntimeError("需要注入 SQLAlchemy engine")
from sqlalchemy import text
candidate_limit = max(1, min(int(limit) * 6, 400))
target_limit = max(1, min(int(limit), 120))
sql = text(f"""
WITH expired_competitor AS (
SELECT
cp.sku,
cp.competitor_product_id,
cp.competitor_product_name,
cp.match_score,
cp.expires_at,
COALESCE(cp.match_diagnostic_json->'reasons', '[]'::jsonb) AS diagnostic_reasons
FROM competitor_prices cp
WHERE cp.source = 'pchome'
AND cp.competitor_product_id IS NOT NULL
AND cp.competitor_product_id <> ''
AND (cp.expires_at IS NULL OR cp.expires_at <= CURRENT_TIMESTAMP)
AND COALESCE(cp.match_score, 0) >= :match_score_floor
AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
AND (
COALESCE(cp.tags, '[]'::jsonb) ? 'price_basis_total_price'
OR cp.match_diagnostic_json->>'price_basis' = 'total_price'
)
AND (
COALESCE(cp.tags, '[]'::jsonb) ? 'alert_tier_price_alert_exact'
OR cp.match_diagnostic_json->>'alert_tier' = 'price_alert_exact'
)
AND COALESCE(cp.match_diagnostic_json->>'comparison_mode', 'exact_identity') = 'exact_identity'
AND COALESCE(cp.hard_veto, false) = false
ORDER BY
CASE WHEN cp.expires_at IS NULL THEN 0 ELSE 1 END,
cp.expires_at ASC NULLS FIRST,
cp.sku
LIMIT :candidate_limit
)
SELECT
p.id AS product_id,
p.i_code AS sku,
p.name,
p.category,
latest_price.price AS momo_price,
ec.competitor_product_id,
ec.competitor_product_name,
ec.match_score,
ec.expires_at,
ec.diagnostic_reasons
FROM expired_competitor ec
JOIN products p
ON p.i_code = ec.sku
AND p.status = 'ACTIVE'
JOIN LATERAL (
SELECT pr.price
FROM price_records pr
WHERE pr.product_id = p.id
ORDER BY pr.timestamp DESC, pr.id DESC
LIMIT 1
) latest_price ON TRUE
ORDER BY
CASE WHEN ec.expires_at IS NULL THEN 0 ELSE 1 END,
ec.expires_at ASC NULLS FIRST,
latest_price.price DESC NULLS LAST,
p.i_code
""")
with self.engine.connect() as conn:
rows = conn.execute(
sql,
{
"candidate_limit": candidate_limit,
"match_score_floor": MIN_MATCH_SCORE,
},
).fetchall()
filtered = []
for row in rows:
payload = dict(row._mapping)
if _has_stale_identity_recovery_block(payload):
continue
payload.pop("diagnostic_reasons", None)
filtered.append(payload)
if len(filtered) >= target_limit:
break
return filtered
def _fetch_expired_identity_skus(self, limit: int = 120) -> list:
"""
取得 identity_v2 已確認、但 PChome 價格快取過期的商品。
這些商品不需重新 keyword search先用既有 PChome product_id 批次刷新價格。
"""
if self.engine is None:
raise RuntimeError("需要注入 SQLAlchemy engine")
from sqlalchemy import text
sql = text("""
SELECT
p.id AS product_id,
p.i_code AS sku,
p.name,
p.category,
latest_price.price AS momo_price,
cp.competitor_product_id,
cp.competitor_product_name,
cp.match_score,
cp.expires_at
FROM competitor_prices cp
JOIN products p
ON p.i_code = cp.sku
AND p.status = 'ACTIVE'
JOIN LATERAL (
SELECT pr.price
FROM price_records pr
WHERE pr.product_id = p.id
ORDER BY pr.timestamp DESC, pr.id DESC
LIMIT 1
) latest_price ON TRUE
WHERE cp.source = 'pchome'
AND cp.competitor_product_id IS NOT NULL
AND cp.competitor_product_id <> ''
AND (cp.expires_at IS NULL OR cp.expires_at <= CURRENT_TIMESTAMP)
AND COALESCE(cp.match_score, 0) >= :match_score_floor
AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
ORDER BY
CASE
WHEN cp.expires_at IS NULL THEN -1
WHEN (
COALESCE(cp.tags, '[]'::jsonb) ? 'price_basis_total_price'
OR cp.match_diagnostic_json->>'price_basis' = 'total_price'
)
AND (
COALESCE(cp.tags, '[]'::jsonb) ? 'alert_tier_price_alert_exact'
OR cp.match_diagnostic_json->>'alert_tier' = 'price_alert_exact'
)
THEN 0
ELSE 1
END,
cp.expires_at ASC NULLS FIRST,
latest_price.price DESC NULLS LAST,
p.i_code
LIMIT :limit
""")
with self.engine.connect() as conn:
rows = conn.execute(
sql,
{"limit": max(1, min(int(limit), 500)), "match_score_floor": MIN_MATCH_SCORE},
).fetchall()
return [dict(r._mapping) for r in rows]
def _upsert_competitor_price(
self,
sku: str,
product, # PChomeProduct
match_score: float,
tags: list,
momo_product_id: int = None,
momo_price: float = None,
diagnostics=None,
source: str = "pchome",
):
"""單筆寫入/更新最新快取,並追加一筆歷史快照。"""
from sqlalchemy import text
_taipei = timezone(timedelta(hours=8))
expires_at = (datetime.now(_taipei) + timedelta(hours=TTL_HOURS)).strftime("%Y-%m-%d %H:%M:%S")
tags_json = json.dumps(tags, ensure_ascii=False)
diagnostic_payload = _match_diagnostics_payload(diagnostics)
diagnostic_codes = diagnostic_payload.get("reasons") or []
product_payload = _product_snapshot_payload(product)
with self.engine.begin() as conn:
self._ensure_competitor_price_history_table(conn)
self._ensure_competitor_prices_columns(conn)
json_expr = "CAST(:match_diagnostic_json AS jsonb)" if conn.dialect.name == "postgresql" else ":match_diagnostic_json"
codes_expr = "CAST(:diagnostic_codes AS jsonb)" if conn.dialect.name == "postgresql" else ":diagnostic_codes"
conn.execute(text("""
INSERT INTO competitor_prices
(sku, source, price, original_price, discount_pct,
competitor_product_id, competitor_product_name,
competitor_product_url, competitor_image_url, competitor_stock,
match_score, tags, match_diagnostic_json,
comparison_mode, hard_veto, diagnostic_codes,
crawled_at, expires_at)
VALUES
(:sku, :source, :price, :original_price, :discount_pct,
:comp_id, :comp_name,
:competitor_product_url, :competitor_image_url, :competitor_stock,
:match_score, :tags, {json_expr},
:comparison_mode, :hard_veto, {codes_expr},
CURRENT_TIMESTAMP, :expires_at)
ON CONFLICT (sku, source) DO UPDATE
SET price = EXCLUDED.price,
original_price = EXCLUDED.original_price,
discount_pct = EXCLUDED.discount_pct,
competitor_product_id = EXCLUDED.competitor_product_id,
competitor_product_name = EXCLUDED.competitor_product_name,
competitor_product_url = EXCLUDED.competitor_product_url,
competitor_image_url = EXCLUDED.competitor_image_url,
competitor_stock = EXCLUDED.competitor_stock,
match_score = EXCLUDED.match_score,
tags = EXCLUDED.tags,
match_diagnostic_json = EXCLUDED.match_diagnostic_json,
comparison_mode = EXCLUDED.comparison_mode,
hard_veto = EXCLUDED.hard_veto,
diagnostic_codes = EXCLUDED.diagnostic_codes,
crawled_at = CURRENT_TIMESTAMP,
expires_at = :expires_at
""".format(json_expr=json_expr, codes_expr=codes_expr)), {
"sku": sku,
"source": source,
"price": product.price,
"original_price":product.original_price,
"discount_pct": product.discount,
"comp_id": product.product_id,
"comp_name": product.name[:200],
**product_payload,
"match_score": match_score,
"tags": tags_json,
"match_diagnostic_json": json.dumps(diagnostic_payload, ensure_ascii=False) if diagnostic_payload else None,
"comparison_mode": diagnostic_payload.get("comparison_mode"),
"hard_veto": diagnostic_payload.get("hard_veto"),
"diagnostic_codes": json.dumps(diagnostic_codes, ensure_ascii=False) if diagnostic_codes else None,
"expires_at": expires_at,
})
conn.execute(text("""
INSERT INTO competitor_price_history
(sku, source, momo_product_id, momo_price,
price, original_price, discount_pct,
competitor_product_id, competitor_product_name,
competitor_product_url, competitor_image_url, competitor_stock,
match_score, tags, match_diagnostic_json,
comparison_mode, hard_veto, diagnostic_codes,
crawled_at)
VALUES
(:sku, :source, :momo_product_id, :momo_price,
:price, :original_price, :discount_pct,
:comp_id, :comp_name,
:competitor_product_url, :competitor_image_url, :competitor_stock,
:match_score, :tags, {json_expr},
:comparison_mode, :hard_veto, {codes_expr},
CURRENT_TIMESTAMP)
""".format(json_expr=json_expr, codes_expr=codes_expr)), {
"sku": sku,
"source": source,
"momo_product_id": momo_product_id,
"momo_price": momo_price,
"price": product.price,
"original_price": product.original_price,
"discount_pct": product.discount,
"comp_id": product.product_id,
"comp_name": product.name[:200],
**product_payload,
"match_score": match_score,
"tags": tags_json,
"match_diagnostic_json": json.dumps(diagnostic_payload, ensure_ascii=False) if diagnostic_payload else None,
"comparison_mode": diagnostic_payload.get("comparison_mode"),
"hard_veto": diagnostic_payload.get("hard_veto"),
"diagnostic_codes": json.dumps(diagnostic_codes, ensure_ascii=False) if diagnostic_codes else None,
})
def _should_upsert_competitor_price(
self,
sku: str,
product,
match_score: float,
momo_name: str | None = None,
source: str = "pchome",
) -> tuple[bool, str]:
"""
保護正式 competitor_prices若既有配對是不同 PChome 商品,
只有超高信心才允許覆蓋,避免新 matcher 一次污染核心比價資料。
"""
from sqlalchemy import text
with self.engine.connect() as conn:
row = conn.execute(text("""
SELECT competitor_product_id, competitor_product_name, match_score, tags
FROM competitor_prices
WHERE sku = :sku
AND source = :source
LIMIT 1
"""), {"sku": sku, "source": source}).mappings().first()
if not row:
return True, "new_match"
existing_id = str(row.get("competitor_product_id") or "")
existing_name = str(row.get("competitor_product_name") or "")
incoming_id = str(getattr(product, "product_id", "") or "")
incoming_name = str(getattr(product, "name", "") or "")
try:
existing_score = float(row.get("match_score") or 0)
except (TypeError, ValueError):
existing_score = 0.0
existing_tags = row.get("tags") or []
if isinstance(existing_tags, str):
try:
existing_tags = json.loads(existing_tags)
except Exception:
existing_tags = []
if "identity_v2" not in existing_tags:
return True, "replace_legacy_unverified"
if not existing_id or existing_id == incoming_id:
return True, "same_or_empty_existing"
if existing_score < MIN_MATCH_SCORE:
return True, f"replace_low_existing_score={existing_score:.3f}"
if momo_name and existing_name:
try:
from services.marketplace_product_matcher import score_marketplace_match
existing_current = score_marketplace_match(momo_name, existing_name)
except Exception:
existing_current = None
if (
existing_current
and (
existing_current.hard_veto
or existing_current.score < MIN_MATCH_SCORE
)
and match_score >= MIN_MATCH_SCORE
):
return (
True,
"replace_stale_existing_identity="
f"{existing_score:.3f}->{existing_current.score:.3f}->{match_score:.3f}",
)
if existing_name and incoming_name:
try:
from services.marketplace_product_matcher import score_marketplace_match
candidate_equivalence = score_marketplace_match(existing_name, incoming_name)
except Exception:
candidate_equivalence = None
if (
candidate_equivalence
and candidate_equivalence.score >= 0.88
and not candidate_equivalence.hard_veto
and candidate_equivalence.comparison_mode == "exact_identity"
and match_score >= max(MIN_MATCH_SCORE, existing_score + 0.015)
):
return (
True,
"replace_same_identity_better_score="
f"{existing_score:.3f}->{match_score:.3f}",
)
if existing_score >= match_score:
return (
False,
f"existing_match_conflict;stronger_existing;existing_id={existing_id};"
f"incoming_id={incoming_id};existing_score={existing_score:.3f};"
f"incoming_score={match_score:.3f}",
)
if match_score >= REPLACE_DIFFERENT_PRODUCT_SCORE:
return True, f"replace_high_confidence_score={match_score:.3f}"
return (
False,
f"existing_match_conflict;existing_id={existing_id};"
f"incoming_id={incoming_id};existing_score={existing_score:.3f};"
f"incoming_score={match_score:.3f}",
)
def _fetch_latest_manual_review_for_candidate(
self,
sku: str,
competitor_product_id: str,
source: str = "pchome",
) -> Optional[dict]:
"""Read the latest human review for this exact candidate, if the table exists."""
if not competitor_product_id:
return None
from sqlalchemy import text
try:
with self.engine.connect() as conn:
row = conn.execute(text("""
SELECT review_action, review_reason, reviewer_identity, reviewed_at
FROM competitor_match_reviews
WHERE sku = :sku
AND source = :source
AND candidate_product_id = :candidate_id
ORDER BY reviewed_at DESC, id DESC
LIMIT 1
"""), {
"sku": sku,
"source": source,
"candidate_id": competitor_product_id,
}).mappings().first()
except Exception:
return None
return dict(row) if row else None
def _run_sku_items(
self,
skus: list,
source: str = "pchome",
label: str = "PChome 競品價格",
bounded_search: bool = False,
) -> FeederResult:
start = time.time()
if source != "pchome":
logger.warning(f"[Feeder] 尚未支援 source={source},跳過")
return FeederResult(0, 0, 0, 0, 0, 0.0)
from services.pchome_crawler import PChomeCrawler
crawler_timeout = BACKFILL_REQUEST_TIMEOUT if bounded_search else REQUEST_TIMEOUT
crawler = PChomeCrawler(timeout=crawler_timeout, delay=RATE_DELAY)
logger.info(f"[Feeder] 開始抓取 {len(skus)} 支商品的 {label}")
matched = 0
skipped_no = 0
skipped_low = 0
errors = 0
history_written = 0
attempts_written = 0
for item in skus:
sku = item["sku"]
momo_name = item["name"]
momo_product_id = item.get("product_id")
momo_price = item.get("momo_price")
search_terms = _build_search_keywords(momo_name)
try:
if bounded_search:
products = _search_pchome_candidates(
crawler,
momo_name,
search_terms,
momo_price=momo_price,
max_terms=BACKFILL_MAX_SEARCH_TERMS,
max_pages=BACKFILL_SEARCH_MAX_PAGES,
max_seconds=BACKFILL_MAX_SECONDS_PER_SKU,
)
else:
products = _search_pchome_candidates(crawler, momo_name, search_terms, momo_price=momo_price)
if not products:
logger.debug(f"[Feeder] {sku} 無搜尋結果,跳過")
browse_diagnostic = self._prepare_browse_diagnostic(
momo_name,
search_terms=search_terms,
reason="no_result",
candidate_count=0,
)
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=search_terms,
candidate_count=0,
attempt_status="no_result",
browse_diagnostic=browse_diagnostic,
source=source,
)
attempts_written += 1
skipped_no += 1
continue
ranked_matches = _rank_match_details(momo_name, products, momo_price=momo_price)
if not ranked_matches:
browse_diagnostic = self._prepare_browse_diagnostic(
momo_name,
search_terms=search_terms,
reason="no_match",
candidate_count=len(products),
)
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=search_terms,
candidate_count=len(products),
attempt_status="no_match",
browse_diagnostic=browse_diagnostic,
source=source,
)
attempts_written += 1
skipped_no += 1
continue
selected_match = None
manually_rejected_ids: list[str] = []
for candidate_product, candidate_score, candidate_diagnostics in ranked_matches:
candidate_review = self._fetch_latest_manual_review_for_candidate(
sku,
getattr(candidate_product, "product_id", None),
source=source,
)
if (candidate_review or {}).get("review_action") == "reject_identity":
manually_rejected_ids.append(str(getattr(candidate_product, "product_id", "") or ""))
continue
selected_match = (
candidate_product,
candidate_score,
candidate_diagnostics,
candidate_review,
)
break
if not selected_match:
best_product, score, diagnostics = ranked_matches[0]
rejected_note = ",".join(product_id for product_id in manually_rejected_ids if product_id)
logger.info(
f"[Feeder] {sku} 所有可信候選都已被人工否決,跳過正式寫入 | "
f"rejected_candidates={rejected_note}"
)
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=search_terms,
candidate_count=len(products),
attempt_status="manual_rejected",
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
error_message=(
f"manual_review_rejected; rejected_candidates={rejected_note}; "
f"{_format_match_diagnostics(diagnostics)}"
),
source=source,
)
attempts_written += 1
skipped_low += 1
continue
best_product, score, diagnostics, manual_review = selected_match
manual_action = (manual_review or {}).get("review_action")
if manual_action == "unit_price_required":
logger.info(
f"[Feeder] {sku} 候選已被人工標記為單位價比較,不寫正式總價差 | "
f"candidate={getattr(best_product, 'product_id', None)}"
)
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=search_terms,
candidate_count=len(products),
attempt_status="manual_unit_price_required",
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
error_message=f"manual_review_unit_price_required; {_format_match_diagnostics(diagnostics)}",
source=source,
)
attempts_written += 1
skipped_low += 1
continue
manual_accept_override = manual_action == "accept_identity"
if getattr(diagnostics, "comparison_mode", "") == "unit_comparable" and not manual_accept_override:
logger.info(
f"[Feeder] {sku} 候選屬單位價可比但非同販售組合,"
f"不寫入正式價差 | {_format_match_diagnostics(diagnostics)}"
)
browse_diagnostic = self._prepare_browse_diagnostic(
momo_name,
search_terms=search_terms,
reason="unit_comparable",
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
candidate_count=len(products),
)
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=search_terms,
candidate_count=len(products),
attempt_status="unit_comparable",
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
browse_diagnostic=browse_diagnostic,
error_message=_format_match_diagnostics(diagnostics),
source=source,
)
attempts_written += 1
skipped_low += 1
continue
if score < MIN_MATCH_SCORE and not manual_accept_override:
attempt_status = _classify_low_score_attempt(score, diagnostics)
if (
attempt_status == "recoverable_low_score"
and _has_variant_selection_gap(momo_name, ranked_matches, score)
):
attempt_status = "true_low_confidence"
logger.debug(
f"[Feeder] {sku} 比對分數過低 ({score:.3f} < {MIN_MATCH_SCORE})"
f"{_format_match_diagnostics(diagnostics)}"
)
browse_diagnostic = self._prepare_browse_diagnostic(
momo_name,
search_terms=search_terms,
reason=attempt_status,
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
candidate_count=len(products),
)
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=search_terms,
candidate_count=len(products),
attempt_status=attempt_status,
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
browse_diagnostic=browse_diagnostic,
error_message=_format_match_diagnostics(diagnostics),
source=source,
)
attempts_written += 1
skipped_low += 1
continue
if manual_accept_override:
score = max(score, MIN_MATCH_SCORE)
if not manual_accept_override and not _is_auto_price_write_safe(diagnostics):
attempt_status = _classify_auto_write_block_attempt(score, diagnostics)
browse_diagnostic = self._prepare_browse_diagnostic(
momo_name,
search_terms=search_terms,
reason=attempt_status,
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
candidate_count=len(products),
)
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=search_terms,
candidate_count=len(products),
attempt_status=attempt_status,
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
browse_diagnostic=browse_diagnostic,
error_message=f"auto_price_write_blocked; {_format_match_diagnostics(diagnostics)}",
source=source,
)
attempts_written += 1
skipped_low += 1
continue
tags = _extend_match_tags(_extract_tags(best_product), diagnostics)
if manual_accept_override:
tags.extend(["manual_review", "manual_accept"])
tags = [tag for tag in tags if tag != "identity_veto"]
tags = list(dict.fromkeys(tags))
should_write, write_reason = self._should_upsert_competitor_price(
sku,
best_product,
score,
momo_name=momo_name,
source=source,
)
if manual_accept_override and not should_write:
should_write = True
write_reason = "manual_accept_override"
if not should_write:
logger.info(f"[Feeder] {sku} 進入人工覆核,不覆蓋既有配對 | {write_reason}")
browse_diagnostic = self._prepare_browse_diagnostic(
momo_name,
search_terms=search_terms,
reason="protected_existing_match",
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
candidate_count=len(products),
)
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=search_terms,
candidate_count=len(products),
attempt_status="protected_existing_match",
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
browse_diagnostic=browse_diagnostic,
error_message=f"{write_reason}; {_format_match_diagnostics(diagnostics)}",
source=source,
)
attempts_written += 1
skipped_low += 1
continue
tags.append(write_reason)
self._upsert_competitor_price(
sku,
best_product,
score,
tags,
momo_product_id=momo_product_id,
momo_price=momo_price,
diagnostics=diagnostics,
source=source,
)
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=search_terms,
candidate_count=len(products),
attempt_status="matched",
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
source=source,
)
matched += 1
history_written += 1
attempts_written += 1
logger.debug(
f"[Feeder] {sku} → PChome ${best_product.price} "
f"score={score:.3f} tags={tags}"
)
except Exception as e:
logger.error(f"[Feeder] {sku} 處理失敗: {e}")
try:
browse_diagnostic = self._prepare_browse_diagnostic(
momo_name,
search_terms=search_terms,
reason="crawler_error",
candidate_count=0,
)
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=search_terms,
attempt_status="error",
browse_diagnostic=browse_diagnostic,
error_message=str(e),
source=source,
)
attempts_written += 1
except Exception as attempt_error:
logger.warning(f"[Feeder] {sku} 比對嘗試紀錄寫入失敗: {attempt_error}")
errors += 1
duration = round(time.time() - start, 2)
logger.info(
f"[Feeder] 完成 matched={matched} skipped_no={skipped_no} "
f"skipped_low={skipped_low} errors={errors} "
f"history_written={history_written} attempts_written={attempts_written} 耗時={duration}s"
)
return FeederResult(
total_skus=len(skus),
matched=matched,
skipped_no_result=skipped_no,
skipped_low_score=skipped_low,
errors=errors,
duration_sec=duration,
history_written=history_written,
attempts_written=attempts_written,
)
def _run_known_identity_refresh_items(
self,
skus: list,
source: str = "pchome",
label: str = "已確認身份價格刷新",
allow_missing_recovery: bool = True,
bounded_recovery: bool = False,
) -> FeederResult:
start = time.time()
if source != "pchome":
logger.warning(f"[Feeder] 尚未支援 source={source},跳過")
return FeederResult(0, 0, 0, 0, 0, 0.0)
if not skus:
return FeederResult(0, 0, 0, 0, 0, 0.0)
from services.pchome_crawler import PChomeCrawler
crawler_timeout = BACKFILL_REQUEST_TIMEOUT if bounded_recovery else REQUEST_TIMEOUT
crawler = PChomeCrawler(timeout=crawler_timeout, delay=RATE_DELAY)
recovery_kwargs = (
{
"max_terms": BACKFILL_MAX_SEARCH_TERMS,
"max_pages": BACKFILL_SEARCH_MAX_PAGES,
"max_seconds": BACKFILL_MAX_SECONDS_PER_SKU,
}
if bounded_recovery
else {}
)
requested_ids = [
str(item.get("competitor_product_id") or "").strip()
for item in skus
if str(item.get("competitor_product_id") or "").strip()
]
ok, message, products = crawler.fetch_product_details(requested_ids, batch_size=20)
product_map = {_product_id_key(product.product_id): product for product in products} if ok else {}
logger.info(
f"[Feeder] {label} product_id 批次查詢 | requested={len(requested_ids)} "
f"returned={len(product_map)} msg={message}"
)
matched = 0
skipped_no = 0
skipped_low = 0
errors = 0
history_written = 0
attempts_written = 0
for item in skus:
sku = item["sku"]
momo_name = item["name"]
momo_product_id = item.get("product_id")
momo_price = item.get("momo_price")
competitor_product_id = str(item.get("competitor_product_id") or "").strip()
search_terms = [f"known_product_id:{competitor_product_id}"] if competitor_product_id else []
try:
product = product_map.get(_product_id_key(competitor_product_id))
if not product:
if not allow_missing_recovery:
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=search_terms,
candidate_count=0,
attempt_status="refresh_no_result",
error_message=(
f"PChome product_id not returned: {competitor_product_id}; "
"fresh_search_recovery_deferred"
),
source=source,
)
skipped_no += 1
attempts_written += 1
continue
recovered, recovery_terms, recovery_candidate_count = _recover_low_score_with_fresh_search(
crawler,
momo_name,
momo_price=momo_price,
existing_product_id=competitor_product_id,
**recovery_kwargs,
)
if recovered:
best_product, score, diagnostics = recovered
if getattr(diagnostics, "comparison_mode", "") == "unit_comparable":
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=search_terms + [term for term in recovery_terms if term not in search_terms],
candidate_count=max(1, recovery_candidate_count),
attempt_status="refresh_unit_comparable",
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
error_message=_format_match_diagnostics(diagnostics),
source=source,
)
skipped_low += 1
attempts_written += 1
continue
if score >= MIN_MATCH_SCORE and not getattr(diagnostics, "hard_veto", False):
if not _is_auto_price_write_safe(diagnostics):
attempt_terms = search_terms + [term for term in recovery_terms if term not in search_terms]
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=attempt_terms,
candidate_count=max(1, recovery_candidate_count),
attempt_status=_classify_auto_write_block_attempt(score, diagnostics),
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
error_message=f"auto_price_write_blocked; {_format_match_diagnostics(diagnostics)}",
source=source,
)
skipped_low += 1
attempts_written += 1
continue
tags = _extend_match_tags(
_extract_tags(best_product),
diagnostics,
["refresh_known_identity", "fresh_search_recovery", "missing_known_product_id"],
)
should_write, write_reason = self._should_upsert_competitor_price(
sku,
best_product,
score,
momo_name=momo_name,
source=source,
)
attempt_terms = search_terms + [term for term in recovery_terms if term not in search_terms]
if not should_write:
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=attempt_terms,
candidate_count=max(1, recovery_candidate_count),
attempt_status="protected_existing_match",
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
error_message=f"{write_reason}; {_format_match_diagnostics(diagnostics)}",
source=source,
)
skipped_low += 1
attempts_written += 1
continue
tags.append(write_reason)
self._upsert_competitor_price(
sku,
best_product,
score,
tags,
momo_product_id=momo_product_id,
momo_price=momo_price,
diagnostics=diagnostics,
source=source,
)
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=attempt_terms,
candidate_count=max(1, recovery_candidate_count),
attempt_status="matched",
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
source=source,
)
matched += 1
history_written += 1
attempts_written += 1
continue
attempt_status = _classify_low_score_attempt(score, diagnostics)
if (
attempt_status == "recoverable_low_score"
and _has_variant_selection_gap(momo_name, [(best_product, score, diagnostics)], score)
):
attempt_status = "true_low_confidence"
attempt_terms = search_terms + [term for term in recovery_terms if term not in search_terms]
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=attempt_terms,
candidate_count=max(1, recovery_candidate_count),
attempt_status=attempt_status,
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
error_message=(
"missing_known_product_id_fresh_search_low_confidence; "
f"PChome product_id not returned: {competitor_product_id}; "
f"{_format_match_diagnostics(diagnostics)}"
),
source=source,
)
skipped_low += 1
attempts_written += 1
continue
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=search_terms + [term for term in recovery_terms if term not in search_terms],
candidate_count=max(0, recovery_candidate_count),
attempt_status="refresh_no_result",
error_message=f"PChome product_id not returned: {competitor_product_id}",
source=source,
)
skipped_no += 1
attempts_written += 1
continue
result = _find_best_match_detail(momo_name, [product], momo_price=momo_price)
if not result:
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=search_terms,
candidate_count=1,
attempt_status="refresh_no_match",
source=source,
)
skipped_no += 1
attempts_written += 1
continue
best_product, score, diagnostics = result
if getattr(diagnostics, "comparison_mode", "") == "unit_comparable":
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=search_terms,
candidate_count=1,
attempt_status="refresh_unit_comparable",
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
error_message=_format_match_diagnostics(diagnostics),
source=source,
)
skipped_low += 1
attempts_written += 1
continue
if score < MIN_MATCH_SCORE:
recovery_terms: list[str] = []
recovery_candidate_count = 0
if allow_missing_recovery:
recovered, recovery_terms, recovery_candidate_count = _recover_low_score_with_fresh_search(
crawler,
momo_name,
momo_price=momo_price,
existing_product_id=competitor_product_id,
**recovery_kwargs,
)
if recovered:
recovered_product, recovered_score, recovered_diagnostics = recovered
if (
recovered_score > score
or getattr(diagnostics, "hard_veto", False)
and not getattr(recovered_diagnostics, "hard_veto", False)
):
best_product, score, diagnostics = recovered_product, recovered_score, recovered_diagnostics
if score >= MIN_MATCH_SCORE:
extras = ["refresh_known_identity"]
if recovery_terms:
extras.append("fresh_search_recovery")
if not _is_auto_price_write_safe(diagnostics):
candidate_count = max(1, recovery_candidate_count or 1)
attempt_terms = search_terms + [term for term in recovery_terms if term not in search_terms]
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=attempt_terms,
candidate_count=candidate_count,
attempt_status=_classify_auto_write_block_attempt(score, diagnostics),
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
error_message=f"auto_price_write_blocked; {_format_match_diagnostics(diagnostics)}",
source=source,
)
skipped_low += 1
attempts_written += 1
continue
tags = _extend_match_tags(_extract_tags(best_product), diagnostics, extras)
should_write, write_reason = self._should_upsert_competitor_price(
sku,
best_product,
score,
momo_name=momo_name,
source=source,
)
candidate_count = max(1, recovery_candidate_count or 1)
attempt_terms = search_terms + [term for term in recovery_terms if term not in search_terms]
if not should_write:
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=attempt_terms,
candidate_count=candidate_count,
attempt_status="protected_existing_match",
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
error_message=f"{write_reason}; {_format_match_diagnostics(diagnostics)}",
source=source,
)
skipped_low += 1
attempts_written += 1
continue
tags.append(write_reason)
self._upsert_competitor_price(
sku,
best_product,
score,
tags,
momo_product_id=momo_product_id,
momo_price=momo_price,
diagnostics=diagnostics,
source=source,
)
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=attempt_terms,
candidate_count=candidate_count,
attempt_status="matched",
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
source=source,
)
matched += 1
history_written += 1
attempts_written += 1
continue
attempt_status = _classify_low_score_attempt(score, diagnostics)
if (
attempt_status == "recoverable_low_score"
and _has_variant_selection_gap(momo_name, [(best_product, score, diagnostics)], score)
):
attempt_status = "true_low_confidence"
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=search_terms + [term for term in recovery_terms if term not in search_terms],
candidate_count=max(1, recovery_candidate_count or 1),
attempt_status=attempt_status,
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
error_message=(
_format_match_diagnostics(diagnostics)
if allow_missing_recovery
else f"fresh_search_recovery_deferred; {_format_match_diagnostics(diagnostics)}"
),
source=source,
)
skipped_low += 1
attempts_written += 1
continue
if not _is_auto_price_write_safe(diagnostics):
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=search_terms,
candidate_count=1,
attempt_status=_classify_auto_write_block_attempt(score, diagnostics),
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
error_message=f"auto_price_write_blocked; {_format_match_diagnostics(diagnostics)}",
source=source,
)
skipped_low += 1
attempts_written += 1
continue
tags = _extend_match_tags(_extract_tags(best_product), diagnostics, ["refresh_known_identity"])
should_write, write_reason = self._should_upsert_competitor_price(
sku,
best_product,
score,
momo_name=momo_name,
source=source,
)
if not should_write:
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=search_terms,
candidate_count=1,
attempt_status="protected_existing_match",
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
error_message=f"{write_reason}; {_format_match_diagnostics(diagnostics)}",
source=source,
)
skipped_low += 1
attempts_written += 1
continue
tags.append(write_reason)
self._upsert_competitor_price(
sku,
best_product,
score,
tags,
momo_product_id=momo_product_id,
momo_price=momo_price,
diagnostics=diagnostics,
source=source,
)
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=search_terms,
candidate_count=1,
attempt_status="matched",
best_product=best_product,
best_score=score,
diagnostics=diagnostics,
source=source,
)
matched += 1
history_written += 1
attempts_written += 1
except Exception as e:
logger.error(f"[Feeder] {sku} 已確認身份刷新失敗: {e}")
try:
self._record_match_attempt(
sku,
momo_name,
momo_product_id=momo_product_id,
momo_price=momo_price,
search_terms=search_terms,
attempt_status="refresh_error",
error_message=str(e),
source=source,
)
attempts_written += 1
except Exception as attempt_error:
logger.warning(f"[Feeder] {sku} 刷新嘗試紀錄寫入失敗: {attempt_error}")
errors += 1
duration = round(time.time() - start, 2)
logger.info(
f"[Feeder] {label} 完成 matched={matched}/{len(skus)} "
f"skip_no={skipped_no} skip_low={skipped_low} errors={errors} "
f"history_written={history_written} attempts_written={attempts_written} 耗時={duration}s"
)
return FeederResult(
total_skus=len(skus),
matched=matched,
skipped_no_result=skipped_no,
skipped_low_score=skipped_low,
errors=errors,
duration_sec=duration,
history_written=history_written,
attempts_written=attempts_written,
)
def run(self, source: str = "pchome") -> FeederResult:
"""
執行一輪競品價格抓取與寫入
Args:
source: 競品來源代碼(目前支援 'pchome'
Returns:
FeederResult
"""
try:
skus = self._fetch_active_skus()
except Exception as e:
logger.error(f"[Feeder] 讀取商品清單失敗: {e}")
return FeederResult(0, 0, 0, 0, 1, 0.0)
return self._run_sku_items(skus, source=source, label="PChome 競品價格")
def run_expired_identity_refresh(self, limit: int = 120, source: str = "pchome") -> FeederResult:
"""刷新已通過 identity_v2、但 PChome 價格快取過期的商品。"""
try:
skus = self._fetch_expired_identity_skus(limit=limit)
except Exception as e:
logger.error(f"[Feeder] 讀取過期 identity_v2 商品失敗: {e}")
return FeederResult(0, 0, 0, 0, 1, 0.0)
return self._run_known_identity_refresh_items(
skus,
source=source,
label="identity_v2 過期價格刷新",
allow_missing_recovery=False,
)
def run_expired_identity_search_recovery(self, limit: int = 40, source: str = "pchome") -> FeederResult:
"""
搜尋救援已建立 identity_v2、但價格過期或舊 PChome ID 失效的商品。
與一般 refresh-stale 不同,這條路徑允許在 known product_id 缺失或重評低分時,
走受控 fresh_search_recovery最後仍會經過 hard veto、auto price write
safety、overwrite protection 才能寫入正式 competitor_prices。
"""
try:
skus = self._fetch_expired_identity_recovery_skus(limit=max(1, min(int(limit), 120)))
except Exception as e:
logger.error(f"[Feeder] 讀取過期 identity_v2 搜尋救援商品失敗: {e}")
return FeederResult(0, 0, 0, 0, 1, 0.0)
return self._run_known_identity_refresh_items(
skus,
source=source,
label="identity_v2 過期價格搜尋救援",
allow_missing_recovery=True,
bounded_recovery=True,
)
def run_retryable_candidate_revalidation(
self,
limit: int = 80,
min_score: float = 0.70,
source: str = "pchome",
) -> FeederResult:
"""重新評分近門檻候選,避免 matcher 升級後仍卡在舊的 low_score。"""
try:
skus = self._fetch_retryable_candidate_skus(limit=limit, min_score=min_score)
except Exception as e:
logger.error(f"[Feeder] 讀取近門檻候選失敗: {e}")
return FeederResult(0, 0, 0, 0, 1, 0.0)
return self._run_known_identity_refresh_items(
skus,
source=source,
label="近門檻候選重新評分",
)
def run_unmatched_priority(self, limit: int = 80, source: str = "pchome") -> FeederResult:
"""優先補抓尚未有有效 PChome 配對的高價商品。"""
try:
skus = self._fetch_unmatched_priority_skus(limit=limit)
except Exception as e:
logger.error(f"[Feeder] 讀取待比對優先商品失敗: {e}")
return FeederResult(0, 0, 0, 0, 1, 0.0)
return self._run_sku_items(
skus,
source=source,
label="待比對優先補抓",
bounded_search=True,
)
# ─────────────────────────────────────────────
# CLI 測試(不依賴 DB直接測試爬蟲 + 比對邏輯)
# python3 services/competitor_price_feeder.py
# ─────────────────────────────────────────────
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
from services.pchome_crawler import PChomeCrawler
test_items = [
{"sku": "A003", "name": "舒特膚AD乳液200ml"},
{"sku": "A001", "name": "玻尿酸面膜10片裝"},
{"sku": "A009", "name": "美白化妝水150ml"},
]
crawler = PChomeCrawler(delay=0.8)
print("=== Competitor Price Feeder CLI 測試 ===\n")
for item in test_items:
keyword = item["name"][:20]
ok, msg, products = crawler.search_products(keyword, limit=10)
if not ok or not products:
print(f"[{item['sku']}] 無結果: {msg}")
continue
result = _find_best_match(item["name"], products)
if not result:
print(f"[{item['sku']}] 無法比對")
continue
best, score = result
tags = _extract_tags(best)
symbol = "" if score >= MIN_MATCH_SCORE else "⚠️ 低分"
print(
f"{symbol} [{item['sku']}] {item['name'][:25]}\n"
f" → PChome: {best.name[:40]}\n"
f" → 售價 ${best.price} | 分數 {score:.3f} | 標籤 {tags}\n"
)