Files
ewoooc/tests/test_pchome_crawler_search.py
OoO bb8c29e56d
All checks were successful
CD Pipeline / deploy (push) Successful in 1m7s
V10.590 修正 PChome 副標去重與比價覆核入口
2026-06-04 22:00:32 +08:00

290 lines
9.2 KiB
Python

from datetime import datetime
import requests
class _FakeResponse:
def __init__(self, payload=None, status_code=200):
self._payload = payload or {}
self.status_code = status_code
self.text = ""
def json(self):
return self._payload
def raise_for_status(self):
if self.status_code >= 400:
raise requests.HTTPError(f"HTTP {self.status_code}", response=self)
def test_pchome_search_scans_multiple_pages_until_limit(monkeypatch):
    """Search pages through results until ``limit`` unique ids are gathered.

    The fake session serves A001/A002 on page 1 and A002/A003 on page 2, so
    the test expects exactly two page requests, de-duplicated ids passed to
    ``fetch_product_details`` in order, and three products returned.
    """
    from services.pchome_crawler import PChomeCrawler, PChomeProduct

    # Product ids served per search page; any other page is empty.
    ids_by_page = {1: ("A001", "A002"), 2: ("A002", "A003")}
    recorded_calls = []
    detail_requests = []

    class PagedSession:
        headers = {}

        def get(self, url, params=None, timeout=None):
            query = params or {}
            recorded_calls.append((url, dict(query), timeout))
            page_number = int(query.get("page") or 1)
            prods = [{"Id": pid} for pid in ids_by_page.get(page_number, ())]
            return _FakeResponse({"Prods": prods})

    def build_product(product_id):
        return PChomeProduct(
            product_id=product_id,
            name=f"商品 {product_id}",
            price=100,
            original_price=120,
            discount=17,
            image_url="",
            product_url=f"https://24h.pchome.com.tw/prod/{product_id}",
            stock=10,
            store="24h",
            rating=None,
            review_count=0,
            is_on_sale=True,
            crawled_at=datetime.now(),
        )

    def fake_fetch_product_details(product_ids, batch_size=20):
        detail_requests.extend(product_ids)
        return True, "details ok", [build_product(pid) for pid in product_ids]

    crawler = PChomeCrawler(timeout=1, delay=0, max_retries=0)
    crawler.session = PagedSession()
    monkeypatch.setattr(crawler, "fetch_product_details", fake_fetch_product_details)

    success, message, products = crawler.search_products("理膚寶水", limit=3, max_pages=3)

    requested_pages = [query["page"] for _url, query, _timeout in recorded_calls]
    returned_ids = [item.product_id for item in products]

    assert success is True
    assert "搜尋頁數 2" in message
    assert detail_requests == ["A001", "A002", "A003"]
    assert requested_pages == [1, 2]
    assert returned_ids == ["A001", "A002", "A003"]
def test_pchome_get_retries_transient_timeout():
    """``_get_with_retry`` retries once after a timeout and then succeeds."""
    from services.pchome_crawler import PChomeCrawler

    attempts = []

    class FlakySession:
        headers = {}

        def get(self, url, **kwargs):
            attempts.append((url, kwargs))
            # Only the very first attempt times out; later ones succeed.
            if len(attempts) > 1:
                return _FakeResponse({"ok": True})
            raise requests.Timeout("temporary timeout")

    crawler = PChomeCrawler(timeout=1, delay=0, max_retries=1, retry_backoff=0)
    crawler.session = FlakySession()

    response = crawler._get_with_retry("https://example.test/api", timeout=1)

    assert response.json() == {"ok": True}
    assert len(attempts) == 2
def test_pchome_fetch_product_details_accepts_list_payload():
    """``fetch_product_details`` parses a response whose JSON body is a list."""
    from services.pchome_crawler import PChomeCrawler

    product_id = "DDABCD-12345678"
    seen_requests = []

    class ListPayloadSession:
        headers = {}

        def get(self, url, params=None, timeout=None):
            seen_requests.append((url, params, timeout))
            record = {
                "Id": "DDABCD-12345678",
                "Name": "測試商品 50ml",
                "Nick": "測試商品 50ml x2 限量組",
                "Price": {"P": 799, "M": 999},
                "Pic": {"B": "/items/DDABCD12345678.jpg"},
                "Qty": 8,
                "Store": "24h",
                "isOnSale": True,
            }
            return _FakeResponse([record])

    crawler = PChomeCrawler(timeout=1, delay=0, max_retries=0)
    crawler.session = ListPayloadSession()

    success, message, products = crawler.fetch_product_details([product_id])

    assert success is True
    assert message == "成功取得 1 個商品資料"
    assert len(seen_requests) == 1
    assert [item.product_id for item in products] == [product_id]

    first = products[0]
    assert first.price == 799
    assert first.subtitle == "測試商品 50ml x2 限量組"
    assert first.match_name == "測試商品 50ml x2 限量組"
def test_pchome_match_name_combines_non_duplicate_nick():
    """A nick that does not repeat the title is appended after the title."""
    from services.pchome_crawler import _build_match_name

    title = "水楊酸身體乳雙入組"
    nick = "2% 水楊酸身體乳 210ml x2"
    expected = "水楊酸身體乳雙入組 2% 水楊酸身體乳 210ml x2"

    assert _build_match_name(title, nick) == expected
def test_pchome_match_name_deduplicates_normalized_nick_prefix():
    """A nick that starts with the title (modulo spacing) is not duplicated."""
    from services.pchome_crawler import _build_match_name

    title = "【Laura Mercier 蘿拉蜜思】 煥顏透明蜜粉 29g"
    nick = "【Laura Mercier 蘿拉蜜思】煥顏透明蜜粉 29g 專櫃公司貨"

    combined = _build_match_name(title, nick)

    assert combined == "【Laura Mercier 蘿拉蜜思】煥顏透明蜜粉 29g 專櫃公司貨"
    # The size token must appear once, proving the title was not repeated.
    assert combined.count("29g") == 1
def test_pchome_match_name_deduplicates_marketing_prefix_before_title():
    """Marketing text placed before a repeated title is moved after the title."""
    from services.pchome_crawler import _build_match_name

    ad_title = "Cetaphil舒特膚 AD益膚康修護舒敏乳霜227g"
    ad_nick = "《即期特賣》Cetaphil舒特膚 AD益膚康修護舒敏乳霜227g"
    cream_title = "【Cetaphil 舒特膚】長效潤膚霜250g"
    cream_nick = "48小時長效保濕升級版 【Cetaphil 舒特膚】長效潤膚霜250g"

    ad_result = _build_match_name(ad_title, ad_nick)
    cream_result = _build_match_name(cream_title, cream_nick)

    assert ad_result == "Cetaphil舒特膚 AD益膚康修護舒敏乳霜227g 《即期特賣》"
    assert ad_result.count("227g") == 1
    assert cream_result == "【Cetaphil 舒特膚】長效潤膚霜250g 48小時長效保濕升級版"
    assert cream_result.count("250g") == 1
def test_pchome_match_name_strips_html_marketing_noise():
    """HTML-wrapped promotional text in the nick is dropped from the match name."""
    from services.pchome_crawler import _build_match_name

    title = "TS6護一生沁涼潔淨慕斯100g"
    noisy_nick = '<font color="#FF0066">★降溫限定。92%滿意★</font><br>TS6護一生 沁涼潔淨慕斯100g'

    cleaned = _build_match_name(title, noisy_nick)

    assert "<font" not in cleaned
    assert "降溫限定" not in cleaned
    assert cleaned == "TS6護一生 沁涼潔淨慕斯100g"
    assert cleaned.count("100g") == 1
def test_feeder_search_cleanup_preserves_bracket_brand_and_specs():
    """Search-text cleanup keeps the bracketed brand and both volume specs."""
    from services.competitor_price_feeder import _clean_search_text

    raw_title = "【蘭蔻】絕對完美玫瑰霜(60ml)+玫瑰精露150ml"

    result = _clean_search_text(raw_title)

    assert "蘭蔻" in result
    assert "60ml" in result
    assert "150ml" in result
def test_feeder_search_candidate_passes_page_cap(monkeypatch):
    """Candidate search forwards ``limit=40`` and ``max_pages=2`` by default.

    The matcher is stubbed to return a 0.95 score so the single fake product
    is expected to be returned as the only candidate.
    """
    from services.competitor_price_feeder import _search_pchome_candidates
    from services.pchome_crawler import PChomeProduct

    product_fields = {
        "product_id": "DDAB01-PAGE2",
        "name": "理膚寶水 B5 修復霜 40ml",
        "price": 679,
        "original_price": 799,
        "discount": 15,
        "image_url": "",
        "product_url": "https://24h.pchome.com.tw/prod/DDAB01-PAGE2",
        "stock": 20,
        "store": "24h",
        "rating": 4.7,
        "review_count": 8,
        "is_on_sale": True,
        "crawled_at": datetime.now(),
    }
    product = PChomeProduct(**product_fields)
    search_calls = []

    class RecordingCrawler:
        def search_products(self, keyword, **kwargs):
            search_calls.append((keyword, kwargs))
            return True, "ok", [product]

    def fake_score(*_args, **_kwargs):
        # Throwaway object exposing only the ``score`` attribute.
        return type("Diagnostics", (), {"score": 0.95})()

    monkeypatch.setattr(
        "services.marketplace_product_matcher.score_marketplace_match",
        fake_score,
    )

    candidates = _search_pchome_candidates(
        RecordingCrawler(),
        "理膚寶水 B5 修復霜 40ml",
        keywords=["理膚寶水 B5 40ml"],
        momo_price=699,
    )

    _first_keyword, first_kwargs = search_calls[0]
    assert candidates == [product]
    assert first_kwargs["limit"] == 40
    assert first_kwargs["max_pages"] == 2
def test_feeder_search_candidate_respects_bounded_budget(monkeypatch):
    """Candidate search honours the ``max_terms``/``max_pages`` budget.

    With ``max_terms=1`` and ``max_pages=1`` only the first of three keywords
    is expected to be searched, with ``limit=20`` and ``max_pages=1``
    forwarded to the crawler.
    """
    from services.competitor_price_feeder import _search_pchome_candidates
    from services.pchome_crawler import PChomeProduct

    product_fields = {
        "product_id": "DDAB01-FAST",
        "name": "理膚寶水 B5 修復霜 40ml",
        "price": 679,
        "original_price": 799,
        "discount": 15,
        "image_url": "",
        "product_url": "https://24h.pchome.com.tw/prod/DDAB01-FAST",
        "stock": 20,
        "store": "24h",
        "rating": 4.7,
        "review_count": 8,
        "is_on_sale": True,
        "crawled_at": datetime.now(),
    }
    product = PChomeProduct(**product_fields)
    search_calls = []

    class RecordingCrawler:
        def search_products(self, keyword, **kwargs):
            search_calls.append((keyword, kwargs))
            return True, "ok", [product]

    def fake_score(*_args, **_kwargs):
        # Throwaway object exposing only the ``score`` attribute.
        return type("Diagnostics", (), {"score": 0.80})()

    monkeypatch.setattr(
        "services.marketplace_product_matcher.score_marketplace_match",
        fake_score,
    )

    candidates = _search_pchome_candidates(
        RecordingCrawler(),
        "理膚寶水 B5 修復霜 40ml",
        keywords=["理膚寶水 B5", "理膚寶水 修復霜", "b5 cream"],
        momo_price=699,
        max_terms=1,
        max_pages=1,
        max_seconds=30,
    )

    searched_keywords = [keyword for keyword, _kwargs in search_calls]
    _first_keyword, first_kwargs = search_calls[0]

    assert candidates == [product]
    assert searched_keywords == ["理膚寶水 B5"]
    assert first_kwargs["limit"] == 20
    assert first_kwargs["max_pages"] == 1