Files
ewoooc/services/momo_crawler.py
ogt c268b5cc02
All checks were successful
CD Pipeline / deploy (push) Successful in 1m5s
feat: show product identity in ai recommendations
2026-06-26 18:33:11 +08:00

814 lines
32 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
MOMO 購物網爬蟲服務
爬取 MOMO 購物網商品資料,支援:
- 關鍵字搜尋
- 熱銷商品排行
API 參考:
- 搜尋 API: https://m.momoshop.com.tw/search.momo
- 商品 API: https://www.momoshop.com.tw/goods/GoodsDetail.jsp?i_code=XXX
"""
from __future__ import annotations
import re
import json
import time
import logging
import os
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass, asdict
from datetime import datetime
import requests
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)

# Tunables for the PChome -> MOMO targeted search. Each one can be overridden
# through the environment variable of the same name; the second argument is
# the default used when the variable is unset.
MOMO_TARGETED_SEARCH_MIN_SCORE = float(
    os.environ.get("MOMO_TARGETED_SEARCH_MIN_SCORE", "0.45")
)
MOMO_TARGETED_SEARCH_MAX_PRODUCTS = int(
    os.environ.get("MOMO_TARGETED_SEARCH_MAX_PRODUCTS", "30")
)
MOMO_TARGETED_SEARCH_MAX_TERMS = int(
    os.environ.get("MOMO_TARGETED_SEARCH_MAX_TERMS", "4")
)
MOMO_TARGETED_SEARCH_LIMIT_PER_TERM = int(
    os.environ.get("MOMO_TARGETED_SEARCH_LIMIT_PER_TERM", "8")
)
@dataclass
class MomoProduct:
    """One product record scraped from a MOMO listing."""

    product_id: str           # product ID
    name: str                 # product name
    price: int                # selling price
    original_price: int       # list price
    discount: Optional[int]   # discount in percent
    image_url: str            # image URL
    product_url: str          # product page URL
    brand: str                # brand
    crawled_at: datetime      # time the record was scraped

    def to_dict(self) -> dict:
        """Return the fields as a dict, with ``crawled_at`` as an ISO-8601 string."""
        return {**asdict(self), 'crawled_at': self.crawled_at.isoformat()}
class MomoCrawler:
    """Scraper for MOMO shopping search pages.

    A search queries the mobile endpoint first and falls back to the desktop
    endpoint when nothing could be parsed. Several parsing strategies are
    layered because the markup varies: product anchors, the Next.js
    app-router payload, ``__NEXT_DATA__`` JSON and loose regex matching over
    embedded JSON.
    """

    # Base URLs
    BASE_URL = 'https://www.momoshop.com.tw'
    MOBILE_URL = 'https://m.momoshop.com.tw'
    SEARCH_API = 'https://www.momoshop.com.tw/search/searchShop.jsp'
    # Mobile search endpoint; it is tried before the desktop one.
    MOBILE_SEARCH_API = 'https://m.momoshop.com.tw/search.momo'
    # Default headers imitate a mobile browser.
    DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://m.momoshop.com.tw/',
    }

    def __init__(self, timeout: int = 30, delay: float = 1.0):
        """Create a crawler with its own HTTP session.

        Args:
            timeout: Per-request timeout in seconds.
            delay: Minimum spacing between rate-limited calls, in seconds.
        """
        self.timeout = timeout
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update(self.DEFAULT_HEADERS)
        self._last_request_time = 0

    def _rate_limit(self):
        """Sleep until at least ``self.delay`` seconds passed since the last call."""
        elapsed = time.time() - self._last_request_time
        if elapsed < self.delay:
            time.sleep(self.delay - elapsed)
        self._last_request_time = time.time()

    def search_products(self, keyword: str, limit: int = 10, sort_by: str = 'sSaleQty/dc') -> Tuple[bool, str, List[MomoProduct]]:
        """Search MOMO for products.

        Args:
            keyword: Search keyword.
            limit: Maximum number of products to return.
            sort_by: Sort order passed to the desktop endpoint:
                - 'sSaleQty/dc': sales volume, high to low (best sellers)
                - 'sPrice/ac': price, low to high
                - 'sPrice/dc': price, high to low
                - 'sSaleDate/dc': listing date, newest first
                The mobile endpoint only distinguishes the sales-volume
                order from its default order.

        Returns:
            ``(success, message, products)``. Errors are reported through
            the tuple rather than raised.
        """
        try:
            self._rate_limit()
            # Mobile endpoint first.
            products = self._search_mobile(keyword, limit, sort_by)
            if products:
                return True, f"成功取得 {len(products)} 個商品", products
            # Mobile yielded nothing: fall back to the desktop endpoint.
            products = self._search_desktop(keyword, limit, sort_by)
            if products:
                return True, f"成功取得 {len(products)} 個商品", products
            return False, "無法解析商品資料", []
        except requests.Timeout:
            logger.error("[MOMO] 請求超時")
            return False, "請求超時", []
        except Exception as e:
            logger.error(f"[MOMO] 搜尋失敗: {e}")
            return False, str(e), []

    def _search_mobile(self, keyword: str, limit: int, sort_by: str) -> List[MomoProduct]:
        """Query the mobile search endpoint; return ``[]`` on any failure."""
        try:
            params = {
                'searchKeyword': keyword,
                'sortType': '4' if 'sSaleQty' in sort_by else '1',  # 4 = sales volume, 1 = relevance
                'maxPage': '1',
                'curPage': '1',
            }
            logger.info(f"[MOMO] 行動版搜尋: {keyword}")
            response = self.session.get(
                self.MOBILE_SEARCH_API,
                params=params,
                timeout=self.timeout
            )
            if response.status_code != 200:
                logger.warning(f"[MOMO] 行動版搜尋失敗: HTTP {response.status_code}")
                return []
            return self._parse_mobile_results(response.text, limit)
        except Exception as e:
            logger.warning(f"[MOMO] 行動版搜尋異常: {e}")
            return []

    def _search_desktop(self, keyword: str, limit: int, sort_by: str) -> List[MomoProduct]:
        """Query the desktop search endpoint; return ``[]`` on any failure."""
        try:
            params = {
                'keyword': keyword,
                'searchType': '1',
                'cateLevel': '-1',
                'curPage': '1',
                'maxPage': '1',
                'minPage': '1',
                'areaCode': 'all',
                'isFuzzy': '0',
                'sortType': sort_by
            }
            logger.info(f"[MOMO] 桌面版搜尋: {keyword}")
            # Override the session's mobile User-Agent with a desktop one.
            headers = self.DEFAULT_HEADERS.copy()
            headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
            response = self.session.get(
                self.SEARCH_API,
                params=params,
                headers=headers,
                timeout=self.timeout
            )
            if response.status_code != 200:
                logger.warning(f"[MOMO] 桌面版搜尋失敗: HTTP {response.status_code}")
                return []
            return self._parse_search_results(response.text, limit)
        except Exception as e:
            logger.warning(f"[MOMO] 桌面版搜尋異常: {e}")
            return []

    def _parse_mobile_results(self, html: str, limit: int) -> List[MomoProduct]:
        """Parse a mobile search page, trying four strategies in order.

        Each later strategy only runs when the earlier ones found nothing.
        Returns ``[]`` when parsing fails altogether.
        """
        products = []
        try:
            soup = BeautifulSoup(html, 'html.parser')
            # Strategy 1: every anchor whose href carries an i_code.
            product_links = soup.find_all('a', href=re.compile(r'i_code=\d+'))
            seen_ids = set()
            for link in product_links:
                if len(products) >= limit:
                    break
                try:
                    href = link.get('href', '')
                    match = re.search(r'i_code=(\d+)', href)
                    if not match:
                        continue
                    product_id = match.group(1)
                    if product_id in seen_ids:
                        continue
                    seen_ids.add(product_id)
                    # Product name: title attribute, then img alt, then a
                    # name-like child element, then the anchor text.
                    name = ''
                    if link.get('title'):
                        name = link.get('title')
                    if not name:
                        img = link.find('img')
                        if img and img.get('alt'):
                            name = img.get('alt')
                    if not name:
                        name_elem = link.find(class_=re.compile(r'name|title|goods', re.I))
                        if name_elem:
                            name = name_elem.get_text(strip=True)
                    if not name:
                        name = link.get_text(strip=True)
                    if not name or len(name) < 3:
                        continue
                    # Price: first price-like element under the nearest container.
                    price = 0
                    parent = link.find_parent(['li', 'div', 'article'])
                    if parent:
                        price_elem = parent.find(class_=re.compile(r'price', re.I))
                        if price_elem:
                            # Shared helper, so stray punctuation before the
                            # digits no longer drops the product.
                            price = self._parse_momo_price(price_elem.get_text(strip=True))
                    # Image
                    image_url = ''
                    img = link.find('img')
                    if img:
                        image_url = img.get('src') or img.get('data-src') or img.get('data-original', '')
                        if image_url and image_url.startswith('//'):
                            image_url = 'https:' + image_url
                    product = MomoProduct(
                        product_id=product_id,
                        name=name.strip()[:100],  # cap the name length
                        price=price,
                        original_price=price,
                        discount=None,
                        image_url=image_url,
                        product_url=f'{self.BASE_URL}/goods/GoodsDetail.jsp?i_code={product_id}',
                        brand='',
                        crawled_at=datetime.now()
                    )
                    products.append(product)
                except Exception as e:
                    logger.debug(f"[MOMO] 解析商品連結失敗: {e}")
                    continue
            # Strategy 2: the Next.js app router embeds goodsInfoList inside
            # an escaped script payload string.
            if not products:
                products = self._parse_next_search_payload_results(html, limit)
            # Strategy 3: the __NEXT_DATA__ JSON document.
            if not products:
                script = soup.find('script', {'id': '__NEXT_DATA__'})
                if script and script.string:
                    products.extend(self._parse_next_data_items(script.string, limit))
            # Strategy 4: loose regex over JSON embedded in the HTML.
            if not products:
                json_pattern = re.compile(r'"goodsCode"\s*:\s*"?(\d+)"?.*?"goodsName"\s*:\s*"([^"]+)".*?"price"\s*:\s*(\d+)', re.DOTALL)
                matches = json_pattern.findall(html)
                for match in matches[:limit]:
                    product_id, name, price = match
                    product = MomoProduct(
                        product_id=product_id,
                        name=name,
                        price=int(price),
                        original_price=int(price),
                        discount=None,
                        image_url='',
                        product_url=f'{self.BASE_URL}/goods/GoodsDetail.jsp?i_code={product_id}',
                        brand='',
                        crawled_at=datetime.now()
                    )
                    products.append(product)
            logger.info(f"[MOMO] 行動版解析到 {len(products)} 個商品")
            return products
        except Exception as e:
            logger.error(f"[MOMO] 解析行動版結果失敗: {e}")
            return []

    def _parse_next_data_items(self, raw_json: str, limit: int) -> List[MomoProduct]:
        """Build products from the text of a ``__NEXT_DATA__`` script tag.

        Items are parsed one by one so a single malformed entry (non-dict
        item, textual or missing price) no longer discards the whole page.
        Returns ``[]`` when the document is not usable JSON.
        """
        try:
            next_data = json.loads(raw_json)
            props = (next_data.get('props') or {}).get('pageProps') or {}
            items = props.get('products') or props.get('items') or []
        except (json.JSONDecodeError, AttributeError):
            return []
        if not isinstance(items, list):
            return []
        products = []
        for item in items[:limit]:
            if not isinstance(item, dict):
                continue
            price = self._coerce_price(item.get('price', 0))
            # Fall back to the selling price when no usable list price exists.
            original_price = self._coerce_price(item.get('suggestPrice', item.get('price', 0))) or price
            product = MomoProduct(
                product_id=str(item.get('goodsCode', '')),
                name=item.get('goodsName', ''),
                price=price,
                original_price=original_price,
                discount=None,
                image_url=item.get('imgUrl', ''),
                product_url=f'{self.BASE_URL}/goods/GoodsDetail.jsp?i_code={item.get("goodsCode", "")}',
                brand='',
                crawled_at=datetime.now()
            )
            if product.product_id and product.name:
                products.append(product)
        return products

    def _parse_next_search_payload_results(self, html: str, limit: int) -> List[MomoProduct]:
        """Parse the escaped Next.js ``goodsInfoList`` payload of the search page.

        Records without a name or with a non-positive price are skipped.
        """
        products: List[MomoProduct] = []
        seen_ids: set[str] = set()
        product_pattern = re.compile(
            r'\\"goodsCode\\"\s*:\s*\\"(?P<code>\d+)\\"'
            r'.{0,800}?'
            r'\\"goodsName\\"\s*:\s*\\"(?P<name>.*?)\\"'
            r'.{0,1600}?'
            r'\\"goodsPrice\\"\s*:\s*\\"(?P<price>[^\\"]+)\\"'
            r'.{0,2400}?'
            r'\\"imgUrl\\"\s*:\s*\\"(?P<img>[^\\"]*)\\"',
            re.DOTALL,
        )
        for match in product_pattern.finditer(html):
            if len(products) >= limit:
                break
            product_id = match.group("code")
            if product_id in seen_ids:
                continue
            name = self._decode_payload_text(match.group("name"))
            price = self._parse_momo_price(match.group("price"))
            if not name or price <= 0:
                continue
            # Mark the id as seen only once the record is accepted, so an
            # unusable first occurrence does not hide a later valid one.
            seen_ids.add(product_id)
            image_url = self._decode_payload_text(match.group("img"))
            original_price = self._parse_original_price_nearby(html, match.start(), match.end()) or price
            discount = round((1 - price / original_price) * 100) if original_price > price else None
            products.append(MomoProduct(
                product_id=product_id,
                name=name.strip()[:160],
                price=price,
                original_price=original_price,
                discount=discount,
                image_url=image_url,
                product_url=f'{self.BASE_URL}/goods/GoodsDetail.jsp?i_code={product_id}',
                brand='',
                crawled_at=datetime.now(),
            ))
        return products

    @staticmethod
    def _decode_payload_text(value: str) -> str:
        """Decode JSON string escapes in a payload fragment.

        Returns ``""`` for empty or ``None`` input. When the fragment is not
        a valid JSON string body, only ``\\u0026`` and ``\\/`` are unescaped.
        """
        if not value:
            return ""
        try:
            return json.loads(f'"{value}"')
        except Exception:
            return value.replace("\\u0026", "&").replace("\\/", "/")

    @staticmethod
    def _parse_momo_price(value: str) -> int:
        """Return the first integer in ``value`` (thousands commas allowed), or 0."""
        # Require a leading digit so a lone comma cannot match and crash int().
        match = re.search(r"\d[\d,]*", value or "")
        return int(match.group(0).replace(",", "")) if match else 0

    @staticmethod
    def _coerce_price(value) -> int:
        """Convert a JSON price field (number or text) to an int; 0 if unusable."""
        if value is None:
            return 0
        if isinstance(value, (int, float)):
            try:
                return int(value)
            except (ValueError, OverflowError):  # NaN or infinity
                return 0
        return MomoCrawler._parse_momo_price(str(value))

    def _parse_original_price_nearby(self, html: str, start: int, end: int) -> int:
        """Find ``goodsPriceOri`` between ``start`` and 1800 chars past ``end``; 0 if absent."""
        snippet = html[start:min(len(html), end + 1800)]
        match = re.search(r'\\"goodsPriceOri\\"\s*:\s*\\"(?P<price>[^\\"]+)\\"', snippet)
        return self._parse_momo_price(match.group("price")) if match else 0

    def _parse_search_results(self, html: str, limit: int) -> List[MomoProduct]:
        """Parse a desktop search result page.

        Args:
            html: Page HTML.
            limit: Maximum number of products to return.

        Returns:
            Parsed products, or ``[]`` when parsing fails.
        """
        products = []
        try:
            # Each product card is expected in an <li class="...goodsItemLi...">.
            item_pattern = re.compile(
                r'<li[^>]*class=["\'][^"\']*goodsItemLi[^"\']*["\'][^>]*>(.*?)</li>',
                re.DOTALL | re.IGNORECASE
            )
            for item_html in item_pattern.findall(html):
                # Count parsed products, not raw items, so incomplete cards
                # do not use up the limit.
                if len(products) >= limit:
                    break
                try:
                    code_match = re.search(r'i_code=(\d+)', item_html)
                    name_match = re.search(r'<p[^>]*class=["\'][^"\']*prdName[^"\']*["\'][^>]*>([^<]+)</p>', item_html)
                    if not name_match:
                        name_match = re.search(r'title=["\']([^"\']+)["\']', item_html)
                    price_match = re.search(r'<b[^>]*>\$?([\d,]+)</b>', item_html)
                    if not price_match:
                        # Require a "$" here: an unanchored digit run would
                        # usually pick up the i_code from the href.
                        price_match = re.search(r'\$\s*([\d,]*\d)', item_html)
                    img_match = re.search(r'(?:src|data-original)=["\']([^"\']+\.(?:jpg|png|webp)[^"\']*)["\']', item_html, re.IGNORECASE)
                    original_price_match = re.search(r'<del[^>]*>\$?([\d,]+)</del>', item_html)
                    if code_match and name_match and price_match:
                        product_id = code_match.group(1)
                        name = name_match.group(1).strip()
                        price = int(price_match.group(1).replace(',', ''))
                        original_price = int(original_price_match.group(1).replace(',', '')) if original_price_match else price
                        image_url = img_match.group(1) if img_match else ''
                        # Discount in percent, only when the list price is higher.
                        discount = None
                        if original_price > price:
                            discount = round((1 - price / original_price) * 100)
                        product = MomoProduct(
                            product_id=product_id,
                            name=name,
                            price=price,
                            original_price=original_price,
                            discount=discount,
                            image_url=image_url if image_url.startswith('http') else f'https:{image_url}' if image_url.startswith('//') else image_url,
                            product_url=f'{self.BASE_URL}/goods/GoodsDetail.jsp?i_code={product_id}',
                            brand='',
                            crawled_at=datetime.now()
                        )
                        products.append(product)
                except Exception as e:
                    logger.debug(f"[MOMO] 解析商品項目失敗: {e}")
                    continue
            # Nothing parsed from the cards: try JSON embedded in the HTML.
            if not products:
                json_pattern = re.compile(r'"goodsCode"\s*:\s*"(\d+)".*?"goodsName"\s*:\s*"([^"]+)".*?"price"\s*:\s*(\d+)', re.DOTALL)
                matches = json_pattern.findall(html)
                for match in matches[:limit]:
                    product_id, name, price = match
                    product = MomoProduct(
                        product_id=product_id,
                        name=name,
                        price=int(price),
                        original_price=int(price),
                        discount=None,
                        image_url='',
                        product_url=f'{self.BASE_URL}/goods/GoodsDetail.jsp?i_code={product_id}',
                        brand='',
                        crawled_at=datetime.now()
                    )
                    products.append(product)
            logger.info(f"[MOMO] 解析到 {len(products)} 個商品")
            return products
        except Exception as e:
            logger.error(f"[MOMO] 解析搜尋結果失敗: {e}")
            return []
# Module-wide crawler instance, created on first use.
_crawler_instance: Optional[MomoCrawler] = None


def get_crawler() -> MomoCrawler:
    """Return the shared ``MomoCrawler``, constructing it on the first call."""
    global _crawler_instance
    if _crawler_instance is not None:
        return _crawler_instance
    _crawler_instance = MomoCrawler()
    return _crawler_instance
def search_momo_products(keyword: str, limit: int = 10) -> Tuple[bool, str, List[dict]]:
    """Search MOMO with the shared crawler and return plain dicts.

    Args:
        keyword: Search keyword.
        limit: Maximum number of products to return.

    Returns:
        ``(success, message, products)`` where each product is the
        ``to_dict()`` form of a crawler result.
    """
    ok, note, found = get_crawler().search_products(keyword, limit)
    rows = [item.to_dict() for item in found]
    return ok, note, rows
def _to_float(value, default: float = 0.0) -> float:
    """Parse a price-like value into a finite float.

    ``","`` and ``"$"`` are removed before parsing. ``default`` is returned
    for ``None``, for unparseable input, and for non-finite results such as
    ``"nan"`` or ``"inf"``, which would otherwise leak into later price
    arithmetic.
    """
    import math  # local import: the module does not import math at file level

    if value is None:
        return default
    try:
        parsed = float(str(value).replace(",", "").replace("$", "").strip())
    except (TypeError, ValueError):
        return default
    return parsed if math.isfinite(parsed) else default
def _product_name_from_payload(payload: dict) -> str:
    """Return the stripped text of the first truthy name-like field, or ``""``.

    Fields are checked in this order: ``name``, ``product_name``, ``title``,
    ``商品名稱``.
    """
    for field in ("name", "product_name", "title", "商品名稱"):
        candidate = payload.get(field)
        if candidate:
            return str(candidate).strip()
    return ""
def _product_price_from_payload(payload: dict) -> float:
    """Return the first truthy price-like field converted with ``_to_float``.

    Fields are checked in this order: ``price``, ``pchome_price``,
    ``sale_price``, ``售價``. When none is truthy, the value of the last
    field is what gets converted.
    """
    raw = None
    for field in ("price", "pchome_price", "sale_price", "售價"):
        raw = payload.get(field)
        if raw:
            break
    return _to_float(raw)
def _dedupe_terms(terms: list[str], max_terms: int) -> list[str]:
    """Normalise, de-duplicate and cap a list of search terms.

    Whitespace runs collapse to one space, terms shorter than two characters
    are dropped, and duplicates are detected case-insensitively with the
    first spelling kept. At most ``max_terms`` terms are returned; a
    non-positive ``max_terms`` yields an empty list.
    """
    # The cap is only checked after an append below, so without this guard
    # max_terms <= 0 would still return one term.
    if max_terms <= 0:
        return []
    result: list[str] = []
    seen: set[str] = set()
    for term in terms or ():
        normalized = re.sub(r"\s+", " ", str(term or "").strip())
        if len(normalized) < 2:
            continue
        key = normalized.lower()
        if key in seen:
            continue
        seen.add(key)
        result.append(normalized)
        if len(result) >= max_terms:
            break
    return result
def build_targeted_momo_search_terms(pchome_name: str, max_terms: int = MOMO_TARGETED_SEARCH_MAX_TERMS) -> list[str]:
    """Derive MOMO search terms from a PChome product name.

    Terms come from ``build_search_terms`` in
    ``services.marketplace_product_matcher``; if importing or calling it
    fails, a warning is logged and the term list starts empty. The original
    name is then appended and the list goes through ``_dedupe_terms``.
    Returns ``[]`` for an empty name.
    """
    if pchome_name:
        try:
            from services.marketplace_product_matcher import build_search_terms
            candidates = build_search_terms(pchome_name, max_terms=max_terms)
        except Exception:
            logger.warning("[MOMO] 產生精準搜尋詞失敗,改用原商品名", exc_info=True)
            candidates = []
        # The original name is always appended as the last candidate.
        candidates.append(pchome_name)
        return _dedupe_terms(candidates, max_terms=max_terms)
    return []
def search_momo_products_for_pchome_products(
pchome_products: list[dict],
*,
limit_per_product: int = MOMO_TARGETED_SEARCH_LIMIT_PER_TERM,
max_products: int = MOMO_TARGETED_SEARCH_MAX_PRODUCTS,
max_terms_per_product: int = MOMO_TARGETED_SEARCH_MAX_TERMS,
min_score: float = MOMO_TARGETED_SEARCH_MIN_SCORE,
crawler: MomoCrawler | None = None,
) -> Tuple[bool, str, List[dict]]:
"""Look up MOMO candidates for each PChome product.

For up to ``max_products`` PChome rows, search MOMO with the terms from
``build_targeted_momo_search_terms``, score every hit against the PChome
name/price with ``score_marketplace_match`` and keep hits scoring at least
``min_score``. Each kept row is annotated with ``target_*`` fields and
comparison flags; per MOMO product id only the best-scoring row survives.

Returns ``(success, message, candidates)`` with candidates sorted by
``target_match_score`` descending. ``success`` is False when the input is
empty, the matcher module cannot be imported, or no candidate was kept.

NOTE(review): indentation was lost in the copy under review; see the note
next to the ``comparison_mode != "unit_comparable"`` check below.
"""
if not pchome_products:
return False, "沒有 PChome 商品可用來搜尋 MOMO", []
# The matcher helpers are imported lazily; a failure is logged and reported
# through the return tuple instead of being raised.
try:
from services.marketplace_product_matcher import (
build_unit_price_comparison,
score_marketplace_match,
)
except Exception as exc:
logger.error("[MOMO] 無法載入商品比對工具: %s", exc, exc_info=True)
return False, "商品比對工具暫時不可用", []
crawler = crawler or get_crawler()
# Best annotated row per MOMO product id.
candidates_by_id: dict[str, dict] = {}
searched_products = 0
# NOTE(review): searched_terms is appended to but never read in this function.
searched_terms: list[str] = []
for target in pchome_products[:max_products]:
pchome_name = _product_name_from_payload(target)
if not pchome_name:
continue
searched_products += 1
pchome_price = _product_price_from_payload(target)
pchome_id = str(target.get("product_id") or target.get("id") or target.get("sku") or "").strip()
terms = build_targeted_momo_search_terms(pchome_name, max_terms=max_terms_per_product)
for term in terms:
searched_terms.append(term)
success, _, products = crawler.search_products(term, limit=limit_per_product)
if not success or not products:
continue
for product in products:
# Accept both objects exposing to_dict() and plain mappings.
row = product.to_dict() if hasattr(product, "to_dict") else dict(product)
momo_name = _product_name_from_payload(row)
if not momo_name:
continue
momo_price = _to_float(row.get("price"))
diagnostics = score_marketplace_match(
momo_name,
pchome_name,
momo_price=momo_price,
competitor_price=pchome_price,
)
# Diagnostics attributes are read defensively with getattr defaults.
score = float(getattr(diagnostics, "score", 0.0) or 0.0)
if score < min_score:
continue
hard_veto = bool(getattr(diagnostics, "hard_veto", False))
comparison_mode = getattr(diagnostics, "comparison_mode", "exact_identity")
diagnostic_price_basis = str(getattr(diagnostics, "price_basis", "") or "")
diagnostic_alert_tier = str(getattr(diagnostics, "alert_tier", "") or "")
diagnostic_match_type = str(getattr(diagnostics, "match_type", "") or "")
# Defaults describe a candidate that needs manual review.
unit_price_comparison = {}
auto_compare_type = "manual_review"
price_basis = "none"
review_status = "需人工確認"
# Case 1: exact identity on total price, not vetoed -> direct price comparison.
if (
not hard_veto
and comparison_mode == "exact_identity"
and diagnostic_price_basis == "total_price"
and diagnostic_alert_tier == "price_alert_exact"
):
can_auto_compare = True
auto_compare_type = "total_price"
price_basis = "total_price"
review_status = "可直接比價"
# Case 2: unit-comparable -> automatic only when the unit comparison says "comparable".
elif comparison_mode == "unit_comparable" or diagnostic_price_basis == "unit_price":
unit_price_comparison = build_unit_price_comparison(
momo_name,
pchome_name,
momo_price=momo_price,
competitor_price=pchome_price,
)
can_auto_compare = bool(unit_price_comparison.get("comparable"))
if can_auto_compare:
auto_compare_type = "unit_price"
price_basis = "unit_price"
review_status = "自動單位價比較"
else:
price_basis = "unit_price_review"
# Case 3: everything else stays on the manual-review defaults.
else:
can_auto_compare = False
# NOTE(review): with indentation lost it is unclear whether this check sits
# inside the else branch above (where it has no effect) or after the whole
# if/elif/else chain (where it clears the unit comparison whenever the mode
# is not "unit_comparable"). Confirm against the original file.
if comparison_mode != "unit_comparable":
unit_price_comparison = {}
# Price gap in percent: taken from the unit comparison when there is one,
# otherwise computed from total prices relative to the PChome price.
gap_pct = None
if unit_price_comparison:
gap_pct = unit_price_comparison.get("unit_gap_pct")
elif pchome_price:
try:
gap_pct = (float(momo_price or 0) - float(pchome_price)) / float(pchome_price) * 100
except (TypeError, ValueError, ZeroDivisionError):
gap_pct = None
product_id = str(row.get("product_id") or row.get("goodsCode") or row.get("id") or "").strip()
if not product_id:
# Synthesize an id from the current number of stored candidates.
product_id = f"momo_candidate_{len(candidates_by_id)}"
# Keep the existing row unless this one scores strictly higher.
existing = candidates_by_id.get(product_id)
if existing and float(existing.get("target_match_score") or 0.0) >= score:
continue
row.update({
"product_id": product_id,
"target_pchome_product_id": pchome_id,
"target_pchome_name": pchome_name,
"target_pchome_price": pchome_price,
"target_match_score": round(score, 3),
"target_search_term": term,
"target_match_reasons": list(getattr(diagnostics, "reasons", ()) or ()),
"target_comparison_mode": comparison_mode,
"target_match_type": diagnostic_match_type,
"target_alert_tier": diagnostic_alert_tier,
"target_hard_veto": hard_veto,
"can_auto_compare": can_auto_compare,
"auto_compare_type": auto_compare_type,
"target_price_basis": price_basis,
"target_gap_pct": round(float(gap_pct), 2) if gap_pct is not None else None,
"target_unit_price_comparison": unit_price_comparison,
"target_review_status": review_status,
"source_strategy": "pchome_targeted_momo_search",
})
candidates_by_id[product_id] = row
# Highest match score first.
candidates = sorted(
candidates_by_id.values(),
key=lambda item: float(item.get("target_match_score") or 0.0),
reverse=True,
)
if not candidates:
return False, f"已用 {searched_products} 筆 PChome 商品搜尋 MOMO但沒有找到可用候選", []
# Summary counts for the message; anything not auto-comparable counts as review.
exact_count = sum(1 for item in candidates if item.get("auto_compare_type") == "total_price")
unit_count = sum(1 for item in candidates if item.get("auto_compare_type") == "unit_price")
review_count = len(candidates) - exact_count - unit_count
return (
True,
(
f"已用 {searched_products} 筆 PChome 商品搜尋 MOMO找到 {len(candidates)} 筆候選"
f"(可直接比價 {exact_count} 筆、自動單位價比較 {unit_count} 筆、需人工確認 {review_count} 筆)"
),
candidates,
)
def get_momo_bestsellers(category: str, limit: int = 5) -> Tuple[bool, str, List[dict]]:
    """Fetch best-selling MOMO products for a category keyword.

    Args:
        category: Category keyword (for example '面膜', '乳液', '精華液').
        limit: Maximum number of products to return.

    Returns:
        ``(success, message, products)`` where each product is a compact
        dict; when the search fails the search's own status and message are
        passed through with an empty list.
    """
    # Search through the shared crawler, sorted by sales volume.
    ok, note, found = get_crawler().search_products(category, limit=limit, sort_by='sSaleQty/dc')
    if not (ok and found):
        return ok, note, []
    # Compact form; id/url/image are each exposed under two key names.
    summaries = [
        {
            'id': item.product_id,
            'product_id': item.product_id,
            'platform': 'momo',
            'name': item.name,
            'price': item.price,
            'original_price': item.original_price,
            'discount': item.discount,
            'url': item.product_url,
            'product_url': item.product_url,
            'image_url': item.image_url,
            'image': item.image_url
        }
        for item in found[:limit]
    ]
    return True, f"成功取得 {len(summaries)} 個熱銷商品", summaries
if __name__ == '__main__':
    # Manual smoke test; performs live searches.
    logging.basicConfig(level=logging.INFO)
    print("=== MOMO 爬蟲測試 ===\n")

    # Keyword search
    print("[1] 測試搜尋 (關鍵字: 面膜)")
    success, msg, products = search_momo_products('面膜', limit=5)
    print(f"結果: {msg}")
    if products:
        print("搜尋結果:")
        for p in products[:3]:
            short_name = p['name'][:30]
            print(f" - {short_name}... ${p['price']}")

    # Best sellers
    print("\n[2] 測試熱銷商品 (分類: 精華液)")
    success, msg, products = get_momo_bestsellers('精華液', limit=5)
    print(f"結果: {msg}")
    if products:
        print("熱銷商品:")
        for i, p in enumerate(products, 1):
            short_name = p['name'][:30]
            print(f" {i}. {short_name}... ${p['price']}")