809 lines
32 KiB
Python
809 lines
32 KiB
Python
"""
MOMO shopping-site crawler service.

Scrapes product data from MOMO (momoshop.com.tw). Supports:
- keyword search
- best-seller rankings

Endpoints used:
- Search:       https://m.momoshop.com.tw/search.momo
- Product page: https://www.momoshop.com.tw/goods/GoodsDetail.jsp?i_code=XXX
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
import json
|
||
import time
|
||
import logging
|
||
import os
|
||
from typing import List, Dict, Optional, Tuple
|
||
from dataclasses import dataclass, asdict
|
||
from datetime import datetime
|
||
|
||
import requests
|
||
from bs4 import BeautifulSoup
|
||
|
||
# Module-wide logger shared by the crawler and the helper functions below.
logger = logging.getLogger(__name__)

# Defaults for the PChome -> MOMO targeted search. Each value can be overridden
# through an environment variable of the same name, read once at import time.
# Minimum match score a MOMO candidate needs to be kept.
MOMO_TARGETED_SEARCH_MIN_SCORE = float(os.environ.get("MOMO_TARGETED_SEARCH_MIN_SCORE", "0.45"))
# Maximum number of PChome products used as search targets.
MOMO_TARGETED_SEARCH_MAX_PRODUCTS = int(os.environ.get("MOMO_TARGETED_SEARCH_MAX_PRODUCTS", "30"))
# Maximum number of search terms generated per PChome product.
MOMO_TARGETED_SEARCH_MAX_TERMS = int(os.environ.get("MOMO_TARGETED_SEARCH_MAX_TERMS", "4"))
# Maximum number of MOMO results requested per search term.
MOMO_TARGETED_SEARCH_LIMIT_PER_TERM = int(os.environ.get("MOMO_TARGETED_SEARCH_LIMIT_PER_TERM", "8"))
|
||
|
||
|
||
@dataclass
class MomoProduct:
    """One product record scraped from MOMO."""

    product_id: str          # MOMO product id (the ``i_code`` value)
    name: str                # product name
    price: int               # selling price
    original_price: int      # list price before discount
    discount: Optional[int]  # discount in percent, or None when not computed
    image_url: str           # product image URL
    product_url: str         # product detail page URL
    brand: str               # brand name
    crawled_at: datetime     # time the record was scraped

    def to_dict(self) -> dict:
        """Return the record as a dict with ``crawled_at`` as an ISO-8601 string."""
        return {**asdict(self), 'crawled_at': self.crawled_at.isoformat()}
|
||
|
||
|
||
class MomoCrawler:
    """Scrape product listings from MOMO (momoshop.com.tw) search pages.

    A search queries the mobile site first and falls back to the desktop
    site when the mobile response yields no products. Each response is
    parsed with several strategies in turn (anchor tags, embedded Next.js
    payloads, raw JSON fragments) and the first strategy that produces
    products wins. All requests share one ``requests.Session`` and are
    spaced at least ``delay`` seconds apart.
    """

    # Base URLs
    BASE_URL = 'https://www.momoshop.com.tw'
    MOBILE_URL = 'https://m.momoshop.com.tw'
    SEARCH_API = 'https://www.momoshop.com.tw/search/searchShop.jsp'
    # Mobile search endpoint, tried first (noted by the original author as easier to scrape)
    MOBILE_SEARCH_API = 'https://m.momoshop.com.tw/search.momo'

    # Default headers (imitate a mobile browser)
    DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://m.momoshop.com.tw/',
    }

    # User-Agent sent with the desktop fallback request
    DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

    # --- pre-compiled patterns (hoisted out of the parsing loops) ---
    _I_CODE_HREF_RE = re.compile(r'i_code=\d+')
    _I_CODE_RE = re.compile(r'i_code=(\d+)')
    _NAME_CLASS_RE = re.compile(r'name|title|goods', re.I)
    _PRICE_CLASS_RE = re.compile(r'price', re.I)
    # A price must start with a digit; a bare "," must never be treated as a number.
    _PRICE_RE = re.compile(r'\d[\d,]*')
    _MOBILE_JSON_RE = re.compile(
        r'"goodsCode"\s*:\s*"?(\d+)"?.*?"goodsName"\s*:\s*"([^"]+)".*?"price"\s*:\s*(\d+)',
        re.DOTALL,
    )
    _DESKTOP_JSON_RE = re.compile(
        r'"goodsCode"\s*:\s*"(\d+)".*?"goodsName"\s*:\s*"([^"]+)".*?"price"\s*:\s*(\d+)',
        re.DOTALL,
    )
    # goodsInfoList entries inside an escaped (\"key\":\"value\") script payload
    _NEXT_PAYLOAD_RE = re.compile(
        r'\\"goodsCode\\"\s*:\s*\\"(?P<code>\d+)\\"'
        r'.{0,800}?'
        r'\\"goodsName\\"\s*:\s*\\"(?P<name>.*?)\\"'
        r'.{0,1600}?'
        r'\\"goodsPrice\\"\s*:\s*\\"(?P<price>[^\\"]+)\\"'
        r'.{0,2400}?'
        r'\\"imgUrl\\"\s*:\s*\\"(?P<img>[^\\"]*)\\"',
        re.DOTALL,
    )
    _ORIGINAL_PRICE_RE = re.compile(r'\\"goodsPriceOri\\"\s*:\s*\\"(?P<price>[^\\"]+)\\"')
    _GOODS_ITEM_RE = re.compile(
        r'<li[^>]*class=["\'][^"\']*goodsItemLi[^"\']*["\'][^>]*>(.*?)</li>',
        re.DOTALL | re.IGNORECASE,
    )
    _PRD_NAME_RE = re.compile(r'<p[^>]*class=["\'][^"\']*prdName[^"\']*["\'][^>]*>([^<]+)</p>')
    _TITLE_ATTR_RE = re.compile(r'title=["\']([^"\']+)["\']')
    _BOLD_PRICE_RE = re.compile(r'<b[^>]*>\$?(\d[\d,]*)</b>')
    _ANY_NUMBER_RE = re.compile(r'\$?(\d[\d,]*)')
    _IMAGE_RE = re.compile(
        r'(?:src|data-original)=["\']([^"\']+\.(?:jpg|png|webp)[^"\']*)["\']',
        re.IGNORECASE,
    )
    _DEL_PRICE_RE = re.compile(r'<del[^>]*>\$?(\d[\d,]*)</del>')

    def __init__(self, timeout: int = 30, delay: float = 1.0):
        """
        Create a crawler.

        Args:
            timeout: Request timeout in seconds.
            delay: Minimum interval between two requests, in seconds.
        """
        self.timeout = timeout
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update(self.DEFAULT_HEADERS)
        self._last_request_time = 0

    def _rate_limit(self):
        """Sleep as needed so requests are at least ``self.delay`` seconds apart."""
        elapsed = time.time() - self._last_request_time
        if elapsed < self.delay:
            time.sleep(self.delay - elapsed)
        self._last_request_time = time.time()

    def search_products(self, keyword: str, limit: int = 10, sort_by: str = 'sSaleQty/dc') -> Tuple[bool, str, List[MomoProduct]]:
        """
        Search MOMO for products.

        Args:
            keyword: Search keyword.
            limit: Maximum number of products to return.
            sort_by: Sort order, passed as-is to the desktop endpoint:
                - 'sSaleQty/dc': best selling first
                - 'sPrice/ac': price ascending
                - 'sPrice/dc': price descending
                - 'sSaleDate/dc': newest first
                The mobile endpoint only distinguishes best-selling from
                everything else.

        Returns:
            ``(success, message, products)``; ``products`` is empty on failure.
        """
        try:
            # Mobile first, desktop as fallback. The rate limiter runs before
            # *each* request so the fallback does not fire back-to-back.
            for search in (self._search_mobile, self._search_desktop):
                self._rate_limit()
                products = search(keyword, limit, sort_by)
                if products:
                    return True, f"成功取得 {len(products)} 個商品", products

            return False, "無法解析商品資料", []

        except requests.Timeout:
            logger.error("[MOMO] 請求超時")
            return False, "請求超時", []
        except Exception as e:
            logger.error("[MOMO] 搜尋失敗: %s", e)
            return False, str(e), []

    def _search_mobile(self, keyword: str, limit: int, sort_by: str) -> List[MomoProduct]:
        """Query the mobile search endpoint; return ``[]`` on any failure."""
        try:
            params = {
                'searchKeyword': keyword,
                'sortType': '4' if 'sSaleQty' in sort_by else '1',  # 4 = sales, 1 = relevance
                'maxPage': '1',
                'curPage': '1',
            }

            logger.info("[MOMO] 行動版搜尋: %s", keyword)

            response = self.session.get(
                self.MOBILE_SEARCH_API,
                params=params,
                timeout=self.timeout,
            )

            if response.status_code != 200:
                logger.warning("[MOMO] 行動版搜尋失敗: HTTP %s", response.status_code)
                return []

            return self._parse_mobile_results(response.text, limit)

        except Exception as e:
            logger.warning("[MOMO] 行動版搜尋異常: %s", e)
            return []

    def _search_desktop(self, keyword: str, limit: int, sort_by: str) -> List[MomoProduct]:
        """Query the desktop search endpoint; return ``[]`` on any failure."""
        try:
            params = {
                'keyword': keyword,
                'searchType': '1',
                'cateLevel': '-1',
                'curPage': '1',
                'maxPage': '1',
                'minPage': '1',
                'areaCode': 'all',
                'isFuzzy': '0',
                'sortType': sort_by,
            }

            logger.info("[MOMO] 桌面版搜尋: %s", keyword)

            # Same headers as the session default, but with a desktop User-Agent.
            headers = {**self.DEFAULT_HEADERS, 'User-Agent': self.DESKTOP_USER_AGENT}

            response = self.session.get(
                self.SEARCH_API,
                params=params,
                headers=headers,
                timeout=self.timeout,
            )

            if response.status_code != 200:
                logger.warning("[MOMO] 桌面版搜尋失敗: HTTP %s", response.status_code)
                return []

            return self._parse_search_results(response.text, limit)

        except Exception as e:
            logger.warning("[MOMO] 桌面版搜尋異常: %s", e)
            return []

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    def _build_product(self, product_id, name, price, original_price=None, discount=None, image_url='') -> MomoProduct:
        """Build a ``MomoProduct``; ``original_price`` defaults to ``price``."""
        return MomoProduct(
            product_id=product_id,
            name=name,
            price=price,
            original_price=price if original_price is None else original_price,
            discount=discount,
            image_url=image_url,
            product_url=f'{self.BASE_URL}/goods/GoodsDetail.jsp?i_code={product_id}',
            brand='',
            crawled_at=datetime.now(),
        )

    @staticmethod
    def _discount_pct(price: int, original_price: int) -> Optional[int]:
        """Return the discount in percent, or ``None`` when there is none."""
        if original_price > price:
            return round((1 - price / original_price) * 100)
        return None

    @staticmethod
    def _absolute_image_url(url: str) -> str:
        """Prefix protocol-relative (``//host/...``) URLs with ``https:``."""
        if url and url.startswith('//'):
            return 'https:' + url
        return url

    @staticmethod
    def _coerce_price(value) -> int:
        """Convert a JSON price (number, numeric string or ``None``) to ``int``."""
        if isinstance(value, (int, float)):
            return int(value)
        return MomoCrawler._parse_momo_price(str(value)) if value else 0

    def _parse_embedded_json(self, html: str, limit: int, pattern) -> List[MomoProduct]:
        """Last-resort parse of ``goodsCode``/``goodsName``/``price`` JSON fragments."""
        return [
            self._build_product(product_id, name, int(price))
            for product_id, name, price in pattern.findall(html)[:limit]
        ]

    # ------------------------------------------------------------------
    # Mobile parsing
    # ------------------------------------------------------------------

    def _parse_mobile_results(self, html: str, limit: int) -> List[MomoProduct]:
        """Parse a mobile search response, trying four strategies in order."""
        try:
            soup = BeautifulSoup(html, 'html.parser')

            # Strategy 1: anchors whose href carries an i_code
            products = self._parse_mobile_links(soup, limit)

            # Strategy 2: goodsInfoList embedded in a Next.js script payload string
            if not products:
                products = self._parse_next_search_payload_results(html, limit)

            # Strategy 3: the __NEXT_DATA__ JSON script
            if not products:
                products = self._parse_next_data_script(soup, limit)

            # Strategy 4: raw JSON fragments anywhere in the HTML
            if not products:
                products = self._parse_embedded_json(html, limit, self._MOBILE_JSON_RE)

            logger.info("[MOMO] 行動版解析到 %s 個商品", len(products))
            return products

        except Exception as e:
            logger.error("[MOMO] 解析行動版結果失敗: %s", e)
            return []

    def _parse_mobile_links(self, soup, limit: int) -> List[MomoProduct]:
        """Build products from ``<a href="...i_code=N">`` anchors."""
        products: List[MomoProduct] = []
        seen_ids = set()

        for link in soup.find_all('a', href=self._I_CODE_HREF_RE):
            if len(products) >= limit:
                break

            try:
                match = self._I_CODE_RE.search(link.get('href', ''))
                if not match:
                    continue

                product_id = match.group(1)
                if product_id in seen_ids:
                    continue

                name = self._extract_link_name(link)
                if not name or len(name) < 3:
                    # Do NOT mark the id as seen: a later anchor for the same
                    # product (e.g. the title link) may carry a usable name.
                    continue

                image_url = ''
                img = link.find('img')
                if img:
                    image_url = img.get('src') or img.get('data-src') or img.get('data-original', '')

                product = self._build_product(
                    product_id,
                    name.strip()[:100],  # cap the name length
                    self._extract_link_price(link),
                    image_url=self._absolute_image_url(image_url),
                )
                seen_ids.add(product_id)
                products.append(product)

            except Exception as e:
                logger.debug("[MOMO] 解析商品連結失敗: %s", e)
                continue

        return products

    def _extract_link_name(self, link) -> str:
        """Find a product name: title attr, img alt, a name-like child, then link text."""
        title = link.get('title')
        if title:
            return title

        img = link.find('img')
        if img and img.get('alt'):
            return img.get('alt')

        name_elem = link.find(class_=self._NAME_CLASS_RE)
        if name_elem:
            text = name_elem.get_text(strip=True)
            if text:
                return text

        return link.get_text(strip=True)

    def _extract_link_price(self, link) -> int:
        """Read the price from a ``*price*``-classed element in the enclosing card, else 0."""
        parent = link.find_parent(['li', 'div', 'article'])
        if not parent:
            return 0
        price_elem = parent.find(class_=self._PRICE_CLASS_RE)
        if not price_elem:
            return 0
        return self._parse_momo_price(price_elem.get_text(strip=True))

    def _parse_next_data_script(self, soup, limit: int) -> List[MomoProduct]:
        """Build products from the ``__NEXT_DATA__`` script's ``pageProps``."""
        script = soup.find('script', {'id': '__NEXT_DATA__'})
        if not (script and script.string):
            return []

        try:
            next_data = json.loads(script.string)
        except json.JSONDecodeError:
            return []

        props = next_data.get('props', {}).get('pageProps', {})
        items = props.get('products', []) or props.get('items', [])
        if not isinstance(items, list):
            return []

        products: List[MomoProduct] = []
        for item in items:
            if len(products) >= limit:
                break
            try:
                product_id = str(item.get('goodsCode', ''))
                name = item.get('goodsName', '')
                if not product_id or not name:
                    continue
                products.append(self._build_product(
                    product_id,
                    name,
                    self._coerce_price(item.get('price', 0)),
                    original_price=self._coerce_price(item.get('suggestPrice', item.get('price', 0))),
                    image_url=item.get('imgUrl', ''),
                ))
            except (AttributeError, TypeError, ValueError) as e:
                # One malformed entry must not discard the rest of the list.
                logger.debug("[MOMO] 解析商品項目失敗: %s", e)
        return products

    def _parse_next_search_payload_results(self, html: str, limit: int) -> List[MomoProduct]:
        """Parse the goodsInfoList payload embedded in MOMO's newer Next.js search page."""
        products: List[MomoProduct] = []
        seen_ids: set[str] = set()

        for match in self._NEXT_PAYLOAD_RE.finditer(html):
            if len(products) >= limit:
                break
            product_id = match.group("code")
            if product_id in seen_ids:
                continue

            name = self._decode_payload_text(match.group("name"))
            price = self._parse_momo_price(match.group("price"))
            if not name or price <= 0:
                continue
            seen_ids.add(product_id)

            original_price = self._parse_original_price_nearby(html, match.start(), match.end()) or price

            products.append(self._build_product(
                product_id,
                name.strip()[:160],
                price,
                original_price=original_price,
                discount=self._discount_pct(price, original_price),
                image_url=self._decode_payload_text(match.group("img")),
            ))
        return products

    @staticmethod
    def _decode_payload_text(value: str) -> str:
        """Decode JSON string escapes in ``value``; empty or ``None`` yields ``""``."""
        if not value:
            return ""
        try:
            return json.loads(f'"{value}"')
        except ValueError:
            # Not a valid JSON string body: undo only the two common escapes.
            return value.replace("\\u0026", "&").replace("\\/", "/")

    @staticmethod
    def _parse_momo_price(value: str) -> int:
        """Return the first number in ``value`` (thousands separators removed), or 0."""
        match = MomoCrawler._PRICE_RE.search(str(value or ""))
        return int(match.group(0).replace(",", "")) if match else 0

    def _parse_original_price_nearby(self, html: str, start: int, end: int) -> int:
        """Look for a ``goodsPriceOri`` value from ``start`` up to 1800 chars past ``end``."""
        snippet = html[start:min(len(html), end + 1800)]
        match = self._ORIGINAL_PRICE_RE.search(snippet)
        return self._parse_momo_price(match.group("price")) if match else 0

    # ------------------------------------------------------------------
    # Desktop parsing
    # ------------------------------------------------------------------

    def _parse_search_results(self, html: str, limit: int) -> List[MomoProduct]:
        """
        Parse a desktop search response.

        Args:
            html: Response HTML.
            limit: Maximum number of products to return.

        Returns:
            Parsed products; ``[]`` when nothing could be parsed.
        """
        try:
            products: List[MomoProduct] = []

            # Primary: one product per <li class="...goodsItemLi...">. Keep
            # scanning until `limit` valid products are collected, instead
            # of truncating the candidate list before validation.
            for item_html in self._GOODS_ITEM_RE.findall(html):
                if len(products) >= limit:
                    break
                try:
                    product = self._parse_desktop_item(item_html)
                except Exception as e:
                    logger.debug("[MOMO] 解析商品項目失敗: %s", e)
                    continue
                if product is not None:
                    products.append(product)

            # Fallback: MOMO sometimes embeds the data as JSON in the HTML
            if not products:
                products = self._parse_embedded_json(html, limit, self._DESKTOP_JSON_RE)

            logger.info("[MOMO] 解析到 %s 個商品", len(products))
            return products

        except Exception as e:
            logger.error("[MOMO] 解析搜尋結果失敗: %s", e)
            return []

    def _parse_desktop_item(self, item_html: str) -> Optional[MomoProduct]:
        """Build a product from one ``goodsItemLi`` fragment, or ``None`` if incomplete."""
        code_match = self._I_CODE_RE.search(item_html)
        name_match = self._PRD_NAME_RE.search(item_html) or self._TITLE_ATTR_RE.search(item_html)
        # NOTE(review): the second pattern takes the first number anywhere in
        # the fragment, which is not necessarily a price - confirm against
        # real markup.
        price_match = self._BOLD_PRICE_RE.search(item_html) or self._ANY_NUMBER_RE.search(item_html)

        if not (code_match and name_match and price_match):
            return None

        price = int(price_match.group(1).replace(',', ''))
        original_price_match = self._DEL_PRICE_RE.search(item_html)
        original_price = (
            int(original_price_match.group(1).replace(',', ''))
            if original_price_match else price
        )
        img_match = self._IMAGE_RE.search(item_html)

        return self._build_product(
            code_match.group(1),
            name_match.group(1).strip(),
            price,
            original_price=original_price,
            discount=self._discount_pct(price, original_price),
            image_url=self._absolute_image_url(img_match.group(1) if img_match else ''),
        )
|
||
|
||
|
||
# Module-wide crawler instance, created on first use by get_crawler().
_crawler_instance: Optional[MomoCrawler] = None


def get_crawler() -> MomoCrawler:
    """Return the shared ``MomoCrawler``, creating it on the first call."""
    global _crawler_instance
    if _crawler_instance is not None:
        return _crawler_instance
    _crawler_instance = MomoCrawler()
    return _crawler_instance
|
||
|
||
|
||
def search_momo_products(keyword: str, limit: int = 10) -> Tuple[bool, str, List[dict]]:
    """
    Search MOMO with the shared crawler (convenience wrapper).

    Args:
        keyword: Search keyword.
        limit: Maximum number of products to return.

    Returns:
        ``(success, message, products)`` with each product converted to a
        dict via ``to_dict()``.
    """
    ok, note, found = get_crawler().search_products(keyword, limit)
    as_dicts = [item.to_dict() for item in found]
    return ok, note, as_dicts
|
||
|
||
|
||
def _to_float(value, default: float = 0.0) -> float:
|
||
try:
|
||
if value is None:
|
||
return default
|
||
return float(str(value).replace(",", "").replace("$", "").strip())
|
||
except (TypeError, ValueError):
|
||
return default
|
||
|
||
|
||
def _product_name_from_payload(payload: dict) -> str:
|
||
return str(
|
||
payload.get("name")
|
||
or payload.get("product_name")
|
||
or payload.get("title")
|
||
or payload.get("商品名稱")
|
||
or ""
|
||
).strip()
|
||
|
||
|
||
def _product_price_from_payload(payload: dict) -> float:
    """Return the price of ``payload`` as a float via ``_to_float``.

    The first truthy value among ``price``, ``pchome_price``, ``sale_price``
    and ``售價`` is used; when none is truthy the ``售價`` value (possibly
    ``None``) is passed on.
    """
    raw_price = None
    for field in ("price", "pchome_price", "sale_price", "售價"):
        raw_price = payload.get(field)
        if raw_price:
            break
    return _to_float(raw_price)
|
||
|
||
|
||
def _dedupe_terms(terms: list[str], max_terms: int) -> list[str]:
|
||
result: list[str] = []
|
||
seen: set[str] = set()
|
||
for term in terms:
|
||
normalized = re.sub(r"\s+", " ", str(term or "").strip())
|
||
if len(normalized) < 2:
|
||
continue
|
||
key = normalized.lower()
|
||
if key in seen:
|
||
continue
|
||
seen.add(key)
|
||
result.append(normalized)
|
||
if len(result) >= max_terms:
|
||
break
|
||
return result
|
||
|
||
|
||
def build_targeted_momo_search_terms(pchome_name: str, max_terms: int = MOMO_TARGETED_SEARCH_MAX_TERMS) -> list[str]:
    """
    Build MOMO search terms from a PChome product name.

    Terms come from ``services.marketplace_product_matcher.build_search_terms``;
    the original product name is always appended as a final candidate. If the
    matcher cannot be imported or fails, only the original name is used.

    Args:
        pchome_name: PChome product name; an empty value yields ``[]``.
        max_terms: Maximum number of terms to return.

    Returns:
        De-duplicated search terms, capped by ``_dedupe_terms``.
    """
    if not pchome_name:
        return []
    try:
        from services.marketplace_product_matcher import build_search_terms

        # Copy into a fresh list: appending below must not mutate the
        # matcher's own return value, and a tuple/None result must not crash.
        terms = list(build_search_terms(pchome_name, max_terms=max_terms) or [])
    except Exception:
        logger.warning("[MOMO] 產生精準搜尋詞失敗,改用原商品名", exc_info=True)
        terms = []
    terms.append(pchome_name)
    return _dedupe_terms(terms, max_terms=max_terms)
|
||
|
||
|
||
def search_momo_products_for_pchome_products(
    pchome_products: list[dict],
    *,
    limit_per_product: int = MOMO_TARGETED_SEARCH_LIMIT_PER_TERM,
    max_products: int = MOMO_TARGETED_SEARCH_MAX_PRODUCTS,
    max_terms_per_product: int = MOMO_TARGETED_SEARCH_MAX_TERMS,
    min_score: float = MOMO_TARGETED_SEARCH_MIN_SCORE,
    crawler: MomoCrawler | None = None,
) -> Tuple[bool, str, List[dict]]:
    """
    Look up MOMO candidates for each PChome product and score the matches.

    For every PChome product (up to ``max_products``) search terms are built,
    each term is searched on MOMO, and every hit is scored with
    ``score_marketplace_match``. Hits scoring below ``min_score`` are dropped.
    Each MOMO product id is kept once, with the best-scoring target.

    Args:
        pchome_products: PChome product dicts (name/price/id read via the
            payload helpers).
        limit_per_product: Maximum MOMO results requested per search term.
        max_products: Maximum number of PChome products to process.
        max_terms_per_product: Maximum search terms per PChome product.
        min_score: Minimum match score for a candidate to be kept.
        crawler: Crawler to use; defaults to the shared instance.

    Returns:
        ``(success, message, candidates)``. Candidates are MOMO product dicts
        extended with ``target_*`` match fields, ``can_auto_compare``,
        ``auto_compare_type`` and ``source_strategy``, sorted by match score
        in descending order.
    """
    if not pchome_products:
        return False, "沒有 PChome 商品可用來搜尋 MOMO", []

    try:
        from services.marketplace_product_matcher import (
            build_unit_price_comparison,
            score_marketplace_match,
        )
    except Exception as exc:
        logger.error("[MOMO] 無法載入商品比對工具: %s", exc, exc_info=True)
        return False, "商品比對工具暫時不可用", []

    crawler = crawler or get_crawler()
    candidates_by_id: dict[str, dict] = {}
    # Successful results per term: different PChome products often share
    # a term, and each repeated search costs a request plus the rate-limit delay.
    results_by_term: dict[str, list] = {}
    searched_products = 0

    for target in pchome_products[:max_products]:
        pchome_name = _product_name_from_payload(target)
        if not pchome_name:
            continue
        searched_products += 1
        pchome_price = _product_price_from_payload(target)
        pchome_id = str(target.get("product_id") or target.get("id") or target.get("sku") or "").strip()
        terms = build_targeted_momo_search_terms(pchome_name, max_terms=max_terms_per_product)

        for term in terms:
            products = results_by_term.get(term)
            if products is None:
                success, _, products = crawler.search_products(term, limit=limit_per_product)
                if not success or not products:
                    # Failures are not cached, so a later product may retry the term.
                    continue
                results_by_term[term] = products

            for product in products:
                # A fresh dict per iteration: cached products are never mutated.
                row = product.to_dict() if hasattr(product, "to_dict") else dict(product)
                momo_name = _product_name_from_payload(row)
                if not momo_name:
                    continue
                momo_price = _to_float(row.get("price"))
                diagnostics = score_marketplace_match(
                    momo_name,
                    pchome_name,
                    momo_price=momo_price,
                    competitor_price=pchome_price,
                )
                score = float(getattr(diagnostics, "score", 0.0) or 0.0)
                if score < min_score:
                    continue

                hard_veto = bool(getattr(diagnostics, "hard_veto", False))
                comparison_mode = getattr(diagnostics, "comparison_mode", "exact_identity")
                diagnostic_price_basis = str(getattr(diagnostics, "price_basis", "") or "")
                diagnostic_alert_tier = str(getattr(diagnostics, "alert_tier", "") or "")
                diagnostic_match_type = str(getattr(diagnostics, "match_type", "") or "")

                # Default outcome: needs manual review.
                unit_price_comparison = {}
                can_auto_compare = False
                auto_compare_type = "manual_review"
                price_basis = "none"
                review_status = "需人工確認"

                if (
                    not hard_veto
                    and comparison_mode == "exact_identity"
                    and diagnostic_price_basis == "total_price"
                    and diagnostic_alert_tier == "price_alert_exact"
                ):
                    # Same item, total prices directly comparable.
                    can_auto_compare = True
                    auto_compare_type = "total_price"
                    price_basis = "total_price"
                    review_status = "可直接比價"
                elif comparison_mode == "unit_comparable" or diagnostic_price_basis == "unit_price":
                    # `or {}` guards against a None result from the matcher.
                    unit_price_comparison = build_unit_price_comparison(
                        momo_name,
                        pchome_name,
                        momo_price=momo_price,
                        competitor_price=pchome_price,
                    ) or {}
                    can_auto_compare = bool(unit_price_comparison.get("comparable"))
                    if can_auto_compare:
                        auto_compare_type = "unit_price"
                        price_basis = "unit_price"
                        review_status = "自動單位價比較"
                    else:
                        price_basis = "unit_price_review"

                # Unit-price details are only reported in unit_comparable mode.
                if comparison_mode != "unit_comparable":
                    unit_price_comparison = {}

                gap_pct = None
                if unit_price_comparison:
                    gap_pct = unit_price_comparison.get("unit_gap_pct")
                elif pchome_price:
                    try:
                        gap_pct = (float(momo_price or 0) - float(pchome_price)) / float(pchome_price) * 100
                    except (TypeError, ValueError, ZeroDivisionError):
                        gap_pct = None

                product_id = str(row.get("product_id") or row.get("goodsCode") or row.get("id") or "").strip()
                if not product_id:
                    # Synthetic key so id-less rows can still be stored.
                    product_id = f"momo_candidate_{len(candidates_by_id)}"
                existing = candidates_by_id.get(product_id)
                if existing and float(existing.get("target_match_score") or 0.0) >= score:
                    # An equal or better match for this MOMO product is already stored.
                    continue

                row.update({
                    "product_id": product_id,
                    "target_pchome_product_id": pchome_id,
                    "target_pchome_name": pchome_name,
                    "target_pchome_price": pchome_price,
                    "target_match_score": round(score, 3),
                    "target_search_term": term,
                    "target_match_reasons": list(getattr(diagnostics, "reasons", ()) or ()),
                    "target_comparison_mode": comparison_mode,
                    "target_match_type": diagnostic_match_type,
                    "target_alert_tier": diagnostic_alert_tier,
                    "target_hard_veto": hard_veto,
                    "can_auto_compare": can_auto_compare,
                    "auto_compare_type": auto_compare_type,
                    "target_price_basis": price_basis,
                    "target_gap_pct": round(float(gap_pct), 2) if gap_pct is not None else None,
                    "target_unit_price_comparison": unit_price_comparison,
                    "target_review_status": review_status,
                    "source_strategy": "pchome_targeted_momo_search",
                })
                candidates_by_id[product_id] = row

    candidates = sorted(
        candidates_by_id.values(),
        key=lambda item: float(item.get("target_match_score") or 0.0),
        reverse=True,
    )
    if not candidates:
        return False, f"已用 {searched_products} 筆 PChome 商品搜尋 MOMO,但沒有找到可用候選", []
    exact_count = sum(1 for item in candidates if item.get("auto_compare_type") == "total_price")
    unit_count = sum(1 for item in candidates if item.get("auto_compare_type") == "unit_price")
    review_count = len(candidates) - exact_count - unit_count
    return (
        True,
        (
            f"已用 {searched_products} 筆 PChome 商品搜尋 MOMO,找到 {len(candidates)} 筆候選"
            f"(可直接比價 {exact_count} 筆、自動單位價比較 {unit_count} 筆、需人工確認 {review_count} 筆)"
        ),
        candidates,
    )
|
||
|
||
|
||
def get_momo_bestsellers(category: str, limit: int = 5) -> Tuple[bool, str, List[dict]]:
    """
    Fetch best-selling MOMO products for a category keyword.

    Args:
        category: Category keyword (e.g. '面膜', '乳液', '精華液').
        limit: Maximum number of products to return.

    Returns:
        ``(success, message, products)`` where each product is a compact dict
        with ``name``, ``price``, ``original_price``, ``discount``, ``url``
        and ``image``.
    """
    # Reuse the search endpoint, sorted by sales volume.
    success, message, products = get_crawler().search_products(
        category, limit=limit, sort_by='sSaleQty/dc'
    )
    if not (success and products):
        return success, message, []

    # Reduce each product to the compact shape.
    result = [
        {
            'name': item.name,
            'price': item.price,
            'original_price': item.original_price,
            'discount': item.discount,
            'url': item.product_url,
            'image': item.image_url,
        }
        for item in products[:limit]
    ]
    return True, f"成功取得 {len(result)} 個熱銷商品", result
|
||
|
||
|
||
if __name__ == '__main__':
    # Manual smoke test: runs a keyword search and a best-seller lookup
    # through the shared crawler and prints the results.
    logging.basicConfig(level=logging.INFO)

    print("=== MOMO 爬蟲測試 ===\n")

    # Keyword search: show the first three hits.
    print("[1] 測試搜尋 (關鍵字: 面膜)")
    _, search_msg, search_hits = search_momo_products('面膜', limit=5)
    print(f"結果: {search_msg}")
    if search_hits:
        print("搜尋結果:")
        for hit in search_hits[:3]:
            print(f" - {hit['name'][:30]}... ${hit['price']}")

    # Best sellers: show every hit, numbered from 1.
    print("\n[2] 測試熱銷商品 (分類: 精華液)")
    _, best_msg, best_hits = get_momo_bestsellers('精華液', limit=5)
    print(f"結果: {best_msg}")
    if best_hits:
        print("熱銷商品:")
        for rank, hit in enumerate(best_hits, 1):
            print(f" {rank}. {hit['name'][:30]}... ${hit['price']}")
|