Files
ewoooc/services/momo_crawler.py
ogt 1b4f3a7bbe
Some checks failed
CD Pipeline / deploy (push) Failing after 59s
feat: EwoooC 初始化 — 完整專案推版至 Gitea
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml)
- 部署模式: rsync Python 檔案至 188 → docker restart (volume mount)
- Dockerfile/requirements 變動時自動重建 Docker image
- 部署通知: Telegram (開始/成功/失敗)
- 健康檢查: https://mo.wooo.work/health (最多 5 次重試)
- 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-19 01:21:13 +08:00

522 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
MOMO 購物網爬蟲服務
爬取 MOMO 購物網商品資料,支援:
- 關鍵字搜尋
- 熱銷商品排行
API 參考:
- 搜尋 API: https://m.momoshop.com.tw/search.momo
- 商品 API: https://www.momoshop.com.tw/goods/GoodsDetail.jsp?i_code=XXX
"""
import re
import json
import time
import logging
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass, asdict
from datetime import datetime
import requests
from bs4 import BeautifulSoup
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
@dataclass
class MomoProduct:
    """Data record for a single product scraped from MOMO."""

    # Product ID
    product_id: str
    # Product name
    name: str
    # Selling price
    price: int
    # Original (list) price
    original_price: int
    # Discount (%)
    discount: Optional[int]
    # Image URL
    image_url: str
    # Product page URL
    product_url: str
    # Brand
    brand: str
    # Time the product was crawled
    crawled_at: datetime

    def to_dict(self) -> dict:
        """Return the fields as a dict, with ``crawled_at`` as an ISO-8601 string."""
        # Re-assigning an existing key keeps its position, so key order
        # still follows the field declaration order.
        return {**asdict(self), 'crawled_at': self.crawled_at.isoformat()}
class MomoCrawler:
    """Crawler for the MOMO shopping site.

    A search is attempted against the mobile site first and falls back to the
    desktop search page when the mobile attempt yields no products. All HTTP
    requests share one ``requests.Session`` and are spaced at least ``delay``
    seconds apart.
    """

    # Base URLs
    BASE_URL = 'https://www.momoshop.com.tw'
    MOBILE_URL = 'https://m.momoshop.com.tw'
    SEARCH_API = 'https://www.momoshop.com.tw/search/searchShop.jsp'
    # Mobile search endpoint; tried first because it is easier to scrape
    MOBILE_SEARCH_API = 'https://m.momoshop.com.tw/search.momo'
    # Default headers, imitating a mobile device.
    # NOTE(review): 'br' is advertised in Accept-Encoding, but requests/urllib3
    # only decode Brotli bodies when a brotli package is installed. Confirm the
    # deployment image ships one, otherwise response.text may be unreadable.
    DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://m.momoshop.com.tw/',
    }

    # Patterns compiled once at class level instead of on every parse.
    _I_CODE_RE = re.compile(r'i_code=(\d+)')
    _NAME_CLASS_RE = re.compile(r'name|title|goods', re.I)
    _PRICE_CLASS_RE = re.compile(r'price', re.I)
    # A number must start with a digit so a lone "," is never treated as one.
    _NUMBER_RE = re.compile(r'\d[\d,]*')
    _DESKTOP_ITEM_RE = re.compile(
        r'<li[^>]*class=["\'][^"\']*goodsItemLi[^"\']*["\'][^>]*>(.*?)</li>',
        re.DOTALL | re.IGNORECASE
    )
    _PRD_NAME_RE = re.compile(r'<p[^>]*class=["\'][^"\']*prdName[^"\']*["\'][^>]*>([^<]+)</p>')
    _TITLE_ATTR_RE = re.compile(r'title=["\']([^"\']+)["\']')
    _BOLD_PRICE_RE = re.compile(r'<b(?:\s[^>]*)?>\s*\$?\s*(\d[\d,]*)\s*</b>')
    _DOLLAR_PRICE_RE = re.compile(r'\$\s*(\d[\d,]*)')
    _DEL_PRICE_RE = re.compile(r'<del[^>]*>\s*\$?\s*(\d[\d,]*)\s*</del>')
    _IMAGE_RE = re.compile(
        r'(?:src|data-original)=["\']([^"\']+\.(?:jpg|png|webp)[^"\']*)["\']',
        re.IGNORECASE
    )
    _EMBEDDED_JSON_RE = re.compile(
        r'"goodsCode"\s*:\s*"?(\d+)"?.*?"goodsName"\s*:\s*"([^"]+)".*?"price"\s*:\s*(\d+)',
        re.DOTALL
    )

    def __init__(self, timeout: int = 30, delay: float = 1.0):
        """
        Initialise the crawler.

        Args:
            timeout: Request timeout (seconds)
            delay: Minimum interval between requests (seconds)
        """
        self.timeout = timeout
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update(self.DEFAULT_HEADERS)
        # -inf means "no request sent yet", so the first request never waits.
        self._last_request_time = float('-inf')

    def _rate_limit(self):
        """Sleep as needed so consecutive requests are at least ``delay`` seconds apart."""
        # The monotonic clock is immune to wall-clock adjustments, which could
        # otherwise produce a negative elapsed time and an over-long sleep.
        elapsed = time.monotonic() - self._last_request_time
        if elapsed < self.delay:
            time.sleep(self.delay - elapsed)
        self._last_request_time = time.monotonic()

    def search_products(self, keyword: str, limit: int = 10, sort_by: str = 'sSaleQty/dc') -> Tuple[bool, str, List[MomoProduct]]:
        """
        Search for products.

        Args:
            keyword: Search keyword
            limit: Maximum number of products to return
            sort_by: Sort order
                - 'sSaleQty/dc': sales volume, high to low (best sellers)
                - 'sPrice/ac': price, low to high
                - 'sPrice/dc': price, high to low
                - 'sSaleDate/dc': listing date, newest first

        Returns:
            (success flag, message, list of products)
        """
        # An empty keyword cannot produce a meaningful search; skip the request.
        if not keyword or not keyword.strip():
            return False, "搜尋關鍵字不可為空", []
        try:
            # Try the mobile site first
            self._rate_limit()
            products = self._search_mobile(keyword, limit, sort_by)
            if products:
                return True, f"成功取得 {len(products)} 個商品", products
            # Mobile attempt yielded nothing: fall back to the desktop site.
            # The fallback is a second HTTP request, so it is rate limited too.
            self._rate_limit()
            products = self._search_desktop(keyword, limit, sort_by)
            if products:
                return True, f"成功取得 {len(products)} 個商品", products
            return False, "無法解析商品資料", []
        except requests.Timeout:
            logger.error("[MOMO] 請求超時")
            return False, "請求超時", []
        except Exception as e:
            logger.error(f"[MOMO] 搜尋失敗: {e}")
            return False, str(e), []

    def _search_mobile(self, keyword: str, limit: int, sort_by: str) -> List[MomoProduct]:
        """Search via the mobile endpoint; returns [] on any HTTP or parsing failure."""
        try:
            # Mobile search parameters
            params = {
                'searchKeyword': keyword,
                'sortType': '4' if 'sSaleQty' in sort_by else '1',  # 4=sales volume, 1=relevance
                'maxPage': '1',
                'curPage': '1',
            }
            logger.info(f"[MOMO] 行動版搜尋: {keyword}")
            response = self.session.get(
                self.MOBILE_SEARCH_API,
                params=params,
                timeout=self.timeout
            )
            if response.status_code != 200:
                logger.warning(f"[MOMO] 行動版搜尋失敗: HTTP {response.status_code}")
                return []
            return self._parse_mobile_results(response.text, limit)
        except Exception as e:
            logger.warning(f"[MOMO] 行動版搜尋異常: {e}")
            return []

    def _search_desktop(self, keyword: str, limit: int, sort_by: str) -> List[MomoProduct]:
        """Search via the desktop endpoint; returns [] on any HTTP or parsing failure."""
        try:
            params = {
                'keyword': keyword,
                'searchType': '1',
                'cateLevel': '-1',
                'curPage': '1',
                'maxPage': '1',
                'minPage': '1',
                'areaCode': 'all',
                'isFuzzy': '0',
                'sortType': sort_by
            }
            logger.info(f"[MOMO] 桌面版搜尋: {keyword}")
            # Swap in a desktop User-Agent for this request only
            headers = self.DEFAULT_HEADERS.copy()
            headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
            response = self.session.get(
                self.SEARCH_API,
                params=params,
                headers=headers,
                timeout=self.timeout
            )
            if response.status_code != 200:
                logger.warning(f"[MOMO] 桌面版搜尋失敗: HTTP {response.status_code}")
                return []
            return self._parse_search_results(response.text, limit)
        except Exception as e:
            logger.warning(f"[MOMO] 桌面版搜尋異常: {e}")
            return []

    @classmethod
    def _to_price(cls, value) -> Optional[int]:
        """Convert a JSON number or a price string such as ``'$1,299'`` to an int.

        Returns None when the value holds no digits.
        """
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return int(value)
        if isinstance(value, str):
            match = cls._NUMBER_RE.search(value)
            if match:
                return int(match.group().replace(',', ''))
        return None

    @staticmethod
    def _absolute_image_url(url: str) -> str:
        """Prefix protocol-relative URLs (``//host/...``) with ``https:``; leave others untouched."""
        if url and url.startswith('//'):
            return 'https:' + url
        return url

    @classmethod
    def _extract_item_fields(cls, item_html: str) -> Optional[Dict]:
        """Extract product fields from the inner HTML of one desktop result item.

        Returns a dict keyed by MomoProduct field names (product_id, name,
        price, original_price, discount, image_url), or None when the ID,
        name or price cannot be found.
        """
        code_match = cls._I_CODE_RE.search(item_html)
        name_match = cls._PRD_NAME_RE.search(item_html) or cls._TITLE_ATTR_RE.search(item_html)
        original_match = cls._DEL_PRICE_RE.search(item_html)
        price_match = cls._BOLD_PRICE_RE.search(item_html)
        if not price_match:
            # Fallback: a "$"-prefixed number. The "$" is required because a bare
            # digit run would usually hit the i_code in the href first, and the
            # struck-through original price is removed so it is not taken as the
            # selling price.
            price_match = cls._DOLLAR_PRICE_RE.search(cls._DEL_PRICE_RE.sub('', item_html))
        if not (code_match and name_match and price_match):
            return None
        name = name_match.group(1).strip()
        if not name:
            return None
        price = int(price_match.group(1).replace(',', ''))
        original_price = int(original_match.group(1).replace(',', '')) if original_match else price
        # Discount as a percentage off, only when the item is actually reduced
        discount = None
        if original_price > price:
            discount = round((1 - price / original_price) * 100)
        image_match = cls._IMAGE_RE.search(item_html)
        return {
            'product_id': code_match.group(1),
            'name': name,
            'price': price,
            'original_price': original_price,
            'discount': discount,
            'image_url': cls._absolute_image_url(image_match.group(1) if image_match else ''),
        }

    def _product_from_link(self, link, product_id: str) -> Optional[MomoProduct]:
        """Build a product from one mobile-page anchor, or None when no usable name is found."""
        img = link.find('img')
        # Name candidates in priority order: title attribute, image alt text,
        # a child element with a name-like class, then the anchor's own text.
        name = link.get('title') or ''
        if not name and img is not None:
            name = img.get('alt') or ''
        if not name:
            name_elem = link.find(class_=self._NAME_CLASS_RE)
            if name_elem is not None:
                name = name_elem.get_text(strip=True)
        if not name:
            name = link.get_text(strip=True)
        name = name.strip()
        if len(name) < 3:
            return None
        # Price is looked up in the nearest enclosing li/div/article
        price = 0
        parent = link.find_parent(['li', 'div', 'article'])
        if parent is not None:
            price_elem = parent.find(class_=self._PRICE_CLASS_RE)
            if price_elem is not None:
                price = self._to_price(price_elem.get_text(strip=True)) or 0
        image_url = ''
        if img is not None:
            image_url = img.get('src') or img.get('data-src') or img.get('data-original', '')
        return MomoProduct(
            product_id=product_id,
            name=name[:100],  # cap the name length
            price=price,
            original_price=price,
            discount=None,
            image_url=self._absolute_image_url(image_url),
            product_url=f'{self.BASE_URL}/goods/GoodsDetail.jsp?i_code={product_id}',
            brand='',
            crawled_at=datetime.now()
        )

    def _products_from_next_data(self, soup, limit: int) -> List[MomoProduct]:
        """Read products from a ``__NEXT_DATA__`` script tag, if the page has one."""
        script = soup.find('script', {'id': '__NEXT_DATA__'})
        if script is None or not script.string:
            return []
        try:
            page_props = json.loads(script.string)['props']['pageProps']
            items = page_props.get('products') or page_props.get('items') or []
        except json.JSONDecodeError as e:
            logger.debug(f"[MOMO] __NEXT_DATA__ 解析失敗: {e}")
            return []
        except (AttributeError, KeyError, TypeError):
            # The payload does not have the expected props/pageProps shape
            return []
        if not isinstance(items, list):
            return []
        products: List[MomoProduct] = []
        for item in items:
            if len(products) >= limit:
                break
            # Skip malformed entries individually instead of aborting the parse
            if not isinstance(item, dict):
                continue
            product_id = str(item.get('goodsCode') or '')
            name = item.get('goodsName') or ''
            if not (product_id and name):
                continue
            price = self._to_price(item.get('price')) or 0
            original_price = self._to_price(item.get('suggestPrice'))
            if original_price is None:
                original_price = price
            products.append(MomoProduct(
                product_id=product_id,
                name=name,
                price=price,
                original_price=original_price,
                discount=None,
                image_url=self._absolute_image_url(str(item.get('imgUrl') or '')),
                product_url=f'{self.BASE_URL}/goods/GoodsDetail.jsp?i_code={product_id}',
                brand='',
                crawled_at=datetime.now()
            ))
        return products

    def _products_from_embedded_json(self, html: str, limit: int) -> List[MomoProduct]:
        """Last-resort parse: regex out goodsCode/goodsName/price triples embedded in the page."""
        products: List[MomoProduct] = []
        for match in self._EMBEDDED_JSON_RE.finditer(html):
            if len(products) >= limit:
                break
            product_id, name, price = match.groups()
            products.append(MomoProduct(
                product_id=product_id,
                name=name,
                price=int(price),
                original_price=int(price),
                discount=None,
                image_url='',
                product_url=f'{self.BASE_URL}/goods/GoodsDetail.jsp?i_code={product_id}',
                brand='',
                crawled_at=datetime.now()
            ))
        return products

    def _parse_mobile_results(self, html: str, limit: int) -> List[MomoProduct]:
        """
        Parse a mobile search result page.

        Three strategies are tried in order until one yields products:
        anchors carrying an i_code, the ``__NEXT_DATA__`` JSON payload, and
        JSON fragments embedded in the raw HTML.

        Args:
            html: Page HTML
            limit: Maximum number of products to return

        Returns:
            List of products ([] when parsing fails)
        """
        products: List[MomoProduct] = []
        try:
            soup = BeautifulSoup(html, 'html.parser')
            # Strategy 1: every anchor whose href carries an i_code
            seen_ids = set()
            for link in soup.find_all('a', href=self._I_CODE_RE):
                if len(products) >= limit:
                    break
                try:
                    match = self._I_CODE_RE.search(link.get('href', ''))
                    if not match or match.group(1) in seen_ids:
                        continue
                    product = self._product_from_link(link, match.group(1))
                except Exception as e:
                    logger.debug(f"[MOMO] 解析商品連結失敗: {e}")
                    continue
                if product is None:
                    continue
                # Record the ID only once a product was built, so a nameless
                # image link does not hide a later title link for the same item.
                seen_ids.add(product.product_id)
                products.append(product)
            # Strategy 2: Next.js page data
            if not products:
                products = self._products_from_next_data(soup, limit)
            # Strategy 3: JSON embedded in the HTML
            if not products:
                products = self._products_from_embedded_json(html, limit)
            logger.info(f"[MOMO] 行動版解析到 {len(products)} 個商品")
            return products
        except Exception as e:
            logger.error(f"[MOMO] 解析行動版結果失敗: {e}")
            return []

    def _parse_search_results(self, html: str, limit: int) -> List[MomoProduct]:
        """
        Parse a desktop search result page.

        Args:
            html: Page HTML
            limit: Maximum number of products to return

        Returns:
            List of products ([] when parsing fails)
        """
        products: List[MomoProduct] = []
        try:
            # Each result is an <li class="...goodsItemLi...">. Items that fail
            # to parse are skipped without counting towards the limit.
            for item_match in self._DESKTOP_ITEM_RE.finditer(html):
                if len(products) >= limit:
                    break
                try:
                    fields = self._extract_item_fields(item_match.group(1))
                except Exception as e:
                    logger.debug(f"[MOMO] 解析商品項目失敗: {e}")
                    continue
                if fields is None:
                    continue
                products.append(MomoProduct(
                    product_url=f'{self.BASE_URL}/goods/GoodsDetail.jsp?i_code={fields["product_id"]}',
                    brand='',
                    crawled_at=datetime.now(),
                    **fields
                ))
            # Nothing found in the markup: MOMO sometimes embeds JSON in the page
            if not products:
                products = self._products_from_embedded_json(html, limit)
            logger.info(f"[MOMO] 解析到 {len(products)} 個商品")
            return products
        except Exception as e:
            logger.error(f"[MOMO] 解析搜尋結果失敗: {e}")
            return []
# Module-wide crawler instance, created lazily by get_crawler()
_crawler_instance: Optional[MomoCrawler] = None


def get_crawler() -> MomoCrawler:
    """Return the shared MomoCrawler, creating it on first use (singleton)."""
    global _crawler_instance
    if _crawler_instance is not None:
        return _crawler_instance
    _crawler_instance = MomoCrawler()
    return _crawler_instance
def search_momo_products(keyword: str, limit: int = 10) -> Tuple[bool, str, List[dict]]:
    """
    Search MOMO products (convenience wrapper around the shared crawler).

    Args:
        keyword: Search keyword
        limit: Maximum number of products to return

    Returns:
        (success flag, message, list of product dicts)
    """
    ok, message, found = get_crawler().search_products(keyword, limit)
    serialized = []
    for product in found:
        serialized.append(product.to_dict())
    return ok, message, serialized
def get_momo_bestsellers(category: str, limit: int = 5) -> Tuple[bool, str, List[dict]]:
    """
    Fetch the best-selling MOMO products for a category.

    Args:
        category: Category keyword (e.g. '面膜', '乳液', '精華液')
        limit: Maximum number of products to return

    Returns:
        (success flag, message, list of condensed product dicts)
    """
    # Reuse the search API, sorted by sales volume
    ok, message, found = get_crawler().search_products(category, limit=limit, sort_by='sSaleQty/dc')
    if not (ok and found):
        return ok, message, []
    # Condensed representation of each product
    summaries = [
        {
            'name': item.name,
            'price': item.price,
            'original_price': item.original_price,
            'discount': item.discount,
            'url': item.product_url,
            'image': item.image_url
        }
        for item in found[:limit]
    ]
    return True, f"成功取得 {len(summaries)} 個熱銷商品", summaries
if __name__ == '__main__':
    # Manual smoke test: runs one keyword search and one best-seller lookup
    # against the live site and prints a short summary of each.
    logging.basicConfig(level=logging.INFO)
    print("=== MOMO 爬蟲測試 ===\n")

    # Keyword search
    print("[1] 測試搜尋 (關鍵字: 面膜)")
    _, search_msg, search_hits = search_momo_products('面膜', limit=5)
    print(f"結果: {search_msg}")
    if search_hits:
        print("搜尋結果:")
        for hit in search_hits[:3]:
            print(f" - {hit['name'][:30]}... ${hit['price']}")

    # Best sellers
    print("\n[2] 測試熱銷商品 (分類: 精華液)")
    _, best_msg, best_hits = get_momo_bestsellers('精華液', limit=5)
    print(f"結果: {best_msg}")
    if best_hits:
        print("熱銷商品:")
        for rank, hit in enumerate(best_hits, 1):
            print(f" {rank}. {hit['name'][:30]}... ${hit['price']}")