Some checks failed
CD Pipeline / deploy (push) Failing after 59s
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml) - 部署模式: rsync Python 檔案至 188 → docker restart (volume mount) - Dockerfile/requirements 變動時自動重建 Docker image - 部署通知: Telegram (開始/成功/失敗) - 健康檢查: https://mo.wooo.work/health (最多 5 次重試) - 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
522 lines
20 KiB
Python
522 lines
20 KiB
Python
"""
|
||
MOMO shopping-site crawler service.

Crawls product data from the MOMO shopping site. Supported features:
- Keyword search
- Best-selling product ranking

API reference:
- Search API: https://m.momoshop.com.tw/search.momo
- Product API: https://www.momoshop.com.tw/goods/GoodsDetail.jsp?i_code=XXX
|
||
"""
|
||
|
||
import re
|
||
import json
|
||
import time
|
||
import logging
|
||
from typing import List, Dict, Optional, Tuple
|
||
from dataclasses import dataclass, asdict
|
||
from datetime import datetime
|
||
|
||
import requests
|
||
from bs4 import BeautifulSoup
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
@dataclass
class MomoProduct:
    """Record describing one product scraped from MOMO."""

    product_id: str          # product ID (the i_code value)
    name: str                # product name
    price: int               # selling price
    original_price: int      # list price before discount
    discount: Optional[int]  # discount in percent, None when not computed
    image_url: str           # image URL
    product_url: str         # product page URL
    brand: str               # brand
    crawled_at: datetime     # time the record was crawled

    def to_dict(self) -> dict:
        """Return the record as a plain dict with ``crawled_at`` as an ISO-8601 string."""
        # Re-assigning an existing key keeps its position, so the key order
        # still follows the field order.
        return {**asdict(self), 'crawled_at': self.crawled_at.isoformat()}
|
||
|
||
|
||
class MomoCrawler:
    """Crawler for the MOMO shopping site.

    A search is tried against the mobile search endpoint first and falls back
    to the desktop endpoint when the mobile response yields no products. All
    requests go through one shared ``requests.Session`` and are spaced at
    least ``delay`` seconds apart.
    """

    # Base URLs
    BASE_URL = 'https://www.momoshop.com.tw'
    MOBILE_URL = 'https://m.momoshop.com.tw'
    SEARCH_API = 'https://www.momoshop.com.tw/search/searchShop.jsp'
    # Mobile search endpoint (easier to scrape)
    MOBILE_SEARCH_API = 'https://m.momoshop.com.tw/search.momo'

    # Default headers (imitate a mobile device).
    # Accept-Encoding is deliberately NOT set here: requests already
    # advertises exactly the encodings it is able to decode. Forcing "br"
    # makes the server free to answer with Brotli, which comes back as
    # undecodable bytes when no Brotli decoder is installed.
    DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
        'Referer': 'https://m.momoshop.com.tw/',
    }

    # --- Patterns, compiled once and shared by the parsers ---------------
    # Product ID embedded in a link (…i_code=12345…).
    _ICODE_RE = re.compile(r'i_code=(\d+)')
    # An amount such as "1,299"; must start with a digit so a lone "," never
    # matches (int('') would raise).
    _AMOUNT_RE = re.compile(r'\d[\d,]*')
    # CSS-class matchers used on the mobile page.
    _NAME_CLASS_RE = re.compile(r'name|title|goods', re.I)
    _PRICE_CLASS_RE = re.compile(r'price', re.I)
    # One <li class="…goodsItemLi…"> card on the desktop page.
    _GOODS_ITEM_RE = re.compile(
        r'<li[^>]*class=["\'][^"\']*goodsItemLi[^"\']*["\'][^>]*>(.*?)</li>',
        re.DOTALL | re.IGNORECASE
    )
    # Fields inside one desktop card.
    _PRD_NAME_RE = re.compile(r'<p[^>]*class=["\'][^"\']*prdName[^"\']*["\'][^>]*>([^<]+)</p>')
    _TITLE_ATTR_RE = re.compile(r'title=["\']([^"\']+)["\']')
    _BOLD_PRICE_RE = re.compile(r'<b[^>]*>\$?(\d[\d,]*)</b>')
    # Fallback price: the "$" is required, otherwise the first digit run in
    # the card (typically the i_code in the href) would be taken as the price.
    _DOLLAR_PRICE_RE = re.compile(r'\$\s*(\d[\d,]*)')
    _DEL_PRICE_RE = re.compile(r'<del[^>]*>\$?(\d[\d,]*)</del>')
    _IMAGE_RE = re.compile(
        r'(?:src|data-original)=["\']([^"\']+\.(?:jpg|png|webp)[^"\']*)["\']',
        re.IGNORECASE
    )
    # JSON fragments embedded in the HTML; goodsCode may be quoted or not.
    _EMBEDDED_JSON_RE = re.compile(
        r'"goodsCode"\s*:\s*"?(\d+)"?.*?"goodsName"\s*:\s*"([^"]+)".*?"price"\s*:\s*(\d+)',
        re.DOTALL
    )

    def __init__(self, timeout: int = 30, delay: float = 1.0):
        """
        Initialise the crawler.

        Args:
            timeout: request timeout in seconds
            delay: minimum interval between two requests in seconds
        """
        self.timeout = timeout
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update(self.DEFAULT_HEADERS)
        # None means "no request sent yet", so the first request never waits.
        self._last_request_time = None

    def _rate_limit(self):
        """Sleep as needed so consecutive requests are at least ``delay`` seconds apart."""
        # A monotonic clock is used because wall-clock time can jump
        # (NTP sync, manual changes) and distort the measured interval.
        if self._last_request_time is not None:
            remaining = self.delay - (time.monotonic() - self._last_request_time)
            if remaining > 0:
                time.sleep(remaining)
        self._last_request_time = time.monotonic()

    def search_products(self, keyword: str, limit: int = 10, sort_by: str = 'sSaleQty/dc') -> Tuple[bool, str, List[MomoProduct]]:
        """
        Search for products.

        Args:
            keyword: search keyword
            limit: maximum number of products to return
            sort_by: sort order
                - 'sSaleQty/dc': sales volume, high to low (best sellers)
                - 'sPrice/ac': price, low to high
                - 'sPrice/dc': price, high to low
                - 'sSaleDate/dc': listing date, newest first

        Returns:
            (success flag, message, list of products)
        """
        try:
            # Try the mobile endpoint first.
            products = self._search_mobile(keyword, limit, sort_by)
            if products:
                return True, f"成功取得 {len(products)} 個商品", products

            # Mobile produced nothing: fall back to the desktop endpoint.
            products = self._search_desktop(keyword, limit, sort_by)
            if products:
                return True, f"成功取得 {len(products)} 個商品", products

            return False, "無法解析商品資料", []

        # NOTE: both search helpers catch their own exceptions and return [],
        # so these handlers are only a defensive boundary for this method.
        except requests.Timeout:
            logger.error("[MOMO] 請求超時")
            return False, "請求超時", []
        except Exception as e:
            logger.error(f"[MOMO] 搜尋失敗: {e}")
            return False, str(e), []

    def _search_mobile(self, keyword: str, limit: int, sort_by: str) -> List[MomoProduct]:
        """Search through the mobile endpoint; return [] on any failure."""
        try:
            params = {
                'searchKeyword': keyword,
                # 4 = sales volume, 1 = relevance. Every sort order other
                # than sales volume is sent as relevance here.
                'sortType': '4' if 'sSaleQty' in sort_by else '1',
                'maxPage': '1',
                'curPage': '1',
            }

            logger.info(f"[MOMO] 行動版搜尋: {keyword}")

            # Throttle per HTTP request, so the desktop fallback is spaced
            # out from this request as well.
            self._rate_limit()
            response = self.session.get(
                self.MOBILE_SEARCH_API,
                params=params,
                timeout=self.timeout
            )

            if response.status_code != 200:
                logger.warning(f"[MOMO] 行動版搜尋失敗: HTTP {response.status_code}")
                return []

            return self._parse_mobile_results(response.text, limit)

        except Exception as e:
            logger.warning(f"[MOMO] 行動版搜尋異常: {e}")
            return []

    def _search_desktop(self, keyword: str, limit: int, sort_by: str) -> List[MomoProduct]:
        """Search through the desktop endpoint; return [] on any failure."""
        try:
            params = {
                'keyword': keyword,
                'searchType': '1',
                'cateLevel': '-1',
                'curPage': '1',
                'maxPage': '1',
                'minPage': '1',
                'areaCode': 'all',
                'isFuzzy': '0',
                'sortType': sort_by
            }

            logger.info(f"[MOMO] 桌面版搜尋: {keyword}")

            # Switch to a desktop User-Agent for this request only.
            headers = self.DEFAULT_HEADERS.copy()
            headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

            self._rate_limit()
            response = self.session.get(
                self.SEARCH_API,
                params=params,
                headers=headers,
                timeout=self.timeout
            )

            if response.status_code != 200:
                logger.warning(f"[MOMO] 桌面版搜尋失敗: HTTP {response.status_code}")
                return []

            return self._parse_search_results(response.text, limit)

        except Exception as e:
            logger.warning(f"[MOMO] 桌面版搜尋異常: {e}")
            return []

    @staticmethod
    def _to_int(amount: str) -> int:
        """Convert an amount such as '1,299' to an int."""
        return int(amount.replace(',', ''))

    @staticmethod
    def _absolute_image_url(url: str) -> str:
        """Prefix protocol-relative URLs ('//host/…') with 'https:'; leave others as is."""
        return 'https:' + url if url.startswith('//') else url

    def _make_product(self, product_id, name, price, original_price=None,
                      discount=None, image_url='') -> MomoProduct:
        """Build a MomoProduct; ``original_price`` defaults to ``price``."""
        return MomoProduct(
            product_id=product_id,
            name=name,
            price=price,
            original_price=price if original_price is None else original_price,
            discount=discount,
            image_url=image_url,
            product_url=f'{self.BASE_URL}/goods/GoodsDetail.jsp?i_code={product_id}',
            brand='',
            crawled_at=datetime.now()
        )

    def _parse_embedded_json(self, html: str, limit: int) -> List[MomoProduct]:
        """Extract products from goodsCode/goodsName/price JSON fragments in the HTML."""
        return [
            self._make_product(product_id, name, int(price))
            for product_id, name, price in self._EMBEDDED_JSON_RE.findall(html)[:limit]
        ]

    def _parse_mobile_results(self, html: str, limit: int) -> List[MomoProduct]:
        """
        Parse the mobile search page.

        Three strategies are tried in order until one yields products:
        product links, the ``__NEXT_DATA__`` script, JSON embedded in the HTML.

        Args:
            html: HTML content
            limit: maximum number of products to return

        Returns:
            list of products ([] when parsing fails)
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')

            products = self._parse_mobile_links(soup, limit)
            if not products:
                products = self._parse_next_data(soup, limit)
            if not products:
                products = self._parse_embedded_json(html, limit)

            logger.info(f"[MOMO] 行動版解析到 {len(products)} 個商品")
            return products

        except Exception as e:
            logger.error(f"[MOMO] 解析行動版結果失敗: {e}")
            return []

    def _parse_mobile_links(self, soup, limit: int) -> List[MomoProduct]:
        """Build products from <a> tags whose href carries an i_code."""
        products = []
        seen_ids = set()

        for link in soup.find_all('a', href=self._ICODE_RE):
            if len(products) >= limit:
                break

            try:
                match = self._ICODE_RE.search(link.get('href', ''))
                if not match:
                    continue

                product_id = match.group(1)
                if product_id in seen_ids:
                    continue

                name = self._link_name(link).strip()
                if len(name) < 3:
                    # Not marked as seen: another link to the same product
                    # (e.g. the text link next to an image link) may still
                    # carry a usable name.
                    continue

                product = self._make_product(
                    product_id,
                    name[:100],  # cap the name length
                    self._link_price(link),
                    image_url=self._link_image(link),
                )
                seen_ids.add(product_id)
                products.append(product)

            except Exception as e:
                logger.debug(f"[MOMO] 解析商品連結失敗: {e}")
                continue

        return products

    def _link_name(self, link) -> str:
        """Find a product name: title attribute, img alt, named child element, link text."""
        name = link.get('title') or ''
        if not name:
            img = link.find('img')
            if img and img.get('alt'):
                name = img.get('alt')
        if not name:
            name_elem = link.find(class_=self._NAME_CLASS_RE)
            if name_elem:
                name = name_elem.get_text(strip=True)
        return name or link.get_text(strip=True)

    def _link_price(self, link) -> int:
        """Read the price from the nearest li/div/article ancestor; 0 when not found."""
        parent = link.find_parent(['li', 'div', 'article'])
        if not parent:
            return 0
        price_elem = parent.find(class_=self._PRICE_CLASS_RE)
        if not price_elem:
            return 0
        match = self._AMOUNT_RE.search(price_elem.get_text(strip=True))
        return self._to_int(match.group()) if match else 0

    def _link_image(self, link) -> str:
        """Return the image URL of the link's <img>, or '' when there is none."""
        img = link.find('img')
        if not img:
            return ''
        url = img.get('src') or img.get('data-src') or img.get('data-original', '')
        return self._absolute_image_url(url)

    def _parse_next_data(self, soup, limit: int) -> List[MomoProduct]:
        """Build products from the Next.js ``__NEXT_DATA__`` script, if present."""
        script = soup.find('script', {'id': '__NEXT_DATA__'})
        if not (script and script.string):
            return []

        try:
            next_data = json.loads(script.string)
            props = next_data.get('props', {}).get('pageProps', {})
            items = props.get('products', []) or props.get('items', [])
        except (json.JSONDecodeError, AttributeError):
            # Invalid JSON or an unexpected layout: let the caller move on
            # to the next strategy.
            return []
        if not isinstance(items, list):
            return []

        products = []
        for item in items[:limit]:
            # Convert per item, so one malformed entry does not discard the
            # rest of the result.
            try:
                product_id = str(item.get('goodsCode', ''))
                name = item.get('goodsName', '')
                price = int(item.get('price', 0))
                original_price = int(item.get('suggestPrice', item.get('price', 0)))
            except (AttributeError, TypeError, ValueError) as e:
                logger.debug(f"[MOMO] 解析商品項目失敗: {e}")
                continue
            if product_id and name:
                products.append(self._make_product(product_id, name, price, original_price))
        return products

    def _parse_search_results(self, html: str, limit: int) -> List[MomoProduct]:
        """
        Parse the desktop search page.

        Product cards (``li.goodsItemLi``) are parsed first; when none yields
        a product, JSON embedded in the HTML is used instead.

        Args:
            html: HTML content
            limit: maximum number of products to return

        Returns:
            list of products ([] when parsing fails)
        """
        products = []

        try:
            for item_html in self._GOODS_ITEM_RE.findall(html)[:limit]:
                try:
                    product = self._parse_desktop_item(item_html)
                except Exception as e:
                    logger.debug(f"[MOMO] 解析商品項目失敗: {e}")
                    continue
                if product is not None:
                    products.append(product)

            # MOMO sometimes embeds JSON data in the HTML; use it when the
            # card parser found nothing.
            if not products:
                products = self._parse_embedded_json(html, limit)

            logger.info(f"[MOMO] 解析到 {len(products)} 個商品")
            return products

        except Exception as e:
            logger.error(f"[MOMO] 解析搜尋結果失敗: {e}")
            return []

    def _parse_desktop_item(self, item_html: str) -> Optional[MomoProduct]:
        """Build a product from one desktop card; None when ID, name or price is missing."""
        code_match = self._ICODE_RE.search(item_html)
        name_match = (self._PRD_NAME_RE.search(item_html)
                      or self._TITLE_ATTR_RE.search(item_html))
        price_match = (self._BOLD_PRICE_RE.search(item_html)
                       or self._DOLLAR_PRICE_RE.search(item_html))
        if not (code_match and name_match and price_match):
            return None

        price = self._to_int(price_match.group(1))
        original_price_match = self._DEL_PRICE_RE.search(item_html)
        original_price = (self._to_int(original_price_match.group(1))
                          if original_price_match else price)

        # Discount in percent, only when the list price is actually higher.
        discount = None
        if original_price > price:
            discount = round((1 - price / original_price) * 100)

        img_match = self._IMAGE_RE.search(item_html)
        image_url = self._absolute_image_url(img_match.group(1)) if img_match else ''

        return self._make_product(
            code_match.group(1),
            name_match.group(1).strip(),
            price,
            original_price,
            discount,
            image_url,
        )
|
||
|
||
|
||
# Module-level crawler instance, created lazily by get_crawler().
_crawler_instance: Optional[MomoCrawler] = None


def get_crawler() -> MomoCrawler:
    """Return the shared module-level MomoCrawler, creating it on first call."""
    global _crawler_instance
    if _crawler_instance is not None:
        return _crawler_instance
    _crawler_instance = MomoCrawler()
    return _crawler_instance
|
||
|
||
|
||
def search_momo_products(keyword: str, limit: int = 10) -> Tuple[bool, str, List[dict]]:
    """
    Search MOMO products (convenience function).

    Runs the search on the shared crawler and converts every product
    to a dict.

    Args:
        keyword: search keyword
        limit: maximum number of products to return

    Returns:
        (success flag, message, list of product dicts)
    """
    ok, note, found = get_crawler().search_products(keyword, limit)
    as_dicts = []
    for item in found:
        as_dicts.append(item.to_dict())
    return ok, note, as_dicts
|
||
|
||
|
||
def get_momo_bestsellers(category: str, limit: int = 5) -> Tuple[bool, str, List[dict]]:
    """
    Fetch the best-selling MOMO products for a category.

    Args:
        category: category keyword (e.g. '面膜', '乳液', '精華液')
        limit: maximum number of products to return

    Returns:
        (success flag, message, list of product dicts)
    """
    # Goes through the search API, sorted by sales volume.
    ok, note, found = get_crawler().search_products(
        category, limit=limit, sort_by='sSaleQty/dc'
    )

    # Nothing usable: pass the crawler's status and message through.
    if not (ok and found):
        return ok, note, []

    # Reduce each product to the compact set of fields.
    slim = [
        {
            'name': item.name,
            'price': item.price,
            'original_price': item.original_price,
            'discount': item.discount,
            'url': item.product_url,
            'image': item.image_url
        }
        for item in found[:limit]
    ]
    return True, f"成功取得 {len(slim)} 個熱銷商品", slim
|
||
|
||
|
||
if __name__ == '__main__':
    # Manual smoke test: one keyword search and one best-seller lookup.
    logging.basicConfig(level=logging.INFO)

    print("=== MOMO 爬蟲測試 ===\n")

    # Keyword search; only the first three hits are printed.
    print("[1] 測試搜尋 (關鍵字: 面膜)")
    _, search_msg, search_hits = search_momo_products('面膜', limit=5)
    print(f"結果: {search_msg}")
    if search_hits:
        print("搜尋結果:")
        for hit in search_hits[:3]:
            print(f" - {hit['name'][:30]}... ${hit['price']}")

    # Best sellers; printed as a numbered list starting at 1.
    print("\n[2] 測試熱銷商品 (分類: 精華液)")
    _, best_msg, best_hits = get_momo_bestsellers('精華液', limit=5)
    print(f"結果: {best_msg}")
    if best_hits:
        print("熱銷商品:")
        for rank, hit in enumerate(best_hits, 1):
            print(f" {rank}. {hit['name'][:30]}... ${hit['price']}")
|