Files
ewoooc/services/mybest_crawler.py
ogt 1b4f3a7bbe
Some checks failed
CD Pipeline / deploy (push) Failing after 59s
feat: EwoooC 初始化 — 完整專案推版至 Gitea
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml)
- 部署模式: rsync Python 檔案至 188 → docker restart (volume mount)
- Dockerfile/requirements 變動時自動重建 Docker image
- 部署通知: Telegram (開始/成功/失敗)
- 健康檢查: https://mo.wooo.work/health (最多 5 次重試)
- 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-19 01:21:13 +08:00

774 lines
27 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
mybest Taiwan crawler service.

Crawls recommendation articles and product rankings from mybest Taiwan.
Site: https://tw.my-best.com/

Category groups covered by the crawler's keyword table:
- Beauty and skincare
- Health and supplements
- Mother and baby
- Home and living
- 3C and home appliances
"""
import json
import logging
import re
import time
from dataclasses import dataclass, asdict
from datetime import datetime
from typing import List, Dict, Optional, Tuple
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
@dataclass
class MybestArticle:
    """One mybest article (a recommendation / ranking page)."""

    article_id: str        # article identifier
    title: str             # article title
    category: str          # category label
    product_count: int     # number of recommended products
    image_url: str         # thumbnail URL
    article_url: str       # URL of the article page
    description: str       # short summary
    crawled_at: datetime   # when the record was crawled

    def to_dict(self) -> dict:
        """Return the fields as a dict, with ``crawled_at`` as an ISO-8601 string."""
        return {**asdict(self), 'crawled_at': self.crawled_at.isoformat()}
@dataclass
class MybestProduct:
    """One product recommended inside a mybest article."""

    product_id: str        # product identifier
    name: str              # product name
    brand: str             # brand
    rank: int              # position in the ranking
    price: Optional[int]   # price, or None when unknown
    image_url: str         # image URL
    product_url: str       # link to the product
    article_title: str     # title of the article the product came from
    crawled_at: datetime   # when the record was crawled

    def to_dict(self) -> dict:
        """Return the fields as a dict, with ``crawled_at`` as an ISO-8601 string."""
        return {**asdict(self), 'crawled_at': self.crawled_at.isoformat()}
class MybestCrawler:
    """Crawler for mybest Taiwan (https://tw.my-best.com).

    Every public ``get_*`` / ``search_*`` method returns a
    ``(success, message, items)`` tuple and never raises: network and parsing
    errors are logged and reported through ``success`` / ``message``.
    """

    BASE_URL = 'https://tw.my-best.com'

    # Search keywords per category code. Only the first whitespace-separated
    # keyword of each value is sent to the search endpoint (see get_articles).
    CATEGORIES = {
        # Beauty and skincare
        'skincare': '基礎保養 精華液 乳液',
        'makeup': '彩妝 口紅 眼影',
        'hair_care': '洗髮精 護髮 頭髮保養',
        'body_care': '身體乳 身體保養',
        'sunscreen': '防曬 防曬乳',
        'mask': '面膜 保濕面膜',
        # Health and supplements
        'health': '健康食品 保健',
        'supplement': '營養補充 維他命',
        'diet': '減重 瘦身',
        # Mother and baby
        'baby': '嬰兒用品 奶瓶 尿布',
        'maternity': '孕婦用品 孕婦',
        'kids': '兒童用品 兒童',
        # Home and living
        'home': '居家用品 收納',
        'kitchen': '廚房用品 鍋具',
        # 3C and home appliances
        'electronics': '3C 電子 藍牙耳機',
        'appliances': '家電 吸塵器',
    }

    # Display name (Chinese) for each category code.
    CATEGORY_NAMES = {
        'skincare': '基礎保養',
        'makeup': '彩妝',
        'hair_care': '頭髮保養',
        'body_care': '身體保養',
        'sunscreen': '防曬',
        'mask': '面膜',
        'health': '健康食品',
        'supplement': '營養補充',
        'diet': '減重瘦身',
        'baby': '嬰兒用品',
        'maternity': '孕婦用品',
        'kids': '兒童用品',
        'home': '居家用品',
        'kitchen': '廚房用品',
        'electronics': '3C 電子',
        'appliances': '家電',
    }

    DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
        'Referer': 'https://tw.my-best.com/',
    }

    # Search-result links to articles look like "/12345".
    _ARTICLE_HREF_RE = re.compile(r'^/\d+$')
    # Leading integer part of a price string, allowing thousands separators;
    # an optional decimal part is matched but discarded.
    _PRICE_RE = re.compile(r'(\d[\d,]*)(?:\.\d+)?')

    def __init__(self, timeout: int = 30, delay: float = 1.0):
        """Create a crawler with its own ``requests.Session``.

        Args:
            timeout: Per-request timeout in seconds.
            delay: Minimum number of seconds between two requests.
        """
        self.timeout = timeout
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update(self.DEFAULT_HEADERS)
        # Monotonic timestamp of the previous request; None before the first.
        self._last_request_time: Optional[float] = None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _rate_limit(self):
        """Sleep as needed so consecutive requests are at least ``delay`` apart.

        Uses the monotonic clock: with wall-clock time a backwards clock
        adjustment would make the computed wait arbitrarily long.
        """
        if self._last_request_time is not None:
            remaining = self.delay - (time.monotonic() - self._last_request_time)
            if remaining > 0:
                time.sleep(remaining)
        self._last_request_time = time.monotonic()

    def _get(self, url: str, params: Optional[dict] = None):
        """Issue a rate-limited GET through the shared session."""
        self._rate_limit()
        return self.session.get(url, params=params, timeout=self.timeout)

    def _absolute_url(self, path) -> str:
        """Resolve *path* against ``BASE_URL``; return '' for an empty value.

        Accepts bare slugs ("123"), rooted paths ("/123"), protocol-relative
        and absolute URLs, so callers never produce double slashes or a URL
        glued onto the base URL.
        """
        if not path:
            return ''
        return urljoin(f"{self.BASE_URL}/", str(path))

    @staticmethod
    def _parse_price(value) -> Optional[int]:
        """Convert a raw price value to a whole number, or None if unusable.

        Numbers are truncated to int. Strings are scanned for the first
        number, thousands separators are dropped and any decimal part is
        discarded (so "NT$1,299.50" -> 1299 rather than 129950).
        """
        if not value or isinstance(value, bool):
            return None
        if isinstance(value, (int, float)):
            try:
                number = int(value)
            except (ValueError, OverflowError):  # NaN / infinity
                return None
            return number if number > 0 else None
        match = MybestCrawler._PRICE_RE.search(str(value))
        if not match:
            return None
        return int(match.group(1).replace(',', ''))

    @staticmethod
    def _page_props(next_data) -> dict:
        """Return ``props.pageProps`` from a Next.js payload, or {} if absent.

        Tolerates JSON ``null`` at either level so a missing section does not
        abort parsing with an AttributeError.
        """
        if not isinstance(next_data, dict):
            return {}
        props = next_data.get('props') or {}
        page_props = props.get('pageProps') if isinstance(props, dict) else None
        return page_props if isinstance(page_props, dict) else {}

    def _extract_next_data(self, html: str) -> Optional[dict]:
        """Return the ``__NEXT_DATA__`` JSON object embedded in *html*.

        Returns None when the script tag is missing or empty, when its content
        is not valid JSON, or when the JSON is not an object.
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')
            script = soup.find('script', {'id': '__NEXT_DATA__'})
            if script and script.string:
                data = json.loads(script.string)
                return data if isinstance(data, dict) else None
        except Exception as e:
            logger.debug(f"[mybest] 提取 __NEXT_DATA__ 失敗: {e}")
        return None

    # ------------------------------------------------------------------
    # Category articles
    # ------------------------------------------------------------------

    def get_articles(self, category: str = 'skincare', limit: int = 10) -> Tuple[bool, str, List[MybestArticle]]:
        """Fetch articles for a category via the ``/search_contents`` page.

        If the search page yields no articles, falls back to the homepage
        (``get_latest_presses``); fallback results carry the homepage's own
        category labels, not *category*.

        Args:
            category: Category code (a key of ``CATEGORIES``). Unknown codes
                are searched with the 'skincare' keyword.
            limit: Maximum number of articles to return.

        Returns:
            (success, message, articles)
        """
        try:
            if category not in self.CATEGORIES:
                logger.warning(f"[mybest] 未知分類 '{category}',改用預設分類 skincare")
            search_keyword = self.CATEGORIES.get(category, self.CATEGORIES['skincare'])
            # Only the first keyword of the category is used as the query.
            keyword = search_keyword.split()[0]
            url = f"{self.BASE_URL}/search_contents"
            logger.info(f"[mybest] 搜尋文章: {category} -> {keyword}")
            response = self._get(url, params={'q': keyword})
            if response.status_code != 200:
                return False, f"HTTP {response.status_code}", []
            articles = self._parse_search_page(response.text, category, limit)
            if articles:
                return True, f"成功取得 {len(articles)} 篇文章", articles
            # Fallback: take whatever the homepage recommends.
            return self.get_latest_presses(limit)
        except requests.Timeout:
            logger.error("[mybest] 請求超時")
            return False, "請求超時", []
        except Exception as e:
            logger.error(f"[mybest] 取得文章列表失敗: {e}")
            return False, str(e), []

    def _parse_search_page(self, html: str, category: str, limit: int) -> List[MybestArticle]:
        """Parse a search-results page by scanning its article links.

        Article links are anchors whose href is ``/<digits>``. Each article id
        is returned once; links whose text is shorter than 5 characters are
        skipped. Titles are truncated to 100 characters.
        """
        articles = []
        try:
            soup = BeautifulSoup(html, 'html.parser')
            category_name = self.CATEGORY_NAMES.get(category, category)
            seen_ids = set()
            for link in soup.find_all('a', href=self._ARTICLE_HREF_RE):
                if len(articles) >= limit:
                    break
                try:
                    article_id = link.get('href', '').strip('/')
                    if article_id in seen_ids:
                        continue
                    seen_ids.add(article_id)

                    # Prefer a heading-like child for the title, then fall
                    # back to the full text of the link.
                    title = ''
                    title_elem = link.find(['h2', 'h3', 'span', 'p'])
                    if title_elem:
                        title = title_elem.get_text(strip=True)
                    if not title:
                        title = link.get_text(strip=True)
                    if not title or len(title) < 5:
                        continue

                    image_url = ''
                    img = link.find('img')
                    if img:
                        image_url = img.get('src') or img.get('data-src', '')

                    articles.append(MybestArticle(
                        article_id=article_id,
                        title=title[:100],
                        category=category_name,
                        product_count=0,
                        image_url=image_url,
                        article_url=f"{self.BASE_URL}/{article_id}",
                        description='',
                        crawled_at=datetime.now(),
                    ))
                except Exception as e:
                    logger.debug(f"[mybest] 解析文章連結失敗: {e}")
                    continue
            logger.info(f"[mybest] 從搜尋頁解析到 {len(articles)} 篇文章")
            return articles
        except Exception as e:
            logger.error(f"[mybest] 解析搜尋頁面失敗: {e}")
            return []

    def _parse_articles(self, html: str, category: str, limit: int) -> List[MybestArticle]:
        """Parse an article-listing page.

        Tries the Next.js payload first (``pageProps.presses`` or
        ``pageProps.articles``); if that yields nothing, scans anchors in the
        HTML. Article URLs are resolved against ``BASE_URL`` so relative,
        rooted and absolute values all produce a valid URL.
        """
        articles = []
        category_name = self.CATEGORY_NAMES.get(category, category)
        try:
            next_data = self._extract_next_data(html)
            if next_data:
                props = self._page_props(next_data)
                items = props.get('presses') or props.get('articles') or []
                for item in items[:limit]:
                    try:
                        slug = item.get('slug', '') or item.get('url', '')
                        image_url = (
                            item.get('rectangleThumbnailUrl')
                            or item.get('thumbnailUrl')
                            or item.get('imageUrl', '')
                        )
                        product_count = item.get('productCountInt', 0) or item.get('productCount', 0)
                        articles.append(MybestArticle(
                            article_id=str(item.get('id', '')),
                            title=item.get('title', '') or item.get('name', ''),
                            category=category_name,
                            product_count=product_count,
                            image_url=image_url,
                            article_url=self._absolute_url(slug),
                            description=item.get('description', ''),
                            crawled_at=datetime.now(),
                        ))
                    except Exception as e:
                        logger.debug(f"[mybest] 解析文章項目失敗: {e}")
                        continue

            # Fallback: scrape anchors straight from the HTML.
            if not articles:
                soup = BeautifulSoup(html, 'html.parser')
                # Look at more anchors than requested because many of them
                # are not article links.
                for card in soup.select('a[href*="/"]')[:limit * 3]:
                    try:
                        href = card.get('href', '')
                        if not href or '/categories/' in href or href == '/':
                            continue
                        title_elem = card.select_one('[class*="title"], h2, h3')
                        if not title_elem:
                            continue
                        title = title_elem.get_text(strip=True)
                        if not title or len(title) < 5:
                            continue

                        image_url = ''
                        img_elem = card.select_one('img')
                        if img_elem:
                            image_url = img_elem.get('src') or img_elem.get('data-src', '')

                        # Id = last path segment, ignoring query string and
                        # any trailing slash.
                        article_id = urlparse(href).path.rstrip('/').rsplit('/', 1)[-1]

                        articles.append(MybestArticle(
                            article_id=article_id,
                            title=title,
                            category=category_name,
                            product_count=0,
                            image_url=image_url,
                            article_url=self._absolute_url(href),
                            description='',
                            crawled_at=datetime.now(),
                        ))
                        if len(articles) >= limit:
                            break
                    except Exception as e:
                        logger.debug(f"[mybest] 解析 HTML 文章失敗: {e}")
                        continue
            logger.info(f"[mybest] 解析到 {len(articles)} 篇文章")
            return articles
        except Exception as e:
            logger.error(f"[mybest] 解析文章列表失敗: {e}")
            return []

    # ------------------------------------------------------------------
    # Products inside an article
    # ------------------------------------------------------------------

    def get_article_products(self, article_url: str, limit: int = 10) -> Tuple[bool, str, List[MybestProduct]]:
        """Fetch the recommended products listed in one article.

        Args:
            article_url: URL of the article page.
            limit: Maximum number of products to return.

        Returns:
            (success, message, products)
        """
        try:
            logger.info(f"[mybest] 取得文章商品: {article_url}")
            response = self._get(article_url)
            if response.status_code != 200:
                return False, f"HTTP {response.status_code}", []
            products = self._parse_article_products(response.text, article_url, limit)
            if products:
                return True, f"成功取得 {len(products)} 個商品", products
            return False, "無法解析商品資料", []
        except requests.Timeout:
            logger.error("[mybest] 請求超時")
            return False, "請求超時", []
        except Exception as e:
            logger.error(f"[mybest] 取得文章商品失敗: {e}")
            return False, str(e), []

    def _parse_article_products(self, html: str, article_url: str, limit: int) -> List[MybestProduct]:
        """Parse the products recommended in an article page.

        Tries the Next.js payload first (``press``/``article`` ->
        ``products``/``items``), ranking products by their position in that
        list. If that yields nothing, scans HTML blocks whose class contains
        "ranking" or "product"; there, ranks are assigned consecutively to
        the blocks that actually yield a product, and repeated
        (name, brand) pairs are dropped because the class selectors also
        match containers nested around the same product.
        """
        products = []
        try:
            soup = BeautifulSoup(html, 'html.parser')
            article_title = ''
            title_elem = soup.select_one('h1')
            if title_elem:
                article_title = title_elem.get_text(strip=True)

            next_data = self._extract_next_data(html)
            if next_data:
                props = self._page_props(next_data)
                press = props.get('press') or props.get('article') or {}
                items = press.get('products') or press.get('items') or []
                if not article_title:
                    article_title = press.get('title', '')
                for idx, item in enumerate(items[:limit], 1):
                    try:
                        products.append(MybestProduct(
                            product_id=str(item.get('id', '')),
                            name=item.get('name', '') or item.get('title', ''),
                            brand=item.get('brand', '') or item.get('maker', ''),
                            rank=idx,
                            price=self._parse_price(item.get('price') or item.get('lowestPrice')),
                            image_url=item.get('imageUrl') or item.get('thumbnailUrl', ''),
                            product_url=item.get('url') or item.get('affiliateUrl', ''),
                            article_title=article_title,
                            crawled_at=datetime.now(),
                        ))
                    except Exception as e:
                        logger.debug(f"[mybest] 解析商品項目失敗: {e}")
                        continue

            # Fallback: scrape product blocks straight from the HTML.
            if not products:
                seen = set()
                for section in soup.select('[class*="ranking"], [class*="product"]'):
                    if len(products) >= limit:
                        break
                    try:
                        name_elem = section.select_one('h2, h3, [class*="name"], [class*="title"]')
                        if not name_elem:
                            continue
                        name = name_elem.get_text(strip=True)
                        if not name or len(name) < 3:
                            continue

                        brand_elem = section.select_one('[class*="brand"], [class*="maker"]')
                        brand = brand_elem.get_text(strip=True) if brand_elem else ''

                        if (name, brand) in seen:
                            continue
                        seen.add((name, brand))

                        image_url = ''
                        img_elem = section.select_one('img')
                        if img_elem:
                            image_url = img_elem.get('src') or img_elem.get('data-src', '')

                        link_elem = section.select_one('a[href*="http"]')
                        product_url = link_elem.get('href', '') if link_elem else ''

                        # No real id is available here, so the rank doubles
                        # as the product id.
                        rank = len(products) + 1
                        products.append(MybestProduct(
                            product_id=str(rank),
                            name=name,
                            brand=brand,
                            rank=rank,
                            price=None,
                            image_url=image_url,
                            product_url=product_url,
                            article_title=article_title,
                            crawled_at=datetime.now(),
                        ))
                    except Exception as e:
                        logger.debug(f"[mybest] 解析 HTML 商品失敗: {e}")
                        continue
            logger.info(f"[mybest] 解析到 {len(products)} 個商品")
            return products
        except Exception as e:
            logger.error(f"[mybest] 解析文章商品失敗: {e}")
            return []

    # ------------------------------------------------------------------
    # Homepage
    # ------------------------------------------------------------------

    def get_latest_presses(self, limit: int = 20) -> Tuple[bool, str, List[MybestArticle]]:
        """Fetch the articles recommended on the homepage.

        Args:
            limit: Maximum number of articles to return.

        Returns:
            (success, message, articles)
        """
        try:
            url = f"{self.BASE_URL}/"
            logger.info(f"[mybest] 取得首頁文章: {url}")
            response = self._get(url)
            if response.status_code != 200:
                return False, f"HTTP {response.status_code}", []
            articles = self._parse_homepage(response.text, limit)
            if articles:
                return True, f"成功取得 {len(articles)} 篇文章", articles
            return False, "無法解析文章資料", []
        except requests.Timeout:
            logger.error("[mybest] 請求超時")
            return False, "請求超時", []
        except Exception as e:
            logger.error(f"[mybest] 取得最新文章失敗: {e}")
            return False, str(e), []

    def _parse_homepage(self, html: str, limit: int) -> List[MybestArticle]:
        """Parse the homepage's Next.js payload into articles.

        Reads ``pageProps.data.top.displayableContents`` first, then fills the
        remaining slots from ``displayableItemLists``. Content entries that
        have neither a URL nor a ``pressId`` are skipped, since no link can be
        built for them. Returns at most *limit* articles.
        """
        articles = []
        try:
            next_data = self._extract_next_data(html)
            if next_data:
                props = self._page_props(next_data)
                top_data = (props.get('data') or {}).get('top') or {}

                for item in (top_data.get('displayableContents') or [])[:limit]:
                    try:
                        raw_id = item.get('pressId')
                        press_id = '' if raw_id is None else str(raw_id)
                        article_url = self._absolute_url(item.get('url', ''))
                        if not article_url:
                            if not press_id:
                                continue
                            article_url = f"{self.BASE_URL}/{press_id}"

                        image_url = item.get('rectangleThumbnailUrl', '') or item.get('thumbnailProductImageUrl', '')

                        # The merchandise name serves as the title.
                        merchandise = item.get('merchandise', '')
                        if isinstance(merchandise, str) and merchandise:
                            title = f"{merchandise} 推薦排行榜"
                        elif isinstance(merchandise, dict):
                            title = merchandise.get('name', '') or f"精選推薦 #{press_id}"
                        else:
                            title = f"精選推薦 #{press_id}"

                        articles.append(MybestArticle(
                            article_id=press_id,
                            title=title,
                            category='精選推薦',
                            product_count=item.get('productCountInt', 0),
                            image_url=image_url,
                            article_url=article_url,
                            description='',
                            crawled_at=datetime.now(),
                        ))
                    except Exception as e:
                        logger.debug(f"[mybest] 解析首頁文章失敗: {e}")
                        continue

                # Curated item lists fill whatever room is left.
                remaining = max(limit - len(articles), 0)
                for item in (top_data.get('displayableItemLists') or [])[:remaining]:
                    try:
                        title = item.get('title', '')
                        if not title:
                            continue
                        list_id = str(item.get('id', ''))
                        articles.append(MybestArticle(
                            article_id=f"list_{list_id}",
                            title=title,
                            category='達人推薦',
                            product_count=len(item.get('itemParts') or []),
                            image_url=item.get('thumbnailCardSquareUrl', ''),
                            article_url=f"{self.BASE_URL}/item_lists/{list_id}",
                            description=(item.get('introduction') or '')[:100],
                            crawled_at=datetime.now(),
                        ))
                    except Exception as e:
                        logger.debug(f"[mybest] 解析精選清單失敗: {e}")
                        continue
            logger.info(f"[mybest] 從首頁解析到 {len(articles)} 篇文章")
            return articles[:limit]
        except Exception as e:
            logger.error(f"[mybest] 解析首頁失敗: {e}")
            return []

    # ------------------------------------------------------------------
    # Free-text search
    # ------------------------------------------------------------------

    def search_articles(self, keyword: str, limit: int = 10) -> Tuple[bool, str, List[MybestArticle]]:
        """Search articles by free-text keyword.

        Args:
            keyword: Search keyword.
            limit: Maximum number of articles to return.

        Returns:
            (success, message, articles); articles carry the category
            label 'search'.
        """
        # NOTE(review): this queries "/search" while get_articles queries
        # "/search_contents" - confirm both endpoints are intended.
        try:
            url = f"{self.BASE_URL}/search"
            logger.info(f"[mybest] 搜尋文章: {keyword}")
            response = self._get(url, params={'q': keyword})
            if response.status_code != 200:
                return False, f"HTTP {response.status_code}", []
            articles = self._parse_articles(response.text, 'search', limit)
            if articles:
                return True, f"成功取得 {len(articles)} 篇文章", articles
            return False, "無搜尋結果", []
        except requests.Timeout:
            logger.error("[mybest] 搜尋超時")
            return False, "請求超時", []
        except Exception as e:
            logger.error(f"[mybest] 搜尋失敗: {e}")
            return False, str(e), []
# Shared crawler used by the convenience functions; created on first use.
_crawler_instance: Optional[MybestCrawler] = None


def get_crawler() -> MybestCrawler:
    """Return the shared ``MybestCrawler``, creating it on the first call."""
    global _crawler_instance
    if _crawler_instance is not None:
        return _crawler_instance
    _crawler_instance = MybestCrawler()
    return _crawler_instance
def get_mybest_articles(category: str = 'skincare', limit: int = 10) -> Tuple[bool, str, List[dict]]:
    """Fetch category articles as plain dicts (convenience wrapper).

    Args:
        category: Category code.
        limit: Maximum number of articles to return.

    Returns:
        (success, message, list of article dicts)
    """
    ok, note, found = get_crawler().get_articles(category, limit)
    payload = [article.to_dict() for article in found]
    return ok, note, payload
def get_mybest_latest(limit: int = 20) -> Tuple[bool, str, List[dict]]:
    """Fetch the latest homepage articles as plain dicts (convenience wrapper).

    Args:
        limit: Maximum number of articles to return.

    Returns:
        (success, message, list of article dicts)
    """
    ok, note, found = get_crawler().get_latest_presses(limit)
    payload = [article.to_dict() for article in found]
    return ok, note, payload
def search_mybest_articles(keyword: str, limit: int = 10) -> Tuple[bool, str, List[dict]]:
    """Search articles by keyword and return plain dicts (convenience wrapper).

    Args:
        keyword: Search keyword.
        limit: Maximum number of articles to return.

    Returns:
        (success, message, list of article dicts)
    """
    ok, note, found = get_crawler().search_articles(keyword, limit)
    payload = [article.to_dict() for article in found]
    return ok, note, payload
def get_mybest_categories() -> Dict[str, str]:
    """Return a fresh copy of the category-code -> display-name mapping."""
    return dict(MybestCrawler.CATEGORY_NAMES)
if __name__ == '__main__':
    # Manual smoke test: calls the three convenience functions and prints a
    # short summary of what each one returned.
    logging.basicConfig(level=logging.INFO)
    print("=== mybest 爬蟲測試 ===\n")

    # [1] Latest articles: every row is printed, with its product count.
    print("[1] 測試最新文章")
    _, summary, rows = get_mybest_latest(limit=5)
    print(f"結果: {summary}")
    if rows:
        print("最新文章:")
        for row in rows:
            headline = row['title'][:40]
            print(f" - {headline}... ({row['product_count']}款商品)")

    # [2] Category lookup and [3] keyword search: print at most three titles.
    titled_checks = (
        ("\n[2] 測試分類文章 (分類: 基礎保養)", "分類文章:",
         lambda: get_mybest_articles('skincare', limit=5)),
        ("\n[3] 測試搜尋 (關鍵字: 面膜)", "搜尋結果:",
         lambda: search_mybest_articles('面膜', limit=5)),
    )
    for banner, heading, run in titled_checks:
        print(banner)
        _, summary, rows = run()
        print(f"結果: {summary}")
        if rows:
            print(heading)
            for row in rows[:3]:
                print(f" - {row['title'][:40]}...")