Some checks failed
CD Pipeline / deploy (push) Failing after 59s
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml) - 部署模式: rsync Python 檔案至 188 → docker restart (volume mount) - Dockerfile/requirements 變動時自動重建 Docker image - 部署通知: Telegram (開始/成功/失敗) - 健康檢查: https://mo.wooo.work/health (最多 5 次重試) - 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
774 lines
27 KiB
Python
774 lines
27 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
mybest 台灣爬蟲服務
|
||
|
||
爬取 mybest 台灣的推薦文章和商品排行
|
||
網站: https://tw.my-best.com/
|
||
|
||
支援分類:
|
||
- 美妝保養
|
||
- 健康保健
|
||
- 母嬰用品
|
||
- 生活用品
|
||
"""
|
||
|
||
import re
|
||
import json
|
||
import time
|
||
import logging
|
||
from typing import List, Dict, Optional, Tuple
|
||
from dataclasses import dataclass, asdict
|
||
from datetime import datetime
|
||
|
||
import requests
|
||
from bs4 import BeautifulSoup
|
||
|
||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||
|
||
|
||
@dataclass
class MybestArticle:
    """One mybest article (a recommendation / ranking page)."""

    # Article identifier.
    article_id: str
    # Article headline.
    title: str
    # Category label.
    category: str
    # Number of recommended products in the article.
    product_count: int
    # Thumbnail image URL.
    image_url: str
    # URL of the article page.
    article_url: str
    # Short introduction text.
    description: str
    # When the record was crawled.
    crawled_at: datetime

    def to_dict(self) -> dict:
        """Return the fields as a dict, with ``crawled_at`` as an ISO-8601 string."""
        return {**asdict(self), 'crawled_at': self.crawled_at.isoformat()}
|
||
|
||
|
||
@dataclass
class MybestProduct:
    """One recommended product listed inside a mybest article."""

    # Product identifier.
    product_id: str
    # Product name.
    name: str
    # Brand name.
    brand: str
    # Position in the ranking.
    rank: int
    # Price, or None when unknown.
    price: Optional[int]
    # Product image URL.
    image_url: str
    # Link to the product.
    product_url: str
    # Title of the article the product belongs to.
    article_title: str
    # When the record was crawled.
    crawled_at: datetime

    def to_dict(self) -> dict:
        """Return the fields as a dict, with ``crawled_at`` as an ISO-8601 string."""
        return {**asdict(self), 'crawled_at': self.crawled_at.isoformat()}
|
||
|
||
|
||
class MybestCrawler:
    """Crawler for mybest Taiwan (https://tw.my-best.com).

    Fetches article listings and the ranked products inside an article.
    Pages are parsed from the embedded ``__NEXT_DATA__`` JSON when it is
    present, with a best-effort HTML fallback. The public ``get_*`` /
    ``search_*`` methods catch every ``Exception`` and report the outcome as
    a ``(success, message, items)`` tuple instead of raising.
    """

    BASE_URL = 'https://tw.my-best.com'

    # Search keywords per category code, sent to the search_contents endpoint.
    # Only the first whitespace-separated keyword of each value is used.
    CATEGORIES = {
        # Beauty & skincare
        'skincare': '基礎保養 精華液 乳液',
        'makeup': '彩妝 口紅 眼影',
        'hair_care': '洗髮精 護髮 頭髮保養',
        'body_care': '身體乳 身體保養',
        'sunscreen': '防曬 防曬乳',
        'mask': '面膜 保濕面膜',

        # Health
        'health': '健康食品 保健',
        'supplement': '營養補充 維他命',
        'diet': '減重 瘦身',

        # Mother & baby
        'baby': '嬰兒用品 奶瓶 尿布',
        'maternity': '孕婦用品 孕婦',
        'kids': '兒童用品 兒童',

        # Home & living
        'home': '居家用品 收納',
        'kitchen': '廚房用品 鍋具',

        # Electronics & appliances
        'electronics': '3C 電子 藍牙耳機',
        'appliances': '家電 吸塵器',
    }

    # Chinese display name per category code.
    CATEGORY_NAMES = {
        'skincare': '基礎保養',
        'makeup': '彩妝',
        'hair_care': '頭髮保養',
        'body_care': '身體保養',
        'sunscreen': '防曬',
        'mask': '面膜',
        'health': '健康食品',
        'supplement': '營養補充',
        'diet': '減重瘦身',
        'baby': '嬰兒用品',
        'maternity': '孕婦用品',
        'kids': '兒童用品',
        'home': '居家用品',
        'kitchen': '廚房用品',
        'electronics': '3C 電子',
        'appliances': '家電',
    }

    DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
        'Referer': 'https://tw.my-best.com/',
    }

    def __init__(self, timeout: int = 30, delay: float = 1.0):
        """Create a crawler with its own HTTP session.

        Args:
            timeout: Per-request timeout in seconds.
            delay: Minimum number of seconds between two requests.
        """
        self.timeout = timeout
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update(self.DEFAULT_HEADERS)
        # None means "no request made yet", so the first request never sleeps.
        self._last_request_time: Optional[float] = None

    def _rate_limit(self) -> None:
        """Sleep as needed so requests are at least ``self.delay`` seconds apart."""
        last = self._last_request_time
        if last is not None:
            # Monotonic clock: a wall-clock adjustment must not be able to
            # produce a negative elapsed time and an oversized sleep.
            remaining = self.delay - (time.monotonic() - last)
            if remaining > 0:
                time.sleep(remaining)
        self._last_request_time = time.monotonic()

    @staticmethod
    def _parse_price(value) -> Optional[int]:
        """Convert a raw price value to a whole number.

        Args:
            value: A number or a string such as ``"NT$1,299"``.

        Returns:
            The integer part of the first number found, or ``None`` when the
            value is falsy, a bool, or contains no digits.
        """
        if not value or isinstance(value, bool):
            return None
        if isinstance(value, (int, float)):
            try:
                return int(value)
            except (ValueError, OverflowError):  # NaN / infinity
                return None
        # Keep only the integer part; thousands separators are dropped and a
        # decimal fraction is ignored rather than glued onto the digits.
        match = re.search(r'(\d[\d,]*)(?:\.\d+)?', str(value))
        if not match:
            return None
        digits = match.group(1).replace(',', '')
        return int(digits) if digits else None

    def _absolute_url(self, path) -> str:
        """Return ``path`` as an absolute URL on ``BASE_URL``.

        Absolute http(s) URLs are returned unchanged, protocol-relative URLs
        get an ``https:`` scheme, and an empty value yields ``''``.
        """
        if not path:
            return ''
        path = str(path)
        if path.startswith(('http://', 'https://')):
            return path
        if path.startswith('//'):
            return f"https:{path}"
        return f"{self.BASE_URL}/{path.lstrip('/')}"

    def _extract_next_data(self, html: str) -> Optional[dict]:
        """Return the parsed ``__NEXT_DATA__`` JSON object of a page, or None."""
        try:
            soup = BeautifulSoup(html, 'html.parser')
            script = soup.find('script', {'id': '__NEXT_DATA__'})
            if script and script.string:
                data = json.loads(script.string)
                # Callers use dict methods on the result.
                return data if isinstance(data, dict) else None
        except Exception as e:
            logger.debug("[mybest] 提取 __NEXT_DATA__ 失敗: %s", e)
        return None

    def get_articles(self, category: str = 'skincare', limit: int = 10) -> Tuple[bool, str, List["MybestArticle"]]:
        """Fetch articles for a category through the search_contents page.

        An unknown category falls back to the ``'skincare'`` keywords. When
        the search page yields no articles, the homepage articles are
        returned instead.

        Args:
            category: Category code (a key of ``CATEGORIES``).
            limit: Maximum number of articles to return.

        Returns:
            ``(success, message, articles)``.
        """
        try:
            self._rate_limit()

            search_keyword = self.CATEGORIES.get(category, self.CATEGORIES['skincare'])
            # Only the first keyword is sent to the site.
            keyword = search_keyword.split()[0]

            url = f"{self.BASE_URL}/search_contents"
            params = {'q': keyword}

            logger.info("[mybest] 搜尋文章: %s -> %s", category, keyword)

            response = self.session.get(url, params=params, timeout=self.timeout)
            if response.status_code != 200:
                return False, f"HTTP {response.status_code}", []

            articles = self._parse_search_page(response.text, category, limit)
            if articles:
                return True, f"成功取得 {len(articles)} 篇文章", articles

            # Fallback: take the articles shown on the homepage.
            return self.get_latest_presses(limit)

        except requests.Timeout:
            logger.error("[mybest] 請求超時")
            return False, "請求超時", []
        except Exception as e:
            logger.error("[mybest] 取得文章列表失敗: %s", e)
            return False, str(e), []

    def _parse_search_page(self, html: str, category: str, limit: int) -> List["MybestArticle"]:
        """Parse a search result page by scanning its HTML links.

        Article links are anchors whose href is ``/<digits>``. Each article
        id is returned at most once; links whose text is shorter than five
        characters are ignored.
        """
        articles: List["MybestArticle"] = []

        try:
            soup = BeautifulSoup(html, 'html.parser')
            article_links = soup.find_all('a', href=re.compile(r'^/\d+$'))
            category_name = self.CATEGORY_NAMES.get(category, category)

            seen_ids = set()
            for link in article_links:
                if len(articles) >= limit:
                    break

                try:
                    article_id = link.get('href', '').strip('/')
                    if article_id in seen_ids:
                        continue

                    # Title: a heading/text child first, then the link text.
                    title = ''
                    title_elem = link.find(['h2', 'h3', 'span', 'p'])
                    if title_elem:
                        title = title_elem.get_text(strip=True)
                    if not title:
                        title = link.get_text(strip=True)

                    # Skip links without a usable title.
                    if not title or len(title) < 5:
                        continue

                    # Mark the id as seen only once the link is accepted, so
                    # an image-only link does not hide the titled link that
                    # points at the same article.
                    seen_ids.add(article_id)

                    image_url = ''
                    img = link.find('img')
                    if img:
                        image_url = img.get('src') or img.get('data-src', '')

                    articles.append(MybestArticle(
                        article_id=article_id,
                        title=title[:100],
                        category=category_name,
                        product_count=0,
                        image_url=image_url,
                        article_url=f"{self.BASE_URL}/{article_id}",
                        description='',
                        crawled_at=datetime.now(),
                    ))

                except Exception as e:
                    logger.debug("[mybest] 解析文章連結失敗: %s", e)
                    continue

            logger.info("[mybest] 從搜尋頁解析到 %s 篇文章", len(articles))
            return articles

        except Exception as e:
            logger.error("[mybest] 解析搜尋頁面失敗: %s", e)
            return []

    def _parse_articles(self, html: str, category: str, limit: int) -> List["MybestArticle"]:
        """Parse an article listing page.

        Reads ``presses`` / ``articles`` from ``__NEXT_DATA__`` first; when
        that yields nothing, falls back to scanning anchors in the HTML.
        """
        articles: List["MybestArticle"] = []
        category_name = self.CATEGORY_NAMES.get(category, category)

        try:
            # Preferred source: the Next.js page data.
            next_data = self._extract_next_data(html)
            if next_data:
                # `or {}` / `or []` also cover keys that are present but null.
                props = (next_data.get('props') or {}).get('pageProps') or {}
                items = props.get('presses') or props.get('articles') or []

                for item in items[:limit]:
                    try:
                        slug = item.get('slug') or item.get('url') or ''
                        image_url = (
                            item.get('rectangleThumbnailUrl')
                            or item.get('thumbnailUrl')
                            or item.get('imageUrl')
                            or ''
                        )
                        product_count = (
                            item.get('productCountInt')
                            or item.get('productCount')
                            or 0
                        )

                        articles.append(MybestArticle(
                            article_id=str(item.get('id', '')),
                            title=item.get('title') or item.get('name') or '',
                            category=category_name,
                            product_count=product_count,
                            image_url=image_url,
                            # The value may be a bare slug, a rooted path or
                            # an already absolute URL.
                            article_url=self._absolute_url(slug),
                            description=item.get('description') or '',
                            crawled_at=datetime.now(),
                        ))

                    except Exception as e:
                        logger.debug("[mybest] 解析文章項目失敗: %s", e)
                        continue

            # Fallback: scan the HTML directly.
            if not articles:
                soup = BeautifulSoup(html, 'html.parser')
                article_cards = soup.select('a[href*="/"]')

                seen_urls = set()
                # Look at more anchors than `limit`: many are not articles.
                for card in article_cards[:limit * 3]:
                    try:
                        href = card.get('href', '')
                        if not href or '/categories/' in href or href == '/':
                            continue

                        title_elem = card.select_one('[class*="title"], h2, h3')
                        if not title_elem:
                            continue

                        title = title_elem.get_text(strip=True)
                        if not title or len(title) < 5:
                            continue

                        article_url = self._absolute_url(href)
                        if article_url in seen_urls:
                            continue
                        seen_urls.add(article_url)

                        image_url = ''
                        img_elem = card.select_one('img')
                        if img_elem:
                            image_url = img_elem.get('src') or img_elem.get('data-src', '')

                        # Last path segment; a trailing slash must not
                        # produce an empty id.
                        article_id = href.rstrip('/').rsplit('/', 1)[-1]

                        articles.append(MybestArticle(
                            article_id=article_id,
                            title=title,
                            category=category_name,
                            product_count=0,
                            image_url=image_url,
                            article_url=article_url,
                            description='',
                            crawled_at=datetime.now(),
                        ))

                        if len(articles) >= limit:
                            break

                    except Exception as e:
                        logger.debug("[mybest] 解析 HTML 文章失敗: %s", e)
                        continue

            logger.info("[mybest] 解析到 %s 篇文章", len(articles))
            return articles

        except Exception as e:
            logger.error("[mybest] 解析文章列表失敗: %s", e)
            return []

    def get_article_products(self, article_url: str, limit: int = 10) -> Tuple[bool, str, List["MybestProduct"]]:
        """Fetch the recommended products listed in an article.

        Args:
            article_url: URL of the article page.
            limit: Maximum number of products to return.

        Returns:
            ``(success, message, products)``.
        """
        try:
            self._rate_limit()

            logger.info("[mybest] 取得文章商品: %s", article_url)

            response = self.session.get(article_url, timeout=self.timeout)
            if response.status_code != 200:
                return False, f"HTTP {response.status_code}", []

            products = self._parse_article_products(response.text, article_url, limit)
            if products:
                return True, f"成功取得 {len(products)} 個商品", products
            return False, "無法解析商品資料", []

        except requests.Timeout:
            logger.error("[mybest] 請求超時")
            return False, "請求超時", []
        except Exception as e:
            logger.error("[mybest] 取得文章商品失敗: %s", e)
            return False, str(e), []

    def _parse_article_products(self, html: str, article_url: str, limit: int) -> List["MybestProduct"]:
        """Parse the recommended products of an article page.

        Reads ``press`` / ``article`` from ``__NEXT_DATA__`` first; when that
        yields nothing, falls back to elements whose class contains
        ``ranking`` or ``product``. ``article_url`` is accepted for interface
        compatibility and is not used.
        """
        products: List["MybestProduct"] = []

        try:
            # Article title from the page heading.
            article_title = ''
            soup = BeautifulSoup(html, 'html.parser')
            title_elem = soup.select_one('h1')
            if title_elem:
                article_title = title_elem.get_text(strip=True)

            # Preferred source: the Next.js page data.
            next_data = self._extract_next_data(html)
            if next_data:
                # `or {}` / `or []` also cover keys that are present but null.
                props = (next_data.get('props') or {}).get('pageProps') or {}
                press = props.get('press') or props.get('article') or {}
                items = press.get('products') or press.get('items') or []

                if not article_title:
                    article_title = press.get('title') or ''

                for idx, item in enumerate(items[:limit], 1):
                    try:
                        price_val = item.get('price') or item.get('lowestPrice')

                        products.append(MybestProduct(
                            product_id=str(item.get('id', '')),
                            name=item.get('name') or item.get('title') or '',
                            brand=item.get('brand') or item.get('maker') or '',
                            rank=idx,
                            price=self._parse_price(price_val),
                            image_url=item.get('imageUrl') or item.get('thumbnailUrl') or '',
                            product_url=item.get('url') or item.get('affiliateUrl') or '',
                            article_title=article_title,
                            crawled_at=datetime.now(),
                        ))

                    except Exception as e:
                        logger.debug("[mybest] 解析商品項目失敗: %s", e)
                        continue

            # Fallback: scan the HTML directly.
            if not products:
                product_sections = soup.select('[class*="ranking"], [class*="product"]')

                seen_names = set()
                for section in product_sections:
                    # `limit` counts accepted products, not inspected sections.
                    if len(products) >= limit:
                        break

                    try:
                        name_elem = section.select_one('h2, h3, [class*="name"], [class*="title"]')
                        if not name_elem:
                            continue

                        name = name_elem.get_text(strip=True)
                        if not name or len(name) < 3:
                            continue

                        # The class selector also matches wrappers and nested
                        # elements, so the same product can show up twice.
                        if name in seen_names:
                            continue
                        seen_names.add(name)

                        brand_elem = section.select_one('[class*="brand"], [class*="maker"]')
                        brand = brand_elem.get_text(strip=True) if brand_elem else ''

                        image_url = ''
                        img_elem = section.select_one('img')
                        if img_elem:
                            image_url = img_elem.get('src') or img_elem.get('data-src', '')

                        link_elem = section.select_one('a[href*="http"]')
                        product_url = link_elem.get('href', '') if link_elem else ''

                        # Rank by acceptance order so skipped sections leave
                        # no gaps in the ranking.
                        rank = len(products) + 1

                        products.append(MybestProduct(
                            product_id=str(rank),
                            name=name,
                            brand=brand,
                            rank=rank,
                            price=None,
                            image_url=image_url,
                            product_url=product_url,
                            article_title=article_title,
                            crawled_at=datetime.now(),
                        ))

                    except Exception as e:
                        logger.debug("[mybest] 解析 HTML 商品失敗: %s", e)
                        continue

            logger.info("[mybest] 解析到 %s 個商品", len(products))
            return products

        except Exception as e:
            logger.error("[mybest] 解析文章商品失敗: %s", e)
            return []

    def get_latest_presses(self, limit: int = 20) -> Tuple[bool, str, List["MybestArticle"]]:
        """Fetch the articles shown on the homepage.

        Args:
            limit: Maximum number of articles to return.

        Returns:
            ``(success, message, articles)``.
        """
        try:
            self._rate_limit()

            url = f"{self.BASE_URL}/"
            logger.info("[mybest] 取得首頁文章: %s", url)

            response = self.session.get(url, timeout=self.timeout)
            if response.status_code != 200:
                return False, f"HTTP {response.status_code}", []

            articles = self._parse_homepage(response.text, limit)
            if articles:
                return True, f"成功取得 {len(articles)} 篇文章", articles
            return False, "無法解析文章資料", []

        except requests.Timeout:
            logger.error("[mybest] 請求超時")
            return False, "請求超時", []
        except Exception as e:
            logger.error("[mybest] 取得最新文章失敗: %s", e)
            return False, str(e), []

    def _parse_homepage(self, html: str, limit: int) -> List["MybestArticle"]:
        """Parse the homepage ``__NEXT_DATA__`` into articles.

        Entries of ``displayableContents`` come first; the remaining slots up
        to ``limit`` are filled from ``displayableItemLists``. Returns an
        empty list when the page carries no ``__NEXT_DATA__``.
        """
        articles: List["MybestArticle"] = []

        try:
            next_data = self._extract_next_data(html)
            if next_data:
                # `or {}` / `or []` also cover keys that are present but null.
                props = (next_data.get('props') or {}).get('pageProps') or {}
                top_data = (props.get('data') or {}).get('top') or {}

                # Articles from displayableContents.
                contents = top_data.get('displayableContents') or []
                for item in contents[:limit]:
                    try:
                        press_id = str(item.get('pressId', ''))
                        article_url = (
                            self._absolute_url(item.get('url'))
                            or f"{self.BASE_URL}/{press_id}"
                        )

                        image_url = (
                            item.get('rectangleThumbnailUrl')
                            or item.get('thumbnailProductImageUrl')
                            or ''
                        )

                        # The merchandise name doubles as the title.
                        merchandise = item.get('merchandise', '')
                        if isinstance(merchandise, str) and merchandise:
                            title = f"{merchandise} 推薦排行榜"
                        elif isinstance(merchandise, dict):
                            title = merchandise.get('name', '') or f"精選推薦 #{press_id}"
                        else:
                            title = f"精選推薦 #{press_id}"

                        articles.append(MybestArticle(
                            article_id=press_id,
                            title=title,
                            category='精選推薦',
                            product_count=item.get('productCountInt') or 0,
                            image_url=image_url,
                            article_url=article_url,
                            description='',
                            crawled_at=datetime.now(),
                        ))

                    except Exception as e:
                        logger.debug("[mybest] 解析首頁文章失敗: %s", e)
                        continue

                # Curated lists from displayableItemLists fill what is left.
                remaining = max(limit - len(articles), 0)
                item_lists = top_data.get('displayableItemLists') or []
                for item in item_lists[:remaining]:
                    try:
                        list_id = str(item.get('id', ''))
                        title = item.get('title', '')
                        if not title:
                            continue

                        articles.append(MybestArticle(
                            article_id=f"list_{list_id}",
                            title=title,
                            category='達人推薦',
                            product_count=len(item.get('itemParts') or []),
                            image_url=item.get('thumbnailCardSquareUrl') or '',
                            article_url=f"{self.BASE_URL}/item_lists/{list_id}",
                            description=(item.get('introduction') or '')[:100],
                            crawled_at=datetime.now(),
                        ))

                    except Exception as e:
                        logger.debug("[mybest] 解析精選清單失敗: %s", e)
                        continue

            logger.info("[mybest] 從首頁解析到 %s 篇文章", len(articles))
            return articles[:limit]

        except Exception as e:
            logger.error("[mybest] 解析首頁失敗: %s", e)
            return []

    def search_articles(self, keyword: str, limit: int = 10) -> Tuple[bool, str, List["MybestArticle"]]:
        """Search articles by keyword.

        Args:
            keyword: Search keyword.
            limit: Maximum number of articles to return.

        Returns:
            ``(success, message, articles)``.
        """
        try:
            self._rate_limit()

            # NOTE(review): get_articles queries /search_contents while this
            # method queries /search - confirm which endpoint is current.
            url = f"{self.BASE_URL}/search"
            params = {'q': keyword}

            logger.info("[mybest] 搜尋文章: %s", keyword)

            response = self.session.get(url, params=params, timeout=self.timeout)
            if response.status_code != 200:
                return False, f"HTTP {response.status_code}", []

            articles = self._parse_articles(response.text, 'search', limit)
            if articles:
                return True, f"成功取得 {len(articles)} 篇文章", articles
            return False, "無搜尋結果", []

        except requests.Timeout:
            logger.error("[mybest] 搜尋超時")
            return False, "請求超時", []
        except Exception as e:
            logger.error("[mybest] 搜尋失敗: %s", e)
            return False, str(e), []
|
||
|
||
|
||
# Shared module-level crawler; created on the first get_crawler() call.
_crawler_instance: Optional[MybestCrawler] = None


def get_crawler() -> MybestCrawler:
    """Return the shared crawler, creating it on first use."""
    global _crawler_instance
    if _crawler_instance is not None:
        return _crawler_instance
    _crawler_instance = MybestCrawler()
    return _crawler_instance
|
||
|
||
|
||
def get_mybest_articles(category: str = 'skincare', limit: int = 10) -> Tuple[bool, str, List[dict]]:
    """Fetch mybest articles for a category as plain dicts.

    Convenience wrapper around the shared crawler's ``get_articles``.

    Args:
        category: Category code.
        limit: Maximum number of articles to return.

    Returns:
        ``(success, message, article_dicts)``.
    """
    ok, note, found = get_crawler().get_articles(category, limit)
    payload = [article.to_dict() for article in found]
    return ok, note, payload
|
||
|
||
|
||
def get_mybest_latest(limit: int = 20) -> Tuple[bool, str, List[dict]]:
    """Fetch the latest mybest articles as plain dicts.

    Convenience wrapper around the shared crawler's ``get_latest_presses``.

    Args:
        limit: Maximum number of articles to return.

    Returns:
        ``(success, message, article_dicts)``.
    """
    ok, note, found = get_crawler().get_latest_presses(limit)
    payload = [article.to_dict() for article in found]
    return ok, note, payload
|
||
|
||
|
||
def search_mybest_articles(keyword: str, limit: int = 10) -> Tuple[bool, str, List[dict]]:
    """Search mybest articles by keyword and return them as plain dicts.

    Convenience wrapper around the shared crawler's ``search_articles``.

    Args:
        keyword: Search keyword.
        limit: Maximum number of articles to return.

    Returns:
        ``(success, message, article_dicts)``.
    """
    ok, note, found = get_crawler().search_articles(keyword, limit)
    payload = [article.to_dict() for article in found]
    return ok, note, payload
|
||
|
||
|
||
def get_mybest_categories() -> Dict[str, str]:
    """Return a copy of the category code -> display name mapping."""
    return dict(MybestCrawler.CATEGORY_NAMES)
|
||
|
||
|
||
if __name__ == '__main__':
    # Manual smoke test: runs the three convenience functions against the
    # live site and prints a short summary of each result.
    logging.basicConfig(level=logging.INFO)

    print("=== mybest 爬蟲測試 ===\n")

    # [1] Latest articles.
    print("[1] 測試最新文章")
    _, latest_msg, latest = get_mybest_latest(limit=5)
    print(f"結果: {latest_msg}")
    if latest:
        print("最新文章:")
        for entry in latest:
            print(f" - {entry['title'][:40]}... ({entry['product_count']}款商品)")

    # [2] Articles of one category.
    print("\n[2] 測試分類文章 (分類: 基礎保養)")
    _, category_msg, by_category = get_mybest_articles('skincare', limit=5)
    print(f"結果: {category_msg}")
    if by_category:
        print("分類文章:")
        for entry in by_category[:3]:
            print(f" - {entry['title'][:40]}...")

    # [3] Keyword search.
    print("\n[3] 測試搜尋 (關鍵字: 面膜)")
    _, search_msg, hits = search_mybest_articles('面膜', limit=5)
    print(f"結果: {search_msg}")
    if hits:
        print("搜尋結果:")
        for entry in hits[:3]:
            print(f" - {entry['title'][:40]}...")
|