#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
mybest Taiwan crawler service.

Crawls recommendation articles and product rankings from mybest Taiwan.

Site: https://tw.my-best.com/

Supported categories:
- Beauty and skincare
- Health and wellness
- Mother and baby
- Household goods
"""

import re
import json
import time
import logging
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass, asdict
from datetime import datetime
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# Article links on the search page look like "/12345".
_ARTICLE_HREF_RE = re.compile(r'^/\d+$')

# Integer part of the first number in a price string; commas are accepted as
# thousands separators ("NT$1,299.00" -> "1,299").
_PRICE_RE = re.compile(r'\d[\d,]*')


def _as_dict(value) -> dict:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _as_list(value) -> list:
    """Return *value* if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def _page_props(next_data: dict) -> dict:
    """Return ``next_data['props']['pageProps']``, tolerating missing or null levels."""
    return _as_dict(_as_dict(next_data.get('props')).get('pageProps'))


@dataclass
class MybestArticle:
    """A mybest article as produced by the crawler's parsers."""
    article_id: str        # Article ID
    title: str             # Article title
    category: str          # Category (display name)
    product_count: int     # Number of recommended products
    image_url: str         # Thumbnail URL
    article_url: str       # Article page URL
    description: str       # Short introduction
    crawled_at: datetime   # Time the record was crawled

    def to_dict(self) -> dict:
        """Return the article as a dict with ``crawled_at`` as an ISO 8601 string."""
        data = asdict(self)
        data['crawled_at'] = self.crawled_at.isoformat()
        return data


@dataclass
class MybestProduct:
    """A recommended product extracted from a mybest article."""
    product_id: str        # Product ID
    name: str              # Product name
    brand: str             # Brand
    rank: int              # Rank within the article
    price: Optional[int]   # Price, or None when unknown
    image_url: str         # Image URL
    product_url: str       # Product link
    article_title: str     # Title of the article the product belongs to
    crawled_at: datetime   # Time the record was crawled

    def to_dict(self) -> dict:
        """Return the product as a dict with ``crawled_at`` as an ISO 8601 string."""
        data = asdict(self)
        data['crawled_at'] = self.crawled_at.isoformat()
        return data


class MybestCrawler:
    """Crawler for mybest Taiwan."""

    BASE_URL = 'https://tw.my-best.com'

    # Search keywords per category code. Only the first whitespace-separated
    # keyword is actually sent to /search_contents (see get_articles).
    CATEGORIES = {
        # Beauty and skincare
        'skincare': '基礎保養 精華液 乳液',
        'makeup': '彩妝 口紅 眼影',
        'hair_care': '洗髮精 護髮 頭髮保養',
        'body_care': '身體乳 身體保養',
        'sunscreen': '防曬 防曬乳',
        'mask': '面膜 保濕面膜',
        # Health and wellness
        'health': '健康食品 保健',
        'supplement': '營養補充 維他命',
        'diet': '減重 瘦身',
        # Mother and baby
        'baby': '嬰兒用品 奶瓶 尿布',
        'maternity': '孕婦用品 孕婦',
        'kids': '兒童用品 兒童',
        # Home and living
        'home': '居家用品 收納',
        'kitchen': '廚房用品 鍋具',
        # Consumer electronics and appliances
        'electronics': '3C 電子 藍牙耳機',
        'appliances': '家電 吸塵器',
    }

    # Display (Chinese) name for each category code.
    CATEGORY_NAMES = {
        'skincare': '基礎保養',
        'makeup': '彩妝',
        'hair_care': '頭髮保養',
        'body_care': '身體保養',
        'sunscreen': '防曬',
        'mask': '面膜',
        'health': '健康食品',
        'supplement': '營養補充',
        'diet': '減重瘦身',
        'baby': '嬰兒用品',
        'maternity': '孕婦用品',
        'kids': '兒童用品',
        'home': '居家用品',
        'kitchen': '廚房用品',
        'electronics': '3C 電子',
        'appliances': '家電',
    }

    DEFAULT_HEADERS = {
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/120.0.0.0 Safari/537.36'
        ),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
        'Referer': 'https://tw.my-best.com/',
    }

    def __init__(self, timeout: int = 30, delay: float = 1.0):
        """
        Create a crawler with its own HTTP session.

        Args:
            timeout: Per-request timeout in seconds passed to ``session.get``.
            delay: Minimum number of seconds between two requests.
        """
        self.timeout = timeout
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update(self.DEFAULT_HEADERS)
        # Monotonic timestamp of the previous request; None until the first one.
        self._last_request_time: Optional[float] = None

    def _rate_limit(self):
        """Sleep as needed so consecutive requests are at least ``delay`` seconds apart."""
        # A monotonic clock keeps the wait bounded by ``delay`` even if the
        # system wall clock is adjusted between requests.
        if self._last_request_time is not None:
            remaining = self.delay - (time.monotonic() - self._last_request_time)
            if remaining > 0:
                time.sleep(remaining)
        self._last_request_time = time.monotonic()

    def _fetch_html(self, url: str, params: Optional[dict] = None) -> Tuple[Optional[str], str]:
        """
        GET *url* through the rate limiter and return the response body.

        Args:
            url: Absolute URL to request.
            params: Optional query-string parameters.

        Returns:
            ``(html, '')`` on HTTP 200, otherwise ``(None, "HTTP <status>")``.
            Exceptions raised by ``session.get`` propagate to the caller.
        """
        self._rate_limit()
        response = self.session.get(url, params=params, timeout=self.timeout)
        if response.status_code != 200:
            return None, f"HTTP {response.status_code}"
        return response.text, ''

    def _absolute_url(self, url) -> str:
        """Resolve *url* against ``BASE_URL``; absolute URLs pass through, empty input gives ''."""
        if not url:
            return ''
        return urljoin(f"{self.BASE_URL}/", str(url))

    @staticmethod
    def _parse_price(value) -> Optional[int]:
        """
        Convert a raw price value to an integer amount.

        Numbers are truncated to int. For strings, the integer part of the
        first number is used, with commas treated as thousands separators, so
        ``"NT$1,299.00"`` gives 1299. Falsy or unparseable input gives None.
        """
        if not value or isinstance(value, bool):
            return None
        if isinstance(value, (int, float)):
            try:
                return int(value)
            except (ValueError, OverflowError):  # NaN or infinity
                return None
        if isinstance(value, str):
            match = _PRICE_RE.search(value)
            if match:
                return int(match.group().replace(',', ''))
        return None

    def _extract_next_data(self, html: str) -> Optional[dict]:
        """
        Extract the Next.js page data embedded in *html*.

        Returns:
            The decoded JSON object from the ``<script id="__NEXT_DATA__">``
            tag, or None when the tag is missing, empty, not valid JSON, or
            does not decode to a dict.
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')
            script = soup.find('script', {'id': '__NEXT_DATA__'})
            if script and script.string:
                data = json.loads(script.string)
                # Callers use dict access, so anything else counts as "no data".
                return data if isinstance(data, dict) else None
        except Exception as e:
            logger.debug(f"[mybest] 提取 __NEXT_DATA__ 失敗: {e}")
        return None

    def get_articles(self, category: str = 'skincare', limit: int = 10) -> Tuple[bool, str, List[MybestArticle]]:
        """
        Get the article list for a category via the search page.

        An unknown category code falls back to the 'skincare' keyword. When
        the search page yields no articles, the home page articles are
        returned instead (see get_latest_presses).

        Args:
            category: Category code (see CATEGORIES).
            limit: Maximum number of articles to return.

        Returns:
            (success, message, articles)
        """
        try:
            # Look up the search keywords for the category; only the first is used.
            search_keyword = self.CATEGORIES.get(category, self.CATEGORIES['skincare'])
            keyword = search_keyword.split()[0]

            url = f"{self.BASE_URL}/search_contents"
            logger.info(f"[mybest] 搜尋文章: {category} -> {keyword}")

            html, error = self._fetch_html(url, params={'q': keyword})
            if html is None:
                return False, error, []

            articles = self._parse_search_page(html, category, limit)
            if articles:
                return True, f"成功取得 {len(articles)} 篇文章", articles

            # Fallback: use the home page instead.
            return self.get_latest_presses(limit)

        except requests.Timeout:
            logger.error("[mybest] 請求超時")
            return False, "請求超時", []
        except Exception as e:
            logger.error(f"[mybest] 取得文章列表失敗: {e}")
            return False, str(e), []

    def _parse_search_page(self, html: str, category: str, limit: int) -> List[MybestArticle]:
        """
        Parse a search result page by scanning its HTML for article links.

        Args:
            html: Search page HTML.
            category: Category code; mapped through CATEGORY_NAMES for output.
            limit: Maximum number of articles to return.

        Returns:
            Articles in document order, one per article id; [] on failure.
        """
        articles = []
        category_name = self.CATEGORY_NAMES.get(category, category)

        try:
            soup = BeautifulSoup(html, 'html.parser')
            seen_ids = set()

            for link in soup.find_all('a', href=_ARTICLE_HREF_RE):
                if len(articles) >= limit:
                    break

                try:
                    article_id = link.get('href', '').strip('/')
                    if article_id in seen_ids:
                        continue

                    # Title: prefer a heading/text element inside the link,
                    # then fall back to the link's own text.
                    title = ''
                    title_elem = link.find(['h2', 'h3', 'span', 'p'])
                    if title_elem:
                        title = title_elem.get_text(strip=True)
                    if not title:
                        title = link.get_text(strip=True)

                    # Skip links whose title is missing or too short.
                    if not title or len(title) < 5:
                        continue

                    # Mark the id as seen only once the link is accepted, so an
                    # image-only link does not hide a later titled link to the
                    # same article.
                    seen_ids.add(article_id)

                    image_url = ''
                    img = link.find('img')
                    if img:
                        image_url = img.get('src') or img.get('data-src', '')

                    articles.append(MybestArticle(
                        article_id=article_id,
                        title=title[:100],
                        category=category_name,
                        product_count=0,
                        image_url=image_url,
                        article_url=f"{self.BASE_URL}/{article_id}",
                        description='',
                        crawled_at=datetime.now(),
                    ))

                except Exception as e:
                    logger.debug(f"[mybest] 解析文章連結失敗: {e}")
                    continue

            logger.info(f"[mybest] 從搜尋頁解析到 {len(articles)} 篇文章")
            return articles

        except Exception as e:
            logger.error(f"[mybest] 解析搜尋頁面失敗: {e}")
            return []

    def _parse_articles(self, html: str, category: str, limit: int) -> List[MybestArticle]:
        """
        Parse an article listing page.

        The embedded Next.js data is tried first (``presses`` or ``articles``
        under ``pageProps``); if it yields nothing, anchors in the HTML are
        scanned instead.

        Args:
            html: Page HTML.
            category: Category code; mapped through CATEGORY_NAMES for output.
            limit: Maximum number of articles to return.

        Returns:
            Parsed articles; [] on failure.
        """
        articles = []
        category_name = self.CATEGORY_NAMES.get(category, category)

        try:
            # Preferred source: Next.js data.
            next_data = self._extract_next_data(html)
            if next_data:
                props = _page_props(next_data)
                items = _as_list(props.get('presses')) or _as_list(props.get('articles'))

                for item in items[:limit]:
                    try:
                        slug = item.get('slug', '') or item.get('url', '')
                        image_url = (
                            item.get('rectangleThumbnailUrl')
                            or item.get('thumbnailUrl')
                            or item.get('imageUrl')
                            or ''
                        )
                        product_count = (
                            item.get('productCountInt', 0)
                            or item.get('productCount', 0)
                            or 0
                        )

                        articles.append(MybestArticle(
                            article_id=str(item.get('id', '')),
                            title=item.get('title', '') or item.get('name', '') or '',
                            category=category_name,
                            product_count=product_count,
                            image_url=image_url,
                            # The slug/url may be a bare slug, a rooted path or
                            # a full URL; resolve it instead of concatenating.
                            article_url=self._absolute_url(slug),
                            description=item.get('description', '') or '',
                            crawled_at=datetime.now(),
                        ))
                    except Exception as e:
                        logger.debug(f"[mybest] 解析文章項目失敗: {e}")
                        continue

            # Fallback: parse the HTML directly.
            if not articles:
                soup = BeautifulSoup(html, 'html.parser')
                seen_urls = set()

                # Inspect more anchors than requested because many of them are
                # not article links.
                for card in soup.select('a[href*="/"]')[:limit * 3]:
                    if len(articles) >= limit:
                        break

                    try:
                        href = card.get('href', '')
                        if not href or '/categories/' in href or href == '/':
                            continue

                        title_elem = card.select_one('[class*="title"], h2, h3')
                        if not title_elem:
                            continue

                        title = title_elem.get_text(strip=True)
                        if not title or len(title) < 5:
                            continue

                        article_url = self._absolute_url(href)
                        if article_url in seen_urls:
                            continue
                        seen_urls.add(article_url)

                        image_url = ''
                        img_elem = card.select_one('img')
                        if img_elem:
                            image_url = img_elem.get('src') or img_elem.get('data-src', '')

                        # ID is the last path segment, ignoring any trailing
                        # slash, query string or fragment.
                        article_id = urlparse(href).path.rstrip('/').rsplit('/', 1)[-1]

                        articles.append(MybestArticle(
                            article_id=article_id,
                            title=title,
                            category=category_name,
                            product_count=0,
                            image_url=image_url,
                            article_url=article_url,
                            description='',
                            crawled_at=datetime.now(),
                        ))

                    except Exception as e:
                        logger.debug(f"[mybest] 解析 HTML 文章失敗: {e}")
                        continue

            logger.info(f"[mybest] 解析到 {len(articles)} 篇文章")
            return articles

        except Exception as e:
            logger.error(f"[mybest] 解析文章列表失敗: {e}")
            return []

    def get_article_products(self, article_url: str, limit: int = 10) -> Tuple[bool, str, List[MybestProduct]]:
        """
        Get the recommended products listed in an article.

        Args:
            article_url: Article URL.
            limit: Maximum number of products to return.

        Returns:
            (success, message, products)
        """
        try:
            logger.info(f"[mybest] 取得文章商品: {article_url}")

            html, error = self._fetch_html(article_url)
            if html is None:
                return False, error, []

            products = self._parse_article_products(html, article_url, limit)
            if products:
                return True, f"成功取得 {len(products)} 個商品", products
            return False, "無法解析商品資料", []

        except requests.Timeout:
            logger.error("[mybest] 請求超時")
            return False, "請求超時", []
        except Exception as e:
            logger.error(f"[mybest] 取得文章商品失敗: {e}")
            return False, str(e), []

    def _parse_article_products(self, html: str, article_url: str, limit: int) -> List[MybestProduct]:
        """
        Parse the recommended products inside an article page.

        The embedded Next.js data is tried first; if it yields nothing, the
        HTML is scanned for ranking/product sections.

        Args:
            html: Article page HTML.
            article_url: URL the HTML came from (not used by the parser itself).
            limit: Maximum number of products to return.

        Returns:
            Parsed products; [] on failure.
        """
        products = []

        try:
            # Article title from the first <h1>, if any.
            article_title = ''
            soup = BeautifulSoup(html, 'html.parser')
            title_elem = soup.select_one('h1')
            if title_elem:
                article_title = title_elem.get_text(strip=True)

            # Preferred source: Next.js data.
            next_data = self._extract_next_data(html)
            if next_data:
                props = _page_props(next_data)
                press = _as_dict(props.get('press')) or _as_dict(props.get('article'))
                items = _as_list(press.get('products')) or _as_list(press.get('items'))

                if not article_title:
                    article_title = press.get('title', '') or ''

                for idx, item in enumerate(items[:limit], 1):
                    try:
                        products.append(MybestProduct(
                            product_id=str(item.get('id', '')),
                            name=item.get('name', '') or item.get('title', '') or '',
                            brand=item.get('brand', '') or item.get('maker', '') or '',
                            rank=idx,
                            price=self._parse_price(item.get('price') or item.get('lowestPrice')),
                            image_url=item.get('imageUrl') or item.get('thumbnailUrl') or '',
                            product_url=item.get('url') or item.get('affiliateUrl') or '',
                            article_title=article_title,
                            crawled_at=datetime.now(),
                        ))
                    except Exception as e:
                        logger.debug(f"[mybest] 解析商品項目失敗: {e}")
                        continue

            # Fallback: parse the HTML directly.
            if not products:
                # Nested elements can match the selector and resolve to the
                # same product name, so names are de-duplicated.
                seen_names = set()

                for section in soup.select('[class*="ranking"], [class*="product"]'):
                    if len(products) >= limit:
                        break

                    try:
                        name_elem = section.select_one('h2, h3, [class*="name"], [class*="title"]')
                        if not name_elem:
                            continue

                        name = name_elem.get_text(strip=True)
                        if not name or len(name) < 3 or name in seen_names:
                            continue
                        seen_names.add(name)

                        brand_elem = section.select_one('[class*="brand"], [class*="maker"]')
                        brand = brand_elem.get_text(strip=True) if brand_elem else ''

                        image_url = ''
                        img_elem = section.select_one('img')
                        if img_elem:
                            image_url = img_elem.get('src') or img_elem.get('data-src', '')

                        link_elem = section.select_one('a[href*="http"]')
                        product_url = link_elem.get('href', '') if link_elem else ''

                        # Rank counts accepted products, so skipped sections do
                        # not leave gaps in the numbering.
                        rank = len(products) + 1
                        products.append(MybestProduct(
                            product_id=str(rank),
                            name=name,
                            brand=brand,
                            rank=rank,
                            price=None,
                            image_url=image_url,
                            product_url=product_url,
                            article_title=article_title,
                            crawled_at=datetime.now(),
                        ))

                    except Exception as e:
                        logger.debug(f"[mybest] 解析 HTML 商品失敗: {e}")
                        continue

            logger.info(f"[mybest] 解析到 {len(products)} 個商品")
            return products

        except Exception as e:
            logger.error(f"[mybest] 解析文章商品失敗: {e}")
            return []

    def get_latest_presses(self, limit: int = 20) -> Tuple[bool, str, List[MybestArticle]]:
        """
        Get the latest recommended articles from the home page.

        Args:
            limit: Maximum number of articles to return.

        Returns:
            (success, message, articles)
        """
        try:
            url = f"{self.BASE_URL}/"
            logger.info(f"[mybest] 取得首頁文章: {url}")

            html, error = self._fetch_html(url)
            if html is None:
                return False, error, []

            articles = self._parse_homepage(html, limit)
            if articles:
                return True, f"成功取得 {len(articles)} 篇文章", articles
            return False, "無法解析文章資料", []

        except requests.Timeout:
            logger.error("[mybest] 請求超時")
            return False, "請求超時", []
        except Exception as e:
            logger.error(f"[mybest] 取得最新文章失敗: {e}")
            return False, str(e), []

    def _parse_homepage(self, html: str, limit: int) -> List[MybestArticle]:
        """
        Parse the home page and return its recommended articles.

        Reads ``pageProps.data.top`` from the Next.js data: first
        ``displayableContents`` (articles), then ``displayableItemLists``
        (curated lists) to fill whatever remains of *limit*.

        Args:
            html: Home page HTML.
            limit: Maximum number of articles to return.

        Returns:
            Parsed articles; [] on failure or when no Next.js data is present.
        """
        articles = []

        try:
            next_data = self._extract_next_data(html)
            if next_data:
                props = _page_props(next_data)
                top_data = _as_dict(_as_dict(props.get('data')).get('top'))

                # Articles from displayableContents.
                for item in _as_list(top_data.get('displayableContents'))[:limit]:
                    try:
                        press_id = str(item.get('pressId', ''))

                        # The url field may be relative; resolve it so the
                        # result can be fetched directly.
                        article_url = (
                            self._absolute_url(item.get('url', ''))
                            or f"{self.BASE_URL}/{press_id}"
                        )

                        product_count = item.get('productCountInt', 0) or 0

                        image_url = (
                            item.get('rectangleThumbnailUrl', '')
                            or item.get('thumbnailProductImageUrl', '')
                            or ''
                        )

                        # Use the merchandise name as the title.
                        merchandise = item.get('merchandise', '')
                        if isinstance(merchandise, str) and merchandise:
                            title = f"{merchandise} 推薦排行榜"
                        elif isinstance(merchandise, dict):
                            title = merchandise.get('name', '') or f"精選推薦 #{press_id}"
                        else:
                            title = f"精選推薦 #{press_id}"

                        articles.append(MybestArticle(
                            article_id=press_id,
                            title=title,
                            category='精選推薦',
                            product_count=product_count,
                            image_url=image_url,
                            article_url=article_url,
                            description='',
                            crawled_at=datetime.now(),
                        ))
                    except Exception as e:
                        logger.debug(f"[mybest] 解析首頁文章失敗: {e}")
                        continue

                # Curated lists from displayableItemLists fill the remainder.
                remaining = max(limit - len(articles), 0)
                for item in _as_list(top_data.get('displayableItemLists'))[:remaining]:
                    try:
                        list_id = str(item.get('id', ''))
                        title = item.get('title', '')
                        if not title:
                            continue

                        articles.append(MybestArticle(
                            article_id=f"list_{list_id}",
                            title=title,
                            category='達人推薦',
                            product_count=len(_as_list(item.get('itemParts'))),
                            image_url=item.get('thumbnailCardSquareUrl', '') or '',
                            article_url=f"{self.BASE_URL}/item_lists/{list_id}",
                            description=(item.get('introduction') or '')[:100],
                            crawled_at=datetime.now(),
                        ))
                    except Exception as e:
                        logger.debug(f"[mybest] 解析精選清單失敗: {e}")
                        continue

            logger.info(f"[mybest] 從首頁解析到 {len(articles)} 篇文章")
            return articles[:limit]

        except Exception as e:
            logger.error(f"[mybest] 解析首頁失敗: {e}")
            return []

    def search_articles(self, keyword: str, limit: int = 10) -> Tuple[bool, str, List[MybestArticle]]:
        """
        Search articles by keyword.

        Args:
            keyword: Search keyword.
            limit: Maximum number of articles to return.

        Returns:
            (success, message, articles)
        """
        try:
            url = f"{self.BASE_URL}/search"
            logger.info(f"[mybest] 搜尋文章: {keyword}")

            html, error = self._fetch_html(url, params={'q': keyword})
            if html is None:
                return False, error, []

            articles = self._parse_articles(html, 'search', limit)
            if articles:
                return True, f"成功取得 {len(articles)} 篇文章", articles
            return False, "無搜尋結果", []

        except requests.Timeout:
            logger.error("[mybest] 搜尋超時")
            return False, "請求超時", []
        except Exception as e:
            logger.error(f"[mybest] 搜尋失敗: {e}")
            return False, str(e), []


# Module-level crawler instance, created on first use.
_crawler_instance: Optional[MybestCrawler] = None


def get_crawler() -> MybestCrawler:
    """Return the shared crawler instance, creating it on first call."""
    global _crawler_instance
    if _crawler_instance is None:
        _crawler_instance = MybestCrawler()
    return _crawler_instance


def get_mybest_articles(category: str = 'skincare', limit: int = 10) -> Tuple[bool, str, List[dict]]:
    """
    Get mybest articles for a category (convenience function).

    Args:
        category: Category code.
        limit: Maximum number of articles to return.

    Returns:
        (success, message, list of article dicts)
    """
    crawler = get_crawler()
    success, message, articles = crawler.get_articles(category, limit)
    return success, message, [a.to_dict() for a in articles]


def get_mybest_latest(limit: int = 20) -> Tuple[bool, str, List[dict]]:
    """
    Get the latest mybest articles (convenience function).

    Args:
        limit: Maximum number of articles to return.

    Returns:
        (success, message, list of article dicts)
    """
    crawler = get_crawler()
    success, message, articles = crawler.get_latest_presses(limit)
    return success, message, [a.to_dict() for a in articles]


def search_mybest_articles(keyword: str, limit: int = 10) -> Tuple[bool, str, List[dict]]:
    """
    Search mybest articles (convenience function).

    Args:
        keyword: Search keyword.
        limit: Maximum number of articles to return.

    Returns:
        (success, message, list of article dicts)
    """
    crawler = get_crawler()
    success, message, articles = crawler.search_articles(keyword, limit)
    return success, message, [a.to_dict() for a in articles]


def get_mybest_categories() -> Dict[str, str]:
    """Return a copy of the category code -> display name mapping."""
    return MybestCrawler.CATEGORY_NAMES.copy()


def _main() -> None:
    """Run a manual smoke test that calls the three public fetchers and prints results."""
    logging.basicConfig(level=logging.INFO)

    print("=== mybest 爬蟲測試 ===\n")

    # Latest articles
    print("[1] 測試最新文章")
    _success, msg, articles = get_mybest_latest(limit=5)
    print(f"結果: {msg}")
    if articles:
        print("最新文章:")
        for a in articles:
            print(f" - {a['title'][:40]}... ({a['product_count']}款商品)")

    # Category articles
    print("\n[2] 測試分類文章 (分類: 基礎保養)")
    _success, msg, articles = get_mybest_articles('skincare', limit=5)
    print(f"結果: {msg}")
    if articles:
        print("分類文章:")
        for a in articles[:3]:
            print(f" - {a['title'][:40]}...")

    # Search
    print("\n[3] 測試搜尋 (關鍵字: 面膜)")
    _success, msg, articles = search_mybest_articles('面膜', limit=5)
    print(f"結果: {msg}")
    if articles:
        print("搜尋結果:")
        for a in articles[:3]:
            print(f" - {a['title'][:40]}...")


if __name__ == '__main__':
    _main()