#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
COSME Taiwan (@cosme) crawler service.

Scrapes beauty / skincare ranking and review data from COSME Taiwan.
Site: https://www.cosme.net.tw/

Supported data:
- Beauty and skincare rankings
- Brand information
- Product reviews
"""

import logging
import re
import time
from dataclasses import asdict, dataclass
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# Patterns are compiled once because they are applied to every scraped item.
_PRODUCT_ID_RE = re.compile(r'/products/(\d+)')
# First numeric token in a text ("6.1", "7", ".5").
_NUMBER_RE = re.compile(r'\d+(?:\.\d+)?|\.\d+')
# "價格" followed by an ASCII or full-width colon and a (possibly
# comma-grouped) integer amount.
_PRICE_RE = re.compile(r'價格[:\uff1a]\s*(\d[\d,]*)')
# Leading "... 第N名 - " rank prefix found in image title/alt texts.
_RANK_PREFIX_RE = re.compile(r'^.*第\d+名\s*-\s*')
_NON_DIGIT_RE = re.compile(r'\D')


@dataclass
class CosmeProduct:
    """One product entry scraped from a COSME ranking or search page."""

    product_id: str         # Product ID
    name: str               # Product name
    brand: str              # Brand
    category: str           # Category
    rating: float           # Rating (0-7)
    review_count: int       # Number of reviews
    price: Optional[int]    # Price (may be missing)
    image_url: str          # Image URL
    product_url: str        # Product page URL
    rank: int               # Rank
    crawled_at: datetime    # Time the item was crawled

    def to_dict(self) -> dict:
        """Return the product as a dict with ``crawled_at`` as an ISO-8601 string."""
        data = asdict(self)
        data['crawled_at'] = self.crawled_at.isoformat()
        return data


class CosmeCrawler:
    """Crawler for COSME Taiwan ranking and product-search pages.

    All public methods return a ``(success, message, products)`` tuple and
    never raise: network and parsing errors are logged and reported through
    the ``success`` flag and ``message``.
    """

    BASE_URL = 'https://www.cosme.net.tw'

    # Category used when the caller passes an unknown category key.
    DEFAULT_CATEGORY = 'mask'

    # Category key -> ranking path, using the tags/{id}/ranking format.
    CATEGORIES = {
        # Face care
        'face_care': '/tags/1/ranking',      # Face care
        'cleanser': '/tags/12/ranking',      # Facial cleanser
        'toner': '/tags/13/ranking',         # Toner
        'serum': '/tags/78/ranking',         # Serum
        'lotion': '/tags/14/ranking',        # Lotion
        'cream': '/tags/15/ranking',         # Face cream
        'mask': '/tags/16/ranking',          # Mask / gel cream
        'eye_care': '/tags/87/ranking',      # Eye cream
        'sunscreen': '/tags/10/ranking',     # Sunscreen
        # Makeup
        'makeup': '/tags/2/ranking',         # Makeup
        'foundation': '/tags/105/ranking',   # Liquid foundation
        'lipstick': '/tags/101/ranking',     # Lipstick
        'eye_makeup': '/tags/96/ranking',    # Eyeshadow
        'blush': '/tags/109/ranking',        # Blush
        # Body care
        'body_care': '/tags/3/ranking',      # Body care
        'body_lotion': '/tags/27/ranking',   # Body lotion
        'hand_care': '/tags/26/ranking',     # Hand cream
        # Hair
        'hair_care': '/tags/4/ranking',      # Hair care
        'shampoo': '/tags/29/ranking',       # Shampoo
        'conditioner': '/tags/30/ranking',   # Conditioner
    }

    # Category key -> Chinese display name.
    CATEGORY_NAMES = {
        'face_care': '臉部保養',
        'cleanser': '洗面乳',
        'toner': '化妝水',
        'serum': '精華液',
        'lotion': '乳液',
        'cream': '面霜',
        'mask': '面膜',
        'eye_care': '眼部保養',
        'sunscreen': '防曬',
        'makeup': '彩妝',
        'foundation': '底妝',
        'lipstick': '唇彩',
        'eye_makeup': '眼妝',
        'blush': '腮紅',
        'body_care': '身體保養',
        'body_lotion': '身體乳液',
        'hand_care': '護手霜',
        'hair_care': '頭髮保養',
        'shampoo': '洗髮精',
        'conditioner': '護髮',
    }

    DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
        'Referer': 'https://www.cosme.net.tw/',
    }

    def __init__(self, timeout: int = 30, delay: float = 1.0):
        """Create a crawler.

        Args:
            timeout: Per-request timeout in seconds, passed to ``requests``.
            delay: Minimum number of seconds between two requests.
        """
        self.timeout = timeout
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update(self.DEFAULT_HEADERS)
        # ``None`` until the first request, so the first call never sleeps.
        self._last_request_time: Optional[float] = None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _rate_limit(self):
        """Sleep as needed so requests are at least ``self.delay`` seconds apart."""
        # A monotonic clock is used so wall-clock adjustments (NTP, DST)
        # cannot produce negative or inflated waits.
        if self._last_request_time is not None:
            remaining = self.delay - (time.monotonic() - self._last_request_time)
            if remaining > 0:
                time.sleep(remaining)
        self._last_request_time = time.monotonic()

    @staticmethod
    def _response_text(response) -> str:
        """Return the decoded body of *response*.

        When the server sends a text content type without a charset,
        requests falls back to ISO-8859-1, which garbles Chinese text. In
        that case the encoding is re-detected from the body.
        """
        declared = response.encoding
        if isinstance(declared, str) and declared.lower() == 'iso-8859-1':
            response.encoding = response.apparent_encoding
        return response.text

    def _absolute_url(self, url: Optional[str]) -> str:
        """Resolve *url* against ``BASE_URL``; return '' for empty input.

        Handles absolute, protocol-relative (``//host/path``), root-relative
        and path-relative URLs.
        """
        if not url:
            return ''
        return urljoin(f"{self.BASE_URL}/", url)

    def _image_url(self, img) -> str:
        """Return the absolute image URL of an ``<img>`` tag, or '' if none."""
        if img is None:
            return ''
        return self._absolute_url(img.get('src') or img.get('data-src', ''))

    @staticmethod
    def _clamp_limit(limit: Optional[int]) -> Optional[int]:
        """Return *limit* as a slice bound; negative values become 0.

        ``None`` is passed through and means "no limit".
        """
        if limit is not None and limit < 0:
            return 0
        return limit

    @staticmethod
    def _extract_product_id(product_url: str) -> str:
        """Return the numeric id from a ``/products/<id>`` URL, or ''."""
        match = _PRODUCT_ID_RE.search(product_url or '')
        return match.group(1) if match else ''

    @staticmethod
    def _parse_rating(text: str) -> float:
        """Return the first number found in *text* as a float, or 0.0.

        Only the first numeric token is used, so a text such as "6.1 / 7"
        yields 6.1 rather than the concatenation of all its digits.
        """
        match = _NUMBER_RE.search(text or '')
        if match is None:
            return 0.0
        try:
            return float(match.group(0))
        except ValueError:
            return 0.0

    @staticmethod
    def _parse_count(text: str) -> int:
        """Return all digits of *text* joined as an int (e.g. "1,234" -> 1234), or 0."""
        digits = _NON_DIGIT_RE.sub('', text or '')
        return int(digits) if digits else 0

    @staticmethod
    def _parse_price(text: str) -> Optional[int]:
        """Return the amount following "價格:" in *text*, or ``None`` if absent.

        Thousands separators are accepted ("1,200" -> 1200).
        """
        match = _PRICE_RE.search(text or '')
        if match is None:
            return None
        try:
            return int(match.group(1).replace(',', ''))
        except ValueError:
            return None

    @staticmethod
    def _name_from_image(img) -> str:
        """Return a product name from an ``<img>`` title/alt, without rank prefix."""
        if img is None:
            return ''
        raw = img.get('title', '') or img.get('alt', '') or ''
        return _RANK_PREFIX_RE.sub('', raw)

    # ------------------------------------------------------------------
    # Rankings
    # ------------------------------------------------------------------

    def get_rankings(self, category: str = 'mask', limit: int = 10) -> Tuple[bool, str, List[CosmeProduct]]:
        """
        Fetch the ranking products of a category.

        Args:
            category: Category key (see ``CATEGORIES``). Unknown keys fall
                back to ``DEFAULT_CATEGORY`` and a warning is logged.
            limit: Maximum number of products to return.

        Returns:
            (success flag, message, list of products)
        """
        try:
            self._rate_limit()

            if category not in self.CATEGORIES:
                # Use the fallback key from here on so the products are
                # labelled with the category that was actually fetched.
                logger.warning(
                    "[COSME] 未知分類 %s,改用預設分類 %s",
                    category, self.DEFAULT_CATEGORY,
                )
                category = self.DEFAULT_CATEGORY

            url = f"{self.BASE_URL}{self.CATEGORIES[category]}"
            logger.info("[COSME] 取得排行榜: %s -> %s", category, url)

            response = self.session.get(url, timeout=self.timeout)
            if response.status_code != 200:
                return False, f"HTTP {response.status_code}", []

            products = self._parse_rankings(self._response_text(response), category, limit)
            if products:
                return True, f"成功取得 {len(products)} 個商品", products
            return False, "無法解析商品資料", []

        except requests.Timeout:
            logger.error("[COSME] 請求超時")
            return False, "請求超時", []
        except Exception as e:
            # Boundary of the public API: report instead of raising.
            logger.error("[COSME] 取得排行榜失敗: %s", e)
            return False, str(e), []

    def _parse_rankings(self, html: str, category: str, limit: int) -> List[CosmeProduct]:
        """Parse a ranking page into products.

        Args:
            html: HTML of the ranking page.
            category: Category key; mapped through ``CATEGORY_NAMES`` for
                the product's ``category`` field (unknown keys are kept as is).
            limit: Maximum number of ranking items to inspect.

        Returns:
            The parsed products; an empty list if the page cannot be parsed.
        """
        products: List[CosmeProduct] = []
        try:
            soup = BeautifulSoup(html, 'html.parser')
            category_name = self.CATEGORY_NAMES.get(category, category)
            items = soup.select('.uc-tag-ranking-item')

            # ``rank`` is the item's position on the page, so skipped items
            # leave a gap instead of shifting later ranks.
            for rank, item in enumerate(items[:self._clamp_limit(limit)], 1):
                try:
                    product = self._parse_ranking_item(item, rank, category_name)
                except Exception as e:
                    # Best effort: one malformed item must not drop the rest.
                    logger.debug("[COSME] 解析商品項目失敗: %s", e)
                    continue
                if product is not None:
                    products.append(product)

            logger.info("[COSME] 解析到 %s 個商品", len(products))
            return products

        except Exception as e:
            logger.error("[COSME] 解析排行榜失敗: %s", e)
            return []

    def _parse_ranking_item(self, item, rank: int, category_name: str) -> Optional[CosmeProduct]:
        """Build a product from one ranking item; ``None`` if it has no product link."""
        product_link = item.select_one('a[href*="/products/"]')
        if product_link is None:
            return None

        name_elem = item.select_one('.product-name a, .product-name h3')
        name = name_elem.get_text(strip=True) if name_elem is not None else ''
        if not name:
            # Fallback: take the name from the image title/alt.
            name = self._name_from_image(item.select_one('.product-image img'))

        product_url = self._absolute_url(product_link.get('href', ''))

        brand_elem = item.select_one('.brand-name a, .brand-name')
        brand = brand_elem.get_text(strip=True) if brand_elem is not None else ''

        rating_elem = item.select_one('.product-score-text.score, .score')
        rating = self._parse_rating(rating_elem.get_text(strip=True)) if rating_elem is not None else 0.0

        review_elem = item.select_one('.product-review-count')
        review_count = self._parse_count(review_elem.get_text(strip=True)) if review_elem is not None else 0

        # The price text is read from the ``.product-market-date`` element.
        price_elem = item.select_one('.product-market-date')
        price = self._parse_price(price_elem.get_text(strip=True)) if price_elem is not None else None

        return CosmeProduct(
            product_id=self._extract_product_id(product_url),
            name=name,
            brand=brand,
            category=category_name,
            rating=rating,
            review_count=review_count,
            price=price,
            image_url=self._image_url(item.select_one('.product-image img')),
            product_url=product_url,
            rank=rank,
            crawled_at=datetime.now(),
        )

    # ------------------------------------------------------------------
    # Search
    # ------------------------------------------------------------------

    def search_products(self, keyword: str, limit: int = 10) -> Tuple[bool, str, List[CosmeProduct]]:
        """
        Search products by keyword.

        Args:
            keyword: Search keyword.
            limit: Maximum number of products to return.

        Returns:
            (success flag, message, list of products)
        """
        try:
            self._rate_limit()

            url = f"{self.BASE_URL}/search/products"
            params = {'q': keyword}
            logger.info("[COSME] 搜尋商品: %s", keyword)

            response = self.session.get(url, params=params, timeout=self.timeout)
            if response.status_code != 200:
                return False, f"HTTP {response.status_code}", []

            products = self._parse_search_results(self._response_text(response), limit)
            if products:
                return True, f"成功取得 {len(products)} 個商品", products
            return False, "無搜尋結果", []

        except requests.Timeout:
            logger.error("[COSME] 搜尋超時")
            return False, "請求超時", []
        except Exception as e:
            # Boundary of the public API: report instead of raising.
            logger.error("[COSME] 搜尋失敗: %s", e)
            return False, str(e), []

    def _parse_search_results(self, html: str, limit: int) -> List[CosmeProduct]:
        """Parse a search-result page into products.

        Args:
            html: HTML of the search-result page.
            limit: Maximum number of result items to inspect.

        Returns:
            The parsed products; an empty list if the page cannot be parsed.
        """
        products: List[CosmeProduct] = []
        try:
            soup = BeautifulSoup(html, 'html.parser')
            items = soup.select('.search-product-item, .product-item, .uc-product-card')

            for rank, item in enumerate(items[:self._clamp_limit(limit)], 1):
                try:
                    product = self._parse_search_item(item, rank)
                except Exception as e:
                    # Best effort: one malformed item must not drop the rest.
                    logger.debug("[COSME] 解析搜尋項目失敗: %s", e)
                    continue
                if product is not None:
                    products.append(product)

            return products

        except Exception as e:
            logger.error("[COSME] 解析搜尋結果失敗: %s", e)
            return []

    def _parse_search_item(self, item, rank: int) -> Optional[CosmeProduct]:
        """Build a product from one search result; ``None`` if it has no product link."""
        name_elem = item.select_one('.product-name a, a[href*="/products/"]')
        if name_elem is None:
            return None

        img_elem = item.select_one('img')
        name = name_elem.get_text(strip=True)
        if not name:
            # The matched link may wrap only an image; use its title/alt.
            name = self._name_from_image(img_elem)

        product_url = self._absolute_url(name_elem.get('href', ''))

        brand_elem = item.select_one('.brand-name, .product-brand')
        brand = brand_elem.get_text(strip=True) if brand_elem is not None else ''

        rating_elem = item.select_one('.rating-score, .score')
        rating = self._parse_rating(rating_elem.get_text(strip=True)) if rating_elem is not None else 0.0

        return CosmeProduct(
            product_id=self._extract_product_id(product_url),
            name=name,
            brand=brand,
            category='搜尋結果',
            rating=rating,
            review_count=0,
            price=None,
            image_url=self._image_url(img_elem),
            product_url=product_url,
            rank=rank,
            crawled_at=datetime.now(),
        )


# Module-wide crawler instance, created lazily by get_crawler().
_crawler_instance: Optional[CosmeCrawler] = None


def get_crawler() -> CosmeCrawler:
    """Return the shared crawler instance, creating it on first use."""
    global _crawler_instance
    if _crawler_instance is None:
        _crawler_instance = CosmeCrawler()
    return _crawler_instance


def get_cosme_rankings(category: str = 'mask', limit: int = 10) -> Tuple[bool, str, List[dict]]:
    """
    Fetch a COSME ranking (convenience function).

    Args:
        category: Category key.
        limit: Maximum number of products to return.

    Returns:
        (success flag, message, list of product dicts)
    """
    success, message, products = get_crawler().get_rankings(category, limit)
    return success, message, [p.to_dict() for p in products]


def search_cosme_products(keyword: str, limit: int = 10) -> Tuple[bool, str, List[dict]]:
    """
    Search COSME products (convenience function).

    Args:
        keyword: Search keyword.
        limit: Maximum number of products to return.

    Returns:
        (success flag, message, list of product dicts)
    """
    success, message, products = get_crawler().search_products(keyword, limit)
    return success, message, [p.to_dict() for p in products]


def get_cosme_categories() -> Dict[str, str]:
    """Return a copy of the category key -> display name mapping."""
    return CosmeCrawler.CATEGORY_NAMES.copy()


def _demo() -> None:
    """Manual smoke test: print one ranking and one search result list."""
    logging.basicConfig(level=logging.INFO)

    print("=== COSME 爬蟲測試 ===\n")

    # Ranking test
    print("[1] 測試排行榜 (分類: 面膜)")
    _, msg, products = get_cosme_rankings('mask', limit=5)
    print(f"結果: {msg}")
    if products:
        print("排行榜:")
        for p in products:
            print(f"  {p['rank']}. {p['brand']} - {p['name'][:30]}... (評分: {p['rating']})")

    # Search test
    print("\n[2] 測試搜尋 (關鍵字: 保濕)")
    _, msg, products = search_cosme_products('保濕', limit=5)
    print(f"結果: {msg}")
    if products:
        print("搜尋結果:")
        for p in products[:3]:
            print(f"  - {p['brand']} - {p['name'][:30]}...")


if __name__ == '__main__':
    _demo()