Files
ewoooc/services/cosme_crawler.py
ogt 1b4f3a7bbe
Some checks failed
CD Pipeline / deploy (push) Failing after 59s
feat: EwoooC 初始化 — 完整專案推版至 Gitea
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml)
- 部署模式: rsync Python 檔案至 188 → docker restart (volume mount)
- Dockerfile/requirements 變動時自動重建 Docker image
- 部署通知: Telegram (開始/成功/失敗)
- 健康檢查: https://mo.wooo.work/health (最多 5 次重試)
- 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-19 01:21:13 +08:00

460 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
COSME 台灣 (@cosme) 爬蟲服務
爬取 COSME 台灣的美妝保養排行榜和評測資料
網站: https://www.cosme.net.tw/
支援分類:
- 美妝保養排行榜
- 品牌資訊
- 商品評價
"""
import logging
import re
import time
from dataclasses import dataclass, asdict
from datetime import datetime
from typing import List, Dict, Optional, Tuple
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
@dataclass
class CosmeProduct:
    """One product record scraped from COSME Taiwan."""

    product_id: str        # product ID
    name: str              # product name
    brand: str             # brand name
    category: str          # category label
    rating: float          # rating score (0-7)
    review_count: int      # number of reviews
    price: Optional[int]   # price; may be missing
    image_url: str         # image URL
    product_url: str       # product page URL
    rank: int              # position in the ranking / result list
    crawled_at: datetime   # time the record was crawled

    def to_dict(self) -> dict:
        """Return the record as a plain dict with ``crawled_at`` as an ISO-8601 string."""
        # Overriding an existing key keeps its position, so key order is unchanged.
        return {**asdict(self), 'crawled_at': self.crawled_at.isoformat()}
class CosmeCrawler:
    """Crawler for COSME Taiwan ranking pages and product search.

    All requests go through one ``requests.Session`` that carries
    ``DEFAULT_HEADERS`` and are spaced at least ``delay`` seconds apart.
    The public methods catch ``Exception`` and report the outcome as a
    ``(success, message, products)`` tuple.
    """

    BASE_URL = 'https://www.cosme.net.tw'

    # Category code -> ranking path, in the form /tags/{id}/ranking.
    CATEGORIES = {
        # Face care
        'face_care': '/tags/1/ranking',      # face care
        'cleanser': '/tags/12/ranking',      # facial cleanser
        'toner': '/tags/13/ranking',         # toner
        'serum': '/tags/78/ranking',         # serum
        'lotion': '/tags/14/ranking',        # lotion
        'cream': '/tags/15/ranking',         # face cream
        'mask': '/tags/16/ranking',          # mask / gel cream
        'eye_care': '/tags/87/ranking',      # eye cream
        'sunscreen': '/tags/10/ranking',     # sunscreen
        # Makeup
        'makeup': '/tags/2/ranking',         # makeup
        'foundation': '/tags/105/ranking',   # liquid foundation
        'lipstick': '/tags/101/ranking',     # lipstick
        'eye_makeup': '/tags/96/ranking',    # eye shadow
        'blush': '/tags/109/ranking',        # blush
        # Body care
        'body_care': '/tags/3/ranking',      # body care
        'body_lotion': '/tags/27/ranking',   # body lotion
        'hand_care': '/tags/26/ranking',     # hand cream
        # Hair
        'hair_care': '/tags/4/ranking',      # hair care
        'shampoo': '/tags/29/ranking',       # shampoo
        'conditioner': '/tags/30/ranking',   # conditioner
    }

    # Category code -> Chinese display name stored on each product.
    CATEGORY_NAMES = {
        'face_care': '臉部保養',
        'cleanser': '洗面乳',
        'toner': '化妝水',
        'serum': '精華液',
        'lotion': '乳液',
        'cream': '面霜',
        'mask': '面膜',
        'eye_care': '眼部保養',
        'sunscreen': '防曬',
        'makeup': '彩妝',
        'foundation': '底妝',
        'lipstick': '唇彩',
        'eye_makeup': '眼妝',
        'blush': '腮紅',
        'body_care': '身體保養',
        'body_lotion': '身體乳液',
        'hand_care': '護手霜',
        'hair_care': '頭髮保養',
        'shampoo': '洗髮精',
        'conditioner': '護髮',
    }

    DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
        'Referer': 'https://www.cosme.net.tw/',
    }

    # Category fetched when the caller passes a code missing from CATEGORIES.
    DEFAULT_CATEGORY = 'mask'

    # Patterns are compiled once because they run for every parsed item.
    _PRODUCT_ID_RE = re.compile(r'/products/(\d+)')
    _RATING_RE = re.compile(r'\d*\.?\d+')
    _RANK_PREFIX_RE = re.compile(r'^.*第\d+名\s*-\s*')
    # Accepts an ASCII or fullwidth (U+FF1A) colon and thousands separators.
    _PRICE_RE = re.compile(r'價格\s*[:\uff1a]\s*(\d[\d,]*)')

    def __init__(self, timeout: int = 30, delay: float = 1.0):
        """Create a crawler.

        Args:
            timeout: Per-request timeout in seconds.
            delay: Minimum number of seconds between two requests.
        """
        self.timeout = timeout
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update(self.DEFAULT_HEADERS)
        # -inf guarantees the first request never sleeps, whatever the
        # monotonic clock's starting value is.
        self._last_request_time = float('-inf')

    def _rate_limit(self):
        """Sleep just long enough to keep ``delay`` seconds between requests."""
        # A monotonic clock is immune to wall-clock adjustments.
        elapsed = time.monotonic() - self._last_request_time
        if elapsed < self.delay:
            time.sleep(self.delay - elapsed)
        self._last_request_time = time.monotonic()

    # ------------------------------------------------------------------
    # Pure helpers shared by the ranking and search parsers
    # ------------------------------------------------------------------

    @classmethod
    def _absolute_url(cls, href: Optional[str]) -> str:
        """Resolve *href* against ``BASE_URL``; empty or missing input gives ''."""
        href = (href or '').strip()
        if not href:
            return ''
        # urljoin handles absolute, protocol-relative (//host/x),
        # root-relative (/x) and plain relative (x) references.
        return urljoin(f"{cls.BASE_URL}/", href)

    @classmethod
    def _extract_product_id(cls, product_url: Optional[str]) -> str:
        """Return the numeric ID from a ``/products/<id>`` URL, or ''."""
        match = cls._PRODUCT_ID_RE.search(product_url or '')
        return match.group(1) if match else ''

    @classmethod
    def _parse_rating(cls, text: Optional[str]) -> float:
        """Return the first number found in *text*, or 0.0 when there is none."""
        match = cls._RATING_RE.search(text or '')
        return float(match.group()) if match else 0.0

    @staticmethod
    def _parse_count(text: Optional[str]) -> int:
        """Return the digits of *text* as an int (``'1,234 篇'`` -> 1234), or 0."""
        digits = re.sub(r'\D', '', text or '')
        return int(digits) if digits else 0

    @classmethod
    def _parse_price(cls, text: Optional[str]) -> Optional[int]:
        """Return the price following the ``價格`` label in *text*, or None."""
        match = cls._PRICE_RE.search(text or '')
        if match is None:
            return None
        return int(match.group(1).replace(',', ''))

    @classmethod
    def _clean_name(cls, name: Optional[str]) -> str:
        """Strip a leading ``...第N名 - `` ranking prefix from *name*."""
        return cls._RANK_PREFIX_RE.sub('', name or '')

    @staticmethod
    def _take(items, limit):
        """Return at most *limit* leading items; ``None`` means no limit."""
        if limit is None:
            return items
        # A negative limit would otherwise slice from the tail.
        return items[:max(limit, 0)]

    @staticmethod
    def _text_of(node) -> str:
        """Return the stripped text of a parsed element, or '' when it is None."""
        return node.get_text(strip=True) if node is not None else ''

    @classmethod
    def _image_url(cls, img) -> str:
        """Return the absolute image URL of an ``<img>`` element, or ''."""
        if img is None:
            return ''
        return cls._absolute_url(img.get('src') or img.get('data-src', ''))

    def _fetch_html(self, url: str, params: Optional[dict] = None) -> Tuple[Optional[str], str]:
        """GET *url* under the rate limit.

        Returns:
            ``(html, '')`` on HTTP 200, otherwise ``(None, 'HTTP <status>')``.

        Raises:
            requests.RequestException: Propagated from the session.
        """
        self._rate_limit()
        response = self.session.get(url, params=params, timeout=self.timeout)
        if response.status_code != 200:
            return None, f"HTTP {response.status_code}"
        # Without a charset in the headers, requests falls back to
        # ISO-8859-1 for text content, which would garble Chinese text.
        if not response.encoding or response.encoding.lower() == 'iso-8859-1':
            response.encoding = response.apparent_encoding or 'utf-8'
        return response.text, ''

    # ------------------------------------------------------------------
    # Rankings
    # ------------------------------------------------------------------

    def get_rankings(self, category: str = 'mask', limit: int = 10) -> Tuple[bool, str, List["CosmeProduct"]]:
        """Fetch the ranking page of a category.

        Args:
            category: Category code (a key of ``CATEGORIES``). An unknown
                code falls back to ``DEFAULT_CATEGORY``; the returned
                products are then labelled with the category actually fetched.
            limit: Maximum number of ranking entries to read.

        Returns:
            ``(success, message, products)``.
        """
        try:
            if category not in self.CATEGORIES:
                logger.warning("[COSME] 未知分類 %r, 改用預設分類 %s", category, self.DEFAULT_CATEGORY)
                category = self.DEFAULT_CATEGORY
            url = f"{self.BASE_URL}{self.CATEGORIES[category]}"
            logger.info("[COSME] 取得排行榜: %s -> %s", category, url)

            html, error = self._fetch_html(url)
            if html is None:
                return False, error, []

            products = self._parse_rankings(html, category, limit)
            if products:
                return True, f"成功取得 {len(products)} 個商品", products
            return False, "無法解析商品資料", []
        except requests.Timeout:
            logger.error("[COSME] 請求超時")
            return False, "請求超時", []
        except requests.RequestException as exc:
            logger.error("[COSME] 取得排行榜失敗: %s", exc)
            return False, str(exc), []
        except Exception as exc:
            # Unexpected failure: keep the traceback for diagnosis.
            logger.exception("[COSME] 取得排行榜失敗: %s", exc)
            return False, str(exc), []

    def _parse_rankings(self, html: str, category: str, limit: int) -> List["CosmeProduct"]:
        """Parse a ranking page into products.

        Items without a product link, or that fail to parse, are skipped.
        ``rank`` is the item's 1-based position on the page. Returns an
        empty list when the page itself cannot be parsed.
        """
        products = []
        try:
            soup = BeautifulSoup(html, 'html.parser')
            category_name = self.CATEGORY_NAMES.get(category, category)
            items = self._take(soup.select('.uc-tag-ranking-item'), limit)
            for rank, item in enumerate(items, 1):
                try:
                    product = self._parse_ranking_item(item, rank, category_name)
                except Exception as exc:
                    # Best effort: one malformed item must not lose the rest.
                    logger.debug("[COSME] 解析商品項目失敗: %s", exc)
                    continue
                if product is not None:
                    products.append(product)
            logger.info("[COSME] 解析到 %d 個商品", len(products))
            return products
        except Exception as exc:
            logger.error("[COSME] 解析排行榜失敗: %s", exc)
            return []

    def _parse_ranking_item(self, item, rank: int, category_name: str) -> Optional["CosmeProduct"]:
        """Build a product from one ranking item; None when it has no product link."""
        link = item.select_one('a[href*="/products/"]')
        if link is None:
            return None
        product_url = self._absolute_url(link.get('href', ''))
        img = item.select_one('.product-image img')

        name = self._text_of(item.select_one('.product-name a, .product-name h3'))
        if not name and img is not None:
            # Fallback: the image title/alt carries the name, prefixed
            # with ranking information that has to be removed.
            name = self._clean_name(img.get('title', '') or img.get('alt', ''))

        return CosmeProduct(
            product_id=self._extract_product_id(product_url),
            name=name,
            brand=self._text_of(item.select_one('.brand-name a, .brand-name')),
            category=category_name,
            rating=self._parse_rating(
                self._text_of(item.select_one('.product-score-text.score, .score'))),
            review_count=self._parse_count(
                self._text_of(item.select_one('.product-review-count'))),
            # The price is read from the text of the market-date element.
            price=self._parse_price(
                self._text_of(item.select_one('.product-market-date'))),
            image_url=self._image_url(img),
            product_url=product_url,
            rank=rank,
            crawled_at=datetime.now(),
        )

    # ------------------------------------------------------------------
    # Search
    # ------------------------------------------------------------------

    def search_products(self, keyword: str, limit: int = 10) -> Tuple[bool, str, List["CosmeProduct"]]:
        """Search products by keyword.

        Args:
            keyword: Search keyword, sent as the ``q`` query parameter.
            limit: Maximum number of result entries to read.

        Returns:
            ``(success, message, products)``.
        """
        try:
            url = f"{self.BASE_URL}/search/products"
            logger.info("[COSME] 搜尋商品: %s", keyword)

            html, error = self._fetch_html(url, params={'q': keyword})
            if html is None:
                return False, error, []

            products = self._parse_search_results(html, limit)
            if products:
                return True, f"成功取得 {len(products)} 個商品", products
            return False, "無搜尋結果", []
        except requests.Timeout:
            logger.error("[COSME] 搜尋超時")
            return False, "請求超時", []
        except requests.RequestException as exc:
            logger.error("[COSME] 搜尋失敗: %s", exc)
            return False, str(exc), []
        except Exception as exc:
            # Unexpected failure: keep the traceback for diagnosis.
            logger.exception("[COSME] 搜尋失敗: %s", exc)
            return False, str(exc), []

    def _parse_search_results(self, html: str, limit: int) -> List["CosmeProduct"]:
        """Parse a search result page into products.

        Items without a product link, or that fail to parse, are skipped.
        Returns an empty list when the page itself cannot be parsed.
        """
        products = []
        try:
            soup = BeautifulSoup(html, 'html.parser')
            items = self._take(
                soup.select('.search-product-item, .product-item, .uc-product-card'), limit)
            for rank, item in enumerate(items, 1):
                try:
                    product = self._parse_search_item(item, rank)
                except Exception as exc:
                    # Best effort: one malformed item must not lose the rest.
                    logger.debug("[COSME] 解析搜尋項目失敗: %s", exc)
                    continue
                if product is not None:
                    products.append(product)
            return products
        except Exception as exc:
            logger.error("[COSME] 解析搜尋結果失敗: %s", exc)
            return []

    def _parse_search_item(self, item, rank: int) -> Optional["CosmeProduct"]:
        """Build a product from one search result; None when it has no link."""
        link = item.select_one('.product-name a, a[href*="/products/"]')
        if link is None:
            return None
        product_url = self._absolute_url(link.get('href', ''))

        return CosmeProduct(
            product_id=self._extract_product_id(product_url),
            name=self._text_of(link),
            brand=self._text_of(item.select_one('.brand-name, .product-brand')),
            category='搜尋結果',
            rating=self._parse_rating(
                self._text_of(item.select_one('.rating-score, .score'))),
            # Review count and price are not read from search results.
            review_count=0,
            price=None,
            image_url=self._image_url(item.select_one('img')),
            product_url=product_url,
            rank=rank,
            crawled_at=datetime.now(),
        )
# Module-wide crawler instance, created on first use by get_crawler().
_crawler_instance: Optional[CosmeCrawler] = None


def get_crawler() -> CosmeCrawler:
    """Return the shared crawler instance, creating it on the first call."""
    global _crawler_instance
    if _crawler_instance is not None:
        return _crawler_instance
    _crawler_instance = CosmeCrawler()
    return _crawler_instance
def get_cosme_rankings(category: str = 'mask', limit: int = 10) -> Tuple[bool, str, List[dict]]:
    """
    Fetch a COSME ranking through the shared crawler (convenience wrapper).

    Args:
        category: Category code.
        limit: Maximum number of products to return.

    Returns:
        (success flag, message, list of product dicts)
    """
    ok, note, found = get_crawler().get_rankings(category, limit)
    serialized = [item.to_dict() for item in found]
    return ok, note, serialized
def search_cosme_products(keyword: str, limit: int = 10) -> Tuple[bool, str, List[dict]]:
    """
    Search COSME products through the shared crawler (convenience wrapper).

    Args:
        keyword: Search keyword.
        limit: Maximum number of products to return.

    Returns:
        (success flag, message, list of product dicts)
    """
    ok, note, found = get_crawler().search_products(keyword, limit)
    serialized = [item.to_dict() for item in found]
    return ok, note, serialized
def get_cosme_categories() -> Dict[str, str]:
    """Return a fresh copy of the category-code -> display-name mapping."""
    return dict(CosmeCrawler.CATEGORY_NAMES)
if __name__ == '__main__':
    # Manual smoke test: fetches one ranking and runs one search, printing the results.
    logging.basicConfig(level=logging.INFO)
    print("=== COSME 爬蟲測試 ===\n")

    # Ranking check
    print("[1] 測試排行榜 (分類: 面膜)")
    _, ranking_msg, ranking_rows = get_cosme_rankings('mask', limit=5)
    print(f"結果: {ranking_msg}")
    if ranking_rows:
        print("排行榜:")
        for row in ranking_rows:
            print(f" {row['rank']}. {row['brand']} - {row['name'][:30]}... (評分: {row['rating']})")

    # Search check
    print("\n[2] 測試搜尋 (關鍵字: 保濕)")
    _, search_msg, search_rows = search_cosme_products('保濕', limit=5)
    print(f"結果: {search_msg}")
    if search_rows:
        print("搜尋結果:")
        for row in search_rows[:3]:
            print(f" - {row['brand']} - {row['name'][:30]}...")