Some checks failed
CD Pipeline / deploy (push) Failing after 59s
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml) - 部署模式: rsync Python 檔案至 188 → docker restart (volume mount) - Dockerfile/requirements 變動時自動重建 Docker image - 部署通知: Telegram (開始/成功/失敗) - 健康檢查: https://mo.wooo.work/health (最多 5 次重試) - 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
460 lines
16 KiB
Python
460 lines
16 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
COSME 台灣 (@cosme) 爬蟲服務
|
||
|
||
爬取 COSME 台灣的美妝保養排行榜和評測資料
|
||
網站: https://www.cosme.net.tw/
|
||
|
||
支援分類:
|
||
- 美妝保養排行榜
|
||
- 品牌資訊
|
||
- 商品評價
|
||
"""
|
||
|
||
import re
|
||
import time
|
||
import logging
|
||
from typing import List, Dict, Optional, Tuple
|
||
from dataclasses import dataclass, asdict
|
||
from datetime import datetime
|
||
|
||
import requests
|
||
from bs4 import BeautifulSoup
|
||
|
||
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
||
|
||
|
||
@dataclass
class CosmeProduct:
    """One product record scraped from COSME Taiwan."""

    product_id: str          # product ID
    name: str                # product name
    brand: str               # brand name
    category: str            # category label
    rating: float            # rating score (0-7)
    review_count: int        # number of reviews
    price: Optional[int]     # price, may be missing
    image_url: str           # product image URL
    product_url: str         # product page URL
    rank: int                # ranking position
    crawled_at: datetime     # time the record was crawled

    def to_dict(self) -> dict:
        """Return the record as a plain dict with ``crawled_at`` as an ISO 8601 string."""
        # Overriding an existing key keeps the field order produced by asdict().
        return {**asdict(self), 'crawled_at': self.crawled_at.isoformat()}
|
||
|
||
|
||
class CosmeCrawler:
    """Crawler for the COSME Taiwan (@cosme) ranking and search pages.

    Pages are fetched through a single ``requests.Session`` and parsed with
    BeautifulSoup. Consecutive requests are spaced at least ``delay`` seconds
    apart. ``get_rankings`` and ``search_products`` catch ``Exception`` and
    report failure through the returned ``(success, message, products)``
    tuple instead of raising.
    """

    BASE_URL = 'https://www.cosme.net.tw'

    # Category used when a caller passes a code that is not in CATEGORIES.
    DEFAULT_CATEGORY = 'mask'

    # Category code -> ranking page path, in the form /tags/{id}/ranking.
    CATEGORIES = {
        # Facial skincare
        'face_care': '/tags/1/ranking',      # facial skincare
        'cleanser': '/tags/12/ranking',      # facial cleanser
        'toner': '/tags/13/ranking',         # toner
        'serum': '/tags/78/ranking',         # serum
        'lotion': '/tags/14/ranking',        # lotion
        'cream': '/tags/15/ranking',         # face cream
        'mask': '/tags/16/ranking',          # mask / gel cream
        'eye_care': '/tags/87/ranking',      # eye cream
        'sunscreen': '/tags/10/ranking',     # sunscreen

        # Makeup
        'makeup': '/tags/2/ranking',         # makeup
        'foundation': '/tags/105/ranking',   # liquid foundation
        'lipstick': '/tags/101/ranking',     # lipstick
        'eye_makeup': '/tags/96/ranking',    # eye shadow
        'blush': '/tags/109/ranking',        # blush

        # Body care
        'body_care': '/tags/3/ranking',      # body care
        'body_lotion': '/tags/27/ranking',   # body lotion
        'hand_care': '/tags/26/ranking',     # hand cream

        # Hair
        'hair_care': '/tags/4/ranking',      # hair care
        'shampoo': '/tags/29/ranking',       # shampoo
        'conditioner': '/tags/30/ranking',   # conditioner
    }

    # Category code -> Chinese display name stored on each product.
    CATEGORY_NAMES = {
        'face_care': '臉部保養',
        'cleanser': '洗面乳',
        'toner': '化妝水',
        'serum': '精華液',
        'lotion': '乳液',
        'cream': '面霜',
        'mask': '面膜',
        'eye_care': '眼部保養',
        'sunscreen': '防曬',
        'makeup': '彩妝',
        'foundation': '底妝',
        'lipstick': '唇彩',
        'eye_makeup': '眼妝',
        'blush': '腮紅',
        'body_care': '身體保養',
        'body_lotion': '身體乳液',
        'hand_care': '護手霜',
        'hair_care': '頭髮保養',
        'shampoo': '洗髮精',
        'conditioner': '護髮',
    }

    DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
        'Referer': 'https://www.cosme.net.tw/',
    }

    # "價格" followed by a full-width or ASCII colon and a number that may use
    # thousands separators (e.g. "1,280").
    _PRICE_RE = re.compile(r'價格[：:]\s*(\d{1,3}(?:,\d{3})+|\d+)')
    # Numeric product ID embedded in a product page URL.
    _PRODUCT_ID_RE = re.compile(r'/products/(\d+)')
    # Leading URL scheme such as "https:" or "data:".
    _URL_SCHEME_RE = re.compile(r'^[A-Za-z][A-Za-z0-9+.\-]*:')
    # Rank prefix ("... 第N名 - ") found in image title/alt text.
    _RANK_PREFIX_RE = re.compile(r'^.*第\d+名\s*-\s*')

    def __init__(self, timeout: int = 30, delay: float = 1.0):
        """Create a crawler.

        Args:
            timeout: Per-request timeout in seconds, passed to ``requests``.
            delay: Minimum number of seconds between two requests.
        """
        self.timeout = timeout
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update(self.DEFAULT_HEADERS)
        # Monotonic timestamp of the previous request; None until the first one.
        self._last_request_time: Optional[float] = None

    def _rate_limit(self):
        """Sleep as needed so requests are at least ``self.delay`` seconds apart."""
        previous = self._last_request_time
        if previous is not None:
            # A monotonic clock is immune to wall-clock adjustments, which
            # could otherwise produce a negative elapsed time and a long sleep.
            remaining = self.delay - (time.monotonic() - previous)
            if remaining > 0:
                time.sleep(remaining)
        self._last_request_time = time.monotonic()

    # ------------------------------------------------------------------
    # Small parsing helpers shared by the ranking and search parsers
    # ------------------------------------------------------------------

    @classmethod
    def _resolve_category(cls, category):
        """Return ``category`` if it is a known code, else ``DEFAULT_CATEGORY``."""
        return category if category in cls.CATEGORIES else cls.DEFAULT_CATEGORY

    @staticmethod
    def _limit_items(items, limit):
        """Return at most ``limit`` leading items; ``None`` means no cap."""
        if limit is None:
            return items
        # Clamp so a negative limit yields nothing instead of "all but the last N".
        return items[:max(limit, 0)]

    @classmethod
    def _absolute_url(cls, url):
        """Turn a possibly relative or protocol-relative URL into an absolute one."""
        url = (url or '').strip()
        if not url:
            return ''
        if cls._URL_SCHEME_RE.match(url):
            return url
        if url.startswith('//'):
            return f"https:{url}"
        if url.startswith('/'):
            return f"{cls.BASE_URL}{url}"
        return f"{cls.BASE_URL}/{url}"

    @classmethod
    def _extract_product_id(cls, product_url):
        """Return the numeric ID from a ``/products/<id>`` URL, or ``''``."""
        found = cls._PRODUCT_ID_RE.search(product_url or '')
        return found.group(1) if found else ''

    @staticmethod
    def _parse_rating(text):
        """Parse a rating from text, keeping only digits and dots; 0.0 on failure."""
        try:
            return float(re.sub(r'[^\d.]', '', text))
        except ValueError:
            return 0.0

    @staticmethod
    def _parse_count(text):
        """Parse a non-negative integer from text, keeping only digits; 0 on failure."""
        try:
            return int(re.sub(r'[^\d]', '', text))
        except ValueError:
            return 0

    @classmethod
    def _parse_price(cls, text):
        """Return the price found after "價格:" in ``text``, or ``None``."""
        found = cls._PRICE_RE.search(text or '')
        if found is None:
            return None
        return int(found.group(1).replace(',', ''))

    @staticmethod
    def _image_source(img):
        """Return the raw image URL of an ``<img>``-like object, or ``''``.

        ``img`` only needs a dict-style ``get``. ``data-src`` is used when
        ``src`` is missing or is an inline ``data:`` URI.
        """
        if img is None:
            return ''
        source = img.get('src') or ''
        if not source or source.startswith('data:'):
            source = img.get('data-src') or source
        return source

    # ------------------------------------------------------------------
    # Rankings
    # ------------------------------------------------------------------

    def get_rankings(self, category: str = 'mask', limit: int = 10) -> Tuple[bool, str, List['CosmeProduct']]:
        """Fetch the ranking page of a category.

        Args:
            category: Category code (a key of ``CATEGORIES``). An unknown code
                falls back to ``DEFAULT_CATEGORY`` and is logged as a warning.
            limit: Maximum number of ranking items to parse.

        Returns:
            ``(success, message, products)``. ``products`` is empty on failure.
        """
        try:
            self._rate_limit()

            resolved = self._resolve_category(category)
            if resolved != category:
                # Label the results with the category that is actually fetched
                # rather than with the unknown code the caller passed.
                logger.warning("[COSME] 未知分類 %s,改用預設分類 %s", category, resolved)
            url = f"{self.BASE_URL}{self.CATEGORIES[resolved]}"

            logger.info("[COSME] 取得排行榜: %s -> %s", resolved, url)

            response = self.session.get(url, timeout=self.timeout)
            if response.status_code != 200:
                return False, f"HTTP {response.status_code}", []

            products = self._parse_rankings(response.text, resolved, limit)
            if products:
                return True, f"成功取得 {len(products)} 個商品", products
            return False, "無法解析商品資料", []

        except requests.Timeout:
            logger.error("[COSME] 請求超時")
            return False, "請求超時", []
        except Exception as e:
            logger.error("[COSME] 取得排行榜失敗: %s", e)
            return False, str(e), []

    def _parse_rankings(self, html: str, category: str, limit: int) -> List['CosmeProduct']:
        """Parse a ranking page into products.

        ``rank`` is the 1-based position of the item among the first ``limit``
        ranking items, so items that are skipped leave gaps in the sequence.
        Returns an empty list if the page cannot be parsed at all.
        """
        products = []

        try:
            soup = BeautifulSoup(html, 'html.parser')
            ranking_items = soup.select('.uc-tag-ranking-item')
            category_name = self.CATEGORY_NAMES.get(category, category)

            for position, item in enumerate(self._limit_items(ranking_items, limit), 1):
                try:
                    product = self._parse_ranking_item(item, position, category_name)
                except Exception as e:
                    # Best effort: one malformed item must not drop the rest.
                    logger.debug("[COSME] 解析商品項目失敗: %s", e)
                    continue
                if product is not None:
                    products.append(product)

            logger.info("[COSME] 解析到 %s 個商品", len(products))
            return products

        except Exception as e:
            logger.error("[COSME] 解析排行榜失敗: %s", e)
            return []

    def _parse_ranking_item(self, item, rank: int, category_name: str) -> Optional['CosmeProduct']:
        """Build a product from one ranking item; ``None`` if it has no product link."""
        product_link = item.select_one('a[href*="/products/"]')
        if product_link is None:
            return None

        name_elem = item.select_one('.product-name a, .product-name h3')
        if name_elem is not None:
            name = name_elem.get_text(strip=True)
        else:
            # Fallback: take the name from the image title/alt text and strip
            # the rank prefix it carries.
            name = ''
            name_img = item.select_one('.product-image img')
            if name_img is not None:
                name = name_img.get('title', '') or name_img.get('alt', '')
                name = self._RANK_PREFIX_RE.sub('', name)

        product_url = self._absolute_url(product_link.get('href', ''))

        brand_elem = item.select_one('.brand-name a, .brand-name')
        rating_elem = item.select_one('.product-score-text.score, .score')
        review_elem = item.select_one('.product-review-count')
        # The price is read from the text of the .product-market-date element.
        price_elem = item.select_one('.product-market-date')

        return CosmeProduct(
            product_id=self._extract_product_id(product_url),
            name=name,
            brand=brand_elem.get_text(strip=True) if brand_elem is not None else '',
            category=category_name,
            rating=self._parse_rating(rating_elem.get_text(strip=True)) if rating_elem is not None else 0.0,
            review_count=self._parse_count(review_elem.get_text(strip=True)) if review_elem is not None else 0,
            price=self._parse_price(price_elem.get_text(strip=True)) if price_elem is not None else None,
            image_url=self._absolute_url(self._image_source(item.select_one('.product-image img'))),
            product_url=product_url,
            rank=rank,
            crawled_at=datetime.now(),
        )

    # ------------------------------------------------------------------
    # Search
    # ------------------------------------------------------------------

    def search_products(self, keyword: str, limit: int = 10) -> Tuple[bool, str, List['CosmeProduct']]:
        """Search products by keyword.

        Args:
            keyword: Search keyword, sent as the ``q`` query parameter.
            limit: Maximum number of result items to parse.

        Returns:
            ``(success, message, products)``. ``products`` is empty on failure.
        """
        try:
            self._rate_limit()

            url = f"{self.BASE_URL}/search/products"
            params = {'q': keyword}

            logger.info("[COSME] 搜尋商品: %s", keyword)

            response = self.session.get(url, params=params, timeout=self.timeout)
            if response.status_code != 200:
                return False, f"HTTP {response.status_code}", []

            products = self._parse_search_results(response.text, limit)
            if products:
                return True, f"成功取得 {len(products)} 個商品", products
            return False, "無搜尋結果", []

        except requests.Timeout:
            logger.error("[COSME] 搜尋超時")
            return False, "請求超時", []
        except Exception as e:
            logger.error("[COSME] 搜尋失敗: %s", e)
            return False, str(e), []

    def _parse_search_results(self, html: str, limit: int) -> List['CosmeProduct']:
        """Parse a search result page into products.

        ``rank`` is the 1-based position among the first ``limit`` result
        items. Returns an empty list if the page cannot be parsed at all.
        """
        products = []

        try:
            soup = BeautifulSoup(html, 'html.parser')
            result_items = soup.select('.search-product-item, .product-item, .uc-product-card')

            for position, item in enumerate(self._limit_items(result_items, limit), 1):
                try:
                    product = self._parse_search_item(item, position)
                except Exception as e:
                    # Best effort: one malformed item must not drop the rest.
                    logger.debug("[COSME] 解析搜尋項目失敗: %s", e)
                    continue
                if product is not None:
                    products.append(product)

            return products

        except Exception as e:
            logger.error("[COSME] 解析搜尋結果失敗: %s", e)
            return []

    def _parse_search_item(self, item, rank: int) -> Optional['CosmeProduct']:
        """Build a product from one search result item; ``None`` if it has no name link."""
        name_elem = item.select_one('.product-name a, a[href*="/products/"]')
        if name_elem is None:
            return None

        product_url = self._absolute_url(name_elem.get('href', ''))
        brand_elem = item.select_one('.brand-name, .product-brand')
        rating_elem = item.select_one('.rating-score, .score')

        # Review count and price are not parsed from search results.
        return CosmeProduct(
            product_id=self._extract_product_id(product_url),
            name=name_elem.get_text(strip=True),
            brand=brand_elem.get_text(strip=True) if brand_elem is not None else '',
            category='搜尋結果',
            rating=self._parse_rating(rating_elem.get_text(strip=True)) if rating_elem is not None else 0.0,
            review_count=0,
            price=None,
            image_url=self._absolute_url(self._image_source(item.select_one('img'))),
            product_url=product_url,
            rank=rank,
            crawled_at=datetime.now(),
        )
|
||
|
||
|
||
# Module-wide crawler instance, created on first use by get_crawler().
_crawler_instance: Optional[CosmeCrawler] = None


def get_crawler() -> CosmeCrawler:
    """Return the shared CosmeCrawler, creating it on the first call."""
    global _crawler_instance
    if _crawler_instance is not None:
        return _crawler_instance
    _crawler_instance = CosmeCrawler()
    return _crawler_instance
|
||
|
||
|
||
def get_cosme_rankings(category: str = 'mask', limit: int = 10) -> Tuple[bool, str, List[dict]]:
    """
    Fetch a COSME ranking through the shared crawler (convenience wrapper).

    Args:
        category: Category code.
        limit: Maximum number of items to return.

    Returns:
        (success flag, message, list of product dicts)
    """
    ok, text, found = get_crawler().get_rankings(category, limit)
    as_dicts = [product.to_dict() for product in found]
    return ok, text, as_dicts
|
||
|
||
|
||
def search_cosme_products(keyword: str, limit: int = 10) -> Tuple[bool, str, List[dict]]:
    """
    Search COSME products through the shared crawler (convenience wrapper).

    Args:
        keyword: Search keyword.
        limit: Maximum number of items to return.

    Returns:
        (success flag, message, list of product dicts)
    """
    ok, text, found = get_crawler().search_products(keyword, limit)
    as_dicts = [product.to_dict() for product in found]
    return ok, text, as_dicts
|
||
|
||
|
||
def get_cosme_categories() -> Dict[str, str]:
    """Return a new dict mapping every category code to its display name."""
    # A fresh dict keeps callers from mutating the class-level table.
    return dict(CosmeCrawler.CATEGORY_NAMES)
|
||
|
||
|
||
if __name__ == '__main__':
    # Manual smoke test: performs live requests and prints the results.
    logging.basicConfig(level=logging.INFO)

    print("=== COSME 爬蟲測試 ===\n")

    # 1) Ranking of the "mask" category.
    print("[1] 測試排行榜 (分類: 面膜)")
    _, ranking_msg, ranking = get_cosme_rankings('mask', limit=5)
    print(f"結果: {ranking_msg}")
    if ranking:
        print("排行榜:")
        for entry in ranking:
            print(f" {entry['rank']}. {entry['brand']} - {entry['name'][:30]}... (評分: {entry['rating']})")

    # 2) Keyword search; only the first three hits are shown.
    print("\n[2] 測試搜尋 (關鍵字: 保濕)")
    _, search_msg, hits = search_cosme_products('保濕', limit=5)
    print(f"結果: {search_msg}")
    if hits:
        print("搜尋結果:")
        for entry in hits[:3]:
            print(f" - {entry['brand']} - {entry['name'][:30]}...")
|