# ewoooc/services/cosme_crawler.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
COSME 台灣 (@cosme) 爬蟲服務

爬取 COSME 台灣的美妝保養排行榜和評測資料
網站: https://www.cosme.net.tw/

支援分類:
- 美妝保養排行榜
- 品牌資訊
- 商品評價
"""

import logging
import re
import time
from dataclasses import dataclass, asdict
from datetime import datetime
from typing import List, Dict, Optional, Tuple
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


@dataclass
class CosmeProduct:
    """Record describing one product scraped from COSME."""
    product_id: str           # product id
    name: str                 # product name
    brand: str                # brand name
    category: str             # category label
    rating: float             # rating score (0-7)
    review_count: int         # number of reviews
    price: Optional[int]      # price; None when not available
    image_url: str            # image URL
    product_url: str          # product page URL
    rank: int                 # position in the list
    crawled_at: datetime      # time the record was crawled

    def to_dict(self) -> dict:
        """Return the record as a plain dict with ``crawled_at`` as an ISO-8601 string."""
        # Overriding the key in the merge keeps it at its original position.
        return {**asdict(self), 'crawled_at': self.crawled_at.isoformat()}


class CosmeCrawler:
    """Crawler for the COSME Taiwan site (https://www.cosme.net.tw).

    Downloads category ranking pages and product search pages through one
    shared ``requests.Session`` and converts the HTML into ``CosmeProduct``
    records.  Requests issued through the same instance are spaced at least
    ``delay`` seconds apart.

    The parsers locate data by CSS class names; when nothing matches they
    return an empty list rather than raising.
    """

    BASE_URL = 'https://www.cosme.net.tw'

    # Category code -> ranking path, using the tags/{id}/ranking URL format.
    CATEGORIES = {
        # Face care
        'face_care': '/tags/1/ranking',           # face care
        'cleanser': '/tags/12/ranking',           # facial cleanser
        'toner': '/tags/13/ranking',              # toner
        'serum': '/tags/78/ranking',              # serum
        'lotion': '/tags/14/ranking',             # lotion / emulsion
        'cream': '/tags/15/ranking',              # face cream
        'mask': '/tags/16/ranking',               # mask / gel cream
        'eye_care': '/tags/87/ranking',           # eye cream
        'sunscreen': '/tags/10/ranking',          # sunscreen

        # Makeup
        'makeup': '/tags/2/ranking',              # makeup
        'foundation': '/tags/105/ranking',        # liquid foundation
        'lipstick': '/tags/101/ranking',          # lipstick
        'eye_makeup': '/tags/96/ranking',         # eye shadow
        'blush': '/tags/109/ranking',             # blush

        # Body care
        'body_care': '/tags/3/ranking',           # body care
        'body_lotion': '/tags/27/ranking',        # body lotion
        'hand_care': '/tags/26/ranking',          # hand cream

        # Hair
        'hair_care': '/tags/4/ranking',           # hair care
        'shampoo': '/tags/29/ranking',            # shampoo
        'conditioner': '/tags/30/ranking',        # conditioner
    }

    # Category code -> Chinese display name stored in CosmeProduct.category.
    CATEGORY_NAMES = {
        'face_care': '臉部保養',
        'cleanser': '洗面乳',
        'toner': '化妝水',
        'serum': '精華液',
        'lotion': '乳液',
        'cream': '面霜',
        'mask': '面膜',
        'eye_care': '眼部保養',
        'sunscreen': '防曬',
        'makeup': '彩妝',
        'foundation': '底妝',
        'lipstick': '唇彩',
        'eye_makeup': '眼妝',
        'blush': '腮紅',
        'body_care': '身體保養',
        'body_lotion': '身體乳液',
        'hand_care': '護手霜',
        'hair_care': '頭髮保養',
        'shampoo': '洗髮精',
        'conditioner': '護髮',
    }

    DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
        'Referer': 'https://www.cosme.net.tw/',
    }

    # Category used when the caller passes a code that is not in CATEGORIES.
    DEFAULT_CATEGORY = 'mask'

    # Patterns shared by the parsers, compiled once.
    _PRODUCT_ID_RE = re.compile(r'/products/(\d+)')
    # Integer with optional thousands separators ("1,234") or plain digits.
    _INT_RE = re.compile(r'\d{1,3}(?:,\d{3})+|\d+')
    _FLOAT_RE = re.compile(r'\d+(?:\.\d+)?|\.\d+')
    # "價格：" / "價格:" followed by an optional currency prefix and the amount.
    _PRICE_RE = re.compile(r'價格[：:]\s*(?:NT\$?|\$)?\s*(\d{1,3}(?:,\d{3})+|\d+)')
    # Leading "... 第N名 - " rank prefix found in image title/alt text.
    _RANK_PREFIX_RE = re.compile(r'^.*第\d+名\s*-\s*')

    def __init__(self, timeout: int = 30, delay: float = 1.0):
        """Create a crawler with its own HTTP session.

        Args:
            timeout: Timeout in seconds passed to every ``session.get`` call.
            delay: Minimum number of seconds between two requests.
        """
        self.timeout = timeout
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update(self.DEFAULT_HEADERS)
        # Monotonic timestamp of the previous request; None until the first one.
        self._last_request_time: Optional[float] = None

    # ------------------------------------------------------------------
    # Request helpers
    # ------------------------------------------------------------------

    def _rate_limit(self):
        """Sleep as needed so requests are at least ``self.delay`` seconds apart."""
        # time.monotonic() cannot go backwards, so a wall-clock adjustment can
        # never yield a negative elapsed time and an over-long sleep.
        if self._last_request_time is not None:
            remaining = self.delay - (time.monotonic() - self._last_request_time)
            if remaining > 0:
                time.sleep(remaining)
        self._last_request_time = time.monotonic()

    def _fetch(self, url: str, params: Optional[Dict[str, str]] = None) -> "requests.Response":
        """Apply the rate limit, GET ``url`` and return the response.

        Raises:
            requests.RequestException: Propagated from ``session.get``.
        """
        self._rate_limit()
        response = self.session.get(url, params=params, timeout=self.timeout)
        # When the server sends no charset, requests falls back to ISO-8859-1
        # for text content, which garbles Chinese pages; detect from the body.
        if response.encoding is None or response.encoding.lower() == 'iso-8859-1':
            response.encoding = response.apparent_encoding
        return response

    # ------------------------------------------------------------------
    # Pure parsing helpers
    # ------------------------------------------------------------------

    @classmethod
    def _resolve_category(cls, category: str) -> str:
        """Return ``category`` if it is a known code, else ``DEFAULT_CATEGORY``."""
        return category if category in cls.CATEGORIES else cls.DEFAULT_CATEGORY

    @classmethod
    def _absolute_url(cls, url: Optional[str]) -> str:
        """Resolve ``url`` against ``BASE_URL``; empty/None input yields ''."""
        url = (url or '').strip()
        if not url or url.startswith(('http://', 'https://')):
            return url
        # urljoin handles "/path", "path" and protocol-relative "//host/path".
        return urljoin(f"{cls.BASE_URL}/", url)

    @classmethod
    def _product_id_from_url(cls, url: str) -> str:
        """Return the digits following ``/products/`` in ``url``, or ''."""
        match = cls._PRODUCT_ID_RE.search(url or '')
        return match.group(1) if match else ''

    @classmethod
    def _parse_float(cls, text: str) -> float:
        """Return the first decimal number in ``text``, or 0.0 if there is none."""
        match = cls._FLOAT_RE.search(text or '')
        return float(match.group()) if match else 0.0

    @classmethod
    def _parse_int(cls, text: str) -> int:
        """Return the first integer in ``text`` (commas allowed), or 0."""
        match = cls._INT_RE.search(text or '')
        return int(match.group().replace(',', '')) if match else 0

    @classmethod
    def _parse_price(cls, text: str) -> Optional[int]:
        """Return the amount following "價格：" in ``text``, or None if absent."""
        match = cls._PRICE_RE.search(text or '')
        return int(match.group(1).replace(',', '')) if match else None

    @staticmethod
    def _clamp_limit(limit: Optional[int]) -> Optional[int]:
        """Turn ``limit`` into a slice stop: negatives become 0, None means no limit."""
        return None if limit is None else max(limit, 0)

    @staticmethod
    def _text_of(item, selector: str) -> str:
        """Return the stripped text of the first match of ``selector``, or ''."""
        elem = item.select_one(selector)
        return elem.get_text(strip=True) if elem is not None else ''

    @classmethod
    def _image_url_of(cls, item, selector: str) -> str:
        """Return the absolute ``src`` (or ``data-src``) of the first matching image."""
        img = item.select_one(selector)
        if img is None:
            return ''
        return cls._absolute_url(img.get('src') or img.get('data-src', ''))

    # ------------------------------------------------------------------
    # Rankings
    # ------------------------------------------------------------------

    def get_rankings(self, category: str = 'mask', limit: int = 10) -> Tuple[bool, str, List["CosmeProduct"]]:
        """Fetch the ranking page of a category.

        Args:
            category: Category code (a key of ``CATEGORIES``).  An unknown
                code falls back to ``DEFAULT_CATEGORY``; the returned products
                are labelled with the category that was actually fetched.
            limit: Maximum number of ranking entries to read.

        Returns:
            ``(success, message, products)``.  ``success`` is False and
            ``products`` is empty on a non-200 response, a timeout, any other
            error, or when no product could be parsed.
        """
        try:
            resolved = self._resolve_category(category)
            if resolved != category:
                logger.warning("[COSME] 未知分類 %r，改用預設分類 %s", category, resolved)

            url = f"{self.BASE_URL}{self.CATEGORIES[resolved]}"
            logger.info("[COSME] 取得排行榜: %s -> %s", category, url)

            response = self._fetch(url)
            if response.status_code != 200:
                return False, f"HTTP {response.status_code}", []

            products = self._parse_rankings(response.text, resolved, limit)
            if not products:
                return False, "無法解析商品資料", []
            return True, f"成功取得 {len(products)} 個商品", products

        except requests.Timeout:
            logger.error("[COSME] 請求超時")
            return False, "請求超時", []
        except Exception as e:
            # Boundary handler: callers get a failure tuple, never an exception.
            logger.exception("[COSME] 取得排行榜失敗: %s", e)
            return False, str(e), []

    def _parse_rankings(self, html: str, category: str, limit: int) -> List["CosmeProduct"]:
        """Parse a ranking page into products.

        Args:
            html: Page markup.
            category: Category code; mapped through ``CATEGORY_NAMES`` for the
                product label (used verbatim when it is not a known code).
            limit: Maximum number of ``.uc-tag-ranking-item`` entries to read.

        Returns:
            Parsed products.  ``rank`` is the 1-based position of the entry on
            the page, so entries skipped for lacking a product link leave gaps.
            Returns [] if parsing fails as a whole.
        """
        products: List["CosmeProduct"] = []

        try:
            soup = BeautifulSoup(html, 'html.parser')
            entries = soup.select('.uc-tag-ranking-item')[:self._clamp_limit(limit)]
            category_label = self.CATEGORY_NAMES.get(category, category)
            # One timestamp for the whole page.
            crawled_at = datetime.now()

            for rank, item in enumerate(entries, 1):
                try:
                    product = self._parse_ranking_item(item, rank, category_label, crawled_at)
                except Exception as e:
                    # Best effort: one malformed entry must not drop the rest.
                    logger.debug("[COSME] 解析商品項目失敗: %s", e)
                    continue
                if product is not None:
                    products.append(product)

            logger.info("[COSME] 解析到 %d 個商品", len(products))
            return products

        except Exception as e:
            logger.error("[COSME] 解析排行榜失敗: %s", e)
            return []

    def _parse_ranking_item(self, item, rank: int, category_label: str,
                            crawled_at: datetime) -> Optional["CosmeProduct"]:
        """Build a product from one ranking entry; None if it has no product link."""
        product_link = item.select_one('a[href*="/products/"]')
        if product_link is None:
            return None

        name = self._text_of(item, '.product-name a, .product-name h3')
        if not name:
            # Fall back to the image title/alt, minus its "第N名 - " rank prefix.
            img = item.select_one('.product-image img')
            if img is not None:
                name = img.get('title', '') or img.get('alt', '')
                name = self._RANK_PREFIX_RE.sub('', name)

        product_url = self._absolute_url(product_link.get('href', ''))

        return CosmeProduct(
            product_id=self._product_id_from_url(product_url),
            name=name,
            brand=self._text_of(item, '.brand-name a, .brand-name'),
            category=category_label,
            rating=self._parse_float(self._text_of(item, '.product-score-text.score, .score')),
            review_count=self._parse_int(self._text_of(item, '.product-review-count')),
            # The price is read from the text of the .product-market-date element.
            price=self._parse_price(self._text_of(item, '.product-market-date')),
            image_url=self._image_url_of(item, '.product-image img'),
            product_url=product_url,
            rank=rank,
            crawled_at=crawled_at,
        )

    # ------------------------------------------------------------------
    # Search
    # ------------------------------------------------------------------

    def search_products(self, keyword: str, limit: int = 10) -> Tuple[bool, str, List["CosmeProduct"]]:
        """Search products by keyword.

        Args:
            keyword: Search keyword, sent as the ``q`` query parameter.
            limit: Maximum number of result entries to read.

        Returns:
            ``(success, message, products)``.  ``success`` is False and
            ``products`` is empty on a non-200 response, a timeout, any other
            error, or when the page yields no products.
        """
        try:
            url = f"{self.BASE_URL}/search/products"
            logger.info("[COSME] 搜尋商品: %s", keyword)

            response = self._fetch(url, params={'q': keyword})
            if response.status_code != 200:
                return False, f"HTTP {response.status_code}", []

            products = self._parse_search_results(response.text, limit)
            if not products:
                return False, "無搜尋結果", []
            return True, f"成功取得 {len(products)} 個商品", products

        except requests.Timeout:
            logger.error("[COSME] 搜尋超時")
            return False, "請求超時", []
        except Exception as e:
            # Boundary handler: callers get a failure tuple, never an exception.
            logger.exception("[COSME] 搜尋失敗: %s", e)
            return False, str(e), []

    def _parse_search_results(self, html: str, limit: int) -> List["CosmeProduct"]:
        """Parse a search result page into products.

        Args:
            html: Page markup.
            limit: Maximum number of result entries to read.

        Returns:
            Parsed products labelled with the category '搜尋結果';
            ``review_count`` is always 0 and ``price`` always None because the
            search parser does not read them.  Returns [] if parsing fails as
            a whole.
        """
        products: List["CosmeProduct"] = []

        try:
            soup = BeautifulSoup(html, 'html.parser')
            entries = soup.select('.search-product-item, .product-item, .uc-product-card')
            entries = entries[:self._clamp_limit(limit)]
            crawled_at = datetime.now()

            for rank, item in enumerate(entries, 1):
                try:
                    product = self._parse_search_item(item, rank, crawled_at)
                except Exception as e:
                    # Best effort: one malformed entry must not drop the rest.
                    logger.debug("[COSME] 解析搜尋項目失敗: %s", e)
                    continue
                if product is not None:
                    products.append(product)

            return products

        except Exception as e:
            logger.error("[COSME] 解析搜尋結果失敗: %s", e)
            return []

    def _parse_search_item(self, item, rank: int, crawled_at: datetime) -> Optional["CosmeProduct"]:
        """Build a product from one search result; None if it has no product link."""
        link = item.select_one('.product-name a, a[href*="/products/"]')
        if link is None:
            return None

        product_url = self._absolute_url(link.get('href', ''))

        return CosmeProduct(
            product_id=self._product_id_from_url(product_url),
            name=link.get_text(strip=True),
            brand=self._text_of(item, '.brand-name, .product-brand'),
            category='搜尋結果',
            rating=self._parse_float(self._text_of(item, '.rating-score, .score')),
            review_count=0,
            price=None,
            image_url=self._image_url_of(item, 'img'),
            product_url=product_url,
            rank=rank,
            crawled_at=crawled_at,
        )


# Module-wide crawler instance, created on first use by get_crawler().
_crawler_instance: Optional[CosmeCrawler] = None


def get_crawler() -> CosmeCrawler:
    """Return the shared crawler, constructing it on the first call."""
    global _crawler_instance
    if _crawler_instance is not None:
        return _crawler_instance
    _crawler_instance = CosmeCrawler()
    return _crawler_instance


def get_cosme_rankings(category: str = 'mask', limit: int = 10) -> Tuple[bool, str, List[dict]]:
    """
    Convenience wrapper: fetch a COSME ranking through the shared crawler.

    Args:
        category: Category code.
        limit: Maximum number of items to return.

    Returns:
        (success flag, message, list of product dicts)
    """
    ok, text, entries = get_crawler().get_rankings(category, limit)
    # Serialise each product so callers only deal with plain dicts.
    serialized = [entry.to_dict() for entry in entries]
    return ok, text, serialized


def search_cosme_products(keyword: str, limit: int = 10) -> Tuple[bool, str, List[dict]]:
    """
    Convenience wrapper: search COSME products through the shared crawler.

    Args:
        keyword: Search keyword.
        limit: Maximum number of items to return.

    Returns:
        (success flag, message, list of product dicts)
    """
    ok, text, entries = get_crawler().search_products(keyword, limit)
    # Serialise each product so callers only deal with plain dicts.
    serialized = [entry.to_dict() for entry in entries]
    return ok, text, serialized


def get_cosme_categories() -> Dict[str, str]:
    """Return a new dict mapping every category code to its display name."""
    # Build a fresh dict so callers cannot mutate the class-level table.
    return dict(CosmeCrawler.CATEGORY_NAMES)


if __name__ == '__main__':
    # Manual smoke test: performs real requests and prints a few results.
    logging.basicConfig(level=logging.INFO)

    print("=== COSME 爬蟲測試 ===\n")

    # Step 1: ranking lookup for the 'mask' category.
    print("[1] 測試排行榜 (分類: 面膜)")
    _ranking_ok, ranking_msg, ranking_rows = get_cosme_rankings('mask', limit=5)
    print(f"結果: {ranking_msg}")
    if ranking_rows:
        print("排行榜:")
        for row in ranking_rows:
            print(f"  {row['rank']}. {row['brand']} - {row['name'][:30]}... (評分: {row['rating']})")

    # Step 2: keyword search; only the first three hits are printed.
    print("\n[2] 測試搜尋 (關鍵字: 保濕)")
    _search_ok, search_msg, search_rows = search_cosme_products('保濕', limit=5)
    print(f"結果: {search_msg}")
    if search_rows:
        print("搜尋結果:")
        for row in search_rows[:3]:
            print(f"  - {row['brand']} - {row['name'][:30]}...")