# ewoooc/services/mybest_crawler.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
mybest 台灣爬蟲服務

爬取 mybest 台灣的推薦文章和商品排行
網站: https://tw.my-best.com/

支援分類:
- 美妝保養
- 健康保健
- 母嬰用品
- 生活用品
"""

import re
import json
import time
import logging
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass, asdict
from datetime import datetime

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


@dataclass
class MybestArticle:
    """One mybest article (a recommendation / ranking page) as crawled."""
    article_id: str           # article identifier
    title: str                # article title
    category: str             # category label
    product_count: int        # number of recommended products
    image_url: str            # thumbnail URL
    article_url: str          # URL of the article page
    description: str          # short summary
    crawled_at: datetime      # when the record was crawled

    def to_dict(self) -> dict:
        """Return the fields as a plain dict, with ``crawled_at`` as an ISO-8601 string."""
        return {**asdict(self), 'crawled_at': self.crawled_at.isoformat()}


@dataclass
class MybestProduct:
    """One recommended product taken from a mybest article."""
    product_id: str           # product identifier
    name: str                 # product name
    brand: str                # brand
    rank: int                 # position in the ranking
    price: Optional[int]      # price, or None when unknown
    image_url: str            # image URL
    product_url: str          # link to the product
    article_title: str        # title of the article the product belongs to
    crawled_at: datetime      # when the record was crawled

    def to_dict(self) -> dict:
        """Return the fields as a plain dict, with ``crawled_at`` as an ISO-8601 string."""
        return {**asdict(self), 'crawled_at': self.crawled_at.isoformat()}


class MybestCrawler:
    """Crawler for the mybest Taiwan site."""

    BASE_URL = 'https://tw.my-best.com'

    # Search keywords per category code, used with the search_contents endpoint.
    # Each value is a whitespace-separated list of keywords.
    CATEGORIES = {
        # Beauty and skincare
        'skincare': '基礎保養 精華液 乳液',
        'makeup': '彩妝 口紅 眼影',
        'hair_care': '洗髮精 護髮 頭髮保養',
        'body_care': '身體乳 身體保養',
        'sunscreen': '防曬 防曬乳',
        'mask': '面膜 保濕面膜',

        # Health and wellness
        'health': '健康食品 保健',
        'supplement': '營養補充 維他命',
        'diet': '減重 瘦身',

        # Mother and baby
        'baby': '嬰兒用品 奶瓶 尿布',
        'maternity': '孕婦用品 孕婦',
        'kids': '兒童用品 兒童',

        # Home and living
        'home': '居家用品 收納',
        'kitchen': '廚房用品 鍋具',

        # 3C electronics and appliances
        'electronics': '3C 電子 藍牙耳機',
        'appliances': '家電 吸塵器',
    }

    # Display names (in Chinese) for each category code
    CATEGORY_NAMES = {
        'skincare': '基礎保養',
        'makeup': '彩妝',
        'hair_care': '頭髮保養',
        'body_care': '身體保養',
        'sunscreen': '防曬',
        'mask': '面膜',
        'health': '健康食品',
        'supplement': '營養補充',
        'diet': '減重瘦身',
        'baby': '嬰兒用品',
        'maternity': '孕婦用品',
        'kids': '兒童用品',
        'home': '居家用品',
        'kitchen': '廚房用品',
        'electronics': '3C 電子',
        'appliances': '家電',
    }

    # Browser-like request headers installed on the HTTP session
    DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
        'Referer': 'https://tw.my-best.com/',
    }

    def __init__(self, timeout: int = 30, delay: float = 1.0):
        """Create the HTTP session and initialise the throttling state.

        Args:
            timeout: Timeout in seconds stored for the crawler's HTTP requests.
            delay: Minimum number of seconds to keep between two requests.
        """
        http_session = requests.Session()
        http_session.headers.update(self.DEFAULT_HEADERS)

        self.timeout, self.delay = timeout, delay
        self.session = http_session
        # Timestamp of the most recent request; 0 until the first request is made
        self._last_request_time = 0

    def _rate_limit(self):
        """速率限制"""
        elapsed = time.time() - self._last_request_time
        if elapsed < self.delay:
            time.sleep(self.delay - elapsed)
        self._last_request_time = time.time()

    def _extract_next_data(self, html: str) -> Optional[dict]:
        """Return the JSON embedded in the page's ``__NEXT_DATA__`` script tag.

        Args:
            html: Raw HTML of the page.

        Returns:
            The decoded JSON payload, or None when the tag is missing, empty,
            or anything goes wrong while parsing (the error is logged at debug level).
        """
        try:
            tag = BeautifulSoup(html, 'html.parser').find('script', {'id': '__NEXT_DATA__'})
            payload = tag.string if tag else None
            return json.loads(payload) if payload else None
        except Exception as e:
            logger.debug(f"[mybest] 提取 __NEXT_DATA__ 失敗: {e}")
            return None

    def get_articles(self, category: str = 'skincare', limit: int = 10) -> Tuple[bool, str, List[MybestArticle]]:
        """
        Fetch the articles of a category through the site's search_contents page.

        When the search page yields no articles, the result of
        ``get_latest_presses(limit)`` is returned instead.

        Args:
            category: Category code (see CATEGORIES); unknown codes fall back
                to the 'skincare' keywords.
            limit: Maximum number of articles to return.

        Returns:
            (success flag, message, list of articles)
        """
        try:
            self._rate_limit()

            # Only the first keyword of the category's keyword list is searched
            keyword_list = self.CATEGORIES.get(category, self.CATEGORIES['skincare'])
            keyword = keyword_list.split()[0]

            logger.info(f"[mybest] 搜尋文章: {category} -> {keyword}")

            response = self.session.get(
                f"{self.BASE_URL}/search_contents",
                params={'q': keyword},
                timeout=self.timeout,
            )
            if response.status_code != 200:
                return False, f"HTTP {response.status_code}", []

            found = self._parse_search_page(response.text, category, limit)
            if not found:
                # Fallback: take the articles from the home page instead
                return self.get_latest_presses(limit)
            return True, f"成功取得 {len(found)} 篇文章", found

        except requests.Timeout:
            logger.error("[mybest] 請求超時")
            return False, "請求超時", []
        except Exception as e:
            logger.error(f"[mybest] 取得文章列表失敗: {e}")
            return False, str(e), []

    def _parse_search_page(self, html: str, category: str, limit: int) -> List[MybestArticle]:
        """Extract articles from a search results page by scanning its HTML links.

        Args:
            html: Raw HTML of the search results page.
            category: Category code; mapped through CATEGORY_NAMES for the label.
            limit: Maximum number of articles to return.

        Returns:
            The parsed articles, or an empty list when parsing fails.
        """
        results = []

        try:
            document = BeautifulSoup(html, 'html.parser')
            visited = set()

            # Article links are the anchors whose href is exactly "/<digits>"
            for anchor in document.find_all('a', href=re.compile(r'^/\d+$')):
                if len(results) >= limit:
                    break

                try:
                    article_id = anchor.get('href', '').strip('/')
                    if article_id in visited:
                        continue
                    visited.add(article_id)

                    # Title: first heading-like child, otherwise the anchor's own text
                    heading = anchor.find(['h2', 'h3', 'span', 'p'])
                    title = heading.get_text(strip=True) if heading else ''
                    if not title:
                        title = anchor.get_text(strip=True)

                    # Drop links whose title is too short to be an article title
                    if len(title) < 5:
                        continue

                    thumb = anchor.find('img')
                    image_url = (thumb.get('src') or thumb.get('data-src', '')) if thumb else ''

                    results.append(MybestArticle(
                        article_id=article_id,
                        title=title[:100],
                        category=self.CATEGORY_NAMES.get(category, category),
                        product_count=0,
                        image_url=image_url,
                        article_url=f"{self.BASE_URL}/{article_id}",
                        description='',
                        crawled_at=datetime.now()
                    ))

                except Exception as e:
                    logger.debug(f"[mybest] 解析文章連結失敗: {e}")

            logger.info(f"[mybest] 從搜尋頁解析到 {len(results)} 篇文章")
            return results

        except Exception as e:
            logger.error(f"[mybest] 解析搜尋頁面失敗: {e}")
            return []

    def _parse_articles(self, html: str, category: str, limit: int) -> List[MybestArticle]:
        """Parse an article listing page into ``MybestArticle`` records.

        The embedded Next.js data (``pageProps.presses`` / ``pageProps.articles``)
        is tried first; if it produces nothing, the anchors of the HTML are
        scanned as a fallback.

        Args:
            html: Raw HTML of the listing page.
            category: Category code; mapped through CATEGORY_NAMES for the label.
            limit: Maximum number of articles to return.

        Returns:
            The parsed articles, or an empty list when parsing fails.
        """
        articles = []

        try:
            category_label = self.CATEGORY_NAMES.get(category, category)

            # Preferred source: Next.js page data. ``or {}`` / ``or []`` keep an
            # explicit JSON null from aborting the whole parse.
            next_data = self._extract_next_data(html)
            if isinstance(next_data, dict):
                props = (next_data.get('props') or {}).get('pageProps') or {}
                items = props.get('presses') or props.get('articles') or []
                if not isinstance(items, list):
                    items = []

                for item in items[:limit]:
                    try:
                        raw_id = item.get('id')
                        articles.append(MybestArticle(
                            article_id='' if raw_id is None else str(raw_id),
                            title=item.get('title') or item.get('name') or '',
                            category=category_label,
                            product_count=item.get('productCountInt') or item.get('productCount') or 0,
                            image_url=(
                                item.get('rectangleThumbnailUrl')
                                or item.get('thumbnailUrl')
                                or item.get('imageUrl')
                                or ''
                            ),
                            # 'url' may already be absolute or start with '/', so it
                            # cannot simply be appended to BASE_URL like a slug
                            article_url=self._absolute_url(item.get('slug') or item.get('url') or ''),
                            description=item.get('description') or '',
                            crawled_at=datetime.now()
                        ))
                    except Exception as e:
                        logger.debug(f"[mybest] 解析文章項目失敗: {e}")

            # Fallback: parse the HTML directly
            if not articles:
                soup = BeautifulSoup(html, 'html.parser')
                seen_urls = set()

                # Scan every anchor and stop once `limit` articles are collected, so
                # that skipped navigation links do not use up the scanning budget
                for card in soup.select('a[href*="/"]'):
                    if len(articles) >= limit:
                        break

                    try:
                        href = card.get('href', '')
                        if not href or '/categories/' in href or href == '/':
                            continue

                        title_elem = card.select_one('[class*="title"], h2, h3')
                        if title_elem is None:
                            continue

                        title = title_elem.get_text(strip=True)
                        if len(title) < 5:
                            continue

                        article_url = self._absolute_url(href)
                        if article_url in seen_urls:
                            # The same article is often linked more than once
                            continue
                        seen_urls.add(article_url)

                        img_elem = card.select_one('img')
                        image_url = ''
                        if img_elem is not None:
                            image_url = img_elem.get('src') or img_elem.get('data-src', '')

                        # ID = last path segment, ignoring query, fragment and trailing slash
                        path_only = href.split('#', 1)[0].split('?', 1)[0].rstrip('/')
                        article_id = path_only.rsplit('/', 1)[-1]

                        articles.append(MybestArticle(
                            article_id=article_id,
                            title=title,
                            category=category_label,
                            product_count=0,
                            image_url=image_url,
                            article_url=article_url,
                            description='',
                            crawled_at=datetime.now()
                        ))

                    except Exception as e:
                        logger.debug(f"[mybest] 解析 HTML 文章失敗: {e}")

            logger.info(f"[mybest] 解析到 {len(articles)} 篇文章")
            return articles

        except Exception as e:
            logger.error(f"[mybest] 解析文章列表失敗: {e}")
            return []

    def _absolute_url(self, path) -> str:
        """Turn a slug, site-relative path or absolute URL into an absolute URL."""
        if not path:
            return ''
        path = str(path)
        if path.startswith(('http://', 'https://')):
            return path
        if path.startswith('//'):
            # Protocol-relative URL
            return f"https:{path}"
        return f"{self.BASE_URL}/{path.lstrip('/')}"

    def get_article_products(self, article_url: str, limit: int = 10) -> Tuple[bool, str, List[MybestProduct]]:
        """
        Download an article page and extract its recommended products.

        Args:
            article_url: URL of the article.
            limit: Maximum number of products to return.

        Returns:
            (success flag, message, list of products)
        """
        try:
            self._rate_limit()
            logger.info(f"[mybest] 取得文章商品: {article_url}")

            page = self.session.get(article_url, timeout=self.timeout)
            if page.status_code != 200:
                return False, f"HTTP {page.status_code}", []

            found = self._parse_article_products(page.text, article_url, limit)
            if not found:
                return False, "無法解析商品資料", []
            return True, f"成功取得 {len(found)} 個商品", found

        except requests.Timeout:
            logger.error("[mybest] 請求超時")
            return False, "請求超時", []
        except Exception as e:
            logger.error(f"[mybest] 取得文章商品失敗: {e}")
            return False, str(e), []

    def _parse_article_products(self, html: str, article_url: str, limit: int) -> List[MybestProduct]:
        """Parse the recommended products out of an article page.

        The embedded Next.js data (``pageProps.press`` / ``pageProps.article``)
        is tried first; if it produces nothing, elements whose class contains
        "ranking" or "product" are scanned as a fallback.

        Args:
            html: Raw HTML of the article page.
            article_url: URL of the article (currently unused; kept for
                interface compatibility).
            limit: Maximum number of products to return.

        Returns:
            The parsed products, or an empty list when parsing fails.
        """
        products = []

        try:
            soup = BeautifulSoup(html, 'html.parser')
            heading = soup.select_one('h1')
            article_title = heading.get_text(strip=True) if heading is not None else ''

            # Preferred source: Next.js page data. ``or {}`` / ``or []`` keep an
            # explicit JSON null from aborting the whole parse.
            next_data = self._extract_next_data(html)
            if isinstance(next_data, dict):
                props = (next_data.get('props') or {}).get('pageProps') or {}
                press = props.get('press') or props.get('article') or {}
                items = press.get('products') or press.get('items') or []
                if not isinstance(items, list):
                    items = []

                if not article_title:
                    article_title = press.get('title') or ''

                for idx, item in enumerate(items[:limit], 1):
                    try:
                        raw_id = item.get('id')
                        products.append(MybestProduct(
                            product_id='' if raw_id is None else str(raw_id),
                            name=item.get('name') or item.get('title') or '',
                            brand=item.get('brand') or item.get('maker') or '',
                            rank=idx,
                            price=self._parse_price(item.get('price') or item.get('lowestPrice')),
                            image_url=item.get('imageUrl') or item.get('thumbnailUrl') or '',
                            product_url=item.get('url') or item.get('affiliateUrl') or '',
                            article_title=article_title,
                            crawled_at=datetime.now()
                        ))
                    except Exception as e:
                        logger.debug(f"[mybest] 解析商品項目失敗: {e}")

            # Fallback: parse the HTML directly
            if not products:
                seen = set()

                # Filter first, then count: ranks stay contiguous and `limit`
                # applies to usable products rather than to raw matches
                for section in soup.select('[class*="ranking"], [class*="product"]'):
                    if len(products) >= limit:
                        break

                    try:
                        name_elem = section.select_one('h2, h3, [class*="name"], [class*="title"]')
                        if name_elem is None:
                            continue

                        name = name_elem.get_text(strip=True)
                        if len(name) < 3:
                            continue

                        brand_elem = section.select_one('[class*="brand"], [class*="maker"]')
                        brand = brand_elem.get_text(strip=True) if brand_elem is not None else ''

                        img_elem = section.select_one('img')
                        image_url = ''
                        if img_elem is not None:
                            image_url = img_elem.get('src') or img_elem.get('data-src', '')

                        link_elem = section.select_one('a[href*="http"]')
                        product_url = link_elem.get('href', '') if link_elem is not None else ''

                        # A wrapper element and the product inside it both match the
                        # selector; keep only the first occurrence
                        key = (name, product_url)
                        if key in seen:
                            continue
                        seen.add(key)

                        rank = len(products) + 1
                        products.append(MybestProduct(
                            product_id=str(rank),
                            name=name,
                            brand=brand,
                            rank=rank,
                            price=None,
                            image_url=image_url,
                            product_url=product_url,
                            article_title=article_title,
                            crawled_at=datetime.now()
                        ))

                    except Exception as e:
                        logger.debug(f"[mybest] 解析 HTML 商品失敗: {e}")

            logger.info(f"[mybest] 解析到 {len(products)} 個商品")
            return products

        except Exception as e:
            logger.error(f"[mybest] 解析文章商品失敗: {e}")
            return []

    @staticmethod
    def _parse_price(raw) -> Optional[int]:
        """Convert a raw price (number or text such as ``'NT$1,299'``) to a whole number.

        Only the integer part of the first number found is used, so ``'1299.0'``
        gives 1299 (not 12990) and a range like ``'1,299~2,500'`` gives 1299.
        Falsy values (None, '', 0) and unparseable input give None.
        """
        if not raw or isinstance(raw, bool):
            return None
        if isinstance(raw, (int, float)):
            try:
                return int(raw)
            except (ValueError, OverflowError):
                # NaN / infinity
                return None
        match = re.search(r'\d[\d,]*', str(raw))
        if match is None:
            return None
        digits = match.group().replace(',', '')
        return int(digits) if digits else None

    def get_latest_presses(self, limit: int = 20) -> Tuple[bool, str, List[MybestArticle]]:
        """
        Fetch the latest recommended articles from the site's home page.

        Args:
            limit: Maximum number of articles to return.

        Returns:
            (success flag, message, list of articles)
        """
        try:
            self._rate_limit()

            home_url = f"{self.BASE_URL}/"
            logger.info(f"[mybest] 取得首頁文章: {home_url}")

            page = self.session.get(home_url, timeout=self.timeout)
            if page.status_code != 200:
                return False, f"HTTP {page.status_code}", []

            found = self._parse_homepage(page.text, limit)
            if not found:
                return False, "無法解析文章資料", []
            return True, f"成功取得 {len(found)} 篇文章", found

        except requests.Timeout:
            logger.error("[mybest] 請求超時")
            return False, "請求超時", []
        except Exception as e:
            logger.error(f"[mybest] 取得最新文章失敗: {e}")
            return False, str(e), []

    def _parse_homepage(self, html: str, limit: int) -> List[MybestArticle]:
        """Parse the home page's embedded Next.js data into recommended articles.

        Entries of ``displayableContents`` are read first; remaining slots up to
        ``limit`` are filled from ``displayableItemLists``.

        Args:
            html: Raw HTML of the home page.
            limit: Maximum number of articles to return.

        Returns:
            The parsed articles, or an empty list when parsing fails.
        """
        collected = []

        try:
            next_data = self._extract_next_data(html)
            if next_data:
                page_props = next_data.get('props', {}).get('pageProps', {})
                top = page_props.get('data', {}).get('top', {})

                # Articles listed under displayableContents
                for entry in top.get('displayableContents', [])[:limit]:
                    try:
                        press_id = str(entry.get('pressId', ''))
                        # Use the given URL, or build one from the press ID
                        article_url = entry.get('url', '') or f"{self.BASE_URL}/{press_id}"
                        product_count = entry.get('productCountInt', 0)
                        image_url = (
                            entry.get('rectangleThumbnailUrl', '')
                            or entry.get('thumbnailProductImageUrl', '')
                        )

                        # The title is derived from the merchandise name
                        fallback_title = f"精選推薦 #{press_id}"
                        merchandise = entry.get('merchandise', '')
                        if isinstance(merchandise, dict):
                            title = merchandise.get('name', '') or fallback_title
                        elif isinstance(merchandise, str) and merchandise:
                            title = f"{merchandise} 推薦排行榜"
                        else:
                            title = fallback_title

                        collected.append(MybestArticle(
                            article_id=press_id,
                            title=title,
                            category='精選推薦',
                            product_count=product_count,
                            image_url=image_url,
                            article_url=article_url,
                            description='',
                            crawled_at=datetime.now()
                        ))

                    except Exception as e:
                        logger.debug(f"[mybest] 解析首頁文章失敗: {e}")

                # Curated lists under displayableItemLists fill the remaining slots
                remaining = limit - len(collected)
                for entry in top.get('displayableItemLists', [])[:remaining]:
                    try:
                        list_id = str(entry.get('id', ''))
                        title = entry.get('title', '')
                        if not title:
                            continue

                        image_url = entry.get('thumbnailCardSquareUrl', '')
                        part_count = len(entry.get('itemParts', []))
                        intro = entry.get('introduction')
                        description = intro[:100] if intro else ''

                        collected.append(MybestArticle(
                            article_id=f"list_{list_id}",
                            title=title,
                            category='達人推薦',
                            product_count=part_count,
                            image_url=image_url,
                            article_url=f"{self.BASE_URL}/item_lists/{list_id}",
                            description=description,
                            crawled_at=datetime.now()
                        ))

                    except Exception as e:
                        logger.debug(f"[mybest] 解析精選清單失敗: {e}")

            logger.info(f"[mybest] 從首頁解析到 {len(collected)} 篇文章")
            return collected[:limit]

        except Exception as e:
            logger.error(f"[mybest] 解析首頁失敗: {e}")
            return []

    def search_articles(self, keyword: str, limit: int = 10) -> Tuple[bool, str, List[MybestArticle]]:
        """
        Search articles by keyword.

        Requests ``/search`` with ``q=keyword`` and parses the response with
        ``_parse_articles`` using 'search' as the category.

        Args:
            keyword: Search keyword.
            limit: Maximum number of articles to return.

        Returns:
            (success flag, message, list of articles)
        """
        try:
            self._rate_limit()
            logger.info(f"[mybest] 搜尋文章: {keyword}")

            page = self.session.get(
                f"{self.BASE_URL}/search",
                params={'q': keyword},
                timeout=self.timeout,
            )
            if page.status_code != 200:
                return False, f"HTTP {page.status_code}", []

            found = self._parse_articles(page.text, 'search', limit)
            if not found:
                return False, "無搜尋結果", []
            return True, f"成功取得 {len(found)} 篇文章", found

        except requests.Timeout:
            logger.error("[mybest] 搜尋超時")
            return False, "請求超時", []
        except Exception as e:
            logger.error(f"[mybest] 搜尋失敗: {e}")
            return False, str(e), []


# Module-wide crawler instance, created on first use
_crawler_instance: Optional[MybestCrawler] = None


def get_crawler() -> MybestCrawler:
    """Return the shared crawler instance, creating it on the first call."""
    global _crawler_instance
    if _crawler_instance is not None:
        return _crawler_instance
    _crawler_instance = MybestCrawler()
    return _crawler_instance


def get_mybest_articles(category: str = 'skincare', limit: int = 10) -> Tuple[bool, str, List[dict]]:
    """
    Convenience wrapper: fetch the articles of a category as plain dicts.

    Args:
        category: Category code.
        limit: Maximum number of articles to return.

    Returns:
        (success flag, message, list of article dicts)
    """
    ok, message, found = get_crawler().get_articles(category, limit)
    as_dicts = [article.to_dict() for article in found]
    return ok, message, as_dicts


def get_mybest_latest(limit: int = 20) -> Tuple[bool, str, List[dict]]:
    """
    Convenience wrapper: fetch the latest articles as plain dicts.

    Args:
        limit: Maximum number of articles to return.

    Returns:
        (success flag, message, list of article dicts)
    """
    ok, message, found = get_crawler().get_latest_presses(limit)
    as_dicts = [article.to_dict() for article in found]
    return ok, message, as_dicts


def search_mybest_articles(keyword: str, limit: int = 10) -> Tuple[bool, str, List[dict]]:
    """
    Convenience wrapper: search articles by keyword and return plain dicts.

    Args:
        keyword: Search keyword.
        limit: Maximum number of articles to return.

    Returns:
        (success flag, message, list of article dicts)
    """
    ok, message, found = get_crawler().search_articles(keyword, limit)
    as_dicts = [article.to_dict() for article in found]
    return ok, message, as_dicts


def get_mybest_categories() -> Dict[str, str]:
    """Return a copy of the category-code to display-name mapping."""
    return dict(MybestCrawler.CATEGORY_NAMES)


if __name__ == '__main__':
    # Manual smoke test: calls the three convenience functions and prints a summary
    logging.basicConfig(level=logging.INFO)

    print("=== mybest 爬蟲測試 ===\n")

    # 1) Latest articles
    print("[1] 測試最新文章")
    success, msg, articles = get_mybest_latest(limit=5)
    print(f"結果: {msg}")
    if articles:
        print("最新文章:")
        for entry in articles:
            short_title = entry['title'][:40]
            print(f"  - {short_title}... ({entry['product_count']}款商品)")

    # 2) Articles of one category
    print("\n[2] 測試分類文章 (分類: 基礎保養)")
    success, msg, articles = get_mybest_articles('skincare', limit=5)
    print(f"結果: {msg}")
    if articles:
        print("分類文章:")
        for entry in articles[:3]:
            short_title = entry['title'][:40]
            print(f"  - {short_title}...")

    # 3) Keyword search
    print("\n[3] 測試搜尋 (關鍵字: 面膜)")
    success, msg, articles = search_mybest_articles('面膜', limit=5)
    print(f"結果: {msg}")
    if articles:
        print("搜尋結果:")
        for entry in articles[:3]:
            short_title = entry['title'][:40]
            print(f"  - {short_title}...")