# ewoooc/services/trend_crawler.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
趨勢資料爬蟲模組
負責收集新聞、天氣、YouTube 熱門影片等趨勢資訊
"""

import requests
import feedparser
import logging
import os
import socket
import time
import random
from typing import List, Dict, Any, Optional
from datetime import datetime, timedelta
from dataclasses import dataclass, field
from bs4 import BeautifulSoup
import json
import re

# Timeout (seconds) applied while an RSS feed is being parsed
RSS_TIMEOUT = 15

logger = logging.getLogger(__name__)

# YouTube Data API settings
YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '')  # read from the environment; empty string when unset
YOUTUBE_API_URL = "https://www.googleapis.com/youtube/v3"

# YouTube search keywords, grouped by category
YOUTUBE_SEARCH_QUERIES = {
    "時尚美妝": ["美妝推薦", "保養心得", "化妝教學", "護膚技巧"],
    "生活居家": ["居家收納", "生活好物", "家電開箱", "清潔技巧"],
    "健康保健": ["健康飲食", "養生保健", "運動健身", "營養補充"],
}

# Central Weather Administration (CWA) open-data API.
# The key is read from the CWA_API_KEY environment variable (an application
# with CWA is required to obtain one). When the variable is unset or empty the
# placeholder below is used; TrendCrawler.fetch_weather treats any key that
# starts with "CWA-XXX" as "not configured" and skips the CWA API.
CWA_API_KEY = os.getenv('CWA_API_KEY') or "CWA-XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX"
CWA_API_URL = "https://opendata.cwa.gov.tw/api/v1/rest/datastore"

# Google News RSS search feeds (Taiwan edition), grouped by category
NEWS_RSS_FEEDS = {
    "時尚美妝": [
        "https://news.google.com/rss/search?q=美妝+保養+時尚&hl=zh-TW&gl=TW&ceid=TW:zh-Hant",
        "https://news.google.com/rss/search?q=護膚+彩妝+美白&hl=zh-TW&gl=TW&ceid=TW:zh-Hant",
    ],
    "生活居家": [
        "https://news.google.com/rss/search?q=居家+生活+家電&hl=zh-TW&gl=TW&ceid=TW:zh-Hant",
        "https://news.google.com/rss/search?q=收納+清潔+家具&hl=zh-TW&gl=TW&ceid=TW:zh-Hant",
    ],
    "健康保健": [
        "https://news.google.com/rss/search?q=健康+養生+保健&hl=zh-TW&gl=TW&ceid=TW:zh-Hant",
        "https://news.google.com/rss/search?q=營養+保健食品+運動&hl=zh-TW&gl=TW&ceid=TW:zh-Hant",
    ],
}

# PTT boards to crawl (beauty / lifestyle related), grouped by category
PTT_BOARDS = {
    "時尚美妝": ["MakeUp", "BeautySalon", "facelift"],
    "生活居家": ["Lifeismoney", "hypermall", "e-shopping"],
    "健康保健": ["FITNESS", "BeautyBody", "Health"],
}

# Dcard forums to crawl (beauty / lifestyle related), grouped by category
DCARD_FORUMS = {
    "時尚美妝": ["makeup", "skin_care", "beauty"],
    "生活居家": ["life", "home", "shopping"],
    "健康保健": ["fitness", "health", "food"],
}


@dataclass
class NewsItem:
    """A single news article collected from an RSS feed."""
    title: str
    link: str
    source: str  # publisher title taken from the feed entry; empty string when the entry has none
    published: datetime  # naive datetime; fetch_news falls back to the fetch time when the entry has no date
    category: str  # key of NEWS_RSS_FEEDS the article was fetched under
    summary: str = ""  # fetch_news stores at most the first 200 characters of the entry summary


@dataclass
class SocialPost:
    """A social-media post collected from PTT or Dcard."""
    title: str
    link: str
    source: str  # 'PTT' or 'Dcard'
    board: str   # PTT board name or Dcard forum name
    published: datetime
    category: str
    likes: int = 0  # PTT push count or Dcard like count; used as the sort key by the crawler
    comments: int = 0  # only filled in for Dcard posts by the crawler in this module


@dataclass
class YouTubeVideo:
    """A YouTube video returned by a search query."""
    title: str
    video_id: str
    channel_title: str
    published: datetime
    category: str
    thumbnail_url: str = ""
    view_count: int = 0  # not populated by fetch_youtube_trends in this module; stays 0 there
    description: str = ""

    @property
    def url(self) -> str:
        """Return the watch-page URL built from ``video_id``."""
        return f"https://www.youtube.com/watch?v={self.video_id}"


@dataclass
class WeatherInfo:
    """Weather summary for one location, plus marketing suggestions derived from it."""
    location: str
    date: str  # formatted as YYYY-MM-DD by the crawler
    weather_description: str
    min_temp: float  # degrees Celsius
    max_temp: float  # degrees Celsius
    rain_probability: int  # percent
    humidity: int = 0  # percent; left at 0 when the data source provides no humidity
    comfort: str = ""  # only filled in by the CWA code path (its 'CI' element)
    uv_index: str = ""  # NOTE(review): never assigned by the crawler code visible in this module
    marketing_suggestions: List[str] = field(default_factory=list)


@dataclass
class TrendData:
    """Aggregated snapshot of all trend sources, as produced by TrendCrawler.get_all_trends."""
    timestamp: datetime
    news_items: List[NewsItem]
    youtube_videos: List[YouTubeVideo]
    social_posts: List[SocialPost]  # PTT/Dcard posts
    weather: Optional[WeatherInfo]
    keywords: List[str]
    category_trends: Dict[str, List[str]]  # category -> titles (YouTube/social titles carry a "[...]" source prefix)


class TrendCrawler:
    """趨勢資料爬蟲"""

    def __init__(self, cwa_api_key: str = None):
        self.cwa_api_key = cwa_api_key or CWA_API_KEY
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    def fetch_news(self, categories: List[str] = None, max_per_category: int = 10,
                   time_range: str = "week") -> List[NewsItem]:
        """
        抓取新聞

        Args:
            categories: 要抓取的分類 (預設全部)
            max_per_category: 每個分類最多抓取數量
            time_range: 時間範圍 ("day", "week", "month")

        Returns:
            新聞列表
        """
        # 計算時間過濾閾值（寬鬆處理，因為 Google News RSS 的時間可能不準確）
        now = datetime.now()
        if time_range == "day":
            cutoff_time = now - timedelta(days=3)  # 寬鬆 3 天
        elif time_range == "month":
            cutoff_time = now - timedelta(days=60)  # 寬鬆 60 天
        else:  # week (預設)
            cutoff_time = now - timedelta(days=14)  # 寬鬆 14 天

        categories = categories or list(NEWS_RSS_FEEDS.keys())
        news_items = []

        for category in categories:
            if category not in NEWS_RSS_FEEDS:
                continue

            for rss_url in NEWS_RSS_FEEDS[category]:
                try:
                    # 設定 socket 超時以防止 RSS 解析卡住
                    old_timeout = socket.getdefaulttimeout()
                    socket.setdefaulttimeout(RSS_TIMEOUT)
                    try:
                        feed = feedparser.parse(rss_url)
                    finally:
                        socket.setdefaulttimeout(old_timeout)

                    for entry in feed.entries[:max_per_category * 3]:  # 多抓一些再過濾
                        # 解析發布時間
                        published = datetime.now()
                        if hasattr(entry, 'published_parsed') and entry.published_parsed:
                            published = datetime(*entry.published_parsed[:6])

                        # 時間過濾（寬鬆，因為 Google News 時間不一定準確）
                        if published < cutoff_time:
                            continue

                        # 提取來源
                        source = ""
                        if hasattr(entry, 'source') and entry.source:
                            source = entry.source.get('title', '')

                        news_items.append(NewsItem(
                            title=entry.title,
                            link=entry.link,
                            source=source,
                            published=published,
                            category=category,
                            summary=entry.get('summary', '')[:200] if entry.get('summary') else ""
                        ))

                except Exception as e:
                    logger.error(f"抓取 RSS 失敗 ({category}): {e}")

        # 依發布時間排序
        news_items.sort(key=lambda x: x.published, reverse=True)

        # 限制每個分類的數量
        category_counts = {}
        filtered_items = []
        for item in news_items:
            count = category_counts.get(item.category, 0)
            if count < max_per_category:
                filtered_items.append(item)
                category_counts[item.category] = count + 1

        logger.info(f"共抓取 {len(filtered_items)} 則新聞 (時間範圍: {time_range})")
        return filtered_items

    def fetch_weather(self, location: str = "臺北市") -> Optional[WeatherInfo]:
        """
        抓取天氣資訊（使用中央氣象署 API）

        Args:
            location: 地點名稱

        Returns:
            天氣資訊
        """
        # 先嘗試使用免費的 wttr.in API 作為備案
        try:
            return self._fetch_weather_wttr(location)
        except Exception as e:
            logger.warning(f"wttr.in 抓取失敗: {e}")

        # 如果有 API Key，嘗試使用中央氣象署 API
        if self.cwa_api_key and not self.cwa_api_key.startswith("CWA-XXX"):
            try:
                return self._fetch_weather_cwa(location)
            except Exception as e:
                logger.error(f"中央氣象署 API 抓取失敗: {e}")

        # 所有 API 都失敗時，返回預設天氣資訊（根據季節）
        logger.warning("所有天氣 API 失敗，使用預設天氣資訊")
        return self._get_default_weather(location)

    def _get_default_weather(self, location: str) -> WeatherInfo:
        """根據季節返回預設天氣資訊"""
        month = datetime.now().month

        # 台灣季節性天氣預設值
        if month in [12, 1, 2]:  # 冬季
            weather_desc = "多雲"
            min_temp, max_temp = 12.0, 18.0
            humidity = 70
            rain_prob = 30
        elif month in [3, 4, 5]:  # 春季
            weather_desc = "多雲時晴"
            min_temp, max_temp = 18.0, 25.0
            humidity = 75
            rain_prob = 40
        elif month in [6, 7, 8]:  # 夏季
            weather_desc = "晴時多雲"
            min_temp, max_temp = 26.0, 34.0
            humidity = 80
            rain_prob = 50
        else:  # 秋季 9, 10, 11
            weather_desc = "晴"
            min_temp, max_temp = 20.0, 28.0
            humidity = 65
            rain_prob = 20

        suggestions = self._generate_weather_marketing_suggestions(
            weather_desc, min_temp, max_temp, humidity, rain_prob
        )

        return WeatherInfo(
            location=location,
            date=datetime.now().strftime('%Y-%m-%d'),
            weather_description=f"{weather_desc}（預估）",
            min_temp=min_temp,
            max_temp=max_temp,
            rain_probability=rain_prob,
            humidity=humidity,
            marketing_suggestions=suggestions
        )

    def _fetch_weather_wttr(self, location: str) -> WeatherInfo:
        """使用 wttr.in 抓取天氣"""
        # 使用英文地名避免編碼問題
        location_map = {
            '臺北市': 'Taipei',
            '台北市': 'Taipei',
            '新北市': 'New+Taipei',
            '桃園市': 'Taoyuan',
            '臺中市': 'Taichung',
            '台中市': 'Taichung',
            '臺南市': 'Tainan',
            '台南市': 'Tainan',
            '高雄市': 'Kaohsiung',
        }
        query_location = location_map.get(location, 'Taipei')

        # 嘗試多種方式連接（解決某些 Docker 環境的 SSL 問題）
        urls_to_try = [
            f"https://wttr.in/{query_location}?format=j1&lang=zh-tw",
            f"http://wttr.in/{query_location}?format=j1&lang=zh-tw",
        ]

        response = None
        last_error = None
        for url in urls_to_try:
            try:
                response = requests.get(url, headers=self.headers, timeout=15, verify=True)
                response.raise_for_status()
                break
            except requests.exceptions.SSLError as e:
                last_error = e
                logger.warning(f"SSL 錯誤嘗試 {url}: {e}")
                # SSL 錯誤時嘗試下一個 URL
                continue
            except Exception as e:
                last_error = e
                logger.warning(f"連接失敗 {url}: {e}")
                continue

        if response is None:
            raise last_error or Exception("無法連接到 wttr.in")

        data = response.json()
        current = data.get('current_condition', [{}])[0]
        weather_area = data.get('nearest_area', [{}])[0]
        forecast = data.get('weather', [{}])[0]

        # 提取資訊
        weather_desc = current.get('lang_zh', [{}])[0].get('value', current.get('weatherDesc', [{}])[0].get('value', ''))
        min_temp = float(forecast.get('mintempC', 0))
        max_temp = float(forecast.get('maxtempC', 0))
        humidity = int(current.get('humidity', 0))

        # 計算降雨機率 (取最高值)
        rain_prob = 0
        for hourly in forecast.get('hourly', []):
            prob = int(hourly.get('chanceofrain', 0))
            if prob > rain_prob:
                rain_prob = prob

        # 生成行銷建議
        suggestions = self._generate_weather_marketing_suggestions(
            weather_desc, min_temp, max_temp, humidity, rain_prob
        )

        return WeatherInfo(
            location=location,
            date=datetime.now().strftime('%Y-%m-%d'),
            weather_description=weather_desc,
            min_temp=min_temp,
            max_temp=max_temp,
            rain_probability=rain_prob,
            humidity=humidity,
            marketing_suggestions=suggestions
        )

    def _fetch_weather_cwa(self, location: str) -> Optional[WeatherInfo]:
        """使用中央氣象署 API 抓取天氣"""
        # 36 小時預報 API
        url = f"{CWA_API_URL}/F-C0032-001"
        params = {
            'Authorization': self.cwa_api_key,
            'locationName': location,
            'format': 'JSON'
        }

        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()

        data = response.json()
        records = data.get('records', {})
        locations = records.get('location', [])

        if not locations:
            return None

        loc_data = locations[0]
        weather_elements = {we['elementName']: we for we in loc_data.get('weatherElement', [])}

        # 取得今日資訊
        wx = weather_elements.get('Wx', {}).get('time', [{}])[0]
        min_t = weather_elements.get('MinT', {}).get('time', [{}])[0]
        max_t = weather_elements.get('MaxT', {}).get('time', [{}])[0]
        pop = weather_elements.get('PoP', {}).get('time', [{}])[0]
        ci = weather_elements.get('CI', {}).get('time', [{}])[0]

        weather_desc = wx.get('parameter', {}).get('parameterName', '')
        min_temp = float(min_t.get('parameter', {}).get('parameterName', 0))
        max_temp = float(max_t.get('parameter', {}).get('parameterName', 0))
        rain_prob = int(pop.get('parameter', {}).get('parameterName', 0))
        comfort = ci.get('parameter', {}).get('parameterName', '')

        suggestions = self._generate_weather_marketing_suggestions(
            weather_desc, min_temp, max_temp, 0, rain_prob
        )

        return WeatherInfo(
            location=location,
            date=datetime.now().strftime('%Y-%m-%d'),
            weather_description=weather_desc,
            min_temp=min_temp,
            max_temp=max_temp,
            rain_probability=rain_prob,
            comfort=comfort,
            marketing_suggestions=suggestions
        )

    def _generate_weather_marketing_suggestions(
        self, weather_desc: str, min_temp: float, max_temp: float,
        humidity: int, rain_prob: int
    ) -> List[str]:
        """根據天氣生成行銷建議"""
        suggestions = []
        avg_temp = (min_temp + max_temp) / 2

        # 溫度相關建議
        if avg_temp < 15:
            suggestions.extend([
                "寒流來襲！保暖商品熱賣中",
                "冬季護膚：加強保濕鎖水",
                "暖呼呼居家好物推薦",
                "冬天進補，養生保健品需求增加"
            ])
        elif avg_temp < 22:
            suggestions.extend([
                "換季保養正當時",
                "早晚溫差大，注意保暖",
                "春秋薄外套熱銷季"
            ])
        elif avg_temp < 28:
            suggestions.extend([
                "舒適好天氣，戶外活動好時機",
                "輕薄透氣商品推薦"
            ])
        else:
            suggestions.extend([
                "炎炎夏日，防曬美白必備",
                "消暑降溫商品熱賣",
                "夏日控油保養推薦",
                "涼感商品需求增加"
            ])

        # 降雨相關建議
        if rain_prob > 60:
            suggestions.extend([
                "下雨天宅在家，網購好時機",
                "雨具雨傘熱賣中",
                "居家生活用品推薦",
                "室內運動器材正夯"
            ])
        elif rain_prob > 30:
            suggestions.append("外出記得帶傘，晴雨兩用傘推薦")

        # 濕度相關建議
        if humidity > 80:
            suggestions.extend([
                "潮濕天氣，除濕機熱賣",
                "防霉防潮商品推薦",
                "清爽控油保養品需求增加"
            ])
        elif humidity < 40:
            suggestions.extend([
                "乾燥天氣，加強保濕",
                "加濕器熱銷中",
                "護唇膏、護手霜需求增加"
            ])

        # 天氣描述相關
        weather_lower = weather_desc.lower()
        if any(x in weather_lower for x in ['晴', 'sunny', 'clear']):
            suggestions.append("好天氣外出，防曬不可少")
        if any(x in weather_lower for x in ['雲', 'cloudy', '陰']):
            suggestions.append("陰天也要防曬，紫外線仍存在")

        return suggestions[:6]  # 最多返回 6 個建議

    def fetch_youtube_trends(self, categories: List[str] = None, max_per_category: int = 5,
                             time_range: str = "week") -> List[YouTubeVideo]:
        """
        抓取 YouTube 熱門影片

        Args:
            categories: 要抓取的分類 (預設全部)
            max_per_category: 每個分類最多抓取數量
            time_range: 時間範圍 ("day", "week", "month")

        Returns:
            YouTube 影片列表
        """
        if not YOUTUBE_API_KEY:
            logger.warning("YouTube API Key 未設定，跳過 YouTube 趨勢抓取")
            return []

        # 計算時間過濾
        if time_range == "day":
            days_ago = 1
        elif time_range == "month":
            days_ago = 30
        else:  # week (預設)
            days_ago = 7

        published_after = (datetime.utcnow() - timedelta(days=days_ago)).strftime('%Y-%m-%dT%H:%M:%SZ')

        categories = categories or list(YOUTUBE_SEARCH_QUERIES.keys())
        videos = []

        for category in categories:
            if category not in YOUTUBE_SEARCH_QUERIES:
                continue

            for query in YOUTUBE_SEARCH_QUERIES[category][:2]:  # 每個分類最多用 2 個關鍵字
                try:
                    # 搜尋影片
                    search_url = f"{YOUTUBE_API_URL}/search"
                    params = {
                        'part': 'snippet',
                        'q': query,
                        'type': 'video',
                        'regionCode': 'TW',
                        'relevanceLanguage': 'zh-Hant',
                        'maxResults': max_per_category,
                        'order': 'viewCount',  # 按觀看次數排序
                        'publishedAfter': published_after,
                        'key': YOUTUBE_API_KEY
                    }

                    response = requests.get(search_url, params=params, timeout=10)
                    response.raise_for_status()
                    data = response.json()

                    for item in data.get('items', []):
                        snippet = item.get('snippet', {})
                        video_id = item.get('id', {}).get('videoId', '')

                        if not video_id:
                            continue

                        # 解析發布時間
                        published_str = snippet.get('publishedAt', '')
                        try:
                            published = datetime.strptime(published_str[:19], '%Y-%m-%dT%H:%M:%S')
                        except:
                            published = datetime.now()

                        videos.append(YouTubeVideo(
                            title=snippet.get('title', ''),
                            video_id=video_id,
                            channel_title=snippet.get('channelTitle', ''),
                            published=published,
                            category=category,
                            thumbnail_url=snippet.get('thumbnails', {}).get('medium', {}).get('url', ''),
                            description=snippet.get('description', '')[:200]
                        ))

                except Exception as e:
                    logger.error(f"抓取 YouTube 失敗 ({category}/{query}): {e}")

        # 去重（同一影片可能出現在多個搜尋結果中）
        seen_ids = set()
        unique_videos = []
        for video in videos:
            if video.video_id not in seen_ids:
                seen_ids.add(video.video_id)
                unique_videos.append(video)

        # 依發布時間排序
        unique_videos.sort(key=lambda x: x.published, reverse=True)

        logger.info(f"共抓取 {len(unique_videos)} 則 YouTube 影片")
        return unique_videos

    def extract_trending_keywords(self, news_items: List[NewsItem], youtube_videos: List[YouTubeVideo] = None) -> List[str]:
        """
        從新聞和 YouTube 影片中提取熱門關鍵字

        Args:
            news_items: 新聞列表
            youtube_videos: YouTube 影片列表

        Returns:
            關鍵字列表
        """
        # 合併所有標題（包含新聞和 YouTube）
        all_titles = " ".join([n.title for n in news_items])
        if youtube_videos:
            all_titles += " " + " ".join([v.title for v in youtube_videos])

        # 簡單的關鍵字提取（可以用 LLM 改進）
        # 移除常見無意義詞
        stopwords = ['的', '了', '是', '在', '和', '與', '及', '等', '也', '都', '有', '這', '那',
                     '就', '不', '人', '會', '可', '能', '要', '說', '讓', '被', '把', '給', '從',
                     '到', '為', '以', '於', '但', '而', '或', '如', '若', '因', '所', '將', '對']

        # 提取可能的關鍵字（長度 2-6 的中文詞）
        keywords = {}
        pattern = r'[\u4e00-\u9fff]{2,6}'
        matches = re.findall(pattern, all_titles)

        for word in matches:
            if word not in stopwords:
                keywords[word] = keywords.get(word, 0) + 1

        # 按頻率排序
        sorted_keywords = sorted(keywords.items(), key=lambda x: x[1], reverse=True)

        return [kw for kw, count in sorted_keywords[:20]]

    def fetch_ptt_trends(self, categories: List[str] = None, max_per_board: int = 10,
                         time_range: str = "week") -> List[SocialPost]:
        """
        抓取 PTT 熱門文章

        Args:
            categories: 要抓取的分類 (預設全部)
            max_per_board: 每個看板最多抓取數量
            time_range: 時間範圍 ("day", "week", "month")

        Returns:
            SocialPost 列表
        """
        # 計算時間過濾
        now = datetime.now()
        if time_range == "day":
            cutoff_time = now - timedelta(days=2)
        elif time_range == "month":
            cutoff_time = now - timedelta(days=45)
        else:  # week (預設)
            cutoff_time = now - timedelta(days=10)

        categories = categories or list(PTT_BOARDS.keys())
        posts = []

        for category in categories:
            if category not in PTT_BOARDS:
                continue

            for board in PTT_BOARDS[category]:
                try:
                    # 反爬蟲延遲：隨機等待 1-2 秒
                    time.sleep(random.uniform(1.0, 2.0))

                    # PTT 網頁版 URL
                    url = f"https://www.ptt.cc/bbs/{board}/index.html"

                    # 需要設定 cookies 來通過年齡驗證
                    cookies = {'over18': '1'}

                    response = requests.get(url, headers=self.headers, cookies=cookies, timeout=10)
                    response.raise_for_status()

                    soup = BeautifulSoup(response.text, 'html.parser')

                    # 找到文章列表
                    articles = soup.select('div.r-ent')

                    for article in articles[:max_per_board]:
                        try:
                            # 標題
                            title_elem = article.select_one('div.title a')
                            if not title_elem:
                                continue

                            title = title_elem.text.strip()
                            link = "https://www.ptt.cc" + title_elem['href']

                            # 推文數
                            nrec_elem = article.select_one('div.nrec span')
                            likes = 0
                            if nrec_elem:
                                nrec_text = nrec_elem.text.strip()
                                if nrec_text == '爆':
                                    likes = 100
                                elif nrec_text.startswith('X'):
                                    likes = -10
                                elif nrec_text.isdigit():
                                    likes = int(nrec_text)

                            # 日期（PTT 只顯示 月/日）
                            date_elem = article.select_one('div.date')
                            published = now
                            if date_elem:
                                date_text = date_elem.text.strip()
                                try:
                                    month_day = date_text.split('/')
                                    if len(month_day) == 2:
                                        month, day = int(month_day[0]), int(month_day[1])
                                        published = datetime(now.year, month, day)
                                        # 如果解析出來的日期在未來，說明是去年的
                                        if published > now:
                                            published = datetime(now.year - 1, month, day)
                                except:
                                    pass

                            # 時間過濾
                            if published < cutoff_time:
                                continue

                            # 過濾公告
                            if title.startswith('[公告]') or title.startswith('[徵求]'):
                                continue

                            posts.append(SocialPost(
                                title=title,
                                link=link,
                                source='PTT',
                                board=board,
                                published=published,
                                category=category,
                                likes=likes
                            ))

                        except Exception as e:
                            logger.debug(f"解析 PTT 文章失敗: {e}")
                            continue

                except Exception as e:
                    logger.error(f"抓取 PTT 看板 {board} 失敗: {e}")

        # 依推文數排序
        posts.sort(key=lambda x: x.likes, reverse=True)

        logger.info(f"共抓取 {len(posts)} 則 PTT 文章")
        return posts

    def fetch_dcard_trends(self, categories: List[str] = None, max_per_forum: int = 10,
                           time_range: str = "week") -> List[SocialPost]:
        """
        抓取 Dcard 熱門文章

        Args:
            categories: 要抓取的分類 (預設全部)
            max_per_forum: 每個版最多抓取數量
            time_range: 時間範圍 ("day", "week", "month")

        Returns:
            SocialPost 列表
        """
        # 計算時間過濾
        now = datetime.now()
        if time_range == "day":
            cutoff_time = now - timedelta(days=2)
        elif time_range == "month":
            cutoff_time = now - timedelta(days=45)
        else:  # week (預設)
            cutoff_time = now - timedelta(days=10)

        categories = categories or list(DCARD_FORUMS.keys())
        posts = []

        for category in categories:
            if category not in DCARD_FORUMS:
                continue

            for forum in DCARD_FORUMS[category]:
                try:
                    # 反爬蟲延遲：隨機等待 1-2 秒
                    time.sleep(random.uniform(1.0, 2.0))

                    # Dcard API（公開 API）
                    url = f"https://www.dcard.tw/service/api/v2/forums/{forum}/posts"
                    params = {
                        'limit': max_per_forum,
                        'popular': 'true'  # 熱門文章
                    }

                    response = requests.get(url, headers=self.headers, params=params, timeout=10)

                    # 如果 API 失敗，嘗試使用網頁抓取
                    if response.status_code != 200:
                        logger.warning(f"Dcard API 失敗 ({forum}), 嘗試網頁抓取")
                        continue

                    data = response.json()

                    for post in data:
                        try:
                            title = post.get('title', '')
                            post_id = post.get('id')
                            link = f"https://www.dcard.tw/f/{forum}/p/{post_id}"

                            # 解析時間
                            created_at = post.get('createdAt', '')
                            try:
                                # ISO 格式: 2024-01-15T10:30:00.000Z
                                published = datetime.strptime(created_at[:19], '%Y-%m-%dT%H:%M:%S')
                            except:
                                published = now

                            # 時間過濾
                            if published < cutoff_time:
                                continue

                            likes = post.get('likeCount', 0)
                            comments = post.get('commentCount', 0)

                            posts.append(SocialPost(
                                title=title,
                                link=link,
                                source='Dcard',
                                board=forum,
                                published=published,
                                category=category,
                                likes=likes,
                                comments=comments
                            ))

                        except Exception as e:
                            logger.debug(f"解析 Dcard 文章失敗: {e}")
                            continue

                except Exception as e:
                    logger.error(f"抓取 Dcard 版 {forum} 失敗: {e}")

        # 依讚數排序
        posts.sort(key=lambda x: x.likes, reverse=True)

        logger.info(f"共抓取 {len(posts)} 則 Dcard 文章")
        return posts

    def fetch_social_trends(self, categories: List[str] = None, max_per_source: int = 10,
                            time_range: str = "week") -> List[SocialPost]:
        """
        抓取所有社群平台的趨勢（PTT + Dcard）

        Args:
            categories: 要抓取的分類
            max_per_source: 每個來源最多抓取數量
            time_range: 時間範圍

        Returns:
            合併後的 SocialPost 列表
        """
        all_posts = []

        # PTT
        ptt_posts = self.fetch_ptt_trends(categories, max_per_source, time_range)
        all_posts.extend(ptt_posts)

        # Dcard
        dcard_posts = self.fetch_dcard_trends(categories, max_per_source, time_range)
        all_posts.extend(dcard_posts)

        # 依讚數排序
        all_posts.sort(key=lambda x: x.likes, reverse=True)

        return all_posts

    def get_all_trends(self, categories: List[str] = None,
                       weather_location: str = "臺北市",
                       time_range: str = "week",
                       include_social: bool = True) -> TrendData:
        """
        獲取所有趨勢資料

        Args:
            categories: 新聞分類
            weather_location: 天氣地點
            time_range: 時間範圍 ("day", "week", "month")
            include_social: 是否包含社群平台（PTT/Dcard）

        Returns:
            TrendData
        """
        # 抓取新聞
        news_items = self.fetch_news(categories, time_range=time_range)

        # 抓取 YouTube 趨勢
        youtube_videos = self.fetch_youtube_trends(categories, time_range=time_range)

        # 抓取社群趨勢（PTT/Dcard）
        social_posts = []
        if include_social:
            social_posts = self.fetch_social_trends(categories, time_range=time_range)

        # 抓取天氣
        weather = self.fetch_weather(weather_location)

        # 提取關鍵字（結合新聞、YouTube 和社群）
        keywords = self.extract_trending_keywords(news_items, youtube_videos)
        # 加入社群標題的關鍵字
        if social_posts:
            social_titles = " ".join([p.title for p in social_posts])
            pattern = r'[\u4e00-\u9fff]{2,6}'
            matches = re.findall(pattern, social_titles)
            for word in matches:
                if word not in keywords:
                    keywords.append(word)
            keywords = keywords[:25]  # 限制總數

        # 按分類整理趨勢
        category_trends = {}
        for item in news_items:
            if item.category not in category_trends:
                category_trends[item.category] = []
            category_trends[item.category].append(item.title)

        # 加入 YouTube 標題
        for video in youtube_videos:
            if video.category not in category_trends:
                category_trends[video.category] = []
            category_trends[video.category].append(f"[YT] {video.title}")

        # 加入社群標題
        for post in social_posts:
            if post.category not in category_trends:
                category_trends[post.category] = []
            category_trends[post.category].append(f"[{post.source}] {post.title}")

        return TrendData(
            timestamp=datetime.now(),
            news_items=news_items,
            youtube_videos=youtube_videos,
            social_posts=social_posts,
            weather=weather,
            keywords=keywords,
            category_trends=category_trends
        )


# Module-level shared crawler instance
trend_crawler = TrendCrawler()


if __name__ == "__main__":
    # Manual smoke test: exercises the weather, news and keyword paths
    # (performs real network requests).
    logging.basicConfig(level=logging.INFO)

    crawler = TrendCrawler()

    print("=== 測試天氣抓取 ===")
    weather = crawler.fetch_weather("臺北市")
    if weather:
        print(f"地點: {weather.location}")
        print(f"天氣: {weather.weather_description}")
        print(f"溫度: {weather.min_temp}°C ~ {weather.max_temp}°C")
        print(f"降雨機率: {weather.rain_probability}%")
        print(f"行銷建議: {weather.marketing_suggestions}")
    else:
        print("天氣抓取失敗")

    print("\n=== 測試新聞抓取 ===")
    news = crawler.fetch_news(categories=["時尚美妝"], max_per_category=5)
    for item in news[:5]:
        print(f"- {item.title}")

    print("\n=== 提取關鍵字 ===")
    keywords = crawler.extract_trending_keywords(news)
    print(f"熱門關鍵字: {keywords[:10]}")