Some checks failed
CD Pipeline / deploy (push) Failing after 59s
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml) - 部署模式: rsync Python 檔案至 188 → docker restart (volume mount) - Dockerfile/requirements 變動時自動重建 Docker image - 部署通知: Telegram (開始/成功/失敗) - 健康檢查: https://mo.wooo.work/health (最多 5 次重試) - 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
967 lines
34 KiB
Python
967 lines
34 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
趨勢資料爬蟲模組
|
||
負責收集新聞、天氣、YouTube 熱門影片等趨勢資訊
|
||
"""
|
||
|
||
import requests
|
||
import feedparser
|
||
import logging
|
||
import os
|
||
import socket
|
||
import time
|
||
import random
|
||
from typing import List, Dict, Any, Optional
|
||
from datetime import datetime, timedelta
|
||
from dataclasses import dataclass, field
|
||
from bs4 import BeautifulSoup
|
||
import json
|
||
import re
|
||
|
||
# Timeout (seconds) applied to the default socket while an RSS feed is parsed.
RSS_TIMEOUT = 15

logger = logging.getLogger(__name__)

# YouTube Data API settings; the key is read from the environment.
YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '')
YOUTUBE_API_URL = "https://www.googleapis.com/youtube/v3"

# YouTube search keywords, grouped by category.
YOUTUBE_SEARCH_QUERIES = {
    "時尚美妝": ["美妝推薦", "保養心得", "化妝教學", "護膚技巧"],
    "生活居家": ["居家收納", "生活好物", "家電開箱", "清潔技巧"],
    "健康保健": ["健康飲食", "養生保健", "運動健身", "營養補充"],
}

# Central Weather Administration (CWA) open-data API.
# The key is read from the environment, like the YouTube key. The placeholder
# default (a real key has to be applied for) keeps the previous behaviour when
# the variable is unset: callers skip CWA for keys starting with "CWA-XXX".
CWA_API_KEY = os.getenv('CWA_API_KEY', "CWA-XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
CWA_API_URL = "https://opendata.cwa.gov.tw/api/v1/rest/datastore"
|
||
|
||
# Google News RSS search endpoint (Taiwan edition, Traditional Chinese).
_GOOGLE_NEWS_SEARCH_RSS = (
    "https://news.google.com/rss/search?q={query}&hl=zh-TW&gl=TW&ceid=TW:zh-Hant"
)

# Search terms per category; each term becomes one RSS feed URL.
_NEWS_SEARCH_TERMS = {
    "時尚美妝": ["美妝+保養+時尚", "護膚+彩妝+美白"],
    "生活居家": ["居家+生活+家電", "收納+清潔+家具"],
    "健康保健": ["健康+養生+保健", "營養+保健食品+運動"],
}

# Google News RSS feeds, grouped by category.
NEWS_RSS_FEEDS = {
    category: [_GOOGLE_NEWS_SEARCH_RSS.format(query=term) for term in terms]
    for category, terms in _NEWS_SEARCH_TERMS.items()
}

# PTT boards to watch (beauty / lifestyle related), grouped by category.
PTT_BOARDS = {
    "時尚美妝": [
        "MakeUp",
        "BeautySalon",
        "facelift",
    ],
    "生活居家": [
        "Lifeismoney",
        "hypermall",
        "e-shopping",
    ],
    "健康保健": [
        "FITNESS",
        "BeautyBody",
        "Health",
    ],
}

# Dcard forums to watch (beauty / lifestyle related), grouped by category.
DCARD_FORUMS = {
    "時尚美妝": [
        "makeup",
        "skin_care",
        "beauty",
    ],
    "生活居家": [
        "life",
        "home",
        "shopping",
    ],
    "健康保健": [
        "fitness",
        "health",
        "food",
    ],
}
|
||
|
||
|
||
@dataclass
class NewsItem:
    """News item."""
    title: str
    link: str
    source: str          # name of the publishing source; may be an empty string
    published: datetime  # publication time
    category: str        # category key the item was collected under
    summary: str = ""    # optional short summary text
|
||
|
||
|
||
@dataclass
class SocialPost:
    """Social-media post item (PTT / Dcard)."""
    title: str
    link: str
    source: str  # 'PTT' or 'Dcard'
    board: str  # board / forum name
    published: datetime
    category: str
    likes: int = 0     # like / push count
    comments: int = 0  # comment count
|
||
|
||
|
||
@dataclass
class YouTubeVideo:
    """YouTube video item."""
    title: str
    video_id: str
    channel_title: str
    published: datetime
    category: str
    thumbnail_url: str = ""
    view_count: int = 0
    description: str = ""

    @property
    def url(self) -> str:
        """Watch-page URL derived from ``video_id``."""
        watch_url = "https://www.youtube.com/watch?v={}".format(self.video_id)
        return watch_url
|
||
|
||
|
||
@dataclass
class WeatherInfo:
    """Weather information."""
    location: str
    date: str
    weather_description: str
    min_temp: float
    max_temp: float
    rain_probability: int
    humidity: int = 0
    comfort: str = ""
    uv_index: str = ""
    # A fresh list per instance (default_factory avoids a shared mutable default).
    marketing_suggestions: List[str] = field(default_factory=list)
|
||
|
||
|
||
@dataclass
class TrendData:
    """Trend data."""
    timestamp: datetime
    news_items: List[NewsItem]
    youtube_videos: List[YouTubeVideo]
    social_posts: List[SocialPost]  # PTT / Dcard posts
    weather: Optional[WeatherInfo]
    keywords: List[str]
    category_trends: Dict[str, List[str]]  # category -> list of titles
|
||
|
||
|
||
class TrendCrawler:
    """Crawler that collects trend data: news RSS, weather, YouTube, PTT and Dcard."""

    # A run of 2-6 CJK unified ideographs counts as one keyword candidate.
    _KEYWORD_PATTERN = re.compile(r'[\u4e00-\u9fff]{2,6}')

    def __init__(self, cwa_api_key: Optional[str] = None):
        """
        Args:
            cwa_api_key: CWA API key; the module-level CWA_API_KEY is used
                when this is omitted or empty.
        """
        self.cwa_api_key = cwa_api_key or CWA_API_KEY
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    def fetch_news(self, categories: Optional[List[str]] = None, max_per_category: int = 10,
                   time_range: str = "week") -> "List[NewsItem]":
        """
        Fetch news from the configured RSS feeds.

        Args:
            categories: categories to fetch (all configured categories by default)
            max_per_category: maximum number of items kept per category
            time_range: "day", "week" or "month"; any other value is treated as "week"

        Returns:
            News items sorted newest first, at most ``max_per_category`` per category.
        """
        # Cutoffs are deliberately loose because Google News RSS timestamps
        # are not always accurate.
        now = datetime.now()
        if time_range == "day":
            cutoff_time = now - timedelta(days=3)
        elif time_range == "month":
            cutoff_time = now - timedelta(days=60)
        else:  # "week" (default)
            cutoff_time = now - timedelta(days=14)

        categories = categories or list(NEWS_RSS_FEEDS.keys())
        news_items = []

        for category in categories:
            if category not in NEWS_RSS_FEEDS:
                continue

            for rss_url in NEWS_RSS_FEEDS[category]:
                try:
                    # Temporarily set the default socket timeout so that a
                    # stalled feed cannot hang the parse; always restore it.
                    old_timeout = socket.getdefaulttimeout()
                    socket.setdefaulttimeout(RSS_TIMEOUT)
                    try:
                        feed = feedparser.parse(rss_url)
                    finally:
                        socket.setdefaulttimeout(old_timeout)

                    # Read more entries than needed; some are filtered out below.
                    for entry in feed.entries[:max_per_category * 3]:
                        published = datetime.now()
                        if hasattr(entry, 'published_parsed') and entry.published_parsed:
                            # NOTE(review): the parsed struct is presumably UTC while
                            # the cutoff is local time - confirm the offset is acceptable.
                            published = datetime(*entry.published_parsed[:6])

                        if published < cutoff_time:
                            continue

                        source = ""
                        if hasattr(entry, 'source') and entry.source:
                            source = entry.source.get('title', '')

                        news_items.append(NewsItem(
                            title=entry.title,
                            link=entry.link,
                            source=source,
                            published=published,
                            category=category,
                            summary=entry.get('summary', '')[:200] if entry.get('summary') else ""
                        ))

                except Exception as e:
                    logger.error(f"抓取 RSS 失敗 ({category}): {e}")

        # Newest first.
        news_items.sort(key=lambda x: x.published, reverse=True)

        # Enforce the per-category limit.
        category_counts = {}
        filtered_items = []
        for item in news_items:
            count = category_counts.get(item.category, 0)
            if count < max_per_category:
                filtered_items.append(item)
                category_counts[item.category] = count + 1

        logger.info(f"共抓取 {len(filtered_items)} 則新聞 (時間範圍: {time_range})")
        return filtered_items

    def fetch_weather(self, location: str = "臺北市") -> "Optional[WeatherInfo]":
        """
        Fetch weather information.

        wttr.in is tried first, then the CWA API when a real key is configured;
        if neither yields data, seasonal default values are returned.

        Args:
            location: location name

        Returns:
            Weather information.
        """
        try:
            return self._fetch_weather_wttr(location)
        except Exception as e:
            logger.warning(f"wttr.in 抓取失敗: {e}")

        # Only call CWA when the key is not the "CWA-XXX..." placeholder.
        if self.cwa_api_key and not self.cwa_api_key.startswith("CWA-XXX"):
            try:
                weather = self._fetch_weather_cwa(location)
                # _fetch_weather_cwa returns None when no location data came
                # back; fall through to the defaults instead of returning None.
                if weather is not None:
                    return weather
                logger.warning(f"中央氣象署 API 無 {location} 的資料")
            except Exception as e:
                logger.error(f"中央氣象署 API 抓取失敗: {e}")

        logger.warning("所有天氣 API 失敗,使用預設天氣資訊")
        return self._get_default_weather(location)

    def _get_default_weather(self, location: str) -> "WeatherInfo":
        """Return default weather values chosen by the current month (season)."""
        month = datetime.now().month

        # Seasonal defaults for Taiwan.
        if month in [12, 1, 2]:  # winter
            weather_desc = "多雲"
            min_temp, max_temp = 12.0, 18.0
            humidity = 70
            rain_prob = 30
        elif month in [3, 4, 5]:  # spring
            weather_desc = "多雲時晴"
            min_temp, max_temp = 18.0, 25.0
            humidity = 75
            rain_prob = 40
        elif month in [6, 7, 8]:  # summer
            weather_desc = "晴時多雲"
            min_temp, max_temp = 26.0, 34.0
            humidity = 80
            rain_prob = 50
        else:  # autumn: 9, 10, 11
            weather_desc = "晴"
            min_temp, max_temp = 20.0, 28.0
            humidity = 65
            rain_prob = 20

        suggestions = self._generate_weather_marketing_suggestions(
            weather_desc, min_temp, max_temp, humidity, rain_prob
        )

        return WeatherInfo(
            location=location,
            date=datetime.now().strftime('%Y-%m-%d'),
            weather_description=f"{weather_desc}(預估)",
            min_temp=min_temp,
            max_temp=max_temp,
            rain_probability=rain_prob,
            humidity=humidity,
            marketing_suggestions=suggestions
        )

    def _fetch_weather_wttr(self, location: str) -> "WeatherInfo":
        """
        Fetch weather from wttr.in.

        Raises:
            Exception: when no URL could be fetched successfully, or when the
                response does not have the expected structure.
        """
        # English place names are used in the URL to avoid encoding problems.
        # Locations missing from the map are queried as Taipei.
        location_map = {
            '臺北市': 'Taipei',
            '台北市': 'Taipei',
            '新北市': 'New+Taipei',
            '桃園市': 'Taoyuan',
            '臺中市': 'Taichung',
            '台中市': 'Taichung',
            '臺南市': 'Tainan',
            '台南市': 'Tainan',
            '高雄市': 'Kaohsiung',
        }
        query_location = location_map.get(location, 'Taipei')

        # HTTPS first, then plain HTTP (works around SSL problems seen in
        # some Docker environments).
        urls_to_try = [
            f"https://wttr.in/{query_location}?format=j1&lang=zh-tw",
            f"http://wttr.in/{query_location}?format=j1&lang=zh-tw",
        ]

        response = None
        last_error = None
        for url in urls_to_try:
            try:
                candidate = requests.get(url, headers=self.headers, timeout=15, verify=True)
                candidate.raise_for_status()
            except requests.exceptions.SSLError as e:
                last_error = e
                logger.warning(f"SSL 錯誤嘗試 {url}: {e}")
                continue
            except Exception as e:
                last_error = e
                logger.warning(f"連接失敗 {url}: {e}")
                continue
            # Only a response that passed raise_for_status() is kept, so a
            # failed attempt can never be parsed as if it had succeeded.
            response = candidate
            break

        if response is None:
            raise last_error or Exception("無法連接到 wttr.in")

        data = response.json()
        current = data.get('current_condition', [{}])[0]
        forecast = data.get('weather', [{}])[0]

        # NOTE(review): the request asks for lang=zh-tw but the lookup uses the
        # 'lang_zh' key - confirm which key wttr.in actually returns.
        english_desc = current.get('weatherDesc', [{}])[0].get('value', '')
        weather_desc = current.get('lang_zh', [{}])[0].get('value', english_desc)
        min_temp = float(forecast.get('mintempC', 0))
        max_temp = float(forecast.get('maxtempC', 0))
        humidity = int(current.get('humidity', 0))

        # Rain probability: the highest hourly value of the day.
        rain_prob = 0
        for hourly in forecast.get('hourly', []):
            prob = int(hourly.get('chanceofrain', 0))
            if prob > rain_prob:
                rain_prob = prob

        suggestions = self._generate_weather_marketing_suggestions(
            weather_desc, min_temp, max_temp, humidity, rain_prob
        )

        return WeatherInfo(
            location=location,
            date=datetime.now().strftime('%Y-%m-%d'),
            weather_description=weather_desc,
            min_temp=min_temp,
            max_temp=max_temp,
            rain_probability=rain_prob,
            humidity=humidity,
            marketing_suggestions=suggestions
        )

    def _fetch_weather_cwa(self, location: str) -> "Optional[WeatherInfo]":
        """
        Fetch weather from the CWA API (dataset F-C0032-001).

        Returns:
            Weather information, or None when the response contains no location.

        Raises:
            requests.HTTPError: on a non-success HTTP status.
        """
        url = f"{CWA_API_URL}/F-C0032-001"
        params = {
            'Authorization': self.cwa_api_key,
            'locationName': location,
            'format': 'JSON'
        }

        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()

        data = response.json()
        records = data.get('records', {})
        locations = records.get('location', [])

        if not locations:
            return None

        loc_data = locations[0]
        weather_elements = {we['elementName']: we for we in loc_data.get('weatherElement', [])}

        # First time slot of each element.
        wx = weather_elements.get('Wx', {}).get('time', [{}])[0]
        min_t = weather_elements.get('MinT', {}).get('time', [{}])[0]
        max_t = weather_elements.get('MaxT', {}).get('time', [{}])[0]
        pop = weather_elements.get('PoP', {}).get('time', [{}])[0]
        ci = weather_elements.get('CI', {}).get('time', [{}])[0]

        weather_desc = wx.get('parameter', {}).get('parameterName', '')
        min_temp = float(min_t.get('parameter', {}).get('parameterName', 0))
        max_temp = float(max_t.get('parameter', {}).get('parameterName', 0))
        rain_prob = int(pop.get('parameter', {}).get('parameterName', 0))
        comfort = ci.get('parameter', {}).get('parameterName', '')

        # Humidity is not read from this dataset; 0 is passed as "unknown".
        suggestions = self._generate_weather_marketing_suggestions(
            weather_desc, min_temp, max_temp, 0, rain_prob
        )

        return WeatherInfo(
            location=location,
            date=datetime.now().strftime('%Y-%m-%d'),
            weather_description=weather_desc,
            min_temp=min_temp,
            max_temp=max_temp,
            rain_probability=rain_prob,
            comfort=comfort,
            marketing_suggestions=suggestions
        )

    def _generate_weather_marketing_suggestions(
        self, weather_desc: str, min_temp: float, max_temp: float,
        humidity: int, rain_prob: int
    ) -> List[str]:
        """
        Build marketing suggestions from weather values.

        Args:
            weather_desc: weather description text
            min_temp: minimum temperature
            max_temp: maximum temperature
            humidity: relative humidity in percent; 0 means "unknown" and
                yields no humidity-based suggestion
            rain_prob: rain probability in percent

        Returns:
            At most 6 suggestions.
        """
        suggestions = []
        avg_temp = (min_temp + max_temp) / 2

        # Temperature.
        if avg_temp < 15:
            suggestions.extend([
                "寒流來襲!保暖商品熱賣中",
                "冬季護膚:加強保濕鎖水",
                "暖呼呼居家好物推薦",
                "冬天進補,養生保健品需求增加"
            ])
        elif avg_temp < 22:
            suggestions.extend([
                "換季保養正當時",
                "早晚溫差大,注意保暖",
                "春秋薄外套熱銷季"
            ])
        elif avg_temp < 28:
            suggestions.extend([
                "舒適好天氣,戶外活動好時機",
                "輕薄透氣商品推薦"
            ])
        else:
            suggestions.extend([
                "炎炎夏日,防曬美白必備",
                "消暑降溫商品熱賣",
                "夏日控油保養推薦",
                "涼感商品需求增加"
            ])

        # Rain.
        if rain_prob > 60:
            suggestions.extend([
                "下雨天宅在家,網購好時機",
                "雨具雨傘熱賣中",
                "居家生活用品推薦",
                "室內運動器材正夯"
            ])
        elif rain_prob > 30:
            suggestions.append("外出記得帶傘,晴雨兩用傘推薦")

        # Humidity. A value of 0 is the "unknown" placeholder (the CWA path
        # passes 0), so it must not be read as dry weather.
        if humidity > 80:
            suggestions.extend([
                "潮濕天氣,除濕機熱賣",
                "防霉防潮商品推薦",
                "清爽控油保養品需求增加"
            ])
        elif 0 < humidity < 40:
            suggestions.extend([
                "乾燥天氣,加強保濕",
                "加濕器熱銷中",
                "護唇膏、護手霜需求增加"
            ])

        # Weather description.
        weather_lower = weather_desc.lower()
        if any(x in weather_lower for x in ['晴', 'sunny', 'clear']):
            suggestions.append("好天氣外出,防曬不可少")
        if any(x in weather_lower for x in ['雲', 'cloudy', '陰']):
            suggestions.append("陰天也要防曬,紫外線仍存在")

        return suggestions[:6]

    def fetch_youtube_trends(self, categories: Optional[List[str]] = None, max_per_category: int = 5,
                             time_range: str = "week") -> "List[YouTubeVideo]":
        """
        Fetch popular YouTube videos through the YouTube Data API search endpoint.

        Args:
            categories: categories to fetch (all configured categories by default)
            max_per_category: ``maxResults`` sent with each search request
            time_range: "day", "week" or "month"; any other value is treated as "week"

        Returns:
            De-duplicated videos sorted newest first; empty when no API key is set.
        """
        if not YOUTUBE_API_KEY:
            logger.warning("YouTube API Key 未設定,跳過 YouTube 趨勢抓取")
            return []

        if time_range == "day":
            days_ago = 1
        elif time_range == "month":
            days_ago = 30
        else:  # "week" (default)
            days_ago = 7

        # UTC timestamp of `days_ago` days back, in the Z-suffixed form sent as
        # `publishedAfter`.
        published_after = time.strftime(
            '%Y-%m-%dT%H:%M:%SZ', time.gmtime(time.time() - days_ago * 86400)
        )

        categories = categories or list(YOUTUBE_SEARCH_QUERIES.keys())
        videos = []

        for category in categories:
            if category not in YOUTUBE_SEARCH_QUERIES:
                continue

            # At most two search keywords per category.
            for query in YOUTUBE_SEARCH_QUERIES[category][:2]:
                try:
                    search_url = f"{YOUTUBE_API_URL}/search"
                    params = {
                        'part': 'snippet',
                        'q': query,
                        'type': 'video',
                        'regionCode': 'TW',
                        'relevanceLanguage': 'zh-Hant',
                        'maxResults': max_per_category,
                        'order': 'viewCount',
                        'publishedAfter': published_after,
                        'key': YOUTUBE_API_KEY
                    }

                    response = requests.get(search_url, params=params, timeout=10)
                    response.raise_for_status()
                    data = response.json()

                    for item in data.get('items', []):
                        snippet = item.get('snippet', {})
                        video_id = item.get('id', {}).get('videoId', '')

                        if not video_id:
                            continue

                        # Missing or malformed timestamps fall back to "now".
                        published_str = snippet.get('publishedAt', '')
                        try:
                            published = datetime.strptime(published_str[:19], '%Y-%m-%dT%H:%M:%S')
                        except (ValueError, TypeError):
                            published = datetime.now()

                        videos.append(YouTubeVideo(
                            title=snippet.get('title', ''),
                            video_id=video_id,
                            channel_title=snippet.get('channelTitle', ''),
                            published=published,
                            category=category,
                            thumbnail_url=snippet.get('thumbnails', {}).get('medium', {}).get('url', ''),
                            description=snippet.get('description', '')[:200]
                        ))

                except Exception as e:
                    logger.error(f"抓取 YouTube 失敗 ({category}/{query}): {e}")

        # The same video can show up for several queries; keep the first hit.
        seen_ids = set()
        unique_videos = []
        for video in videos:
            if video.video_id not in seen_ids:
                seen_ids.add(video.video_id)
                unique_videos.append(video)

        unique_videos.sort(key=lambda x: x.published, reverse=True)

        logger.info(f"共抓取 {len(unique_videos)} 則 YouTube 影片")
        return unique_videos

    def extract_trending_keywords(self, news_items: "List[NewsItem]",
                                  youtube_videos: "Optional[List[YouTubeVideo]]" = None) -> List[str]:
        """
        Extract frequent keywords from news and YouTube titles.

        Args:
            news_items: news items (only ``title`` is read)
            youtube_videos: optional YouTube videos (only ``title`` is read)

        Returns:
            Up to 20 keywords, most frequent first; ties keep first-seen order.
        """
        titles = [n.title for n in news_items]
        if youtube_videos:
            titles.extend(v.title for v in youtube_videos)
        all_titles = " ".join(titles)

        # Simple extraction (could be improved with an LLM).
        # NOTE(review): every stopword is a single character while candidates
        # are 2-6 characters long, so this filter cannot match any candidate;
        # kept so the output stays unchanged.
        stopwords = {'的', '了', '是', '在', '和', '與', '及', '等', '也', '都', '有', '這', '那',
                     '就', '不', '人', '會', '可', '能', '要', '說', '讓', '被', '把', '給', '從',
                     '到', '為', '以', '於', '但', '而', '或', '如', '若', '因', '所', '將', '對'}

        keywords = {}
        for word in self._KEYWORD_PATTERN.findall(all_titles):
            if word not in stopwords:
                keywords[word] = keywords.get(word, 0) + 1

        # Stable sort by frequency, highest first.
        sorted_keywords = sorted(keywords.items(), key=lambda x: x[1], reverse=True)

        return [kw for kw, count in sorted_keywords[:20]]

    def fetch_ptt_trends(self, categories: Optional[List[str]] = None, max_per_board: int = 10,
                         time_range: str = "week") -> "List[SocialPost]":
        """
        Fetch posts from the index page of the configured PTT boards.

        Args:
            categories: categories to fetch (all configured categories by default)
            max_per_board: maximum number of index entries examined per board
            time_range: "day", "week" or "month"; any other value is treated as "week"

        Returns:
            Posts sorted by push count, highest first.
        """
        now = datetime.now()
        if time_range == "day":
            cutoff_time = now - timedelta(days=2)
        elif time_range == "month":
            cutoff_time = now - timedelta(days=45)
        else:  # "week" (default)
            cutoff_time = now - timedelta(days=10)

        categories = categories or list(PTT_BOARDS.keys())
        posts = []

        for category in categories:
            if category not in PTT_BOARDS:
                continue

            for board in PTT_BOARDS[category]:
                try:
                    # Anti-crawler delay: wait a random 1-2 seconds.
                    time.sleep(random.uniform(1.0, 2.0))

                    url = f"https://www.ptt.cc/bbs/{board}/index.html"

                    # Cookie needed to pass the age check.
                    cookies = {'over18': '1'}

                    response = requests.get(url, headers=self.headers, cookies=cookies, timeout=10)
                    response.raise_for_status()

                    soup = BeautifulSoup(response.text, 'html.parser')
                    articles = soup.select('div.r-ent')

                    for article in articles[:max_per_board]:
                        try:
                            title_elem = article.select_one('div.title a')
                            if not title_elem:
                                continue

                            title = title_elem.text.strip()
                            link = "https://www.ptt.cc" + title_elem['href']

                            # Push count: '爆' -> 100, 'X...' -> -10, digits as-is.
                            nrec_elem = article.select_one('div.nrec span')
                            likes = 0
                            if nrec_elem:
                                nrec_text = nrec_elem.text.strip()
                                if nrec_text == '爆':
                                    likes = 100
                                elif nrec_text.startswith('X'):
                                    likes = -10
                                elif nrec_text.isdigit():
                                    likes = int(nrec_text)

                            # PTT shows only month/day; an unparseable date
                            # leaves the post dated "now".
                            date_elem = article.select_one('div.date')
                            published = now
                            if date_elem:
                                date_text = date_elem.text.strip()
                                try:
                                    month_day = date_text.split('/')
                                    if len(month_day) == 2:
                                        month, day = int(month_day[0]), int(month_day[1])
                                        published = datetime(now.year, month, day)
                                        # A date in the future belongs to last year.
                                        if published > now:
                                            published = datetime(now.year - 1, month, day)
                                except ValueError:
                                    pass

                            if published < cutoff_time:
                                continue

                            # Skip announcements and requests.
                            if title.startswith('[公告]') or title.startswith('[徵求]'):
                                continue

                            posts.append(SocialPost(
                                title=title,
                                link=link,
                                source='PTT',
                                board=board,
                                published=published,
                                category=category,
                                likes=likes
                            ))

                        except Exception as e:
                            logger.debug(f"解析 PTT 文章失敗: {e}")
                            continue

                except Exception as e:
                    logger.error(f"抓取 PTT 看板 {board} 失敗: {e}")

        posts.sort(key=lambda x: x.likes, reverse=True)

        logger.info(f"共抓取 {len(posts)} 則 PTT 文章")
        return posts

    def fetch_dcard_trends(self, categories: Optional[List[str]] = None, max_per_forum: int = 10,
                           time_range: str = "week") -> "List[SocialPost]":
        """
        Fetch popular posts from the configured Dcard forums.

        Args:
            categories: categories to fetch (all configured categories by default)
            max_per_forum: ``limit`` sent with each forum request
            time_range: "day", "week" or "month"; any other value is treated as "week"

        Returns:
            Posts sorted by like count, highest first.
        """
        now = datetime.now()
        if time_range == "day":
            cutoff_time = now - timedelta(days=2)
        elif time_range == "month":
            cutoff_time = now - timedelta(days=45)
        else:  # "week" (default)
            cutoff_time = now - timedelta(days=10)

        categories = categories or list(DCARD_FORUMS.keys())
        posts = []

        for category in categories:
            if category not in DCARD_FORUMS:
                continue

            for forum in DCARD_FORUMS[category]:
                try:
                    # Anti-crawler delay: wait a random 1-2 seconds.
                    time.sleep(random.uniform(1.0, 2.0))

                    url = f"https://www.dcard.tw/service/api/v2/forums/{forum}/posts"
                    params = {
                        'limit': max_per_forum,
                        'popular': 'true'
                    }

                    response = requests.get(url, headers=self.headers, params=params, timeout=10)

                    # No fallback exists: a forum whose request fails is skipped.
                    if response.status_code != 200:
                        logger.warning(
                            f"Dcard API 失敗 ({forum}), HTTP {response.status_code}, 略過此版"
                        )
                        continue

                    data = response.json()

                    for post in data:
                        try:
                            title = post.get('title', '')
                            post_id = post.get('id')
                            link = f"https://www.dcard.tw/f/{forum}/p/{post_id}"

                            # Expected form: 2024-01-15T10:30:00.000Z; missing or
                            # malformed values fall back to "now".
                            created_at = post.get('createdAt', '')
                            try:
                                published = datetime.strptime(created_at[:19], '%Y-%m-%dT%H:%M:%S')
                            except (ValueError, TypeError):
                                published = now

                            if published < cutoff_time:
                                continue

                            # Coerce null counts to 0 so the sort below cannot
                            # fail on a None value.
                            likes = post.get('likeCount') or 0
                            comments = post.get('commentCount') or 0

                            posts.append(SocialPost(
                                title=title,
                                link=link,
                                source='Dcard',
                                board=forum,
                                published=published,
                                category=category,
                                likes=likes,
                                comments=comments
                            ))

                        except Exception as e:
                            logger.debug(f"解析 Dcard 文章失敗: {e}")
                            continue

                except Exception as e:
                    logger.error(f"抓取 Dcard 版 {forum} 失敗: {e}")

        posts.sort(key=lambda x: x.likes, reverse=True)

        logger.info(f"共抓取 {len(posts)} 則 Dcard 文章")
        return posts

    def fetch_social_trends(self, categories: Optional[List[str]] = None, max_per_source: int = 10,
                            time_range: str = "week") -> "List[SocialPost]":
        """
        Fetch trends from all social platforms (PTT + Dcard).

        Args:
            categories: categories to fetch
            max_per_source: passed as the per-board / per-forum limit
            time_range: time range

        Returns:
            Combined posts sorted by like count, highest first.
        """
        all_posts = []
        all_posts.extend(self.fetch_ptt_trends(categories, max_per_source, time_range))
        all_posts.extend(self.fetch_dcard_trends(categories, max_per_source, time_range))

        all_posts.sort(key=lambda x: x.likes, reverse=True)

        return all_posts

    def get_all_trends(self, categories: Optional[List[str]] = None,
                       weather_location: str = "臺北市",
                       time_range: str = "week",
                       include_social: bool = True) -> "TrendData":
        """
        Collect all trend data.

        Args:
            categories: categories to fetch
            weather_location: location used for the weather lookup
            time_range: "day", "week" or "month"
            include_social: whether to include social platforms (PTT / Dcard)

        Returns:
            TrendData
        """
        news_items = self.fetch_news(categories, time_range=time_range)
        youtube_videos = self.fetch_youtube_trends(categories, time_range=time_range)

        social_posts = []
        if include_social:
            social_posts = self.fetch_social_trends(categories, time_range=time_range)

        weather = self.fetch_weather(weather_location)

        # Keywords from news and YouTube titles first ...
        keywords = self.extract_trending_keywords(news_items, youtube_videos)

        # ... then unseen candidates from social titles, in order of
        # appearance, up to 25 keywords in total.
        if social_posts:
            social_titles = " ".join([p.title for p in social_posts])
            seen = set(keywords)
            for word in self._KEYWORD_PATTERN.findall(social_titles):
                if len(keywords) >= 25:
                    break
                if word not in seen:
                    seen.add(word)
                    keywords.append(word)
            keywords = keywords[:25]

        # Titles grouped by category: news, then YouTube, then social posts.
        category_trends = {}
        for item in news_items:
            category_trends.setdefault(item.category, []).append(item.title)

        for video in youtube_videos:
            category_trends.setdefault(video.category, []).append(f"[YT] {video.title}")

        for post in social_posts:
            category_trends.setdefault(post.category, []).append(f"[{post.source}] {post.title}")

        return TrendData(
            timestamp=datetime.now(),
            news_items=news_items,
            youtube_videos=youtube_videos,
            social_posts=social_posts,
            weather=weather,
            keywords=keywords,
            category_trends=category_trends
        )
|
||
|
||
|
||
# Module-level shared instance.
trend_crawler = TrendCrawler()


def _run_manual_checks() -> None:
    """Exercise the weather, news and keyword paths and print the results."""
    crawler = TrendCrawler()

    print("=== 測試天氣抓取 ===")
    weather = crawler.fetch_weather("臺北市")
    if not weather:
        print("天氣抓取失敗")
    else:
        print(f"地點: {weather.location}")
        print(f"天氣: {weather.weather_description}")
        print(f"溫度: {weather.min_temp}°C ~ {weather.max_temp}°C")
        print(f"降雨機率: {weather.rain_probability}%")
        print(f"行銷建議: {weather.marketing_suggestions}")

    print("\n=== 測試新聞抓取 ===")
    news = crawler.fetch_news(categories=["時尚美妝"], max_per_category=5)
    for item in news[:5]:
        print(f"- {item.title}")

    print("\n=== 提取關鍵字 ===")
    keywords = crawler.extract_trending_keywords(news)
    print(f"熱門關鍵字: {keywords[:10]}")


if __name__ == "__main__":
    # Manual test run.
    logging.basicConfig(level=logging.INFO)
    _run_manual_checks()
|