Some checks failed
CD Pipeline / deploy (push) Failing after 59s
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml) - 部署模式: rsync Python 檔案至 188 → docker restart (volume mount) - Dockerfile/requirements 變動時自動重建 Docker image - 部署通知: Telegram (開始/成功/失敗) - 健康檢查: https://mo.wooo.work/health (最多 5 次重試) - 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
454 lines
14 KiB
Python
454 lines
14 KiB
Python
"""
|
||
PChome 24h 爬蟲服務
|
||
|
||
爬取 PChome 24h 商品資料,支援:
|
||
- 館別頁面爬取 (如 /region/DDAB)
|
||
- 商品詳細資料取得
|
||
- 批次查詢商品 API
|
||
|
||
API 參考:
|
||
- 商品 API: https://ecapi-cdn.pchome.com.tw/cdn/ecshop/prodapi/v2/prod?id=ID1,ID2,...
|
||
- 圖片 URL: https://img.pchome.com.tw/cs{Pic.B}
|
||
"""
|
||
|
||
import re
|
||
import time
|
||
import logging
|
||
from typing import List, Dict, Optional, Tuple
|
||
from dataclasses import dataclass, asdict
|
||
from datetime import datetime
|
||
|
||
import requests
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
@dataclass
class PChomeProduct:
    """Snapshot of one PChome product as captured by the crawler."""

    product_id: str           # product ID (e.g. DDABSD-1900HIE3P)
    name: str                 # product name
    price: int                # selling price
    original_price: int       # list price
    discount: Optional[int]   # discount (%)
    image_url: str            # image URL
    product_url: str          # product page URL
    stock: int                # stock quantity
    store: str                # store code
    rating: Optional[float]   # rating
    review_count: int         # number of reviews
    is_on_sale: bool          # whether the product is currently on sale
    crawled_at: datetime      # time the record was crawled

    def to_dict(self) -> dict:
        """Return all fields as a plain dict, with ``crawled_at`` as an ISO-8601 string."""
        # Overriding an existing key keeps its position, so field order is unchanged.
        return {**asdict(self), 'crawled_at': self.crawled_at.isoformat()}
|
||
|
||
|
||
class PChomeCrawler:
    """Crawler for PChome 24h.

    Collects product IDs from a region page or the search API and resolves
    them into ``PChomeProduct`` records through the batch product API.
    Public methods report failures through a ``(success, message, data)``
    tuple; network and response-decoding errors are logged and reported via
    that tuple instead of being propagated.
    """

    # Base URLs
    BASE_URL = 'https://24h.pchome.com.tw'
    API_URL = 'https://ecapi-cdn.pchome.com.tw/cdn/ecshop/prodapi/v2/prod'
    IMAGE_BASE_URL = 'https://img.pchome.com.tw/cs'
    SEARCH_URL = 'https://ecshweb.pchome.com.tw/search/v4.3/all/results'

    # Default request headers
    DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'application/json, text/html,application/xhtml+xml',
        'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
        'Referer': 'https://24h.pchome.com.tw/',
    }

    # Product ID pattern (the dash after the 6th character is optional)
    PRODUCT_ID_PATTERN = re.compile(r'[A-Z]{4}[A-Z0-9]{2}-?[A-Z0-9]{8,10}')

    def __init__(self, timeout: int = 30, delay: float = 0.5):
        """
        Initialise the crawler.

        Args:
            timeout: request timeout in seconds
            delay: minimum interval between requests in seconds, to avoid
                hitting the site too frequently
        """
        self.timeout = timeout
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update(self.DEFAULT_HEADERS)
        self._last_request_time = 0.0

    def _rate_limit(self):
        """Sleep as needed so consecutive requests are at least ``delay`` seconds apart."""
        # A monotonic clock is unaffected by wall-clock adjustments; with
        # time.time() a backwards clock step could produce a very long sleep.
        wait = self.delay - (time.monotonic() - self._last_request_time)
        if wait > 0:
            # Never wait longer than the configured delay, whatever the stored timestamp is.
            time.sleep(min(wait, self.delay))
        self._last_request_time = time.monotonic()

    def _normalize_product_id(self, product_id: str) -> str:
        """
        Normalise a product ID.

        Args:
            product_id: raw product ID

        Returns:
            The ID with surrounding whitespace removed and, when it contains
            no dash and is longer than 6 characters, a dash inserted after
            the 6th character (format: XXXXXX-XXXXXXXX).
        """
        product_id = product_id.strip()

        # An ID that already contains a dash is returned as is
        if '-' in product_id:
            return product_id

        # Insert the dash after the 6th character
        if len(product_id) > 6:
            return f"{product_id[:6]}-{product_id[6:]}"

        return product_id

    def _extract_product_ids_from_html(self, html: str) -> List[str]:
        """
        Extract product IDs from page HTML.

        Args:
            html: page HTML content

        Returns:
            Normalised product IDs, de-duplicated, in order of first
            appearance in the page.
        """
        raw_ids = self.PRODUCT_ID_PATTERN.findall(html)
        # dict.fromkeys removes duplicates while keeping insertion order, so
        # the result is deterministic (a set would vary from run to run).
        return list(dict.fromkeys(self._normalize_product_id(pid) for pid in raw_ids))

    def fetch_region_page(self, region_code: str) -> Tuple[bool, str, List[str]]:
        """
        Crawl a region page and collect the product IDs found in it.

        Args:
            region_code: region code (e.g. DDAB)

        Returns:
            (success, message, list of product IDs)
        """
        url = f"{self.BASE_URL}/region/{region_code}"

        try:
            self._rate_limit()
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()

            product_ids = self._extract_product_ids_from_html(response.text)
            logger.info(f"從 {url} 取得 {len(product_ids)} 個商品 ID")

            return True, f"成功取得 {len(product_ids)} 個商品", product_ids

        except requests.RequestException as e:
            logger.error(f"爬取 {url} 失敗: {e}")
            return False, f"請求失敗: {str(e)}", []

    def fetch_product_details(self, product_ids: List[str], batch_size: int = 20) -> Tuple[bool, str, List["PChomeProduct"]]:
        """
        Fetch product details in batches.

        A batch whose request fails, whose body is not valid JSON, or whose
        payload is not a JSON object is logged and counted as failed; the
        remaining batches are still processed. Products that cannot be parsed
        or are skipped by ``_parse_product_data`` are counted as failed too.

        Args:
            product_ids: list of product IDs
            batch_size: number of IDs per request (the API limit is roughly 20-30)

        Returns:
            (success, message, list of products); success is True when at
            least one product was parsed.
        """
        if not product_ids:
            return False, "沒有提供商品 ID", []

        all_products = []
        failed_count = 0

        # Process the IDs batch by batch
        for batch_no, start in enumerate(range(0, len(product_ids), batch_size), start=1):
            batch = product_ids[start:start + batch_size]

            try:
                self._rate_limit()

                # Call the product API
                response = self.session.get(
                    self.API_URL,
                    params={'id': ','.join(batch)},
                    timeout=self.timeout,
                )
                response.raise_for_status()
                data = response.json()
            except (requests.RequestException, ValueError) as e:
                # ValueError covers a body that is not valid JSON.
                logger.error(f"API 請求失敗 (批次 {batch_no}): {e}")
                failed_count += len(batch)
                continue

            if not isinstance(data, dict):
                # The parsing loop below expects a mapping of key -> product payload.
                logger.error(f"API 回傳格式非預期 (批次 {batch_no}): {type(data).__name__}")
                failed_count += len(batch)
                continue

            crawled_at = datetime.now()

            # Parse each product payload
            for prod_key, prod_data in data.items():
                try:
                    product = self._parse_product_data(prod_data, crawled_at)
                except Exception as e:
                    logger.warning(f"解析商品 {prod_key} 失敗: {e}")
                    product = None

                if product is None:
                    # Skipped or unparsable products count as failures.
                    failed_count += 1
                else:
                    all_products.append(product)

            logger.info(f"批次 {batch_no}: 取得 {len(data)} 個商品資料")

        message = f"成功取得 {len(all_products)} 個商品資料"
        if failed_count > 0:
            message += f",{failed_count} 個失敗"

        return len(all_products) > 0, message, all_products

    def _parse_product_data(self, data: dict, crawled_at: datetime) -> Optional["PChomeProduct"]:
        """
        Parse one product payload returned by the product API.

        Args:
            data: product payload from the API
            crawled_at: crawl timestamp to record on the product

        Returns:
            A PChomeProduct, or None when the price is missing/zero or the
            payload cannot be parsed (the reason is logged).
        """
        try:
            product_id = data.get('Id', '')
            # Strip the trailing -000
            if product_id.endswith('-000'):
                product_id = product_id[:-4]

            # Price information
            # [2026-04-18 Taipei] Bug-1 defence, Layer C: if a PChome API change
            # makes the 'P' field disappear, the old code silently fell back to
            # zero -> $0 written to the DB -> bogus data through the whole
            # pipeline. A missing value now returns None so the caller can skip
            # the record. -- Claude Opus 4.7
            price_info = data.get('Price', {})
            if isinstance(price_info, dict):
                price = price_info.get('P')
                if price is None or price == 0:
                    logger.warning(
                        f"[PChome] Id={data.get('Id', '?')} 價格欄位 Price.P 缺失或為 0,"
                        f"疑似 API 格式變更或商品下架,跳過此筆"
                    )
                    return None
                # Fall back to the selling price when 'M' is absent, null or 0,
                # so original_price is never stored as None/0.
                original_price = price_info.get('M') or price
            else:
                price = price_info
                if not price:
                    logger.warning(
                        f"[PChome] Id={data.get('Id', '?')} 價格欄位為非 dict 且為空,跳過"
                    )
                    return None
                original_price = price

            # Discount percentage, only when the list price exceeds the selling price
            discount = None
            if original_price and original_price > price:
                discount = round((1 - price / original_price) * 100)

            # Image URL
            pic_info = data.get('Pic', {})
            if isinstance(pic_info, dict):
                pic_path = pic_info.get('B', '')
            else:
                pic_path = pic_info or ''

            image_url = f"{self.IMAGE_BASE_URL}{pic_path}" if pic_path else ''

            return PChomeProduct(
                product_id=product_id,
                name=data.get('Name', ''),
                price=price,
                original_price=original_price,
                discount=discount,
                image_url=image_url,
                product_url=f"{self.BASE_URL}/prod/{product_id}",
                # "or 0" also covers a key that is present but null
                stock=data.get('Qty') or 0,
                store=data.get('Store', ''),
                rating=data.get('RatingValue'),
                review_count=data.get('ReviewCount') or 0,
                is_on_sale=data.get('isOnSale', False),
                crawled_at=crawled_at,
            )

        except Exception as e:
            # Deliberate best effort: one malformed payload must not abort the batch.
            logger.error(f"解析商品資料失敗: {e}")
            return None

    def crawl_region(self, region_code: str) -> Tuple[bool, str, List["PChomeProduct"]]:
        """
        Crawl a region page end to end (product IDs, then details).

        Args:
            region_code: region code

        Returns:
            (success, message, list of products)
        """
        # Step 1: collect product IDs
        success, message, product_ids = self.fetch_region_page(region_code)
        if not success:
            return False, message, []

        if not product_ids:
            return False, "頁面中沒有找到商品", []

        # Step 2: fetch the details
        return self.fetch_product_details(product_ids)

    def search_products(self, keyword: str, limit: int = 50) -> Tuple[bool, str, List["PChomeProduct"]]:
        """
        Search products through the search API.

        Only the first result page is requested, so fewer than ``limit``
        products may be returned.

        Args:
            keyword: search keyword
            limit: maximum number of search hits to resolve into products

        Returns:
            (success, message, list of products)
        """
        params = {
            'q': keyword,
            'page': 1,
            'sort': 'rnk/dc',
            'cateid': '24h',
        }

        try:
            self._rate_limit()
            response = self.session.get(self.SEARCH_URL, params=params, timeout=self.timeout)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            logger.error(f"搜尋失敗: {e}")
            return False, f"搜尋失敗: {str(e)}", []

        prods = data.get('Prods') if isinstance(data, dict) else None
        if not isinstance(prods, list) or not prods:
            return False, "沒有找到符合的商品", []

        # Collect the product IDs of the first `limit` hits
        product_ids = [
            p['Id'] for p in prods[:limit]
            if isinstance(p, dict) and p.get('Id')
        ]

        # Resolve the IDs into full product records
        return self.fetch_product_details(product_ids)
|
||
|
||
|
||
# Default crawler instance, created lazily by get_crawler()
_crawler_instance = None


def get_crawler() -> PChomeCrawler:
    """Return the shared module-level crawler, creating it on first use."""
    global _crawler_instance
    if _crawler_instance is not None:
        return _crawler_instance
    _crawler_instance = PChomeCrawler()
    return _crawler_instance
|
||
|
||
|
||
# Convenience wrappers
def crawl_pchome_region(region_code: str) -> Tuple[bool, str, List[dict]]:
    """
    Crawl a PChome region page with the shared crawler.

    Args:
        region_code: region code (e.g. DDAB)

    Returns:
        (success, message, list of products as dicts)
    """
    ok, msg, items = get_crawler().crawl_region(region_code)
    as_dicts = [item.to_dict() for item in items]
    return ok, msg, as_dicts
|
||
|
||
|
||
def search_pchome_products(keyword: str, limit: int = 50) -> Tuple[bool, str, List[dict]]:
    """
    Search PChome products with the shared crawler.

    Args:
        keyword: search keyword
        limit: maximum number of results

    Returns:
        (success, message, list of products as dicts)
    """
    ok, msg, items = get_crawler().search_products(keyword, limit)
    as_dicts = [item.to_dict() for item in items]
    return ok, msg, as_dicts
|
||
|
||
|
||
def get_pchome_bestsellers(category: str, limit: int = 5) -> Tuple[bool, str, List[dict]]:
    """
    Fetch top products for a category keyword via the search API.

    Results are returned in the order the search API gives them.
    NOTE(review): the original comment described this as sales-ordered, but
    search_products requests sort='rnk/dc' -- confirm what that ordering means.

    Args:
        category: category keyword (e.g. '面膜', '乳液', '精華液')
        limit: maximum number of results

    Returns:
        (success, message, list of products in a compact dict form with the
        keys name, price, original_price, discount, url, image)
    """
    found, note, items = get_crawler().search_products(category, limit=limit)

    # Nothing usable: pass the search outcome through with an empty list
    if not (found and items):
        return found, note, []

    # Convert to the compact format
    compact = [
        {
            'name': item.name,
            'price': item.price,
            'original_price': item.original_price,
            'discount': item.discount,
            'url': item.product_url,
            'image': item.image_url,
        }
        for item in items[:limit]
    ]
    return True, f"成功取得 {len(compact)} 個熱銷商品", compact
|
||
|
||
|
||
if __name__ == '__main__':
    # Manual smoke test: runs one region crawl and one search against the live site
    logging.basicConfig(level=logging.INFO)

    print("=== PChome 爬蟲測試 ===\n")

    # Region crawl
    print("[1] 測試館別爬取 (DDAB - 美妝保養)")
    success, msg, products = crawl_pchome_region('DDAB')
    print(f"結果: {msg}")
    if products:
        print("範例商品:")
        for sample in products[:3]:
            short_name = sample['name'][:30]
            print(f" - {short_name}... ${sample['price']} (原價 ${sample['original_price']})")

    separator = "\n" + "=" * 50 + "\n"
    print(separator)

    # Search
    print("[2] 測試搜尋 (關鍵字: iPhone)")
    success, msg, products = search_pchome_products('iPhone', limit=5)
    print(f"結果: {msg}")
    if products:
        print("搜尋結果:")
        for sample in products[:3]:
            short_name = sample['name'][:30]
            print(f" - {short_name}... ${sample['price']}")
|