Files
ewoooc/services/pchome_crawler.py
ogt 1b4f3a7bbe
Some checks failed
CD Pipeline / deploy (push) Failing after 59s
feat: EwoooC 初始化 — 完整專案推版至 Gitea
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml)
- 部署模式: rsync Python 檔案至 188 → docker restart (volume mount)
- Dockerfile/requirements 變動時自動重建 Docker image
- 部署通知: Telegram (開始/成功/失敗)
- 健康檢查: https://mo.wooo.work/health (最多 5 次重試)
- 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-19 01:21:13 +08:00

454 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
PChome 24h 爬蟲服務
爬取 PChome 24h 商品資料,支援:
- 館別頁面爬取 (如 /region/DDAB)
- 商品詳細資料取得
- 批次查詢商品 API
API 參考:
- 商品 API: https://ecapi-cdn.pchome.com.tw/cdn/ecshop/prodapi/v2/prod?id=ID1,ID2,...
- 圖片 URL: https://img.pchome.com.tw/cs{Pic.B}
"""
import re
import time
import logging
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass, asdict
from datetime import datetime
import requests
logger = logging.getLogger(__name__)
@dataclass
class PChomeProduct:
    """Structured record for a single PChome product.

    Instances are built by the crawler from the product API payload and can
    be flattened to a plain dictionary with :meth:`to_dict`.
    """

    product_id: str            # Product ID (e.g. DDABSD-1900HIE3P)
    name: str                  # Product name
    price: int                 # Selling price
    original_price: int        # Original (list) price
    discount: Optional[int]    # Discount in percent, None when not discounted
    image_url: str             # Image URL
    product_url: str           # Product page URL
    stock: int                 # Quantity in stock
    store: str                 # Store code
    rating: Optional[float]    # Rating, None when the API gives none
    review_count: int          # Number of reviews
    is_on_sale: bool           # Whether the product is currently on sale
    crawled_at: datetime       # When the record was crawled

    def to_dict(self) -> dict:
        """Return the record as a dict with ``crawled_at`` as an ISO-8601 string."""
        data = asdict(self)
        # datetime objects are not JSON-friendly; store the ISO text instead.
        data['crawled_at'] = self.crawled_at.isoformat()
        return data
class PChomeCrawler:
    """Crawler for the PChome 24h storefront.

    Scrapes region pages for product IDs, resolves IDs to product details
    through the product API, and offers keyword search. All HTTP traffic goes
    through one ``requests.Session`` and is spaced out by ``delay`` seconds.
    """

    # Base URLs
    BASE_URL = 'https://24h.pchome.com.tw'
    API_URL = 'https://ecapi-cdn.pchome.com.tw/cdn/ecshop/prodapi/v2/prod'
    IMAGE_BASE_URL = 'https://img.pchome.com.tw/cs'

    # Default headers applied to the session
    DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'application/json, text/html,application/xhtml+xml',
        'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
        'Referer': 'https://24h.pchome.com.tw/',
    }

    # Regular expression matching product IDs (with or without the dash)
    PRODUCT_ID_PATTERN = re.compile(r'[A-Z]{4}[A-Z0-9]{2}-?[A-Z0-9]{8,10}')

    def __init__(self, timeout: int = 30, delay: float = 0.5):
        """Create a crawler.

        Args:
            timeout: Per-request timeout in seconds.
            delay: Minimum interval between requests in seconds, to avoid
                hitting the site too frequently.
        """
        self.timeout = timeout
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update(self.DEFAULT_HEADERS)
        # None means "no request sent yet", so the first call never sleeps.
        self._last_request_time = None

    def _rate_limit(self):
        """Sleep as needed so consecutive requests are at least ``delay`` apart."""
        if self._last_request_time is not None:
            # Monotonic clock: a wall-clock adjustment must not be able to
            # produce a negative elapsed time and an oversized sleep.
            remaining = self.delay - (time.monotonic() - self._last_request_time)
            if remaining > 0:
                time.sleep(remaining)
        self._last_request_time = time.monotonic()

    def _normalize_product_id(self, product_id: str) -> str:
        """Normalise a product ID to the dashed form.

        Args:
            product_id: Raw product ID.

        Returns:
            The stripped ID; when it has no dash and is longer than six
            characters, a dash is inserted after the sixth character
            (format: XXXXXX-XXXXXXXX).
        """
        product_id = product_id.strip()
        # Already dashed: return unchanged.
        if '-' in product_id:
            return product_id
        # Insert the dash after the sixth character.
        if len(product_id) > 6:
            return f"{product_id[:6]}-{product_id[6:]}"
        return product_id

    def _extract_product_ids_from_html(self, html: str) -> List[str]:
        """Extract product IDs from page HTML.

        Args:
            html: Page HTML content.

        Returns:
            Normalised product IDs, de-duplicated, in order of first
            appearance in ``html``.
        """
        raw_ids = self.PRODUCT_ID_PATTERN.findall(html)
        normalized = (self._normalize_product_id(pid) for pid in raw_ids)
        # dict keys keep insertion order, giving a stable de-duplicated list.
        return list(dict.fromkeys(normalized))

    def fetch_region_page(self, region_code: str) -> Tuple[bool, str, List[str]]:
        """Fetch a region page and collect the product IDs found in it.

        Args:
            region_code: Region code (e.g. DDAB).

        Returns:
            ``(success, message, product_ids)``. On a request error the
            flag is False and the list is empty.
        """
        url = f"{self.BASE_URL}/region/{region_code}"
        try:
            self._rate_limit()
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
            product_ids = self._extract_product_ids_from_html(response.text)
            logger.info(f"{url} 取得 {len(product_ids)} 個商品 ID")
            return True, f"成功取得 {len(product_ids)} 個商品", product_ids
        except requests.RequestException as e:
            logger.error(f"爬取 {url} 失敗: {e}")
            return False, f"請求失敗: {str(e)}", []

    def fetch_product_details(self, product_ids: List[str], batch_size: int = 20) -> Tuple[bool, str, List["PChomeProduct"]]:
        """Fetch product details from the product API in batches.

        Args:
            product_ids: Product IDs to look up.
            batch_size: IDs per API call (the original author notes an API
                limit of roughly 20-30). Values below 1 are treated as 1.

        Returns:
            ``(success, message, products)``. ``success`` is True when at
            least one product was parsed. The message reports the parsed
            count and, if any, the number of failed or skipped items.
        """
        if not product_ids:
            return False, "沒有提供商品 ID", []

        batch_size = max(1, batch_size)
        all_products = []
        failed_count = 0

        for i in range(0, len(product_ids), batch_size):
            batch = product_ids[i:i + batch_size]
            batch_no = i // batch_size + 1
            try:
                self._rate_limit()
                response = self.session.get(
                    self.API_URL,
                    params={'id': ','.join(batch)},
                    timeout=self.timeout
                )
                response.raise_for_status()
                data = response.json()
            except (requests.RequestException, ValueError) as e:
                # ValueError covers a body that is not valid JSON.
                logger.error(f"API 請求失敗 (批次 {batch_no}): {e}")
                failed_count += len(batch)
                continue

            if not isinstance(data, dict):
                # The parsing loop below needs an {id: product} mapping.
                logger.error(f"API 回傳格式非預期 (批次 {batch_no}): {type(data).__name__}")
                failed_count += len(batch)
                continue

            crawled_at = datetime.now()
            for prod_key, prod_data in data.items():
                try:
                    product = self._parse_product_data(prod_data, crawled_at)
                except Exception as e:
                    logger.warning(f"解析商品 {prod_key} 失敗: {e}")
                    failed_count += 1
                    continue
                if product is None:
                    # The parser already logged why; still count it so the
                    # summary does not silently hide skipped items.
                    failed_count += 1
                    continue
                all_products.append(product)
            logger.info(f"批次 {batch_no}: 取得 {len(data)} 個商品資料")

        message = f"成功取得 {len(all_products)} 個商品資料"
        if failed_count > 0:
            message += f", {failed_count} 個失敗"
        return len(all_products) > 0, message, all_products

    def _parse_product_data(self, data: dict, crawled_at: datetime) -> Optional["PChomeProduct"]:
        """Convert one product entry from the API payload into a PChomeProduct.

        Args:
            data: Product entry as returned by the API.
            crawled_at: Timestamp to record on the product.

        Returns:
            The parsed product, or None when the price is missing/zero or
            any error occurs while parsing (the reason is logged).
        """
        try:
            product_id = data.get('Id', '')
            # Drop the trailing "-000" suffix.
            if product_id.endswith('-000'):
                product_id = product_id[:-4]

            # Price information.
            # [2026-04-18 Taipei] Bug-1 defence, layer C: if a PChome API
            # change drops the 'P' field, the old code silently defaulted to
            # zero -> a $0 price was written to the DB -> bad data flowed
            # through the whole pipeline. A missing value now returns None and
            # the caller decides to skip the item. -- Claude Opus 4.7
            price_info = data.get('Price', {})
            if isinstance(price_info, dict):
                price = price_info.get('P')
                if price is None or price == 0:
                    logger.warning(
                        f"[PChome] Id={data.get('Id', '?')} 價格欄位 Price.P 缺失或為 0, "
                        f"疑似 API 格式變更或商品下架,跳過此筆"
                    )
                    return None
                # A present-but-empty 'M' falls back to the sale price, the
                # same as when the key is absent.
                original_price = price_info.get('M') or price
            else:
                price = price_info
                if not price:
                    logger.warning(
                        f"[PChome] Id={data.get('Id', '?')} 價格欄位為非 dict 且為空,跳過"
                    )
                    return None
                original_price = price

            # Discount percentage, only when the original price is higher.
            discount = None
            if original_price and original_price > price:
                discount = round((1 - price / original_price) * 100)

            # Image URL: base URL + the 'B' picture path.
            pic_info = data.get('Pic', {})
            if isinstance(pic_info, dict):
                pic_path = pic_info.get('B', '')
            else:
                pic_path = pic_info or ''
            image_url = f"{self.IMAGE_BASE_URL}{pic_path}" if pic_path else ''

            return PChomeProduct(
                product_id=product_id,
                name=data.get('Name', ''),
                price=price,
                original_price=original_price,
                discount=discount,
                image_url=image_url,
                product_url=f"{self.BASE_URL}/prod/{product_id}",
                stock=data.get('Qty', 0),
                store=data.get('Store', ''),
                rating=data.get('RatingValue'),
                review_count=data.get('ReviewCount', 0),
                is_on_sale=data.get('isOnSale', False),
                crawled_at=crawled_at
            )
        except Exception as e:
            logger.error(f"解析商品資料失敗: {e}")
            return None

    def crawl_region(self, region_code: str) -> Tuple[bool, str, List["PChomeProduct"]]:
        """Crawl a region page end to end (IDs, then details).

        Args:
            region_code: Region code.

        Returns:
            ``(success, message, products)``.
        """
        # Step 1: collect product IDs.
        success, message, product_ids = self.fetch_region_page(region_code)
        if not success:
            return False, message, []
        if not product_ids:
            return False, "頁面中沒有找到商品", []
        # Step 2: resolve the IDs to product details.
        success, message, products = self.fetch_product_details(product_ids)
        return success, message, products

    def search_products(self, keyword: str, limit: int = 50) -> Tuple[bool, str, List["PChomeProduct"]]:
        """Search products by keyword through the search API.

        Only the first result page is requested; ``limit`` caps how many of
        its entries are resolved to details.

        Args:
            keyword: Search keyword.
            limit: Maximum number of products to return.

        Returns:
            ``(success, message, products)``.
        """
        search_url = "https://ecshweb.pchome.com.tw/search/v4.3/all/results"
        params = {
            'q': keyword,
            'page': 1,
            'sort': 'rnk/dc',
            'cateid': '24h',
        }
        try:
            self._rate_limit()
            response = self.session.get(search_url, params=params, timeout=self.timeout)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            logger.error(f"搜尋失敗: {e}")
            return False, f"搜尋失敗: {str(e)}", []

        prods = data.get('Prods') if isinstance(data, dict) else None
        if not prods:
            return False, "沒有找到符合的商品", []

        # Collect the IDs of the first `limit` hits, ignoring malformed entries.
        product_ids = [
            p['Id'] for p in prods[:limit]
            if isinstance(p, dict) and p.get('Id')
        ]
        # Resolve the IDs to full product details.
        return self.fetch_product_details(product_ids)
# Shared default crawler, created lazily on first use
_crawler_instance = None


def get_crawler() -> PChomeCrawler:
    """Return the module-wide crawler, constructing it on the first call."""
    global _crawler_instance
    if _crawler_instance is not None:
        return _crawler_instance
    _crawler_instance = PChomeCrawler()
    return _crawler_instance
# Convenience functions
def crawl_pchome_region(region_code: str) -> Tuple[bool, str, List[dict]]:
    """Crawl a PChome region page using the shared crawler.

    Args:
        region_code: Region code (e.g. DDAB).

    Returns:
        ``(success, message, products)`` where each product is the
        dictionary produced by its ``to_dict()`` method.
    """
    ok, note, found = get_crawler().crawl_region(region_code)
    as_dicts = [item.to_dict() for item in found]
    return ok, note, as_dicts
def search_pchome_products(keyword: str, limit: int = 50) -> Tuple[bool, str, List[dict]]:
    """Search PChome products using the shared crawler.

    Args:
        keyword: Search keyword.
        limit: Maximum number of products to return.

    Returns:
        ``(success, message, products)`` where each product is the
        dictionary produced by its ``to_dict()`` method.
    """
    ok, note, found = get_crawler().search_products(keyword, limit)
    as_dicts = [item.to_dict() for item in found]
    return ok, note, as_dicts
def get_pchome_bestsellers(category: str, limit: int = 5) -> Tuple[bool, str, List[dict]]:
    """Return a compact list of top search hits for a category keyword.

    The category is passed to the crawler's keyword search; result order is
    whatever that search returns.

    Args:
        category: Category keyword (e.g. '面膜', '乳液', '精華液').
        limit: Maximum number of products to return.

    Returns:
        ``(success, message, products)``; each product is a dict with the
        keys ``name``, ``price``, ``original_price``, ``discount``, ``url``
        and ``image``. The list is empty when the search fails or finds
        nothing.
    """
    ok, note, found = get_crawler().search_products(category, limit=limit)

    # Nothing usable: pass the search outcome through with an empty list.
    if not (ok and found):
        return ok, note, []

    # Reduce each product to the compact field set.
    compact = [
        {
            'name': item.name,
            'price': item.price,
            'original_price': item.original_price,
            'discount': item.discount,
            'url': item.product_url,
            'image': item.image_url
        }
        for item in found[:limit]
    ]
    return True, f"成功取得 {len(compact)} 個熱銷商品", compact
if __name__ == '__main__':
    # Manual smoke test: performs live requests and prints a few results.
    logging.basicConfig(level=logging.INFO)
    print("=== PChome 爬蟲測試 ===\n")

    # Region crawl
    print("[1] 測試館別爬取 (DDAB - 美妝保養)")
    success, msg, products = crawl_pchome_region('DDAB')
    print(f"結果: {msg}")
    if products:
        print("範例商品:")
        for sample in products[:3]:
            short_name = sample['name'][:30]
            print(f" - {short_name}... ${sample['price']} (原價 ${sample['original_price']})")

    divider = "=" * 50
    print(f"\n{divider}\n")

    # Keyword search
    print("[2] 測試搜尋 (關鍵字: iPhone)")
    success, msg, products = search_pchome_products('iPhone', limit=5)
    print(f"結果: {msg}")
    if products:
        print("搜尋結果:")
        for sample in products[:3]:
            short_name = sample['name'][:30]
            print(f" - {short_name}... ${sample['price']}")