Files
ewoooc/services/pchome_crawler.py
ogt 6435bed005
Some checks failed
CD Pipeline / deploy (push) Failing after 1m2s
feat: implement missing PChome high-level comparison functions
Previously pchome_crawler.py only had low-level crawling primitives.
All high-level functions used by openclaw_bot_routes.py were missing,
causing _PCHOME_AVAILABLE = False on startup and '簡報生成失敗' errors.

Implemented:
  search_pchome(keyword, limit)        — simplified search → list of dicts
  find_best_match(keyword, momo_price) — best PChome match for a product
  compare_product(name, price, icode)  — single momo vs PChome comparison
  batch_compare_top(db, top_n, date)   — batch compare TOP-N momo hottest
  save_matches(db, results)            — persist results to pchome_matches
  ensure_tables(db)                    — idempotent table creation
  fmt_compare_msg(results, keyword)    — Telegram Markdown single-item msg
  fmt_daily_report(results, date_str)  — Telegram Markdown daily report msg

After this commit _PCHOME_AVAILABLE will be True and competitor PPT
generation will no longer throw RuntimeError.
2026-04-20 06:09:33 +08:00

745 lines
24 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
PChome 24h 爬蟲服務
爬取 PChome 24h 商品資料,支援:
- 館別頁面爬取 (如 /region/DDAB)
- 商品詳細資料取得
- 批次查詢商品 API
API 參考:
- 商品 API: https://ecapi-cdn.pchome.com.tw/cdn/ecshop/prodapi/v2/prod?id=ID1,ID2,...
- 圖片 URL: https://img.pchome.com.tw/cs{Pic.B}
"""
import re
import time
import logging
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass, asdict
from datetime import datetime
import requests
logger = logging.getLogger(__name__)
@dataclass
class PChomeProduct:
    """Snapshot of one PChome 24h product as captured by the crawler."""

    product_id: str            # product ID (e.g. DDABSD-1900HIE3P)
    name: str                  # product name
    price: int                 # selling price
    original_price: int        # list price
    discount: Optional[int]    # discount (%)
    image_url: str             # image URL
    product_url: str           # product page URL
    stock: int                 # stock quantity
    store: str                 # store code
    rating: Optional[float]    # rating
    review_count: int          # number of reviews
    is_on_sale: bool           # whether the product is currently on sale
    crawled_at: datetime       # time the record was crawled

    def to_dict(self) -> dict:
        """Return all fields as a dict, with ``crawled_at`` as an ISO-8601 string."""
        return {**asdict(self), 'crawled_at': self.crawled_at.isoformat()}
class PChomeCrawler:
    """Crawler for PChome 24h region pages, the product API and the search API.

    All HTTP traffic goes through a single ``requests.Session`` and is
    throttled so that consecutive requests are at least ``delay`` seconds apart.
    """

    # Base URLs
    BASE_URL = 'https://24h.pchome.com.tw'
    API_URL = 'https://ecapi-cdn.pchome.com.tw/cdn/ecshop/prodapi/v2/prod'
    IMAGE_BASE_URL = 'https://img.pchome.com.tw/cs'
    SEARCH_URL = 'https://ecshweb.pchome.com.tw/search/v4.3/all/results'

    # Default request headers
    DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'application/json, text/html,application/xhtml+xml',
        'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
        'Referer': 'https://24h.pchome.com.tw/',
    }

    # Product ID regular expression
    PRODUCT_ID_PATTERN = re.compile(r'[A-Z]{4}[A-Z0-9]{2}-?[A-Z0-9]{8,10}')

    def __init__(self, timeout: int = 30, delay: float = 0.5):
        """
        Initialise the crawler.

        Args:
            timeout: request timeout in seconds
            delay: minimum interval between requests in seconds, to avoid
                hitting the site too frequently
        """
        self.timeout = timeout
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update(self.DEFAULT_HEADERS)
        # -inf means "no request yet", so the first request never waits.
        self._last_request_time = float('-inf')

    def _rate_limit(self):
        """Sleep as needed so requests are at least ``self.delay`` seconds apart."""
        # The monotonic clock is immune to wall-clock adjustments (NTP, DST),
        # which could otherwise produce a negative or huge elapsed time.
        wait = self.delay - (time.monotonic() - self._last_request_time)
        if wait > 0:
            time.sleep(wait)
        self._last_request_time = time.monotonic()

    def _normalize_product_id(self, product_id: str) -> str:
        """
        Normalise a product ID.

        Args:
            product_id: raw product ID

        Returns:
            The ID in ``XXXXXX-XXXXXXXX`` form. IDs that already contain ``-``
            or are 6 characters or shorter are returned stripped but otherwise
            unchanged.
        """
        product_id = product_id.strip()
        if '-' in product_id:
            return product_id
        # Insert the dash after the 6th character.
        if len(product_id) > 6:
            return f"{product_id[:6]}-{product_id[6:]}"
        return product_id

    def _extract_product_ids_from_html(self, html: str) -> List[str]:
        """
        Extract product IDs from page HTML.

        Args:
            html: page HTML content

        Returns:
            Normalised product IDs, de-duplicated, in first-seen order.
        """
        normalized = (
            self._normalize_product_id(raw)
            for raw in self.PRODUCT_ID_PATTERN.findall(html)
        )
        # dict.fromkeys de-duplicates while keeping a deterministic order
        # (a plain set would shuffle the IDs between runs).
        return list(dict.fromkeys(normalized))

    def fetch_region_page(self, region_code: str) -> Tuple[bool, str, List[str]]:
        """
        Fetch a region page and collect the product IDs found in it.

        Args:
            region_code: region code (e.g. DDAB)

        Returns:
            (success flag, message, list of product IDs)
        """
        url = f"{self.BASE_URL}/region/{region_code}"
        try:
            self._rate_limit()
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            logger.error("爬取 %s 失敗: %s", url, e)
            return False, f"請求失敗: {str(e)}", []

        product_ids = self._extract_product_ids_from_html(response.text)
        logger.info("%s 取得 %s 個商品 ID", url, len(product_ids))
        return True, f"成功取得 {len(product_ids)} 個商品", product_ids

    def fetch_product_details(self, product_ids: List[str], batch_size: int = 20) -> Tuple[bool, str, List['PChomeProduct']]:
        """
        Fetch product details in batches from the product API.

        Args:
            product_ids: list of product IDs
            batch_size: number of IDs per API call (the original author notes
                the API accepts roughly 20-30); values below 1 are treated as 1

        Returns:
            (success flag, message, list of products). The flag is True when at
            least one product was parsed. Failed batches and products that
            could not be parsed are counted in the message.
        """
        if not product_ids:
            return False, "沒有提供商品 ID", []

        batch_size = max(1, batch_size)  # range() rejects a zero step
        all_products = []
        failed_count = 0

        starts = range(0, len(product_ids), batch_size)
        for batch_no, start in enumerate(starts, start=1):
            batch = product_ids[start:start + batch_size]
            try:
                self._rate_limit()
                response = self.session.get(
                    self.API_URL,
                    params={'id': ','.join(batch)},
                    timeout=self.timeout,
                )
                response.raise_for_status()
                data = response.json()
            except (requests.RequestException, ValueError) as e:
                # ValueError covers a body that is not valid JSON on requests
                # versions whose JSON error is not a RequestException.
                logger.error("API 請求失敗 (批次 %s): %s", batch_no, e)
                failed_count += len(batch)
                continue

            if not isinstance(data, dict):
                # The loop below needs an {id: product} mapping; any other
                # payload (e.g. an empty list) fails the batch, not the crawl.
                logger.warning(
                    "API 回傳格式非預期 (批次 %s): %s", batch_no, type(data).__name__
                )
                failed_count += len(batch)
                continue

            crawled_at = datetime.now()
            for prod_key, prod_data in data.items():
                try:
                    product = self._parse_product_data(prod_data, crawled_at)
                except Exception as e:
                    logger.warning("解析商品 %s 失敗: %s", prod_key, e)
                    product = None
                if product is None:
                    # _parse_product_data logs the reason and returns None, so
                    # dropped products must be counted here to be reported.
                    failed_count += 1
                else:
                    all_products.append(product)
            logger.info("批次 %s: 取得 %s 個商品資料", batch_no, len(data))

        message = f"成功取得 {len(all_products)} 個商品資料"
        if failed_count > 0:
            message += f",{failed_count} 個失敗"
        return len(all_products) > 0, message, all_products

    def _parse_product_data(self, data: dict, crawled_at: datetime) -> Optional['PChomeProduct']:
        """
        Convert one product record from the API into a PChomeProduct.

        Args:
            data: product record returned by the API
            crawled_at: crawl timestamp to store on the product

        Returns:
            A PChomeProduct, or None when the price is missing/zero or the
            record cannot be parsed.
        """
        try:
            product_id = data.get('Id', '')
            # Strip the trailing "-000".
            if product_id.endswith('-000'):
                product_id = product_id[:-4]

            # Defensive guard (added 2026-04-18): if the 'P' field disappears
            # after an API change, the old code silently used 0 and a $0 price
            # was written to the DB. A missing price now returns None so the
            # caller skips the record.
            price_info = data.get('Price', {})
            if isinstance(price_info, dict):
                price = price_info.get('P')
                if price is None or price == 0:
                    logger.warning(
                        "[PChome] Id=%s 價格欄位 Price.P 缺失或為 0,"
                        "疑似 API 格式變更或商品下架,跳過此筆",
                        data.get('Id', '?'),
                    )
                    return None
                # A missing, null or zero 'M' falls back to the selling price,
                # so original_price is never stored as 0 or None.
                original_price = price_info.get('M') or price
            else:
                price = price_info
                if not price:
                    logger.warning(
                        "[PChome] Id=%s 價格欄位為非 dict 且為空,跳過",
                        data.get('Id', '?'),
                    )
                    return None
                original_price = price

            # Discount percentage, only when the list price exceeds the price.
            discount = None
            if original_price > price:
                discount = round((1 - price / original_price) * 100)

            # Image URL
            pic_info = data.get('Pic', {})
            if isinstance(pic_info, dict):
                pic_path = pic_info.get('B', '')
            else:
                pic_path = pic_info or ''
            image_url = f"{self.IMAGE_BASE_URL}{pic_path}" if pic_path else ''

            return PChomeProduct(
                product_id=product_id,
                name=data.get('Name', ''),
                price=price,
                original_price=original_price,
                discount=discount,
                image_url=image_url,
                product_url=f"{self.BASE_URL}/prod/{product_id}",
                # "or 0": a key that is present with a null value would
                # otherwise store None in an int field.
                stock=data.get('Qty') or 0,
                store=data.get('Store', ''),
                rating=data.get('RatingValue'),
                review_count=data.get('ReviewCount') or 0,
                is_on_sale=data.get('isOnSale', False),
                crawled_at=crawled_at,
            )
        except Exception as e:
            logger.error("解析商品資料失敗: %s", e)
            return None

    def crawl_region(self, region_code: str) -> Tuple[bool, str, List['PChomeProduct']]:
        """
        Crawl a region page end to end (product IDs, then product details).

        Args:
            region_code: region code

        Returns:
            (success flag, message, list of products)
        """
        # Step 1: collect product IDs
        success, message, product_ids = self.fetch_region_page(region_code)
        if not success:
            return False, message, []
        if not product_ids:
            return False, "頁面中沒有找到商品", []

        # Step 2: fetch the details
        return self.fetch_product_details(product_ids)

    def search_products(self, keyword: str, limit: int = 50) -> Tuple[bool, str, List['PChomeProduct']]:
        """
        Search products through the search API.

        Only the first result page is requested; up to ``limit`` IDs from it
        are passed to ``fetch_product_details``.

        Args:
            keyword: search keyword
            limit: maximum number of products to return

        Returns:
            (success flag, message, list of products)
        """
        params = {
            'q': keyword,
            'page': 1,
            'sort': 'rnk/dc',
            'cateid': '24h',
        }
        try:
            self._rate_limit()
            response = self.session.get(self.SEARCH_URL, params=params, timeout=self.timeout)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            logger.error("搜尋失敗: %s", e)
            return False, f"搜尋失敗: {str(e)}", []

        prods = data.get('Prods') if isinstance(data, dict) else None
        if not prods:
            return False, "沒有找到符合的商品", []

        # Collect the product IDs, then fetch the details.
        product_ids = [
            p['Id'] for p in prods[:limit]
            if isinstance(p, dict) and p.get('Id')
        ]
        return self.fetch_product_details(product_ids)
# Default crawler instance (created on first use by get_crawler)
_crawler_instance = None


def get_crawler() -> PChomeCrawler:
    """Return the shared module-level crawler, creating it on the first call."""
    global _crawler_instance
    if _crawler_instance is not None:
        return _crawler_instance
    _crawler_instance = PChomeCrawler()
    return _crawler_instance
# Convenience functions
def crawl_pchome_region(region_code: str) -> Tuple[bool, str, List[dict]]:
    """
    Crawl a PChome region page with the shared crawler.

    Args:
        region_code: region code (e.g. DDAB)

    Returns:
        (success flag, message, list of product dicts)
    """
    ok, msg, items = get_crawler().crawl_region(region_code)
    as_dicts = [item.to_dict() for item in items]
    return ok, msg, as_dicts
def search_pchome_products(keyword: str, limit: int = 50) -> Tuple[bool, str, List[dict]]:
    """
    Search PChome products with the shared crawler.

    Args:
        keyword: search keyword
        limit: maximum number of products to return

    Returns:
        (success flag, message, list of product dicts)
    """
    ok, msg, items = get_crawler().search_products(keyword, limit)
    as_dicts = [item.to_dict() for item in items]
    return ok, msg, as_dicts
def get_pchome_bestsellers(category: str, limit: int = 5) -> Tuple[bool, str, List[dict]]:
    """
    Return a compact list of products for a category keyword.

    Args:
        category: category keyword (e.g. '面膜', '乳液', '精華液')
        limit: maximum number of products to return

    Returns:
        (success flag, message, list of dicts with the keys
        name / price / original_price / discount / url / image)
    """
    # NOTE(review): the original comment says results are sorted by sales
    # volume; this function only delegates to search_products, so the order is
    # whatever that search returns - confirm against the search API.
    success, message, products = get_crawler().search_products(category, limit=limit)
    if not (success and products):
        return success, message, []

    # Reduce each product to the compact shape.
    compact = [
        {
            'name': item.name,
            'price': item.price,
            'original_price': item.original_price,
            'discount': item.discount,
            'url': item.product_url,
            'image': item.image_url,
        }
        for item in products[:limit]
    ]
    return True, f"成功取得 {len(compact)} 個熱銷商品", compact
if __name__ == '__main__':
    # Manual smoke test: calls the region crawl and the search helpers and
    # prints up to three sample products from each.
    logging.basicConfig(level=logging.INFO)
    print("=== PChome 爬蟲測試 ===\n")

    # Region crawl
    print("[1] 測試館別爬取 (DDAB - 美妝保養)")
    _, region_msg, region_items = crawl_pchome_region('DDAB')
    print(f"結果: {region_msg}")
    if region_items:
        print("範例商品:")
        for item in region_items[:3]:
            print(f" - {item['name'][:30]}... ${item['price']} (原價 ${item['original_price']})")

    print("\n" + "=" * 50 + "\n")

    # Keyword search
    print("[2] 測試搜尋 (關鍵字: iPhone)")
    _, search_msg, search_items = search_pchome_products('iPhone', limit=5)
    print(f"結果: {search_msg}")
    if search_items:
        print("搜尋結果:")
        for item in search_items[:3]:
            print(f" - {item['name'][:30]}... ${item['price']}")
# =============================================================================
# 高階競品比較函數(供 openclaw_bot_routes 使用)
# =============================================================================
def search_pchome(keyword: str, limit: int = 10) -> List[dict]:
    """
    Search PChome products (simplified variant that returns a plain list).

    Args:
        keyword: search keyword
        limit: maximum number of products to return

    Returns:
        [{'name', 'price', 'url', 'in_stock'}, ...]; an empty list when the
        underlying search reports failure.
    """
    ok, _, products = search_pchome_products(keyword, limit=limit)
    if not ok:
        return []
    # "or <default>" rather than a .get() default: a key that is present with
    # a None value (e.g. stock) would otherwise raise TypeError in the
    # comparison and abort the whole search.
    return [
        {
            'name': p.get('name') or '',
            'price': p.get('price') or 0,
            'url': p.get('product_url') or '',
            'in_stock': (p.get('stock') or 0) > 0,
        }
        for p in products
    ]
def find_best_match(keyword: str, momo_price: float) -> Optional[dict]:
    """
    Pick one PChome search result for ``keyword`` as the match.

    The first 5 search results are considered and the one whose price is
    numerically closest to ``momo_price`` wins (the first such result on a
    tie). Names are not compared beyond what the search itself does.

    Returns:
        {'name', 'price', 'url', 'in_stock', 'price_diff'}, where price_diff is
        the PChome price minus ``momo_price``; None when the search is empty.
    """
    candidates = search_pchome(keyword, limit=5)
    if not candidates:
        return None

    def price_gap(candidate: dict) -> float:
        return abs(candidate['price'] - momo_price)

    closest = min(candidates, key=price_gap)
    closest['price_diff'] = closest['price'] - momo_price
    return closest
def compare_product(
    momo_name: str,
    momo_price: float,
    momo_icode: str = '',
) -> dict:
    """
    Compare a single momo product against its PChome match.

    Any exception raised while matching is logged and the "not found" result
    is returned instead.

    Returns:
        {
            'momo_name', 'momo_price', 'momo_icode',
            'found': bool,
            'pc_name', 'pc_price', 'pc_url',
            'price_diff': pc_price - momo_price (positive = PChome is more
                          expensive = momo has the advantage),
            'price_diff_pct': price_diff as a percentage of momo_price
                              (0 when momo_price is falsy)
        }
    """
    outcome: dict = {
        'momo_name': momo_name,
        'momo_price': momo_price,
        'momo_icode': momo_icode,
        'found': False,
        'pc_name': '',
        'pc_price': 0,
        'pc_url': '',
        'price_diff': 0,
        'price_diff_pct': 0.0,
    }
    try:
        hit = find_best_match(momo_name, momo_price)
        if hit:
            pc_price = float(hit.get('price', 0))
            gap = pc_price - momo_price
            gap_pct = (gap / momo_price * 100) if momo_price else 0
            # All values are computed before the dict is touched, so a failure
            # above leaves the "not found" defaults intact.
            outcome.update(
                found=True,
                pc_name=hit.get('name', ''),
                pc_price=pc_price,
                pc_url=hit.get('url', ''),
                price_diff=gap,
                price_diff_pct=gap_pct,
            )
    except Exception as e:
        logger.warning("[PChome] compare_product error: %s", e)
    return outcome
def batch_compare_top(
    db,
    top_n: int = 30,
    date_str: str = '',
) -> List[dict]:
    """
    Compare momo's TOP-N products (ranked by summed revenue) against PChome.

    Args:
        db: SQLAlchemy engine (per the original docstring, returned by _db())
        top_n: number of top momo products to compare; values <= 0 yield []
        date_str: date string in 'YYYY/MM/DD' form; when empty no date filter
            is applied and revenue is summed over all rows

    Returns:
        [compare_product() result, ...]. Query or comparison errors are logged
        and whatever was collected so far is returned.
    """
    results: List[dict] = []
    if top_n <= 0:
        # Nothing to compare; also avoids sending a zero/negative LIMIT.
        return results
    try:
        from sqlalchemy import text as _text

        date_filter = ''
        params: dict = {'limit': top_n}
        if date_str:
            # Only this constant fragment is interpolated into the SQL; the
            # date itself is passed as a bound parameter.
            date_filter = "WHERE DATE(s.date) = DATE(:date_str)"
            params['date_str'] = date_str.replace('/', '-')
        sql = f"""
            SELECT p.name, p.i_code,
            COALESCE(SUM(s.revenue), 0) AS total_rev
            FROM products p
            JOIN daily_sales s ON p.id = s.product_id
            {date_filter}
            GROUP BY p.id, p.name, p.i_code
            ORDER BY total_rev DESC
            LIMIT :limit
        """
        # Fetch everything first so the connection is released before the
        # (slow, rate-limited) crawling starts.
        with db.connect() as conn:
            rows = conn.execute(_text(sql), params).fetchall()

        for index, row in enumerate(rows):
            name, icode, rev = row[0], row[1], float(row[2] or 0)
            if index:
                time.sleep(0.4)  # throttle between items
            try:
                # NOTE(review): `rev` is the product's SUMMED REVENUE, yet it
                # is passed as the momo unit price (the original divided by the
                # no-op `max(1, 1)`, apparently a placeholder for a quantity).
                # The unit price is not available from this query; price_diff
                # values are not meaningful until the query supplies a real
                # price or quantity - confirm against the schema.
                results.append(compare_product(name, rev, icode))
            except Exception as e:
                logger.warning("[PChome] batch item error: %s", e)
    except Exception as e:
        logger.error("[PChome] batch_compare_top error: %s", e)
    return results
def save_matches(db, results: List[dict]) -> None:
    """
    Upsert comparison results into the pchome_matches table.

    Only results with a truthy 'found' are written. The table is created first
    when it does not exist. Errors are logged, not raised.

    Args:
        db: object providing ``begin()`` as a transactional context manager
            (presumably a SQLAlchemy engine - confirm against the caller)
        results: compare_product() results
    """
    rows = [
        {
            'icode': r.get('momo_icode', ''),
            'mname': r.get('momo_name', ''),
            'mprice': r.get('momo_price', 0),
            'pcname': r.get('pc_name', ''),
            'pcprice': r.get('pc_price', 0),
            'pcurl': r.get('pc_url', ''),
            'diff': r.get('price_diff', 0),
            'pct': r.get('price_diff_pct', 0),
        }
        for r in (results or [])
        if r.get('found')
    ]
    if not rows:
        # Nothing to persist: skip the DDL and the transaction entirely.
        return
    try:
        from sqlalchemy import text as _text

        ensure_tables(db)
        # NOTE(review): rows with an empty momo_icode all share the same
        # conflict key ('') and overwrite each other - confirm whether callers
        # can pass results without an i_code.
        upsert = _text("""
            INSERT INTO pchome_matches
            (momo_icode, momo_name, momo_price,
            pc_name, pc_price, pc_url,
            price_diff, price_diff_pct, matched_at)
            VALUES
            (:icode, :mname, :mprice,
            :pcname, :pcprice, :pcurl,
            :diff, :pct, NOW())
            ON CONFLICT (momo_icode) DO UPDATE SET
            momo_name = EXCLUDED.momo_name,
            momo_price = EXCLUDED.momo_price,
            pc_name = EXCLUDED.pc_name,
            pc_price = EXCLUDED.pc_price,
            pc_url = EXCLUDED.pc_url,
            price_diff = EXCLUDED.price_diff,
            price_diff_pct = EXCLUDED.price_diff_pct,
            matched_at = NOW()
        """)
        # momo_name / momo_price are refreshed on conflict as well, because
        # price_diff and price_diff_pct are derived from momo_price; leaving
        # the old momo_price in place would make the row inconsistent.
        with db.begin() as conn:
            # A list of parameter dicts is executed once per row.
            conn.execute(upsert, rows)
    except Exception as e:
        logger.warning("[PChome] save_matches error: %s", e)
def ensure_tables(db) -> None:
    """Create the pchome_matches table if it does not already exist.

    Uses CREATE TABLE IF NOT EXISTS, so repeated calls are harmless. Errors
    are logged as warnings and not raised.
    """
    ddl = """
        CREATE TABLE IF NOT EXISTS pchome_matches (
        id SERIAL PRIMARY KEY,
        momo_icode VARCHAR(64) UNIQUE,
        momo_name TEXT,
        momo_price NUMERIC(12,2),
        pc_name TEXT,
        pc_price NUMERIC(12,2),
        pc_url TEXT,
        price_diff NUMERIC(12,2),
        price_diff_pct NUMERIC(8,2),
        matched_at TIMESTAMP DEFAULT NOW()
        )
    """
    try:
        from sqlalchemy import text as _text

        with db.begin() as conn:
            conn.execute(_text(ddl))
    except Exception as e:
        logger.warning("[PChome] ensure_tables error: %s", e)
def fmt_compare_msg(results: List[dict], keyword: str = '') -> str:
    """
    Format a single-item price comparison message (Telegram Markdown).

    At most the first 5 results are rendered.

    Args:
        results: compare_product() results
        keyword: search keyword shown in the heading

    Returns:
        The message text; a "not found" notice when ``results`` is empty.
    """
    if not results:
        return f"⚠️ 找不到「{keyword}」的 PChome 比價資料"

    lines = [f"🔍 *momo vs PChome 比價|{keyword}*\n"]
    for r in results[:5]:
        # "or <default>" so a key that is present with a None value cannot
        # break slicing or number formatting.
        mname = (r.get('momo_name') or '')[:28]
        mprice = r.get('momo_price') or 0
        if not r.get('found'):
            lines.append(f"{mname}\n momo `NT${mprice:,.0f}` PChome _未找到_\n")
            continue

        pcprice = r.get('pc_price') or 0
        diff = r.get('price_diff') or 0
        pct = r.get('price_diff_pct') or 0
        pcurl = r.get('pc_url') or ''

        # Amount and percentage are kept apart; run together they read as one
        # number (e.g. "NT$10012.5%").
        gap = f"NT${abs(diff):,.0f}({abs(pct):.1f}%)"
        if diff > 10:
            icon = "✅"  # PChome is pricier -> momo has the advantage
            note = f"momo 便宜 {gap}"
        elif diff < -10:
            icon = "⚠️"  # momo is pricier
            note = f"PChome 便宜 {gap}"
        else:
            icon = ""
            note = "價差 <NT$10持平"

        heading = f"{icon} *{mname}*" if icon else f"*{mname}*"
        pc_label = f"PChome `NT${pcprice:,.0f}`"
        # Only emit a Markdown link when there is a URL to point at.
        pc_part = f"[{pc_label}]({pcurl})" if pcurl else pc_label
        lines.append(
            f"{heading}\n"
            f" momo `NT${mprice:,.0f}` "
            f"{pc_part}\n"
            f" {note}\n"
        )
    return "\n".join(lines)
def fmt_daily_report(results: List[dict], date_str: str = '') -> str:
    """
    Format the competitor daily report message (Telegram Markdown).

    A result counts as a momo advantage when price_diff > 10 and as needing
    attention when price_diff < -10. Each list shows at most the first 5
    matching results, in input order.

    Args:
        results: compare_product() results
        date_str: label for the heading; defaults to today's date (YYYY/MM/DD)

    Returns:
        The message text.
    """
    results = results or []
    found = [r for r in results if r.get('found')]
    # "or 0" guards against keys that are present with a None value.
    pc_wins = [r for r in found if (r.get('price_diff') or 0) > 10]   # PChome pricier -> momo wins
    mo_wins = [r for r in found if (r.get('price_diff') or 0) < -10]  # momo pricier -> PChome wins
    avg_pct = (
        sum((r.get('price_diff_pct') or 0) for r in found) / len(found)
        if found else 0
    )
    label = date_str or datetime.now().strftime('%Y/%m/%d')

    def item_line(r: dict, icon: str, verb: str) -> str:
        # One bullet per product; .get() keeps a partial record from raising.
        name = (r.get('momo_name') or '')[:20]
        momo_price = r.get('momo_price') or 0
        pc_price = r.get('pc_price') or 0
        diff = r.get('price_diff') or 0
        return (
            f" {icon} {name} "
            f"momo `NT${momo_price:,.0f}` vs PC `NT${pc_price:,.0f}`"
            f" {verb} NT${abs(diff):,.0f}"
        )

    lines = [
        f"📊 *競品比價日報|{label}*\n",
        f"🔢 掃描 `{len(results)}` 件 | 比對成功 `{len(found)}` 件",
        f"✅ momo 具優勢 `{len(pc_wins)}` 件 | ⚠️ 需注意 `{len(mo_wins)}` 件",
        # The legend's parenthesis is now closed.
        f"📈 平均價差 `{avg_pct:+.1f}%`(正=PChome貴=momo有優勢)\n",
    ]
    if pc_wins:
        lines.append("🏆 *momo 優勢商品TOP5*")
        # Same bullet shape as the warning list, with the legend's ✅ marker.
        lines.extend(item_line(r, "✅", "省") for r in pc_wins[:5])
        lines.append("")
    if mo_wins:
        lines.append("⚠️ *需注意商品PChome 更便宜 TOP5*")
        lines.extend(item_line(r, "⚠️", "差") for r in mo_wins[:5])
        lines.append("")
    lines.append("_資料來源PChome 24h 即時爬取_")
    return "\n".join(lines)