Some checks failed
CD Pipeline / deploy (push) Failing after 59s
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml) - 部署模式: rsync Python 檔案至 188 → docker restart (volume mount) - Dockerfile/requirements 變動時自動重建 Docker image - 部署通知: Telegram (開始/成功/失敗) - 健康檢查: https://mo.wooo.work/health (最多 5 次重試) - 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
381 lines
13 KiB
Python
381 lines
13 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
競品價格補給線 Worker (Competitor Price Feeder)
|
||
|
||
角色:獨立背景 Worker(生產者端)
|
||
架構位置:
|
||
[本 Worker — 每 4 小時跑一次] → competitor_prices DB 表
|
||
↓
|
||
[AI Pipeline] → fetch_candidates() LEFT JOIN competitor_prices(消費者端)
|
||
|
||
設計原則:
|
||
- 與 AI Pipeline 完全解耦:本 Worker 掛了不影響核心大腦
|
||
- 自帶重試機制,不阻塞主排程
|
||
- 語意化標籤 (tags) 讓 Hermes 獲得更豐富的情境
|
||
|
||
爬取邏輯:
|
||
MOMO 商品名稱 → PChome 關鍵字搜尋 → 模糊比對最佳匹配 → 寫入 competitor_prices
|
||
|
||
依賴:
|
||
services/pchome_crawler.py — 搜尋 + 批量 API
|
||
services/price_comparison.py — ProductNameParser + 模糊比對
|
||
"""
|
||
|
||
import json
|
||
import logging
|
||
import time
|
||
from dataclasses import dataclass
|
||
from datetime import datetime, timedelta, timezone
|
||
from typing import Optional
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
# ── Matching / crawl parameters ──────────────────────
MIN_MATCH_SCORE = 0.45  # matches scoring below this are not written (avoids pairing the wrong products)
SEARCH_LIMIT = 10       # number of PChome search results fetched per SKU
BATCH_SIZE = 30         # rows per DB write batch; NOTE(review): not referenced in the visible code
RATE_DELAY = 0.8        # delay between PChome requests, in seconds
TTL_HOURS = 6           # lifetime of a competitor_prices cache row, in hours
|
||
|
||
# ── Feeder 結果 ───────────────────────────────────────
|
||
@dataclass
class FeederResult:
    """Summary counters for one feeder run, as returned by ``CompetitorPriceFeeder.run``."""

    total_skus: int          # number of SKUs loaded for this run
    matched: int             # SKUs whose best match was written to competitor_prices
    skipped_no_result: int   # SKUs with no search result / no usable candidate
    skipped_low_score: int   # SKUs whose best candidate scored below MIN_MATCH_SCORE
    errors: int              # SKUs (or the initial SKU fetch) that raised an exception
    duration_sec: float      # elapsed time of the run, in seconds
|
||
|
||
|
||
def _extract_tags(pchome_product) -> list:
|
||
"""
|
||
從 PChomeProduct 物件提取語意標籤
|
||
|
||
標籤設計:
|
||
- "on_sale" — is_on_sale = True
|
||
- "discount_10pct" — 折扣 10~19%
|
||
- "discount_20pct" — 折扣 20~29%
|
||
- "discount_30pct" — 折扣 ≥ 30%
|
||
- "low_stock" — 庫存 < 10
|
||
- "high_rating" — 評分 ≥ 4.5
|
||
"""
|
||
tags = []
|
||
|
||
if pchome_product.is_on_sale:
|
||
tags.append("on_sale")
|
||
|
||
try:
|
||
disc = int(pchome_product.discount or 0)
|
||
except (ValueError, TypeError):
|
||
disc = 0
|
||
if disc >= 30:
|
||
tags.append("discount_30pct")
|
||
elif disc >= 20:
|
||
tags.append("discount_20pct")
|
||
elif disc >= 10:
|
||
tags.append("discount_10pct")
|
||
|
||
try:
|
||
stock = int(pchome_product.stock) if pchome_product.stock is not None else None
|
||
if stock is not None and 0 < stock < 10:
|
||
tags.append("low_stock")
|
||
except (ValueError, TypeError):
|
||
pass
|
||
|
||
try:
|
||
if pchome_product.rating and float(pchome_product.rating) >= 4.5:
|
||
tags.append("high_rating")
|
||
except (ValueError, TypeError):
|
||
pass
|
||
|
||
return tags
|
||
|
||
|
||
def _find_best_match(momo_name: str, pchome_products: list) -> Optional[tuple]:
|
||
"""
|
||
從 PChome 搜尋結果中找出與 MOMO 商品名稱最接近的一筆
|
||
|
||
Args:
|
||
momo_name: MOMO 商品名稱
|
||
pchome_products: PChomeProduct 列表
|
||
|
||
Returns:
|
||
(PChomeProduct, score) or None
|
||
"""
|
||
try:
|
||
from services.price_comparison import ProductNameParser
|
||
parser = ProductNameParser()
|
||
except ImportError:
|
||
# Fallback:用 difflib 直接比字串
|
||
from difflib import SequenceMatcher
|
||
best, best_score = None, 0.0
|
||
for p in pchome_products:
|
||
score = SequenceMatcher(None, momo_name.lower(), p.name.lower()).ratio()
|
||
if score > best_score:
|
||
best, best_score = p, score
|
||
return (best, best_score) if best else None
|
||
|
||
# 使用 ProductNameParser 的結構化比對
|
||
momo_parsed = parser.parse(momo_name, "momo", 0, "", "")
|
||
|
||
best, best_score = None, 0.0
|
||
for p in pchome_products:
|
||
pchome_parsed = parser.parse(p.name, "pchome", p.price, p.product_id, p.product_url)
|
||
score = _structural_similarity(momo_parsed, pchome_parsed)
|
||
if score > best_score:
|
||
best, best_score = p, score
|
||
|
||
return (best, best_score) if best else None
|
||
|
||
|
||
def _structural_similarity(momo_p, pchome_p) -> float:
|
||
"""
|
||
結構化相似度計算(品牌 + 規格 + 關鍵字)
|
||
|
||
權重:品牌匹配 0.4 + 規格匹配 0.3 + 關鍵字相似 0.3
|
||
"""
|
||
from difflib import SequenceMatcher
|
||
|
||
score = 0.0
|
||
|
||
# 品牌比對 (0.4)
|
||
if momo_p.brand and pchome_p.brand:
|
||
if momo_p.brand == pchome_p.brand:
|
||
score += 0.4
|
||
elif momo_p.brand in pchome_p.brand or pchome_p.brand in momo_p.brand:
|
||
score += 0.2
|
||
elif not momo_p.brand and not pchome_p.brand:
|
||
score += 0.1 # 都沒有品牌,不扣分
|
||
|
||
# 規格比對 (0.3) — 容量/克重
|
||
momo_specs = momo_p.specs or {}
|
||
pchome_specs = pchome_p.specs or {}
|
||
if momo_specs and pchome_specs:
|
||
matching_specs = sum(
|
||
1 for k, v in momo_specs.items()
|
||
if pchome_specs.get(k) == v
|
||
)
|
||
total_specs = max(len(momo_specs), len(pchome_specs), 1)
|
||
score += 0.3 * (matching_specs / total_specs)
|
||
elif not momo_specs and not pchome_specs:
|
||
score += 0.15
|
||
|
||
# 關鍵字相似度 (0.3)
|
||
momo_kws = " ".join(momo_p.keywords or [])
|
||
pchome_kws = " ".join(pchome_p.keywords or [])
|
||
if momo_kws and pchome_kws:
|
||
kw_sim = SequenceMatcher(None, momo_kws.lower(), pchome_kws.lower()).ratio()
|
||
score += 0.3 * kw_sim
|
||
|
||
return round(score, 3)
|
||
|
||
|
||
class CompetitorPriceFeeder:
    """
    Competitor price feeder worker.

    Usage:
        feeder = CompetitorPriceFeeder(engine=db_engine)
        result = feeder.run(source="pchome")
    """

    def __init__(self, engine=None):
        # SQLAlchemy engine; must be injected before any DB-backed method is used.
        self.engine = engine

    def _require_engine(self):
        """Return the injected engine, raising RuntimeError when none was provided."""
        if self.engine is None:
            raise RuntimeError("需要注入 SQLAlchemy engine")
        return self.engine

    def _fetch_active_skus(self) -> list:
        """
        Load the ACTIVE products to monitor from the ``products`` table.

        Only products that have at least one ``price_records`` row are returned
        (inner join), ordered by ``i_code``.

        Returns:
            list of {"sku": str, "name": str, "category": str}

        Raises:
            RuntimeError: no engine was injected.
        """
        engine = self._require_engine()

        from sqlalchemy import text
        sql = text("""
            SELECT DISTINCT p.i_code AS sku, p.name, p.category
            FROM products p
            JOIN price_records pr ON pr.product_id = p.id
            WHERE p.status = 'ACTIVE'
            ORDER BY p.i_code
        """)
        with engine.connect() as conn:
            rows = conn.execute(sql).fetchall()
        return [dict(r._mapping) for r in rows]

    def _upsert_competitor_price(
        self,
        sku: str,
        product,  # PChomeProduct
        match_score: float,
        tags: list,
        source: str = "pchome",
    ):
        """
        Insert or update one ``competitor_prices`` row, keyed on (sku, source).

        Args:
            sku: MOMO product code.
            product: PChomeProduct-like object; ``price``, ``original_price``,
                ``discount``, ``product_id`` and ``name`` are read.
            match_score: similarity score of the match.
            tags: semantic tags, stored as a JSON string.
            source: competitor source code.

        Raises:
            RuntimeError: no engine was injected.
        """
        engine = self._require_engine()

        from sqlalchemy import text
        # NOTE(review): expires_at is a naive UTC+8 wall-clock string, while
        # crawled_at uses the database's CURRENT_TIMESTAMP. Confirm the DB
        # session time zone is UTC+8 too, otherwise the effective TTL is skewed.
        _taipei = timezone(timedelta(hours=8))
        expires_at = (datetime.now(_taipei) + timedelta(hours=TTL_HOURS)).strftime("%Y-%m-%d %H:%M:%S")
        with engine.begin() as conn:
            conn.execute(text("""
                INSERT INTO competitor_prices
                    (sku, source, price, original_price, discount_pct,
                     competitor_product_id, competitor_product_name,
                     match_score, tags, crawled_at, expires_at)
                VALUES
                    (:sku, :source, :price, :original_price, :discount_pct,
                     :comp_id, :comp_name,
                     :match_score, :tags, CURRENT_TIMESTAMP, :expires_at)
                ON CONFLICT (sku, source) DO UPDATE
                    SET price = EXCLUDED.price,
                        original_price = EXCLUDED.original_price,
                        discount_pct = EXCLUDED.discount_pct,
                        competitor_product_id = EXCLUDED.competitor_product_id,
                        competitor_product_name = EXCLUDED.competitor_product_name,
                        match_score = EXCLUDED.match_score,
                        tags = EXCLUDED.tags,
                        crawled_at = CURRENT_TIMESTAMP,
                        expires_at = :expires_at
            """), {
                "sku": sku,
                "source": source,
                "price": product.price,
                "original_price": product.original_price,
                "discount_pct": product.discount,
                "comp_id": product.product_id,
                "comp_name": product.name[:200],  # truncated to 200 characters
                "match_score": match_score,
                "tags": json.dumps(tags, ensure_ascii=False),
                "expires_at": expires_at,
            })

    def run(self, source: str = "pchome") -> "FeederResult":
        """
        Run one round of competitor price crawling and writing.

        For every monitored SKU: search PChome with the first 20 characters of
        the product name, pick the best match, skip it if the score is below
        ``MIN_MATCH_SCORE``, otherwise upsert it. A failure on one SKU is
        logged and counted; it never aborts the remaining SKUs.

        Args:
            source: competitor source code (only 'pchome' is supported).

        Returns:
            FeederResult
        """
        start = time.time()

        if source != "pchome":
            logger.warning(f"[Feeder] 尚未支援 source={source},跳過")
            return FeederResult(0, 0, 0, 0, 0, 0.0)

        from services.pchome_crawler import PChomeCrawler
        crawler = PChomeCrawler(timeout=30, delay=RATE_DELAY)

        # Step 1: load the monitoring list.
        try:
            skus = self._fetch_active_skus()
        except Exception as e:
            logger.error(f"[Feeder] 讀取商品清單失敗: {e}")
            # Rounded like the normal completion path for a consistent result shape.
            return FeederResult(0, 0, 0, 0, 1, round(time.time() - start, 2))

        logger.info(f"[Feeder] 開始抓取 {len(skus)} 支商品的 PChome 競品價格")

        matched = 0
        skipped_no = 0
        skipped_low = 0
        errors = 0

        for item in skus:
            sku = item["sku"]

            try:
                momo_name = item["name"]

                # Search with the first 20 characters only (keeps the query short).
                # Built inside the try so a NULL name cannot abort the whole run.
                keyword = (momo_name or "")[:20].strip()
                if not keyword:
                    logger.debug(f"[Feeder] {sku} 商品名稱為空,跳過")
                    skipped_no += 1
                    continue

                ok, _, products = crawler.search_products(keyword, limit=SEARCH_LIMIT)
                if not ok or not products:
                    logger.debug(f"[Feeder] {sku} 無搜尋結果,跳過")
                    skipped_no += 1
                    continue

                result = _find_best_match(momo_name, products)
                if not result:
                    skipped_no += 1
                    continue

                best_product, score = result

                if score < MIN_MATCH_SCORE:
                    logger.debug(
                        f"[Feeder] {sku} 比對分數過低 ({score:.3f} < {MIN_MATCH_SCORE}),跳過"
                    )
                    skipped_low += 1
                    continue

                tags = _extract_tags(best_product)
                self._upsert_competitor_price(sku, best_product, score, tags, source)
                matched += 1
                logger.debug(
                    f"[Feeder] {sku} → PChome ${best_product.price} "
                    f"score={score:.3f} tags={tags}"
                )

            except Exception as e:
                # Per-SKU isolation: record the failure and move on.
                logger.error(f"[Feeder] {sku} 處理失敗: {e}")
                errors += 1

        duration = round(time.time() - start, 2)
        logger.info(
            f"[Feeder] 完成 matched={matched} skipped_no={skipped_no} "
            f"skipped_low={skipped_low} errors={errors} 耗時={duration}s"
        )
        return FeederResult(
            total_skus=len(skus),
            matched=matched,
            skipped_no_result=skipped_no,
            skipped_low_score=skipped_low,
            errors=errors,
            duration_sec=duration,
        )
|
||
|
||
|
||
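# ─────────────────────────────────────────────
# CLI smoke test (no DB required; exercises the crawler + matching logic directly).
# Run from the project root so the `services` package is importable:
#   python3 -m services.competitor_price_feeder
# (`python3 services/competitor_price_feeder.py` only works when the project
#  root is on PYTHONPATH, because the script imports `services.pchome_crawler`.)
# ─────────────────────────────────────────────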
|
||
if __name__ == "__main__":
    # Manual smoke test: searches PChome for a few sample products and prints
    # the best match, its score and tags. Nothing is written to the database.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")

    from services.pchome_crawler import PChomeCrawler

    test_items = [
        {"sku": "A003", "name": "舒特膚AD乳液200ml"},
        {"sku": "A001", "name": "玻尿酸面膜10片裝"},
        {"sku": "A009", "name": "美白化妝水150ml"},
    ]

    # Use the module-level tuning constants so this check runs with the same
    # request delay, result limit and keyword handling as CompetitorPriceFeeder.run().
    crawler = PChomeCrawler(delay=RATE_DELAY)

    print("=== Competitor Price Feeder CLI 測試 ===\n")
    for item in test_items:
        keyword = item["name"][:20].strip()
        ok, msg, products = crawler.search_products(keyword, limit=SEARCH_LIMIT)

        if not ok or not products:
            print(f"[{item['sku']}] 無結果: {msg}")
            continue

        result = _find_best_match(item["name"], products)
        if not result:
            print(f"[{item['sku']}] 無法比對")
            continue

        best, score = result
        tags = _extract_tags(best)
        # Flag matches that the real worker would reject as too weak.
        symbol = "✅" if score >= MIN_MATCH_SCORE else "⚠️ 低分"
        print(
            f"{symbol} [{item['sku']}] {item['name'][:25]}\n"
            f" → PChome: {best.name[:40]}\n"
            f" → 售價 ${best.price} | 分數 {score:.3f} | 標籤 {tags}\n"
        )
|