Files
ewoooc/services/competitor_price_feeder.py
ogt 1b4f3a7bbe
Some checks failed
CD Pipeline / deploy (push) Failing after 59s
feat: EwoooC 初始化 — 完整專案推版至 Gitea
- 建立 Gitea Actions CD pipeline (.gitea/workflows/cd.yaml)
- 部署模式: rsync Python 檔案至 188 → docker restart (volume mount)
- Dockerfile/requirements 變動時自動重建 Docker image
- 部署通知: Telegram (開始/成功/失敗)
- 健康檢查: https://mo.wooo.work/health (最多 5 次重試)
- 同步最新 CLAUDE.md / ADR-008 / memory (2026-04-19)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-19 01:21:13 +08:00

381 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
競品價格補給線 Worker (Competitor Price Feeder)
角色:獨立背景 Worker(生產者端)
架構位置:
[本 Worker — 每 4 小時跑一次] → competitor_prices DB 表
[AI Pipeline] → fetch_candidates() LEFT JOIN competitor_prices(消費者端)
設計原則:
- 與 AI Pipeline 完全解耦:本 Worker 掛了不影響核心大腦
- 自帶重試機制,不阻塞主排程
- 語意化標籤 (tags) 讓 Hermes 獲得更豐富的情境
爬取邏輯:
MOMO 商品名稱 → PChome 關鍵字搜尋 → 模糊比對最佳匹配 → 寫入 competitor_prices
依賴:
services/pchome_crawler.py — 搜尋 + 批量 API
services/price_comparison.py — ProductNameParser + 模糊比對
"""
import json
import logging
import time
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Optional
# Module-level logger; every message in this file is prefixed with "[Feeder]".
logger = logging.getLogger(__name__)
# ── Matching parameters ──────────────────────────────
MIN_MATCH_SCORE = 0.45 # Best matches scoring below this are not written (avoids pairing unrelated products)
SEARCH_LIMIT = 10 # Number of PChome search hits requested per SKU
BATCH_SIZE = 30 # Rows per DB write batch. NOTE(review): not referenced in the visible code — rows are upserted one at a time
RATE_DELAY = 0.8 # Delay between PChome requests, in seconds (passed to the crawler)
TTL_HOURS = 6 # Hours added to "now" to compute competitor_prices.expires_at
# ── Feeder 結果 ───────────────────────────────────────
@dataclass
class FeederResult:
    """Summary counters returned by one feeder run."""

    # Number of SKUs loaded from the monitoring list.
    total_skus: int
    # SKUs for which a competitor row was written.
    matched: int
    # SKUs skipped because the search yielded nothing usable.
    skipped_no_result: int
    # SKUs skipped because the best match scored below the threshold.
    skipped_low_score: int
    # Number of failures recorded during the run.
    errors: int
    # Elapsed wall-clock time of the run, in seconds.
    duration_sec: float
def _extract_tags(pchome_product) -> list:
"""
從 PChomeProduct 物件提取語意標籤
標籤設計:
- "on_sale" — is_on_sale = True
- "discount_10pct" — 折扣 10~19%
- "discount_20pct" — 折扣 20~29%
- "discount_30pct" — 折扣 ≥ 30%
- "low_stock" — 庫存 < 10
- "high_rating" — 評分 ≥ 4.5
"""
tags = []
if pchome_product.is_on_sale:
tags.append("on_sale")
try:
disc = int(pchome_product.discount or 0)
except (ValueError, TypeError):
disc = 0
if disc >= 30:
tags.append("discount_30pct")
elif disc >= 20:
tags.append("discount_20pct")
elif disc >= 10:
tags.append("discount_10pct")
try:
stock = int(pchome_product.stock) if pchome_product.stock is not None else None
if stock is not None and 0 < stock < 10:
tags.append("low_stock")
except (ValueError, TypeError):
pass
try:
if pchome_product.rating and float(pchome_product.rating) >= 4.5:
tags.append("high_rating")
except (ValueError, TypeError):
pass
return tags
def _find_best_match(momo_name: str, pchome_products: list) -> Optional[tuple]:
"""
從 PChome 搜尋結果中找出與 MOMO 商品名稱最接近的一筆
Args:
momo_name: MOMO 商品名稱
pchome_products: PChomeProduct 列表
Returns:
(PChomeProduct, score) or None
"""
try:
from services.price_comparison import ProductNameParser
parser = ProductNameParser()
except ImportError:
# Fallback用 difflib 直接比字串
from difflib import SequenceMatcher
best, best_score = None, 0.0
for p in pchome_products:
score = SequenceMatcher(None, momo_name.lower(), p.name.lower()).ratio()
if score > best_score:
best, best_score = p, score
return (best, best_score) if best else None
# 使用 ProductNameParser 的結構化比對
momo_parsed = parser.parse(momo_name, "momo", 0, "", "")
best, best_score = None, 0.0
for p in pchome_products:
pchome_parsed = parser.parse(p.name, "pchome", p.price, p.product_id, p.product_url)
score = _structural_similarity(momo_parsed, pchome_parsed)
if score > best_score:
best, best_score = p, score
return (best, best_score) if best else None
def _structural_similarity(momo_p, pchome_p) -> float:
"""
結構化相似度計算(品牌 + 規格 + 關鍵字)
權重:品牌匹配 0.4 + 規格匹配 0.3 + 關鍵字相似 0.3
"""
from difflib import SequenceMatcher
score = 0.0
# 品牌比對 (0.4)
if momo_p.brand and pchome_p.brand:
if momo_p.brand == pchome_p.brand:
score += 0.4
elif momo_p.brand in pchome_p.brand or pchome_p.brand in momo_p.brand:
score += 0.2
elif not momo_p.brand and not pchome_p.brand:
score += 0.1 # 都沒有品牌,不扣分
# 規格比對 (0.3) — 容量/克重
momo_specs = momo_p.specs or {}
pchome_specs = pchome_p.specs or {}
if momo_specs and pchome_specs:
matching_specs = sum(
1 for k, v in momo_specs.items()
if pchome_specs.get(k) == v
)
total_specs = max(len(momo_specs), len(pchome_specs), 1)
score += 0.3 * (matching_specs / total_specs)
elif not momo_specs and not pchome_specs:
score += 0.15
# 關鍵字相似度 (0.3)
momo_kws = " ".join(momo_p.keywords or [])
pchome_kws = " ".join(pchome_p.keywords or [])
if momo_kws and pchome_kws:
kw_sim = SequenceMatcher(None, momo_kws.lower(), pchome_kws.lower()).ratio()
score += 0.3 * kw_sim
return round(score, 3)
class CompetitorPriceFeeder:
    """Background worker that feeds competitor prices into ``competitor_prices``.

    For every ACTIVE product it searches PChome by product name, picks the
    best-scoring search hit and upserts one row keyed on ``(sku, source)``.

    Usage:
        feeder = CompetitorPriceFeeder(engine=db_engine)
        result = feeder.run(source="pchome")
    """

    # Only this many leading characters of the product name are used as the
    # search keyword, to keep the query short.
    KEYWORD_MAX_CHARS = 20

    def __init__(self, engine=None):
        """Store the SQLAlchemy engine used for all DB access.

        Args:
            engine: SQLAlchemy engine. May be None at construction time, but
                reading the SKU list then raises ``RuntimeError``.
        """
        self.engine = engine

    @classmethod
    def _search_keyword(cls, name) -> str:
        """Build the PChome search keyword from a product name.

        Args:
            name: product name as stored in the DB; may be None or blank.

        Returns:
            The first ``KEYWORD_MAX_CHARS`` characters of the name with
            surrounding whitespace removed, or ``""`` when there is nothing
            to search for.
        """
        if not name:
            return ""
        return str(name)[: cls.KEYWORD_MAX_CHARS].strip()

    def _fetch_active_skus(self) -> list:
        """Load the ACTIVE products to monitor from the ``products`` table.

        Only products with at least one ``price_records`` row are returned
        (inner join), de-duplicated and ordered by ``i_code``.

        Returns:
            list of dicts ``{"sku": ..., "name": ..., "category": ...}``.

        Raises:
            RuntimeError: if no engine was injected.
        """
        if self.engine is None:
            raise RuntimeError("需要注入 SQLAlchemy engine")
        from sqlalchemy import text
        sql = text("""
            SELECT DISTINCT p.i_code AS sku, p.name, p.category
            FROM products p
            JOIN price_records pr ON pr.product_id = p.id
            WHERE p.status = 'ACTIVE'
            ORDER BY p.i_code
        """)
        with self.engine.connect() as conn:
            rows = conn.execute(sql).fetchall()
        return [dict(r._mapping) for r in rows]

    def _upsert_competitor_price(
        self,
        sku: str,
        product,  # PChomeProduct
        match_score: float,
        tags: list,
        source: str = "pchome",
    ):
        """Insert or update the single ``competitor_prices`` row for (sku, source).

        Args:
            sku: MOMO SKU the competitor row belongs to.
            product: matched PChome product; ``price``, ``original_price``,
                ``discount``, ``product_id`` and ``name`` are read.
            match_score: similarity score of the match.
            tags: semantic tags, stored as a JSON string.
            source: competitor source code.
        """
        from sqlalchemy import text
        _taipei = timezone(timedelta(hours=8))
        # NOTE(review): expires_at is a naive Taipei (UTC+8) wall-clock string,
        # while crawled_at uses the database's CURRENT_TIMESTAMP. If the DB
        # session is not in UTC+8 the two columns are on different clocks —
        # confirm against the DB timezone configuration.
        expires_at = (datetime.now(_taipei) + timedelta(hours=TTL_HOURS)).strftime("%Y-%m-%d %H:%M:%S")
        # engine.begin() commits on success and rolls back on error.
        with self.engine.begin() as conn:
            conn.execute(text("""
                INSERT INTO competitor_prices
                    (sku, source, price, original_price, discount_pct,
                     competitor_product_id, competitor_product_name,
                     match_score, tags, crawled_at, expires_at)
                VALUES
                    (:sku, :source, :price, :original_price, :discount_pct,
                     :comp_id, :comp_name,
                     :match_score, :tags, CURRENT_TIMESTAMP, :expires_at)
                ON CONFLICT (sku, source) DO UPDATE
                SET price = EXCLUDED.price,
                    original_price = EXCLUDED.original_price,
                    discount_pct = EXCLUDED.discount_pct,
                    competitor_product_id = EXCLUDED.competitor_product_id,
                    competitor_product_name = EXCLUDED.competitor_product_name,
                    match_score = EXCLUDED.match_score,
                    tags = EXCLUDED.tags,
                    crawled_at = CURRENT_TIMESTAMP,
                    expires_at = :expires_at
            """), {
                "sku": sku,
                "source": source,
                "price": product.price,
                "original_price": product.original_price,
                "discount_pct": product.discount,
                "comp_id": product.product_id,
                # Competitor name is truncated to 200 characters before storing.
                "comp_name": product.name[:200],
                "match_score": match_score,
                "tags": json.dumps(tags, ensure_ascii=False),
                "expires_at": expires_at,
            })

    def run(self, source: str = "pchome") -> "FeederResult":
        """Run one round of competitor price crawling and writing.

        Per-SKU failures are logged and counted; they never abort the batch.
        SKUs whose name is missing or blank are skipped and counted as
        ``skipped_no_result`` instead of being searched with an empty keyword.

        Args:
            source: competitor source code (only 'pchome' is supported).

        Returns:
            FeederResult with the counters of this run. For an unsupported
            source all counters are 0; if the SKU list cannot be loaded,
            ``errors`` is 1 and nothing is crawled.
        """
        start = time.time()
        if source != "pchome":
            logger.warning(f"[Feeder] 尚未支援 source={source},跳過")
            return FeederResult(0, 0, 0, 0, 0, 0.0)

        from services.pchome_crawler import PChomeCrawler
        crawler = PChomeCrawler(timeout=30, delay=RATE_DELAY)

        # Step 1: load the monitoring list.
        try:
            skus = self._fetch_active_skus()
        except Exception as e:
            logger.error(f"[Feeder] 讀取商品清單失敗: {e}")
            return FeederResult(0, 0, 0, 0, 1, time.time() - start)

        logger.info(f"[Feeder] 開始抓取 {len(skus)} 支商品的 PChome 競品價格")
        matched = 0
        skipped_no = 0
        skipped_low = 0
        errors = 0

        # Step 2: search, match and upsert each SKU independently.
        for item in skus:
            sku = item["sku"]
            momo_name = item["name"]
            keyword = self._search_keyword(momo_name)
            if not keyword:
                # A NULL/blank name used to crash the whole run (slicing None
                # outside the try) or trigger an empty-keyword search.
                logger.debug(f"[Feeder] {sku} 商品名稱為空,跳過")
                skipped_no += 1
                continue
            try:
                ok, _, products = crawler.search_products(keyword, limit=SEARCH_LIMIT)
                if not ok or not products:
                    logger.debug(f"[Feeder] {sku} 無搜尋結果,跳過")
                    skipped_no += 1
                    continue

                result = _find_best_match(momo_name, products)
                if not result:
                    skipped_no += 1
                    continue

                best_product, score = result
                if score < MIN_MATCH_SCORE:
                    logger.debug(
                        f"[Feeder] {sku} 比對分數過低 ({score:.3f} < {MIN_MATCH_SCORE}),跳過"
                    )
                    skipped_low += 1
                    continue

                tags = _extract_tags(best_product)
                self._upsert_competitor_price(sku, best_product, score, tags, source)
                matched += 1
                logger.debug(
                    f"[Feeder] {sku} → PChome ${best_product.price} "
                    f"score={score:.3f} tags={tags}"
                )
            except Exception as e:
                # Boundary catch: one bad SKU must not stop the others.
                logger.error(f"[Feeder] {sku} 處理失敗: {e}")
                errors += 1

        duration = round(time.time() - start, 2)
        logger.info(
            f"[Feeder] 完成 matched={matched} skipped_no={skipped_no} "
            f"skipped_low={skipped_low} errors={errors} 耗時={duration}s"
        )
        return FeederResult(
            total_skus=len(skus),
            matched=matched,
            skipped_no_result=skipped_no,
            skipped_low_score=skipped_low,
            errors=errors,
            duration_sec=duration,
        )
# ─────────────────────────────────────────────
# CLI 測試(不依賴 DB,直接測試爬蟲 + 比對邏輯)
# python3 services/competitor_price_feeder.py
# ─────────────────────────────────────────────
if __name__ == "__main__":
    # Manual smoke test: exercises the crawler and the matching logic against
    # a few hard-coded sample products. Nothing is read from or written to the DB.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    from services.pchome_crawler import PChomeCrawler

    test_items = [
        {"sku": "A003", "name": "舒特膚AD乳液200ml"},
        {"sku": "A001", "name": "玻尿酸面膜10片裝"},
        {"sku": "A009", "name": "美白化妝水150ml"},
    ]
    # Use the module's tuning constants so the smoke test runs with the same
    # request delay and search limit as the worker.
    crawler = PChomeCrawler(delay=RATE_DELAY)
    print("=== Competitor Price Feeder CLI 測試 ===\n")
    for item in test_items:
        # Same keyword rule as the worker: first 20 characters, stripped.
        keyword = item["name"][:20].strip()
        ok, msg, products = crawler.search_products(keyword, limit=SEARCH_LIMIT)
        if not ok or not products:
            print(f"[{item['sku']}] 無結果: {msg}")
            continue
        result = _find_best_match(item["name"], products)
        if not result:
            print(f"[{item['sku']}] 無法比對")
            continue
        best, score = result
        tags = _extract_tags(best)
        # Matches below the write threshold are flagged in the output.
        symbol = "" if score >= MIN_MATCH_SCORE else "⚠️ 低分"
        print(
            f"{symbol} [{item['sku']}] {item['name'][:25]}\n"
            f" → PChome: {best.name[:40]}\n"
            f" → 售價 ${best.price} | 分數 {score:.3f} | 標籤 {tags}\n"
        )