# ewoooc/services/competitor_price_feeder.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
競品價格補給線 Worker (Competitor Price Feeder)

角色：獨立背景 Worker（生產者端）
架構位置：
  [本 Worker — 每 4 小時跑一次] → competitor_prices DB 表
                                        ↓
  [AI Pipeline] → fetch_candidates() LEFT JOIN competitor_prices（消費者端）

設計原則：
- 與 AI Pipeline 完全解耦：本 Worker 掛了不影響核心大腦
- 自帶重試機制，不阻塞主排程
- 語意化標籤 (tags) 讓 Hermes 獲得更豐富的情境

爬取邏輯：
  MOMO 商品名稱 → PChome 關鍵字搜尋 → 模糊比對最佳匹配 → 寫入 competitor_prices

依賴：
  services/pchome_crawler.py   — 搜尋 + 批量 API
  services/price_comparison.py — ProductNameParser + 模糊比對
"""

import json
import logging
import time
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Optional

logger = logging.getLogger(__name__)

# ── Matching parameters ──────────────────────────────
MIN_MATCH_SCORE  = 0.45  # best matches scoring below this are not written (avoids pairing the wrong product)
SEARCH_LIMIT     = 10    # number of PChome search results requested per SKU
BATCH_SIZE       = 30    # rows per DB write batch (not referenced by the code visible in this file)
RATE_DELAY       = 0.8   # delay between PChome requests, in seconds
TTL_HOURS        = 6     # lifetime of a competitor_prices row, in hours (used to compute expires_at)

# ── Feeder result ─────────────────────────────────────
@dataclass
class FeederResult:
    """Summary counters describing one run of the competitor price feeder."""

    # Number of SKUs loaded for monitoring in this run.
    total_skus: int
    # SKUs whose best PChome match was written to competitor_prices.
    matched: int
    # SKUs skipped because the search returned nothing or no candidate could be matched.
    skipped_no_result: int
    # SKUs skipped because the best match scored below MIN_MATCH_SCORE.
    skipped_low_score: int
    # SKUs whose processing raised; set to 1 when loading the SKU list itself failed.
    errors: int
    # Wall-clock duration of the run, in seconds.
    duration_sec: float


def _extract_tags(pchome_product) -> list:
    """
    Derive semantic tags from a PChomeProduct-like object.

    Tags, in the order they are appended:
      - "on_sale"        — ``is_on_sale`` is truthy
      - "discount_10pct" — discount (truncated by ``int()``) of 10–19
      - "discount_20pct" — discount of 20–29
      - "discount_30pct" — discount of 30 or more
      - "low_stock"      — stock strictly between 0 and 10
      - "high_rating"    — rating of at least 4.5

    A value that is missing or cannot be converted to a number yields no tag.

    Args:
        pchome_product: object exposing ``is_on_sale``, ``discount``,
            ``stock`` and ``rating`` attributes.

    Returns:
        List of tag strings (possibly empty).
    """
    labels = []

    if pchome_product.is_on_sale:
        labels.append("on_sale")

    # Discount: unparsable values count as "no discount".
    try:
        percent = int(pchome_product.discount or 0)
    except (ValueError, TypeError):
        percent = 0
    # Tiers are checked from highest to lowest; only the first hit is tagged.
    for floor, label in (
        (30, "discount_30pct"),
        (20, "discount_20pct"),
        (10, "discount_10pct"),
    ):
        if percent >= floor:
            labels.append(label)
            break

    # Stock: zero (sold out) and unknown stock are not "low stock".
    try:
        raw_stock = pchome_product.stock
        if raw_stock is not None and 0 < int(raw_stock) < 10:
            labels.append("low_stock")
    except (ValueError, TypeError):
        pass

    # Rating: a falsy rating (None, 0, "") is treated as "not rated".
    try:
        raw_rating = pchome_product.rating
        if raw_rating and float(raw_rating) >= 4.5:
            labels.append("high_rating")
    except (ValueError, TypeError):
        pass

    return labels


def _find_best_match(momo_name: str, pchome_products: list) -> Optional[tuple]:
    """
    Pick the PChome search result whose name is closest to a MOMO product name.

    When ``services.price_comparison.ProductNameParser`` can be imported, both
    names are parsed and compared with ``_structural_similarity``. If that
    import fails, the lower-cased names are compared directly with
    ``difflib.SequenceMatcher``.

    Args:
        momo_name:       MOMO product name.
        pchome_products: list of PChomeProduct-like objects.

    Returns:
        ``(product, score)`` for the highest-scoring candidate (the earliest
        one wins ties), or ``None`` when the list is empty or no candidate
        scores above 0.
    """
    try:
        from services.price_comparison import ProductNameParser
        parser = ProductNameParser()
    except ImportError:
        # Fallback: plain string similarity via difflib.
        from difflib import SequenceMatcher

        def rate(candidate):
            return SequenceMatcher(
                None, momo_name.lower(), candidate.name.lower()
            ).ratio()
    else:
        # Structured comparison: parse the MOMO side once, each candidate on demand.
        momo_parsed = parser.parse(momo_name, "momo", 0, "", "")

        def rate(candidate):
            candidate_parsed = parser.parse(
                candidate.name,
                "pchome",
                candidate.price,
                candidate.product_id,
                candidate.product_url,
            )
            return _structural_similarity(momo_parsed, candidate_parsed)

    winner, top_score = None, 0.0
    for candidate in pchome_products:
        current = rate(candidate)
        # Strict comparison: zero-score candidates are never selected.
        if current > top_score:
            winner, top_score = candidate, current

    if not winner:
        return None
    return (winner, top_score)


def _structural_similarity(momo_p, pchome_p) -> float:
    """
    Score how alike two parsed product names are (brand + specs + keywords).

    Weights: brand up to 0.4, specs up to 0.3, keyword similarity up to 0.3.

    Args:
        momo_p:   parsed MOMO product exposing ``brand``, ``specs``, ``keywords``.
        pchome_p: parsed PChome product exposing the same attributes.

    Returns:
        Combined score rounded to 3 decimal places.
    """
    from difflib import SequenceMatcher

    # Brand (0.4): exact match scores full, containment half; a small bonus
    # applies when neither side has a brand at all.
    left_brand, right_brand = momo_p.brand, pchome_p.brand
    brand_part = 0.0
    if left_brand and right_brand:
        if left_brand == right_brand:
            brand_part = 0.4
        elif left_brand in right_brand or right_brand in left_brand:
            brand_part = 0.2
    elif not (left_brand or right_brand):
        brand_part = 0.1

    # Specs (0.3): share of spec entries that agree, relative to the larger side.
    left_specs = momo_p.specs or {}
    right_specs = pchome_p.specs or {}
    spec_part = 0.0
    if left_specs and right_specs:
        agreeing = len(
            [key for key, value in left_specs.items() if right_specs.get(key) == value]
        )
        largest = max(len(left_specs), len(right_specs), 1)
        spec_part = 0.3 * (agreeing / largest)
    elif not (left_specs or right_specs):
        spec_part = 0.15

    # Keywords (0.3): sequence similarity of the space-joined keyword strings.
    left_text = " ".join(momo_p.keywords or [])
    right_text = " ".join(pchome_p.keywords or [])
    keyword_part = 0.0
    if left_text and right_text:
        ratio = SequenceMatcher(None, left_text.lower(), right_text.lower()).ratio()
        keyword_part = 0.3 * ratio

    return round(brand_part + spec_part + keyword_part, 3)


class CompetitorPriceFeeder:
    """
    Competitor price feeder worker.

    Loads the monitored SKUs from the database, searches PChome for each
    product name, and upserts the best-matching listing into
    ``competitor_prices``.

    Usage:
        feeder = CompetitorPriceFeeder(engine=db_engine)
        result = feeder.run(source="pchome")
    """

    def __init__(self, engine=None):
        """
        Args:
            engine: SQLAlchemy engine used for all DB access. It may be left
                as None at construction time, but the SKU list cannot be
                loaded without it.
        """
        self.engine = engine

    def _fetch_active_skus(self) -> list:
        """
        Load the ACTIVE products to monitor from the ``products`` table.

        Only products that have at least one row in ``price_records`` are
        returned (inner join), ordered by ``i_code``.

        Returns:
            list of {"sku": str, "name": str, "category": str}

        Raises:
            RuntimeError: if no engine was injected.
        """
        if self.engine is None:
            raise RuntimeError("需要注入 SQLAlchemy engine")

        from sqlalchemy import text
        sql = text("""
            SELECT DISTINCT p.i_code AS sku, p.name, p.category
            FROM products p
            JOIN price_records pr ON pr.product_id = p.id
            WHERE p.status = 'ACTIVE'
            ORDER BY p.i_code
        """)
        with self.engine.connect() as conn:
            rows = conn.execute(sql).fetchall()
        return [dict(r._mapping) for r in rows]

    def _upsert_competitor_price(
        self,
        sku: str,
        product,           # PChomeProduct
        match_score: float,
        tags: list,
        source: str = "pchome",
    ):
        """
        Insert or update the single ``competitor_prices`` row for (sku, source).

        Args:
            sku:         MOMO SKU the competitor listing was matched to.
            product:     matched PChomeProduct-like object (price, original_price,
                         discount, product_id and name are read).
            match_score: similarity score of the match.
            tags:        semantic tags; stored as a JSON string.
            source:      competitor source code.
        """
        from sqlalchemy import text
        # expires_at is computed in Taipei time (UTC+8) and sent as a naive string.
        # NOTE(review): crawled_at uses the database's CURRENT_TIMESTAMP — confirm
        # that both columns are interpreted in the same timezone by consumers.
        _taipei = timezone(timedelta(hours=8))
        expires_at = (datetime.now(_taipei) + timedelta(hours=TTL_HOURS)).strftime("%Y-%m-%d %H:%M:%S")
        # engine.begin() commits on success and rolls back if the statement raises.
        with self.engine.begin() as conn:
            conn.execute(text("""
                INSERT INTO competitor_prices
                    (sku, source, price, original_price, discount_pct,
                     competitor_product_id, competitor_product_name,
                     match_score, tags, crawled_at, expires_at)
                VALUES
                    (:sku, :source, :price, :original_price, :discount_pct,
                     :comp_id, :comp_name,
                     :match_score, :tags, CURRENT_TIMESTAMP, :expires_at)
                ON CONFLICT (sku, source) DO UPDATE
                    SET price                   = EXCLUDED.price,
                        original_price          = EXCLUDED.original_price,
                        discount_pct            = EXCLUDED.discount_pct,
                        competitor_product_id   = EXCLUDED.competitor_product_id,
                        competitor_product_name = EXCLUDED.competitor_product_name,
                        match_score             = EXCLUDED.match_score,
                        tags                    = EXCLUDED.tags,
                        crawled_at              = CURRENT_TIMESTAMP,
                        expires_at              = :expires_at
            """), {
                "sku":           sku,
                "source":        source,
                "price":         product.price,
                "original_price":product.original_price,
                "discount_pct":  product.discount,
                "comp_id":       product.product_id,
                # The competitor name is truncated to 200 characters before storing.
                "comp_name":     product.name[:200],
                "match_score":   match_score,
                "tags":          json.dumps(tags, ensure_ascii=False),
                "expires_at":    expires_at,
            })

    def run(self, source: str = "pchome") -> FeederResult:
        """
        Run one round of competitor price crawling and writing.

        For every monitored SKU: search PChome with the first 20 characters of
        the product name, pick the best match, and upsert it when the score
        reaches MIN_MATCH_SCORE. A failure on one SKU is logged and counted,
        and never aborts the remaining SKUs.

        Args:
            source: competitor source code (only 'pchome' is supported).

        Returns:
            FeederResult with the counters of this run. An unsupported source
            returns all zeros; a failure to load the SKU list returns errors=1.
        """
        start = time.time()

        if source != "pchome":
            logger.warning(f"[Feeder] 尚未支援 source={source}，跳過")
            return FeederResult(0, 0, 0, 0, 0, 0.0)

        from services.pchome_crawler import PChomeCrawler
        crawler = PChomeCrawler(timeout=30, delay=RATE_DELAY)

        # Step 1: load the monitoring list.
        try:
            skus = self._fetch_active_skus()
        except Exception as e:
            logger.error(f"[Feeder] 讀取商品清單失敗: {e}")
            # Rounded like the normal completion path.
            return FeederResult(0, 0, 0, 0, 1, round(time.time() - start, 2))

        logger.info(f"[Feeder] 開始抓取 {len(skus)} 支商品的 PChome 競品價格")

        matched = 0
        skipped_no = 0
        skipped_low = 0
        errors = 0

        # Step 2: search, match and upsert each SKU independently.
        for item in skus:
            sku = item["sku"]

            try:
                # The keyword is derived inside the try so that a NULL or
                # malformed name only affects this SKU instead of aborting
                # the whole run.
                momo_name = item["name"] or ""

                # Search with the first 20 characters of the name (keeps the query short).
                keyword = momo_name[:20].strip()
                if not keyword:
                    # Nothing to search for; an empty query would be meaningless.
                    logger.debug(f"[Feeder] {sku} 商品名稱為空，跳過")
                    skipped_no += 1
                    continue

                ok, _, products = crawler.search_products(keyword, limit=SEARCH_LIMIT)
                if not ok or not products:
                    logger.debug(f"[Feeder] {sku} 無搜尋結果，跳過")
                    skipped_no += 1
                    continue

                result = _find_best_match(momo_name, products)
                if not result:
                    skipped_no += 1
                    continue

                best_product, score = result

                if score < MIN_MATCH_SCORE:
                    logger.debug(
                        f"[Feeder] {sku} 比對分數過低 ({score:.3f} < {MIN_MATCH_SCORE})，跳過"
                    )
                    skipped_low += 1
                    continue

                tags = _extract_tags(best_product)
                self._upsert_competitor_price(sku, best_product, score, tags, source)
                matched += 1
                logger.debug(
                    f"[Feeder] {sku} → PChome ${best_product.price} "
                    f"score={score:.3f} tags={tags}"
                )

            except Exception as e:
                # Worker boundary: log, count, and move on to the next SKU.
                logger.error(f"[Feeder] {sku} 處理失敗: {e}")
                errors += 1

        duration = round(time.time() - start, 2)
        logger.info(
            f"[Feeder] 完成 matched={matched} skipped_no={skipped_no} "
            f"skipped_low={skipped_low} errors={errors} 耗時={duration}s"
        )
        return FeederResult(
            total_skus=len(skus),
            matched=matched,
            skipped_no_result=skipped_no,
            skipped_low_score=skipped_low,
            errors=errors,
            duration_sec=duration,
        )


# ─────────────────────────────────────────────
# CLI smoke test (no DB required; exercises the crawler + matching logic directly)
# python3 services/competitor_price_feeder.py
# ─────────────────────────────────────────────
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")

    from services.pchome_crawler import PChomeCrawler

    # (sku, MOMO product name) samples to look up on PChome.
    samples = (
        ("A003", "舒特膚AD乳液200ml"),
        ("A001", "玻尿酸面膜10片裝"),
        ("A009", "美白化妝水150ml"),
    )

    pchome = PChomeCrawler(delay=0.8)

    print("=== Competitor Price Feeder CLI 測試 ===\n")
    for sku, momo_name in samples:
        # Same keyword rule as the worker: first 20 characters of the name.
        found, message, candidates = pchome.search_products(momo_name[:20], limit=10)
        if not (found and candidates):
            print(f"[{sku}] 無結果: {message}")
            continue

        outcome = _find_best_match(momo_name, candidates)
        if not outcome:
            print(f"[{sku}] 無法比對")
            continue

        top_product, top_score = outcome
        labels = _extract_tags(top_product)
        # Flag matches that the worker would reject for a low score.
        if top_score >= MIN_MATCH_SCORE:
            marker = "✅"
        else:
            marker = "⚠️ 低分"
        print(
            f"{marker} [{sku}] {momo_name[:25]}\n"
            f"   → PChome: {top_product.name[:40]}\n"
            f"   → 售價 ${top_product.price} | 分數 {top_score:.3f} | 標籤 {labels}\n"
        )