# ewoooc/services/competitor_intel_repository.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""PChome / MOMO 競價情報共用資料出口。

短期 canonical source：
- competitor_prices：目前有效配對
- competitor_price_history：價格歷史趨勢
- competitor_match_attempts：未配對與低信心診斷
"""

from __future__ import annotations

import os
import pickle
import time
from datetime import date, datetime, timedelta
from pathlib import Path
from threading import Lock
from typing import Any, Optional, Union

from sqlalchemy import inspect, text


# Minimum match_score a PChome pairing must reach for the queries in this
# module to treat it as a valid match.
PCHOME_MATCH_SCORE_FLOOR = 0.76
# Cache lifetime in seconds, overridable through the environment (default 1800).
# A value <= 0 makes _cached_payload bypass caching entirely.
COMPETITOR_INTEL_CACHE_TTL_SECONDS = int(os.getenv("COMPETITOR_INTEL_CACHE_TTL_SECONDS", "1800"))
# Parent of the directory that contains this file.
_BASE_DIR = Path(__file__).resolve().parents[1]
# On-disk pickle holding cached payloads (presumably shared between worker
# processes -- confirm against the deployment setup).
_CACHE_FILE = _BASE_DIR / "data" / "competitor_intel_cache.pkl"
# Serialises access to _MEM_CACHE and to the cache file within this process.
_CACHE_LOCK = Lock()
# In-process cache: cache_key -> {"time": <epoch seconds>, "value": <payload>}.
_MEM_CACHE: dict[str, dict[str, Any]] = {}


def _num(value: Any) -> float:
    try:
        return float(value or 0)
    except (TypeError, ValueError):
        return 0.0


def _date_label(value: Any) -> str:
    if hasattr(value, "strftime"):
        return value.strftime("%Y-%m-%d")
    return str(value or "")


def _month_label(value: Any) -> str:
    if hasattr(value, "strftime"):
        return value.strftime("%Y-%m")
    return str(value or "")[:7]


def clear_competitor_intel_cache() -> None:
    """Drop all cached PChome/MOMO intelligence after crawler/import updates.

    Empties the in-process cache and deletes the on-disk cache file while
    holding the cache lock. Removing the file is best-effort: an ``OSError``
    is ignored.
    """
    with _CACHE_LOCK:
        _MEM_CACHE.clear()
        try:
            cache_on_disk = _CACHE_FILE.exists()
            if not cache_on_disk:
                return
            _CACHE_FILE.unlink()
        except OSError:
            # A cache file that cannot be removed must not break the caller.
            return


def _load_shared_cache() -> dict[str, dict[str, Any]]:
    """Read the on-disk cache and return its mapping.

    Returns an empty dict when the file is missing, cannot be read or
    unpickled, or does not contain a dict.

    NOTE(review): ``pickle.load`` executes arbitrary code from the file, so
    the cache file must only ever be writable by trusted processes.
    """
    if _CACHE_FILE.exists():
        try:
            with _CACHE_FILE.open("rb") as stream:
                loaded = pickle.load(stream)
        except Exception:
            # Any read/unpickle failure is treated as "no cache available".
            return {}
        if isinstance(loaded, dict):
            return loaded
    return {}


def _write_shared_cache(payload: dict[str, dict[str, Any]]) -> None:
    """Persist *payload* to the on-disk cache file, best-effort.

    The data is pickled into a per-process temporary file next to the cache
    file and then moved over it with ``os.replace``. Any failure is swallowed;
    a temporary file left behind is removed when possible.
    """
    scratch = None  # stays None until the temporary path has been computed
    try:
        _CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
        scratch = _CACHE_FILE.with_suffix(f".{os.getpid()}.tmp")
        with scratch.open("wb") as sink:
            pickle.dump(payload, sink, protocol=pickle.HIGHEST_PROTOCOL)
        os.replace(scratch, _CACHE_FILE)
    except Exception:
        if scratch is None:
            # Failed before a temporary file could exist; nothing to clean up.
            return
        try:
            if scratch.exists():
                scratch.unlink()
        except OSError:
            pass


def _cache_entry_timestamp(entry: Any) -> Optional[float]:
    """Return the numeric ``time`` stamp of a cache record, or ``None`` if the record is malformed."""
    if not isinstance(entry, dict):
        return None
    try:
        return float(entry.get("time", 0))
    except (TypeError, ValueError):
        return None


def _cached_payload(cache_key: str, producer, ttl_seconds: int = COMPETITOR_INTEL_CACHE_TTL_SECONDS):
    """Return the payload for *cache_key*, computing it with *producer* on a miss.

    Lookup order is the in-process cache, then the on-disk cache file. A hit
    found on disk is copied into the in-process cache. On a miss ``producer()``
    is called outside the lock, and the result is stored in both caches.

    Args:
        cache_key: Identifier of the cached payload.
        producer: Zero-argument callable that builds the payload.
        ttl_seconds: Maximum age of a usable record; ``<= 0`` disables caching
            and simply returns ``producer()``.

    Returns:
        The cached or freshly produced payload.

    Records that are not dicts or carry a non-numeric ``time`` are treated as
    cache misses (and dropped from the file on the next write) instead of
    raising, so a damaged cache file cannot break every caller.
    """
    if ttl_seconds <= 0:
        return producer()
    now = time.time()

    def _is_fresh(candidate: Any) -> bool:
        stamp = _cache_entry_timestamp(candidate)
        return stamp is not None and now - stamp < ttl_seconds

    with _CACHE_LOCK:
        entry = _MEM_CACHE.get(cache_key)
        if _is_fresh(entry):
            return entry.get("value")
        entry = _load_shared_cache().get(cache_key)
        if _is_fresh(entry):
            _MEM_CACHE[cache_key] = entry
            return entry.get("value")

    # Run the producer without holding the lock so slow queries do not block
    # other cache readers.
    value = producer()
    # The record is stamped with the time taken before the producer ran.
    entry = {"time": now, "value": value}
    with _CACHE_LOCK:
        _MEM_CACHE[cache_key] = entry
        shared = _load_shared_cache()
        shared[cache_key] = entry
        # Prune records older than four TTLs (at least one hour) and any
        # malformed ones while rewriting the file.
        stale_before = now - max(ttl_seconds * 4, 3600)
        survivors = {}
        for key, item in shared.items():
            stamp = _cache_entry_timestamp(item)
            if stamp is not None and stamp >= stale_before:
                survivors[key] = item
        _write_shared_cache(survivors)
    return value


def fetch_competitor_coverage(engine) -> dict:
    """Return the PChome coverage summary, served through the TTL cache."""

    def _produce() -> dict:
        return _fetch_competitor_coverage_uncached(engine)

    cache_key = f"coverage:v2:floor={PCHOME_MATCH_SCORE_FLOOR}"
    return _cached_payload(cache_key, _produce)


def _fetch_competitor_coverage_uncached(engine) -> dict:
    """Read the current PChome price-match coverage and pending breakdown.

    A ``competitor_prices`` row counts as a valid match when its source is
    ``pchome``, it has not expired, its price is positive, its ``match_score``
    reaches ``PCHOME_MATCH_SCORE_FLOOR`` and its tags contain ``identity_v2``.

    Args:
        engine: SQLAlchemy engine used for table inspection and the query.

    Returns:
        A dict with:
        - ``active_with_price``: ACTIVE products that have a price record.
        - ``valid_matches``: distinct SKUs with a valid PChome match.
        - ``pending``: active priced products without a valid match.
        - ``match_rate``: ``valid_matches / active_with_price`` as a
          percentage, rounded to one decimal.
        - ``attempt_status``: pending products counted by their latest match
          attempt status (``never_attempted`` when there is none).
        - ``match_score_floor``: the floor that was applied.

        All counts are zero when ``competitor_prices`` does not exist.
    """
    if not inspect(engine).has_table("competitor_prices"):
        return {
            "active_with_price": 0,
            "valid_matches": 0,
            "pending": 0,
            "match_rate": 0,
            "attempt_status": {},
            # Kept in the empty result too so both return shapes agree.
            "match_score_floor": PCHOME_MATCH_SCORE_FLOOR,
        }

    # The totals are selected from a single-row anchor that is LEFT JOINed to
    # the grouped pending statuses. Selecting them directly from the GROUP BY
    # would yield no rows at all -- and therefore all-zero totals -- whenever
    # no product is pending.
    sql = text(f"""
        WITH latest_momo AS (
            SELECT
                p.id AS product_id,
                p.i_code AS sku,
                pr.price AS momo_price,
                ROW_NUMBER() OVER (PARTITION BY p.id ORDER BY pr.timestamp DESC, pr.id DESC) AS rn
            FROM products p
            JOIN price_records pr ON pr.product_id = p.id
            WHERE p.status = 'ACTIVE'
        ),
        valid_competitor AS (
            SELECT DISTINCT ON (cp.sku)
                cp.sku
            FROM competitor_prices cp
            WHERE cp.source = 'pchome'
              AND (cp.expires_at IS NULL OR cp.expires_at > CURRENT_TIMESTAMP)
              AND cp.price IS NOT NULL
              AND cp.price > 0
              AND COALESCE(cp.match_score, 0) >= {PCHOME_MATCH_SCORE_FLOOR}
              AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
            ORDER BY cp.sku, cp.crawled_at DESC NULLS LAST
        ),
        latest_attempt AS (
            SELECT DISTINCT ON (sku)
                sku,
                attempt_status
            FROM competitor_match_attempts
            WHERE source = 'pchome'
            ORDER BY sku, attempted_at DESC NULLS LAST
        ),
        pending_status AS (
            SELECT
                COALESCE(la.attempt_status, 'never_attempted') AS attempt_status,
                COUNT(*) AS status_count
            FROM latest_momo lm
            LEFT JOIN valid_competitor vc ON vc.sku = lm.sku
            LEFT JOIN latest_attempt la ON la.sku = lm.sku
            WHERE lm.rn = 1
              AND vc.sku IS NULL
            GROUP BY COALESCE(la.attempt_status, 'never_attempted')
        )
        SELECT
            (SELECT COUNT(*) FROM latest_momo WHERE rn = 1) AS active_with_price,
            (SELECT COUNT(*) FROM valid_competitor) AS valid_matches,
            (SELECT COUNT(*)
             FROM latest_momo lm
             LEFT JOIN valid_competitor vc ON vc.sku = lm.sku
             WHERE lm.rn = 1 AND vc.sku IS NULL) AS pending,
            ps.attempt_status,
            ps.status_count
        FROM (SELECT 1 AS anchor) AS totals
        LEFT JOIN pending_status ps ON TRUE
    """)
    with engine.connect() as conn:
        rows = conn.execute(sql).mappings().all()

    # Every row repeats the same totals; read them from the first one.
    totals = rows[0] if rows else {}
    active = int(totals.get("active_with_price") or 0)
    valid = int(totals.get("valid_matches") or 0)
    pending = int(totals.get("pending") or 0)
    # The anchor row carries a NULL status when nothing is pending; skip it.
    statuses = {
        str(row.get("attempt_status")): int(row.get("status_count") or 0)
        for row in rows
        if row.get("attempt_status") is not None
    }
    return {
        "active_with_price": active,
        "valid_matches": valid,
        "pending": pending,
        "match_rate": round(valid / max(active, 1) * 100, 1),
        "attempt_status": statuses,
        "match_score_floor": PCHOME_MATCH_SCORE_FLOOR,
    }


def fetch_competitor_gap_trend(engine, days: int = 30) -> dict:
    """Return the daily PChome gap trend, served through the TTL cache.

    ``days`` falls back to 30 when falsy and is clamped to the range 7..120.
    """
    requested = int(days or 30)
    days = max(7, min(requested, 120))

    def _produce() -> dict:
        return _fetch_competitor_gap_trend_uncached(engine, days=days)

    cache_key = f"gap_trend:v2:days={days}:floor={PCHOME_MATCH_SCORE_FLOOR}"
    return _cached_payload(cache_key, _produce)


def _fetch_competitor_gap_trend_uncached(engine, days: int = 30) -> dict:
    """Compute the PChome price-gap pressure trend over the last N days.

    For each day the latest qualifying history row per SKU is used. The
    result holds parallel lists, one element per day in ascending order:
    ``labels`` (``YYYY-MM-DD``), ``avg_gap_pct`` (average of
    ``(momo - pchome) / pchome * 100``), ``risk_count`` (MOMO more than 5%
    above PChome), ``momo_advantage_count`` (MOMO more than 5% below PChome)
    and ``match_count``. All lists are empty when the
    ``competitor_price_history`` table does not exist.

    ``days`` falls back to 30 when falsy and is clamped to the range 7..120.
    """
    series = {
        "labels": [],
        "avg_gap_pct": [],
        "risk_count": [],
        "momo_advantage_count": [],
        "match_count": [],
    }
    if not inspect(engine).has_table("competitor_price_history"):
        return series

    days = max(7, min(int(days or 30), 120))
    statement = text(f"""
        WITH latest_history AS (
            SELECT
                date_trunc('day', cph.crawled_at)::date AS bucket_date,
                cph.sku,
                cph.momo_price,
                cph.price AS pchome_price,
                ROW_NUMBER() OVER (
                    PARTITION BY date_trunc('day', cph.crawled_at)::date, cph.sku
                    ORDER BY cph.crawled_at DESC
                ) AS rn
            FROM competitor_price_history cph
            WHERE cph.source = 'pchome'
              AND cph.crawled_at >= CURRENT_DATE - (:days * INTERVAL '1 day')
              AND cph.momo_price IS NOT NULL
              AND cph.momo_price > 0
              AND cph.price IS NOT NULL
              AND cph.price > 0
              AND COALESCE(cph.match_score, 0) >= {PCHOME_MATCH_SCORE_FLOOR}
              AND COALESCE(cph.tags, '[]'::jsonb) ? 'identity_v2'
        )
        SELECT
            bucket_date,
            COUNT(*) AS match_count,
            ROUND(AVG((momo_price - pchome_price) / pchome_price * 100)::numeric, 2) AS avg_gap_pct,
            SUM(CASE WHEN momo_price > pchome_price * 1.05 THEN 1 ELSE 0 END) AS risk_count,
            SUM(CASE WHEN momo_price < pchome_price * 0.95 THEN 1 ELSE 0 END) AS momo_advantage_count
        FROM latest_history
        WHERE rn = 1
        GROUP BY bucket_date
        ORDER BY bucket_date
    """)
    with engine.connect() as conn:
        buckets = conn.execute(statement, {"days": days}).mappings().all()

    # One pass over the daily buckets fills every parallel series.
    for bucket in buckets:
        series["labels"].append(_date_label(bucket.get("bucket_date")))
        series["avg_gap_pct"].append(_num(bucket.get("avg_gap_pct")))
        series["risk_count"].append(int(bucket.get("risk_count") or 0))
        series["momo_advantage_count"].append(int(bucket.get("momo_advantage_count") or 0))
        series["match_count"].append(int(bucket.get("match_count") or 0))
    return series


def fetch_competitor_monthly_pressure(engine, months: int = 12) -> dict:
    """Return monthly competitor price pressure, served through the TTL cache.

    ``months`` falls back to 12 when falsy and is clamped to the range 3..36.
    """
    requested = int(months or 12)
    months = max(3, min(requested, 36))

    def _produce() -> dict:
        return _fetch_competitor_monthly_pressure_uncached(engine, months=months)

    cache_key = f"monthly_pressure:v2:months={months}:floor={PCHOME_MATCH_SCORE_FLOOR}"
    return _cached_payload(cache_key, _produce)


def _fetch_competitor_monthly_pressure_uncached(engine, months: int = 12) -> dict:
    """Compute monthly competitor price pressure, used for growth analysis.

    For each month the latest qualifying history row per SKU is used. The
    result holds parallel lists, one element per month in ascending order:
    ``labels`` (``YYYY-MM``), ``avg_gap_pct`` (average of
    ``(momo - pchome) / pchome * 100``), ``risk_count`` (MOMO more than 5%
    above PChome) and ``match_count``. All lists are empty when the
    ``competitor_price_history`` table does not exist.

    ``months`` falls back to 12 when falsy and is clamped to the range 3..36.
    """
    series = {
        "labels": [],
        "avg_gap_pct": [],
        "risk_count": [],
        "match_count": [],
    }
    if not inspect(engine).has_table("competitor_price_history"):
        return series

    months = max(3, min(int(months or 12), 36))
    statement = text(f"""
        WITH latest_history AS (
            SELECT
                date_trunc('month', cph.crawled_at)::date AS bucket_month,
                cph.sku,
                cph.momo_price,
                cph.price AS pchome_price,
                ROW_NUMBER() OVER (
                    PARTITION BY date_trunc('month', cph.crawled_at)::date, cph.sku
                    ORDER BY cph.crawled_at DESC
                ) AS rn
            FROM competitor_price_history cph
            WHERE cph.source = 'pchome'
              AND cph.crawled_at >= date_trunc('month', CURRENT_DATE) - (:months * INTERVAL '1 month')
              AND cph.momo_price IS NOT NULL
              AND cph.momo_price > 0
              AND cph.price IS NOT NULL
              AND cph.price > 0
              AND COALESCE(cph.match_score, 0) >= {PCHOME_MATCH_SCORE_FLOOR}
              AND COALESCE(cph.tags, '[]'::jsonb) ? 'identity_v2'
        )
        SELECT
            bucket_month,
            COUNT(*) AS match_count,
            ROUND(AVG((momo_price - pchome_price) / pchome_price * 100)::numeric, 2) AS avg_gap_pct,
            SUM(CASE WHEN momo_price > pchome_price * 1.05 THEN 1 ELSE 0 END) AS risk_count
        FROM latest_history
        WHERE rn = 1
        GROUP BY bucket_month
        ORDER BY bucket_month
    """)
    with engine.connect() as conn:
        buckets = conn.execute(statement, {"months": months}).mappings().all()

    # One pass over the monthly buckets fills every parallel series.
    for bucket in buckets:
        series["labels"].append(_month_label(bucket.get("bucket_month")))
        series["avg_gap_pct"].append(_num(bucket.get("avg_gap_pct")))
        series["risk_count"].append(int(bucket.get("risk_count") or 0))
        series["match_count"].append(int(bucket.get("match_count") or 0))
    return series


def fetch_top_competitor_risks(engine, limit: int = 10) -> list[dict]:
    """Return the top over-priced products, served through the TTL cache.

    ``limit`` falls back to 10 when falsy and is clamped to the range 1..50.
    """
    requested = int(limit or 10)
    limit = max(1, min(requested, 50))

    def _produce() -> list[dict]:
        return _fetch_top_competitor_risks_uncached(engine, limit=limit)

    cache_key = f"top_risks:v2:limit={limit}:floor={PCHOME_MATCH_SCORE_FLOOR}"
    return _cached_payload(cache_key, _produce)


def _fetch_top_competitor_risks_uncached(engine, limit: int = 10) -> list[dict]:
    """List high-risk products whose MOMO price is above the PChome price.

    Only ACTIVE products whose latest MOMO price exceeds the valid PChome
    match price by more than 5% are returned, ordered by percentage gap and
    then absolute gap, both descending. Returns an empty list when the
    ``competitor_prices`` table does not exist.

    ``limit`` falls back to 10 when falsy and is clamped to the range 1..50.
    """
    if not inspect(engine).has_table("competitor_prices"):
        return []

    limit = max(1, min(int(limit or 10), 50))
    statement = text(f"""
        WITH valid_competitor AS (
            SELECT DISTINCT ON (cp.sku)
                cp.sku,
                cp.price AS pchome_price,
                cp.competitor_product_id,
                cp.competitor_product_name,
                cp.match_score,
                cp.crawled_at
            FROM competitor_prices cp
            WHERE cp.source = 'pchome'
              AND (cp.expires_at IS NULL OR cp.expires_at > CURRENT_TIMESTAMP)
              AND cp.price IS NOT NULL
              AND cp.price > 0
              AND COALESCE(cp.match_score, 0) >= {PCHOME_MATCH_SCORE_FLOOR}
              AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
            ORDER BY cp.sku, cp.crawled_at DESC NULLS LAST
        )
        SELECT
            p.i_code AS sku,
            p.name,
            p.category,
            latest_price.momo_price,
            vc.pchome_price,
            vc.competitor_product_id,
            vc.competitor_product_name,
            vc.match_score,
            vc.crawled_at,
            (latest_price.momo_price - vc.pchome_price) AS gap_amount,
            ((latest_price.momo_price - vc.pchome_price) / vc.pchome_price * 100) AS gap_pct
        FROM valid_competitor vc
        JOIN products p
          ON p.i_code = vc.sku
         AND p.status = 'ACTIVE'
        JOIN LATERAL (
            SELECT pr.price AS momo_price
            FROM price_records pr
            WHERE pr.product_id = p.id
            ORDER BY pr.timestamp DESC, pr.id DESC
            LIMIT 1
        ) latest_price ON TRUE
        WHERE latest_price.momo_price > vc.pchome_price * 1.05
        ORDER BY gap_pct DESC NULLS LAST, gap_amount DESC NULLS LAST
        LIMIT :limit
    """)
    with engine.connect() as conn:
        records = conn.execute(statement, {"limit": limit}).mappings().all()

    # Normalise each database row into plain, JSON-friendly values.
    return [
        {
            "sku": str(record.get("sku") or ""),
            "name": record.get("name") or "",
            "category": record.get("category") or "",
            "momo_price": _num(record.get("momo_price")),
            "pchome_price": _num(record.get("pchome_price")),
            "gap_amount": _num(record.get("gap_amount")),
            "gap_pct": _num(record.get("gap_pct")),
            "match_score": _num(record.get("match_score")),
            "pchome_id": record.get("competitor_product_id"),
            "pchome_name": record.get("competitor_product_name") or "",
            "crawled_at": _date_label(record.get("crawled_at")),
        }
        for record in records
    ]


def fetch_competitor_comparison_results(
    engine,
    start_date: Optional[Union[date, datetime, str]] = None,
    end_date: Optional[Union[date, datetime, str]] = None,
    limit: int = 30,
) -> list[dict]:
    """Return price-comparison rows compatible with the legacy competitor PPT.

    Results come from stored tables instead of a live crawl. Every ACTIVE
    product with a positive latest MOMO price is a candidate; rows without a
    valid PChome match are still returned with ``found`` set to ``False`` and
    the latest match-attempt diagnostics.

    Args:
        engine: SQLAlchemy engine used for table inspection and the query.
        start_date: Optional lower bound (inclusive) for the sales window.
        end_date: Optional upper bound (inclusive) for the sales window.
        limit: Maximum number of rows; falls back to 30 when falsy and is
            clamped to the range 1..100.

    Returns:
        A list of dicts, one per product. Empty when ``products``,
        ``price_records`` or ``competitor_prices`` is missing.

    Note:
        The date bounds only take effect when the ``daily_sales`` table
        exists; otherwise they are ignored. Unlike the other fetchers in this
        module, this function does not go through the TTL cache.
    """
    limit = max(1, min(int(limit or 30), 100))
    inspector = inspect(engine)
    if not (
        inspector.has_table("products")
        and inspector.has_table("price_records")
        and inspector.has_table("competitor_prices")
    ):
        return []

    # Optional tables: the query degrades gracefully when they are absent.
    has_daily_sales = inspector.has_table("daily_sales")
    has_match_attempts = inspector.has_table("competitor_match_attempts")
    # Defaults used when daily_sales is missing: no sales CTE/join, revenue 0.
    sales_cte = ""
    sales_join = ""
    sales_select = "0 AS momo_revenue,"
    # Placeholder CTE that yields no rows, so the LEFT JOIN on latest_attempt
    # in the main query stays valid without competitor_match_attempts.
    attempt_cte = """
        latest_attempt AS (
            SELECT
                NULL AS sku,
                NULL AS attempt_status,
                NULL AS candidate_count,
                NULL AS best_match_score,
                NULL AS error_message,
                NULL AS attempted_at
            WHERE FALSE
        )
    """
    # Without sales data: highest MOMO price first, matched rows before
    # unmatched ones, then the largest relative gap.
    order_expr = (
        "lm.momo_price DESC NULLS LAST, "
        "(vc.pchome_price IS NULL), "
        "ABS((lm.momo_price - vc.pchome_price) / vc.pchome_price * 100) DESC NULLS LAST"
    )
    params: dict[str, Any] = {"limit": limit}

    if has_daily_sales:
        where = []
        if start_date:
            where.append("DATE(s.date) >= DATE(:start_date)")
            # Normalise to the first 10 characters with "-" separators.
            params["start_date"] = str(start_date).replace("/", "-")[:10]
        if end_date:
            where.append("DATE(s.date) <= DATE(:end_date)")
            params["end_date"] = str(end_date).replace("/", "-")[:10]
        sales_where = "WHERE " + " AND ".join(where) if where else ""
        # The fragment starts with a comma because it is appended directly
        # after the latest_attempt CTE in the main query.
        sales_cte = f""",
        sales_rank AS (
            SELECT
                s.product_id,
                SUM(COALESCE(s.revenue, 0)) AS momo_revenue
            FROM daily_sales s
            {sales_where}
            GROUP BY s.product_id
        )
        """
        sales_join = "LEFT JOIN sales_rank sr ON sr.product_id = lm.product_id"
        sales_select = "COALESCE(sr.momo_revenue, 0) AS momo_revenue,"
        # With sales data: rank by revenue instead of MOMO price.
        order_expr = (
            "COALESCE(sr.momo_revenue, 0) DESC, "
            "(vc.pchome_price IS NULL), "
            "ABS((lm.momo_price - vc.pchome_price) / vc.pchome_price * 100) DESC NULLS LAST"
        )

    if has_match_attempts:
        # Real CTE: the most recent PChome match attempt per SKU.
        attempt_cte = """
        latest_attempt AS (
            SELECT DISTINCT ON (cma.sku)
                cma.sku,
                cma.attempt_status,
                cma.candidate_count,
                cma.best_match_score,
                cma.error_message,
                cma.attempted_at
            FROM competitor_match_attempts cma
            WHERE cma.source = 'pchome'
            ORDER BY cma.sku, cma.attempted_at DESC NULLS LAST
        )
        """

    # Only fragments built above and the module-level score floor are
    # interpolated; caller-supplied values are passed as bound parameters.
    sql = text(f"""
        WITH latest_momo AS (
            SELECT
                p.id AS product_id,
                p.i_code AS sku,
                p.name,
                pr.price AS momo_price,
                ROW_NUMBER() OVER (PARTITION BY p.id ORDER BY pr.timestamp DESC, pr.id DESC) AS rn
            FROM products p
            JOIN price_records pr ON pr.product_id = p.id
            WHERE p.status = 'ACTIVE'
        ),
        valid_competitor AS (
            SELECT DISTINCT ON (cp.sku)
                cp.sku,
                cp.price AS pchome_price,
                cp.competitor_product_id,
                cp.competitor_product_name,
                cp.match_score
            FROM competitor_prices cp
            WHERE cp.source = 'pchome'
              AND (cp.expires_at IS NULL OR cp.expires_at > CURRENT_TIMESTAMP)
              AND cp.price IS NOT NULL
              AND cp.price > 0
              AND COALESCE(cp.match_score, 0) >= {PCHOME_MATCH_SCORE_FLOOR}
              AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
            ORDER BY cp.sku, cp.crawled_at DESC NULLS LAST
        ),
        {attempt_cte}
        {sales_cte}
        SELECT
            lm.sku,
            lm.name,
            lm.momo_price,
            vc.pchome_price,
            vc.competitor_product_id,
            vc.competitor_product_name,
            vc.match_score,
            la.attempt_status,
            la.candidate_count,
            la.best_match_score,
            la.error_message,
            la.attempted_at,
            {sales_select}
            (vc.pchome_price - lm.momo_price) AS price_diff,
            ((vc.pchome_price - lm.momo_price) / lm.momo_price * 100) AS price_diff_pct
        FROM latest_momo lm
        LEFT JOIN valid_competitor vc ON vc.sku = lm.sku
        LEFT JOIN latest_attempt la ON la.sku = lm.sku
        {sales_join}
        WHERE lm.rn = 1
          AND lm.momo_price > 0
        ORDER BY {order_expr}
        LIMIT :limit
    """)
    with engine.connect() as conn:
        rows = conn.execute(sql, params).mappings().all()

    results = []
    for row in rows:
        pchome_id = row.get("competitor_product_id")
        # A row is "found" when the LEFT JOIN produced a PChome price.
        found = bool(row.get("pchome_price"))
        results.append({
            "found": found,
            "momo_icode": str(row.get("sku") or ""),
            "momo_name": row.get("name") or "",
            "momo_price": _num(row.get("momo_price")),
            "pc_name": row.get("competitor_product_name") or "",
            "pc_price": _num(row.get("pchome_price")),
            "pc_url": f"https://24h.pchome.com.tw/prod/{pchome_id}" if pchome_id else "",
            "price_diff": _num(row.get("price_diff")),
            "price_diff_pct": _num(row.get("price_diff_pct")),
            "match_score": _num(row.get("match_score")),
            "momo_revenue": _num(row.get("momo_revenue")),
            # Unmatched rows report the latest attempt status when available.
            "match_status": "matched" if found else (row.get("attempt_status") or "no_valid_match"),
            "candidate_count": int(row.get("candidate_count") or 0),
            "best_match_score": _num(row.get("best_match_score")),
            "match_diagnostic": row.get("error_message") or "",
        })
    return results


def build_competitor_intel_payload(engine, days: int = 30) -> dict:
    """Assemble the summary payload shared by pages, AI features and PPT export.

    Combines the coverage summary, the gap trend for ``days`` days and the
    top 10 competitor risks, plus the match-score floor in effect.
    """
    coverage = fetch_competitor_coverage(engine)
    trend = fetch_competitor_gap_trend(engine, days=days)
    top_risks = fetch_top_competitor_risks(engine, limit=10)
    return {
        "coverage": coverage,
        "trend": trend,
        "top_risks": top_risks,
        "match_score_floor": PCHOME_MATCH_SCORE_FLOOR,
    }