收緊 PChome 同款比對門檻

2026-05-19 15:53:09 +08:00
parent 6c548e2251
commit 75390f8495
19 changed files with 94 additions and 31 deletions
--- a/TODO_NEXT_STEPS.txt
+++ b/TODO_NEXT_STEPS.txt
@@ -4,7 +4,7 @@
 ================================================================================

 【已完成】
-   - V10.263 強化核心 MOMO/PChome 比價鏈路：新增 `marketplace_product_matcher.py` 身份比對、只讓 `identity_v2` 高信心配對進 Dashboard/AI/Excel/Daily/Growth/PPT，並建立 `competitor_intel_repository.py` 統一圖表與簡報資料出口。
+   - V10.265 強化核心 MOMO/PChome 比價鏈路：新增 `marketplace_product_matcher.py` 身份比對、只讓 `identity_v2` 且分數 ≥ 0.76 的高信心配對進 Dashboard/AI/Excel/Daily/Growth/PPT，並建立 `competitor_intel_repository.py` 統一圖表與簡報資料出口；同品牌但不同型號/不同組數會進待審，不進正式比價。
   - V10.254 續補 `/growth_analysis` 快取命中效能：PostgreSQL source fingerprint 加 60 秒短 TTL，匯入 realtime_sales_monthly 後同步清除 growth shared cache 與短快取，避免快取命中仍頻繁掃大表 COUNT。
   - V10.253 修正 Elephant Alpha L3 HITL 空告警：價格類與資源調配低信心事件若沒有 Hermes/實證資料，只記 suppressed telemetry 與 cooldown，不寫 pending human_review、不發 Telegram；`resource_optimization` 會保留 queue/load 原始指標供追查。
   - V10.251 修正 OpenClaw Q&A 備援遙測：Ollama 主路徑仍為 GCP-A → GCP-B → 111，Gemini 只記為 `openclaw_qa_gemini_fallback`，NIM 只記為 `openclaw_qa_nim`；AI Calls 會把 legacy `openclaw_qa + gemini` 標成 Gemini 備援，避免再次誤判 Gemini-first。
--- a/config.py
+++ b/config.py
@@ -320,7 +320,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '')
 # ==========================================
 # 系統版本與路徑
 # ==========================================
-SYSTEM_VERSION = "V10.264"
+SYSTEM_VERSION = "V10.265"
 LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log')
 public_url = PUBLIC_URL  # 用於模板顯示

--- a/docs/AI_INTELLIGENCE_MODULE_SOT.md
+++ b/docs/AI_INTELLIGENCE_MODULE_SOT.md
@@ -188,7 +188,7 @@ SKU/商品ID   = find_col(['商品ID', 'Product ID', 'ID', 'i_code', 'Item Code'
 | `discount_pct` | INTEGER | 折扣 %（NULL=未折扣） |
 | `competitor_product_id` | VARCHAR(100) | PChome 商品 ID |
 | `competitor_product_name` | TEXT | PChome 商品名稱（核對用） |
-| `match_score` | NUMERIC(4,3) | 商品身份比對分數（0~1），< 0.62 不寫入正式快取 |
+| `match_score` | NUMERIC(4,3) | 商品身份比對分數（0~1），< 0.76 不寫入正式快取 |
 | `tags` | JSONB | 語意標籤，如 `["on_sale","discount_20pct"]` |
 | `crawled_at` | TIMESTAMP | 爬取時間 |
 | `expires_at` | TIMESTAMP | TTL = crawled_at + 6h，過期後 Hermes 忽略 |
@@ -286,7 +286,7 @@ LIMIT 300
                ↓
 [HermesAnalystService.fetch_candidates()]  ←← AI Pipeline 消費端
  ↓ LEFT JOIN competitor_prices（零網路等待）
-  ↓ 有效期內（expires_at > NOW()）+ match_score ≥ 0.62 + tags 含 identity_v2 才 JOIN
+  ↓ 有效期內（expires_at > NOW()）+ match_score ≥ 0.76 + tags 含 identity_v2 才 JOIN
  ↓ pchome_price + competitor_tags 一起傳給 Hermes
 ```

@@ -297,7 +297,7 @@ LIMIT 300
 | 解耦方式 | DB 表快取（非 Redis） | PostgreSQL 已是核心，無需額外依賴；支援 JOIN |
 | TTL | 6 小時 | 與 AI Pipeline 排程週期對齊 |
 | 比對算法 | 品牌 + 核心 token + 容量/重量/包數 + 品類 + 價格 sanity check | 由 `marketplace_product_matcher.py` 統一供 feeder、legacy crawler、AI/PPT 鏈路使用 |
-| 最低比對門檻 | 0.62 | 核心比價寧可待審，不允許低信心錯配影響 AI 決策 |
+| 最低比對門檻 | 0.76 | 核心比價寧可待審，不允許低信心錯配影響 AI 決策 |
 | 已有不同 PChome 商品覆蓋門檻 | 0.84 | 新候選與既有正式配對不同時，除非超高信心，否則寫入 `needs_review` attempt 不覆蓋 |
 | 語意標籤 | JSONB 陣列 | 傳給 Hermes 提升情境感知品質 |

@@ -309,7 +309,7 @@ MOMO 商品名稱
  → PChomeCrawler.search_products(keyword, limit=12)
  → marketplace_product_matcher.score_marketplace_match()
    → 品牌衝突 / 容量衝突 / 包數衝突 hard veto
-    → 同款高信心 score ≥ 0.62 才進 competitor_prices
+    → 同款高信心 score ≥ 0.76 才進 competitor_prices
    → 低信心、規格衝突、既有配對衝突寫入 competitor_match_attempts
 ```

@@ -320,7 +320,7 @@ LEFT JOIN competitor_prices cp
       ON cp.sku = lmp.sku
      AND cp.source = 'pchome'
      AND cp.expires_at > NOW()
-      AND cp.match_score >= 0.62
+      AND cp.match_score >= 0.76
      AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
 ```
 → 無競品資料的商品仍回傳，`pchome_price=NULL`，`_batch_analyze` 自動跳過
--- a/routes/ai_routes.py
+++ b/routes/ai_routes.py
@@ -1518,7 +1518,7 @@ def api_icaim_dashboard():
                    ON cp.sku = lm.sku
                   AND cp.source = 'pchome'
                   AND cp.expires_at > NOW()
-                   AND cp.match_score >= 0.62
+                   AND cp.match_score >= 0.76
                   AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
                WHERE lm.rn = 1
                  AND (lm.momo_price - cp.price) / cp.price * 100 > 15
@@ -1528,7 +1528,7 @@ def api_icaim_dashboard():
                (SELECT COUNT(*) FROM competitor_prices
                  WHERE expires_at > NOW()
                    AND source = 'pchome'
-                    AND COALESCE(match_score, 0) >= 0.62
+                    AND COALESCE(match_score, 0) >= 0.76
                    AND COALESCE(tags, '[]'::jsonb) ? 'identity_v2')                AS valid_competitor_prices,
                (SELECT COUNT(*) FROM high_risk)                                    AS high_risk_count,
                (SELECT COUNT(*) FROM ai_price_recommendations)                     AS total_ai_recs,
@@ -1565,7 +1565,7 @@ def api_icaim_dashboard():
                ON cp.sku = lp.sku
               AND cp.source = 'pchome'
               AND cp.expires_at > NOW()
-               AND cp.match_score >= 0.62
+               AND cp.match_score >= 0.76
               AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
            WHERE lp.rn = 1
            ORDER BY gap_pct DESC NULLS LAST
--- a/routes/api_routes.py
+++ b/routes/api_routes.py
@@ -334,7 +334,7 @@ def _build_price_history_payload(session, product):
            WHERE sku = :sku
              AND source = 'pchome'
              AND crawled_at >= :start_date
-              AND COALESCE(match_score, 0) >= 0.62
+              AND COALESCE(match_score, 0) >= 0.76
              AND COALESCE(tags, '[]'::jsonb) ? 'identity_v2'
            ORDER BY crawled_at
        """), {
--- a/routes/dashboard_routes.py
+++ b/routes/dashboard_routes.py
@@ -40,7 +40,7 @@ sys_log = SystemLogger("DashboardRoutes").get_logger()
 dashboard_bp = Blueprint('dashboard', __name__)

 PRODUCT_PICK_LIST_LIMIT = 50
-PCHOME_MATCH_SCORE_FLOOR = 0.62
+PCHOME_MATCH_SCORE_FLOOR = 0.76


 def _build_pchome_product_url(product_id):
--- a/routes/export_routes.py
+++ b/routes/export_routes.py
@@ -137,7 +137,7 @@ def export_excel_ai_picks():
                  AND (cp.expires_at IS NULL OR cp.expires_at > CURRENT_TIMESTAMP)
                  AND cp.price IS NOT NULL
                  AND cp.price > 0
-                  AND COALESCE(cp.match_score, 0) >= 0.62
+                  AND COALESCE(cp.match_score, 0) >= 0.76
                  AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
                ORDER BY cp.sku, cp.crawled_at DESC NULLS LAST
            )
--- a/services/ai_product_pick_agent.py
+++ b/services/ai_product_pick_agent.py
@@ -154,7 +154,7 @@ def _fetch_candidates_without_sales(conn, limit: int) -> List[Dict[str, Any]]:
          ON cp.sku = lm.sku
         AND cp.source = 'pchome'
         AND (cp.expires_at IS NULL OR cp.expires_at > CURRENT_TIMESTAMP)
-         AND cp.match_score >= 0.62
+         AND cp.match_score >= 0.76
         {identity_condition}
        WHERE lm.rn = 1
        ORDER BY cp.match_score DESC, cp.crawled_at DESC
@@ -249,7 +249,7 @@ def _fetch_candidates(conn, limit: int) -> List[Dict[str, Any]]:
            FROM competitor_price_history
            WHERE source = 'pchome'
              AND crawled_at >= CURRENT_TIMESTAMP - INTERVAL '30 days'
-              AND COALESCE(match_score, 0) >= 0.62
+              AND COALESCE(match_score, 0) >= 0.76
              AND COALESCE(tags, '[]'::jsonb) ? 'identity_v2'
            GROUP BY sku, source
        )
@@ -277,7 +277,7 @@ def _fetch_candidates(conn, limit: int) -> List[Dict[str, Any]]:
          ON cp.sku = lm.sku
         AND cp.source = 'pchome'
         AND (cp.expires_at IS NULL OR cp.expires_at > CURRENT_TIMESTAMP)
-         AND cp.match_score >= 0.62
+         AND cp.match_score >= 0.76
         {identity_condition}
        LEFT JOIN history_stats hs
          ON hs.sku = lm.sku
@@ -515,7 +515,7 @@ def generate_product_pick_list(engine, limit: int = 50) -> ProductPickResult:
        scored = [_score_candidate(row) for row in rows if _to_float(row.get("pchome_price")) > 0]
        picks = [
            pick for pick in scored
-            if pick["pick_score"] >= 45 and (_to_float(pick.get("match_score")) >= 0.62)
+            if pick["pick_score"] >= 45 and (_to_float(pick.get("match_score")) >= 0.76)
        ]
        picks.sort(key=lambda item: item["pick_score"], reverse=True)
        picks = picks[:limit]
--- a/services/chart_generator_service.py
+++ b/services/chart_generator_service.py
@@ -107,7 +107,7 @@ def _fetch_price_history(sku: str, days: int = 30) -> Dict[str, Any]:
                WHERE sku = :sku
                  AND source = 'pchome'
                  AND crawled_at >= NOW() - INTERVAL ':days days'
-                  AND COALESCE(match_score, 0) >= 0.62
+                  AND COALESCE(match_score, 0) >= 0.76
                  AND COALESCE(tags, '[]'::jsonb) ? 'identity_v2'
                GROUP BY dt ORDER BY dt
            """.replace(":days", str(days))), {"sku": sku}).fetchall()
@@ -443,7 +443,7 @@ def price_history_heatmap(days: int = 30) -> Optional[bytes]:
                ) pr ON pr.product_id = p.id
                WHERE cp.crawled_at >= NOW() - INTERVAL '{days} days'
                  AND p.category IS NOT NULL
-                  AND COALESCE(cp.match_score, 0) >= 0.62
+                  AND COALESCE(cp.match_score, 0) >= 0.76
                  AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
                GROUP BY p.category, dt
                ORDER BY p.category, dt
--- a/services/competitor_intel_repository.py
+++ b/services/competitor_intel_repository.py
@@ -16,7 +16,7 @@ from typing import Any, Optional, Union
 from sqlalchemy import inspect, text


-PCHOME_MATCH_SCORE_FLOOR = 0.62
+PCHOME_MATCH_SCORE_FLOOR = 0.76


 def _num(value: Any) -> float:
--- a/services/competitor_price_feeder.py
+++ b/services/competitor_price_feeder.py
@@ -34,7 +34,7 @@ from typing import Optional
 logger = logging.getLogger(__name__)

 # ── 比對參數 ─────────────────────────────────────────
-MIN_MATCH_SCORE  = 0.62  # 低於此分數不寫入；核心比價寧可待審也不能錯配
+MIN_MATCH_SCORE  = 0.76  # 低於此分數不寫入；核心比價寧可待審也不能錯配
 REPLACE_DIFFERENT_PRODUCT_SCORE = 0.84  # 已有不同 PChome 商品時，需超高信心才覆蓋
 SEARCH_LIMIT     = 12    # 每個搜尋詞取 PChome 前 N 筆
 MAX_SEARCH_TERMS  = 3     # 每個 MOMO 商品最多嘗試幾組搜尋詞
--- a/services/elephant_alpha_autonomous_engine.py
+++ b/services/elephant_alpha_autonomous_engine.py
@@ -369,7 +369,7 @@ class ElephantAlphaAutonomousEngine:
                    ) pr ON pr.product_id = p.id
                    JOIN competitor_prices cp ON cp.sku = p.i_code
                    WHERE cp.expires_at > NOW()
-                      AND COALESCE(cp.match_score, 0) >= 0.62
+                      AND COALESCE(cp.match_score, 0) >= 0.76
                      AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
                      AND cp.price < pr.price * 0.85
                      AND cp.crawled_at >= NOW() - INTERVAL '2 hours'
@@ -394,7 +394,7 @@ class ElephantAlphaAutonomousEngine:
                    ) pr ON pr.product_id = p.id
                    JOIN competitor_prices cp ON cp.sku = p.i_code
                    WHERE cp.expires_at > NOW()
-                      AND COALESCE(cp.match_score, 0) >= 0.62
+                      AND COALESCE(cp.match_score, 0) >= 0.76
                      AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
                      AND cp.price > pr.price * 1.05
                      AND cp.crawled_at >= NOW() - INTERVAL '1 hour'
--- a/services/hermes_analyst_service.py
+++ b/services/hermes_analyst_service.py
@@ -396,7 +396,7 @@ class HermesAnalystService:
                   ON cp.sku = lmp.sku
                  AND cp.source = 'pchome'
                  AND cp.expires_at > NOW()
-                  AND cp.match_score >= 0.62
+                  AND cp.match_score >= 0.76
                  AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
            WHERE lmp.rn = 1
              AND rs.sales_7d_prev > 0
--- a/services/marketplace_product_matcher.py
+++ b/services/marketplace_product_matcher.py
@@ -390,6 +390,8 @@ def _spec_component(left_values: Iterable[float], right_values: Iterable[float])


 def _count_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, bool]:
+    left_counts = [count for count, _unit in left.counts]
+    right_counts = [count for count, _unit in right.counts]
    if left.total_piece_count and right.total_piece_count:
        if left.total_piece_count == right.total_piece_count:
            return 1.0, False
@@ -398,7 +400,13 @@ def _count_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float,
    if left.counts and right.counts:
        if set(left.counts) & set(right.counts):
            return 0.85, False
+        if left_counts and right_counts:
+            ratio = max(max(left_counts), max(right_counts)) / max(min(max(left_counts), max(right_counts)), 1)
+            if ratio >= 1.5:
+                return 0.0, True
        return 0.35, False
+    if (left_counts and max(left_counts) > 1) or (right_counts and max(right_counts) > 1):
+        return 0.0, True
    return 0.5, False


@@ -428,6 +436,48 @@ def _spec_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, b
    return score, bool(conflicts), tuple(conflicts)


+def _has_bundle_offer(identity: ProductIdentity) -> bool:
+    text = identity.normalized_name
+    return bool(
+        re.search(r"買\s*\d+\s*送\s*\d+", text)
+        or re.search(r"買\s*[一二兩雙三四五六七八九十]\s*送\s*[一二兩雙三四五六七八九十]", text)
+        or "買一送一" in text
+        or "囤貨組" in text
+    )
+
+
+def _has_multi_component(identity: ProductIdentity) -> bool:
+    text = identity.normalized_name
+    return bool(
+        "+" in text
+        or "＋" in text
+        or re.search(r"\d+\s*(?:ml|g|毫升|公克)\s*x\s*\d+", text, re.I)
+    )
+
+
+def _chinese_bigram_score(left: ProductIdentity, right: ProductIdentity) -> float:
+    def signature(identity: ProductIdentity) -> set[str]:
+        text = identity.searchable_name
+        for token in sorted(identity.brand_tokens, key=len, reverse=True):
+            text = text.replace(token, " ")
+        text = re.sub(r"[a-z0-9]+", " ", text)
+        text = "".join(char for char in text if "\u4e00" <= char <= "\u9fff")
+        for phrase in (
+            "官方", "直營", "公司貨", "專櫃", "正貨", "原廠", "限定", "獨家",
+            "期間", "超值", "特惠", "優惠", "新品", "經典", "人氣", "熱銷",
+            "必買", "推薦", "任選", "禮盒", "母親節", "超品日", "多款",
+            "組", "入", "瓶", "盒", "包", "片", "支", "條",
+        ):
+            text = text.replace(phrase, "")
+        return {text[i:i + 2] for i in range(max(0, len(text) - 1))}
+
+    left_signature = signature(left)
+    right_signature = signature(right)
+    if not left_signature or not right_signature:
+        return 0.55
+    return 2 * len(left_signature & right_signature) / (len(left_signature) + len(right_signature))
+
+
 def score_marketplace_match(
    momo_name: str,
    competitor_name: str,
@@ -441,6 +491,7 @@ def score_marketplace_match(
    token_score = _weighted_token_score(left, right)
    spec_score, spec_conflict, spec_reasons = _spec_score(left, right)
    sequence_score = SequenceMatcher(None, left.searchable_name, right.searchable_name).ratio()
+    chinese_name_score = _chinese_bigram_score(left, right)
    if left.product_type and right.product_type:
        type_score = 1.0 if left.product_type == right.product_type else 0.0
    else:
@@ -452,8 +503,20 @@ def score_marketplace_match(
    reasons.extend(spec_reasons)
    if left.product_type and right.product_type and left.product_type != right.product_type:
        reasons.append("type_conflict")
+    if _has_bundle_offer(left) != _has_bundle_offer(right):
+        reasons.append("bundle_offer_conflict")
+    if _has_multi_component(left) != _has_multi_component(right):
+        reasons.append("multi_component_conflict")
+    if chinese_name_score < 0.16:
+        reasons.append("product_line_conflict")

    hard_veto = brand_conflict or spec_conflict
+    if _has_bundle_offer(left) != _has_bundle_offer(right):
+        hard_veto = True
+    if _has_multi_component(left) != _has_multi_component(right):
+        hard_veto = True
+    if chinese_name_score < 0.16 and token_score < 0.72:
+        hard_veto = True
    if left.product_type and right.product_type and left.product_type != right.product_type and token_score < 0.55:
        hard_veto = True

--- a/services/openclaw_strategist_service.py
+++ b/services/openclaw_strategist_service.py
@@ -555,7 +555,7 @@ def _fetch_competitor_summary() -> Dict[str, Any]:
                FROM price_records ORDER BY product_id, timestamp DESC
            ) pr ON pr.product_id = p.id
            WHERE cp.expires_at > NOW()
-              AND COALESCE(cp.match_score, 0) >= 0.62
+              AND COALESCE(cp.match_score, 0) >= 0.76
              AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
        """)).fetchone()
        if row and row[0]:
--- a/services/pchome_crawler.py
+++ b/services/pchome_crawler.py
@@ -505,7 +505,7 @@ def find_best_match(keyword: str, momo_price: float) -> Optional[dict]:
                best = result
                best_score = diagnostics.score
                best_diagnostics = diagnostics
-        if not best or best_score < 0.62:
+        if not best or best_score < 0.76:
            return None
        best['match_score'] = best_score
        best['match_reasons'] = list(getattr(best_diagnostics, 'reasons', ()) or ())
--- a/templates/growth_analysis.html
+++ b/templates/growth_analysis.html
@@ -140,7 +140,7 @@
          {% set coverage = chart_data.competitor_coverage | default({}) %}
          <div class="ga-competitor-quality">
            <span>高信心門檻</span>
-            <strong class="momo-mono">{{ coverage.match_score_floor | default(0.62) }}</strong>
+            <strong class="momo-mono">{{ coverage.match_score_floor | default(0.76) }}</strong>
            <span>有效配對</span>
            <strong class="momo-mono">{{ coverage.valid_matches | default(0) | number_format }}</strong>
            <span>覆蓋率</span>
--- a/tests/test_frontend_v2_assets.py
+++ b/tests/test_frontend_v2_assets.py
@@ -361,7 +361,7 @@ def test_ai_product_pick_agent_uses_real_competitor_data_and_dashboard_action():
    route_source = (ROOT / "routes/ai_routes.py").read_text(encoding="utf-8")
    template = (ROOT / "templates/ai_intelligence.html").read_text(encoding="utf-8")

-    assert "MIN_MATCH_SCORE  = 0.62" in feeder_source
+    assert "MIN_MATCH_SCORE  = 0.76" in feeder_source
    assert "REPLACE_DIFFERENT_PRODUCT_SCORE" in feeder_source
    assert "marketplace_product_matcher" in feeder_source
    assert "MAX_SEARCH_TERMS" in feeder_source
--- a/tests/test_marketplace_product_matcher.py
+++ b/tests/test_marketplace_product_matcher.py
@@ -27,7 +27,7 @@ def test_marketplace_matcher_rejects_brand_conflict_even_when_volume_matches():
        competitor_price=1249,
    )

-    assert diagnostics.score < 0.62
+    assert diagnostics.score < 0.76
    assert diagnostics.hard_veto is True
    assert "brand_conflict" in diagnostics.reasons

@@ -42,7 +42,7 @@ def test_marketplace_matcher_rejects_volume_conflict_for_same_brand():
        competitor_price=3549,
    )

-    assert diagnostics.score < 0.62
+    assert diagnostics.score < 0.76
    assert diagnostics.hard_veto is True
    assert "volume_conflict" in diagnostics.reasons

@@ -57,7 +57,7 @@ def test_marketplace_matcher_handles_bundle_piece_count():
        competitor_price=4590,
    )

-    assert diagnostics.score >= 0.62
+    assert diagnostics.score >= 0.76
    assert diagnostics.hard_veto is False