V10.396 add offline competitor identity audit

2026-05-24 11:08:55 +08:00
parent 68f9e051f4
commit 01c888c565
6 changed files with 300 additions and 2 deletions
--- a/config.py
+++ b/config.py
@@ -325,7 +325,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '')
 # ==========================================
 # 系統版本與路徑
 # ==========================================
-SYSTEM_VERSION = "V10.394"
+SYSTEM_VERSION = "V10.396"
 LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log')
 public_url = PUBLIC_URL  # 用於模板顯示

--- a/docs/memory/history_logs.md
+++ b/docs/memory/history_logs.md
@@ -13,6 +13,8 @@
 ## 📅 詳細更新日誌 (考古存檔)

 ### 2026-05-21：瀏覽器測試守門與 PChome 熱路徑優化
+- **V10.396 多選 catalog 對 generic count 組合放行**: marketplace matcher 對「多款任選 catalog listing」對上同數量 generic `N入組` 候選新增保守豁免：需品牌、品類、基礎規格與數量一致，且 generic 端沒有具名色款/香味選項，才不觸發 `variant_option_conflict`。John’s Blend 香氛擴香罐 85g 任選 3 入 vs PChome 3入組會進 `identity_review`，不直接價格告警。
+- **V10.395 離線競品身份 audit 工具**: 新增 `scripts/audit_competitor_identity_jsonl.py`，可把 production DB 匯出的 competitor identity JSONL 在本機重跑 current matcher，輸出 accepted / veto / low-score / fresh bad 摘要與樣本；工具不連 DB、不寫 DB，用來取代在 188 app container 內全量重掃造成的 memory 壓力。
 - **V10.394 多色 catalog / 入門組防錯配**: marketplace matcher 補「琥珀橙 / 干邑棕 / 賽車綠」等車用香氛色款詞，當 MOMO 是多色/多款 catalog listing、PChome 是單一色款候選時會保留 `variant_option_conflict` hard veto；同時把 `入門組` 納入套組詞，避免理膚寶水抗敏入門組被拿去跟單瓶乳液做總價比價。
 - **V10.393 組合包 `+` 判定修正與 catalog 補強**: marketplace matcher 的組合包件數判定會先排除 `SPF50+`、`PA++++` 等防曬係數加號，以及 `NTT80+AL414` 這類純型號碼串，避免把防曬品與 OPI 套組的規格/型號加號誤判成多一個商品組件；CeraVe 三件組 vs 兩件組仍維持 `multi_component_count_conflict` hard veto。同版收緊品牌 alias 判定，避免只有品牌名就觸發商品線加成，並補 Baan 貝恩嬰兒修護唇膏「原味/草莓」catalog listing 放行。
 - **V10.392 組合包件數防錯配**: marketplace matcher 新增 `multi_component_count_conflict`，當 MOMO 與 PChome 都是 `+`/`＋` 組合包但組件數不同時直接進 `not_comparable`，避免三件組被拿去跟兩件組做總價告警；同步把該原因加入 evidence flags，讓告警與審核畫面可以清楚顯示「組合包件數不同」。
--- a/scripts/audit_competitor_identity_jsonl.py
+++ b/scripts/audit_competitor_identity_jsonl.py
@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+"""Offline audit for PChome competitor identity rows.
+
+This script intentionally does not connect to the database. Export candidate
+rows as JSONL, then run the current local matcher against that file.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from collections import Counter
+from pathlib import Path
+from typing import Any, Iterable, Iterator
+
+
+ROOT = Path(__file__).resolve().parents[1]
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+
+from services.competitor_price_feeder import MIN_MATCH_SCORE  # noqa: E402
+from services.marketplace_product_matcher import score_marketplace_match  # noqa: E402
+
+
+# Row keys that may carry the competitor price; _first_present() checks them in this order.
+PRICE_KEYS = ("pchome_price", "competitor_price", "best_competitor_price", "price")
+# Row keys that may carry the competitor product name; _first_present() checks them in this order.
+COMPETITOR_NAME_KEYS = ("competitor_product_name", "best_competitor_product_name", "pchome_name")
+
+
+def _first_present(row: dict[str, Any], keys: Iterable[str]) -> Any:
+    """Return the first usable value found in ``row`` under any of ``keys``.
+
+    Keys are tried in the order given. A value counts as usable when it is
+    neither ``None`` nor an empty string, so ``0`` and ``False`` are returned
+    as-is. ``None`` is returned when no key yields a usable value.
+    """
+    looked_up = (row.get(name) for name in keys)
+    return next((found for found in looked_up if found not in (None, "")), None)
+
+
+def _as_bool(value: Any) -> bool:
+    """Coerce a loosely typed flag value to ``bool``.
+
+    Strings are true only when, stripped and lower-cased, they are one of
+    ``1``, ``true``, ``t``, ``yes`` or ``y``. Booleans and numbers follow
+    normal truthiness. ``None`` and every other type are ``False``.
+    """
+    if isinstance(value, str):
+        return value.strip().lower() in {"1", "true", "t", "yes", "y"}
+    if isinstance(value, (bool, int, float)):
+        return bool(value)
+    return False
+
+
+def _read_jsonl(path: str) -> Iterator[dict[str, Any]]:
+    """Yield one dict per non-blank line of a JSONL source.
+
+    A ``path`` of ``"-"`` reads ``sys.stdin``; anything else is opened as a
+    UTF-8 text file, which is closed once the generator finishes or is closed.
+    Lines that fail to parse, or that parse to something other than a JSON
+    object, do not raise: they are yielded as marker dicts of the form
+    ``{"_invalid_json": True, "_line_no": ..., "_error": ...}``.
+    """
+    stream = open(path, "r", encoding="utf-8") if path != "-" else sys.stdin
+    try:
+        for line_no, raw_line in enumerate(stream, start=1):
+            text = raw_line.strip()
+            if not text:
+                continue
+            try:
+                parsed = json.loads(text)
+            except json.JSONDecodeError as exc:
+                problem = str(exc)
+            else:
+                if isinstance(parsed, dict):
+                    yield parsed
+                    continue
+                problem = "line is not a JSON object"
+            # Report the bad line to the caller instead of aborting the whole scan.
+            yield {"_invalid_json": True, "_line_no": line_no, "_error": problem}
+    finally:
+        # Never close the interpreter's stdin; only files opened here.
+        if stream is not sys.stdin:
+            stream.close()
+
+
+def _sample(row: dict[str, Any], diagnostics, status: str) -> dict[str, Any]:
+    """Build one compact sample record for the audit summary.
+
+    Args:
+        row: The exported row being audited.
+        diagnostics: Result of the matcher call for this row; its ``score``,
+            ``hard_veto``, ``comparison_mode`` and ``reasons`` attributes are read.
+        status: Status label assigned to the row by the caller.
+
+    Returns:
+        A dict pairing the stored row data with the current matcher outcome.
+        Both product names are truncated to 140 characters.
+    """
+    competitor_name = _first_present(row, COMPETITOR_NAME_KEYS)
+    competitor_price = _first_present(row, PRICE_KEYS)
+    # Use _first_present rather than an `or` chain: a stored score of 0 / 0.0 is
+    # a real value and must not fall through to the next key or become None.
+    stored_score = _first_present(row, ("stored_score", "match_score", "best_match_score"))
+    return {
+        "sku": row.get("sku"),
+        "status": status,
+        "stored_score": stored_score,
+        "current_score": diagnostics.score,
+        "current_veto": diagnostics.hard_veto,
+        "current_mode": diagnostics.comparison_mode,
+        "reasons": list(diagnostics.reasons or ()),
+        "momo": str(row.get("momo_name") or row.get("momo_product_name") or "")[:140],
+        "competitor": str(competitor_name or "")[:140],
+        "momo_price": row.get("momo_price"),
+        "competitor_price": competitor_price,
+        "crawled_at": row.get("crawled_at"),
+    }
+
+
+def audit_rows(
+    rows: Iterable[dict[str, Any]],
+    *,
+    min_score: float = MIN_MATCH_SCORE,
+    sample_limit: int = 20,
+) -> dict[str, Any]:
+    """Re-score exported rows with the current matcher and tally the outcomes.
+
+    Args:
+        rows: Row dicts. A row with a truthy ``_invalid_json`` entry is only
+            counted under ``invalid_json`` and is not scored.
+        min_score: Rows without a hard veto that score below this value are
+            classified as ``low_score``.
+        sample_limit: Maximum number of non-expired veto / low-score rows kept
+            in ``samples``.
+
+    Returns:
+        A summary dict with the counters, ``samples``, ``status_counts`` and
+        ``top_reasons`` (the 30 most common matcher reasons).
+    """
+    # Which summary counter each scored status feeds.
+    counter_for_status = {
+        "identity_veto": "veto_current",
+        "low_score": "low_score_current",
+        "accepted": "accepted_current",
+    }
+    totals: Counter[str] = Counter()
+    per_status: Counter[str] = Counter()
+    per_reason: Counter[str] = Counter()
+    samples: list[dict[str, Any]] = []
+
+    for row in rows:
+        if row.get("_invalid_json"):
+            totals["invalid_json"] += 1
+            continue
+
+        totals["scanned"] += 1
+        expired = _as_bool(row.get("is_expired"))
+        if expired:
+            # Expired rows are counted even when they are skipped below.
+            totals["expired"] += 1
+
+        momo_name = row.get("momo_name") or row.get("momo_product_name")
+        competitor_name = _first_present(row, COMPETITOR_NAME_KEYS)
+        if not (momo_name and competitor_name):
+            per_status["skipped_missing_identity_text"] += 1
+            totals["skipped"] += 1
+            continue
+
+        diagnostics = score_marketplace_match(
+            str(momo_name),
+            str(competitor_name),
+            momo_price=row.get("momo_price"),
+            competitor_price=_first_present(row, PRICE_KEYS),
+        )
+        per_reason.update(diagnostics.reasons or ())
+
+        # A hard veto takes precedence over the score threshold.
+        if diagnostics.hard_veto:
+            status = "identity_veto"
+        elif diagnostics.score < min_score:
+            status = "low_score"
+        else:
+            status = "accepted"
+        totals[counter_for_status[status]] += 1
+        per_status[status] += 1
+
+        if status == "accepted" or expired:
+            continue
+        totals["fresh_veto_or_low"] += 1
+        if len(samples) < sample_limit:
+            samples.append(_sample(row, diagnostics, status))
+
+    counter_keys = (
+        "scanned",
+        "accepted_current",
+        "veto_current",
+        "low_score_current",
+        "skipped",
+        "invalid_json",
+        "expired",
+        "fresh_veto_or_low",
+    )
+    summary: dict[str, Any] = {key: totals[key] for key in counter_keys}
+    summary["samples"] = samples
+    summary["status_counts"] = dict(per_status)
+    summary["top_reasons"] = [
+        {"reason": reason, "count": count}
+        for reason, count in per_reason.most_common(30)
+    ]
+    return summary
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Command-line entry point: audit a JSONL file and print the JSON summary.
+
+    Args:
+        argv: Argument list handed to ``argparse``; ``None`` lets argparse use
+            the process arguments.
+
+    Returns:
+        Always ``0``.
+    """
+    cli = argparse.ArgumentParser(
+        description="Offline audit for competitor identity JSONL rows.",
+        epilog=(
+            "Example export SQL shape: SELECT row_to_json(t) FROM (...) t; "
+            "Feed one JSON object per line into this script."
+        ),
+    )
+    cli.add_argument("input", nargs="?", default="-", help="JSONL file path, or '-' for stdin.")
+    cli.add_argument("--sample-limit", type=int, default=20)
+    cli.add_argument("--min-score", type=float, default=MIN_MATCH_SCORE)
+    options = cli.parse_args(argv)
+
+    # A negative --sample-limit is clamped to "keep no samples".
+    sample_cap = max(0, options.sample_limit)
+    report = audit_rows(
+        _read_jsonl(options.input),
+        min_score=options.min_score,
+        sample_limit=sample_cap,
+    )
+    rendered = json.dumps(report, ensure_ascii=False, indent=2, default=str)
+    print(rendered)
+    return 0
+
+
+# When run as a script, use main()'s return value as the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/services/marketplace_product_matcher.py
+++ b/services/marketplace_product_matcher.py
@@ -2603,6 +2603,37 @@ def _has_variant_option_selection_gap(identity: ProductIdentity, options: set[st
    return _is_multi_variant_catalog_listing(identity) or bool(re.search(r"[/／、,，]", text))


+def _has_catalog_options_against_generic_count_alignment(
+    left: ProductIdentity,
+    right: ProductIdentity,
+    left_options: set[str],
+    right_options: set[str],
+) -> bool:
+    """Detect a multi-variant catalog listing paired with a generic count bundle.
+
+    Returns ``True`` when, in either orientation, one side has at least two
+    non-numeric options and is a multi-variant catalog listing, while the other
+    side has no non-numeric options and its ``searchable_name`` contains "組",
+    and both sides share at least one all-digit option. The base specs must
+    overlap and the product types must not disagree when both are set.
+    """
+    if not _has_overlapping_base_spec(left, right):
+        return False
+    types_disagree = (
+        left.product_type
+        and right.product_type
+        and left.product_type != right.product_type
+    )
+    if types_disagree:
+        return False
+
+    # Options made only of digits are treated as counts; the rest as named variants.
+    left_named = {option for option in left_options if not option.isdigit()}
+    right_named = {option for option in right_options if not option.isdigit()}
+    shared_counts = {
+        option for option in left_options & right_options
+        if option.isdigit()
+    }
+    if not shared_counts:
+        return False
+
+    # Try each side as the catalog listing, with the other as the generic bundle.
+    orientations = (
+        (left, left_named, right, right_named),
+        (right, right_named, left, left_named),
+    )
+    return any(
+        len(catalog_named) >= 2
+        and not generic_named
+        and _is_multi_variant_catalog_listing(catalog_identity)
+        and "組" in generic_identity.searchable_name
+        for catalog_identity, catalog_named, generic_identity, generic_named in orientations
+    )
+
+
 def _has_explicit_variant_option_conflict(
    left: ProductIdentity,
    right: ProductIdentity,
@@ -2617,6 +2648,8 @@ def _has_explicit_variant_option_conflict(
    if left_options == right_options:
        return False
    if left_options & right_options:
+        if _has_catalog_options_against_generic_count_alignment(left, right, left_options, right_options):
+            return False
        if (
            len(left_options) > len(right_options)
            and _has_variant_option_selection_gap(left, left_options)
--- a/tests/test_competitor_identity_jsonl_audit.py
+++ b/tests/test_competitor_identity_jsonl_audit.py
@@ -0,0 +1,73 @@
+import json
+
+
+def test_offline_identity_audit_counts_current_matcher_outcomes():
+    """One accepted row, one fresh veto and one expired veto are tallied separately."""
+    from scripts.audit_competitor_identity_jsonl import audit_rows
+
+    same_product = {
+        "sku": "OK-1",
+        "momo_name": "理膚寶水 B5 修復霜 40ml",
+        "competitor_product_name": "理膚寶水 全面修復霜 B5 40ml",
+        "momo_price": 699,
+        "pchome_price": 679,
+        "is_expired": False,
+    }
+    # The same cross-brand pair is used for both the fresh and the expired row.
+    brand_mismatch = {
+        "momo_name": "【蘭蔻】官方直營 玫瑰霜60ml+玫瑰精露150ml",
+        "competitor_product_name": "LOREAL Paris 巴黎萊雅 金致臻顏花蜜奢養膠原輕盈乳霜_60ml",
+        "momo_price": 18765,
+        "pchome_price": 1249,
+    }
+    rows = [
+        same_product,
+        {"sku": "BAD-1", **brand_mismatch, "is_expired": False},
+        {"sku": "EXPIRED-BAD", **brand_mismatch, "is_expired": True},
+    ]
+
+    summary = audit_rows(rows, sample_limit=5)
+
+    assert summary["scanned"] == 3
+    assert summary["accepted_current"] == 1
+    assert summary["veto_current"] == 2
+    assert summary["expired"] == 1
+    assert summary["fresh_veto_or_low"] == 1
+    # Only the non-expired veto is sampled.
+    first_sample = summary["samples"][0]
+    assert first_sample["sku"] == "BAD-1"
+    assert "brand_conflict" in first_sample["reasons"]
+
+
+def test_offline_identity_audit_cli_reads_jsonl(tmp_path, capsys):
+    """The CLI reads a one-line JSONL file and prints a parseable JSON summary."""
+    from scripts.audit_competitor_identity_jsonl import main
+
+    # This row uses the alternative key names for both product names and the price.
+    row = {
+        "sku": "OK-CLI",
+        "momo_product_name": "理膚寶水 B5 修復霜 40ml",
+        "best_competitor_product_name": "理膚寶水 全面修復霜 B5 40ml",
+        "momo_price": 699,
+        "best_competitor_price": 679,
+    }
+    jsonl_file = tmp_path / "rows.jsonl"
+    jsonl_file.write_text(json.dumps(row, ensure_ascii=False) + "\n", encoding="utf-8")
+
+    exit_code = main([str(jsonl_file)])
+
+    assert exit_code == 0
+    output = json.loads(capsys.readouterr().out)
+    assert output["scanned"] == 1
+    assert output["accepted_current"] == 1
+
+
+def test_offline_identity_audit_script_stays_database_free():
+    """Guard that the audit script's source names no database access helpers.
+
+    The script path is resolved from this test file's location rather than the
+    current working directory, so the check also works when pytest is started
+    from outside the repository root.
+    """
+    from pathlib import Path
+
+    repo_root = Path(__file__).resolve().parents[1]
+    script_path = repo_root / "scripts" / "audit_competitor_identity_jsonl.py"
+    source = script_path.read_text(encoding="utf-8")
+    for forbidden in ("DatabaseManager", "create_engine", "psycopg"):
+        assert forbidden not in source
--- a/tests/test_marketplace_product_matcher.py
+++ b/tests/test_marketplace_product_matcher.py
@@ -834,6 +834,10 @@ def test_marketplace_matcher_promotes_multi_variant_catalog_listings():
            "【日本John’s Blend】香氛擴香罐85g(車用/任選/白麝香/黑麝香/茉莉/櫻花/繡球花/魔髮奇緣/青檸羅勒)",
            "日本John’s Blend 車用香氛擴香罐85g(多款可選)",
        ),
+        (
+            "【日本John’s Blend】香氛擴香罐85g 任選3入((車用/任選/白麝香/黑麝香/茉莉/櫻花/繡球花/魔髮奇緣))",
+            "日本John’s Blend 車用香氛擴香罐85g 3入組",
+        ),
        (
            "【COCODOR】香氛蠟燭170g(多款任選/官方直營)",
            "COCODOR Premium Jar Candle 香氛精油蠟燭170g(多種香味任選)",
@@ -848,7 +852,10 @@ def test_marketplace_matcher_promotes_multi_variant_catalog_listings():
        diagnostics = score_marketplace_match(momo_name, competitor_name)
        assert diagnostics.score >= 0.76
        assert diagnostics.hard_veto is False
-        assert "catalog_variant_listing_alignment" in diagnostics.reasons
+        assert (
+            "catalog_variant_listing_alignment" in diagnostics.reasons
+            or "strong_exact_spec_match" in diagnostics.reasons
+        )


 def test_marketplace_matcher_requires_non_brand_product_line_evidence():