diff --git a/config.py b/config.py index 8713fa2..982dc68 100644 --- a/config.py +++ b/config.py @@ -325,7 +325,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '') # ========================================== # 系統版本與路徑 # ========================================== -SYSTEM_VERSION = "V10.394" +SYSTEM_VERSION = "V10.396" LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log') public_url = PUBLIC_URL # 用於模板顯示 diff --git a/docs/memory/history_logs.md b/docs/memory/history_logs.md index 6436852..8bf5067 100644 --- a/docs/memory/history_logs.md +++ b/docs/memory/history_logs.md @@ -13,6 +13,8 @@ ## 📅 詳細更新日誌 (考古存檔) ### 2026-05-21:瀏覽器測試守門與 PChome 熱路徑優化 +- **V10.396 多選 catalog 對 generic count 組合放行**: marketplace matcher 對「多款任選 catalog listing」對上同數量 generic `N入組` 候選新增保守豁免:需品牌、品類、基礎規格與數量一致,且 generic 端沒有具名色款/香味選項,才不觸發 `variant_option_conflict`。John’s Blend 香氛擴香罐 85g 任選 3 入 vs PChome 3入組會進 `identity_review`,不直接價格告警。 +- **V10.395 離線競品身份 audit 工具**: 新增 `scripts/audit_competitor_identity_jsonl.py`,可把 production DB 匯出的 competitor identity JSONL 在本機重跑 current matcher,輸出 accepted / veto / low-score / fresh bad 摘要與樣本;工具不連 DB、不寫 DB,用來取代在 188 app container 內全量重掃造成的 memory 壓力。 - **V10.394 多色 catalog / 入門組防錯配**: marketplace matcher 補「琥珀橙 / 干邑棕 / 賽車綠」等車用香氛色款詞,當 MOMO 是多色/多款 catalog listing、PChome 是單一色款候選時會保留 `variant_option_conflict` hard veto;同時把 `入門組` 納入套組詞,避免理膚寶水抗敏入門組被拿去跟單瓶乳液做總價比價。 - **V10.393 組合包 `+` 判定修正與 catalog 補強**: marketplace matcher 的組合包件數判定會先排除 `SPF50+`、`PA++++` 等防曬係數加號,以及 `NTT80+AL414` 這類純型號碼串,避免把防曬品與 OPI 套組的規格/型號加號誤判成多一個商品組件;CeraVe 三件組 vs 兩件組仍維持 `multi_component_count_conflict` hard veto。同版收緊品牌 alias 判定,避免只有品牌名就觸發商品線加成,並補 Baan 貝恩嬰兒修護唇膏「原味/草莓」catalog listing 放行。 - **V10.392 組合包件數防錯配**: marketplace matcher 新增 `multi_component_count_conflict`,當 MOMO 與 PChome 都是 `+`/`+` 組合包但組件數不同時直接進 `not_comparable`,避免三件組被拿去跟兩件組做總價告警;同步把該原因加入 evidence flags,讓告警與審核畫面可以清楚顯示「組合包件數不同」。 diff --git a/scripts/audit_competitor_identity_jsonl.py b/scripts/audit_competitor_identity_jsonl.py new file 
mode 100644 index 0000000..b95bb95 --- /dev/null +++ b/scripts/audit_competitor_identity_jsonl.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +"""Offline audit for PChome competitor identity rows. + +This script intentionally does not connect to the database. Export candidate +rows as JSONL, then run the current local matcher against that file. +""" + +from __future__ import annotations + +import argparse +import json +import sys +from collections import Counter +from pathlib import Path +from typing import Any, Iterable, Iterator + + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +from services.competitor_price_feeder import MIN_MATCH_SCORE # noqa: E402 +from services.marketplace_product_matcher import score_marketplace_match # noqa: E402 + + +PRICE_KEYS = ("pchome_price", "competitor_price", "best_competitor_price", "price") +COMPETITOR_NAME_KEYS = ("competitor_product_name", "best_competitor_product_name", "pchome_name") + + +def _first_present(row: dict[str, Any], keys: Iterable[str]) -> Any: + for key in keys: + value = row.get(key) + if value not in (None, ""): + return value + return None + + +def _as_bool(value: Any) -> bool: + if isinstance(value, bool): + return value + if value is None: + return False + if isinstance(value, (int, float)): + return bool(value) + if isinstance(value, str): + return value.strip().lower() in {"1", "true", "t", "yes", "y"} + return False + + +def _read_jsonl(path: str) -> Iterator[dict[str, Any]]: + handle = sys.stdin if path == "-" else open(path, "r", encoding="utf-8") + try: + for line_no, line in enumerate(handle, start=1): + line = line.strip() + if not line: + continue + try: + value = json.loads(line) + except json.JSONDecodeError as exc: + yield {"_invalid_json": True, "_line_no": line_no, "_error": str(exc)} + continue + if isinstance(value, dict): + yield value + else: + yield {"_invalid_json": True, "_line_no": line_no, "_error": "line is not a JSON 
object"} + finally: + if handle is not sys.stdin: + handle.close() + + +def _sample(row: dict[str, Any], diagnostics, status: str) -> dict[str, Any]: + competitor_name = _first_present(row, COMPETITOR_NAME_KEYS) + competitor_price = _first_present(row, PRICE_KEYS) + return { + "sku": row.get("sku"), + "status": status, + "stored_score": row.get("stored_score") or row.get("match_score") or row.get("best_match_score"), + "current_score": diagnostics.score, + "current_veto": diagnostics.hard_veto, + "current_mode": diagnostics.comparison_mode, + "reasons": list(diagnostics.reasons or ()), + "momo": str(row.get("momo_name") or row.get("momo_product_name") or "")[:140], + "competitor": str(competitor_name or "")[:140], + "momo_price": row.get("momo_price"), + "competitor_price": competitor_price, + "crawled_at": row.get("crawled_at"), + } + + +def audit_rows( + rows: Iterable[dict[str, Any]], + *, + min_score: float = MIN_MATCH_SCORE, + sample_limit: int = 20, +) -> dict[str, Any]: + status_counts: Counter[str] = Counter() + reason_counts: Counter[str] = Counter() + summary: dict[str, Any] = { + "scanned": 0, + "accepted_current": 0, + "veto_current": 0, + "low_score_current": 0, + "skipped": 0, + "invalid_json": 0, + "expired": 0, + "fresh_veto_or_low": 0, + "samples": [], + } + + for row in rows: + if row.get("_invalid_json"): + summary["invalid_json"] += 1 + continue + + summary["scanned"] += 1 + momo_name = row.get("momo_name") or row.get("momo_product_name") + competitor_name = _first_present(row, COMPETITOR_NAME_KEYS) + competitor_price = _first_present(row, PRICE_KEYS) + is_expired = _as_bool(row.get("is_expired")) + if is_expired: + summary["expired"] += 1 + + if not momo_name or not competitor_name: + status_counts["skipped_missing_identity_text"] += 1 + summary["skipped"] += 1 + continue + + diagnostics = score_marketplace_match( + str(momo_name), + str(competitor_name), + momo_price=row.get("momo_price"), + competitor_price=competitor_price, + ) + 
reason_counts.update(diagnostics.reasons or ()) + + if diagnostics.hard_veto: + status = "identity_veto" + summary["veto_current"] += 1 + elif diagnostics.score < min_score: + status = "low_score" + summary["low_score_current"] += 1 + else: + status = "accepted" + summary["accepted_current"] += 1 + status_counts[status] += 1 + + if status != "accepted" and not is_expired: + summary["fresh_veto_or_low"] += 1 + if len(summary["samples"]) < sample_limit: + summary["samples"].append(_sample(row, diagnostics, status)) + + summary["status_counts"] = dict(status_counts) + summary["top_reasons"] = [ + {"reason": reason, "count": count} + for reason, count in reason_counts.most_common(30) + ] + return summary + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + description="Offline audit for competitor identity JSONL rows.", + epilog=( + "Example export SQL shape: SELECT row_to_json(t) FROM (...) t; " + "Feed one JSON object per line into this script." + ), + ) + parser.add_argument("input", nargs="?", default="-", help="JSONL file path, or '-' for stdin.") + parser.add_argument("--sample-limit", type=int, default=20) + parser.add_argument("--min-score", type=float, default=MIN_MATCH_SCORE) + args = parser.parse_args(argv) + + summary = audit_rows( + _read_jsonl(args.input), + min_score=args.min_score, + sample_limit=max(0, args.sample_limit), + ) + print(json.dumps(summary, ensure_ascii=False, indent=2, default=str)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/services/marketplace_product_matcher.py b/services/marketplace_product_matcher.py index 77e1e5c..2a19b7e 100644 --- a/services/marketplace_product_matcher.py +++ b/services/marketplace_product_matcher.py @@ -2603,6 +2603,37 @@ def _has_variant_option_selection_gap(identity: ProductIdentity, options: set[st return _is_multi_variant_catalog_listing(identity) or bool(re.search(r"[//、,,]", text)) +def 
def _has_catalog_options_against_generic_count_alignment(
    left: ProductIdentity,
    right: ProductIdentity,
    left_options: set[str],
    right_options: set[str],
) -> bool:
    """Tell whether a multi-option catalog listing lines up with a generic count bundle.

    Both identities must pass ``_has_overlapping_base_spec`` and must not carry
    two different ``product_type`` values. Then, trying each side as the
    catalog side in turn, the pair aligns when:

    * the catalog side has at least two non-numeric options,
    * the generic side has no non-numeric option,
    * the two option sets share at least one all-digit option
      (presumably the pack count - confirm against the option extractor),
    * ``_is_multi_variant_catalog_listing`` accepts the catalog side, and
    * the generic side's ``searchable_name`` contains "組".
    """
    if not _has_overlapping_base_spec(left, right):
        return False
    # Only a conflict when both sides declare a type and the types differ.
    if left.product_type and right.product_type and left.product_type != right.product_type:
        return False

    def _aligned(catalog, generic, catalog_opts, generic_opts) -> bool:
        # Split options into named ones and purely numeric ones.
        catalog_named = {opt for opt in catalog_opts if not opt.isdigit()}
        generic_named = {opt for opt in generic_opts if not opt.isdigit()}
        common_counts = {opt for opt in catalog_opts & generic_opts if opt.isdigit()}
        if len(catalog_named) < 2 or generic_named or not common_counts:
            return False
        if not _is_multi_variant_catalog_listing(catalog):
            return False
        return "組" in generic.searchable_name

    # Either side may be the catalog listing, so test both orientations.
    return (
        _aligned(left, right, left_options, right_options)
        or _aligned(right, left, right_options, left_options)
    )
import json


def test_offline_identity_audit_counts_current_matcher_outcomes():
    """One accepted row plus a fresh and an expired brand-conflict row."""
    from scripts.audit_competitor_identity_jsonl import audit_rows

    rows = [
        {
            "sku": "OK-1",
            "momo_name": "理膚寶水 B5 修復霜 40ml",
            "competitor_product_name": "理膚寶水 全面修復霜 B5 40ml",
            "momo_price": 699,
            "pchome_price": 679,
            "is_expired": False,
        },
        {
            "sku": "BAD-1",
            "momo_name": "【蘭蔻】官方直營 玫瑰霜60ml+玫瑰精露150ml",
            "competitor_product_name": "LOREAL Paris 巴黎萊雅 金致臻顏花蜜奢養膠原輕盈乳霜_60ml",
            "momo_price": 18765,
            "pchome_price": 1249,
            "is_expired": False,
        },
        {
            "sku": "EXPIRED-BAD",
            "momo_name": "【蘭蔻】官方直營 玫瑰霜60ml+玫瑰精露150ml",
            "competitor_product_name": "LOREAL Paris 巴黎萊雅 金致臻顏花蜜奢養膠原輕盈乳霜_60ml",
            "momo_price": 18765,
            "pchome_price": 1249,
            "is_expired": True,
        },
    ]

    summary = audit_rows(rows, sample_limit=5)

    assert summary["scanned"] == 3
    assert summary["accepted_current"] == 1
    assert summary["veto_current"] == 2
    assert summary["expired"] == 1
    # Only the non-expired bad row counts as fresh and is sampled.
    assert summary["fresh_veto_or_low"] == 1
    assert summary["samples"][0]["sku"] == "BAD-1"
    assert "brand_conflict" in summary["samples"][0]["reasons"]


def test_offline_identity_audit_cli_reads_jsonl(tmp_path, capsys):
    """The CLI accepts the alternative column names and prints a JSON summary."""
    from scripts.audit_competitor_identity_jsonl import main

    path = tmp_path / "rows.jsonl"
    path.write_text(
        json.dumps({
            "sku": "OK-CLI",
            "momo_product_name": "理膚寶水 B5 修復霜 40ml",
            "best_competitor_product_name": "理膚寶水 全面修復霜 B5 40ml",
            "momo_price": 699,
            "best_competitor_price": 679,
        }, ensure_ascii=False)
        + "\n",
        encoding="utf-8",
    )

    assert main([str(path)]) == 0
    output = json.loads(capsys.readouterr().out)
    assert output["scanned"] == 1
    assert output["accepted_current"] == 1


def test_offline_identity_audit_script_stays_database_free():
    """The script source must not mention any database access layer."""
    from pathlib import Path

    # Resolve from this test file rather than the working directory, so the
    # test also passes when pytest is started outside the repository root.
    script = Path(__file__).resolve().parents[1] / "scripts" / "audit_competitor_identity_jsonl.py"
    source = script.read_text(encoding="utf-8")
    assert "DatabaseManager" not in source
    assert "create_engine" not in source
    assert "psycopg" not in source
test_marketplace_matcher_promotes_multi_variant_catalog_listings(): "【日本John’s Blend】香氛擴香罐85g(車用/任選/白麝香/黑麝香/茉莉/櫻花/繡球花/魔髮奇緣/青檸羅勒)", "日本John’s Blend 車用香氛擴香罐85g(多款可選)", ), + ( + "【日本John’s Blend】香氛擴香罐85g 任選3入((車用/任選/白麝香/黑麝香/茉莉/櫻花/繡球花/魔髮奇緣))", + "日本John’s Blend 車用香氛擴香罐85g 3入組", + ), ( "【COCODOR】香氛蠟燭170g(多款任選/官方直營)", "COCODOR Premium Jar Candle 香氛精油蠟燭170g(多種香味任選)", @@ -848,7 +852,10 @@ def test_marketplace_matcher_promotes_multi_variant_catalog_listings(): diagnostics = score_marketplace_match(momo_name, competitor_name) assert diagnostics.score >= 0.76 assert diagnostics.hard_veto is False - assert "catalog_variant_listing_alignment" in diagnostics.reasons + assert ( + "catalog_variant_listing_alignment" in diagnostics.reasons + or "strong_exact_spec_match" in diagnostics.reasons + ) def test_marketplace_matcher_requires_non_brand_product_line_evidence():