V10.396 add offline competitor identity audit
All checks were successful
CD Pipeline / deploy (push) Successful in 1m6s
All checks were successful
CD Pipeline / deploy (push) Successful in 1m6s
This commit is contained in:
@@ -325,7 +325,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '')
|
||||
# ==========================================
|
||||
# 系統版本與路徑
|
||||
# ==========================================
|
||||
SYSTEM_VERSION = "V10.394"
|
||||
SYSTEM_VERSION = "V10.396"
|
||||
LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log')
|
||||
public_url = PUBLIC_URL # 用於模板顯示
|
||||
|
||||
|
||||
@@ -13,6 +13,8 @@
|
||||
## 📅 詳細更新日誌 (考古存檔)
|
||||
|
||||
### 2026-05-21:瀏覽器測試守門與 PChome 熱路徑優化
|
||||
- **V10.396 多選 catalog 對 generic count 組合放行**: marketplace matcher 對「多款任選 catalog listing」對上同數量 generic `N入組` 候選新增保守豁免:需品牌、品類、基礎規格與數量一致,且 generic 端沒有具名色款/香味選項,才不觸發 `variant_option_conflict`。John’s Blend 香氛擴香罐 85g 任選 3 入 vs PChome 3入組會進 `identity_review`,不直接價格告警。
|
||||
- **V10.395 離線競品身份 audit 工具**: 新增 `scripts/audit_competitor_identity_jsonl.py`,可把 production DB 匯出的 competitor identity JSONL 在本機重跑 current matcher,輸出 accepted / veto / low-score / fresh bad 摘要與樣本;工具不連 DB、不寫 DB,用來取代在 188 app container 內全量重掃造成的 memory 壓力。
|
||||
- **V10.394 多色 catalog / 入門組防錯配**: marketplace matcher 補「琥珀橙 / 干邑棕 / 賽車綠」等車用香氛色款詞,當 MOMO 是多色/多款 catalog listing、PChome 是單一色款候選時會保留 `variant_option_conflict` hard veto;同時把 `入門組` 納入套組詞,避免理膚寶水抗敏入門組被拿去跟單瓶乳液做總價比價。
|
||||
- **V10.393 組合包 `+` 判定修正與 catalog 補強**: marketplace matcher 的組合包件數判定會先排除 `SPF50+`、`PA++++` 等防曬係數加號,以及 `NTT80+AL414` 這類純型號碼串,避免把防曬品與 OPI 套組的規格/型號加號誤判成多一個商品組件;CeraVe 三件組 vs 兩件組仍維持 `multi_component_count_conflict` hard veto。同版收緊品牌 alias 判定,避免只有品牌名就觸發商品線加成,並補 Baan 貝恩嬰兒修護唇膏「原味/草莓」catalog listing 放行。
|
||||
- **V10.392 組合包件數防錯配**: marketplace matcher 新增 `multi_component_count_conflict`,當 MOMO 與 PChome 都是 `+`/`+` 組合包但組件數不同時直接進 `not_comparable`,避免三件組被拿去跟兩件組做總價告警;同步把該原因加入 evidence flags,讓告警與審核畫面可以清楚顯示「組合包件數不同」。
|
||||
|
||||
183
scripts/audit_competitor_identity_jsonl.py
Normal file
183
scripts/audit_competitor_identity_jsonl.py
Normal file
@@ -0,0 +1,183 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Offline audit for PChome competitor identity rows.
|
||||
|
||||
This script intentionally does not connect to the database. Export candidate
|
||||
rows as JSONL, then run the current local matcher against that file.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable, Iterator
|
||||
|
||||
|
||||
# Put the repository root (the directory that contains scripts/) on sys.path so
# the project-local ``services`` imports below resolve when this file is run
# directly as a script.
ROOT = Path(__file__).resolve().parent.parent
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
|
||||
|
||||
from services.competitor_price_feeder import MIN_MATCH_SCORE # noqa: E402
|
||||
from services.marketplace_product_matcher import score_marketplace_match # noqa: E402
|
||||
|
||||
|
||||
# Row keys that may carry the competitor price, in lookup priority order.
PRICE_KEYS = (
    "pchome_price",
    "competitor_price",
    "best_competitor_price",
    "price",
)
# Row keys that may carry the competitor product name, in lookup priority order.
COMPETITOR_NAME_KEYS = (
    "competitor_product_name",
    "best_competitor_product_name",
    "pchome_name",
)
|
||||
|
||||
|
||||
def _first_present(row: dict[str, Any], keys: Iterable[str]) -> Any:
|
||||
for key in keys:
|
||||
value = row.get(key)
|
||||
if value not in (None, ""):
|
||||
return value
|
||||
return None
|
||||
|
||||
|
||||
def _as_bool(value: Any) -> bool:
|
||||
if isinstance(value, bool):
|
||||
return value
|
||||
if value is None:
|
||||
return False
|
||||
if isinstance(value, (int, float)):
|
||||
return bool(value)
|
||||
if isinstance(value, str):
|
||||
return value.strip().lower() in {"1", "true", "t", "yes", "y"}
|
||||
return False
|
||||
|
||||
|
||||
def _read_jsonl(path: str) -> Iterator[dict[str, Any]]:
|
||||
handle = sys.stdin if path == "-" else open(path, "r", encoding="utf-8")
|
||||
try:
|
||||
for line_no, line in enumerate(handle, start=1):
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
try:
|
||||
value = json.loads(line)
|
||||
except json.JSONDecodeError as exc:
|
||||
yield {"_invalid_json": True, "_line_no": line_no, "_error": str(exc)}
|
||||
continue
|
||||
if isinstance(value, dict):
|
||||
yield value
|
||||
else:
|
||||
yield {"_invalid_json": True, "_line_no": line_no, "_error": "line is not a JSON object"}
|
||||
finally:
|
||||
if handle is not sys.stdin:
|
||||
handle.close()
|
||||
|
||||
|
||||
def _sample(row: dict[str, Any], diagnostics, status: str) -> dict[str, Any]:
    """Build the compact report entry for one audited row.

    Args:
        row: The exported row being audited.
        diagnostics: Matcher result for this row; its ``score``, ``hard_veto``,
            ``comparison_mode`` and ``reasons`` attributes are read.
        status: Audit outcome label chosen by the caller.

    Returns:
        A dict with the row's SKU, the stored and current scores, the current
        veto flag, comparison mode and reasons, both product names (truncated
        to 140 characters), both prices and the crawl timestamp.
    """
    competitor_name = _first_present(row, COMPETITOR_NAME_KEYS)
    competitor_price = _first_present(row, PRICE_KEYS)
    # Use the presence check rather than an ``or`` chain: ``or`` would treat a
    # legitimately stored score of 0 as missing and report another key's value
    # (or None) in its place.
    stored_score = _first_present(row, ("stored_score", "match_score", "best_match_score"))
    momo_name = row.get("momo_name") or row.get("momo_product_name") or ""
    return {
        "sku": row.get("sku"),
        "status": status,
        "stored_score": stored_score,
        "current_score": diagnostics.score,
        "current_veto": diagnostics.hard_veto,
        "current_mode": diagnostics.comparison_mode,
        "reasons": list(diagnostics.reasons or ()),
        # Names are clipped so a sample stays readable in the printed report.
        "momo": str(momo_name)[:140],
        "competitor": str(competitor_name or "")[:140],
        "momo_price": row.get("momo_price"),
        "competitor_price": competitor_price,
        "crawled_at": row.get("crawled_at"),
    }
|
||||
|
||||
|
||||
def audit_rows(
    rows: Iterable[dict[str, Any]],
    *,
    min_score: float = MIN_MATCH_SCORE,
    sample_limit: int = 20,
) -> dict[str, Any]:
    """Re-score exported competitor rows with the current matcher and summarise.

    Args:
        rows: Row dicts, typically from ``_read_jsonl``.  Rows flagged with
            ``_invalid_json`` are counted and otherwise ignored.
        min_score: Rows that are not vetoed but score below this are reported
            as ``low_score``.
        sample_limit: Maximum number of non-expired, non-accepted rows to keep
            as samples.

    Returns:
        A summary dict with the scan counters, the collected ``samples``,
        ``status_counts`` per outcome label and ``top_reasons`` (the 30 most
        common matcher reasons with their counts).
    """
    per_status: Counter[str] = Counter()
    per_reason: Counter[str] = Counter()
    summary: dict[str, Any] = {
        "scanned": 0,
        "accepted_current": 0,
        "veto_current": 0,
        "low_score_current": 0,
        "skipped": 0,
        "invalid_json": 0,
        "expired": 0,
        "fresh_veto_or_low": 0,
        "samples": [],
    }
    samples = summary["samples"]
    # Which summary counter each outcome label feeds.
    counter_for_status = {
        "identity_veto": "veto_current",
        "low_score": "low_score_current",
        "accepted": "accepted_current",
    }

    for row in rows:
        if row.get("_invalid_json"):
            summary["invalid_json"] += 1
            continue

        summary["scanned"] += 1
        momo_name = row.get("momo_name") or row.get("momo_product_name")
        competitor_name = _first_present(row, COMPETITOR_NAME_KEYS)
        competitor_price = _first_present(row, PRICE_KEYS)
        expired = _as_bool(row.get("is_expired"))
        # Expired rows are tallied even when they are skipped just below.
        if expired:
            summary["expired"] += 1

        # Without both product names there is nothing for the matcher to compare.
        if not (momo_name and competitor_name):
            per_status["skipped_missing_identity_text"] += 1
            summary["skipped"] += 1
            continue

        diagnostics = score_marketplace_match(
            str(momo_name),
            str(competitor_name),
            momo_price=row.get("momo_price"),
            competitor_price=competitor_price,
        )
        per_reason.update(diagnostics.reasons or ())

        # A hard veto takes precedence over the score threshold.
        if diagnostics.hard_veto:
            status = "identity_veto"
        elif diagnostics.score < min_score:
            status = "low_score"
        else:
            status = "accepted"
        summary[counter_for_status[status]] += 1
        per_status[status] += 1

        if status == "accepted" or expired:
            continue
        # Only non-expired rows that the current matcher rejects become samples.
        summary["fresh_veto_or_low"] += 1
        if len(samples) < sample_limit:
            samples.append(_sample(row, diagnostics, status))

    summary["status_counts"] = dict(per_status)
    summary["top_reasons"] = [
        {"reason": reason, "count": count}
        for reason, count in per_reason.most_common(30)
    ]
    return summary
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: audit a JSONL file and print the summary as JSON.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use ``sys.argv``.

    Returns:
        ``0`` once the summary has been printed.
    """
    epilog = (
        "Example export SQL shape: SELECT row_to_json(t) FROM (...) t; "
        "Feed one JSON object per line into this script."
    )
    parser = argparse.ArgumentParser(
        description="Offline audit for competitor identity JSONL rows.",
        epilog=epilog,
    )
    parser.add_argument("input", nargs="?", default="-", help="JSONL file path, or '-' for stdin.")
    parser.add_argument("--sample-limit", type=int, default=20)
    parser.add_argument("--min-score", type=float, default=MIN_MATCH_SCORE)
    options = parser.parse_args(argv)

    # A negative sample limit is clamped to zero, i.e. no samples are collected.
    sample_limit = options.sample_limit if options.sample_limit > 0 else 0
    report = audit_rows(
        _read_jsonl(options.input),
        min_score=options.min_score,
        sample_limit=sample_limit,
    )
    # default=str keeps the dump from failing on values json cannot encode natively.
    print(json.dumps(report, ensure_ascii=False, indent=2, default=str))
    return 0
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
@@ -2603,6 +2603,37 @@ def _has_variant_option_selection_gap(identity: ProductIdentity, options: set[st
|
||||
return _is_multi_variant_catalog_listing(identity) or bool(re.search(r"[//、,,]", text))
|
||||
|
||||
|
||||
def _has_catalog_options_against_generic_count_alignment(
    left: ProductIdentity,
    right: ProductIdentity,
    left_options: set[str],
    right_options: set[str],
) -> bool:
    """Detect a multi-variant catalog listing paired with a generic count bundle.

    Returns True when, in either orientation, one side is a multi-variant
    catalog listing with at least two named (non-numeric) options, the other
    side has no named options at all and contains "組" in its searchable name,
    and both sides share at least one purely numeric option.  The pair must
    also have overlapping base specs and must not carry two different product
    types.
    """
    if not _has_overlapping_base_spec(left, right):
        return False
    types_differ = (
        left.product_type
        and right.product_type
        and left.product_type != right.product_type
    )
    if types_differ:
        return False

    # Purely numeric options are treated as counts; everything else is a named
    # variant.  These sets do not depend on orientation, so build them once.
    named_left = {option for option in left_options if not option.isdigit()}
    named_right = {option for option in right_options if not option.isdigit()}
    shared_counts = {
        option for option in left_options & right_options if option.isdigit()
    }

    # Try both assignments of the "catalog" and "generic" roles.
    orientations = (
        (left, right, named_left, named_right),
        (right, left, named_right, named_left),
    )
    for catalog_identity, generic_identity, named_catalog, named_generic in orientations:
        if len(named_catalog) < 2 or named_generic or not shared_counts:
            continue
        if (
            _is_multi_variant_catalog_listing(catalog_identity)
            and "組" in generic_identity.searchable_name
        ):
            return True
    return False
|
||||
|
||||
|
||||
def _has_explicit_variant_option_conflict(
|
||||
left: ProductIdentity,
|
||||
right: ProductIdentity,
|
||||
@@ -2617,6 +2648,8 @@ def _has_explicit_variant_option_conflict(
|
||||
if left_options == right_options:
|
||||
return False
|
||||
if left_options & right_options:
|
||||
if _has_catalog_options_against_generic_count_alignment(left, right, left_options, right_options):
|
||||
return False
|
||||
if (
|
||||
len(left_options) > len(right_options)
|
||||
and _has_variant_option_selection_gap(left, left_options)
|
||||
|
||||
73
tests/test_competitor_identity_jsonl_audit.py
Normal file
73
tests/test_competitor_identity_jsonl_audit.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import json
|
||||
|
||||
|
||||
def test_offline_identity_audit_counts_current_matcher_outcomes():
    """One matching row and two vetoed rows (one expired) produce the expected counters."""
    from scripts.audit_competitor_identity_jsonl import audit_rows

    matching_row = {
        "sku": "OK-1",
        "momo_name": "理膚寶水 B5 修復霜 40ml",
        "competitor_product_name": "理膚寶水 全面修復霜 B5 40ml",
        "momo_price": 699,
        "pchome_price": 679,
        "is_expired": False,
    }
    # The two product names carry different brands; the assertions below
    # expect the matcher to veto this pair with a brand conflict.
    mismatched_row = {
        "sku": "BAD-1",
        "momo_name": "【蘭蔻】官方直營 玫瑰霜60ml+玫瑰精露150ml",
        "competitor_product_name": "LOREAL Paris 巴黎萊雅 金致臻顏花蜜奢養膠原輕盈乳霜_60ml",
        "momo_price": 18765,
        "pchome_price": 1249,
        "is_expired": False,
    }
    # Same mismatch, but expired: it must count as a veto without becoming a
    # fresh finding or a sample.
    expired_mismatched_row = {**mismatched_row, "sku": "EXPIRED-BAD", "is_expired": True}

    summary = audit_rows(
        [matching_row, mismatched_row, expired_mismatched_row],
        sample_limit=5,
    )

    expected_counters = {
        "scanned": 3,
        "accepted_current": 1,
        "veto_current": 2,
        "expired": 1,
        "fresh_veto_or_low": 1,
    }
    assert {key: summary[key] for key in expected_counters} == expected_counters
    first_sample = summary["samples"][0]
    assert first_sample["sku"] == "BAD-1"
    assert "brand_conflict" in first_sample["reasons"]
|
||||
|
||||
|
||||
def test_offline_identity_audit_cli_reads_jsonl(tmp_path, capsys):
    """The CLI reads a JSONL file using the alternate key names and prints a JSON summary."""
    from scripts.audit_competitor_identity_jsonl import main

    # Deliberately uses the alternate key spellings
    # (momo_product_name / best_competitor_*).
    record = {
        "sku": "OK-CLI",
        "momo_product_name": "理膚寶水 B5 修復霜 40ml",
        "best_competitor_product_name": "理膚寶水 全面修復霜 B5 40ml",
        "momo_price": 699,
        "best_competitor_price": 679,
    }
    jsonl_path = tmp_path / "rows.jsonl"
    jsonl_path.write_text(json.dumps(record, ensure_ascii=False) + "\n", encoding="utf-8")

    exit_code = main([str(jsonl_path)])
    assert exit_code == 0

    printed = json.loads(capsys.readouterr().out)
    assert printed["scanned"] == 1
    assert printed["accepted_current"] == 1
|
||||
|
||||
|
||||
def test_offline_identity_audit_script_stays_database_free():
    """Guard that the audit script's source never references database access code."""
    from pathlib import Path

    # Resolve the script from this test file's location (tests/ sits next to
    # scripts/) rather than from the current working directory, so the test
    # also passes when pytest is started outside the repository root.
    script_path = (
        Path(__file__).resolve().parents[1]
        / "scripts"
        / "audit_competitor_identity_jsonl.py"
    )
    source = script_path.read_text(encoding="utf-8")
    for forbidden in ("DatabaseManager", "create_engine", "psycopg"):
        assert forbidden not in source
|
||||
@@ -834,6 +834,10 @@ def test_marketplace_matcher_promotes_multi_variant_catalog_listings():
|
||||
"【日本John’s Blend】香氛擴香罐85g(車用/任選/白麝香/黑麝香/茉莉/櫻花/繡球花/魔髮奇緣/青檸羅勒)",
|
||||
"日本John’s Blend 車用香氛擴香罐85g(多款可選)",
|
||||
),
|
||||
(
|
||||
"【日本John’s Blend】香氛擴香罐85g 任選3入((車用/任選/白麝香/黑麝香/茉莉/櫻花/繡球花/魔髮奇緣))",
|
||||
"日本John’s Blend 車用香氛擴香罐85g 3入組",
|
||||
),
|
||||
(
|
||||
"【COCODOR】香氛蠟燭170g(多款任選/官方直營)",
|
||||
"COCODOR Premium Jar Candle 香氛精油蠟燭170g(多種香味任選)",
|
||||
@@ -848,7 +852,10 @@ def test_marketplace_matcher_promotes_multi_variant_catalog_listings():
|
||||
diagnostics = score_marketplace_match(momo_name, competitor_name)
|
||||
assert diagnostics.score >= 0.76
|
||||
assert diagnostics.hard_veto is False
|
||||
assert "catalog_variant_listing_alignment" in diagnostics.reasons
|
||||
assert (
|
||||
"catalog_variant_listing_alignment" in diagnostics.reasons
|
||||
or "strong_exact_spec_match" in diagnostics.reasons
|
||||
)
|
||||
|
||||
|
||||
def test_marketplace_matcher_requires_non_brand_product_line_evidence():
|
||||
|
||||
Reference in New Issue
Block a user