V10.396 add offline competitor identity audit
All checks were successful
CD Pipeline / deploy (push) Successful in 1m6s

This commit is contained in:
OoO
2026-05-24 11:08:55 +08:00
committed by AiderHeal Bot
parent 68f9e051f4
commit 01c888c565
6 changed files with 300 additions and 2 deletions

View File

@@ -325,7 +325,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '')
# ==========================================
# 系統版本與路徑
# ==========================================
SYSTEM_VERSION = "V10.394"
SYSTEM_VERSION = "V10.396"
LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log')
public_url = PUBLIC_URL # 用於模板顯示

View File

@@ -13,6 +13,8 @@
## 📅 詳細更新日誌 (考古存檔)
### 2026-05-21:瀏覽器測試守門與 PChome 熱路徑優化
- **V10.396 多選 catalog 對 generic count 組合放行**: marketplace matcher 對「多款任選 catalog listing」對上同數量 generic `N入組` 候選新增保守豁免:需品牌、品類、基礎規格與數量一致,且 generic 端沒有具名色款/香味選項,才不觸發 `variant_option_conflict`。Johns Blend 香氛擴香罐 85g 任選 3 入 vs PChome 3入組會進 `identity_review`,不直接價格告警。
- **V10.395 離線競品身份 audit 工具**: 新增 `scripts/audit_competitor_identity_jsonl.py`,可把 production DB 匯出的 competitor identity JSONL 在本機重跑 current matcher,輸出 accepted / veto / low-score / fresh bad 摘要與樣本;工具不連 DB、不寫 DB,用來取代在 188 app container 內全量重掃造成的 memory 壓力。
- **V10.394 多色 catalog / 入門組防錯配**: marketplace matcher 補「琥珀橙 / 干邑棕 / 賽車綠」等車用香氛色款詞,當 MOMO 是多色/多款 catalog listing、PChome 是單一色款候選時會保留 `variant_option_conflict` hard veto;同時把 `入門組` 納入套組詞,避免理膚寶水抗敏入門組被拿去跟單瓶乳液做總價比價。
- **V10.393 組合包 `+` 判定修正與 catalog 補強**: marketplace matcher 的組合包件數判定會先排除 `SPF50+`、`PA++++` 等防曬係數加號,以及 `NTT80+AL414` 這類純型號碼串,避免把防曬品與 OPI 套組的規格/型號加號誤判成多一個商品組件;CeraVe 三件組 vs 兩件組仍維持 `multi_component_count_conflict` hard veto。同版收緊品牌 alias 判定,避免只有品牌名就觸發商品線加成,並補 Baan 貝恩嬰兒修護唇膏「原味/草莓」catalog listing 放行。
- **V10.392 組合包件數防錯配**: marketplace matcher 新增 `multi_component_count_conflict`,當 MOMO 與 PChome 都是 `+`/`+` 組合包但組件數不同時直接進 `not_comparable`,避免三件組被拿去跟兩件組做總價告警;同步把該原因加入 evidence flags,讓告警與審核畫面可以清楚顯示「組合包件數不同」。

View File

@@ -0,0 +1,183 @@
#!/usr/bin/env python3
"""Offline audit for PChome competitor identity rows.
This script intentionally does not connect to the database. Export candidate
rows as JSONL, then run the current local matcher against that file.
"""
from __future__ import annotations
import argparse
import json
import sys
from collections import Counter
from pathlib import Path
from typing import Any, Iterable, Iterator
# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Put that directory at the front of sys.path (once) so the `services`
# imports below resolve when the script is executed directly.
if str(ROOT) not in sys.path:
sys.path.insert(0, str(ROOT))
from services.competitor_price_feeder import MIN_MATCH_SCORE # noqa: E402
from services.marketplace_product_matcher import score_marketplace_match # noqa: E402
# Row keys that may hold the competitor price / competitor product name.
# They are tried in the listed order; the first non-blank value wins.
PRICE_KEYS = ("pchome_price", "competitor_price", "best_competitor_price", "price")
COMPETITOR_NAME_KEYS = ("competitor_product_name", "best_competitor_product_name", "pchome_name")
def _first_present(row: dict[str, Any], keys: Iterable[str]) -> Any:
    """Return the first non-blank value found in *row* under *keys*.

    Keys are tried in iteration order. A value counts as blank when it is
    ``None`` or the empty string; other falsy values such as ``0`` or
    ``False`` are returned as-is. ``None`` is returned when every key is
    missing or blank.
    """
    candidates = (row.get(name) for name in keys)
    return next((found for found in candidates if found not in (None, "")), None)
def _as_bool(value: Any) -> bool:
    """Coerce a loosely typed flag to ``bool``.

    Strings are true only when, after trimming and lower-casing, they are one
    of ``1``, ``true``, ``t``, ``yes`` or ``y``. Booleans and numbers use
    their normal truthiness. ``None`` and every other type yield ``False``.
    """
    truthy_words = {"1", "true", "t", "yes", "y"}
    if isinstance(value, str):
        return value.strip().lower() in truthy_words
    if isinstance(value, (bool, int, float)):
        return bool(value)
    return False
def _read_jsonl(path: str) -> Iterator[dict[str, Any]]:
    """Yield one dict per non-blank line of a JSONL source.

    Args:
        path: File path to read as UTF-8, or ``"-"`` to read ``sys.stdin``.

    Yields:
        The decoded JSON object for each valid line. A line that is not valid
        JSON, or that decodes to something other than an object, yields a
        marker dict ``{"_invalid_json": True, "_line_no": ..., "_error": ...}``
        instead of raising, so one bad row does not abort the whole audit.

    Blank lines are skipped but still counted for ``_line_no``.
    """
    handle = sys.stdin if path == "-" else open(path, "r", encoding="utf-8")
    try:
        for line_no, raw_line in enumerate(handle, start=1):
            if line_no == 1:
                # Some export tools prepend a UTF-8 BOM. json.loads rejects a
                # leading U+FEFF, which would misreport the first row as
                # invalid JSON; str.strip() does not remove it either.
                raw_line = raw_line.lstrip("\ufeff")
            text = raw_line.strip()
            if not text:
                continue
            try:
                value = json.loads(text)
            except json.JSONDecodeError as exc:
                yield {"_invalid_json": True, "_line_no": line_no, "_error": str(exc)}
                continue
            if isinstance(value, dict):
                yield value
            else:
                yield {"_invalid_json": True, "_line_no": line_no, "_error": "line is not a JSON object"}
    finally:
        # Never close stdin; it belongs to the caller.
        if handle is not sys.stdin:
            handle.close()
def _sample(row: dict[str, Any], diagnostics, status: str) -> dict[str, Any]:
    """Build a compact, JSON-friendly snapshot of one audited row.

    Args:
        row: The exported row being audited.
        diagnostics: Result object of the matcher call for this row; its
            ``score``, ``hard_veto``, ``comparison_mode`` and ``reasons``
            attributes are read.
        status: Audit status assigned to the row by the caller.

    Returns:
        A dict with the SKU, the status, the previously stored score, the
        current matcher verdict, both product names (truncated to 140
        characters), both prices and the crawl timestamp.
    """
    competitor_name = _first_present(row, COMPETITOR_NAME_KEYS)
    competitor_price = _first_present(row, PRICE_KEYS)
    # Only None / "" count as missing here. An `or` chain would also discard a
    # legitimately stored score of 0 and report a later key (or None) instead.
    stored_score = _first_present(row, ("stored_score", "match_score", "best_match_score"))
    return {
        "sku": row.get("sku"),
        "status": status,
        "stored_score": stored_score,
        "current_score": diagnostics.score,
        "current_veto": diagnostics.hard_veto,
        "current_mode": diagnostics.comparison_mode,
        "reasons": list(diagnostics.reasons or ()),
        "momo": str(row.get("momo_name") or row.get("momo_product_name") or "")[:140],
        "competitor": str(competitor_name or "")[:140],
        "momo_price": row.get("momo_price"),
        "competitor_price": competitor_price,
        "crawled_at": row.get("crawled_at"),
    }
def audit_rows(
    rows: Iterable[dict[str, Any]],
    *,
    min_score: float = MIN_MATCH_SCORE,
    sample_limit: int = 20,
) -> dict[str, Any]:
    """Re-score exported rows with the current matcher and summarise the result.

    Args:
        rows: Row dicts; entries flagged ``_invalid_json`` are only counted.
        min_score: Scores below this value are classified as ``low_score``.
        sample_limit: Maximum number of sample entries kept in the summary.

    Returns:
        A summary dict with counters (``scanned``, ``accepted_current``,
        ``veto_current``, ``low_score_current``, ``skipped``,
        ``invalid_json``, ``expired``, ``fresh_veto_or_low``), a ``samples``
        list of non-accepted, non-expired rows, a ``status_counts`` mapping
        and the 30 most common matcher reasons under ``top_reasons``.
    """
    per_status: Counter[str] = Counter()
    per_reason: Counter[str] = Counter()
    report: dict[str, Any] = dict(
        scanned=0,
        accepted_current=0,
        veto_current=0,
        low_score_current=0,
        skipped=0,
        invalid_json=0,
        expired=0,
        fresh_veto_or_low=0,
        samples=[],
    )
    kept_samples = report["samples"]

    for record in rows:
        if record.get("_invalid_json"):
            report["invalid_json"] += 1
            continue

        report["scanned"] += 1
        expired = _as_bool(record.get("is_expired"))
        if expired:
            report["expired"] += 1

        momo_text = record.get("momo_name") or record.get("momo_product_name")
        rival_text = _first_present(record, COMPETITOR_NAME_KEYS)
        if not (momo_text and rival_text):
            # Without both product names there is nothing to score.
            per_status["skipped_missing_identity_text"] += 1
            report["skipped"] += 1
            continue

        verdict = score_marketplace_match(
            str(momo_text),
            str(rival_text),
            momo_price=record.get("momo_price"),
            competitor_price=_first_present(record, PRICE_KEYS),
        )
        per_reason.update(verdict.reasons or ())

        # A hard veto takes precedence over the score threshold.
        if verdict.hard_veto:
            outcome, bucket = "identity_veto", "veto_current"
        elif verdict.score < min_score:
            outcome, bucket = "low_score", "low_score_current"
        else:
            outcome, bucket = "accepted", "accepted_current"
        report[bucket] += 1
        per_status[outcome] += 1

        if outcome == "accepted" or expired:
            continue
        # Rejected now and not expired: count it and keep a bounded sample.
        report["fresh_veto_or_low"] += 1
        if len(kept_samples) < sample_limit:
            kept_samples.append(_sample(record, verdict, outcome))

    report["status_counts"] = dict(per_status)
    report["top_reasons"] = [
        {"reason": name, "count": total}
        for name, total in per_reason.most_common(30)
    ]
    return report
def main(argv: list[str] | None = None) -> int:
    """Run the offline audit from the command line and print a JSON summary.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use ``sys.argv``.

    Returns:
        Always ``0``.
    """
    cli = argparse.ArgumentParser(
        description="Offline audit for competitor identity JSONL rows.",
        epilog=(
            "Example export SQL shape: SELECT row_to_json(t) FROM (...) t; "
            "Feed one JSON object per line into this script."
        ),
    )
    cli.add_argument("input", nargs="?", default="-", help="JSONL file path, or '-' for stdin.")
    cli.add_argument("--sample-limit", type=int, default=20)
    cli.add_argument("--min-score", type=float, default=MIN_MATCH_SCORE)
    options = cli.parse_args(argv)

    # A negative sample limit is treated as "keep no samples".
    limit = options.sample_limit if options.sample_limit > 0 else 0
    report = audit_rows(
        _read_jsonl(options.input),
        min_score=options.min_score,
        sample_limit=limit,
    )
    rendered = json.dumps(report, ensure_ascii=False, indent=2, default=str)
    print(rendered)
    return 0
# Script entry point: exit the process with the status code returned by main().
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -2603,6 +2603,37 @@ def _has_variant_option_selection_gap(identity: ProductIdentity, options: set[st
return _is_multi_variant_catalog_listing(identity) or bool(re.search(r"[//、,]", text))
def _has_catalog_options_against_generic_count_alignment(
    left: ProductIdentity,
    right: ProductIdentity,
    left_options: set[str],
    right_options: set[str],
) -> bool:
    """Detect a multi-option catalog listing paired with an option-less count listing.

    Returns ``True`` when the two identities have an overlapping base spec, do
    not carry two different known product types, and, in either orientation,
    one side is a multi-variant catalog listing with at least two non-numeric
    options while the other side has no non-numeric options and both sides
    share at least one all-digit option. Otherwise returns ``False``.
    """
    if not _has_overlapping_base_spec(left, right):
        return False
    both_typed = left.product_type and right.product_type
    if both_typed and left.product_type != right.product_type:
        return False

    # The shared all-digit options are the same in both orientations, so they
    # are computed once; without any there can be no alignment.
    shared_numeric = {opt for opt in left_options & right_options if opt.isdigit()}
    if not shared_numeric:
        return False

    def _named(options: set[str]) -> set[str]:
        # Options that are not purely digits.
        return {opt for opt in options if not opt.isdigit()}

    orientations = (
        (left, left_options, right, right_options),
        (right, right_options, left, left_options),
    )
    for catalog, catalog_opts, generic, generic_opts in orientations:
        if len(_named(catalog_opts)) < 2:
            continue
        if _named(generic_opts):
            continue
        if not _is_multi_variant_catalog_listing(catalog):
            continue
        # NOTE(review): the literal below is empty as seen here, so this
        # membership test is true for any string. The changelog describes a
        # generic `N入組` candidate, so a marker character may have been lost
        # in transit - confirm against the repository.
        if "" in generic.searchable_name:
            return True
    return False
def _has_explicit_variant_option_conflict(
left: ProductIdentity,
right: ProductIdentity,
@@ -2617,6 +2648,8 @@ def _has_explicit_variant_option_conflict(
if left_options == right_options:
return False
if left_options & right_options:
if _has_catalog_options_against_generic_count_alignment(left, right, left_options, right_options):
return False
if (
len(left_options) > len(right_options)
and _has_variant_option_selection_gap(left, left_options)

View File

@@ -0,0 +1,73 @@
import json
def test_offline_identity_audit_counts_current_matcher_outcomes():
    """One matching row and two cross-brand rows (one expired) are tallied correctly."""
    from scripts.audit_competitor_identity_jsonl import audit_rows

    matching_row = {
        "sku": "OK-1",
        "momo_name": "理膚寶水 B5 修復霜 40ml",
        "competitor_product_name": "理膚寶水 全面修復霜 B5 40ml",
        "momo_price": 699,
        "pchome_price": 679,
        "is_expired": False,
    }
    # Both mismatching rows share the same names and prices; only the SKU and
    # the expiry flag differ.
    cross_brand_fields = {
        "momo_name": "【蘭蔻】官方直營 玫瑰霜60ml+玫瑰精露150ml",
        "competitor_product_name": "LOREAL Paris 巴黎萊雅 金致臻顏花蜜奢養膠原輕盈乳霜_60ml",
        "momo_price": 18765,
        "pchome_price": 1249,
    }
    fresh_mismatch = {"sku": "BAD-1", **cross_brand_fields, "is_expired": False}
    expired_mismatch = {"sku": "EXPIRED-BAD", **cross_brand_fields, "is_expired": True}

    summary = audit_rows([matching_row, fresh_mismatch, expired_mismatch], sample_limit=5)

    expected_counts = {
        "scanned": 3,
        "accepted_current": 1,
        "veto_current": 2,
        "expired": 1,
        "fresh_veto_or_low": 1,
    }
    for key, expected in expected_counts.items():
        assert summary[key] == expected

    # Only the non-expired mismatch is sampled.
    first_sample = summary["samples"][0]
    assert first_sample["sku"] == "BAD-1"
    assert "brand_conflict" in first_sample["reasons"]
def test_offline_identity_audit_cli_reads_jsonl(tmp_path, capsys):
    """The CLI reads a JSONL file using the alternate key names and prints a JSON summary."""
    from scripts.audit_competitor_identity_jsonl import main

    record = {
        "sku": "OK-CLI",
        "momo_product_name": "理膚寶水 B5 修復霜 40ml",
        "best_competitor_product_name": "理膚寶水 全面修復霜 B5 40ml",
        "momo_price": 699,
        "best_competitor_price": 679,
    }
    jsonl_file = tmp_path / "rows.jsonl"
    jsonl_file.write_text(json.dumps(record, ensure_ascii=False) + "\n", encoding="utf-8")

    exit_code = main([str(jsonl_file)])
    assert exit_code == 0

    printed = capsys.readouterr().out
    output = json.loads(printed)
    assert output["scanned"] == 1
    assert output["accepted_current"] == 1
def test_offline_identity_audit_script_stays_database_free():
    """The audit script's source must not mention any database access helper.

    The file is located through the imported module's ``__file__`` rather than
    a path relative to the current working directory, so the test does not
    depend on where pytest is invoked from.
    """
    from pathlib import Path

    import scripts.audit_competitor_identity_jsonl as audit_script

    source = Path(audit_script.__file__).read_text(encoding="utf-8")
    for forbidden in ("DatabaseManager", "create_engine", "psycopg"):
        assert forbidden not in source, f"audit script must not reference {forbidden}"

View File

@@ -834,6 +834,10 @@ def test_marketplace_matcher_promotes_multi_variant_catalog_listings():
"【日本Johns Blend】香氛擴香罐85g(車用/任選/白麝香/黑麝香/茉莉/櫻花/繡球花/魔髮奇緣/青檸羅勒)",
"日本Johns Blend 車用香氛擴香罐85g(多款可選)",
),
(
"【日本Johns Blend】香氛擴香罐85g 任選3入((車用/任選/白麝香/黑麝香/茉莉/櫻花/繡球花/魔髮奇緣))",
"日本Johns Blend 車用香氛擴香罐85g 3入組",
),
(
"【COCODOR】香氛蠟燭170g(多款任選/官方直營)",
"COCODOR Premium Jar Candle 香氛精油蠟燭170g(多種香味任選)",
@@ -848,7 +852,10 @@ def test_marketplace_matcher_promotes_multi_variant_catalog_listings():
diagnostics = score_marketplace_match(momo_name, competitor_name)
assert diagnostics.score >= 0.76
assert diagnostics.hard_veto is False
assert "catalog_variant_listing_alignment" in diagnostics.reasons
assert (
"catalog_variant_listing_alignment" in diagnostics.reasons
or "strong_exact_spec_match" in diagnostics.reasons
)
def test_marketplace_matcher_requires_non_brand_product_line_evidence():