收緊 PChome 同款比對門檻
All checks were successful
CD Pipeline / deploy (push) Successful in 1m19s

This commit is contained in:
OoO
2026-05-19 15:53:09 +08:00
parent 6c548e2251
commit 75390f8495
19 changed files with 94 additions and 31 deletions

View File

@@ -4,7 +4,7 @@
================================================================================
【已完成】
- V10.263 強化核心 MOMO/PChome 比價鏈路:新增 `marketplace_product_matcher.py` 身份比對、只讓 `identity_v2` 高信心配對進 Dashboard/AI/Excel/Daily/Growth/PPT並建立 `competitor_intel_repository.py` 統一圖表與簡報資料出口。
- V10.265 強化核心 MOMO/PChome 比價鏈路:新增 `marketplace_product_matcher.py` 身份比對、只讓 `identity_v2` 且分數 ≥ 0.76 的高信心配對進 Dashboard/AI/Excel/Daily/Growth/PPT,並建立 `competitor_intel_repository.py` 統一圖表與簡報資料出口;同品牌但不同型號/不同組數會進待審,不進正式比價。
- V10.254 續補 `/growth_analysis` 快取命中效能:PostgreSQL source fingerprint 加 60 秒短 TTL;匯入 realtime_sales_monthly 後同步清除 growth shared cache 與短快取,避免快取命中仍頻繁掃大表 COUNT。
- V10.253 修正 Elephant Alpha L3 HITL 空告警:價格類與資源調配低信心事件若沒有 Hermes/實證資料,只記 suppressed telemetry 與 cooldown,不寫 pending human_review、不發 Telegram;`resource_optimization` 會保留 queue/load 原始指標供追查。
- V10.251 修正 OpenClaw Q&A 備援遙測:Ollama 主路徑仍為 GCP-A → GCP-B → 111;Gemini 只記為 `openclaw_qa_gemini_fallback`,NIM 只記為 `openclaw_qa_nim`;AI Calls 會把 legacy `openclaw_qa + gemini` 標成 Gemini 備援,避免再次誤判 Gemini-first。

View File

@@ -320,7 +320,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '')
# ==========================================
# 系統版本與路徑
# ==========================================
SYSTEM_VERSION = "V10.264"
SYSTEM_VERSION = "V10.265"
LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log')
public_url = PUBLIC_URL # 用於模板顯示

View File

@@ -188,7 +188,7 @@ SKU/商品ID = find_col(['商品ID', 'Product ID', 'ID', 'i_code', 'Item Code'
| `discount_pct` | INTEGER | 折扣 %(NULL=未折扣) |
| `competitor_product_id` | VARCHAR(100) | PChome 商品 ID |
| `competitor_product_name` | TEXT | PChome 商品名稱(核對用) |
| `match_score` | NUMERIC(4,3) | 商品身份比對分數0~1< 0.62 不寫入正式快取 |
| `match_score` | NUMERIC(4,3) | 商品身份比對分數(0~1);< 0.76 不寫入正式快取 |
| `tags` | JSONB | 語意標籤,如 `["on_sale","discount_20pct"]` |
| `crawled_at` | TIMESTAMP | 爬取時間 |
| `expires_at` | TIMESTAMP | TTL = crawled_at + 6h;過期後 Hermes 忽略 |
@@ -286,7 +286,7 @@ LIMIT 300
[HermesAnalystService.fetch_candidates()] ←← AI Pipeline 消費端
↓ LEFT JOIN competitor_prices零網路等待
↓ 有效期內expires_at > NOW()+ match_score ≥ 0.62 + tags 含 identity_v2 才 JOIN
↓ 有效期內(expires_at > NOW())+ match_score ≥ 0.76 + tags 含 identity_v2 才 JOIN
↓ pchome_price + competitor_tags 一起傳給 Hermes
```
@@ -297,7 +297,7 @@ LIMIT 300
| 解耦方式 | DB 表快取(非 Redis) | PostgreSQL 已是核心,無需額外依賴;支援 JOIN |
| TTL | 6 小時 | 與 AI Pipeline 排程週期對齊 |
| 比對算法 | 品牌 + 核心 token + 容量/重量/包數 + 品類 + 價格 sanity check | 由 `marketplace_product_matcher.py` 統一供 feeder、legacy crawler、AI/PPT 鏈路使用 |
| 最低比對門檻 | 0.62 | 核心比價寧可待審,不允許低信心錯配影響 AI 決策 |
| 最低比對門檻 | 0.76 | 核心比價寧可待審,不允許低信心錯配影響 AI 決策 |
| 已有不同 PChome 商品覆蓋門檻 | 0.84 | 新候選與既有正式配對不同時,除非超高信心,否則寫入 `needs_review` attempt 不覆蓋 |
| 語意標籤 | JSONB 陣列 | 傳給 Hermes 提升情境感知品質 |
@@ -309,7 +309,7 @@ MOMO 商品名稱
→ PChomeCrawler.search_products(keyword, limit=12)
→ marketplace_product_matcher.score_marketplace_match()
→ 品牌衝突 / 容量衝突 / 包數衝突 hard veto
→ 同款高信心 score ≥ 0.62 才進 competitor_prices
→ 同款高信心 score ≥ 0.76 才進 competitor_prices
→ 低信心、規格衝突、既有配對衝突寫入 competitor_match_attempts
```
@@ -320,7 +320,7 @@ LEFT JOIN competitor_prices cp
ON cp.sku = lmp.sku
AND cp.source = 'pchome'
AND cp.expires_at > NOW()
AND cp.match_score >= 0.62
AND cp.match_score >= 0.76
AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
```
→ 無競品資料的商品仍回傳,`pchome_price=NULL`,`_batch_analyze` 自動跳過

View File

@@ -1518,7 +1518,7 @@ def api_icaim_dashboard():
ON cp.sku = lm.sku
AND cp.source = 'pchome'
AND cp.expires_at > NOW()
AND cp.match_score >= 0.62
AND cp.match_score >= 0.76
AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
WHERE lm.rn = 1
AND (lm.momo_price - cp.price) / cp.price * 100 > 15
@@ -1528,7 +1528,7 @@ def api_icaim_dashboard():
(SELECT COUNT(*) FROM competitor_prices
WHERE expires_at > NOW()
AND source = 'pchome'
AND COALESCE(match_score, 0) >= 0.62
AND COALESCE(match_score, 0) >= 0.76
AND COALESCE(tags, '[]'::jsonb) ? 'identity_v2') AS valid_competitor_prices,
(SELECT COUNT(*) FROM high_risk) AS high_risk_count,
(SELECT COUNT(*) FROM ai_price_recommendations) AS total_ai_recs,
@@ -1565,7 +1565,7 @@ def api_icaim_dashboard():
ON cp.sku = lp.sku
AND cp.source = 'pchome'
AND cp.expires_at > NOW()
AND cp.match_score >= 0.62
AND cp.match_score >= 0.76
AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
WHERE lp.rn = 1
ORDER BY gap_pct DESC NULLS LAST

View File

@@ -334,7 +334,7 @@ def _build_price_history_payload(session, product):
WHERE sku = :sku
AND source = 'pchome'
AND crawled_at >= :start_date
AND COALESCE(match_score, 0) >= 0.62
AND COALESCE(match_score, 0) >= 0.76
AND COALESCE(tags, '[]'::jsonb) ? 'identity_v2'
ORDER BY crawled_at
"""), {

View File

@@ -40,7 +40,7 @@ sys_log = SystemLogger("DashboardRoutes").get_logger()
dashboard_bp = Blueprint('dashboard', __name__)
PRODUCT_PICK_LIST_LIMIT = 50
PCHOME_MATCH_SCORE_FLOOR = 0.62
PCHOME_MATCH_SCORE_FLOOR = 0.76
def _build_pchome_product_url(product_id):

View File

@@ -137,7 +137,7 @@ def export_excel_ai_picks():
AND (cp.expires_at IS NULL OR cp.expires_at > CURRENT_TIMESTAMP)
AND cp.price IS NOT NULL
AND cp.price > 0
AND COALESCE(cp.match_score, 0) >= 0.62
AND COALESCE(cp.match_score, 0) >= 0.76
AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
ORDER BY cp.sku, cp.crawled_at DESC NULLS LAST
)

View File

@@ -154,7 +154,7 @@ def _fetch_candidates_without_sales(conn, limit: int) -> List[Dict[str, Any]]:
ON cp.sku = lm.sku
AND cp.source = 'pchome'
AND (cp.expires_at IS NULL OR cp.expires_at > CURRENT_TIMESTAMP)
AND cp.match_score >= 0.62
AND cp.match_score >= 0.76
{identity_condition}
WHERE lm.rn = 1
ORDER BY cp.match_score DESC, cp.crawled_at DESC
@@ -249,7 +249,7 @@ def _fetch_candidates(conn, limit: int) -> List[Dict[str, Any]]:
FROM competitor_price_history
WHERE source = 'pchome'
AND crawled_at >= CURRENT_TIMESTAMP - INTERVAL '30 days'
AND COALESCE(match_score, 0) >= 0.62
AND COALESCE(match_score, 0) >= 0.76
AND COALESCE(tags, '[]'::jsonb) ? 'identity_v2'
GROUP BY sku, source
)
@@ -277,7 +277,7 @@ def _fetch_candidates(conn, limit: int) -> List[Dict[str, Any]]:
ON cp.sku = lm.sku
AND cp.source = 'pchome'
AND (cp.expires_at IS NULL OR cp.expires_at > CURRENT_TIMESTAMP)
AND cp.match_score >= 0.62
AND cp.match_score >= 0.76
{identity_condition}
LEFT JOIN history_stats hs
ON hs.sku = lm.sku
@@ -515,7 +515,7 @@ def generate_product_pick_list(engine, limit: int = 50) -> ProductPickResult:
scored = [_score_candidate(row) for row in rows if _to_float(row.get("pchome_price")) > 0]
picks = [
pick for pick in scored
if pick["pick_score"] >= 45 and (_to_float(pick.get("match_score")) >= 0.62)
if pick["pick_score"] >= 45 and (_to_float(pick.get("match_score")) >= 0.76)
]
picks.sort(key=lambda item: item["pick_score"], reverse=True)
picks = picks[:limit]

View File

@@ -107,7 +107,7 @@ def _fetch_price_history(sku: str, days: int = 30) -> Dict[str, Any]:
WHERE sku = :sku
AND source = 'pchome'
AND crawled_at >= NOW() - INTERVAL ':days days'
AND COALESCE(match_score, 0) >= 0.62
AND COALESCE(match_score, 0) >= 0.76
AND COALESCE(tags, '[]'::jsonb) ? 'identity_v2'
GROUP BY dt ORDER BY dt
""".replace(":days", str(days))), {"sku": sku}).fetchall()
@@ -443,7 +443,7 @@ def price_history_heatmap(days: int = 30) -> Optional[bytes]:
) pr ON pr.product_id = p.id
WHERE cp.crawled_at >= NOW() - INTERVAL '{days} days'
AND p.category IS NOT NULL
AND COALESCE(cp.match_score, 0) >= 0.62
AND COALESCE(cp.match_score, 0) >= 0.76
AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
GROUP BY p.category, dt
ORDER BY p.category, dt

View File

@@ -16,7 +16,7 @@ from typing import Any, Optional, Union
from sqlalchemy import inspect, text
PCHOME_MATCH_SCORE_FLOOR = 0.62
PCHOME_MATCH_SCORE_FLOOR = 0.76
def _num(value: Any) -> float:

View File

@@ -34,7 +34,7 @@ from typing import Optional
logger = logging.getLogger(__name__)
# ── 比對參數 ─────────────────────────────────────────
MIN_MATCH_SCORE = 0.62 # 低於此分數不寫入;核心比價寧可待審也不能錯配
MIN_MATCH_SCORE = 0.76 # 低於此分數不寫入;核心比價寧可待審也不能錯配
REPLACE_DIFFERENT_PRODUCT_SCORE = 0.84 # 已有不同 PChome 商品時,需超高信心才覆蓋
SEARCH_LIMIT = 12 # 每個搜尋詞取 PChome 前 N 筆
MAX_SEARCH_TERMS = 3 # 每個 MOMO 商品最多嘗試幾組搜尋詞

View File

@@ -369,7 +369,7 @@ class ElephantAlphaAutonomousEngine:
) pr ON pr.product_id = p.id
JOIN competitor_prices cp ON cp.sku = p.i_code
WHERE cp.expires_at > NOW()
AND COALESCE(cp.match_score, 0) >= 0.62
AND COALESCE(cp.match_score, 0) >= 0.76
AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
AND cp.price < pr.price * 0.85
AND cp.crawled_at >= NOW() - INTERVAL '2 hours'
@@ -394,7 +394,7 @@ class ElephantAlphaAutonomousEngine:
) pr ON pr.product_id = p.id
JOIN competitor_prices cp ON cp.sku = p.i_code
WHERE cp.expires_at > NOW()
AND COALESCE(cp.match_score, 0) >= 0.62
AND COALESCE(cp.match_score, 0) >= 0.76
AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
AND cp.price > pr.price * 1.05
AND cp.crawled_at >= NOW() - INTERVAL '1 hour'

View File

@@ -396,7 +396,7 @@ class HermesAnalystService:
ON cp.sku = lmp.sku
AND cp.source = 'pchome'
AND cp.expires_at > NOW()
AND cp.match_score >= 0.62
AND cp.match_score >= 0.76
AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
WHERE lmp.rn = 1
AND rs.sales_7d_prev > 0

View File

@@ -390,6 +390,8 @@ def _spec_component(left_values: Iterable[float], right_values: Iterable[float])
def _count_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, bool]:
left_counts = [count for count, _unit in left.counts]
right_counts = [count for count, _unit in right.counts]
if left.total_piece_count and right.total_piece_count:
if left.total_piece_count == right.total_piece_count:
return 1.0, False
@@ -398,7 +400,13 @@ def _count_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float,
if left.counts and right.counts:
if set(left.counts) & set(right.counts):
return 0.85, False
if left_counts and right_counts:
ratio = max(max(left_counts), max(right_counts)) / max(min(max(left_counts), max(right_counts)), 1)
if ratio >= 1.5:
return 0.0, True
return 0.35, False
if (left_counts and max(left_counts) > 1) or (right_counts and max(right_counts) > 1):
return 0.0, True
return 0.5, False
@@ -428,6 +436,48 @@ def _spec_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, b
return score, bool(conflicts), tuple(conflicts)
def _has_bundle_offer(identity: ProductIdentity) -> bool:
    """Report whether the product name advertises a bundle-style offer.

    Looks at ``identity.normalized_name`` for any of:

    * a digit "N 送 M" pattern (e.g. ``3送1``),
    * the same pattern written with Chinese numerals (e.g. ``二送一``),
    * the literal ``買一送一``,
    * the literal ``囤貨組``.

    Returns:
        True when at least one marker is present, otherwise False.
    """
    name = identity.normalized_name

    # Digit form. The leading ``\s*`` cannot change a re.search result; it is
    # kept so the pattern stays identical to the established one.
    if re.search(r"\s*\d+\s*送\s*\d+", name):
        return True

    # Chinese-numeral form of the same "N 送 M" offer.
    if re.search(r"\s*[一二兩雙三四五六七八九十]\s*送\s*[一二兩雙三四五六七八九十]", name):
        return True

    # Fixed marketing phrases.
    return "買一送一" in name or "囤貨組" in name
def _has_multi_component(identity: ProductIdentity) -> bool:
    """Report whether the product name describes a multi-item combination.

    Looks at ``identity.normalized_name`` for any of:

    * an ASCII plus sign ``+``,
    * a full-width plus sign ``+``,
    * a "<number><unit> x <number>" pack expression where the unit is one of
      ``ml``, ``g``, ``毫升`` or ``公克`` (case-insensitive).

    Returns:
        True when at least one marker is present, otherwise False.
    """
    name = identity.normalized_name

    # The second literal must be a non-empty marker: `"" in name` is always
    # True, which would make every product look multi-component and silently
    # disable any comparison between two products' results.
    if "+" in name or "+" in name:
        return True

    return bool(re.search(r"\d+\s*(?:ml|g|毫升|公克)\s*x\s*\d+", name, re.I))
def _chinese_bigram_score(left: ProductIdentity, right: ProductIdentity) -> float:
    """Score how similar two product names are by their CJK character bigrams.

    Each name is reduced to a set of two-character windows (see ``signature``)
    and the two sets are compared with the Dice coefficient
    ``2 * |A ∩ B| / (|A| + |B|)``.

    Returns:
        A value in [0, 1]; the fixed value 0.55 is returned when either name
        yields no bigrams at all.
    """
    # Marketing / channel words that carry no product identity.
    # NOTE(review): the eight empty strings are no-op replacements; they look
    # like entries whose characters were lost — confirm the intended phrases.
    noise_phrases = (
        "官方", "直營", "公司貨", "專櫃", "正貨", "原廠", "限定", "獨家",
        "期間", "超值", "特惠", "優惠", "新品", "經典", "人氣", "熱銷",
        "必買", "推薦", "任選", "禮盒", "母親節", "超品日", "多款",
        "", "", "", "", "", "", "", "",
    )

    def signature(identity: ProductIdentity) -> set[str]:
        """Return the set of adjacent CJK character pairs left after cleanup."""
        cleaned = identity.searchable_name
        # Strip brand tokens, longest first, so a short token cannot break up
        # a longer one that contains it.
        brands_longest_first = sorted(identity.brand_tokens, key=len, reverse=True)
        for brand in brands_longest_first:
            cleaned = cleaned.replace(brand, " ")
        cleaned = re.sub(r"[a-z0-9]+", " ", cleaned)
        # Keep only characters in U+4E00..U+9FFF; spaces inserted above vanish
        # here, so characters on either side of a removed token become adjacent.
        cleaned = "".join(ch for ch in cleaned if "\u4e00" <= ch <= "\u9fff")
        for noise in noise_phrases:
            cleaned = cleaned.replace(noise, "")
        return {first + second for first, second in zip(cleaned, cleaned[1:])}

    left_grams = signature(left)
    right_grams = signature(right)
    if not (left_grams and right_grams):
        return 0.55

    shared = len(left_grams & right_grams)
    return 2 * shared / (len(left_grams) + len(right_grams))
def score_marketplace_match(
momo_name: str,
competitor_name: str,
@@ -441,6 +491,7 @@ def score_marketplace_match(
token_score = _weighted_token_score(left, right)
spec_score, spec_conflict, spec_reasons = _spec_score(left, right)
sequence_score = SequenceMatcher(None, left.searchable_name, right.searchable_name).ratio()
chinese_name_score = _chinese_bigram_score(left, right)
if left.product_type and right.product_type:
type_score = 1.0 if left.product_type == right.product_type else 0.0
else:
@@ -452,8 +503,20 @@ def score_marketplace_match(
reasons.extend(spec_reasons)
if left.product_type and right.product_type and left.product_type != right.product_type:
reasons.append("type_conflict")
if _has_bundle_offer(left) != _has_bundle_offer(right):
reasons.append("bundle_offer_conflict")
if _has_multi_component(left) != _has_multi_component(right):
reasons.append("multi_component_conflict")
if chinese_name_score < 0.16:
reasons.append("product_line_conflict")
hard_veto = brand_conflict or spec_conflict
if _has_bundle_offer(left) != _has_bundle_offer(right):
hard_veto = True
if _has_multi_component(left) != _has_multi_component(right):
hard_veto = True
if chinese_name_score < 0.16 and token_score < 0.72:
hard_veto = True
if left.product_type and right.product_type and left.product_type != right.product_type and token_score < 0.55:
hard_veto = True

View File

@@ -555,7 +555,7 @@ def _fetch_competitor_summary() -> Dict[str, Any]:
FROM price_records ORDER BY product_id, timestamp DESC
) pr ON pr.product_id = p.id
WHERE cp.expires_at > NOW()
AND COALESCE(cp.match_score, 0) >= 0.62
AND COALESCE(cp.match_score, 0) >= 0.76
AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2'
""")).fetchone()
if row and row[0]:

View File

@@ -505,7 +505,7 @@ def find_best_match(keyword: str, momo_price: float) -> Optional[dict]:
best = result
best_score = diagnostics.score
best_diagnostics = diagnostics
if not best or best_score < 0.62:
if not best or best_score < 0.76:
return None
best['match_score'] = best_score
best['match_reasons'] = list(getattr(best_diagnostics, 'reasons', ()) or ())

View File

@@ -140,7 +140,7 @@
{% set coverage = chart_data.competitor_coverage | default({}) %}
<div class="ga-competitor-quality">
<span>高信心門檻</span>
<strong class="momo-mono">{{ coverage.match_score_floor | default(0.62) }}</strong>
<strong class="momo-mono">{{ coverage.match_score_floor | default(0.76) }}</strong>
<span>有效配對</span>
<strong class="momo-mono">{{ coverage.valid_matches | default(0) | number_format }}</strong>
<span>覆蓋率</span>

View File

@@ -361,7 +361,7 @@ def test_ai_product_pick_agent_uses_real_competitor_data_and_dashboard_action():
route_source = (ROOT / "routes/ai_routes.py").read_text(encoding="utf-8")
template = (ROOT / "templates/ai_intelligence.html").read_text(encoding="utf-8")
assert "MIN_MATCH_SCORE = 0.62" in feeder_source
assert "MIN_MATCH_SCORE = 0.76" in feeder_source
assert "REPLACE_DIFFERENT_PRODUCT_SCORE" in feeder_source
assert "marketplace_product_matcher" in feeder_source
assert "MAX_SEARCH_TERMS" in feeder_source

View File

@@ -27,7 +27,7 @@ def test_marketplace_matcher_rejects_brand_conflict_even_when_volume_matches():
competitor_price=1249,
)
assert diagnostics.score < 0.62
assert diagnostics.score < 0.76
assert diagnostics.hard_veto is True
assert "brand_conflict" in diagnostics.reasons
@@ -42,7 +42,7 @@ def test_marketplace_matcher_rejects_volume_conflict_for_same_brand():
competitor_price=3549,
)
assert diagnostics.score < 0.62
assert diagnostics.score < 0.76
assert diagnostics.hard_veto is True
assert "volume_conflict" in diagnostics.reasons
@@ -57,7 +57,7 @@ def test_marketplace_matcher_handles_bundle_piece_count():
competitor_price=4590,
)
assert diagnostics.score >= 0.62
assert diagnostics.score >= 0.76
assert diagnostics.hard_veto is False