diff --git a/TODO_NEXT_STEPS.txt b/TODO_NEXT_STEPS.txt index 77e6bc8..41c7716 100644 --- a/TODO_NEXT_STEPS.txt +++ b/TODO_NEXT_STEPS.txt @@ -4,7 +4,7 @@ ================================================================================ 【已完成】 - - V10.263 強化核心 MOMO/PChome 比價鏈路:新增 `marketplace_product_matcher.py` 身份比對、只讓 `identity_v2` 高信心配對進 Dashboard/AI/Excel/Daily/Growth/PPT,並建立 `competitor_intel_repository.py` 統一圖表與簡報資料出口。 + - V10.265 強化核心 MOMO/PChome 比價鏈路:新增 `marketplace_product_matcher.py` 身份比對、只讓 `identity_v2` 且分數 ≥ 0.76 的高信心配對進 Dashboard/AI/Excel/Daily/Growth/PPT,並建立 `competitor_intel_repository.py` 統一圖表與簡報資料出口;同品牌但不同型號/不同組數會進待審,不進正式比價。 - V10.254 續補 `/growth_analysis` 快取命中效能:PostgreSQL source fingerprint 加 60 秒短 TTL,匯入 realtime_sales_monthly 後同步清除 growth shared cache 與短快取,避免快取命中仍頻繁掃大表 COUNT。 - V10.253 修正 Elephant Alpha L3 HITL 空告警:價格類與資源調配低信心事件若沒有 Hermes/實證資料,只記 suppressed telemetry 與 cooldown,不寫 pending human_review、不發 Telegram;`resource_optimization` 會保留 queue/load 原始指標供追查。 - V10.251 修正 OpenClaw Q&A 備援遙測:Ollama 主路徑仍為 GCP-A → GCP-B → 111,Gemini 只記為 `openclaw_qa_gemini_fallback`,NIM 只記為 `openclaw_qa_nim`;AI Calls 會把 legacy `openclaw_qa + gemini` 標成 Gemini 備援,避免再次誤判 Gemini-first。 diff --git a/config.py b/config.py index 24d834c..e20bc46 100644 --- a/config.py +++ b/config.py @@ -320,7 +320,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '') # ========================================== # 系統版本與路徑 # ========================================== -SYSTEM_VERSION = "V10.264" +SYSTEM_VERSION = "V10.265" LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log') public_url = PUBLIC_URL # 用於模板顯示 diff --git a/docs/AI_INTELLIGENCE_MODULE_SOT.md b/docs/AI_INTELLIGENCE_MODULE_SOT.md index bbb09f8..1ded158 100644 --- a/docs/AI_INTELLIGENCE_MODULE_SOT.md +++ b/docs/AI_INTELLIGENCE_MODULE_SOT.md @@ -188,7 +188,7 @@ SKU/商品ID = find_col(['商品ID', 'Product ID', 'ID', 'i_code', 'Item Code' | `discount_pct` | INTEGER | 折扣 %(NULL=未折扣) | | `competitor_product_id` | VARCHAR(100) | PChome 商品 ID | | `competitor_product_name` | TEXT | PChome 商品名稱(核對用) | -| `match_score` | NUMERIC(4,3) | 商品身份比對分數(0~1),< 0.62 不寫入正式快取 | +| `match_score` | NUMERIC(4,3) | 商品身份比對分數(0~1),< 0.76 不寫入正式快取 | | `tags` | JSONB | 語意標籤,如 `["on_sale","discount_20pct"]` | | `crawled_at` | TIMESTAMP | 爬取時間 | | `expires_at` | TIMESTAMP | TTL = crawled_at + 6h,過期後 Hermes 忽略 | @@ -286,7 +286,7 @@ LIMIT 300 ↓ [HermesAnalystService.fetch_candidates()] ←← AI Pipeline 消費端 ↓ LEFT JOIN competitor_prices(零網路等待) - ↓ 有效期內(expires_at > NOW())+ match_score ≥ 0.62 + tags 含 identity_v2 才 JOIN + ↓ 有效期內(expires_at > NOW())+ match_score ≥ 0.76 + tags 含 identity_v2 才 JOIN ↓ pchome_price + competitor_tags 一起傳給 Hermes ``` @@ -297,7 +297,7 @@ LIMIT 300 | 解耦方式 | DB 表快取(非 Redis) | PostgreSQL 已是核心,無需額外依賴;支援 JOIN | | TTL | 6 小時 | 與 AI Pipeline 排程週期對齊 | | 比對算法 | 品牌 + 核心 token + 容量/重量/包數 + 品類 + 價格 sanity check | 由 `marketplace_product_matcher.py` 統一供 feeder、legacy crawler、AI/PPT 鏈路使用 | -| 最低比對門檻 | 0.62 | 核心比價寧可待審,不允許低信心錯配影響 AI 決策 | +| 最低比對門檻 | 0.76 | 核心比價寧可待審,不允許低信心錯配影響 AI 決策 | | 已有不同 PChome 商品覆蓋門檻 | 0.84 | 新候選與既有正式配對不同時,除非超高信心,否則寫入 `needs_review` attempt 不覆蓋 | | 語意標籤 | JSONB 陣列 | 傳給 Hermes 提升情境感知品質 | @@ -309,7 +309,7 @@ MOMO 商品名稱 → PChomeCrawler.search_products(keyword, limit=12) → marketplace_product_matcher.score_marketplace_match() → 品牌衝突 / 容量衝突 / 包數衝突 hard veto - → 同款高信心 score ≥ 0.62 才進 competitor_prices + → 同款高信心 score ≥ 0.76 才進 competitor_prices → 低信心、規格衝突、既有配對衝突寫入 competitor_match_attempts ``` @@ -320,7 +320,7 @@ LEFT JOIN competitor_prices cp ON cp.sku = lmp.sku AND cp.source = 'pchome' AND cp.expires_at > NOW() - AND cp.match_score >= 0.62 + AND cp.match_score >= 0.76 AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2' ``` → 無競品資料的商品仍回傳,`pchome_price=NULL`,`_batch_analyze` 自動跳過 diff --git a/routes/ai_routes.py b/routes/ai_routes.py index 1756ff3..7bbc55e 100644 --- a/routes/ai_routes.py +++ b/routes/ai_routes.py @@ -1518,7 +1518,7 @@ def api_icaim_dashboard(): ON cp.sku = lm.sku AND cp.source = 'pchome' AND cp.expires_at > NOW() - AND cp.match_score >= 0.62 + AND cp.match_score >= 0.76 AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2' WHERE lm.rn = 1 AND (lm.momo_price - cp.price) / cp.price * 100 > 15 @@ -1528,7 +1528,7 @@ def api_icaim_dashboard(): (SELECT COUNT(*) FROM competitor_prices WHERE expires_at > NOW() AND source = 'pchome' - AND COALESCE(match_score, 0) >= 0.62 + AND COALESCE(match_score, 0) >= 0.76 AND COALESCE(tags, '[]'::jsonb) ? 'identity_v2') AS valid_competitor_prices, (SELECT COUNT(*) FROM high_risk) AS high_risk_count, (SELECT COUNT(*) FROM ai_price_recommendations) AS total_ai_recs, @@ -1565,7 +1565,7 @@ def api_icaim_dashboard(): ON cp.sku = lp.sku AND cp.source = 'pchome' AND cp.expires_at > NOW() - AND cp.match_score >= 0.62 + AND cp.match_score >= 0.76 AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2' WHERE lp.rn = 1 ORDER BY gap_pct DESC NULLS LAST diff --git a/routes/api_routes.py b/routes/api_routes.py index 6829eb9..be8f5d0 100644 --- a/routes/api_routes.py +++ b/routes/api_routes.py @@ -334,7 +334,7 @@ def _build_price_history_payload(session, product): WHERE sku = :sku AND source = 'pchome' AND crawled_at >= :start_date - AND COALESCE(match_score, 0) >= 0.62 + AND COALESCE(match_score, 0) >= 0.76 AND COALESCE(tags, '[]'::jsonb) ? 'identity_v2' ORDER BY crawled_at """), { diff --git a/routes/dashboard_routes.py b/routes/dashboard_routes.py index 8342931..981bee3 100644 --- a/routes/dashboard_routes.py +++ b/routes/dashboard_routes.py @@ -40,7 +40,7 @@ sys_log = SystemLogger("DashboardRoutes").get_logger() dashboard_bp = Blueprint('dashboard', __name__) PRODUCT_PICK_LIST_LIMIT = 50 -PCHOME_MATCH_SCORE_FLOOR = 0.62 +PCHOME_MATCH_SCORE_FLOOR = 0.76 def _build_pchome_product_url(product_id): diff --git a/routes/export_routes.py b/routes/export_routes.py index f313f03..7dd7bde 100644 --- a/routes/export_routes.py +++ b/routes/export_routes.py @@ -137,7 +137,7 @@ def export_excel_ai_picks(): AND (cp.expires_at IS NULL OR cp.expires_at > CURRENT_TIMESTAMP) AND cp.price IS NOT NULL AND cp.price > 0 - AND COALESCE(cp.match_score, 0) >= 0.62 + AND COALESCE(cp.match_score, 0) >= 0.76 AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2' ORDER BY cp.sku, cp.crawled_at DESC NULLS LAST ) diff --git a/services/ai_product_pick_agent.py b/services/ai_product_pick_agent.py index 5385305..a60791d 100644 --- a/services/ai_product_pick_agent.py +++ b/services/ai_product_pick_agent.py @@ -154,7 +154,7 @@ def _fetch_candidates_without_sales(conn, limit: int) -> List[Dict[str, Any]]: ON cp.sku = lm.sku AND cp.source = 'pchome' AND (cp.expires_at IS NULL OR cp.expires_at > CURRENT_TIMESTAMP) - AND cp.match_score >= 0.62 + AND cp.match_score >= 0.76 {identity_condition} WHERE lm.rn = 1 ORDER BY cp.match_score DESC, cp.crawled_at DESC @@ -249,7 +249,7 @@ def _fetch_candidates(conn, limit: int) -> List[Dict[str, Any]]: FROM competitor_price_history WHERE source = 'pchome' AND crawled_at >= CURRENT_TIMESTAMP - INTERVAL '30 days' - AND COALESCE(match_score, 0) >= 0.62 + AND COALESCE(match_score, 0) >= 0.76 AND COALESCE(tags, '[]'::jsonb) ? 'identity_v2' GROUP BY sku, source ) @@ -277,7 +277,7 @@ def _fetch_candidates(conn, limit: int) -> List[Dict[str, Any]]: ON cp.sku = lm.sku AND cp.source = 'pchome' AND (cp.expires_at IS NULL OR cp.expires_at > CURRENT_TIMESTAMP) - AND cp.match_score >= 0.62 + AND cp.match_score >= 0.76 {identity_condition} LEFT JOIN history_stats hs ON hs.sku = lm.sku @@ -515,7 +515,7 @@ def generate_product_pick_list(engine, limit: int = 50) -> ProductPickResult: scored = [_score_candidate(row) for row in rows if _to_float(row.get("pchome_price")) > 0] picks = [ pick for pick in scored - if pick["pick_score"] >= 45 and (_to_float(pick.get("match_score")) >= 0.62) + if pick["pick_score"] >= 45 and (_to_float(pick.get("match_score")) >= 0.76) ] picks.sort(key=lambda item: item["pick_score"], reverse=True) picks = picks[:limit] diff --git a/services/chart_generator_service.py b/services/chart_generator_service.py index 3a85a19..c07524a 100644 --- a/services/chart_generator_service.py +++ b/services/chart_generator_service.py @@ -107,7 +107,7 @@ def _fetch_price_history(sku: str, days: int = 30) -> Dict[str, Any]: WHERE sku = :sku AND source = 'pchome' AND crawled_at >= NOW() - INTERVAL ':days days' - AND COALESCE(match_score, 0) >= 0.62 + AND COALESCE(match_score, 0) >= 0.76 AND COALESCE(tags, '[]'::jsonb) ? 'identity_v2' GROUP BY dt ORDER BY dt """.replace(":days", str(days))), {"sku": sku}).fetchall() @@ -443,7 +443,7 @@ def price_history_heatmap(days: int = 30) -> Optional[bytes]: ) pr ON pr.product_id = p.id WHERE cp.crawled_at >= NOW() - INTERVAL '{days} days' AND p.category IS NOT NULL - AND COALESCE(cp.match_score, 0) >= 0.62 + AND COALESCE(cp.match_score, 0) >= 0.76 AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2' GROUP BY p.category, dt ORDER BY p.category, dt diff --git a/services/competitor_intel_repository.py b/services/competitor_intel_repository.py index eeeffb7..ae61767 100644 --- a/services/competitor_intel_repository.py +++ b/services/competitor_intel_repository.py @@ -16,7 +16,7 @@ from typing import Any, Optional, Union from sqlalchemy import inspect, text -PCHOME_MATCH_SCORE_FLOOR = 0.62 +PCHOME_MATCH_SCORE_FLOOR = 0.76 def _num(value: Any) -> float: diff --git a/services/competitor_price_feeder.py b/services/competitor_price_feeder.py index b48716b..a2cc327 100644 --- a/services/competitor_price_feeder.py +++ b/services/competitor_price_feeder.py @@ -34,7 +34,7 @@ from typing import Optional logger = logging.getLogger(__name__) # ── 比對參數 ───────────────────────────────────────── -MIN_MATCH_SCORE = 0.62 # 低於此分數不寫入;核心比價寧可待審也不能錯配 +MIN_MATCH_SCORE = 0.76 # 低於此分數不寫入;核心比價寧可待審也不能錯配 REPLACE_DIFFERENT_PRODUCT_SCORE = 0.84 # 已有不同 PChome 商品時,需超高信心才覆蓋 SEARCH_LIMIT = 12 # 每個搜尋詞取 PChome 前 N 筆 MAX_SEARCH_TERMS = 3 # 每個 MOMO 商品最多嘗試幾組搜尋詞 diff --git a/services/elephant_alpha_autonomous_engine.py b/services/elephant_alpha_autonomous_engine.py index dfedc45..c8edb25 100644 --- a/services/elephant_alpha_autonomous_engine.py +++ b/services/elephant_alpha_autonomous_engine.py @@ -369,7 +369,7 @@ class ElephantAlphaAutonomousEngine: ) pr ON pr.product_id = p.id JOIN competitor_prices cp ON cp.sku = p.i_code WHERE cp.expires_at > NOW() - AND COALESCE(cp.match_score, 0) >= 0.62 + AND COALESCE(cp.match_score, 0) >= 0.76 AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2' AND cp.price < pr.price * 0.85 AND cp.crawled_at >= NOW() - INTERVAL '2 hours' @@ -394,7 +394,7 @@ class ElephantAlphaAutonomousEngine: ) pr ON pr.product_id = p.id JOIN competitor_prices cp ON cp.sku = p.i_code WHERE cp.expires_at > NOW() - AND COALESCE(cp.match_score, 0) >= 0.62 + AND COALESCE(cp.match_score, 0) >= 0.76 AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2' AND cp.price > pr.price * 1.05 AND cp.crawled_at >= NOW() - INTERVAL '1 hour' diff --git a/services/hermes_analyst_service.py b/services/hermes_analyst_service.py index 7289526..ca5ff76 100644 --- a/services/hermes_analyst_service.py +++ b/services/hermes_analyst_service.py @@ -396,7 +396,7 @@ class HermesAnalystService: ON cp.sku = lmp.sku AND cp.source = 'pchome' AND cp.expires_at > NOW() - AND cp.match_score >= 0.62 + AND cp.match_score >= 0.76 AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2' WHERE lmp.rn = 1 AND rs.sales_7d_prev > 0 diff --git a/services/marketplace_product_matcher.py b/services/marketplace_product_matcher.py index 3dee19f..faecde4 100644 --- a/services/marketplace_product_matcher.py +++ b/services/marketplace_product_matcher.py @@ -390,6 +390,8 @@ def _spec_component(left_values: Iterable[float], right_values: Iterable[float]) def _count_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, bool]: + left_counts = [count for count, _unit in left.counts] + right_counts = [count for count, _unit in right.counts] if left.total_piece_count and right.total_piece_count: if left.total_piece_count == right.total_piece_count: return 1.0, False @@ -398,7 +400,13 @@ def _count_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, if left.counts and right.counts: if set(left.counts) & set(right.counts): return 0.85, False + if left_counts and right_counts: + ratio = max(max(left_counts), max(right_counts)) / max(min(max(left_counts), max(right_counts)), 1) + if ratio >= 1.5: + return 0.0, True return 0.35, False + if (left_counts and max(left_counts) > 1) or (right_counts and max(right_counts) > 1): + return 0.0, True return 0.5, False @@ -428,6 +436,48 @@ def _spec_score(left: ProductIdentity, right: ProductIdentity) -> tuple[float, b return score, bool(conflicts), tuple(conflicts) +def _has_bundle_offer(identity: ProductIdentity) -> bool: + text = identity.normalized_name + return bool( + re.search(r"買\s*\d+\s*送\s*\d+", text) + or re.search(r"買\s*[一二兩雙三四五六七八九十]\s*送\s*[一二兩雙三四五六七八九十]", text) + or "買一送一" in text + or "囤貨組" in text + ) + + +def _has_multi_component(identity: ProductIdentity) -> bool: + text = identity.normalized_name + return bool( + "+" in text + or "+" in text + or re.search(r"\d+\s*(?:ml|g|毫升|公克)\s*x\s*\d+", text, re.I) + ) + + +def _chinese_bigram_score(left: ProductIdentity, right: ProductIdentity) -> float: + def signature(identity: ProductIdentity) -> set[str]: + text = identity.searchable_name + for token in sorted(identity.brand_tokens, key=len, reverse=True): + text = text.replace(token, " ") + text = re.sub(r"[a-z0-9]+", " ", text) + text = "".join(char for char in text if "\u4e00" <= char <= "\u9fff") + for phrase in ( + "官方", "直營", "公司貨", "專櫃", "正貨", "原廠", "限定", "獨家", + "期間", "超值", "特惠", "優惠", "新品", "經典", "人氣", "熱銷", + "必買", "推薦", "任選", "禮盒", "母親節", "超品日", "多款", + "組", "入", "瓶", "盒", "包", "片", "支", "條", + ): + text = text.replace(phrase, "") + return {text[i:i + 2] for i in range(max(0, len(text) - 1))} + + left_signature = signature(left) + right_signature = signature(right) + if not left_signature or not right_signature: + return 0.55 + return 2 * len(left_signature & right_signature) / (len(left_signature) + len(right_signature)) + + def score_marketplace_match( momo_name: str, competitor_name: str, @@ -441,6 +491,7 @@ def score_marketplace_match( token_score = _weighted_token_score(left, right) spec_score, spec_conflict, spec_reasons = _spec_score(left, right) sequence_score = SequenceMatcher(None, left.searchable_name, right.searchable_name).ratio() + chinese_name_score = _chinese_bigram_score(left, right) if left.product_type and right.product_type: type_score = 1.0 if left.product_type == right.product_type else 0.0 else: @@ -452,8 +503,20 @@ def score_marketplace_match( reasons.extend(spec_reasons) if left.product_type and right.product_type and left.product_type != right.product_type: reasons.append("type_conflict") + if _has_bundle_offer(left) != _has_bundle_offer(right): + reasons.append("bundle_offer_conflict") + if _has_multi_component(left) != _has_multi_component(right): + reasons.append("multi_component_conflict") + if chinese_name_score < 0.16: + reasons.append("product_line_conflict") hard_veto = brand_conflict or spec_conflict + if _has_bundle_offer(left) != _has_bundle_offer(right): + hard_veto = True + if _has_multi_component(left) != _has_multi_component(right): + hard_veto = True + if chinese_name_score < 0.16 and token_score < 0.72: + hard_veto = True if left.product_type and right.product_type and left.product_type != right.product_type and token_score < 0.55: hard_veto = True diff --git a/services/openclaw_strategist_service.py b/services/openclaw_strategist_service.py index 99181a9..2f34419 100644 --- a/services/openclaw_strategist_service.py +++ b/services/openclaw_strategist_service.py @@ -555,7 +555,7 @@ def _fetch_competitor_summary() -> Dict[str, Any]: FROM price_records ORDER BY product_id, timestamp DESC ) pr ON pr.product_id = p.id WHERE cp.expires_at > NOW() - AND COALESCE(cp.match_score, 0) >= 0.62 + AND COALESCE(cp.match_score, 0) >= 0.76 AND COALESCE(cp.tags, '[]'::jsonb) ? 'identity_v2' """)).fetchone() if row and row[0]: diff --git a/services/pchome_crawler.py b/services/pchome_crawler.py index 81af9b1..568340c 100644 --- a/services/pchome_crawler.py +++ b/services/pchome_crawler.py @@ -505,7 +505,7 @@ def find_best_match(keyword: str, momo_price: float) -> Optional[dict]: best = result best_score = diagnostics.score best_diagnostics = diagnostics - if not best or best_score < 0.62: + if not best or best_score < 0.76: return None best['match_score'] = best_score best['match_reasons'] = list(getattr(best_diagnostics, 'reasons', ()) or ()) diff --git a/templates/growth_analysis.html b/templates/growth_analysis.html index 7d1b2f5..fe2a5ee 100644 --- a/templates/growth_analysis.html +++ b/templates/growth_analysis.html @@ -140,7 +140,7 @@ {% set coverage = chart_data.competitor_coverage | default({}) %}