diff --git a/TODO_NEXT_STEPS.txt b/TODO_NEXT_STEPS.txt index ad020c0..d1e3bd4 100644 --- a/TODO_NEXT_STEPS.txt +++ b/TODO_NEXT_STEPS.txt @@ -4,6 +4,7 @@ ================================================================================ 【已完成】 + - V10.328 強化 MOMO/PChome 核心比價準確性第一波:補高頻品牌 alias、中文商品線 bigram 訊號、保健/包裝同義單位與買送件數解析,搜尋詞改為品牌/核心/主規格三層;PChome 比對嘗試與正式快照補存 URL、圖片、庫存與結構化 diagnostics,商品列表用 tone 分流顯示尚未搜尋、低信心、身份否決、單位價與過期狀態,不再把不同問題全部壓成灰色待比對;同步持久化首頁 / PChome coverage 熱路徑索引,避免重開機後慢查詢回歸。 - V10.327 補 OpenClaw fallback 可觀測性:週報、月報、Meta、日報洞察、每日報告的 Gemini/NIM 備援 caller 納入 caller registry、AI 觀測台 agent group 與 Telegram 狀態統計,並補 MCP collector Ollama-first regression test,避免 fallback 真實使用量在觀測層被歸類成未知或漏算。 - V10.326 補市場情報 candidate queue review AI summary Telegram dispatch report run readiness:新增 read-only report run readiness builder、POST endpoint、UI 按鈕與 deployment readiness smoke target,在 report run package 後整理 report generation readiness manifest、manual report command boundary、artifact path gate 與後續 report run receipt gate;API/UI 不讀 approval/Telegram token、不呼叫 LLM、不派送 Telegram、不開 DB、不寫檔、不產報表、不更新 review_state、不掛 scheduler。 - V10.325 收斂 Gemini 主路徑:OpenClaw 週/月/meta/日報洞察、Telegram PPT 分析與 MCP fallback 全部改成先走 OllamaService 的 GCP-A → GCP-B → 111 三主機級聯;Gemini 只在 Ollama/NIM 不可用後作備援。Elephant Alpha resource_optimization 告警補上待處理 action_plans 焦點列表,避免只報隊列數字卻沒有可執行對象。 diff --git a/config.py b/config.py index b83ef61..c6d73af 100644 --- a/config.py +++ b/config.py @@ -320,7 +320,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '') # ========================================== # 系統版本與路徑 # ========================================== -SYSTEM_VERSION = "V10.327" +SYSTEM_VERSION = "V10.328" LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log') public_url = PUBLIC_URL # 用於模板顯示 diff --git a/database/schema_repair.py b/database/schema_repair.py index db2cd4b..59f67fe 100644 --- a/database/schema_repair.py +++ b/database/schema_repair.py @@ -71,5 +71,40 @@ def repair_database_schema(): for col_name, ddl in promo_columns: _ensure_column(engine, text, 'promo_products', col_name, ddl) + # V10.328: PChome/MOMO 比價診斷欄位。正式價差與待審嘗試都需可回溯 + # URL、圖片、庫存與 matcher 結構化原因,避免「待比對」無法被人工處理。 + json_type = 'JSONB' if DATABASE_TYPE == 'postgresql' else 'TEXT' + competitor_price_columns = [ + ('competitor_product_url', "ALTER TABLE competitor_prices ADD COLUMN competitor_product_url TEXT"), + ('competitor_image_url', "ALTER TABLE competitor_prices ADD COLUMN competitor_image_url TEXT"), + ('competitor_stock', "ALTER TABLE competitor_prices ADD COLUMN competitor_stock INTEGER"), + ('match_diagnostic_json', f"ALTER TABLE competitor_prices ADD COLUMN match_diagnostic_json {json_type}"), + ('comparison_mode', "ALTER TABLE competitor_prices ADD COLUMN comparison_mode VARCHAR(40)"), + ('hard_veto', "ALTER TABLE competitor_prices ADD COLUMN hard_veto BOOLEAN"), + ('diagnostic_codes', f"ALTER TABLE competitor_prices ADD COLUMN diagnostic_codes {json_type}"), + ] + competitor_history_columns = [ + ('competitor_product_url', "ALTER TABLE competitor_price_history ADD COLUMN competitor_product_url TEXT"), + ('competitor_image_url', "ALTER TABLE competitor_price_history ADD COLUMN competitor_image_url TEXT"), + ('competitor_stock', "ALTER TABLE competitor_price_history ADD COLUMN competitor_stock INTEGER"), + ('match_diagnostic_json', f"ALTER TABLE competitor_price_history ADD COLUMN match_diagnostic_json {json_type}"), + ('comparison_mode', "ALTER TABLE competitor_price_history ADD COLUMN comparison_mode VARCHAR(40)"), + ('hard_veto', "ALTER TABLE competitor_price_history ADD COLUMN hard_veto BOOLEAN"), + ('diagnostic_codes', f"ALTER TABLE competitor_price_history ADD COLUMN diagnostic_codes {json_type}"), + ] + competitor_attempt_columns = [ + ('competitor_product_url', "ALTER TABLE competitor_match_attempts ADD COLUMN competitor_product_url TEXT"), + ('competitor_image_url', "ALTER TABLE competitor_match_attempts ADD COLUMN competitor_image_url TEXT"), + ('competitor_stock', "ALTER TABLE competitor_match_attempts ADD COLUMN competitor_stock INTEGER"), + ('match_diagnostic_json', f"ALTER TABLE competitor_match_attempts ADD COLUMN match_diagnostic_json {json_type}"), + ('comparison_mode', "ALTER TABLE competitor_match_attempts ADD COLUMN comparison_mode VARCHAR(40)"), + ('hard_veto', "ALTER TABLE competitor_match_attempts ADD COLUMN hard_veto BOOLEAN"), + ('diagnostic_codes', f"ALTER TABLE competitor_match_attempts ADD COLUMN diagnostic_codes {json_type}"), + ] + for table_columns in (competitor_price_columns, competitor_history_columns, competitor_attempt_columns): + for col_name, ddl in table_columns: + table_name = ddl.split()[2] + _ensure_column(engine, text, table_name, col_name, ddl) + except Exception as e: _log.error(f"[Database] [Schema] ❌ 資料庫修復失敗 | Error: {e}") diff --git a/docs/memory/history_logs.md b/docs/memory/history_logs.md index 0a33a53..d0aaff5 100644 --- a/docs/memory/history_logs.md +++ b/docs/memory/history_logs.md @@ -12,6 +12,9 @@ ## 📅 詳細更新日誌 (考古存檔) +### 2026-05-20:重開機後首頁熱路徑索引持久化 +- **Dashboard / PChome 慢查詢修復**: 主機重開機後 `https://mo.wooo.work/` 首頁可用但多次逾時,實際瓶頸集中在首頁與 PChome coverage 查詢掃描 `products`、`price_records`、`competitor_match_attempts`。線上先補三個索引讓首頁恢復 200,並新增 `migrations/040_dashboard_hot_path_indexes.sql` 將修復持久化到 fresh restore / DB rebuild 流程。 + ### 2026-04-29:ADR-017 Phase 3f 模組化收尾啟動 - **DB metadata 救急**: `database/manager.py` 改為顯式載入 permission / AI / autoheal / import / vendor / realtime_sales ORM,PostgreSQL 初始化透過 process-local guard + advisory lock 執行 `Base.metadata.create_all()`,避免新環境漏表與一般流量重複碰 DDL。 - **realtime_sales_monthly 補 ORM**: 新增 `database/realtime_sales_models.py`,並同步 `docker/postgres/init/01-init.sql` 欄位,避免 fresh volume 先建出窄表後造成匯入欄位靜默遺失。 diff --git a/migrations/040_dashboard_hot_path_indexes.sql b/migrations/040_dashboard_hot_path_indexes.sql new file mode 100644 index 0000000..ab555da --- /dev/null +++ b/migrations/040_dashboard_hot_path_indexes.sql @@ -0,0 +1,37 @@ +-- ============================================================================= +-- Migration 040: Dashboard / PChome 熱路徑索引 +-- MOMO PRO — 2026-05-20 重開機後首頁慢查詢修復持久化 +-- ============================================================================= +-- 背景: +-- 2026-05-20 主機重開機後,https://mo.wooo.work/ 首頁雖可用但多次 +-- 8-45 秒逾時。實際瓶頸在首頁與 PChome coverage 查詢反覆掃描 +-- products、price_records、competitor_match_attempts。 +-- +-- 設計: +-- 1. 只新增索引,不改資料、不調整欄位、不重啟容器。 +-- 2. 使用 IF NOT EXISTS,讓 live 已手動套用索引、fresh restore、重跑 migration +-- 都維持冪等。 +-- 3. 不使用 CREATE INDEX CONCURRENTLY,避免被包在 transaction 的 migration +-- runner 執行時失敗;若未來超大型 live DB 需要零鎖定窗口,請在維護窗口 +-- 手動轉換為 CONCURRENTLY 流程。 +-- ============================================================================= + +CREATE INDEX IF NOT EXISTS idx_comp_match_attempts_source_sku_attempted_at + ON competitor_match_attempts (source, sku, attempted_at DESC) + INCLUDE (attempt_status); + +CREATE INDEX IF NOT EXISTS idx_price_records_product_timestamp_id_desc + ON price_records (product_id, timestamp DESC, id DESC) + INCLUDE (price); + +CREATE INDEX IF NOT EXISTS idx_products_status_id_icode + ON products (status, id, i_code); + +ANALYZE products; +ANALYZE price_records; +ANALYZE competitor_match_attempts; + +DO $$ +BEGIN + RAISE NOTICE 'Migration 040 完成 — Dashboard / PChome 熱路徑索引已確認'; +END $$; diff --git a/migrations/041_competitor_match_diagnostics.sql b/migrations/041_competitor_match_diagnostics.sql new file mode 100644 index 0000000..ef15324 --- /dev/null +++ b/migrations/041_competitor_match_diagnostics.sql @@ -0,0 +1,49 @@ +-- ============================================================================= +-- Migration 041: PChome / MOMO 比價診斷欄位 +-- MOMO PRO — Core price comparison evidence +-- 2026-05-20 台北 +-- ============================================================================= +-- 說明: +-- competitor_prices / competitor_price_history / competitor_match_attempts +-- 補存 PChome URL、圖片、庫存與 matcher 結構化診斷。 +-- 目的不是放寬配對門檻,而是讓「低信心 / 身份否決 / 單位價」可被 +-- 商品列表、圖表、簡報與人工覆核精準追蹤。 +-- ============================================================================= + +ALTER TABLE IF EXISTS competitor_prices + ADD COLUMN IF NOT EXISTS competitor_product_url TEXT, + ADD COLUMN IF NOT EXISTS competitor_image_url TEXT, + ADD COLUMN IF NOT EXISTS competitor_stock INTEGER, + ADD COLUMN IF NOT EXISTS match_diagnostic_json JSONB, + ADD COLUMN IF NOT EXISTS comparison_mode VARCHAR(40), + ADD COLUMN IF NOT EXISTS hard_veto BOOLEAN, + ADD COLUMN IF NOT EXISTS diagnostic_codes JSONB; + +ALTER TABLE IF EXISTS competitor_price_history + ADD COLUMN IF NOT EXISTS competitor_product_url TEXT, + ADD COLUMN IF NOT EXISTS competitor_image_url TEXT, + ADD COLUMN IF NOT EXISTS competitor_stock INTEGER, + ADD COLUMN IF NOT EXISTS match_diagnostic_json JSONB, + ADD COLUMN IF NOT EXISTS comparison_mode VARCHAR(40), + ADD COLUMN IF NOT EXISTS hard_veto BOOLEAN, + ADD COLUMN IF NOT EXISTS diagnostic_codes JSONB; + +ALTER TABLE IF EXISTS competitor_match_attempts + ADD COLUMN IF NOT EXISTS competitor_product_url TEXT, + ADD COLUMN IF NOT EXISTS competitor_image_url TEXT, + ADD COLUMN IF NOT EXISTS competitor_stock INTEGER, + ADD COLUMN IF NOT EXISTS match_diagnostic_json JSONB, + ADD COLUMN IF NOT EXISTS comparison_mode VARCHAR(40), + ADD COLUMN IF NOT EXISTS hard_veto BOOLEAN, + ADD COLUMN IF NOT EXISTS diagnostic_codes JSONB; + +CREATE INDEX IF NOT EXISTS idx_comp_match_attempts_mode_time + ON competitor_match_attempts (comparison_mode, attempted_at DESC); + +CREATE INDEX IF NOT EXISTS idx_comp_price_comparison_mode + ON competitor_prices (comparison_mode); + +DO $$ +BEGIN + RAISE NOTICE '✅ Migration 041 完成 — PChome/MOMO 比價診斷欄位已補齊'; +END $$; diff --git a/routes/dashboard_routes.py b/routes/dashboard_routes.py index 954b9eb..f41503f 100644 --- a/routes/dashboard_routes.py +++ b/routes/dashboard_routes.py @@ -279,9 +279,9 @@ def _build_pchome_match_status(attempt=None, ineligible=None): 'detail': attempt.get('error_message'), } return { - 'label': '待比對', + 'label': '狀態待釐清', 'tone': 'neutral', - 'summary': '尚無有效 PChome 對應商品或價格快取', + 'summary': '已有比對紀錄但尚未分類,需檢查 matcher diagnostics 或補抓紀錄', 'detail': score_text, } @@ -341,8 +341,14 @@ def _load_pchome_competitor_map(session, skus): discount_pct, competitor_product_id, competitor_product_name, + competitor_product_url, + competitor_image_url, + competitor_stock, match_score, tags, + comparison_mode, + hard_veto, + diagnostic_codes, crawled_at, expires_at FROM competitor_prices @@ -370,9 +376,14 @@ def _load_pchome_competitor_map(session, skus): 'discount_pct': row.get('discount_pct'), 'product_id': competitor_product_id, 'product_name': row.get('competitor_product_name'), - 'product_url': _build_pchome_product_url(competitor_product_id), + 'product_url': row.get('competitor_product_url') or _build_pchome_product_url(competitor_product_id), + 'image_url': row.get('competitor_image_url'), + 'stock': row.get('competitor_stock'), 'match_score': _to_float(row.get('match_score')), 'tags': row.get('tags'), + 'comparison_mode': row.get('comparison_mode'), + 'hard_veto': row.get('hard_veto'), + 'diagnostic_codes': row.get('diagnostic_codes'), 'crawled_at': row.get('crawled_at'), 'expires_at': row.get('expires_at'), } @@ -393,8 +404,14 @@ def _load_pchome_ineligible_competitor_map(session, skus): price, competitor_product_id, competitor_product_name, + competitor_product_url, + competitor_image_url, + competitor_stock, match_score, tags, + comparison_mode, + hard_veto, + diagnostic_codes, crawled_at, expires_at, CASE @@ -454,8 +471,14 @@ def _load_pchome_ineligible_competitor_map(session, skus): 'price': _to_float(row.get('price')), 'product_id': row.get('competitor_product_id'), 'product_name': row.get('competitor_product_name'), + 'product_url': row.get('competitor_product_url') or _build_pchome_product_url(row.get('competitor_product_id')), + 'image_url': row.get('competitor_image_url'), + 'stock': row.get('competitor_stock'), 'match_score': _to_float(row.get('match_score')), 'tags': row.get('tags'), + 'comparison_mode': row.get('comparison_mode'), + 'hard_veto': row.get('hard_veto'), + 'diagnostic_codes': row.get('diagnostic_codes'), 'crawled_at': row.get('crawled_at'), 'expires_at': row.get('expires_at'), } @@ -486,8 +509,15 @@ def _load_pchome_match_attempt_map(session, skus): cma.candidate_count, cma.best_competitor_product_id, cma.best_competitor_product_name, + cma.competitor_product_url, + cma.competitor_image_url, + cma.competitor_stock, cma.best_competitor_price, cma.best_match_score, + cma.match_diagnostic_json, + cma.comparison_mode, + cma.hard_veto, + cma.diagnostic_codes, cma.error_message, cma.attempted_at, lm.momo_product_name, @@ -512,6 +542,8 @@ def _load_pchome_match_attempt_map(session, skus): result = {} for row in rows: item = dict(row) + if item.get('best_competitor_product_id') and not item.get('competitor_product_url'): + item['competitor_product_url'] = _build_pchome_product_url(item.get('best_competitor_product_id')) if item.get('attempt_status') in {'unit_comparable', 'refresh_unit_comparable'}: try: from services.marketplace_product_matcher import build_unit_price_comparison diff --git a/services/competitor_intel_repository.py b/services/competitor_intel_repository.py index 7466763..434b628 100644 --- a/services/competitor_intel_repository.py +++ b/services/competitor_intel_repository.py @@ -115,7 +115,7 @@ def _month_label(value: Any) -> str: def _attempt_status_label(status: Any) -> str: - return ATTEMPT_STATUS_LABELS.get(str(status or ""), str(status or "待比對")) + return ATTEMPT_STATUS_LABELS.get(str(status or ""), str(status or "狀態待釐清")) def _attempt_action_label(status: Any) -> str: diff --git a/services/competitor_price_feeder.py b/services/competitor_price_feeder.py index 283da7b..1dce3ec 100644 --- a/services/competitor_price_feeder.py +++ b/services/competitor_price_feeder.py @@ -161,6 +161,40 @@ def _format_match_diagnostics(diagnostics) -> str: ) +def _match_diagnostics_payload(diagnostics) -> dict: + """Serialize matcher diagnostics for review/report consumers.""" + if not diagnostics: + return {} + return { + "score": getattr(diagnostics, "score", None), + "brand_score": getattr(diagnostics, "brand_score", None), + "token_score": getattr(diagnostics, "token_score", None), + "spec_score": getattr(diagnostics, "spec_score", None), + "sequence_score": getattr(diagnostics, "sequence_score", None), + "type_score": getattr(diagnostics, "type_score", None), + "price_penalty": getattr(diagnostics, "price_penalty", None), + "hard_veto": bool(getattr(diagnostics, "hard_veto", False)), + "comparison_mode": getattr(diagnostics, "comparison_mode", "exact_identity"), + "reasons": list(getattr(diagnostics, "reasons", ()) or ()), + } + + +def _product_snapshot_payload(product) -> dict: + payload = { + "competitor_product_url": None, + "competitor_image_url": None, + "competitor_stock": None, + } + if not product: + return payload + payload.update({ + "competitor_product_url": getattr(product, "product_url", None), + "competitor_image_url": getattr(product, "image_url", None), + "competitor_stock": getattr(product, "stock", None), + }) + return payload + + def _product_id_key(product_id: str) -> str: """Normalize PChome IDs for comparing cached IDs with API-returned IDs.""" return re.sub(r"[^A-Z0-9]", "", str(product_id or "").upper()) @@ -288,6 +322,35 @@ class CompetitorPriceFeeder: self.engine = engine self._history_table_ready = False self._attempt_table_ready = False + self._price_table_columns_ready = False + + def _ensure_table_columns(self, conn, table: str, column_specs: list[tuple[str, str]]) -> None: + """補齊既有表欄位;避免正式端舊表在新 INSERT 時炸掉。""" + from sqlalchemy import inspect, text + + inspector = inspect(conn) + if not inspector.has_table(table): + return + existing = {column["name"] for column in inspector.get_columns(table)} + for column_name, column_type in column_specs: + if column_name in existing: + continue + conn.execute(text(f"ALTER TABLE {table} ADD COLUMN {column_name} {column_type}")) + existing.add(column_name) + + def _ensure_competitor_prices_columns(self, conn) -> None: + if self._price_table_columns_ready: + return + self._ensure_table_columns(conn, "competitor_prices", [ + ("competitor_product_url", "TEXT"), + ("competitor_image_url", "TEXT"), + ("competitor_stock", "INTEGER"), + ("match_diagnostic_json", "JSONB" if conn.dialect.name == "postgresql" else "TEXT"), + ("comparison_mode", "VARCHAR(40)"), + ("hard_veto", "BOOLEAN"), + ("diagnostic_codes", "JSONB" if conn.dialect.name == "postgresql" else "TEXT"), + ]) + self._price_table_columns_ready = True def _ensure_competitor_price_history_table(self, conn): """確保競品價格歷史表存在;排程可自癒補表,不依賴手動 migration。""" @@ -308,8 +371,15 @@ class CompetitorPriceFeeder: discount_pct INTEGER, competitor_product_id VARCHAR(100), competitor_product_name TEXT, + competitor_product_url TEXT, + competitor_image_url TEXT, + competitor_stock INTEGER, match_score NUMERIC(4,3), tags JSONB DEFAULT '[]'::jsonb, + match_diagnostic_json JSONB, + comparison_mode VARCHAR(40), + hard_veto BOOLEAN, + diagnostic_codes JSONB, crawled_at TIMESTAMP NOT NULL DEFAULT NOW() ) """)) @@ -334,8 +404,15 @@ class CompetitorPriceFeeder: discount_pct INTEGER, competitor_product_id VARCHAR(100), competitor_product_name TEXT, + competitor_product_url TEXT, + competitor_image_url TEXT, + competitor_stock INTEGER, match_score NUMERIC(4,3), tags TEXT DEFAULT '[]', + match_diagnostic_json TEXT, + comparison_mode VARCHAR(40), + hard_veto BOOLEAN, + diagnostic_codes TEXT, crawled_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ) """)) @@ -348,6 +425,15 @@ class CompetitorPriceFeeder: ON competitor_price_history (competitor_product_id) """)) + self._ensure_table_columns(conn, "competitor_price_history", [ + ("competitor_product_url", "TEXT"), + ("competitor_image_url", "TEXT"), + ("competitor_stock", "INTEGER"), + ("match_diagnostic_json", "JSONB" if conn.dialect.name == "postgresql" else "TEXT"), + ("comparison_mode", "VARCHAR(40)"), + ("hard_veto", "BOOLEAN"), + ("diagnostic_codes", "JSONB" if conn.dialect.name == "postgresql" else "TEXT"), + ]) self._history_table_ready = True def _ensure_competitor_match_attempts_table(self, conn): @@ -370,8 +456,15 @@ class CompetitorPriceFeeder: attempt_status VARCHAR(30) NOT NULL, best_competitor_product_id VARCHAR(100), best_competitor_product_name TEXT, + competitor_product_url TEXT, + competitor_image_url TEXT, + competitor_stock INTEGER, best_competitor_price NUMERIC(10,2), best_match_score NUMERIC(4,3), + match_diagnostic_json JSONB, + comparison_mode VARCHAR(40), + hard_veto BOOLEAN, + diagnostic_codes JSONB, error_message TEXT, attempted_at TIMESTAMP NOT NULL DEFAULT NOW() ) @@ -398,8 +491,15 @@ class CompetitorPriceFeeder: attempt_status VARCHAR(30) NOT NULL, best_competitor_product_id VARCHAR(100), best_competitor_product_name TEXT, + competitor_product_url TEXT, + competitor_image_url TEXT, + competitor_stock INTEGER, best_competitor_price NUMERIC(10,2), best_match_score NUMERIC(4,3), + match_diagnostic_json TEXT, + comparison_mode VARCHAR(40), + hard_veto BOOLEAN, + diagnostic_codes TEXT, error_message TEXT, attempted_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ) @@ -413,6 +513,15 @@ class CompetitorPriceFeeder: ON competitor_match_attempts (attempt_status, attempted_at DESC) """)) + self._ensure_table_columns(conn, "competitor_match_attempts", [ + ("competitor_product_url", "TEXT"), + ("competitor_image_url", "TEXT"), + ("competitor_stock", "INTEGER"), + ("match_diagnostic_json", "JSONB" if conn.dialect.name == "postgresql" else "TEXT"), + ("comparison_mode", "VARCHAR(40)"), + ("hard_veto", "BOOLEAN"), + ("diagnostic_codes", "JSONB" if conn.dialect.name == "postgresql" else "TEXT"), + ]) self._attempt_table_ready = True def _record_match_attempt( @@ -426,6 +535,7 @@ class CompetitorPriceFeeder: attempt_status: str = "unknown", best_product=None, best_score: float = None, + diagnostics=None, error_message: str = None, source: str = "pchome", ) -> None: @@ -435,18 +545,29 @@ class CompetitorPriceFeeder: with self.engine.begin() as conn: self._ensure_competitor_match_attempts_table(conn) search_terms_expr = "CAST(:search_terms AS jsonb)" if conn.dialect.name == "postgresql" else ":search_terms" + json_cast = "CAST(:match_diagnostic_json AS jsonb)" if conn.dialect.name == "postgresql" else ":match_diagnostic_json" + codes_cast = "CAST(:diagnostic_codes AS jsonb)" if conn.dialect.name == "postgresql" else ":diagnostic_codes" + diagnostic_payload = _match_diagnostics_payload(diagnostics) + diagnostic_codes = diagnostic_payload.get("reasons") or [] + product_payload = _product_snapshot_payload(best_product) conn.execute(text(f""" INSERT INTO competitor_match_attempts (sku, source, momo_product_id, momo_product_name, momo_price, search_terms, candidate_count, attempt_status, best_competitor_product_id, best_competitor_product_name, - best_competitor_price, best_match_score, error_message, + competitor_product_url, competitor_image_url, competitor_stock, + best_competitor_price, best_match_score, + match_diagnostic_json, comparison_mode, hard_veto, diagnostic_codes, + error_message, attempted_at) VALUES (:sku, :source, :momo_product_id, :momo_product_name, :momo_price, {search_terms_expr}, :candidate_count, :attempt_status, :best_id, :best_name, - :best_price, :best_score, :error_message, + :competitor_product_url, :competitor_image_url, :competitor_stock, + :best_price, :best_score, + {json_cast}, :comparison_mode, :hard_veto, {codes_cast}, + :error_message, CURRENT_TIMESTAMP) """), { "sku": sku, @@ -459,8 +580,13 @@ class CompetitorPriceFeeder: "attempt_status": attempt_status, "best_id": getattr(best_product, "product_id", None), "best_name": (getattr(best_product, "name", None) or "")[:300] or None, + **product_payload, "best_price": getattr(best_product, "price", None), "best_score": best_score, + "match_diagnostic_json": json.dumps(diagnostic_payload, ensure_ascii=False) if diagnostic_payload else None, + "comparison_mode": diagnostic_payload.get("comparison_mode"), + "hard_veto": diagnostic_payload.get("hard_veto"), + "diagnostic_codes": json.dumps(diagnostic_codes, ensure_ascii=False) if diagnostic_codes else None, "error_message": (error_message or "")[:1000] or None, }) @@ -608,6 +734,7 @@ class CompetitorPriceFeeder: tags: list, momo_product_id: int = None, momo_price: float = None, + diagnostics=None, source: str = "pchome", ): """單筆寫入/更新最新快取,並追加一筆歷史快照。""" @@ -615,28 +742,47 @@ class CompetitorPriceFeeder: _taipei = timezone(timedelta(hours=8)) expires_at = (datetime.now(_taipei) + timedelta(hours=TTL_HOURS)).strftime("%Y-%m-%d %H:%M:%S") tags_json = json.dumps(tags, ensure_ascii=False) + diagnostic_payload = _match_diagnostics_payload(diagnostics) + diagnostic_codes = diagnostic_payload.get("reasons") or [] + product_payload = _product_snapshot_payload(product) with self.engine.begin() as conn: self._ensure_competitor_price_history_table(conn) + self._ensure_competitor_prices_columns(conn) + json_expr = "CAST(:match_diagnostic_json AS jsonb)" if conn.dialect.name == "postgresql" else ":match_diagnostic_json" + codes_expr = "CAST(:diagnostic_codes AS jsonb)" if conn.dialect.name == "postgresql" else ":diagnostic_codes" conn.execute(text(""" INSERT INTO competitor_prices (sku, source, price, original_price, discount_pct, competitor_product_id, competitor_product_name, - match_score, tags, crawled_at, expires_at) + competitor_product_url, competitor_image_url, competitor_stock, + match_score, tags, match_diagnostic_json, + comparison_mode, hard_veto, diagnostic_codes, + crawled_at, expires_at) VALUES (:sku, :source, :price, :original_price, :discount_pct, :comp_id, :comp_name, - :match_score, :tags, CURRENT_TIMESTAMP, :expires_at) + :competitor_product_url, :competitor_image_url, :competitor_stock, + :match_score, :tags, {json_expr}, + :comparison_mode, :hard_veto, {codes_expr}, + CURRENT_TIMESTAMP, :expires_at) ON CONFLICT (sku, source) DO UPDATE SET price = EXCLUDED.price, original_price = EXCLUDED.original_price, discount_pct = EXCLUDED.discount_pct, competitor_product_id = EXCLUDED.competitor_product_id, competitor_product_name = EXCLUDED.competitor_product_name, + competitor_product_url = EXCLUDED.competitor_product_url, + competitor_image_url = EXCLUDED.competitor_image_url, + competitor_stock = EXCLUDED.competitor_stock, match_score = EXCLUDED.match_score, tags = EXCLUDED.tags, + match_diagnostic_json = EXCLUDED.match_diagnostic_json, + comparison_mode = EXCLUDED.comparison_mode, + hard_veto = EXCLUDED.hard_veto, + diagnostic_codes = EXCLUDED.diagnostic_codes, crawled_at = CURRENT_TIMESTAMP, expires_at = :expires_at - """), { + """.format(json_expr=json_expr, codes_expr=codes_expr)), { "sku": sku, "source": source, "price": product.price, @@ -644,8 +790,13 @@ class CompetitorPriceFeeder: "discount_pct": product.discount, "comp_id": product.product_id, "comp_name": product.name[:200], + **product_payload, "match_score": match_score, "tags": tags_json, + "match_diagnostic_json": json.dumps(diagnostic_payload, ensure_ascii=False) if diagnostic_payload else None, + "comparison_mode": diagnostic_payload.get("comparison_mode"), + "hard_veto": diagnostic_payload.get("hard_veto"), + "diagnostic_codes": json.dumps(diagnostic_codes, ensure_ascii=False) if diagnostic_codes else None, "expires_at": expires_at, }) conn.execute(text(""" @@ -653,13 +804,19 @@ class CompetitorPriceFeeder: (sku, source, momo_product_id, momo_price, price, original_price, discount_pct, competitor_product_id, competitor_product_name, - match_score, tags, crawled_at) + competitor_product_url, competitor_image_url, competitor_stock, + match_score, tags, match_diagnostic_json, + comparison_mode, hard_veto, diagnostic_codes, + crawled_at) VALUES (:sku, :source, :momo_product_id, :momo_price, :price, :original_price, :discount_pct, :comp_id, :comp_name, - :match_score, :tags, CURRENT_TIMESTAMP) - """), { + :competitor_product_url, :competitor_image_url, :competitor_stock, + :match_score, :tags, {json_expr}, + :comparison_mode, :hard_veto, {codes_expr}, + CURRENT_TIMESTAMP) + """.format(json_expr=json_expr, codes_expr=codes_expr)), { "sku": sku, "source": source, "momo_product_id": momo_product_id, @@ -669,8 +826,13 @@ class CompetitorPriceFeeder: "discount_pct": product.discount, "comp_id": product.product_id, "comp_name": product.name[:200], + **product_payload, "match_score": match_score, "tags": tags_json, + "match_diagnostic_json": json.dumps(diagnostic_payload, ensure_ascii=False) if diagnostic_payload else None, + "comparison_mode": diagnostic_payload.get("comparison_mode"), + "hard_veto": diagnostic_payload.get("hard_veto"), + "diagnostic_codes": json.dumps(diagnostic_codes, ensure_ascii=False) if diagnostic_codes else None, }) def _should_upsert_competitor_price( @@ -853,6 +1015,7 @@ class CompetitorPriceFeeder: attempt_status="manual_rejected", best_product=best_product, best_score=score, + diagnostics=diagnostics, error_message=( f"manual_review_rejected; rejected_candidates={rejected_note}; " f"{_format_match_diagnostics(diagnostics)}" @@ -880,6 +1043,7 @@ class CompetitorPriceFeeder: attempt_status="manual_unit_price_required", best_product=best_product, best_score=score, + diagnostics=diagnostics, error_message=f"manual_review_unit_price_required; {_format_match_diagnostics(diagnostics)}", source=source, ) @@ -903,6 +1067,7 @@ class CompetitorPriceFeeder: attempt_status="unit_comparable", best_product=best_product, best_score=score, + diagnostics=diagnostics, error_message=_format_match_diagnostics(diagnostics), source=source, ) @@ -925,6 +1090,7 @@ class CompetitorPriceFeeder: attempt_status="low_score", best_product=best_product, best_score=score, + diagnostics=diagnostics, error_message=_format_match_diagnostics(diagnostics), source=source, ) @@ -963,6 +1129,7 @@ class CompetitorPriceFeeder: attempt_status="needs_review", best_product=best_product, best_score=score, + diagnostics=diagnostics, error_message=f"{write_reason}; {_format_match_diagnostics(diagnostics)}", source=source, ) @@ -978,6 +1145,7 @@ class CompetitorPriceFeeder: tags, momo_product_id=momo_product_id, momo_price=momo_price, + diagnostics=diagnostics, source=source, ) self._record_match_attempt( @@ -990,6 +1158,7 @@ class CompetitorPriceFeeder: attempt_status="matched", best_product=best_product, best_score=score, + diagnostics=diagnostics, source=source, ) matched += 1 @@ -1126,6 +1295,7 @@ class CompetitorPriceFeeder: attempt_status="refresh_unit_comparable", best_product=best_product, best_score=score, + diagnostics=diagnostics, error_message=_format_match_diagnostics(diagnostics), source=source, ) @@ -1144,6 +1314,7 @@ class CompetitorPriceFeeder: attempt_status="refresh_low_score", best_product=best_product, best_score=score, + diagnostics=diagnostics, error_message=_format_match_diagnostics(diagnostics), source=source, ) @@ -1175,6 +1346,7 @@ class CompetitorPriceFeeder: attempt_status="refresh_needs_review", best_product=best_product, best_score=score, + diagnostics=diagnostics, error_message=f"{write_reason}; {_format_match_diagnostics(diagnostics)}", source=source, ) @@ -1190,6 +1362,7 @@ class CompetitorPriceFeeder: tags, momo_product_id=momo_product_id, momo_price=momo_price, + diagnostics=diagnostics, source=source, ) self._record_match_attempt( @@ -1202,6 +1375,7 @@ class CompetitorPriceFeeder: attempt_status="matched", best_product=best_product, best_score=score, + diagnostics=diagnostics, source=source, ) matched += 1 diff --git a/services/marketplace_product_matcher.py b/services/marketplace_product_matcher.py index 44a5afb..d43790f 100644 --- a/services/marketplace_product_matcher.py +++ b/services/marketplace_product_matcher.py @@ -93,6 +93,17 @@ GENERIC_TOKENS = { "paris", } +BRAND_ALIAS_OVERRIDES = { + "clarins": ("克蘭詩", "clarins"), + "nars": ("nars",), + "relove": ("relove",), + "stadler form": ("stadler form", "stadlerform"), + "cetaphil": ("舒特膚", "cetaphil"), + "sisley": ("希思黎", "sisley"), + "gennies": ("奇妮", "gennies"), + "uruhimemomoko": ("潤姬桃子", "uruhimemomoko", "uruhime momoko"), +} + PRODUCT_TYPES = { "精華": ("精華", "精華液", "essence", "serum", "安瓶"), "化妝水": ("化妝水", "機能水", "toner", "lotion"), @@ -109,9 +120,10 @@ PRODUCT_TYPES = { "保健": ("錠", "膠囊", "粉", "飲", "包", "健康食品"), } -COUNT_UNITS = {"入", "組", "瓶", "支", "條", "盒", "包", "片", "顆", "錠", "枚", "件"} -PIECE_UNITS = {"包", "片", "顆", "錠", "枚"} -CONTAINER_UNITS = {"入", "組", "盒"} +COUNT_UNITS = {"入", "組", "瓶", "支", "條", "盒", "包", "袋", "片", "顆", "粒", "錠", "枚", "件", "罐", "杯", "本"} +PIECE_UNITS = {"包", "袋", "片", "顆", "粒", "錠", "枚"} +CONTAINER_UNITS = {"入", "組", "盒", "罐", "杯", "本", "瓶", "支", "條", "件"} +ENGLISH_COUNT_UNIT_RE = r"(?:pcs?|pieces?|capsules?|caps?|tablets?|tabs?|packs?|sachets?|bottles?|boxes?)" BUNDLE_OFFER_PHRASES = ( "囤貨組", "超值組", @@ -283,7 +295,16 @@ def _known_brand_tokens(text: str) -> set[str]: BRAND_ALIASES = {} BRAND_NORMALIZE_MAP = {} - for alias, canonical in BRAND_NORMALIZE_MAP.items(): + alias_map = dict(BRAND_NORMALIZE_MAP) + alias_groups = {canonical: list(aliases) for canonical, aliases in BRAND_ALIASES.items()} + for canonical, aliases in BRAND_ALIAS_OVERRIDES.items(): + alias_groups.setdefault(canonical, []) + alias_groups[canonical].extend(aliases) + alias_map[canonical.lower()] = canonical + for alias in aliases: + alias_map[alias.lower()] = canonical + + for alias, canonical in alias_map.items(): alias_norm = normalize_product_text(alias) if alias_norm and alias_norm in text: tokens.add(canonical) @@ -291,7 +312,7 @@ def _known_brand_tokens(text: str) -> set[str]: token for token in _tokenize(alias_norm) if not re.fullmatch(r"[a-z]{1,2}", token) ) - for related in BRAND_ALIASES.get(canonical, []): + for related in alias_groups.get(canonical, []): tokens.update( token for token in _tokenize(normalize_product_text(related)) if not re.fullmatch(r"[a-z]{1,2}", token) @@ -364,16 +385,25 @@ def _extract_specs( dosages_mg.append(number) counts: list[tuple[int, str]] = [] - for match in re.finditer(r"(\d+)\s*([入組瓶支條盒包片顆錠枚件])", text): + for match in re.finditer(r"(\d+)\s*([入組瓶支條盒包袋片顆粒錠枚件罐杯本])", text): counts.append((int(match.group(1)), match.group(2))) - for match in re.finditer(r"([一二兩雙三四五六七八九十])\s*([入組瓶支條盒包片顆錠枚件])", text): + for match in re.finditer(r"([一二兩雙三四五六七八九十])\s*([入組瓶支條盒包袋片顆粒錠枚件罐杯本])", text): counts.append((CHINESE_COUNT[match.group(1)], match.group(2))) - for match in re.finditer(r"(?:x|乘)\s*(\d+)\s*([入組瓶支條盒包片顆錠枚件])?", text, re.I): + for match in re.finditer(r"(?:x|乘)\s*(\d+)\s*([入組瓶支條盒包袋片顆粒錠枚件罐杯本])?", text, re.I): unit = match.group(2) or "入" counts.append((int(match.group(1)), unit)) + for match in re.finditer(rf"(\d+)\s*{ENGLISH_COUNT_UNIT_RE}", text, re.I): + counts.append((int(match.group(1)), "入")) + buy_get = re.search(r"買\s*(\d+|[一二兩雙三四五六七八九十])\s*送\s*(\d+|[一二兩雙三四五六七八九十])", text) + if buy_get: + total_count = (_count_text_value(buy_get.group(1)) or 0) + (_count_text_value(buy_get.group(2)) or 0) + if total_count > 1: + counts.append((total_count, "入")) + if "買一送一" in text or "買1送1" in text: + counts.append((2, "入")) total_piece_count = None - explicit_total = re.search(r"共\s*(\d+)\s*([包片顆錠枚])", text) + explicit_total = re.search(r"共\s*(\d+)\s*([包袋片顆粒錠枚])", text) if explicit_total: total_piece_count = int(explicit_total.group(1)) else: @@ -410,7 +440,7 @@ def parse_product_identity(name: str) -> ProductIdentity: } core_tokens -= brand_tokens - volumes_ml, weights_g, dosages_mg, counts, total_piece_count = _extract_specs(searchable) + volumes_ml, weights_g, dosages_mg, counts, total_piece_count = _extract_specs(normalized) return ProductIdentity( original_name=name or "", normalized_name=normalized, @@ -428,14 +458,24 @@ def parse_product_identity(name: str) -> ProductIdentity: def _weighted_token_score(left: ProductIdentity, right: ProductIdentity) -> float: - left_tokens = left.brand_tokens | left.core_tokens - right_tokens = right.brand_tokens | right.core_tokens + def expand_tokens(identity: ProductIdentity) -> set[str]: + tokens = set(identity.brand_tokens | identity.core_tokens) + for token in identity.core_tokens: + chinese = "".join(char for char in token if "\u4e00" <= char <= "\u9fff") + if len(chinese) >= 3: + tokens.update(f"zh:{chinese[i:i + 2]}" for i in range(len(chinese) - 1)) + return tokens + + left_tokens = expand_tokens(left) + right_tokens = expand_tokens(right) if not left_tokens or not right_tokens: return SequenceMatcher(None, left.searchable_name, right.searchable_name).ratio() * 0.6 def weight(token: str) -> float: if token in left.brand_tokens or token in right.brand_tokens: return 1.4 + if token.startswith("zh:"): + return 0.55 if re.search(r"\d", token): return 1.2 if len(token) >= 4: @@ -943,7 +983,23 @@ def build_search_terms(name: str, max_terms: int = 3) -> list[str]: identity = parse_product_identity(name) terms: list[str] = [] - brand_part = " ".join(sorted(identity.brand_tokens))[:24] + def primary_brand_phrase() -> str: + chinese = sorted( + (token for token in identity.brand_tokens if re.search(r"[\u4e00-\u9fff]", token)), + key=lambda token: (-len(token), token), + ) + if chinese: + return chinese[0] + latin = sorted( + ( + token for token in identity.brand_tokens + if re.search(r"[a-z]", token) and len(token) >= 3 and token not in GENERIC_TOKENS + ), + key=lambda token: (" " not in token and "-" not in token, -len(token), token), + ) + return latin[0] if latin else "" + + brand_part = primary_brand_phrase() core = " ".join(sorted(identity.core_tokens, key=lambda token: (-len(token), token))[:4]) specs = [] if identity.volumes_ml: @@ -957,9 +1013,13 @@ def build_search_terms(name: str, max_terms: int = 3) -> list[str]: if identity.total_piece_count: specs.append(f"{identity.total_piece_count}包") + spec_part = " ".join(specs) + core_tokens = sorted(identity.core_tokens, key=lambda token: (-len(token), token)) + core_short = " ".join(core_tokens[:2]) for value in ( - " ".join(part for part in (brand_part, core, " ".join(specs)) if part), - " ".join(part for part in (brand_part, core) if part), + " ".join(part for part in (brand_part, core_short, spec_part) if part), + " ".join(part for part in (brand_part, core_short) if part), + " ".join(part for part in (core_short, spec_part) if part), identity.searchable_name, ): cleaned = re.sub(r"[^\w\u4e00-\u9fff]+", " ", value) diff --git a/templates/dashboard_v2.html b/templates/dashboard_v2.html index 8ff7095..c654c2a 100644 --- a/templates/dashboard_v2.html +++ b/templates/dashboard_v2.html @@ -421,13 +421,20 @@ {% elif competitor and competitor.product_id %} PChome {{ competitor.product_id }} {% else %} - PChome {{ match_status.label | default('待比對') }} + + PChome {{ match_status.label | default('尚未搜尋') }} + {% endif %} {% if competitor and competitor.product_name %}