From d5b41ec11beb80310fb655d35ce0578eea807be6 Mon Sep 17 00:00:00 2001 From: OoO Date: Thu, 21 May 2026 11:57:18 +0800 Subject: [PATCH] =?UTF-8?q?[V10.359]=20=E5=B0=8E=E5=85=A5=20browse.sh=20?= =?UTF-8?q?=E8=A8=BA=E6=96=B7=E8=88=87=E8=89=B2=E8=99=9F=E9=98=B2=E9=8C=AF?= =?UTF-8?q?=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AGENTS.md | 1 + TODO_NEXT_STEPS.txt | 2 + config.py | 2 +- docs/AI_INTELLIGENCE_MODULE_SOT.md | 1 + docs/guides/browse_sh_crawler_playbook.md | 53 ++++++ .../code_modularization_inventory_20260430.md | 11 +- docs/memory/history_logs.md | 10 + scripts/tools/browse_sh_probe.py | 42 +++++ services/browse_sh_tool.py | 172 ++++++++++++++++++ services/competitor_price_feeder.py | 34 +++- services/marketplace_product_matcher.py | 86 ++++++++- services/pchome_crawler.py | 6 +- tests/test_browse_sh_tool.py | 81 +++++++++ ...t_competitor_match_attempts_persistence.py | 48 +++++ tests/test_marketplace_product_matcher.py | 51 ++++++ 15 files changed, 587 insertions(+), 13 deletions(-) create mode 100644 docs/guides/browse_sh_crawler_playbook.md create mode 100755 scripts/tools/browse_sh_probe.py create mode 100644 services/browse_sh_tool.py create mode 100644 tests/test_browse_sh_tool.py diff --git a/AGENTS.md b/AGENTS.md index 5ae0a7f..e885f5f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -139,6 +139,7 @@ - 前端更版路線圖: `docs/guides/frontend_upgrade_roadmap.md` - AI 觀測台 UI 治理: `docs/guides/observability_ui_governance.md` - AI 自動化 Session SOP: `docs/guides/ai_automation_session_sop.md` +- Browse.sh 爬蟲診斷手冊: `docs/guides/browse_sh_crawler_playbook.md` - AI 競價情報 SOT: `docs/AI_INTELLIGENCE_MODULE_SOT.md` - Agent 角色矩陣: `docs/guides/codex_agent_roles.md` - ADR 索引: `docs/adr/README.md` diff --git a/TODO_NEXT_STEPS.txt b/TODO_NEXT_STEPS.txt index ee51a85..d62cec0 100644 --- a/TODO_NEXT_STEPS.txt +++ b/TODO_NEXT_STEPS.txt @@ -4,6 +4,7 @@ ================================================================================ 【已完成】 + - V10.359 導入 Browse.sh 可選爬蟲診斷與強化 MOMO/PChome 色號比對:新增 `BrowseShTool` wrapper、probe CLI 與操作手冊,讓 browse.sh 只作 selector/XHR/network trace 探勘、不進正式 scheduler;matcher 補護甲油/洗手慕斯/足膜精準搜尋詞,保留小數規格,並對唇釉、妝前乳、素顏霜等顯性色號/色系不一致候選做 hard veto,避免同系列不同色號污染正式價差。 - V10.355 補市場情報 candidate queue review AI summary Telegram dispatch report catalog record archive summary gate:新增 read-only report catalog record archive summary builder、POST endpoint、UI 按鈕與 deployment readiness smoke target,在 archive gate 後整理 catalog record identity、artifact traceability、DB commit/post-write smoke、archive manifest/retention policy 與後續 final closeout separate gate;API/UI 不讀 approval/Telegram token、不呼叫 LLM、不產報表、不派送 Telegram、不開 DB、不寫檔、不執行 CLI、不寫 catalog record、不 commit、不更新 review_state、不掛 scheduler。 - V10.354 補市場情報 candidate queue review AI summary Telegram dispatch report catalog record archive gate:新增 read-only report catalog record archive builder、POST endpoint、UI 按鈕與 deployment readiness smoke target,在 closeout gate 後審核 closeout/commit/run receipt/writer output/post-write smoke/backup 封存證據、archive manifest/retention policy 與後續 archive summary separate gate;API/UI 不讀 approval/Telegram token、不呼叫 LLM、不產報表、不派送 Telegram、不開 DB、不寫檔、不執行 CLI、不寫 catalog record、不 commit、不更新 review_state、不掛 scheduler。 - V10.353 補市場情報 candidate queue review AI summary Telegram dispatch report catalog record closeout gate:新增 read-only report catalog record closeout builder、POST endpoint、UI 按鈕與 deployment readiness smoke target,在 commit gate 後審核 catalog record identity、DB commit/post-write smoke 證據、操作員 closeout 確認與後續 archive separate gate;API/UI 不讀 approval/Telegram token、不呼叫 LLM、不補產報表、不派送 Telegram、不開 DB、不寫檔、不執行 CLI、不寫 catalog record、不 commit、不更新 review_state、不掛 scheduler。 @@ -16,6 +17,7 @@ - V10.339 補市場情報 candidate queue review AI summary Telegram dispatch report catalog index:新增 read-only report catalog index builder、POST endpoint、UI 按鈕與 deployment readiness smoke target,在 report catalog handoff 後整理 catalog index identity、handoff source trace、index manifest 與 runtime safety,只放行到後續 report catalog write preflight gate;API/UI 不讀 approval/Telegram token、不呼叫 LLM、不補產報表、不派送 Telegram、不開 DB、不寫 catalog index file、不寫 catalog record、不更新 review_state、不掛 scheduler。 - V10.338 補市場情報 candidate queue review AI summary Telegram dispatch report catalog handoff:新增 read-only report catalog handoff builder、POST endpoint、UI 按鈕與 deployment readiness smoke target,在 report archive summary 後整理 catalog identity、artifact manifest、section keys 與 hash traceability,只放行到後續 report catalog index gate;API/UI 不讀 approval/Telegram token、不呼叫 LLM、不補產報表、不派送 Telegram、不開 DB、不寫 catalog record、不更新 review_state、不掛 scheduler。 - V10.335 補市場情報 candidate queue review AI summary Telegram dispatch report archive summary:新增 read-only report archive summary builder、POST endpoint、UI 按鈕與 deployment readiness smoke target,在 report archive 後整理 report identity、archive traceability、integrity review 與 runtime safety sections,只放行到後續 report catalog handoff gate;API/UI 不讀 approval/Telegram token、不呼叫 LLM、不補產報表、不派送 Telegram、不開 DB、不寫檔、不更新 review_state、不掛 scheduler。 + - V10.334 強化 MOMO/PChome 核心比價第二波:matcher 補常見品牌 alias、任選/平輸/國別 noise 收斂、刀把/刀片/刀頭等件數解析與系列衝突硬否決,避免 Gillette/Schick 同品牌不同系列或刀片數被誤當同款;新增近門檻候選重新評分流程,會把舊 low_score 中 0.70 以上且非 hard veto、有 PChome product_id 的候選先批次重評,再補抓高價未配對商品;商品看板新增 PChome 補抓產線狀態卡,顯示 run id 階段、成功/低信心/無結果/挑品寫入與錯誤,不再讓核心比價補抓變成黑盒。 - V10.333 補市場情報 candidate queue review AI summary Telegram dispatch report archive:新增 read-only report archive builder、POST endpoint、UI 按鈕與 deployment readiness smoke target,在 report closeout 後審核 archive/closeout/receipt/report output artifact path、hash/章節、archive manifest、retention policy 與後續 archive summary separate gate;API/UI 不讀 approval/Telegram token、不呼叫 LLM、不補產報表、不派送 Telegram、不開 DB、不寫檔、不更新 review_state、不掛 scheduler。 - V10.332 補市場情報 candidate queue review AI summary Telegram dispatch report closeout:新增 read-only report closeout builder、POST endpoint、UI 按鈕與 deployment readiness smoke target,在 report run receipt 後審核 closeout artifact、receipt/report artifact path、hash/章節覆核、後續 report archive separate gate 與 runtime boundary;API/UI 不讀 approval/Telegram token、不呼叫 LLM、不補產報表、不派送 Telegram、不開 DB、不寫檔、不更新 review_state、不掛 scheduler。 - V10.331 強化 PChome 比價 matcher 邊界:正式端 pilot 先刷新 30 筆過期 identity_v2、補抓 15 筆高價未配對 SKU,確認 structured diagnostics 開始寫入;針對「同品牌、同核心多組件、無任何否決理由、價格正常」但因規格文字不完整卡在 0.74x 的候選,新增 strong_component_line_match 窄門補分,避免 Gennies 類完整套組被誤留在低信心,同時維持雙入組對單品、容量衝突、不同品牌與補充瓶硬否決。 diff --git a/config.py b/config.py index f4f9336..acc4a00 100644 --- a/config.py +++ b/config.py @@ -323,7 +323,7 @@ YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY', '') # ========================================== # 系統版本與路徑 # ========================================== -SYSTEM_VERSION = "V10.358" +SYSTEM_VERSION = "V10.359" LOG_FILE_PATH = os.path.join(BASE_DIR, 'logs/system.log') public_url = PUBLIC_URL # 用於模板顯示 diff --git a/docs/AI_INTELLIGENCE_MODULE_SOT.md b/docs/AI_INTELLIGENCE_MODULE_SOT.md index a534ab2..8924c61 100644 --- a/docs/AI_INTELLIGENCE_MODULE_SOT.md +++ b/docs/AI_INTELLIGENCE_MODULE_SOT.md @@ -312,6 +312,7 @@ LIMIT 300 | 最低比對門檻 | 0.76 | 核心比價寧可待審,不允許低信心錯配影響 AI 決策 | | 已有不同 PChome 商品覆蓋門檻 | 0.84 | 新候選與既有正式配對不同時,除非超高信心,否則寫入 `needs_review` attempt 不覆蓋 | | 單位價可比模式 | `unit_comparable` | 同核心商品但買送/套組/件數不同時,不寫正式總價差;只寫入 attempt,並以單位價證據供 Dashboard / PPT / AI 報表與人工覆核 | +| Browse.sh 診斷 | optional wrapper | 只用於 selector / XHR / network trace 探勘;不得取代正式 crawler,也不得直接把輸出寫成正式競品價格 | | 語意標籤 | JSONB 陣列 | 傳給 Hermes 提升情境感知品質 | ### 競品比對邏輯(`competitor_price_feeder.py`) diff --git a/docs/guides/browse_sh_crawler_playbook.md b/docs/guides/browse_sh_crawler_playbook.md new file mode 100644 index 0000000..4490592 --- /dev/null +++ b/docs/guides/browse_sh_crawler_playbook.md @@ -0,0 +1,53 @@ +# Browse.sh Crawler Playbook + +> Scope: MOMO / PChome 動態頁診斷、selector 探勘、XHR/network trace。正式價格資料仍以既有 Python crawler、PChome API 與資料庫閉環為準。 + +## 評估結論 + +- `browse.sh` 是 Browserbase 提供的 browser CLI,主打 open web catalog、browser primitives、debugging、cloud sessions 與 network/console tail。 +- 官方安裝入口為 `npm install -g browse`,CLI 名稱是 `browse`。 +- 對本專案最有價值的地方不是取代爬蟲,而是當 MOMO/PChome HTML 或前端 XHR 改版時,快速看 selector、console、network 與可重放 skill。 +- 本機目前 Node 16 因 `icu4c` 動態庫缺失無法啟動,不能在本機直接安裝或執行 `browse`。導入方式先採 optional wrapper,不影響 production scheduler。 + +## 使用邊界 + +- 只允許做 read-only 診斷,不登入、不下單、不加入購物車、不寫第三方狀態。 +- 不把 `browse` 放進排程主路徑;若未來要排程化,需另開 ADR 與 feature flag。 +- 不把 `browse` 輸出直接寫成正式價格;所有正式比價仍需經 `services/pchome_crawler.py`、`services/momo_crawler.py`、`services/marketplace_product_matcher.py` 與 `competitor_match_attempts` 診斷。 +- Cloud sessions 可能需要 Browserbase 帳號與憑證;憑證不可寫入 repo。 + +## 本地檢查 + +```bash +python scripts/tools/browse_sh_probe.py +``` + +可用時會輸出 `available=true` 與版本;不可用時會輸出原因。若 `browse` 不在 PATH,可用: + +```bash +BROWSE_SH_CLI=/path/to/browse python scripts/tools/browse_sh_probe.py +``` + +## 診斷流程 + +1. 先用既有 crawler/API 重現問題,保存失敗 SKU、搜尋詞、候選網址與 matcher diagnostics。 +2. 使用 `browse` 開同一頁,查看搜尋結果 DOM、network 與 console。 +3. 若找到穩定 XHR/API,優先回補到 Python crawler;若只能走 DOM,才更新 selector。 +4. 新增或更新單元測試,至少覆蓋搜尋詞、候選解析與 matcher hard veto。 +5. 只在測試通過後部署 app/scheduler/bot;不得重建或重啟 `momo-db`。 + +## 常用命令 + +```bash +python scripts/tools/browse_sh_probe.py -- --version +python scripts/tools/browse_sh_probe.py -- skills list +python scripts/tools/browse_sh_probe.py -- open "https://24h.pchome.com.tw/" +python scripts/tools/browse_sh_probe.py -- network --tail +python scripts/tools/browse_sh_probe.py -- screenshot +``` + +## MOMO/PChome 導入策略 + +- PChome:目前已有搜尋 API 與商品 API,`browse` 只用於確認 API 參數、分頁行為、前端是否切新 endpoint。 +- MOMO:若既有 BeautifulSoup selector 失效,先用 `browse` 找出前端實際 XHR;找到 API 時優先改成 structured API parser。 +- Matcher:`browse` 只提供候選證據;是否為同款仍由 `marketplace_product_matcher.score_marketplace_match()` 決定。 diff --git a/docs/memory/code_modularization_inventory_20260430.md b/docs/memory/code_modularization_inventory_20260430.md index 2a35d94..c9118b9 100644 --- a/docs/memory/code_modularization_inventory_20260430.md +++ b/docs/memory/code_modularization_inventory_20260430.md @@ -4,10 +4,10 @@ ## 盤點結論 -- Python 總量:約 90,293 行(排除 `venv/`、`backups/`、`__pycache__/`、`.claude/worktrees/`)。 -- 最大壓力區:`services/` 約 42,364 行、`routes/` 約 29,511 行。 +- Python 總量:約 139,476 行(排除 `venv/`、`backups/`、`__pycache__/`、`.claude/worktrees/`)。 +- 最大壓力區:`services/` 約 84,159 行、`routes/` 約 36,245 行。 - `app.py` 目前約 1,232 行,功能定位應固定為 bootstrap / Blueprint registration / startup guard,不再承接新 route。 -- 目前工作樹仍有 29 個 Python 檔案達到或超過 800 行;這些不是禁止修 bug,而是禁止繼續塞新功能。 +- 目前工作樹仍有 33 個 Python 檔案達到或超過 800 行;這些不是禁止修 bug,而是禁止繼續塞新功能。 - 2026-05-05 追記:Phase 38→56 觀測台戰役讓 `routes/admin_observability_routes.py` 與 `run_scheduler.py` 進入大檔治理清單;後續觀測台功能應先抽 query/action service,不再把新 SQL 與 L2 mutation 直接塞回 route。 - 2026-05-06 追記:跨平台市場情報模組啟動前,必須先把新增爬蟲、排程、DB schema、UI route 全部隔離在 `market_*` / `services/market_intel/` / `routes/market_intel_routes.py`,不可塞回既有大檔。 - 2026-05-18 追記:Phase 42 市場情報只在 `app.py` 的 `EXPECTED_METADATA_TABLES` 補上 `market_alert_review_queue` 名稱,未新增 route / bootstrap 邏輯;後續仍應把 metadata verification 抽到 app factory 或 startup guard module,避免 `app.py` 繼續承接功能。 @@ -32,6 +32,7 @@ - 2026-05-20 追記:同步 PChome contained identity anchor scorer 更新後的 `services/marketplace_product_matcher.py` 行數;此處只更新 inventory,不變更模組化決策。 - 2026-05-20 追記:同步 PChome spec/name alignment near-threshold scorer 更新後的 `services/marketplace_product_matcher.py` 行數;此處只更新 inventory,不變更模組化決策。 - 2026-05-20 追記:同步市場情報 review report route 與 review receipt 巨檔現況,並校正 PChome fresh-search recovery 更新後的 `services/competitor_price_feeder.py`、`services/marketplace_product_matcher.py` 行數;此處只更新 inventory,不變更模組化決策。 +- 2026-05-21 追記:同步 Browse.sh 診斷導入、PChome 變體搜尋與色號防錯配更新後的 `services/marketplace_product_matcher.py` 行數,並校正市場情報 review report route 目前行數;此處只更新 inventory,不變更模組化決策。 ## 達到或超過 800 行檔案清單 @@ -58,7 +59,7 @@ | 940 | `services/import_service.py` | P2 import service | validators / import writers / report builders | | 933 | `services/telegram_templates.py` | P2 Telegram templates | alert template groups / channel-specific formatting / reusable render helpers | | 867 | `services/token_report_service.py` | P2 token report service | query / aggregation / chart payload / notification formatting | -| 1559 | `services/marketplace_product_matcher.py` | P2 marketplace matcher | identity parsing / unit-comparable scoring / search term quality / persistence normalization | +| 1902 | `services/marketplace_product_matcher.py` | P2 marketplace matcher | identity parsing / unit-comparable scoring / search term quality / persistence normalization | | 865 | `routes/daily_sales_routes.py` | P2 Daily Sales Blueprint | route glue / export helpers / daily query and formatting service | | 844 | `services/ollama_service.py` | P2 Ollama client | host health / request client / fallback policy / response parsing | | 849 | `services/pchome_crawler.py` | P2 PChome crawler | search fetch / parsing / fallback source handling / rate limit policy | @@ -68,7 +69,7 @@ | 1733 | `services/competitor_price_feeder.py` | P2 competitor price feeder | crawler scheduling / price normalization / retryable candidate recovery / cache strategy | | 1120 | `services/competitor_intel_repository.py` | P2 competitor intel repository | review queue query / cache shaping / formatting helpers | | 805 | `routes/bot_api_routes.py` | P2 Bot API Blueprint | route glue / bot action service | -| 990 | `routes/market_intel_review_report_routes.py` | P2 market intel review report Blueprint | review report route glue / export payload / phase handoff orchestration | +| 1319 | `routes/market_intel_review_report_routes.py` | P2 market intel review report Blueprint | review report route glue / export payload / phase handoff orchestration | | 811 | `services/market_intel/deployment_readiness.py` | P2 market intel deployment readiness | preflight gates / readiness payload / route contract helpers | | 846 | `services/market_intel/candidate_queue_review_ai_summary_persistence_telegram_dispatch_report_catalog_record_run_receipt.py` | P2 market intel review receipt pipeline | AI summary / persistence / Telegram dispatch / report catalog run receipt orchestration | diff --git a/docs/memory/history_logs.md b/docs/memory/history_logs.md index fa9b3bf..8a3fe54 100644 --- a/docs/memory/history_logs.md +++ b/docs/memory/history_logs.md @@ -12,6 +12,11 @@ ## 📅 詳細更新日誌 (考古存檔) +### 2026-05-21:Browse.sh 爬蟲診斷與 PChome 色號比對強化 +- **V10.359 Browse.sh optional diagnostics**: 新增 `services/browse_sh_tool.py` 與 `scripts/tools/browse_sh_probe.py`,可檢查或執行 `browse` CLI;目前只定位為 MOMO/PChome selector、XHR 與 network trace 探勘,不進 scheduler 主路徑,也不直接寫正式競品價格。 +- **MOMO/PChome matcher 色號防錯配**: `marketplace_product_matcher.py` 補護甲油、洗手慕斯、足膜精準搜尋,搜尋詞保留 `4.2ml` 這類小數規格;唇釉、妝前乳、素顏霜等顯性色號/色系不一致時會 hard veto,避免同系列不同色號被推成正式價差。 +- **導入限制**: 本機 Node 16 目前因 `icu4c` 動態庫缺失無法啟動,browse.sh 需待 Node 修復或於乾淨主機安裝後才可實跑;repo 內先保留 optional wrapper、測試與 playbook。 + ### 2026-05-21:市場情報 Telegram dispatch report catalog record archive summary gate - **V10.355 report catalog record archive summary gate**: 新增 `candidate_queue_review_ai_summary_persistence_telegram_dispatch_report_catalog_record_archive_summary` service、POST endpoint、UI 按鈕與 deployment readiness smoke target,在 archive gate 後整理 catalog record identity、artifact traceability、DB commit/post-write smoke、archive manifest/retention policy 與後續 final closeout separate gate。 - **只讀安全邊界**: 本階段只放行到後續 report catalog record final closeout gate;API/UI 不讀 approval/Telegram token、不呼叫 LLM、不產報表、不派送 Telegram、不開 DB、不寫檔、不執行 CLI、不寫 catalog record、不 commit、不更新 `review_state`、不掛 scheduler。 @@ -60,6 +65,11 @@ - **V10.335 report archive summary gate**: 新增 `candidate_queue_review_ai_summary_persistence_telegram_dispatch_report_archive_summary` service、POST endpoint、UI 按鈕與 deployment readiness smoke target,在 report archive 後整理 report identity、archive traceability、integrity review 與 runtime safety sections。 - **只讀安全邊界**: 本階段只放行到後續 report catalog handoff gate;API/UI 不讀 approval/Telegram token、不呼叫 LLM、不補產報表、不派送 Telegram、不開 DB、不寫檔、不更新 `review_state`、不掛 scheduler。 +### 2026-05-20:PChome 核心比價準確率與補抓可觀測性 +- **V10.334 MOMO/PChome matcher 強化**: 補常見品牌 alias 與任選/平輸/國別 noise 收斂,並加入刀把/刀片/刀頭件數解析、同品牌不同系列硬否決,避免為了提高覆蓋率把 Gillette/Schick 等不同系列錯配成同款。 +- **近門檻候選重新評分**: `CompetitorPriceFeeder` 新增 `run_retryable_candidate_revalidation()`,優先重評舊 `low_score` 中 0.70 以上、非 hard veto 且有 PChome product_id 的候選,再補抓高價未配對商品;排程與手動 API 都會納入這段流程。 +- **PChome 補抓產線狀態**: 商品看板新增 PChome backfill status card,後端以 JSON 狀態檔記錄 queued/revalidating/matching/generating_picks/clearing_cache/completed/failed、結果統計與最近 run,避免手動補抓後沒有進度與錯誤上下文。 + ### 2026-05-20:重開機後首頁熱路徑索引持久化 - **Dashboard / PChome 慢查詢修復**: 主機重開機後 `https://mo.wooo.work/` 首頁可用但多次逾時,實際瓶頸集中在首頁與 PChome coverage 查詢掃描 `products`、`price_records`、`competitor_match_attempts`。線上先補三個索引讓首頁恢復 200,並新增 `migrations/040_dashboard_hot_path_indexes.sql` 將修復持久化到 fresh restore / DB rebuild 流程。 - **Growth Analysis 冷快取修復**: `/growth_analysis` 在 `monthly_summary_analysis` 落後時會改掃 `realtime_sales_monthly` 聚合,冷計算約 14 秒;修正為 source fingerprint 未變時延長共享快取有效期,匯入流程仍主動清除快取,避免資料未變卻反覆掃大表。 diff --git a/scripts/tools/browse_sh_probe.py b/scripts/tools/browse_sh_probe.py new file mode 100755 index 0000000..db496df --- /dev/null +++ b/scripts/tools/browse_sh_probe.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""檢查或執行可選的 browse.sh CLI。""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[2] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +from services.browse_sh_tool import BrowseShTool # noqa: E402 + + +def main() -> int: + parser = argparse.ArgumentParser(description="檢查 browse.sh CLI,或執行一次診斷命令。") + parser.add_argument("--cli", help="browse CLI 路徑;預設使用 BROWSE_SH_CLI 或 PATH。") + parser.add_argument("--timeout", type=int, default=90, help="執行逾時秒數。") + parser.add_argument( + "browse_args", + nargs=argparse.REMAINDER, + help="要傳給 browse 的參數;未提供時只輸出 availability。", + ) + args = parser.parse_args() + + tool = BrowseShTool(cli_path=args.cli, timeout_seconds=args.timeout) + if not args.browse_args: + print(json.dumps(tool.availability().as_dict(), ensure_ascii=False, indent=2)) + return 0 + + browse_args = tuple(arg for arg in args.browse_args if arg != "--") + result = tool.run(browse_args, timeout_seconds=args.timeout) + print(json.dumps(result.as_dict(), ensure_ascii=False, indent=2)) + return 0 if result.ok else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/services/browse_sh_tool.py b/services/browse_sh_tool.py new file mode 100644 index 0000000..9e1d77a --- /dev/null +++ b/services/browse_sh_tool.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""browse.sh CLI 的可選執行 wrapper。 + +正式爬蟲仍以既有 Python/API client 為準;browse.sh 只用於動態頁面診斷、 +selector 探勘與 network trace,避免把外部 CLI 變成 scheduler 的硬依賴。 +""" + +from __future__ import annotations + +import os +import shutil +import subprocess +from dataclasses import dataclass +from typing import Mapping, Sequence + + +BROWSE_SH_CLI_ENV = "BROWSE_SH_CLI" +DEFAULT_TIMEOUT_SECONDS = 90 + + +@dataclass(frozen=True) +class BrowseShAvailability: + available: bool + command: tuple[str, ...] + reason: str = "" + version: str = "" + + def as_dict(self) -> dict: + return { + "available": self.available, + "command": list(self.command), + "reason": self.reason, + "version": self.version, + } + + +@dataclass(frozen=True) +class BrowseShResult: + ok: bool + command: tuple[str, ...] + stdout: str = "" + stderr: str = "" + returncode: int | None = None + timed_out: bool = False + unavailable_reason: str = "" + + def as_dict(self) -> dict: + return { + "ok": self.ok, + "command": list(self.command), + "stdout": self.stdout, + "stderr": self.stderr, + "returncode": self.returncode, + "timed_out": self.timed_out, + "unavailable_reason": self.unavailable_reason, + } + + +class BrowseShTool: + """browse CLI 的最小安全包裝。""" + + def __init__( + self, + cli_path: str | None = None, + timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS, + env: Mapping[str, str] | None = None, + ) -> None: + self.cli_path = cli_path + self.timeout_seconds = timeout_seconds + self.env = dict(env or {}) + + def resolve_cli_path(self) -> str | None: + override = self.cli_path or os.getenv(BROWSE_SH_CLI_ENV) + if override: + return override + return shutil.which("browse") + + def build_command(self, args: Sequence[str]) -> tuple[str, ...]: + cli_path = self.resolve_cli_path() + if not cli_path: + return tuple() + return (cli_path, *[str(arg) for arg in args]) + + def availability(self) -> BrowseShAvailability: + command = self.build_command(("--version",)) + if not command: + return BrowseShAvailability( + available=False, + command=tuple(), + reason="browse CLI 未安裝;請先安裝並確認 PATH 可找到 browse。", + ) + try: + completed = subprocess.run( + command, + capture_output=True, + check=False, + env={**os.environ, **self.env}, + text=True, + timeout=8, + ) + except FileNotFoundError: + return BrowseShAvailability(False, command, "browse CLI 路徑不存在。") + except subprocess.TimeoutExpired: + return BrowseShAvailability(False, command, "browse --version 執行逾時。") + except OSError as exc: + return BrowseShAvailability(False, command, f"browse CLI 無法啟動:{exc}") + + stdout = (completed.stdout or "").strip() + stderr = (completed.stderr or "").strip() + if completed.returncode != 0: + reason = stderr or stdout or f"browse --version 回傳 {completed.returncode}" + return BrowseShAvailability(False, command, reason) + return BrowseShAvailability(True, command, version=stdout or stderr) + + def run( + self, + args: Sequence[str], + timeout_seconds: int | None = None, + require_available: bool = True, + ) -> BrowseShResult: + command = self.build_command(args) + if not command: + return BrowseShResult( + ok=False, + command=tuple(), + unavailable_reason="browse CLI 未安裝;此工具只會略過,不影響正式爬蟲。", + ) + if require_available: + availability = self.availability() + if not availability.available: + return BrowseShResult( + ok=False, + command=command, + unavailable_reason=availability.reason, + ) + + try: + completed = subprocess.run( + command, + capture_output=True, + check=False, + env={**os.environ, **self.env}, + text=True, + timeout=timeout_seconds or self.timeout_seconds, + ) + except subprocess.TimeoutExpired as exc: + return BrowseShResult( + ok=False, + command=command, + stdout=exc.stdout or "", + stderr=exc.stderr or "", + timed_out=True, + ) + except OSError as exc: + return BrowseShResult( + ok=False, + command=command, + stderr=str(exc), + unavailable_reason=str(exc), + ) + + return BrowseShResult( + ok=completed.returncode == 0, + command=command, + stdout=completed.stdout or "", + stderr=completed.stderr or "", + returncode=completed.returncode, + ) + + def run_skill(self, skill_name: str, *skill_args: str, timeout_seconds: int | None = None) -> BrowseShResult: + return self.run((skill_name, *skill_args), timeout_seconds=timeout_seconds) diff --git a/services/competitor_price_feeder.py b/services/competitor_price_feeder.py index 87d3e5b..8ce66dd 100644 --- a/services/competitor_price_feeder.py +++ b/services/competitor_price_feeder.py @@ -45,6 +45,7 @@ BATCH_SIZE = 30 # 每批 DB 寫入筆數 RATE_DELAY = float(os.getenv("PCHOME_FEEDER_RATE_DELAY", "1.0")) # 每次 PChome 請求間隔(秒) TTL_HOURS = 6 # competitor_prices 快取有效期 REQUEST_TIMEOUT = float(os.getenv("PCHOME_FEEDER_TIMEOUT", "12")) # 避免外部搜尋 API 長時間卡住排程 +VARIANT_RECALL_SORTS = ("sale/dc", "new/dc") RECOVERABLE_LOW_SCORE_FLOOR = max(MIN_MATCH_SCORE - 0.03, 0.72) RECOVERABLE_DIAGNOSTIC_REASONS = { "strong_product_line_match", @@ -188,6 +189,25 @@ def _build_search_keywords(momo_name: str) -> list: return _dedupe_terms(primary_terms) +def _build_variant_recall_search_plan(momo_name: str, keywords: list[str]) -> list[tuple[str, str | None]]: + plan = [(keyword, None) for keyword in (keywords or [])] + try: + from services.marketplace_product_matcher import parse_product_identity + + identity = parse_product_identity(momo_name) + except Exception: + return plan + + brand_tokens = {token.lower() for token in getattr(identity, "brand_tokens", set())} + if not ({"dashing", "diva"} <= brand_tokens and "美甲片" in getattr(identity, "searchable_name", "")): + return plan + + broad_keyword = "dashing diva 時尚潮流美甲片" + for sort in VARIANT_RECALL_SORTS: + plan.append((broad_keyword, sort)) + return plan + + def _format_match_diagnostics(diagnostics) -> str: if not diagnostics: return "" @@ -294,8 +314,18 @@ def _search_pchome_candidates(crawler, momo_name: str, keywords: list = None, mo candidates = [] seen_ids = set() search_limit = SEARCH_LIMIT * max(1, SEARCH_MAX_PAGES) - for keyword in keywords or _build_search_keywords(momo_name): - ok, _, products = crawler.search_products(keyword, limit=search_limit, max_pages=SEARCH_MAX_PAGES) + active_keywords = keywords or _build_search_keywords(momo_name) + search_plan = _build_variant_recall_search_plan(momo_name, active_keywords) + for keyword, sort in search_plan: + if sort: + ok, _, products = crawler.search_products( + keyword, + limit=search_limit, + max_pages=SEARCH_MAX_PAGES, + sort=sort, + ) + else: + ok, _, products = crawler.search_products(keyword, limit=search_limit, max_pages=SEARCH_MAX_PAGES) if not ok or not products: continue for product in products: diff --git a/services/marketplace_product_matcher.py b/services/marketplace_product_matcher.py index 1fe7c24..c24c699 100644 --- a/services/marketplace_product_matcher.py +++ b/services/marketplace_product_matcher.py @@ -379,6 +379,10 @@ SEARCH_BROAD_ANCHORS = { } VARIANT_SENSITIVE_KEYWORDS = { + "妝前防護乳", + "妝前乳", + "素顏霜", + "粉底", "美甲片", "眼影盤", "唇釉", @@ -394,6 +398,37 @@ VARIANT_SENSITIVE_KEYWORDS = { "遮瑕棒", } +VARIANT_OPTION_COLOR_WORDS = { + "黑色", + "棕色", + "咖啡色", + "灰色", + "白色", + "紅色", + "粉色", + "粉紅", + "桃紅", + "玫瑰", + "玫瑰色", + "珊瑚", + "珊瑚色", + "橘色", + "橙色", + "裸色", + "奶茶色", + "豆沙色", + "紫色", + "薰衣草", + "藍色", + "綠色", + "膚色", + "自然色", + "明亮色", + "透明色", + "極光之藍", + "月光銀影", +} + SEARCH_AMBIGUOUS_PRODUCT_TERMS = { "保護膜", "保護貼", @@ -1314,6 +1349,11 @@ def score_marketplace_match( reasons.append("component_count_conflict") if chinese_name_score < 0.16: reasons.append("product_line_conflict") + shared_anchor = _shared_identity_anchor(left, right) + variant_descriptor_conflict = _has_variant_descriptor_conflict(left, right, shared_anchor) + variant_option_conflict = _has_explicit_variant_option_conflict(left, right, shared_anchor) + if variant_option_conflict: + reasons.append("variant_option_conflict") hard_veto = brand_conflict or spec_conflict if bundle_offer_conflict: @@ -1330,6 +1370,8 @@ def score_marketplace_match( hard_veto = True if left.product_type and right.product_type and left.product_type != right.product_type: hard_veto = True + if variant_option_conflict: + hard_veto = True comparison_mode = "exact_identity" if _is_unit_comparable_candidate( @@ -1370,8 +1412,6 @@ def score_marketplace_match( if token_score >= 0.72 and spec_score >= 0.82 and not brand_conflict: score += 0.08 - shared_anchor = _shared_identity_anchor(left, right) - variant_descriptor_conflict = _has_variant_descriptor_conflict(left, right, shared_anchor) if ( brand_score >= 0.95 @@ -1528,7 +1568,9 @@ def _clean_search_phrase(value: str) -> str: text = normalize_product_text(value) for phrase in sorted(SEARCH_NOISE_PHRASES, key=len, reverse=True): text = text.replace(phrase.lower(), " ") + text = re.sub(r"(?<=\d)\.(?=\d)", "DECIMALPOINT", text) text = re.sub(r"[^\w\u4e00-\u9fff]+", " ", text) + text = text.replace("DECIMALPOINT", ".").replace("decimalpoint", ".") text = " ".join( token for token in text.split() if token not in SEARCH_NOISE_TOKENS and token not in GENERIC_TOKENS @@ -1674,6 +1716,39 @@ def _has_variant_descriptor_conflict(left: ProductIdentity, right: ProductIdenti return True +def _explicit_variant_option_tokens(identity: ProductIdentity) -> set[str]: + text = identity.searchable_name + options: set[str] = set() + for match in re.finditer(r"(?:#|no\.?|色號|號色)\s*([a-z]?\d{1,3}[a-z]?)(?![a-z0-9])", text, re.I): + value = re.sub(r"[^a-z0-9]", "", match.group(1).lower()) + if value: + options.add(value) + for color_word in VARIANT_OPTION_COLOR_WORDS: + if color_word in text: + options.add(color_word) + return options + + +def _has_explicit_variant_option_conflict( + left: ProductIdentity, + right: ProductIdentity, + shared_anchor: str, +) -> bool: + if not _is_variant_sensitive_identity(left, right, shared_anchor): + return False + left_options = _explicit_variant_option_tokens(left) + right_options = _explicit_variant_option_tokens(right) + if not left_options or not right_options: + return False + if left_options & right_options: + return False + for left_option in left_options: + for right_option in right_options: + if left_option in right_option or right_option in left_option: + return False + return True + + def _search_core_score(token: str, all_tokens: set[str]) -> tuple[int, int, str]: cleaned = _clean_search_phrase(token) if not cleaned: @@ -1790,6 +1865,8 @@ def build_search_terms(name: str, max_terms: int = 3) -> list[str]: ) variant_descriptors = sorted(_variant_descriptors(identity), key=lambda token: (len(token), token)) variant_primary = variant_descriptors[0] if variant_descriptors else "" + variant_options = sorted(_explicit_variant_option_tokens(identity), key=lambda token: (len(token), token)) + variant_option_part = " ".join(variant_options[:2]) model_phrases = [ phrase for phrase in core_phrases[1:] @@ -1801,8 +1878,11 @@ def build_search_terms(name: str, max_terms: int = 3) -> list[str]: ) variant_sensitive = any(keyword in identity.searchable_name for keyword in VARIANT_SENSITIVE_KEYWORDS) for value in ( + " ".join(part for part in (brand_part, core_primary, variant_option_part, spec_part) if part) + if variant_sensitive and variant_option_part + else "", " ".join(part for part in (brand_part, core_primary, variant_primary, spec_part) if part) - if variant_sensitive and variant_primary + if variant_sensitive and variant_primary and variant_options else "", " ".join(part for part in (brand_part, modifier_with_primary, spec_part) if part) if modifier_with_primary and identity.product_type and identity.product_type in core_primary diff --git a/services/pchome_crawler.py b/services/pchome_crawler.py index 50fb88d..bb554e1 100644 --- a/services/pchome_crawler.py +++ b/services/pchome_crawler.py @@ -382,6 +382,7 @@ class PChomeCrawler: keyword: str, limit: int = 50, max_pages: Optional[int] = None, + sort: str = "rnk/dc", ) -> Tuple[bool, str, List[PChomeProduct]]: """ 搜尋商品 (使用搜尋 API) @@ -390,6 +391,7 @@ class PChomeCrawler: keyword: 搜尋關鍵字 limit: 最多回傳數量 max_pages: 搜尋結果最多掃描頁數;預設依 limit 最多掃到 3 頁 + sort: 搜尋排序;預設 relevance ranking (`rnk/dc`) Returns: (成功與否, 訊息, 商品資料列表) @@ -407,7 +409,7 @@ class PChomeCrawler: params = { 'q': keyword, 'page': page, - 'sort': 'rnk/dc', + 'sort': sort, 'cateid': '24h', } response = self._get_with_retry(search_url, params=params, timeout=self.timeout) @@ -435,7 +437,7 @@ class PChomeCrawler: # 取得詳細資料 success, message, products = self.fetch_product_details(product_ids[:limit]) if success: - message = f"{message};搜尋頁數 {pages_scanned}" + message = f"{message};搜尋頁數 {pages_scanned};排序 {sort}" return success, message, products except requests.RequestException as e: diff --git a/tests/test_browse_sh_tool.py b/tests/test_browse_sh_tool.py new file mode 100644 index 0000000..27cfb13 --- /dev/null +++ b/tests/test_browse_sh_tool.py @@ -0,0 +1,81 @@ +import subprocess +from types import SimpleNamespace + + +def test_browse_sh_availability_reports_missing_cli(monkeypatch): + from services import browse_sh_tool + from services.browse_sh_tool import BrowseShTool + + monkeypatch.delenv("BROWSE_SH_CLI", raising=False) + monkeypatch.setattr(browse_sh_tool.shutil, "which", lambda _name: None) + + availability = BrowseShTool().availability() + + assert availability.available is False + assert availability.command == tuple() + assert "未安裝" in availability.reason + + +def test_browse_sh_build_command_prefers_env_override(monkeypatch): + from services import browse_sh_tool + from services.browse_sh_tool import BrowseShTool + + monkeypatch.setenv("BROWSE_SH_CLI", "/opt/bin/browse") + monkeypatch.setattr(browse_sh_tool.shutil, "which", lambda _name: "/usr/local/bin/browse") + + assert BrowseShTool().build_command(("skills", "list")) == ( + "/opt/bin/browse", + "skills", + "list", + ) + + +def test_browse_sh_run_returns_success(monkeypatch): + from services.browse_sh_tool import BrowseShTool + + calls = [] + + def fake_run(command, **kwargs): + calls.append((command, kwargs)) + return SimpleNamespace(returncode=0, stdout="ok", stderr="") + + monkeypatch.setattr("services.browse_sh_tool.subprocess.run", fake_run) + + result = BrowseShTool(cli_path="/bin/browse").run(("skills", "list"), require_available=False) + + assert result.ok is True + assert result.command == ("/bin/browse", "skills", "list") + assert result.stdout == "ok" + assert calls[0][0] == ("/bin/browse", "skills", "list") + + +def test_browse_sh_run_converts_timeout(monkeypatch): + from services.browse_sh_tool import BrowseShTool + + def fake_run(command, **_kwargs): + raise subprocess.TimeoutExpired(command, timeout=1, output="partial", stderr="late") + + monkeypatch.setattr("services.browse_sh_tool.subprocess.run", fake_run) + + result = BrowseShTool(cli_path="/bin/browse").run(("screenshot",), require_available=False) + + assert result.ok is False + assert result.timed_out is True + assert result.stdout == "partial" + assert result.stderr == "late" + + +def test_browse_sh_run_surfaces_broken_cli(monkeypatch): + from services.browse_sh_tool import BrowseShTool + + def fake_run(command, **kwargs): + if command[-1] == "--version": + return SimpleNamespace(returncode=1, stdout="", stderr="node dyld missing icu4c") + return SimpleNamespace(returncode=0, stdout="", stderr="") + + monkeypatch.setattr("services.browse_sh_tool.subprocess.run", fake_run) + + result = BrowseShTool(cli_path="/bin/browse").run(("skills", "list")) + + assert result.ok is False + assert "icu4c" in result.unavailable_reason diff --git a/tests/test_competitor_match_attempts_persistence.py b/tests/test_competitor_match_attempts_persistence.py index 083df6a..0eb2135 100644 --- a/tests/test_competitor_match_attempts_persistence.py +++ b/tests/test_competitor_match_attempts_persistence.py @@ -724,6 +724,54 @@ def test_search_candidates_does_not_stop_on_merely_acceptable_match(monkeypatch) assert [candidate.product_id for candidate in candidates] == ["DDAB01-FIRST", "DDAB01-SECOND"] +def test_search_candidates_adds_variant_recall_sorts_for_dashing_diva(monkeypatch): + from services.competitor_price_feeder import _search_pchome_candidates + from services.pchome_crawler import PChomeProduct + + candidate = PChomeProduct( + product_id="DDBH8E-A900JMCJZ", + name="Dashing Diva/F 時尚潮流美甲片-月光銀影 MDF5F010AG", + price=420, + original_price=520, + discount=19, + image_url="", + product_url="https://24h.pchome.com.tw/prod/DDBH8E-A900JMCJZ", + stock=20, + store="24h", + rating=4.7, + review_count=8, + is_on_sale=True, + crawled_at=datetime.now(), + ) + + class FakeCrawler: + def __init__(self): + self.calls = [] + + def search_products(self, keyword, **kwargs): + self.calls.append((keyword, kwargs.get("sort"))) + return True, "ok", [candidate] + + monkeypatch.setattr( + "services.marketplace_product_matcher.score_marketplace_match", + lambda *_args, **_kwargs: SimpleNamespace(score=0.72), + ) + + crawler = FakeCrawler() + _search_pchome_candidates( + crawler, + "【DASHING DIVA】MAGICPRESS時尚潮流美甲片_極光之藍", + keywords=["dashing diva 時尚潮流美甲片 極光之藍"], + momo_price=331, + ) + + assert crawler.calls == [ + ("dashing diva 時尚潮流美甲片 極光之藍", None), + ("dashing diva 時尚潮流美甲片", "sale/dc"), + ("dashing diva 時尚潮流美甲片", "new/dc"), + ] + + def test_competitor_feeder_logs_keyword_parser_fallback(monkeypatch, caplog): from services import competitor_price_feeder from services import marketplace_product_matcher diff --git a/tests/test_marketplace_product_matcher.py b/tests/test_marketplace_product_matcher.py index e22043f..6640dbf 100644 --- a/tests/test_marketplace_product_matcher.py +++ b/tests/test_marketplace_product_matcher.py @@ -591,6 +591,45 @@ def test_marketplace_matcher_promotes_variant_safe_exact_option(): assert "shared_identity_anchor_variant_safe" in diagnostics.reasons +def test_marketplace_matcher_rejects_explicit_shade_option_mismatch(): + from services.marketplace_product_matcher import score_marketplace_match + + lipstick = score_marketplace_match( + "【Maybelline 媚比琳】超持久水光鎖吻唇釉#62 4.2ml", + "MAYBELLINE 媚比琳 超持久水光鎖吻唇釉 #120 4.2ml", + momo_price=399, + competitor_price=399, + ) + primer = score_marketplace_match( + "【植村秀】水凝光透妝前防護乳 紫色 30ml", + "植村秀 水凝光透妝前防護乳 粉色 30ml", + momo_price=1200, + competitor_price=1100, + ) + + for diagnostics in (lipstick, primer): + assert diagnostics.hard_veto is True + assert diagnostics.comparison_mode == "not_comparable" + assert diagnostics.score < 0.76 + assert "variant_option_conflict" in diagnostics.reasons + + +def test_marketplace_matcher_accepts_same_explicit_shade_option(): + from services.marketplace_product_matcher import score_marketplace_match + + diagnostics = score_marketplace_match( + "【Maybelline 媚比琳】超持久水光鎖吻唇釉#62 4.2ml", + "MAYBELLINE 媚比琳 超持久水光鎖吻唇釉 #62 4.2ml", + momo_price=399, + competitor_price=399, + ) + + assert diagnostics.hard_veto is False + assert diagnostics.comparison_mode == "exact_identity" + assert diagnostics.score >= 0.76 + assert "variant_option_conflict" not in diagnostics.reasons + + def test_marketplace_matcher_promotes_shared_identity_anchor_near_threshold(): from services.marketplace_product_matcher import score_marketplace_match @@ -881,6 +920,18 @@ def test_marketplace_search_terms_prefer_exact_identity_for_nail_foam_and_foot_m assert "枚入" not in " ".join(kameria_terms[:3]) +def test_marketplace_search_terms_preserve_decimal_spec_and_shade_option(): + from services.marketplace_product_matcher import build_search_terms + + terms = build_search_terms( + "【Maybelline 媚比琳】超持久水光鎖吻唇釉#62 4.2ml", + max_terms=5, + ) + + assert terms[0] == "媚比琳 超持久水光鎖吻唇釉 62 4.2ml" + assert "4 2ml" not in " ".join(terms) + + def test_marketplace_search_terms_prefer_specific_line_over_generic_usage_words(): from services.marketplace_product_matcher import build_search_terms