From c0219450476bd2cb2d7d061a6c8e564e988f2da5 Mon Sep 17 00:00:00 2001 From: OoO Date: Mon, 18 May 2026 14:07:49 +0800 Subject: [PATCH] fix: route telegram vision through ollama first --- .env.example | 6 + app.py | 12 +- docs/AI_INTELLIGENCE_MODULE_SOT.md | 1 + .../claude_inventory_validation_20260513.md | 1 + routes/openclaw_bot_routes.py | 175 ++++++++++++++---- routes/system_public_routes.py | 12 ++ services/llm_caller_registry.py | 2 + tests/test_caller_registry.py | 1 + tests/test_openclaw_bot_routes_webhook.py | 118 ++++++++++++ tests/test_phase3f_cleanup_contracts.py | 3 + 10 files changed, 283 insertions(+), 48 deletions(-) diff --git a/.env.example b/.env.example index 45dfcb9..ddd4cab 100644 --- a/.env.example +++ b/.env.example @@ -315,6 +315,9 @@ RAG_EMBED_NORMALIZE=true PPT_VISION_ENABLED=false PPT_VISION_MODEL=minicpm-v:latest PPT_VISION_TIMEOUT=60 +PPT_AUTO_GENERATION_ENABLED=true +PPT_AUTO_REPORT_TYPES=all +PPT_AUTO_DEFAULT_CATEGORY=美妝保養 DEEPSEEK_DIRECT_ENABLED=false DEEPSEEK_API_KEY= DEEPSEEK_BASE_URL=https://api.deepseek.com/v1 @@ -344,6 +347,9 @@ OLLAMA_EMBED_TIMEOUT=45 OPENCLAW_QA_OLLAMA_FIRST=true OPENCLAW_QA_OLLAMA_MODEL=qwen3:14b OPENCLAW_QA_OLLAMA_TIMEOUT=60 +OPENCLAW_IMAGE_VISION_MODEL=minicpm-v:latest +OPENCLAW_IMAGE_OLLAMA_TIMEOUT=45 +OPENCLAW_IMAGE_GEMINI_MODEL=gemini-1.5-flash NEMOTRON_OLLAMA_FIRST=true NEMOTRON_OLLAMA_MODEL=qwen3:14b NEMOTRON_OLLAMA_TIMEOUT=180 diff --git a/app.py b/app.py index 875a44f..48a9af3 100644 --- a/app.py +++ b/app.py @@ -29,7 +29,7 @@ except OSError as e: # ================= 🔧 2. 
核心模組導入 ================= try: - from flask import Flask, render_template, jsonify, request, send_file, redirect, url_for, send_from_directory, flash, session + from flask import Flask, render_template, jsonify, request, send_file, redirect, url_for, flash, session from werkzeug.utils import secure_filename from pyngrok import ngrok, conf import schedule @@ -146,16 +146,6 @@ def add_static_asset_version(endpoint, values): values['v'] = SYSTEM_VERSION -@app.route('/favicon.ico') -def favicon(): - """使用既有品牌圖示回應瀏覽器預設 favicon 探測,避免全站 404 噪音。""" - return send_from_directory( - os.path.join(STATIC_DIR, 'images'), - 'logo_circle.svg', - mimetype='image/svg+xml', - max_age=604800, - ) - # ========================================== # 🔒 Flask 安全配置 # ========================================== diff --git a/docs/AI_INTELLIGENCE_MODULE_SOT.md b/docs/AI_INTELLIGENCE_MODULE_SOT.md index a8ddb67..f78abc3 100644 --- a/docs/AI_INTELLIGENCE_MODULE_SOT.md +++ b/docs/AI_INTELLIGENCE_MODULE_SOT.md @@ -18,6 +18,7 @@ - PPT vision、PPT 文案 final fallback、MCP 離線 final fallback 等特殊 Ollama 路徑也不得只打單一 host;如需 `/api/generate`,一律透過 `OllamaService.generate()`。 - Code Review pipeline 也必須 Ollama-first:Hermes scan 與 OpenClaw assessment 都走 `OllamaService` 三主機 retry;Gemini telemetry 只能以 `code_review_openclaw_gemini` 出現,表示 Ollama/可選 Claude 備援都失敗後才啟用。 - OpenClaw Telegram Q&A 主路徑也不得綁單一 host:`_call_qwen3_qa()` 必須透過 `OllamaService` 跑 GCP-A → GCP-B → 111,並把實際落點寫入 `ai_calls.provider`。 +- OpenClaw Telegram 圖片商品辨識也必須 Ollama-first:`_identify_product_name_with_ollama_vision()` 透過 `OllamaService` 嘗試 GCP-A → GCP-B → 111;Gemini 只允許以 `openclaw_bot_image_gemini` caller 作為失敗後備援。 ## 一、四 AI Agent 路由架構 diff --git a/docs/memory/claude_inventory_validation_20260513.md b/docs/memory/claude_inventory_validation_20260513.md index 3ccb267..d9a8075 100644 --- a/docs/memory/claude_inventory_validation_20260513.md +++ b/docs/memory/claude_inventory_validation_20260513.md @@ -56,6 +56,7 @@ - PPT vision、PPT 文案 final fallback 與 MCP 離線 
final fallback 已改走 `OllamaService.generate()`;`OllamaService.generate()` 支援 `options`、`keep_alive` 與 vision `images`,特殊 `/api/generate` 路徑同樣取得三主機 retry。 - OpenClaw QA / daily Hermes template / NemoTron qwen3 的 flag 文件與測試已對齊 Ollama-first 預設 ON;顯式 `false` 才是 Gemini/NIM legacy 緊急退路。OpenClaw QA 已移除單一 `OPENCLAW_QA_OLLAMA_HOST` 主機覆寫,`_call_qwen3_qa()` 改走 `OllamaService` 的 GCP-A → GCP-B → 111 retry 並回寫實際 provider。 - Code Review pipeline 已對齊 Ollama-first:`_hermes_scan()` 與 `_openclaw_assess()` 都先走 `OllamaService` 的 GCP-A → GCP-B → 111 retry;Gemini 僅在 Ollama(與可選 Claude)失敗後以 `code_review_openclaw_gemini` caller 記錄備援,不再以 `code_review_openclaw` 直接 Gemini-first。 +- Telegram 圖片商品辨識已對齊 Ollama-first:`routes/openclaw_bot_routes.py` 會先用 `OPENCLAW_IMAGE_VISION_MODEL` 透過 `OllamaService` retry GCP-A → GCP-B → 111;Gemini 只以 `openclaw_bot_image_gemini` caller 作為圖片辨識備援。 - `.env.example` 已補齊 Python runtime 實際讀取的環境變數,`tests/test_phase3f_cleanup_contracts.py::test_env_example_documents_runtime_os_env_keys` 會掃 `app.py/config.py/scheduler.py/run_scheduler.py/routes/services/utils` 的 `os.getenv()` / `os.environ.get()`;只允許 `PYTEST_CURRENT_TEST` 與 `MOMO_ALLOW_INSECURE_CONFIG_FOR_TESTS` 兩個測試內部 key 不進範例。 - `docker-compose*.yml` 使用的 `${VAR}` 也已納入 `.env.example` 契約,包含 MCP compose 的 `TAVILY_API_KEY`、`EXA_API_KEY`、`MCP_POSTGRES_PASSWORD`、`FIRECRAWL_AUTH_KEY`,以及 image tag / Grafana / pgAdmin / Metabase / Grist 變數;`test_env_example_documents_docker_compose_variables` 會守住。 - Market Intel `seed_writer_cli_status` route 已補 API 層回歸:即使 `execute=true` 且環境有 `MARKET_INTEL_SEED_WRITE_APPROVAL`,API 仍不得回吐 token / `approval_token_hint` / 固定 token 文案,且不得 ready 或寫入;`tests/test_market_intel_skeleton.py::test_seed_writer_cli_status_route_never_leaks_approval_token` 會守住。 diff --git a/routes/openclaw_bot_routes.py b/routes/openclaw_bot_routes.py index 8898eff..c994ee8 100644 --- a/routes/openclaw_bot_routes.py +++ b/routes/openclaw_bot_routes.py @@ -4,7 +4,7 @@ OpenClaw Telegram 群組智能助理 v5 
# Characters a vision model commonly wraps around its answer (markdown
# emphasis, inline code, list bullets); stripped from both ends of the line.
_VISION_EDGE_DECORATION = "`*_ - "

# Leading "label:" prefix some models emit before the product name. Markdown
# emphasis may sit between the label and the colon or directly after it
# (e.g. "商品名稱**:" or "商品名稱:** "), so it is consumed with the label.
_VISION_LABEL_PREFIX_RE = re.compile(
    r"^(?:商品名稱|品名|辨識結果|結果)[\s*_`]*[::][\s*_`]*"
)

# Substrings (compared against the lower-cased line) that mean the model
# declined to name a product.
_VISION_REFUSAL_MARKERS = ('無法辨識', '看不清', '無法確認', '不確定', 'unknown', 'not sure')


def _clean_vision_product_name(raw: str) -> str:
    """Reduce a vision-model reply to a product name usable as a search query.

    Steps: drop a surrounding code fence, keep the first non-empty line, strip
    markdown decoration, drop a leading label such as ``商品名稱:``, reject
    refusal answers, and cap the result at 60 characters.

    Args:
        raw: Raw text returned by the vision model (``None``/empty allowed).

    Returns:
        The cleaned product name, or ``''`` when nothing usable remains.
    """
    text = (raw or '').strip()
    if not text:
        return ''
    text = re.sub(r"^```(?:text)?\s*", "", text, flags=re.IGNORECASE).strip()
    text = re.sub(r"\s*```$", "", text).strip()
    first_line = next((line.strip() for line in text.splitlines() if line.strip()), '')
    # Strip decoration BEFORE removing the label: otherwise "**商品名稱:** X"
    # or "- 品名:X" never matches the label pattern and the label leaks into
    # the product name that is sent to the price-comparison search.
    first_line = first_line.strip(_VISION_EDGE_DECORATION)
    first_line = _VISION_LABEL_PREFIX_RE.sub("", first_line)
    first_line = first_line.strip(_VISION_EDGE_DECORATION)
    if not first_line:
        return ''
    lowered = first_line.lower()
    if any(marker in lowered for marker in _VISION_REFUSAL_MARKERS):
        return ''
    return first_line[:60]


def _identify_product_name_with_ollama_vision(img_b64: str, request_id: str) -> str:
    """Primary image-recognition path: ask the Ollama vision model first.

    The call goes through ``OllamaService.generate()`` and is recorded with
    the ``openclaw_bot_image`` caller; the provider tag is updated from the
    host reported in the response.

    Args:
        img_b64: Base64-encoded image bytes.
        request_id: Correlation id stored with the AI-call log entry.

    Returns:
        The cleaned product name, or ``''`` when Ollama is unavailable, the
        call fails, or the reply is unusable (the caller then falls back to
        Gemini).
    """
    if not _OLLAMA_AVAILABLE:
        return ''
    prompt = (
        "這是一張商品圖片。請辨識商品名稱,包含品牌、型號、規格。"
        "只回商品名稱,不要解釋,不要 markdown,不超過 30 字;"
        "如果是多個商品,只取最顯眼的一個。必須使用繁體中文。"
    )
    # A malformed env value must not abort recognition before it even starts;
    # fall back to the documented default instead.
    try:
        timeout = max(1, int(os.getenv('OPENCLAW_IMAGE_OLLAMA_TIMEOUT', '45')))
    except ValueError:
        timeout = 45
    with log_ai_call(
        caller='openclaw_bot_image',
        provider='gcp_ollama',
        model=IMAGE_VISION_OLLAMA_MODEL,
        request_id=request_id,
        meta={'route': 'ollama_first', 'task': 'image_product_recognition'},
    ) as ctx:
        try:
            resp = OllamaService(model=IMAGE_VISION_OLLAMA_MODEL).generate(
                prompt=prompt,
                model=IMAGE_VISION_OLLAMA_MODEL,
                temperature=0.1,
                timeout=timeout,
                options={'num_predict': 64},
                images=[img_b64],
            )
            # Record where the request actually landed, even on failure.
            ctx.set_provider(get_provider_tag(resp.host or ''))
            ctx.set_tokens(input=resp.input_tokens, output=resp.output_tokens)
            ctx.add_meta('host', resp.host)
            ctx.add_meta('host_label', get_host_label(resp.host or ''))
            if not resp.success:
                ctx.set_error(resp.error or 'ollama vision failed')
                ctx.fallback_to_caller('openclaw_bot_image_gemini')
                return ''
            product_name = _clean_vision_product_name(resp.content)
            if not product_name:
                ctx.set_error('empty_or_unusable_vision_response')
                ctx.fallback_to_caller('openclaw_bot_image_gemini')
            return product_name
        except Exception as exc:
            # Best-effort path: any failure is logged and reported as "no
            # result" so the Gemini fallback can run.
            ctx.set_error(f"{type(exc).__name__}: {exc}")
            ctx.fallback_to_caller('openclaw_bot_image_gemini')
            sys_log.warning(f"[VisionSearch] Ollama vision failed: {exc}")
            return ''


def _extract_gemini_candidate_text(body) -> str:
    """Return the text of the first part of the first candidate, or ``''``.

    Tolerates a missing or empty ``candidates`` / ``parts`` list and
    non-dict nodes, so an empty Gemini reply is reported as an empty response
    instead of raising ``IndexError``.
    """
    candidates = body.get('candidates') if isinstance(body, dict) else None
    first = candidates[0] if isinstance(candidates, list) and candidates else None
    content = first.get('content') if isinstance(first, dict) else None
    parts = content.get('parts') if isinstance(content, dict) else None
    head = parts[0] if isinstance(parts, list) and parts else None
    text = head.get('text') if isinstance(head, dict) else None
    return text if isinstance(text, str) else ''


def _identify_product_name_with_gemini_vision(img_b64: str, request_id: str) -> str:
    """Cloud fallback for image recognition; call only after Ollama failed.

    Args:
        img_b64: Base64-encoded image bytes (sent as ``image/jpeg``).
        request_id: Correlation id stored with the AI-call log entry.

    Returns:
        The cleaned product name, or ``''`` when no API key is configured,
        the request fails, or the reply is unusable.
    """
    if not GEMINI_API_KEY:
        return ''
    vision_payload = {
        'contents': [{
            'parts': [
                {'text': (
                    '這是一張商品圖片。請辨識商品名稱(品牌、型號、規格),'
                    '輸出格式:只回商品名稱,不超過 30 字,繁體中文。'
                    '如果是多個商品,只取最顯眼的一個。'
                )},
                {'inline_data': {'mime_type': 'image/jpeg', 'data': img_b64}},
            ],
        }],
    }
    with log_ai_call(
        caller='openclaw_bot_image_gemini',
        provider='gemini',
        model=IMAGE_VISION_GEMINI_MODEL,
        request_id=request_id,
        meta={'fallback_from': 'openclaw_bot_image', 'task': 'image_product_recognition'},
    ) as ctx:
        try:
            # The key travels in a header, never in the URL: requests embeds
            # the full URL in HTTPError/ConnectionError messages, and those
            # messages are written to ctx.set_error() and sys_log below.
            vis_r = requests.post(
                f"{GEMINI_BASE_URL}/{IMAGE_VISION_GEMINI_MODEL}:generateContent",
                headers={'x-goog-api-key': GEMINI_API_KEY},
                json=vision_payload,
                timeout=20,
            )
            vis_r.raise_for_status()
            body = vis_r.json()
            usage = (body.get('usageMetadata') or {}) if isinstance(body, dict) else {}
            ctx.set_tokens(
                input=usage.get('promptTokenCount', 0),
                output=usage.get('candidatesTokenCount', 0),
            )
            product_name = _clean_vision_product_name(_extract_gemini_candidate_text(body))
            if not product_name:
                ctx.set_error('empty_or_unusable_vision_response')
            return product_name
        except Exception as exc:
            # Best-effort fallback: report "no result" to the webhook.
            ctx.set_error(f"{type(exc).__name__}: {exc}")
            sys_log.warning(f"[VisionSearch] Gemini vision fallback failed: {exc}")
            return ''
STATIC_DIR = os.path.join(BASE_DIR, 'web/static')


@system_public_bp.route('/favicon.ico')
def favicon():
    """Serve the existing brand icon for the browser's default favicon probe.

    Answering ``/favicon.ico`` with the brand SVG avoids site-wide 404 noise.

    Returns:
        The Flask response produced by ``send_from_directory`` for
        ``<STATIC_DIR>/images/logo_circle.svg``.
    """
    # Imported locally so the handler does not depend on this module's
    # top-level flask import list: the route was moved here from app.py and
    # the move itself adds no import for send_from_directory.
    # NOTE(review): if the module already imports it, this is a harmless no-op.
    from flask import send_from_directory

    return send_from_directory(
        os.path.join(STATIC_DIR, 'images'),
        'logo_circle.svg',
        mimetype='image/svg+xml',
        max_age=604800,  # 7 days, in seconds
    )
class _FakeTelegramResponse:
    """Minimal stand-in for the ``requests`` response used by the photo flow."""

    def __init__(self, json_data=None, content=b"fake-image"):
        self._json_data = json_data or {}
        self.content = content

    def json(self):
        return self._json_data


def _fake_telegram_get(url, **_kwargs):
    """Answer the ``getFile`` lookup and the subsequent image download."""
    if "getFile" in url:
        return _FakeTelegramResponse({"result": {"file_path": "photos/product.jpg"}})
    return _FakeTelegramResponse(content=b"fake-image")


def _photo_update(update_id, message_id):
    """Build a private-chat Telegram update that carries only a photo."""
    return {
        "update_id": update_id,
        "message": {
            "message_id": message_id,
            "chat": {"id": 777, "type": "private"},
            "from": {"id": 777777},
            "photo": [{"file_id": "small"}, {"file_id": "large"}],
        },
    }


def _install_photo_fakes(monkeypatch, bot, *, ollama_result, gemini_result):
    """Patch network, auth and messaging; return ``(calls, sent, handled)``.

    ``calls`` records which vision helper ran and in what order, so tests can
    assert on it outside the webhook instead of raising from inside it.
    """
    calls, sent, handled = [], [], []

    def fake_ollama(_img_b64, _request_id):
        calls.append("ollama")
        return ollama_result

    def fake_gemini(_img_b64, _request_id):
        calls.append("gemini")
        return gemini_result

    monkeypatch.setattr(bot.requests, "get", _fake_telegram_get)
    monkeypatch.setattr(bot, "_is_authorized", lambda _chat_type, _chat_id, _uid: True)
    monkeypatch.setattr(bot, "send_typing", lambda _chat_id: None)
    monkeypatch.setattr(
        bot,
        "send_message",
        lambda *args, **kwargs: sent.append((args, kwargs)),
    )
    monkeypatch.setattr(
        bot,
        "handle_cmd",
        lambda cmd, arg, chat_id, reply_to: handled.append((cmd, arg, chat_id, reply_to)),
    )
    monkeypatch.setattr(bot, "_identify_product_name_with_ollama_vision", fake_ollama)
    monkeypatch.setattr(bot, "_identify_product_name_with_gemini_vision", fake_gemini)
    return calls, sent, handled


def test_photo_message_uses_ollama_vision_before_gemini(monkeypatch):
    """A usable Ollama answer is used directly and Gemini is never called."""
    from routes import openclaw_bot_routes as bot

    calls, sent, handled = _install_photo_fakes(
        monkeypatch,
        bot,
        ollama_result="理膚寶水 B5 修復霜",
        gemini_result="gemini-should-not-run",
    )

    app = _build_request_app()
    payload = _photo_update(10030, 80)
    with app.test_request_context("/bot/telegram/webhook", method="POST", json=payload):
        bot.telegram_webhook()

    # Asserted here rather than by raising inside the fake: an exception
    # raised within the webhook's photo `try:` block could be swallowed by
    # its handler and never reach the test.
    assert calls == ["ollama"]
    assert handled == [("competitor", "理膚寶水 B5 修復霜", 777, 80)]
    assert "理膚寶水 B5 修復霜" in sent[0][0][1]


def test_photo_message_falls_back_to_gemini_when_ollama_empty(monkeypatch):
    """An empty Ollama answer triggers exactly one Gemini fallback call."""
    from routes import openclaw_bot_routes as bot

    calls, _sent, handled = _install_photo_fakes(
        monkeypatch,
        bot,
        ollama_result="",
        gemini_result="飛利浦 Sonicare",
    )

    app = _build_request_app()
    payload = _photo_update(10031, 81)
    with app.test_request_context("/bot/telegram/webhook", method="POST", json=payload):
        bot.telegram_webhook()

    assert calls == ["ollama", "gemini"]
    assert handled == [("competitor", "飛利浦 Sonicare", 777, 81)]