From 3953ef6d57c9fa9dd545697645fbfd1bb674d3d2 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 25 May 2026 23:19:14 +0800 Subject: [PATCH] fix(ollama): disable thinking for deepseek call sites --- apps/api/src/hermes/nl_gateway.py | 2 ++ apps/api/src/services/alert_rule_engine.py | 9 ++++++++- apps/api/src/services/chat_manager.py | 3 +++ apps/api/src/services/decision_manager.py | 8 +++++++- apps/api/src/services/log_summary_service.py | 3 +++ apps/api/tests/test_chat_manager_ollama_routing.py | 1 + apps/api/tests/test_decision_manager_ollama_routing.py | 4 ++++ 7 files changed, 28 insertions(+), 2 deletions(-) diff --git a/apps/api/src/hermes/nl_gateway.py b/apps/api/src/hermes/nl_gateway.py index 29f9f4e6..104b076d 100644 --- a/apps/api/src/hermes/nl_gateway.py +++ b/apps/api/src/hermes/nl_gateway.py @@ -276,6 +276,8 @@ async def process_nl_message( f"{endpoint.url}/api/chat", json={ "model": model, + # Keep Hermes responses in message.content across Ollama 0.24+. + "think": False, "messages": [ {"role": "system", "content": system_prompt}, {"role": "user", "content": prompt_with_ctx}, diff --git a/apps/api/src/services/alert_rule_engine.py b/apps/api/src/services/alert_rule_engine.py index eef892ee..b96bb645 100644 --- a/apps/api/src/services/alert_rule_engine.py +++ b/apps/api/src/services/alert_rule_engine.py @@ -720,7 +720,14 @@ async def _call_ollama(prompt: str, ollama_url: str, model: str) -> str | None: try: resp = await client.post( f"{endpoint.url}/api/generate", - json={"model": model, "prompt": prompt, "stream": False, "options": {"temperature": 0.1}}, + json={ + "model": model, + "prompt": prompt, + "stream": False, + # Preserve response-body compatibility with Ollama 0.24 thinking models. + "think": False, + "options": {"temperature": 0.1}, + }, ) resp.raise_for_status() return resp.json().get("response", "") diff --git a/apps/api/src/services/chat_manager.py b/apps/api/src/services/chat_manager.py index 855bb0f8..3650e5aa 100644 --- a/apps/api/src/services/chat_manager.py +++ b/apps/api/src/services/chat_manager.py @@ -167,6 +167,9 @@ class ChatManager: json={ "model": MODEL, "stream": False, + # Ollama 0.24 separates deepseek-r1 thinking from final text. + # Chat callers expect message.content to contain the answer. + "think": False, "messages": [ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_message}, diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index 650cc771..920a53b1 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -678,7 +678,13 @@ async def _nemoclaw_second_opinion(incident: "Incident", primary_result: dict) - try: resp = await client.post( f"{endpoint.url.rstrip('/')}/api/generate", - json={"model": model, "prompt": prompt, "stream": False}, + json={ + "model": model, + "prompt": prompt, + "stream": False, + # Ollama 0.24 puts deepseek-r1 output in `thinking` unless disabled. + "think": False, + }, ) resp.raise_for_status() data = resp.json() diff --git a/apps/api/src/services/log_summary_service.py b/apps/api/src/services/log_summary_service.py index b2dc62ba..aa72bfae 100644 --- a/apps/api/src/services/log_summary_service.py +++ b/apps/api/src/services/log_summary_service.py @@ -220,6 +220,9 @@ class LogSummaryService: "model": SUMMARY_MODEL, "prompt": prompt, "stream": False, + # Ollama 0.24 returns deepseek-r1 text in `thinking` by default. + # Existing callers read `response`, so force final-answer mode. + "think": False, "options": {"temperature": 0.1, "num_predict": 200}, }, ) diff --git a/apps/api/tests/test_chat_manager_ollama_routing.py b/apps/api/tests/test_chat_manager_ollama_routing.py index a40648d7..86c0c50d 100644 --- a/apps/api/tests/test_chat_manager_ollama_routing.py +++ b/apps/api/tests/test_chat_manager_ollama_routing.py @@ -103,6 +103,7 @@ async def test_nemoclaw_chat_uses_resolved_interactive_lane( url, payload = _FakeAsyncClient.posted[0] assert url == "http://gcp-a:11435/api/chat" assert payload["model"] == "deepseek-r1:14b" + assert payload["think"] is False def test_chat_manager_has_no_direct_gemini_generation_path() -> None: diff --git a/apps/api/tests/test_decision_manager_ollama_routing.py b/apps/api/tests/test_decision_manager_ollama_routing.py index f67090c3..478acda8 100644 --- a/apps/api/tests/test_decision_manager_ollama_routing.py +++ b/apps/api/tests/test_decision_manager_ollama_routing.py @@ -22,6 +22,7 @@ class _FakeResponse: class _FakeAsyncClient: posted_urls: list[str] = [] + posted_payloads: list[dict[str, Any]] = [] fail_urls: set[str] = set() response: str = "" @@ -37,6 +38,7 @@ class _FakeAsyncClient: async def post(self, url: str, *, json: dict[str, Any]) -> _FakeResponse: self.posted_urls.append(url) + self.posted_payloads.append(json) if url in self.fail_urls: raise RuntimeError("endpoint unavailable") return _FakeResponse(self.response) @@ -45,6 +47,7 @@ class _FakeAsyncClient: @pytest.fixture(autouse=True) def _reset_fake_client() -> None: _FakeAsyncClient.posted_urls = [] + _FakeAsyncClient.posted_payloads = [] _FakeAsyncClient.fail_urls = set() _FakeAsyncClient.response = "" @@ -115,6 +118,7 @@ async def test_nemoclaw_second_opinion_tries_gcp_b_after_gcp_a_failure( "http://gcp-a:11435/api/generate", "http://gcp-b:11436/api/generate", ] + assert all(payload["think"] is False for payload in _FakeAsyncClient.posted_payloads) @pytest.mark.asyncio