Files
ewoooc/tests/test_openclaw_qa_routing.py
OoO 838267c293 feat(p1+p3): logger 接 13 caller + Q&A/Nemotron/日報 feature flag 灰度
Phase 1 A4 — 13 個呼叫點接 ai_call_logger(覆蓋率 11.8% → 預估 50%+)
- TOP-1 nemotron_dispatcher: nemotron_dispatch caller (NIM 配額追蹤)
- TOP-2 openclaw_strategist: 4 reports (daily/weekly/monthly/meta) + qa caller
- TOP-3 hermes_analyst: hermes_analyst + hermes_intent (順修 commit 00591c5 殘留 bug)
- TOP-4 code_review_pipeline: code_review_hermes/openclaw/elephant 三鏈 (request_id 串)
- TOP-5 openclaw_bot_routes: openclaw_bot_main/gemini/nim 三層 fallback

Phase 3 A7 — OpenClaw Q&A → qwen3:14b(feature flag OFF)
- OPENCLAW_QA_OLLAMA_FIRST 灰度開關
- 繁中強制 system prompt + Gemini fallback chain
- _is_low_quality_response 品質守門(簡體字檢測 + 拒答訊號 + 結構分數)
- 黃金集 A/B 對照測試框架(10 樣本去 PII)

Phase 3 A8 — OpenClaw 日報 → Hermes 模板(feature flag OFF)
- OPENCLAW_DAILY_HERMES_TEMPLATE 灰度開關
- _compute_daily_kpi 純 SQL + Hermes 規則引擎
- _compute_gemini_insight 精簡 200 字洞察 prompt
- templates/daily_report_v2.j2 + _SafeUndefined 缺欄位優雅降級
- scripts/compare_daily_report_versions.py 雙版本盲測

Phase 3 A9 — Nemotron NIM → qwen3:14b(feature flag OFF)
- NEMOTRON_OLLAMA_FIRST 灰度開關(A2 紅燈:deepseek-r1 假支援,改 qwen3)
- _call_qwen3_dispatch + 既有 NIM tool_calls 解析共用
- 保留 ADR-004「🟡 [降級模式]」Hermes 規則引擎兜底

H6 PII fix — chat_id 進 ai_calls.meta 改 SHA1[:8](4 處 Bot Q&A)

Code Review pipeline — N3 動態 provider tag(gcp/secondary/111)+ A4 logger 三鏈

37 unit tests 全綠(routing 15 + golden 5 + qwen3 8 + daily template 8 + nemotron 1)

Operation Ollama-First v5.0 / Phase 1 A4 + Phase 3 A7+A8+A9

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 23:05:38 +08:00

359 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
tests/test_openclaw_qa_routing.py
OpenClaw Q&A 路由 + 品質守門 unit tests
(Operation Ollama-First v5.0 — Phase 3, A7 fullstack-engineer)
涵蓋:
- feature flag OPENCLAW_QA_OLLAMA_FIRST=false → 走 Gemini-first (regression test)
- flag=true + 高品質 Ollama 回應 → 直接回 Ollama 結果,不走 Gemini
- flag=true + 低品質 Ollama 回應 → 升級至 Gemini,並標 fallback_to=openclaw_qa_gemini_fallback
- flag=true + Ollama 呼叫失敗 → 升級至 Gemini
- _is_low_quality_response 各規則:空字串 / 長度過短 / 簡體污染 / 拒答 / 流水帳
執行:
pytest tests/test_openclaw_qa_routing.py -v
"""
import os
import sys
import time
from typing import Any, Dict, Optional
import pytest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import services.openclaw_strategist_service as svc
import services.ai_call_logger as logger_mod
from services.ai_call_logger import _reset_kill_switch
# ─────────────────────────────────────────────────────────────────────────────
# Fixtures
# ─────────────────────────────────────────────────────────────────────────────
@pytest.fixture(autouse=True)
def reset_state(monkeypatch):
    """Reset logger state and capture would-be ``ai_calls`` writes per test.

    Before each test this fixture calls ``_reset_kill_switch()``, replaces
    ``logger_mod._write_to_db`` with an in-memory recorder, sets
    ``AI_CALL_LOGGING_ENABLED`` to ``'true'`` and removes
    ``OPENCLAW_QA_OLLAMA_FIRST`` from the environment so the flag starts unset.

    Yields:
        list: one snapshot dict per intercepted ``_write_to_db`` call, holding
        the caller, provider, model, status, fallback_to, error, meta (copied)
        and request_id attributes of the state object.
    """
    _reset_kill_switch()

    # Attributes copied verbatim from the state object, in snapshot order.
    plain_fields = ('caller', 'provider', 'model', 'status', 'fallback_to', 'error')
    records = []

    def record_call(state):
        row = {name: getattr(state, name) for name in plain_fields}
        # Copy meta so later mutation of the state does not alter the snapshot.
        row['meta'] = dict(state.meta)
        row['request_id'] = state.request_id
        records.append(row)

    monkeypatch.setattr(logger_mod, '_write_to_db', record_call)
    monkeypatch.setenv('AI_CALL_LOGGING_ENABLED', 'true')
    # Default to the flag being absent (the pre-rollout behaviour).
    monkeypatch.delenv('OPENCLAW_QA_OLLAMA_FIRST', raising=False)
    yield records
def _wait_async(captured, n=1, timeout=2.0):
deadline = time.time() + timeout
while time.time() < deadline:
if len(captured) >= n:
return True
time.sleep(0.01)
return False
# ─────────────────────────────────────────────────────────────────────────────
# 1. _is_low_quality_response 純函式規則
# ─────────────────────────────────────────────────────────────────────────────
class TestLowQualityRules:
    """Rule-by-rule checks of ``svc._is_low_quality_response``.

    Each test feeds hand-written samples to the predicate and asserts the
    boolean verdict. Samples aimed at a rule other than the length floor are
    also checked in a padded form that is long enough not to be rejected for
    shortness alone, so the targeted rule is what decides the outcome.
    """

    # Length floor taken from the too-short test's note ("< 50 characters is
    # low quality"). NOTE(review): confirm against the service implementation.
    _MIN_ACCEPTABLE_LEN = 50

    def test_empty_string_is_low_quality(self):
        """Empty, ``None`` and whitespace-only replies are low quality."""
        assert svc._is_low_quality_response("") is True
        assert svc._is_low_quality_response(None) is True
        assert svc._is_low_quality_response(" \n ") is True

    def test_too_short_is_low_quality(self):
        """A reply shorter than 50 characters is low quality."""
        assert svc._is_low_quality_response("你好,我是 OpenClaw") is True

    def test_acceptable_response_passes(self):
        """A multi-line Traditional Chinese reply of adequate length passes."""
        good = (
            "本週 momo 業績較上週成長 12%,主要受惠於家電與生活雜貨。\n"
            "建議:持續關注 PChome 競價動態,必要時調整定價策略。\n"
            "預估下週 momo 仍有 5-8% 成長空間。"
        )
        assert svc._is_low_quality_response(good) is False

    def test_simplified_pollution_detected(self):
        """Three or more Simplified Chinese hint characters mark a reply low quality."""
        polluted = (
            "本周业绩比上周增长,您可以关注这个产品的价格变动趋势,"
            "我们建议处理掉滞销库存以提高资产效率"
        )
        assert svc._is_low_quality_response(polluted) is True

        # The sample above is only 45 characters, so by itself it cannot tell
        # the simplified-character rule apart from the too-short rule. The
        # padded variant clears the length floor and isolates the rule.
        long_polluted = (
            polluted
            + "。\n同时也请留意竞争对手的促销节奏与库存周转情况。"
            + "\n以上内容仅供参考,请结合实际数据判断。"
        )
        assert len(long_polluted) >= self._MIN_ACCEPTABLE_LEN
        assert svc._is_low_quality_response(long_polluted) is True

    def test_two_simplified_chars_still_acceptable(self):
        """Only two hint characters (below the threshold) in a well-structured reply pass.

        Guards against over-sensitivity to the odd Simplified character in an
        otherwise Traditional Chinese reply.
        """
        text = (
            "本週 momo 业绩成長明顯,建議持續關注競品動向。\n"
            "重點品類家電、3C、生活雜貨。\n"
            "下週可加碼促銷檔期。"
        )
        assert svc._is_low_quality_response(text) is False

    def test_refusal_pattern_detected(self):
        """Replies containing a refusal phrase are low quality."""
        for refusal in ['無法回答', '我不知道', '抱歉,我無法協助']:
            text = f"關於這個問題,{refusal},請改問其他內容以便我協助您。"
            assert svc._is_low_quality_response(text) is True, f"應被判定為拒答:{refusal}"

            # The short form is under the length floor; pad it with neutral,
            # line-broken Traditional Chinese so only the refusal phrase can
            # be the reason for rejection.
            padded = (
                text
                + "\n若需要進一步的說明,歡迎補充商品名稱或時間區間等背景資訊。"
                + "\n我會依據最新的銷售資料重新整理分析重點。"
            )
            assert len(padded) >= self._MIN_ACCEPTABLE_LEN
            assert svc._is_low_quality_response(padded) is True, (
                f"refusal not detected in a reply above the length floor: {refusal}"
            )

    def test_flowing_text_no_breaks_is_low_quality(self):
        """More than 200 characters without any line break is low quality."""
        text = "本週業績整體呈現上升趨勢。" * 20  # 13 characters x 20 = 260
        assert "\n" not in text
        assert len(text) > 200
        assert svc._is_low_quality_response(text) is True

    def test_long_text_with_breaks_is_acceptable(self):
        """More than 200 characters with reasonable line breaks passes."""
        text = (
            "本週業績整體呈現上升趨勢,主要驅動類別為家電與生活雜貨大類別。\n"
            "競品動向PChome 在 3C 類發動大規模補貼戰,預估壓縮我方 3 至 5 個百分點毛利率。\n"
            "蝦皮也在母嬰用品加碼免運券促銷,需密切觀察跟降節奏,避免市占下滑。\n"
            "建議行動:(1) 加碼家電促銷檔期,重點操作大尺寸電視與廚房家電,"
            "(2) 觀察 PChome 補貼是否延續至下週,準備二段反擊方案,"
            "(3) 對價差大於 5% 的 SKU 主動啟動 EA 流程,避免毛利持續流失。"
        )
        assert len(text) > 200
        assert svc._is_low_quality_response(text) is False
# ─────────────────────────────────────────────────────────────────────────────
# 2. Routing: feature flag = false 時維持 Gemini-first 路徑 (regression)
# ─────────────────────────────────────────────────────────────────────────────
class TestFlagOff:
    """Routing regression tests for ``svc.generate_strategy_response``.

    Covers the flag-off / flag-unset path (the legacy Gemini-first handler must
    be used and the Ollama handler must not be touched) and the empty-query
    guard. Both backends are always stubbed so no test can reach the network.
    """

    @staticmethod
    def _install_stubs(monkeypatch, legacy_reply, ollama_reply):
        """Patch both QA backends on ``svc`` with counting stubs.

        The stubs keep the positional/keyword shapes the tests have always
        used (``(q, ctx, request_id=None)`` and ``(q, ctx, rid)``), so a call
        with an incompatible argument list still fails loudly.

        Args:
            monkeypatch: the pytest ``monkeypatch`` fixture.
            legacy_reply: value returned by the ``_legacy_gemini_first_qa`` stub.
            ollama_reply: value returned by the ``_call_qwen3_qa`` stub.

        Returns:
            dict: ``{'legacy': int, 'ollama': int}`` call counters, updated in
            place as the stubs are invoked.
        """
        calls = {'legacy': 0, 'ollama': 0}

        def fake_legacy(q, ctx, request_id=None):
            calls['legacy'] += 1
            return legacy_reply

        def fake_ollama(q, ctx, rid):
            calls['ollama'] += 1
            return ollama_reply

        monkeypatch.setattr(svc, '_legacy_gemini_first_qa', fake_legacy)
        monkeypatch.setattr(svc, '_call_qwen3_qa', fake_ollama)
        return calls

    def test_flag_false_routes_to_legacy(self, monkeypatch, reset_state):
        """flag=false: ``_call_qwen3_qa`` is not called; the legacy handler answers."""
        monkeypatch.setenv('OPENCLAW_QA_OLLAMA_FIRST', 'false')
        calls = self._install_stubs(
            monkeypatch, "[legacy gemini reply]", "[should not be called]"
        )

        result = svc.generate_strategy_response("本週業績如何?")

        assert result == "[legacy gemini reply]"
        assert calls['legacy'] == 1
        assert calls['ollama'] == 0

    def test_flag_unset_defaults_to_off(self, monkeypatch, reset_state):
        """Flag absent from the environment: defaults to off, legacy handler answers."""
        monkeypatch.delenv('OPENCLAW_QA_OLLAMA_FIRST', raising=False)
        # The Ollama backend is stubbed and counted rather than left live: a
        # real call that fails would fall back to the legacy handler and this
        # test would still pass, hiding a wrong default.
        calls = self._install_stubs(
            monkeypatch, "[legacy reply]", "[should not be called]"
        )

        result = svc.generate_strategy_response("競品分析")

        assert calls['legacy'] == 1
        assert calls['ollama'] == 0
        assert result == "[legacy reply]"

    def test_empty_query_short_circuits(self, monkeypatch, reset_state):
        """An empty query must not trigger any LLM call.

        Runs with the flag set to 'true' so that neither routing branch may be
        entered for an empty query.
        """
        monkeypatch.setenv('OPENCLAW_QA_OLLAMA_FIRST', 'true')
        calls = self._install_stubs(monkeypatch, "", "")

        out = svc.generate_strategy_response("")

        assert "請輸入您的問題" in out
        assert calls['legacy'] == 0
        assert calls['ollama'] == 0
# ─────────────────────────────────────────────────────────────────────────────
# 3. Routing: feature flag = true + Ollama 高/低品質
# ─────────────────────────────────────────────────────────────────────────────
class TestFlagOn:
    """Routing tests for ``svc.generate_strategy_response`` with the flag set to 'true'.

    Each test fixes what the Ollama stub returns and checks whether the
    Gemini (legacy) stub is consulted.
    """

    @staticmethod
    def _wire_backends(monkeypatch, ollama_reply, gemini_reply):
        """Enable the flag and stub both backends.

        Args:
            monkeypatch: the pytest ``monkeypatch`` fixture.
            ollama_reply: value the ``_call_qwen3_qa`` stub returns.
            gemini_reply: value the ``_legacy_gemini_first_qa`` stub returns.

        Returns:
            dict: ``{'count': int}`` incremented on every legacy-stub call.
        """
        monkeypatch.setenv('OPENCLAW_QA_OLLAMA_FIRST', 'true')
        gemini_hits = {'count': 0}

        def fake_legacy(q, ctx, request_id=None):
            gemini_hits['count'] += 1
            return gemini_reply

        monkeypatch.setattr(svc, '_call_qwen3_qa', lambda q, ctx, rid: ollama_reply)
        monkeypatch.setattr(svc, '_legacy_gemini_first_qa', fake_legacy)
        return gemini_hits

    def test_flag_true_high_quality_returns_ollama(self, monkeypatch, reset_state):
        """flag=true + high-quality Ollama reply: returned as-is, Gemini untouched."""
        good_reply = (
            "本週 momo 業績成長 12%,主要驅動類別為家電。\n"
            "建議:持續關注 PChome 競價並加碼家電促銷檔期。\n"
            "下週預估仍有 5-8% 成長空間。"
        )
        gemini_hits = self._wire_backends(monkeypatch, good_reply, "[gemini fallback]")

        answer = svc.generate_strategy_response("本週業績如何?")

        assert answer == good_reply
        assert gemini_hits['count'] == 0  # Gemini was never called

    def test_flag_true_low_quality_falls_back_to_gemini(self, monkeypatch, reset_state):
        """flag=true + low-quality Ollama reply (Simplified Chinese): falls back to Gemini."""
        bad_reply = "本周业绩增长,您可以关注这个产品的价格变动,我们建议处理库存"
        gemini_hits = self._wire_backends(
            monkeypatch, bad_reply, "[gemini high quality reply]"
        )

        answer = svc.generate_strategy_response("本週業績如何?")

        assert answer == "[gemini high quality reply]"
        assert gemini_hits['count'] == 1

    def test_flag_true_ollama_returns_none_falls_back(self, monkeypatch, reset_state):
        """flag=true + Ollama call failed (returns None): falls back to Gemini."""
        gemini_hits = self._wire_backends(
            monkeypatch, None, "[gemini reply after ollama down]"
        )

        answer = svc.generate_strategy_response("test")

        assert answer == "[gemini reply after ollama down]"
        assert gemini_hits['count'] == 1
# ─────────────────────────────────────────────────────────────────────────────
# 4. _call_qwen3_qa: ai_call_logger 整合 + fallback_to 標記
# ─────────────────────────────────────────────────────────────────────────────
class TestCallQwen3Telemetry:
    """Checks what ``svc._call_qwen3_qa`` returns and what it logs via ai_call_logger.

    ``svc.requests.post`` is replaced in every test; logged records are read
    from the list yielded by the ``reset_state`` fixture.
    """

    @staticmethod
    def _serve_payload(monkeypatch, payload):
        """Make ``svc.requests.post`` return a 200 response whose ``json()`` is ``payload``."""

        class FakeResp:
            status_code = 200

            def raise_for_status(self):
                pass

            def json(self):
                # Fresh dict per call, as a literal in the method body would be.
                return dict(payload)

        monkeypatch.setattr(svc.requests, 'post', lambda *a, **kw: FakeResp())

    def test_qwen3_logs_ok_status_on_success(self, monkeypatch, reset_state):
        """Successful reply: record has status=ok, caller=openclaw_qa, provider=gcp_ollama."""
        records = reset_state
        self._serve_payload(monkeypatch, {
            'response': '本週 momo 業績成長 12%,建議加碼家電促銷。',
            'prompt_eval_count': 150,
            'eval_count': 60,
        })

        reply = svc._call_qwen3_qa("本週業績?", None, "qa-test123")

        assert reply is not None
        assert "業績成長" in reply
        assert _wait_async(records, 1)
        assert len(records) == 1
        entry = records[0]
        assert entry['caller'] == 'openclaw_qa'
        assert entry['provider'] == 'gcp_ollama'
        assert entry['model'] == svc.OPENCLAW_QA_OLLAMA_MODEL
        assert entry['status'] == 'ok'
        assert entry['fallback_to'] is None
        assert entry['meta'].get('flag') == 'OPENCLAW_QA_OLLAMA_FIRST'
        assert entry['request_id'] == "qa-test123"

    def test_qwen3_logs_fallback_on_exception(self, monkeypatch, reset_state):
        """Connection failure: record has status=fallback and the Gemini fallback tag."""
        records = reset_state

        def refuse_connection(*a, **kw):
            raise svc.requests.ConnectionError("connection refused")

        monkeypatch.setattr(svc.requests, 'post', refuse_connection)

        reply = svc._call_qwen3_qa("test", None, "qa-fail123")

        assert reply is None
        assert _wait_async(records, 1)
        entry = records[0]
        assert entry['status'] == 'fallback'
        assert entry['fallback_to'] == 'openclaw_qa_gemini_fallback'
        assert entry['error'] is not None
        assert 'ConnectionError' in entry['error']

    def test_qwen3_logs_fallback_on_empty_response(self, monkeypatch, reset_state):
        """Empty ``response`` field: treated as empty_response and logged as fallback."""
        records = reset_state
        self._serve_payload(
            monkeypatch,
            {'response': '', 'prompt_eval_count': 100, 'eval_count': 0},
        )

        reply = svc._call_qwen3_qa("test", None, "qa-empty")

        assert reply is None
        assert _wait_async(records, 1)
        entry = records[0]
        assert entry['status'] == 'fallback'
        assert entry['fallback_to'] == 'openclaw_qa_gemini_fallback'
        assert entry['error'] == 'empty_response'
# ─────────────────────────────────────────────────────────────────────────────
# 5. 環境變數讀取即時性 (runtime toggle)
# ─────────────────────────────────────────────────────────────────────────────
class TestRuntimeToggle:
    """Checks that the feature-flag helper tracks environment changes between calls."""

    def test_flag_helper_reads_env_each_call(self, monkeypatch):
        """``_qa_ollama_first_enabled()`` must re-read the env var on every call.

        Also pins which raw strings count as enabled and which as disabled.
        """
        env_name = 'OPENCLAW_QA_OLLAMA_FIRST'

        def enabled_when(raw):
            # Set the flag to ``raw`` and report what the helper now says.
            monkeypatch.setenv(env_name, raw)
            return svc._qa_ollama_first_enabled()

        assert enabled_when('false') is False
        assert enabled_when('true') is True

        # Spellings that must be treated as enabled.
        for v in ('TRUE', 'True', '1', 'yes', 'on'):
            assert enabled_when(v) is True, f"應視為 true: {v!r}"

        # Everything else, including empty and unknown strings, is disabled.
        for v in ('false', '0', 'no', 'off', '', 'foo'):
            assert enabled_when(v) is False, f"應視為 false: {v!r}"