Files
ewoooc/tests/test_openclaw_qa_golden_set.py

292 lines
12 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
tests/test_openclaw_qa_golden_set.py
OpenClaw Q&A 黃金集 A/B 對照框架
(Operation Ollama-First v5.0 — Phase 3, A7 fullstack-engineer)
目的:
在統帥盲測前,先建立 Ollama qwen3:14b vs Gemini 2.5 Flash 的「量化基線」。
10 題典型 momo 商業 Q&A,雙模型各跑一次,比對:
- 簡體字污染數量(A2 黃燈警訊核心)
- 回應長度
- 結構性指標(行數、列點數)
- 拒答訊號
- 黃金關鍵字命中率(題目自帶 expect_keywords)
執行:
RUN_GOLDEN_SET=1 pytest tests/test_openclaw_qa_golden_set.py -v -s
# GCP 還沒拉 qwen3:14b 之前,預設 SKIP(避免 CI 紅燈)
紀律:
- PII 紀律:題目/答案無真實 chat_id / username / 身份證 / 手機,全部去識別化
- 不對「正確性」做 hard assert,本框架專做「品質量化基線」收集
- 報告印到 stdout(pytest -s 顯示),人工檢視,不卡 CI
"""
import json
import os
import sys
import time
from typing import Dict, List, Optional
import pytest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# ─────────────────────────────────────────────────────────────────────────────
# 啟用條件:
# 1. RUN_GOLDEN_SET=1(未設定時整個模組 SKIP)
# 2. OllamaService 三主機級聯可解析出可達主機
# 3. GEMINI_API_KEY 已設
# 條件 2、3 由 test_environment_ready 以 assert 檢查:不符時該測試 FAIL,而非 SKIP。
# ─────────────────────────────────────────────────────────────────────────────
def _ollama_reachable(host: str, timeout: float = 2.0) -> bool:
    """Report whether an Ollama server answers at ``host``.

    Sends ``GET <host>/api/version`` (trailing slashes on ``host`` are
    stripped first) and treats an HTTP 200 reply as "reachable".

    Args:
        host: Base URL of the Ollama server.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        True only when the request returns HTTP 200. Every exception raised
        along the way (``requests`` not importable, unusable ``host`` value,
        network failure) is swallowed and reported as False.
    """
    try:
        # Imported inside the try so a missing ``requests`` package simply
        # counts as "not reachable" instead of breaking module import.
        import requests

        version_url = "{}/api/version".format(host.rstrip('/'))
        reachable = requests.get(version_url, timeout=timeout).status_code == 200
    except Exception:
        reachable = False
    return reachable
def _model_name_matches(installed: object, wanted: str) -> bool:
    """Decide whether one installed model name satisfies the requested model.

    Args:
        installed: The ``name`` field of one entry from Ollama's model list
            (anything that is not a string never matches).
        wanted: The requested model, with or without a ``:tag`` suffix.

    Returns:
        * ``wanted`` carries a tag (``"qwen3:14b"``): True only on an exact
          match, so ``"qwen3:8b"`` does not satisfy ``"qwen3:14b"``.
        * ``wanted`` has no tag (``"qwen3"``): True when the part of
          ``installed`` before the first ``:`` equals ``wanted``, i.e. any
          tag of exactly that model is accepted.
    """
    if not isinstance(installed, str) or not wanted:
        return False
    if ':' in wanted:
        return installed == wanted
    return installed.split(':', 1)[0] == wanted


def _ollama_has_model(host: str, model: str, timeout: float = 3.0) -> bool:
    """Check whether the Ollama server at ``host`` lists ``model`` as available.

    Queries ``GET <host>/api/tags`` and compares the ``name`` of each entry
    under the ``models`` key against ``model``. The previous implementation
    compared only the part before ``:`` with ``startswith``, so any model of
    the same family (for example a different size tag) was wrongly accepted.

    Args:
        host: Base URL of the Ollama server.
        model: Requested model name, e.g. ``"qwen3:14b"``.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        True when a listed model matches (see ``_model_name_matches``).
        False on a non-200 reply or on any exception (``requests`` missing,
        network failure, unexpected response payload).
    """
    try:
        # Imported inside the try so a missing ``requests`` package is
        # reported as "model not available" rather than raising.
        import requests

        r = requests.get(f"{host.rstrip('/')}/api/tags", timeout=timeout)
        if r.status_code != 200:
            return False
        listed = r.json().get('models', [])
        return any(_model_name_matches(entry.get('name'), model) for entry in listed)
    except Exception:
        return False
# Opt-in switch: true only when RUN_GOLDEN_SET is exactly the string "1".
_RUN_GOLDEN = os.environ.get('RUN_GOLDEN_SET', '0') == '1'
# Ollama model under test; OPENCLAW_QA_OLLAMA_MODEL overrides the default.
_MODEL = os.environ.get('OPENCLAW_QA_OLLAMA_MODEL', 'qwen3:14b')
# True when GEMINI_API_KEY is present and non-empty.
_HAS_GEMINI = os.environ.get('GEMINI_API_KEY') not in (None, '')
def _resolved_ollama_host() -> str:
    """Return the Ollama host picked by the project's host resolver.

    Delegates to ``services.ollama_service.resolve_ollama_host``. The import
    is performed at call time, not when this module is imported.
    """
    from services.ollama_service import resolve_ollama_host as pick_host

    chosen_host = pick_host()
    return chosen_host
# Module-wide gate: every test in this file is skipped unless _RUN_GOLDEN is
# true. Only that flag is checked here; the reason text also names the other
# prerequisites for the reader's benefit.
_SKIP_REASON = "黃金集需要 RUN_GOLDEN_SET=1 + GCP qwen3:14b ready + GEMINI_API_KEY統帥盲測前才跑"
pytestmark = pytest.mark.skipif(not _RUN_GOLDEN, reason=_SKIP_REASON)
# ─────────────────────────────────────────────────────────────────────────────
# 黃金集(10 題;全部去 PII,情境取自 momo-pro 真實 Telegram 互動模式)
# ─────────────────────────────────────────────────────────────────────────────
# The ten golden-set questions. Each entry carries:
#   id              - stable identifier used in the report and the JSON export
#   question        - the prompt sent verbatim to both models
#   expect_keywords - substrings whose presence in a response counts as a hit
#   category        - label printed next to the question in the report
# Every keyword must be a non-empty string: an empty string is a substring of
# any response and would always be scored as a hit.
GOLDEN_SET: List[Dict] = [
    {
        "id": "g01_weekly_trend",
        "question": "本週 momo 業績趨勢如何?跟上週比?",
        # NOTE(review): this list previously held an empty-string keyword
        # between the two below, presumably a character lost in transit.
        # It was dropped because it matched every response; restore the
        # intended keyword if it is known.
        "expect_keywords": ["業績", "成長"],
        "category": "業績趨勢",
    },
    {
        "id": "g02_competitor_threat",
        "question": "PChome 最近在 3C 類有發動補貼戰嗎?對我們影響?",
        "expect_keywords": ["PChome", "3C"],
        "category": "競品威脅",
    },
    {
        "id": "g03_pricing_strategy",
        "question": "我有一支 SKU 比競品貴 8%,銷量持續下滑,該怎麼辦?",
        "expect_keywords": ["定價", "競品"],
        "category": "定價策略",
    },
    {
        "id": "g04_seasonal",
        "question": "母親節檔期快到了,建議哪些品類加碼?",
        "expect_keywords": ["母親節", "品類"],
        "category": "季節機會",
    },
    {
        "id": "g05_command_routing",
        "question": "我想看完整週報怎麼下指令?",
        "expect_keywords": ["weekly", "週報"],
        "category": "指令導引",
    },
    {
        "id": "g06_top_threats",
        "question": "目前 TOP 5 最緊急的競價威脅是哪些?",
        "expect_keywords": ["威脅", "TOP"],
        "category": "威脅清單",
    },
    {
        "id": "g07_inventory_signal",
        "question": "如何判斷某 SKU 該促銷出清?",
        "expect_keywords": ["促銷", "出清"],
        "category": "庫存決策",
    },
    {
        "id": "g08_cross_category",
        "question": "家電 vs 生活雜貨,哪個品類本月成長動能比較強?",
        "expect_keywords": ["家電", "成長"],
        "category": "品類比較",
    },
    {
        "id": "g09_data_unavailable",
        "question": "幫我看 2030 年的銷售預測。",
        # Expect an honest "insufficient data" answer, not a made-up forecast.
        "expect_keywords": ["資料", "無法"],
        "category": "資料邊界",
    },
    {
        "id": "g10_action_item",
        "question": "綜合本週數據,給我 3 個 48 小時內必做行動。",
        "expect_keywords": ["行動", "建議"],
        "category": "行動清單",
    },
]
# ─────────────────────────────────────────────────────────────────────────────
# Scoring helpers
# ─────────────────────────────────────────────────────────────────────────────
def _count_simplified(text: str) -> int:
    """Count characters of ``text`` that belong to the simplified-hint set.

    The set is ``_SIMPLIFIED_HINT_CHARS`` from
    ``services.openclaw_strategist_service``, imported at call time. Every
    occurrence counts, so a repeated character is counted repeatedly. A
    falsy ``text`` (``None`` or ``""``) yields 0.
    """
    from services.openclaw_strategist_service import _SIMPLIFIED_HINT_CHARS

    flagged = 0
    for ch in text or '':
        if ch in _SIMPLIFIED_HINT_CHARS:
            flagged += 1
    return flagged
def _count_keyword_hits(text: str, keywords: List[str]) -> int:
    """Count how many of ``keywords`` occur as substrings of ``text``.

    Args:
        text: Model response; ``None`` or ``""`` yields 0.
        keywords: Expected substrings; ``None`` is treated as an empty list.

    Returns:
        The number of keyword entries found in ``text``. Each entry counts
        at most once, however often it occurs. Empty keywords are skipped:
        ``"" in text`` is always true, so counting them would award a hit
        to every response regardless of content.
    """
    if not text:
        return 0
    return sum(1 for kw in (keywords or ()) if kw and kw in text)
def _is_refusal(text: str) -> bool:
    """Tell whether ``text`` contains any known refusal pattern.

    The patterns are ``_REFUSAL_PATTERNS`` from
    ``services.openclaw_strategist_service``, imported at call time, and are
    matched as plain substrings. A falsy ``text`` is treated as ``""``.
    """
    from services.openclaw_strategist_service import _REFUSAL_PATTERNS

    haystack = text or ''
    for pattern in _REFUSAL_PATTERNS:
        if pattern in haystack:
            return True
    return False
def _structure_score(text: str) -> Dict[str, int]:
    """Compute rough structural metrics for a model response.

    Args:
        text: Model response; ``None`` or ``""`` yields all-zero metrics.

    Returns:
        A dict with three integer entries:
        * ``lines``   - number of newline characters plus one.
        * ``bullets`` - total occurrences of the list markers below.
        * ``tables``  - number of ``|`` characters.
    """
    if not text:
        return {"lines": 0, "bullets": 0, "tables": 0}
    # Rough list-marker detection by plain substring counting; "1." also
    # matches inside e.g. "21.5", so treat the figure as an approximation.
    # NOTE(review): this tuple previously held an empty string in second
    # position, presumably a glyph lost in transit. ``str.count('')``
    # returns ``len(text) + 1``, which swamped the metric; the bullet glyph
    # below is a best-guess replacement - confirm against the upstream file.
    markers = ('- ', '•', '* ', '1.', '2.', '3.')
    return {
        "lines": text.count('\n') + 1,
        "bullets": sum(text.count(marker) for marker in markers),
        "tables": text.count('|'),
    }
def _score_response(qid: str, question: str, response: str, expect_kw: List[str]) -> Dict:
    """Assemble the per-response metric record used in the report.

    Args:
        qid: Golden-set question id, copied into the record.
        question: Accepted for call-site symmetry; not used in the scoring.
        response: Model response text; ``None`` is tolerated.
        expect_kw: Keywords passed on to ``_count_keyword_hits``.

    Returns:
        A dict with, in this order: ``qid``, ``length``,
        ``simplified_count``, ``keyword_hits``, ``is_refusal``, ``lines``,
        ``bullets``, ``tables`` and ``preview`` (the first 120 characters
        with newlines shown as `` / ``).
    """
    shape = _structure_score(response)
    body = response or ''
    record = {
        "qid": qid,
        "length": len(body),
        "simplified_count": _count_simplified(response),
        "keyword_hits": _count_keyword_hits(response, expect_kw),
        "is_refusal": _is_refusal(response),
    }
    # Copy the structural metrics explicitly to keep the key order stable.
    for key in ("lines", "bullets", "tables"):
        record[key] = shape[key]
    record["preview"] = body[:120].replace('\n', ' / ')
    return record
# ─────────────────────────────────────────────────────────────────────────────
# Caller wrappers (使用 service 的真實函式)
# ─────────────────────────────────────────────────────────────────────────────
def _call_ollama(question: str) -> Optional[str]:
    """Send ``question`` through the service's qwen3 Q&A entry point.

    Calls ``_call_qwen3_qa`` from ``services.openclaw_strategist_service``
    with ``None`` as the second argument and a ``golden-<unix seconds>`` tag
    as the third, returning whatever that function returns.
    NOTE(review): the meaning of the second and third positional arguments
    is defined by the service and is not visible here - confirm there.
    """
    from services.openclaw_strategist_service import _call_qwen3_qa

    run_tag = "golden-%d" % int(time.time())
    return _call_qwen3_qa(question, None, run_tag)
def _call_gemini_baseline(question: str) -> Optional[str]:
    """Ask the Gemini baseline to answer ``question``.

    Calls ``_call_gemini`` from ``services.openclaw_strategist_service`` with
    a fixed system prompt, ``temperature=0.5`` and the caller label
    ``"openclaw_qa_golden"``, and returns its result unchanged.
    """
    from services.openclaw_strategist_service import _call_gemini

    # The prompt is kept in two pieces purely for line length; they are
    # joined without a separator.
    prompt_parts = (
        "你是 MOMO Pro 電商情報策略師「OpenClaw」。以繁體中文台灣用語回覆使用者。",
        "嚴禁簡體字。回覆長度控制在 500 字內,可用 Markdown 條列。",
    )
    baseline_prompt = "".join(prompt_parts)
    return _call_gemini(
        baseline_prompt,
        question,
        temperature=0.5,
        caller="openclaw_qa_golden",
    )
# ─────────────────────────────────────────────────────────────────────────────
# Tests
# ─────────────────────────────────────────────────────────────────────────────
def test_environment_ready():
    """Sanity-check the prerequisites before the golden set is run.

    Asserts, in order, that the resolved Ollama host answers, that it lists
    the configured model, and that a Gemini API key is set. The checks run
    one after another, so a later probe is not attempted once an earlier
    assertion has failed.
    """
    ollama_host = _resolved_ollama_host()

    host_up = _ollama_reachable(ollama_host)
    assert host_up, f"Ollama 主機不可達:{ollama_host}"

    model_listed = _ollama_has_model(ollama_host, _MODEL)
    assert model_listed, (
        f"Ollama 主機 {ollama_host} 尚未拉 {_MODEL}(請先完成 ollama pull"
    )

    assert _HAS_GEMINI, "GEMINI_API_KEY 未設"
def test_golden_set_ab_comparison(capsys):
    """Run every golden-set question through both models and report metrics.

    For each entry of ``GOLDEN_SET`` the question is sent to the Ollama model
    and to the Gemini baseline, both responses are scored with
    ``_score_response``, the per-question metrics are printed to stdout
    (visible with ``pytest -s``) and the full result list is written to
    ``logs/qa_golden_baseline.json`` next to this file.

    Answer correctness is deliberately not asserted; the purpose is to
    collect a quantitative quality baseline. The only hard assertions are:
    * Ollama produced a non-empty response for at least 8 questions and
      Gemini for at least 9.
    * No Gemini response contains simplified-hint characters (a non-zero
      count would suggest the metric itself is misfiring).

    Args:
        capsys: pytest fixture kept in the signature; not used in the body.
    """
    flag_name = 'OPENCLAW_QA_OLLAMA_FIRST'
    saved_flag = os.environ.get(flag_name)
    # Per the original author's note, this flag makes _call_qwen3_qa take its
    # real code path. It is restored afterwards so the setting does not leak
    # into other tests running in the same process.
    os.environ[flag_name] = 'true'
    try:
        rows = []
        for item in GOLDEN_SET:
            qid = item['id']
            question = item['question']
            kws = item['expect_keywords']
            ollama_resp = _call_ollama(question)
            gemini_resp = _call_gemini_baseline(question)
            rows.append({
                'qid': qid,
                'category': item['category'],
                'question': question,
                'ollama': _score_response(qid, question, ollama_resp or '', kws),
                'gemini': _score_response(qid, question, gemini_resp or '', kws),
            })
    finally:
        if saved_flag is None:
            os.environ.pop(flag_name, None)
        else:
            os.environ[flag_name] = saved_flag

    # Keyword totals per question: the denominator printed for a row must be
    # that row's own keyword count, not the first question's.
    kw_totals = {item['id']: len(item['expect_keywords']) for item in GOLDEN_SET}

    # Print the quantitative baseline (only visible with ``pytest -s``).
    print("\n" + "=" * 100)
    print("OpenClaw QA 黃金集 A/B 量化基線Ollama qwen3:14b vs Gemini 2.5 Flash")
    print("=" * 100)
    for r in rows:
        print(f"\n[{r['qid']}] ({r['category']}) {r['question']}")
        for side in ('ollama', 'gemini'):
            s = r[side]
            print(
                f" {side:>7}: len={s['length']:>4} simp={s['simplified_count']:>2} "
                f"kw={s['keyword_hits']}/{kw_totals[r['qid']]} "
                f"lines={s['lines']:>2} refusal={s['is_refusal']}"
            )
            print(f" preview: {s['preview']}")

    # Export the rows as JSON for later analysis.
    out_path = os.path.join(os.path.dirname(__file__), 'logs', 'qa_golden_baseline.json')
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(rows, f, ensure_ascii=False, indent=2)
    print(f"\n基線已存:{out_path}")

    # Hard assertions: a minimal safety net on response rates.
    ollama_responded = sum(1 for r in rows if r['ollama']['length'] > 0)
    gemini_responded = sum(1 for r in rows if r['gemini']['length'] > 0)
    assert ollama_responded >= 8, f"Ollama 回應率過低:{ollama_responded}/10"
    assert gemini_responded >= 9, f"Gemini 回應率過低:{gemini_responded}/10"

    # The Gemini baseline should be free of simplified-hint characters; this
    # doubles as a check that the measurement itself is sound.
    for r in rows:
        assert r['gemini']['simplified_count'] == 0, (
            f"Gemini baseline 簡體污染(指標可能誤判):{r['qid']} {r['gemini']['preview']}"
        )