292 lines
12 KiB
Python
292 lines
12 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
tests/test_openclaw_qa_golden_set.py
|
||
OpenClaw Q&A 黃金集 A/B 對照框架
|
||
(Operation Ollama-First v5.0 — Phase 3, A7 fullstack-engineer)
|
||
|
||
目的:
|
||
在統帥盲測前,先建立 Ollama qwen3:14b vs Gemini 2.5 Flash 的「量化基線」。
|
||
10 題典型 momo 商業 Q&A,雙模型各跑一次,比對:
|
||
- 簡體字污染數量(A2 黃燈警訊核心)
|
||
- 回應長度
|
||
- 結構性指標(行數、列點數)
|
||
- 拒答訊號
|
||
- 黃金關鍵字命中率(題目自帶 expect_keywords)
|
||
|
||
執行:
|
||
RUN_GOLDEN_SET=1 pytest tests/test_openclaw_qa_golden_set.py -v -s
|
||
# GCP 還沒拉 qwen3:14b 之前,預設 SKIP(避免 CI 紅燈)
|
||
|
||
紀律:
|
||
- PII 紀律:題目/答案無真實 chat_id / username / 身份證 / 手機,全部去識別化
|
||
- 不對「正確性」做 hard assert;本框架專做「品質量化基線」收集
|
||
- 報告印到 stdout(pytest -s 顯示),人工檢視,不卡 CI
|
||
"""
|
||
|
||
import json
|
||
import os
|
||
import sys
|
||
import time
|
||
from typing import Dict, List, Optional
|
||
|
||
import pytest
|
||
|
||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
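# Activation conditions:
#   1. RUN_GOLDEN_SET=1 -- the only condition that gates the module-level
#      skip (see ``pytestmark`` below); without it every test is SKIPPED.
#   2. The Ollama host cascade must resolve to a reachable host that has
#      the target model pulled.
#   3. GEMINI_API_KEY must be set.
# Conditions 2 and 3 are not skip conditions: once RUN_GOLDEN_SET=1 they are
# asserted by ``test_environment_ready`` and make it FAIL when unmet.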
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
def _ollama_reachable(host: str, timeout: float = 2.0) -> bool:
|
||
try:
|
||
import requests
|
||
r = requests.get(f"{host.rstrip('/')}/api/version", timeout=timeout)
|
||
return r.status_code == 200
|
||
except Exception:
|
||
return False
|
||
|
||
|
||
def _ollama_has_model(host: str, model: str, timeout: float = 3.0) -> bool:
|
||
"""檢查 Ollama 主機是否已 pull 指定模型。"""
|
||
try:
|
||
import requests
|
||
r = requests.get(f"{host.rstrip('/')}/api/tags", timeout=timeout)
|
||
if r.status_code != 200:
|
||
return False
|
||
tags = r.json().get('models', [])
|
||
return any(m.get('name', '').startswith(model.split(':')[0]) for m in tags)
|
||
except Exception:
|
||
return False
|
||
|
||
|
||
_RUN_GOLDEN = os.getenv('RUN_GOLDEN_SET', '0') == '1'
|
||
_MODEL = os.getenv('OPENCLAW_QA_OLLAMA_MODEL', 'qwen3:14b')
|
||
_HAS_GEMINI = bool(os.getenv('GEMINI_API_KEY'))
|
||
|
||
|
||
def _resolved_ollama_host() -> str:
    """Return whatever ``services.ollama_service.resolve_ollama_host`` yields.

    The service module is imported inside the function, so it is only loaded
    when a test actually asks for the host.
    """
    from services import ollama_service

    return ollama_service.resolve_ollama_host()
|
||
|
||
# Module-wide gate: every test in this file is skipped unless the opt-in
# switch RUN_GOLDEN_SET=1 was set when the module was imported.
_SKIP_REASON = "黃金集需要 RUN_GOLDEN_SET=1 + GCP qwen3:14b ready + GEMINI_API_KEY;統帥盲測前才跑"

pytestmark = pytest.mark.skipif(not _RUN_GOLDEN, reason=_SKIP_REASON)
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# 黃金集(10 題;全部去 PII;情境取自 momo-pro 真實 Telegram 互動模式)
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Each entry carries, in this key order:
#   id              -- stable question identifier
#   question        -- prompt sent to both models
#   expect_keywords -- substrings counted as "golden keyword" hits
#   category        -- label printed in the report
GOLDEN_SET: List[Dict] = [
    dict(
        id="g01_weekly_trend",
        question="本週 momo 業績趨勢如何?跟上週比?",
        expect_keywords=["業績", "週", "成長"],
        category="業績趨勢",
    ),
    dict(
        id="g02_competitor_threat",
        question="PChome 最近在 3C 類有發動補貼戰嗎?對我們影響?",
        expect_keywords=["PChome", "3C"],
        category="競品威脅",
    ),
    dict(
        id="g03_pricing_strategy",
        question="我有一支 SKU 比競品貴 8%,銷量持續下滑,該怎麼辦?",
        expect_keywords=["定價", "競品"],
        category="定價策略",
    ),
    dict(
        id="g04_seasonal",
        question="母親節檔期快到了,建議哪些品類加碼?",
        expect_keywords=["母親節", "品類"],
        category="季節機會",
    ),
    dict(
        id="g05_command_routing",
        question="我想看完整週報怎麼下指令?",
        expect_keywords=["weekly", "週報"],
        category="指令導引",
    ),
    dict(
        id="g06_top_threats",
        question="目前 TOP 5 最緊急的競價威脅是哪些?",
        expect_keywords=["威脅", "TOP"],
        category="威脅清單",
    ),
    dict(
        id="g07_inventory_signal",
        question="如何判斷某 SKU 該促銷出清?",
        expect_keywords=["促銷", "出清"],
        category="庫存決策",
    ),
    dict(
        id="g08_cross_category",
        question="家電 vs 生活雜貨,哪個品類本月成長動能比較強?",
        expect_keywords=["家電", "成長"],
        category="品類比較",
    ),
    dict(
        id="g09_data_unavailable",
        question="幫我看 2030 年的銷售預測。",
        # The model is expected to admit the data is unavailable rather than
        # fabricate a forecast.
        expect_keywords=["資料", "無法"],
        category="資料邊界",
    ),
    dict(
        id="g10_action_item",
        question="綜合本週數據,給我 3 個 48 小時內必做行動。",
        expect_keywords=["行動", "建議"],
        category="行動清單",
    ),
]
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Scoring helpers
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
def _count_simplified(text: str) -> int:
    """Count characters of ``text`` found in the strategist service's
    ``_SIMPLIFIED_HINT_CHARS`` collection.

    ``None`` or an empty string yields 0. Each occurrence counts, so a
    character that appears twice contributes 2.
    """
    from services.openclaw_strategist_service import _SIMPLIFIED_HINT_CHARS

    flagged = 0
    for ch in text or '':
        if ch in _SIMPLIFIED_HINT_CHARS:
            flagged += 1
    return flagged
|
||
|
||
|
||
def _count_keyword_hits(text: str, keywords: List[str]) -> int:
|
||
if not text:
|
||
return 0
|
||
return sum(1 for kw in keywords if kw in text)
|
||
|
||
|
||
def _is_refusal(text: str) -> bool:
    """Return True when ``text`` contains any entry of the strategist
    service's ``_REFUSAL_PATTERNS`` as a substring.

    ``None`` is treated like an empty string.
    """
    from services.openclaw_strategist_service import _REFUSAL_PATTERNS

    haystack = text or ''
    for pattern in _REFUSAL_PATTERNS:
        if pattern in haystack:
            return True
    return False
|
||
|
||
|
||
def _structure_score(text: str) -> Dict[str, int]:
|
||
"""結構性量化指標。"""
|
||
if not text:
|
||
return {"lines": 0, "bullets": 0, "tables": 0}
|
||
return {
|
||
"lines": text.count('\n') + 1,
|
||
# 條列符號粗略偵測(含中文「、」「,」開頭的列點)
|
||
"bullets": sum(text.count(s) for s in ('- ', '• ', '* ', '1.', '2.', '3.')),
|
||
"tables": text.count('|'),
|
||
}
|
||
|
||
|
||
def _score_response(qid: str, question: str, response: str, expect_kw: List[str]) -> Dict:
    """Bundle all per-response metrics into one flat dict.

    Args:
        qid: Question identifier, copied into the result unchanged.
        question: Accepted for signature symmetry; not used in the scoring.
        response: Model output; ``None`` is scored like an empty string.
        expect_kw: Keywords whose presence is counted as ``keyword_hits``.

    Returns:
        A dict with the keys ``qid``, ``length``, ``simplified_count``,
        ``keyword_hits``, ``is_refusal``, ``lines``, ``bullets``, ``tables``
        and ``preview`` (first 120 characters, newlines shown as `` / ``).
    """
    body = response or ''
    shape = _structure_score(response)

    scored: Dict = {}
    scored["qid"] = qid
    scored["length"] = len(body)
    scored["simplified_count"] = _count_simplified(response)
    scored["keyword_hits"] = _count_keyword_hits(response, expect_kw)
    scored["is_refusal"] = _is_refusal(response)
    # Copy the structural metrics one by one to keep the key order stable
    # in the exported JSON.
    for metric in ("lines", "bullets", "tables"):
        scored[metric] = shape[metric]
    scored["preview"] = body[:120].replace('\n', ' / ')
    return scored
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Caller wrappers (使用 service 的真實函式)
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
def _call_ollama(question: str) -> Optional[str]:
    """Send ``question`` through the strategist service's ``_call_qwen3_qa``.

    The second positional argument is passed as ``None`` and the third is a
    ``"golden-<unix seconds>"`` tag. NOTE(review): their meaning is defined
    by the service, not visible here -- presumably context and a trace/request
    id; confirm against ``_call_qwen3_qa``.
    """
    from services.openclaw_strategist_service import _call_qwen3_qa

    run_tag = "golden-" + str(int(time.time()))
    return _call_qwen3_qa(question, None, run_tag)
|
||
|
||
|
||
def _call_gemini_baseline(question: str) -> Optional[str]:
    """Ask the Gemini baseline the same ``question`` via the strategist
    service's ``_call_gemini``.

    A fixed system prompt (persona, Traditional Chinese only, 500-character
    budget, Markdown lists allowed) is sent with ``temperature=0.5`` and the
    caller tag ``"openclaw_qa_golden"``.
    """
    from services.openclaw_strategist_service import _call_gemini

    persona_rule = "你是 MOMO Pro 電商情報策略師「OpenClaw」。以繁體中文(台灣用語)回覆使用者。"
    format_rule = "嚴禁簡體字。回覆長度控制在 500 字內,可用 Markdown 條列。"
    system_prompt = persona_rule + format_rule

    return _call_gemini(
        system_prompt,
        question,
        temperature=0.5,
        caller="openclaw_qa_golden",
    )
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Tests
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
def test_environment_ready():
    """Sanity check before the golden set runs.

    Verifies, in order: the resolved Ollama host answers, that host lists the
    configured model, and GEMINI_API_KEY is set.
    """
    ollama_host = _resolved_ollama_host()

    host_is_up = _ollama_reachable(ollama_host)
    assert host_is_up, f"Ollama 主機不可達:{ollama_host}"

    model_is_pulled = _ollama_has_model(ollama_host, _MODEL)
    assert model_is_pulled, (
        f"Ollama 主機 {ollama_host} 尚未拉 {_MODEL}(請先完成 ollama pull)"
    )

    assert _HAS_GEMINI, "GEMINI_API_KEY 未設"
|
||
|
||
|
||
def test_golden_set_ab_comparison(capsys):
    """Run every golden-set question through both models and report metrics.

    This test does not judge answer correctness; it collects a quantitative
    quality baseline. The report is printed to stdout (visible with
    ``pytest -s``) and exported as JSON to ``logs/qa_golden_baseline.json``
    next to this file.

    Hard assertions (minimal safety net):
      - Ollama produced a non-empty answer for at least 8 questions.
      - Gemini produced a non-empty answer for at least 9 questions.
      - No Gemini answer contains simplified-Chinese hint characters. The
        baseline is expected to be clean, so a hit suggests the metric
        itself is misjudging.
    """
    # Per the original author's note, this flag makes ``_call_qwen3_qa`` take
    # its real code path. The previous value is restored afterwards so the
    # setting does not leak into tests that run later in the same process.
    flag_name = 'OPENCLAW_QA_OLLAMA_FIRST'
    saved_flag = os.environ.get(flag_name)
    os.environ[flag_name] = 'true'

    rows = []
    try:
        for item in GOLDEN_SET:
            qid = item['id']
            question = item['question']
            kws = item['expect_keywords']

            ollama_resp = _call_ollama(question)
            gemini_resp = _call_gemini_baseline(question)

            rows.append({
                'qid': qid,
                'category': item['category'],
                'question': question,
                'ollama': _score_response(qid, question, ollama_resp or '', kws),
                'gemini': _score_response(qid, question, gemini_resp or '', kws),
            })
    finally:
        if saved_flag is None:
            os.environ.pop(flag_name, None)
        else:
            os.environ[flag_name] = saved_flag

    # Denominator for the "kw=hits/total" column must be each question's own
    # keyword count; the questions do not all carry the same number.
    kw_totals = {item['id']: len(item['expect_keywords']) for item in GOLDEN_SET}

    # Print the quantitative baseline (only visible with ``pytest -s``).
    print("\n" + "=" * 100)
    print("OpenClaw QA 黃金集 A/B 量化基線(Ollama qwen3:14b vs Gemini 2.5 Flash)")
    print("=" * 100)
    for r in rows:
        print(f"\n[{r['qid']}] ({r['category']}) {r['question']}")
        for side in ('ollama', 'gemini'):
            s = r[side]
            print(
                f" {side:>7}: len={s['length']:>4} simp={s['simplified_count']:>2} "
                f"kw={s['keyword_hits']}/{kw_totals[r['qid']]} "
                f"lines={s['lines']:>2} refusal={s['is_refusal']}"
            )
            print(f" preview: {s['preview']}")

    # Export JSON for later analysis. This happens before the assertions so
    # the data is kept even when the run fails.
    out_path = os.path.join(os.path.dirname(__file__), 'logs', 'qa_golden_baseline.json')
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(rows, f, ensure_ascii=False, indent=2)
    print(f"\n基線已存:{out_path}")

    # Hard assertions (minimal safety net).
    ollama_responded = sum(1 for r in rows if r['ollama']['length'] > 0)
    gemini_responded = sum(1 for r in rows if r['gemini']['length'] > 0)
    assert ollama_responded >= 8, f"Ollama 回應率過低:{ollama_responded}/10"
    assert gemini_responded >= 9, f"Gemini 回應率過低:{gemini_responded}/10"

    # The Gemini baseline must be free of simplified-Chinese hits; this
    # doubles as a check that the measurement itself is sound.
    for r in rows:
        assert r['gemini']['simplified_count'] == 0, (
            f"Gemini baseline 簡體污染(指標可能誤判):{r['qid']} {r['gemini']['preview']}"
        )
|