206 lines
6.8 KiB
Python
206 lines
6.8 KiB
Python
"""
|
||
Phase 12.3: Model Regression Tests (#68)
=========================================
Automated model regression tests that guard the quality of OpenClaw proposals.

Test dimensions:
1. Response language (Traditional Chinese)
2. Command format (kubectl)
3. Risk assessment accuracy
|
||
"""
|
||
|
||
import os
|
||
from typing import Any
|
||
|
||
import httpx
|
||
import pytest
|
||
|
||
# Ollama server configuration. Every value can be overridden through the
# environment so the suite can target a different host, model, or time budget.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://192.168.0.111:11434")
DEFAULT_MODEL = os.getenv("OLLAMA_MODEL", "qwen2.5:7b-instruct")
# Per-request timeout in seconds. The 2026-03-26 evaluation noted that CPU-only
# inference needs roughly 222-666 s, which can exceed the 300 s default, so
# slower hosts can raise the limit with OLLAMA_TIMEOUT instead of editing code.
TIMEOUT = float(os.getenv("OLLAMA_TIMEOUT", "300"))
|
||
|
||
|
||
# =============================================================================
# Test case definitions
# =============================================================================
|
||
|
||
def _mentions_any(*needles, fold=None):
    """Build a validator that accepts a reply containing any of *needles*.

    When *fold* names a string method ("lower" or "upper"), the reply is
    passed through that method before the substring search.
    """

    def check(reply):
        text = getattr(reply, fold)() if fold else reply
        return any(needle in text for needle in needles)

    return check


# Each case carries a display name, the prompt sent to the model, a list of
# validators (callables taking the response text and returning a truthy value
# on success), and a description used in failure messages.
REGRESSION_CASES = [
    {
        "name": "中文告警分析",
        "prompt": "你是 AIOps 助手。分析告警:服務 awoooi-api CPU 95%。建議修復行動,用繁體中文,50字內。",
        "validators": [
            # The reply must contain one of the action-suggestion keywords.
            _mentions_any("建議", "執行", "擴", "重啟", "優化"),
            # The reply must stay concise.
            lambda reply: len(reply) < 200,
        ],
        "description": "應包含中文行動建議",
    },
    {
        "name": "kubectl 命令生成",
        "prompt": "生成 kubectl 命令來重啟 deployment api-backend。只輸出命令,不要解釋。",
        "validators": [
            # The reply must mention kubectl (case-insensitive).
            _mentions_any("kubectl", fold="lower"),
            # The reply must name a restart action (case-insensitive).
            _mentions_any("restart", "rollout", fold="lower"),
        ],
        "description": "應生成有效 kubectl 命令",
    },
    {
        "name": "風險評估",
        "prompt": "評估風險等級:刪除 production namespace 中的 Pod。只回答 LOW/MEDIUM/HIGH/CRITICAL 其中一個。",
        "validators": [
            # The operation must be rated as high risk.
            _mentions_any("HIGH", "CRITICAL", fold="upper"),
        ],
        "description": "應識別為高風險操作",
    },
    {
        "name": "數學推理",
        "prompt": "計算:如果 CPU 使用率從 60% 上升到 95%,上升了多少百分點?只回答數字。",
        "validators": [
            # The expected answer is 35.
            _mentions_any("35"),
        ],
        "description": "應正確計算 35",
    },
]
|
||
|
||
|
||
# =============================================================================
# Helper functions
# =============================================================================
|
||
|
||
async def call_ollama(prompt: str, model: str = DEFAULT_MODEL) -> str | None:
    """Send one non-streaming generate request to Ollama and return its text.

    Phase 12.3 revision (2026-03-26): the request pins ``temperature`` to 0.0
    and ``seed`` to 42, which is intended to make output deterministic and
    reproducible.

    Args:
        prompt: Prompt text forwarded to the model.
        model: Ollama model name; defaults to ``DEFAULT_MODEL``.

    Returns:
        The ``response`` field of the JSON reply (``""`` when the field is
        absent), or ``None`` when the request, the status check, or the JSON
        decoding raised.
    """
    endpoint = f"{OLLAMA_URL}/api/generate"
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,  # deterministic output
            "seed": 42,  # reproducibility
        },
    }
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            reply = await client.post(endpoint, json=payload)
            reply.raise_for_status()
            return reply.json().get("response", "")
    except Exception as exc:
        # Best effort: report the failure and let the caller treat None as
        # "no response".
        print(f"Ollama 呼叫失敗: {exc}")
        return None
|
||
|
||
|
||
async def check_ollama_available() -> bool:
    """Report whether ``GET {OLLAMA_URL}/api/tags`` answers with HTTP 200.

    Uses a 5-second timeout. Any exception raised while probing is treated
    as "not available".
    """
    tags_endpoint = f"{OLLAMA_URL}/api/tags"
    try:
        async with httpx.AsyncClient(timeout=5) as client:
            probe = await client.get(tags_endpoint)
            reachable = probe.status_code == 200
    except Exception:
        return False
    return reachable
|
||
|
||
|
||
# =============================================================================
# Test classes
# =============================================================================
|
||
|
||
@pytest.mark.integration
class TestModelRegression:
    """Model regression tests that need a reachable Ollama server.

    The server address comes from ``OLLAMA_URL``; each case is skipped when
    the server cannot be reached.
    """

    @pytest.mark.asyncio
    @pytest.mark.parametrize("case", REGRESSION_CASES, ids=[c["name"] for c in REGRESSION_CASES])
    async def test_regression_case(self, case: dict[str, Any]):
        """Run one regression case and apply each of its validators.

        Args:
            case: An entry of ``REGRESSION_CASES`` providing ``name``,
                ``prompt``, ``validators`` and ``description``.
        """
        # The availability probe runs inside the test body rather than in an
        # ``async def`` fixture declared with plain ``@pytest.fixture``: in
        # pytest-asyncio strict mode such a fixture is never awaited, so the
        # skip would silently not happen and the test would fail instead.
        if not await check_ollama_available():
            pytest.skip(f"Ollama 無法連線: {OLLAMA_URL}")

        response = await call_ollama(case["prompt"])

        assert response is not None, f"模型無回應: {case['name']}"
        assert len(response) > 0, f"回應為空: {case['name']}"

        # Apply every validator; the message names the first one that fails.
        for index, validator in enumerate(case["validators"], start=1):
            assert validator(response), (
                f"驗證失敗 [{case['name']}] 驗證器 {index}: {case['description']}\n"
                f"回應: {response[:200]}"
            )
|
||
|
||
|
||
# =============================================================================
# Accuracy report
# =============================================================================
|
||
|
||
@pytest.mark.integration
@pytest.mark.asyncio
async def test_regression_report():
    """Run every regression case once and print an accuracy summary.

    The test is skipped when Ollama is unreachable. A case passes only when
    all of its validators accept the response; a ``None`` response is recorded
    as an error. The test fails when accuracy is below the 75% baseline.
    """
    if not await check_ollama_available():
        pytest.skip(f"Ollama 無法連線: {OLLAMA_URL}")

    results = []
    for case in REGRESSION_CASES:
        response = await call_ollama(case["prompt"])

        if response is None:
            outcome = {"name": case["name"], "status": "ERROR", "reason": "無回應"}
        elif all(check(response) for check in case["validators"]):
            outcome = {"name": case["name"], "status": "PASS"}
        else:
            outcome = {
                "name": case["name"],
                "status": "FAIL",
                "response": response[:100],
            }
        results.append(outcome)

    # Every case lands in exactly one bucket, so the counters can be derived
    # from the collected outcomes.
    total = len(results)
    passed = sum(1 for entry in results if entry["status"] == "PASS")
    failed = total - passed
    if total > 0:
        accuracy = passed / total * 100
    else:
        accuracy = 0

    banner = "=" * 60
    summary_lines = (
        "\n" + banner,
        "Phase 12.3: 模型回歸測試報告",
        banner,
        f"模型: {DEFAULT_MODEL}",
        f"總案例: {total}",
        f"通過: {passed}",
        f"失敗: {failed}",
        f"準確率: {accuracy:.1f}%",
        banner,
    )
    for line in summary_lines:
        print(line)

    if failed > 0:
        print("\n失敗案例:")
        for entry in results:
            if entry["status"] == "PASS":
                continue
            # Error entries carry a reason; failed entries carry the
            # truncated response.
            detail = entry.get("reason", entry.get("response", ""))
            print(f" - {entry['name']}: {detail}")

    # Baseline threshold: 75%.
    assert accuracy >= 75, f"準確率 {accuracy}% 低於基線 75%"
|
||
|
||
|
||
# When the file is executed directly, hand it to pytest with verbose output
# and short tracebacks.
if __name__ == "__main__":
    pytest.main([__file__, "-v", "--tb=short"])
|