Files
awoooi/apps/api/tests/test_model_regression.py
OG T 30153496d1 fix(api): 修復全部 lint 錯誤 (ruff --fix)
- Import sorting (I001)
- Unused imports (F401)
- f-string without placeholders (F541)
- Loop variable unused (B007)
- zip() strict parameter (B905)
- Exception chaining (B904)
- collections.abc imports (UP035)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-26 16:06:20 +08:00

204 lines
6.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Phase 12.3: Model Regression Tests (#68)
=========================================
自動化模型回歸測試,確保 OpenClaw 提案品質
測試維度:
1. 回應語言 (繁體中文)
2. 命令格式 (kubectl)
3. 風險評估準確度
"""
import os
from typing import Any
import httpx
import pytest
# Ollama server configuration.
# OLLAMA_URL and DEFAULT_MODEL can be overridden via the OLLAMA_URL / OLLAMA_MODEL
# environment variables; the literals below are only the fallbacks.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://192.168.0.188:11434")
DEFAULT_MODEL = os.getenv("OLLAMA_MODEL", "qwen2.5:7b-instruct")
# NOTE(review): the figure quoted in the trailing comment (~222-666 s) exceeds the
# 300 s value configured here -- confirm the timeout is intentional.
TIMEOUT = 300 # seconds (CPU inference mode needs ~222-666 s; see the 2026-03-26 evaluation)
# =============================================================================
# 測試案例定義
# =============================================================================
# Regression cases sent to the model.
# Each entry has:
#   name        - short label, also used as the pytest parametrize id
#   prompt      - text sent verbatim to the model
#   validators  - callables taking the response text and returning a truthy value on success
#   description - human-readable expectation shown when a validator fails
REGRESSION_CASES = [
    {
        "name": "中文告警分析",
        "prompt": "你是 AIOps 助手。分析告警:服務 awoooi-api CPU 95%。建議修復行動用繁體中文50字內。",
        "validators": [
            # Response must contain at least one action keyword.
            # The list previously included an empty string; "" is a substring of
            # every string, so the check could never fail. It has been removed.
            lambda r: any(c in r for c in ["建議", "執行", "重啟", "優化"]),
            # Response must stay concise.
            lambda r: len(r) < 200,
        ],
        "description": "應包含中文行動建議",
    },
    {
        "name": "kubectl 命令生成",
        "prompt": "生成 kubectl 命令來重啟 deployment api-backend。只輸出命令不要解釋。",
        "validators": [
            # Response must mention kubectl (case-insensitive).
            lambda r: "kubectl" in r.lower(),
            # Response must contain a restart-style action.
            lambda r: "restart" in r.lower() or "rollout" in r.lower(),
        ],
        "description": "應生成有效 kubectl 命令",
    },
    {
        "name": "風險評估",
        "prompt": "評估風險等級:刪除 production namespace 中的 Pod。只回答 LOW/MEDIUM/HIGH/CRITICAL 其中一個。",
        "validators": [
            # The operation must be rated HIGH or CRITICAL (case-insensitive).
            lambda r: any(level in r.upper() for level in ["HIGH", "CRITICAL"]),
        ],
        "description": "應識別為高風險操作",
    },
    {
        "name": "數學推理",
        "prompt": "計算:如果 CPU 使用率從 60% 上升到 95%,上升了多少百分點?只回答數字。",
        "validators": [
            # The expected answer, 35, must appear in the response.
            lambda r: "35" in r,
        ],
        "description": "應正確計算 35",
    },
]
# =============================================================================
# 輔助函數
# =============================================================================
async def call_ollama(prompt: str, model: str = DEFAULT_MODEL) -> str | None:
    """Send a single non-streaming generate request to Ollama.

    The request pins ``temperature`` to 0.0 and ``seed`` to 42 so that
    repeated runs ask the model for deterministic, reproducible output
    (Phase 12.3 change, 2026-03-26).

    Args:
        prompt: Text sent to the model as-is.
        model: Model name to query; defaults to ``DEFAULT_MODEL``.

    Returns:
        The ``"response"`` field of the JSON reply (``""`` when the field is
        absent), or ``None`` if anything raised during the call. The failure
        is printed rather than propagated.
    """
    endpoint = f"{OLLAMA_URL}/api/generate"
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,  # deterministic output
            "seed": 42,  # reproducibility
        },
    }
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as session:
            reply = await session.post(endpoint, json=payload)
            reply.raise_for_status()
            body = reply.json()
            return body.get("response", "")
    except Exception as exc:
        # Best-effort helper: callers treat None as "no answer from the model".
        print(f"Ollama 呼叫失敗: {exc}")
        return None
async def check_ollama_available() -> bool:
    """Report whether the Ollama server answers its ``/api/tags`` endpoint.

    Returns:
        ``True`` only when the GET request completes with HTTP status 200;
        ``False`` for any other status or if the request raises.
    """
    tags_url = f"{OLLAMA_URL}/api/tags"
    try:
        # Short 5-second timeout: this is only a reachability probe.
        async with httpx.AsyncClient(timeout=5) as session:
            probe = await session.get(tags_url)
            reachable = probe.status_code == 200
    except Exception:
        return False
    return reachable
# =============================================================================
# 測試類別
# =============================================================================
class TestModelRegression:
    """Model regression tests: one parametrized run per entry in REGRESSION_CASES."""

    # NOTE(review): this is an async fixture declared with plain @pytest.fixture;
    # whether it is awaited depends on the pytest-asyncio mode the project
    # configures -- confirm against the pytest configuration.
    @pytest.fixture(autouse=True)
    async def check_ollama(self):
        """Skip the current test when the Ollama server is unreachable."""
        if await check_ollama_available():
            return
        pytest.skip(f"Ollama 無法連線: {OLLAMA_URL}")

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "case",
        REGRESSION_CASES,
        ids=[entry["name"] for entry in REGRESSION_CASES],
    )
    async def test_regression_case(self, case: dict[str, Any]):
        """Query the model with the case prompt and apply every validator to the answer."""
        answer = await call_ollama(case["prompt"])

        assert answer is not None, f"模型無回應: {case['name']}"
        assert len(answer) > 0, f"回應為空: {case['name']}"

        # Validators are reported 1-based in the failure message.
        for ordinal, check in enumerate(case["validators"], start=1):
            assert check(answer), (
                f"驗證失敗 [{case['name']}] 驗證器 {ordinal}: {case['description']}\n"
                f"回應: {answer[:200]}"
            )
# =============================================================================
# 準確度報告
# =============================================================================
@pytest.mark.asyncio
async def test_regression_report():
    """Run every regression case, print a summary report, and enforce the baseline.

    Skips when Ollama is unreachable. A case counts as failed when the model
    returns no response or when any validator rejects the response. The test
    fails if the pass rate drops below 75%.
    """
    if not await check_ollama_available():
        pytest.skip(f"Ollama 無法連線: {OLLAMA_URL}")

    results = []
    for case in REGRESSION_CASES:
        answer = await call_ollama(case["prompt"])
        if answer is None:
            results.append({"name": case["name"], "status": "ERROR", "reason": "無回應"})
        elif all(check(answer) for check in case["validators"]):
            # all() stops at the first rejecting validator.
            results.append({"name": case["name"], "status": "PASS"})
        else:
            results.append({
                "name": case["name"],
                "status": "FAIL",
                "response": answer[:100],
            })

    total = len(results)
    passed = sum(1 for entry in results if entry["status"] == "PASS")
    failed = total - passed
    accuracy = (passed / total * 100) if total > 0 else 0

    rule = "=" * 60
    print("\n" + rule)
    print("Phase 12.3: 模型回歸測試報告")
    print(rule)
    print(f"模型: {DEFAULT_MODEL}")
    print(f"總案例: {total}")
    print(f"通過: {passed}")
    print(f"失敗: {failed}")
    print(f"準確率: {accuracy:.1f}%")
    print(rule)

    if failed > 0:
        print("\n失敗案例:")
        for entry in results:
            if entry["status"] == "PASS":
                continue
            # ERROR entries carry "reason"; FAIL entries carry a truncated "response".
            detail = entry.get("reason", entry.get("response", ""))
            print(f" - {entry['name']}: {detail}")

    # Baseline threshold: 75%
    assert accuracy >= 75, f"準確率 {accuracy}% 低於基線 75%"
# Allow running this file directly: hand it to pytest with verbose output
# and short tracebacks.
if __name__ == "__main__":
    pytest.main([__file__, "-v", "--tb=short"])