"""Phase 12.3: Model Regression Tests (#68).

Automated model regression tests that guard the quality of OpenClaw proposals.

Test dimensions:
    1. Response language (Traditional Chinese)
    2. Command format (kubectl)
    3. Risk assessment accuracy
"""

import os
from typing import Any

import httpx
import pytest

# Ollama server configuration; every value can be overridden via environment.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://192.168.0.188:11434")
DEFAULT_MODEL = os.getenv("OLLAMA_MODEL", "qwen2.5:7b-instruct")
# Per-request timeout in seconds. The 2026-03-26 evaluation noted that CPU
# inference needs roughly 222-666 s, which can exceed the 300 s default, so
# slow hosts can raise the limit with OLLAMA_TIMEOUT.
TIMEOUT = float(os.getenv("OLLAMA_TIMEOUT", "300"))

# Timeout in seconds for the lightweight availability probe.
_PROBE_TIMEOUT = 5
# Endpoint used only to check that the server answers at all.
_TAGS_URL = f"{OLLAMA_URL}/api/tags"

# =============================================================================
# Test case definitions
# =============================================================================

# Each case carries:
#   name        - pytest id and label used in the report
#   prompt      - text sent verbatim to the model
#   validators  - predicates over the response text; all must return True
#   description - human-readable expectation shown on failure
REGRESSION_CASES: list[dict[str, Any]] = [
    {
        "name": "中文告警分析",
        "prompt": "你是 AIOps 助手。分析告警:服務 awoooi-api CPU 95%。建議修復行動,用繁體中文,50字內。",
        "validators": [
            # Contains at least one action-oriented keyword.
            lambda r: any(c in r for c in ["建議", "執行", "擴", "重啟", "優化"]),
            # Response stays concise.
            lambda r: len(r) < 200,
        ],
        "description": "應包含中文行動建議",
    },
    {
        "name": "kubectl 命令生成",
        "prompt": "生成 kubectl 命令來重啟 deployment api-backend。只輸出命令,不要解釋。",
        "validators": [
            # Mentions kubectl.
            lambda r: "kubectl" in r.lower(),
            # Mentions a restart action.
            lambda r: "restart" in r.lower() or "rollout" in r.lower(),
        ],
        "description": "應生成有效 kubectl 命令",
    },
    {
        "name": "風險評估",
        "prompt": "評估風險等級:刪除 production namespace 中的 Pod。只回答 LOW/MEDIUM/HIGH/CRITICAL 其中一個。",
        "validators": [
            # Correctly classified as high risk.
            lambda r: any(level in r.upper() for level in ["HIGH", "CRITICAL"]),
        ],
        "description": "應識別為高風險操作",
    },
    {
        "name": "數學推理",
        "prompt": "計算:如果 CPU 使用率從 60% 上升到 95%,上升了多少百分點?只回答數字。",
        "validators": [
            # Correct answer appears in the response.
            lambda r: "35" in r,
        ],
        "description": "應正確計算 35",
    },
]


# =============================================================================
# Helper functions
# =============================================================================


async def call_ollama(prompt: str, model: str = DEFAULT_MODEL) -> str | None:
    """Send *prompt* to the Ollama generate API and return the response text.

    The request pins ``temperature`` to 0.0 and ``seed`` to 42 (Phase 12.3
    fix, 2026-03-26) so that outputs are deterministic and reproducible.

    Args:
        prompt: Prompt text sent to the model.
        model: Ollama model name; defaults to ``DEFAULT_MODEL``.

    Returns:
        The ``response`` field of the JSON reply (``""`` when the field is
        absent), or ``None`` when the call fails for any reason.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,  # deterministic output
            "seed": 42,  # reproducibility
        },
    }
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            response = await client.post(f"{OLLAMA_URL}/api/generate", json=payload)
            response.raise_for_status()
            return response.json().get("response", "")
    except Exception as e:
        # Deliberately broad: callers treat None as "no answer" and either
        # fail the assertion or record an ERROR entry in the report.
        print(f"Ollama 呼叫失敗: {e}")
        return None


async def check_ollama_available() -> bool:
    """Return True when the Ollama tags endpoint answers with HTTP 200.

    Any exception raised while probing is treated as "not available".
    """
    try:
        async with httpx.AsyncClient(timeout=_PROBE_TIMEOUT) as client:
            response = await client.get(_TAGS_URL)
            return response.status_code == 200
    except Exception:
        # Best-effort probe: an unreachable server must lead to a skip,
        # never to a test error.
        return False


def _ollama_available_sync() -> bool:
    """Blocking twin of ``check_ollama_available`` for synchronous fixtures."""
    try:
        return httpx.get(_TAGS_URL, timeout=_PROBE_TIMEOUT).status_code == 200
    except Exception:
        return False


def _passes_all_validators(case: dict[str, Any], response: str) -> bool:
    """Return True when every validator of *case* accepts *response*."""
    return all(validator(response) for validator in case["validators"])


# =============================================================================
# Test classes
# =============================================================================


@pytest.mark.integration
class TestModelRegression:
    """Model regression tests; require an Ollama service (192.168.0.188:11434)."""

    @pytest.fixture(autouse=True)
    def check_ollama(self):
        """Skip the test when Ollama cannot be reached.

        The probe is synchronous on purpose: a coroutine fixture declared
        with plain ``@pytest.fixture`` is only awaited when pytest-asyncio
        runs in auto mode, so a blocking check keeps the skip effective
        whatever asyncio mode the project configures.
        """
        if not _ollama_available_sync():
            pytest.skip(f"Ollama 無法連線: {OLLAMA_URL}")

    @pytest.mark.asyncio
    @pytest.mark.parametrize("case", REGRESSION_CASES, ids=[c["name"] for c in REGRESSION_CASES])
    async def test_regression_case(self, case: dict[str, Any]):
        """Run one regression case and apply each of its validators."""
        response = await call_ollama(case["prompt"])

        assert response is not None, f"模型無回應: {case['name']}"
        assert len(response) > 0, f"回應為空: {case['name']}"

        # Run validators one by one so the failure names the offending one.
        for i, validator in enumerate(case["validators"]):
            assert validator(response), (
                f"驗證失敗 [{case['name']}] 驗證器 {i+1}: {case['description']}\n"
                f"回應: {response[:200]}"
            )


# =============================================================================
# Accuracy report
# =============================================================================


@pytest.mark.integration
@pytest.mark.asyncio
async def test_regression_report():
    """Run every case, print a summary report, and enforce the 75% baseline.

    A case with no response is recorded as ERROR and counts as failed; a case
    with at least one rejecting validator is recorded as FAIL together with
    the first 100 characters of the response.
    """
    available = await check_ollama_available()
    if not available:
        pytest.skip(f"Ollama 無法連線: {OLLAMA_URL}")

    passed = 0
    failed = 0
    results: list[dict[str, str]] = []

    for case in REGRESSION_CASES:
        response = await call_ollama(case["prompt"])

        if response is None:
            failed += 1
            results.append({"name": case["name"], "status": "ERROR", "reason": "無回應"})
            continue

        if _passes_all_validators(case, response):
            passed += 1
            results.append({"name": case["name"], "status": "PASS"})
        else:
            failed += 1
            results.append({
                "name": case["name"],
                "status": "FAIL",
                "response": response[:100],
            })

    total = passed + failed
    accuracy = (passed / total * 100) if total > 0 else 0

    print("\n" + "=" * 60)
    print("Phase 12.3: 模型回歸測試報告")
    print("=" * 60)
    print(f"模型: {DEFAULT_MODEL}")
    print(f"總案例: {total}")
    print(f"通過: {passed}")
    print(f"失敗: {failed}")
    print(f"準確率: {accuracy:.1f}%")
    print("=" * 60)

    if failed > 0:
        print("\n失敗案例:")
        for r in results:
            if r["status"] != "PASS":
                # ERROR entries carry "reason"; FAIL entries carry "response".
                print(f" - {r['name']}: {r.get('reason', r.get('response', ''))}")

    # Baseline threshold: 75%.
    assert accuracy >= 75, f"準確率 {accuracy}% 低於基線 75%"


if __name__ == "__main__":
    pytest.main([__file__, "-v", "--tb=short"])