# awoooi/apps/api/tests/test_model_regression.py

"""
Phase 12.3: Model Regression Tests (#68)
=========================================
自動化模型回歸測試，確保 OpenClaw 提案品質

測試維度:
1. 回應語言 (繁體中文)
2. 命令格式 (kubectl)
3. 風險評估準確度
"""

import os
from typing import Any

import httpx
import pytest

# Ollama server configuration. Every value can be overridden through the
# environment so the suite can target a different host, model, or hardware.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://192.168.0.188:11434")
DEFAULT_MODEL = os.getenv("OLLAMA_MODEL", "qwen2.5:7b-instruct")
# Per-request timeout in seconds. CPU inference mode needs roughly 222-666 s
# (see the 2026-03-26 evaluation), which can exceed the 300 s default, so slow
# hosts can raise the limit with OLLAMA_TIMEOUT instead of editing this file.
TIMEOUT = float(os.getenv("OLLAMA_TIMEOUT", "300"))


# =============================================================================
# 測試案例定義
# =============================================================================

def _mentions_any(keywords, normalize=None):
    """Build a validator that passes when the reply contains any keyword.

    ``normalize`` is an optional ``str`` transform (for example ``str.lower``)
    applied to the reply before the substring search.
    """

    def _check(reply):
        haystack = reply if normalize is None else normalize(reply)
        return any(keyword in haystack for keyword in keywords)

    return _check


def _shorter_than(limit):
    """Build a validator that passes for replies under ``limit`` characters."""

    def _check(reply):
        return len(reply) < limit

    return _check


# Each case carries the prompt sent to the model, a list of validators (each a
# callable taking the reply text and returning a truthy value on success), and
# a description that is shown when a validator rejects the reply.
REGRESSION_CASES = [
    dict(
        name="中文告警分析",
        prompt="你是 AIOps 助手。分析告警：服務 awoooi-api CPU 95%。建議修復行動，用繁體中文，50字內。",
        validators=[
            # Reply must contain at least one action-oriented keyword.
            _mentions_any(["建議", "執行", "擴", "重啟", "優化"]),
            # Reply must stay concise.
            _shorter_than(200),
        ],
        description="應包含中文行動建議",
    ),
    dict(
        name="kubectl 命令生成",
        prompt="生成 kubectl 命令來重啟 deployment api-backend。只輸出命令，不要解釋。",
        validators=[
            # Reply must mention kubectl (case-insensitive).
            _mentions_any(["kubectl"], normalize=str.lower),
            # Reply must mention a restart action (case-insensitive).
            _mentions_any(["restart", "rollout"], normalize=str.lower),
        ],
        description="應生成有效 kubectl 命令",
    ),
    dict(
        name="風險評估",
        prompt="評估風險等級：刪除 production namespace 中的 Pod。只回答 LOW/MEDIUM/HIGH/CRITICAL 其中一個。",
        validators=[
            # Reply must classify the operation as high risk (case-insensitive).
            _mentions_any(["HIGH", "CRITICAL"], normalize=str.upper),
        ],
        description="應識別為高風險操作",
    ),
    dict(
        name="數學推理",
        prompt="計算：如果 CPU 使用率從 60% 上升到 95%，上升了多少百分點？只回答數字。",
        validators=[
            # Reply must contain the expected answer.
            _mentions_any(["35"]),
        ],
        description="應正確計算 35",
    ),
]


# =============================================================================
# 輔助函數
# =============================================================================

async def call_ollama(prompt: str, model: str = DEFAULT_MODEL) -> str | None:
    """Send one non-streaming generation request to the Ollama API.

    Sampling options are pinned (``temperature`` 0.0 and ``seed`` 42, added in
    the Phase 12.3 fix of 2026-03-26) with the intent of making the output
    deterministic and reproducible across runs.

    Args:
        prompt: Prompt text forwarded to the model.
        model: Ollama model name; defaults to ``DEFAULT_MODEL``.

    Returns:
        The ``"response"`` field of the JSON reply (``""`` when the field is
        missing), or ``None`` if anything raised while sending the request or
        reading the reply.
    """
    endpoint = f"{OLLAMA_URL}/api/generate"
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": 0.0,  # deterministic output
            "seed": 42,  # reproducibility
        },
    }

    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as session:
            reply = await session.post(endpoint, json=payload)
            reply.raise_for_status()
            body = reply.json()
            return body.get("response", "")
    except Exception as exc:
        # Best effort: report the failure and let callers treat None as
        # "the model gave no answer".
        print(f"Ollama 呼叫失敗: {exc}")
        return None


async def check_ollama_available() -> bool:
    """Report whether the Ollama server answers ``GET /api/tags`` with HTTP 200.

    The probe uses a 5-second timeout. Any exception raised while connecting
    or reading the reply is treated as "not available".
    """
    probe_url = f"{OLLAMA_URL}/api/tags"
    try:
        async with httpx.AsyncClient(timeout=5) as session:
            reply = await session.get(probe_url)
    except Exception:
        return False
    return reply.status_code == 200


# =============================================================================
# 測試類別
# =============================================================================

class TestModelRegression:
    """Model regression tests: one parametrized test per REGRESSION_CASES entry."""

    @pytest.mark.asyncio
    @pytest.mark.parametrize("case", REGRESSION_CASES, ids=[c["name"] for c in REGRESSION_CASES])
    async def test_regression_case(self, case: dict[str, Any]):
        """Run one regression case against the model and apply its validators.

        The test is skipped when the Ollama server cannot be reached. It fails
        when the model returns nothing, returns an empty reply, or any of the
        case's validators rejects the reply.

        Args:
            case: One entry of ``REGRESSION_CASES``; the keys ``name``,
                ``prompt``, ``validators`` and ``description`` are read.
        """
        # The availability probe is awaited here instead of in an async
        # autouse ``@pytest.fixture``: in pytest-asyncio's strict mode (the
        # default) coroutine fixtures declared with plain ``pytest.fixture``
        # are not executed, so the skip would never fire and an unreachable
        # server would show up as test failures.
        if not await check_ollama_available():
            pytest.skip(f"Ollama 無法連線: {OLLAMA_URL}")

        response = await call_ollama(case["prompt"])

        assert response is not None, f"模型無回應: {case['name']}"
        assert response, f"回應為空: {case['name']}"

        # Validators are numbered from 1 in the failure message; the reply is
        # truncated to 200 characters to keep the report readable.
        for index, validator in enumerate(case["validators"], start=1):
            assert validator(response), (
                f"驗證失敗 [{case['name']}] 驗證器 {index}: {case['description']}\n"
                f"回應: {response[:200]}"
            )


# =============================================================================
# 準確度報告
# =============================================================================

@pytest.mark.asyncio
async def test_regression_report():
    """Run every regression case, print a summary, and enforce the baseline.

    The test is skipped when the Ollama server cannot be reached. Each case is
    recorded as ``PASS`` (all validators accepted the reply), ``FAIL`` (a
    validator rejected it) or ``ERROR`` (the model call returned ``None``).
    The test fails when the pass rate is below 75%.
    """
    if not await check_ollama_available():
        pytest.skip(f"Ollama 無法連線: {OLLAMA_URL}")

    results = []
    for case in REGRESSION_CASES:
        reply = await call_ollama(case["prompt"])

        if reply is None:
            outcome = {"name": case["name"], "status": "ERROR", "reason": "無回應"}
        elif all(check(reply) for check in case["validators"]):
            outcome = {"name": case["name"], "status": "PASS"}
        else:
            # Keep only the head of the reply so the report stays short.
            outcome = {
                "name": case["name"],
                "status": "FAIL",
                "response": reply[:100],
            }
        results.append(outcome)

    # Everything that is not a PASS (FAIL or ERROR) counts as a failure.
    total = len(results)
    passed = sum(1 for entry in results if entry["status"] == "PASS")
    failed = total - passed
    accuracy = (passed / total * 100) if total > 0 else 0

    banner = "=" * 60
    report_lines = [
        "\n" + banner,
        "Phase 12.3: 模型回歸測試報告",
        banner,
        f"模型: {DEFAULT_MODEL}",
        f"總案例: {total}",
        f"通過: {passed}",
        f"失敗: {failed}",
        f"準確率: {accuracy:.1f}%",
        banner,
    ]
    print("\n".join(report_lines))

    failures = [entry for entry in results if entry["status"] != "PASS"]
    if failures:
        print("\n失敗案例:")
        for entry in failures:
            # ERROR entries carry a reason; FAIL entries carry the reply head.
            detail = entry.get("reason", entry.get("response", ""))
            print(f"  - {entry['name']}: {detail}")

    # Baseline threshold: 75%.
    assert accuracy >= 75, f"準確率 {accuracy}% 低於基線 75%"


# Running this file directly hands it to pytest with verbose output and short
# tracebacks.
if __name__ == "__main__":
    pytest.main([__file__, "-v", "--tb=short"])