# awoooi/apps/api/scripts/test_nemotron_tool_calling.py

#!/usr/bin/env python3
"""
Nemotron Tool Calling 精準度測試
比較 Nemotron vs Ollama (Qwen) 的 Tool Calling 能力

使用方式:
    export NVIDIA_API_KEY=nvapi-xxxx
    python test_nemotron_tool_calling.py

建立者: Claude Code
日期: 2026-03-28 (台北時間)
"""

import asyncio
import json
import os
import time
from dataclasses import dataclass

# httpx is the only third-party dependency; fail early with an install hint.
try:
    import httpx
except ImportError:
    print("請安裝 httpx: pip install httpx")
    # `exit` is a helper injected by the `site` module for interactive use and
    # may be missing (e.g. `python -S`); raising SystemExit always works.
    raise SystemExit(1) from None

# ============================================================================
# 配置
# ============================================================================

# API key for the NVIDIA endpoint; mandatory, read from the environment.
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
# Base URL of the Ollama server; the environment variable overrides the default.
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://192.168.0.110:11435")

# Abort before any request is made when the key is missing or empty.
if not NVIDIA_API_KEY:
    print("❌ 請設定 NVIDIA_API_KEY 環境變數")
    print("   export NVIDIA_API_KEY=nvapi-xxxx")
    # `exit` is a helper injected by the `site` module for interactive use and
    # may be missing (e.g. `python -S`); raising SystemExit always works.
    raise SystemExit(1)

# ============================================================================
# Tool 定義 (K8s SRE 場景)
# ============================================================================

def _function_tool(name: str, description: str, properties: dict, required: list) -> dict:
    """Build one TOOLS entry: a "function" tool with an object parameter schema.

    Key insertion order matches the serialized layout
    (type -> function -> name / description / parameters).
    """
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required,
            },
        },
    }


# Tool schemas offered to the model (Kubernetes SRE scenario).
TOOLS = [
    # Read-only resource query.
    _function_tool(
        "kubectl_get",
        "Get Kubernetes resources (pods, deployments, services, etc.)",
        {
            "resource": {
                "type": "string",
                "enum": ["pods", "deployments", "services", "nodes", "events"],
                "description": "Resource type to query",
            },
            "namespace": {
                "type": "string",
                "description": "Kubernetes namespace (default: awoooi-prod)",
            },
            "name": {
                "type": "string",
                "description": "Specific resource name (optional)",
            },
        },
        ["resource"],
    ),
    # Rolling restart of a deployment.
    _function_tool(
        "restart_deployment",
        "Restart a Kubernetes deployment by rolling restart",
        {
            "deployment": {
                "type": "string",
                "description": "Deployment name",
            },
            "namespace": {
                "type": "string",
                "description": "Kubernetes namespace",
            },
        },
        ["deployment", "namespace"],
    ),
    # Replica scaling, bounded to 0..10 by the schema.
    _function_tool(
        "scale_deployment",
        "Scale a Kubernetes deployment to specified replicas",
        {
            "deployment": {"type": "string"},
            "namespace": {"type": "string"},
            "replicas": {"type": "integer", "minimum": 0, "maximum": 10},
        },
        ["deployment", "namespace", "replicas"],
    ),
    # Pod log retrieval.
    _function_tool(
        "get_logs",
        "Get logs from a Kubernetes pod",
        {
            "pod": {"type": "string"},
            "namespace": {"type": "string"},
            "tail": {"type": "integer", "description": "Number of lines (default: 100)"},
            "container": {"type": "string", "description": "Container name (optional)"},
        },
        ["pod", "namespace"],
    ),
    # Alert notification.
    _function_tool(
        "send_alert",
        "Send alert notification via Telegram",
        {
            "severity": {"type": "string", "enum": ["info", "warning", "critical"]},
            "message": {"type": "string"},
            "incident_id": {"type": "string"},
        },
        ["severity", "message"],
    ),
]

# ============================================================================
# 測試案例
# ============================================================================

# Field names of one test case, in the order used by the rows below.
_CASE_FIELDS = ("id", "description", "prompt", "expected_tool")

# Benchmark prompts; each row is (id, description, prompt, expected tool name).
TEST_CASES = [
    dict(zip(_CASE_FIELDS, row))
    for row in (
        (
            "TC001",
            "簡單查詢 - 列出所有 pods",
            "Show me all pods in awoooi-prod namespace",
            "kubectl_get",
        ),
        (
            "TC002",
            "重啟服務",
            "The API is not responding, please restart the awoooi-api deployment in awoooi-prod",
            "restart_deployment",
        ),
        (
            "TC003",
            "擴展副本",
            "We're getting high traffic, scale awoooi-web deployment to 3 replicas in awoooi-prod namespace",
            "scale_deployment",
        ),
        (
            "TC004",
            "查看日誌",
            "Get the last 50 lines of logs from awoooi-api-abc123 pod in awoooi-prod",
            "get_logs",
        ),
        (
            "TC005",
            "發送告警",
            "Send a critical alert with message 'Database connection failed' for incident INC-2026-001",
            "send_alert",
        ),
        (
            "TC006",
            "繁體中文指令",
            "請幫我重啟 awoooi-worker 這個 deployment，namespace 是 awoooi-prod",
            "restart_deployment",
        ),
        (
            "TC007",
            "複合理解",
            "The web frontend is showing 502 errors. First, check if the API pods are running in awoooi-prod.",
            "kubectl_get",
        ),
    )
]

# ============================================================================
# API 客戶端
# ============================================================================

@dataclass
class TestResult:
    """Outcome of running one test case against one model."""

    # Display label of the model that produced this result.
    model: str
    # "id" of the test case this result belongs to.
    test_id: str
    # Human-readable description copied from the test case.
    description: str
    # True when the tool name chosen by the model equals the expected tool.
    success: bool
    # Tool name extracted from the model reply; None when nothing was extracted.
    tool_called: str | None
    # Arguments extracted for the tool call; None when the request itself failed.
    params: dict | None
    # Elapsed request time in milliseconds.
    latency_ms: float
    # Request error or parser message; None when nothing went wrong.
    error: str | None = None
    # Truncated JSON dump of the reply message, kept for debugging.
    raw_response: str | None = None


async def call_nemotron(prompt: str, model: str = "nvidia/nemotron-mini-4b-instruct") -> dict:
    """Send *prompt* to the NVIDIA chat-completions endpoint with TOOLS attached.

    Args:
        prompt: User request forwarded as the single user message.
        model: Model identifier placed in the request body.

    Returns:
        A dict with three keys:
        ``data`` - decoded JSON body, or None on failure;
        ``latency_ms`` - elapsed time in milliseconds;
        ``error`` - None on success, otherwise a non-empty description.

    This function never raises for request/HTTP/decoding failures; they are
    reported through the ``error`` key so the benchmark can keep going.
    """
    request_body = {
        "model": model,
        "messages": [
            {
                "role": "system",
                "content": "You are an SRE assistant for AWOOOI AIOps platform. Use the provided tools to help with Kubernetes operations. Always use tools when the user requests an action."
            },
            {"role": "user", "content": prompt}
        ],
        "tools": TOOLS,
        "tool_choice": "auto",
        "temperature": 0.1,
        "max_tokens": 512
    }
    async with httpx.AsyncClient(timeout=60) as client:
        # perf_counter is monotonic, so a wall-clock adjustment during the
        # request cannot distort (or make negative) the measured latency.
        start = time.perf_counter()
        try:
            response = await client.post(
                "https://integrate.api.nvidia.com/v1/chat/completions",
                headers={
                    "Content-Type": "application/json",
                    "Authorization": f"Bearer {NVIDIA_API_KEY}"
                },
                json=request_body
            )
            latency = (time.perf_counter() - start) * 1000
            response.raise_for_status()
            return {"data": response.json(), "latency_ms": latency, "error": None}
        except Exception as e:
            # Deliberately broad: any failure becomes a recorded test error.
            latency = (time.perf_counter() - start) * 1000
            # Some exceptions have an empty str(); fall back to repr() so
            # callers that test `error` for truthiness still see the failure.
            return {"data": None, "latency_ms": latency, "error": str(e) or repr(e)}


async def call_ollama(prompt: str, model: str = "qwen2.5:7b-instruct") -> dict:
    """Ask the Ollama server which tool to call, using JSON-formatted output.

    No tool schema is sent; the tool list is described in the prompt text and
    the model is asked to answer with ``{"tool": ..., "params": {...}}``.

    Args:
        prompt: User request embedded into the tool-selection prompt.
        model: Model identifier placed in the request body.

    Returns:
        A dict with three keys:
        ``data`` - decoded JSON body, or None on failure;
        ``latency_ms`` - elapsed time in milliseconds;
        ``error`` - None on success, otherwise a non-empty description.

    This function never raises for request/HTTP/decoding failures; they are
    reported through the ``error`` key so the benchmark can keep going.
    """
    # Tool calling is emulated through JSON mode rather than a native tools API.
    tool_prompt = f"""Based on this user request, determine which tool to call and with what parameters.

User Request: {prompt}

Available tools:
1. kubectl_get - Get K8s resources (params: resource, namespace, name)
2. restart_deployment - Restart a deployment (params: deployment, namespace)
3. scale_deployment - Scale replicas (params: deployment, namespace, replicas)
4. get_logs - Get pod logs (params: pod, namespace, tail, container)
5. send_alert - Send Telegram alert (params: severity, message, incident_id)

Respond ONLY with a JSON object in this exact format:
{{"tool": "tool_name", "params": {{"key": "value"}}}}
"""
    async with httpx.AsyncClient(timeout=120) as client:
        # perf_counter is monotonic, so a wall-clock adjustment during the
        # request cannot distort (or make negative) the measured latency.
        start = time.perf_counter()
        try:
            response = await client.post(
                f"{OLLAMA_BASE_URL}/api/chat",
                json={
                    "model": model,
                    "messages": [
                        {"role": "user", "content": tool_prompt}
                    ],
                    "stream": False,
                    "format": "json",
                    "options": {
                        "temperature": 0.1
                    }
                }
            )
            latency = (time.perf_counter() - start) * 1000
            response.raise_for_status()
            return {"data": response.json(), "latency_ms": latency, "error": None}
        except Exception as e:
            # Deliberately broad: any failure becomes a recorded test error.
            latency = (time.perf_counter() - start) * 1000
            # Some exceptions have an empty str(); fall back to repr() so
            # callers that test `error` for truthiness still see the failure.
            return {"data": None, "latency_ms": latency, "error": str(e) or repr(e)}


def _coerce_tool_arguments(arguments) -> dict:
    """Return tool-call arguments as a dict, or {} when they are unusable."""
    if isinstance(arguments, dict):
        # Already decoded by the server/client; use as-is.
        return arguments
    if isinstance(arguments, str):
        try:
            decoded = json.loads(arguments)
        except ValueError:
            # Malformed JSON from the model: keep the tool name, drop the args.
            return {}
        # Valid JSON that is not an object (null, list, number) is not usable.
        return decoded if isinstance(decoded, dict) else {}
    return {}


def parse_nemotron_response(response: dict) -> tuple:
    """Extract the first tool call from a chat-completions style response.

    Args:
        response: Decoded response body; expected to hold
            ``choices[0].message`` with optional ``tool_calls``.

    Returns:
        ``(tool_name, params, error)``. On success ``error`` is None and
        ``params`` is always a dict. When no tool call is present,
        ``tool_name`` is None, ``params`` is ``{}`` and ``error`` describes
        why (including up to 100 characters of the text reply).
    """
    try:
        choices = response.get("choices") or []
        if not choices:
            return (None, {}, "No choices in response")

        message = choices[0].get("message") or {}

        tool_calls = message.get("tool_calls")
        if tool_calls:
            # Only the first tool call is evaluated.
            function = tool_calls[0]["function"]
            params = _coerce_tool_arguments(function.get("arguments"))
            return (function["name"], params, None)

        # No tool call: report the plain-text reply instead. `content` may be
        # present but null, so normalise it before slicing.
        content = message.get("content") or ""
        return (None, {}, f"No tool call, content: {content[:100]}")

    except (AttributeError, IndexError, KeyError, TypeError) as e:
        # Structurally unexpected payload: report it rather than crash the run.
        return (None, {}, str(e))


def parse_ollama_response(response: dict) -> tuple:
    """Extract the chosen tool from an Ollama chat response in JSON mode.

    The reply text (``message.content``) is expected to be a JSON object of
    the form ``{"tool": "<name>", "params": {...}}``.

    Args:
        response: Decoded response body.

    Returns:
        ``(tool_name, params, error)``. On success ``error`` is None and
        ``params`` is always a dict. If the reply is not valid JSON, is not an
        object, or names no tool, ``tool_name`` is None, ``params`` is ``{}``
        and ``error`` describes the problem.
    """
    try:
        message = response.get("message") or {}
        content = message.get("content", "{}")
        parsed = json.loads(content)

        tool = parsed.get("tool")
        if not tool:
            # Report a missing tool as an error, like the Nemotron parser does,
            # so the per-model error counts are comparable.
            return (None, {}, f"No tool in response, content: {content[:100]}")

        # "params" may be absent or null; always hand back a dict.
        params = parsed.get("params")
        if not isinstance(params, dict):
            params = {}
        return (tool, params, None)
    except (AttributeError, TypeError, ValueError) as e:
        # Invalid JSON (ValueError), non-string content (TypeError) or a
        # non-object payload (AttributeError): report instead of crashing.
        return (None, {}, str(e))


# ============================================================================
# 測試執行
# ============================================================================

def _request_failed(resp: dict) -> bool:
    """Return True when a client call reported an error or produced no payload."""
    # Checking `data` as well covers a failure whose error message is empty.
    return bool(resp["error"]) or resp["data"] is None


def _build_result(model_label: str, test_case: dict, resp: dict, parse, *, keep_raw: bool = False) -> "TestResult":
    """Convert one client response into a TestResult for *test_case*.

    Args:
        model_label: Display name stored in the result.
        test_case: Test case dict ("id", "description", "expected_tool").
        resp: Dict returned by a client call ("data", "latency_ms", "error").
        parse: Function mapping the response body to (tool, params, error).
        keep_raw: When True, store a truncated JSON dump of the reply message.
    """
    shared = {
        "model": model_label,
        "test_id": test_case["id"],
        "description": test_case["description"],
        "latency_ms": resp["latency_ms"],
    }
    if _request_failed(resp):
        return TestResult(
            success=False,
            tool_called=None,
            params=None,
            # Never store an empty error: it would not be counted later.
            error=resp["error"] or "Request failed without an error message",
            **shared,
        )

    tool, params, error = parse(resp["data"])
    raw = None
    if keep_raw:
        try:
            message = resp["data"].get("choices", [{}])[0].get("message", {})
            raw = json.dumps(message, indent=2)[:200]
        except (AttributeError, IndexError, KeyError, TypeError, ValueError):
            # Debug aid only; an oddly shaped payload must not fail the test.
            raw = None
    return TestResult(
        # A test passes when the chosen tool name matches; arguments are
        # recorded but not compared.
        success=tool == test_case["expected_tool"],
        tool_called=tool,
        params=params,
        error=error,
        raw_response=raw,
        **shared,
    )


async def run_single_test(test_case: dict) -> list:
    """Run one test case against Nemotron and then Ollama.

    Progress is printed as each model finishes.

    Args:
        test_case: Dict with "id", "description", "prompt", "expected_tool".

    Returns:
        A two-element list of TestResult: Nemotron first, Ollama second.
    """
    prompt = test_case["prompt"]

    # --- Nemotron ---
    print("    Testing Nemotron...", end=" ", flush=True)
    resp = await call_nemotron(prompt)
    nemotron = _build_result(
        "Nemotron-mini-4B", test_case, resp, parse_nemotron_response, keep_raw=True
    )
    if _request_failed(resp):
        print("❌ Error")
    else:
        status = "✅" if nemotron.success else "❌"
        print(f"{status} {nemotron.tool_called} ({nemotron.latency_ms:.0f}ms)")

    # --- Ollama ---
    print("    Testing Ollama...", end=" ", flush=True)
    resp = await call_ollama(prompt)
    ollama = _build_result(
        "Ollama-Qwen2.5-7B", test_case, resp, parse_ollama_response
    )
    if _request_failed(resp):
        print(f"❌ Error: {ollama.error[:50]}")
    else:
        status = "✅" if ollama.success else "❌"
        print(f"{status} {ollama.tool_called} ({ollama.latency_ms:.0f}ms)")

    return [nemotron, ollama]


def _mean(values: list) -> float:
    """Return the arithmetic mean of *values*, or 0 for an empty list."""
    return sum(values) / len(values) if values else 0


def _accuracy_pct(stats: dict) -> float:
    """Return success/total as a percentage, or 0 when nothing was run."""
    total = stats.get("total", 0)
    return stats.get("success", 0) / total * 100 if total else 0


async def main():
    """Run every test case against both models and print a comparison report.

    The report has four parts: per-test progress, a per-model summary table
    (accuracy, average latency, error count), the list of recorded errors,
    and a recommendation based on accuracy and latency.
    """
    print("=" * 70)
    print("🧪 Nemotron vs Ollama Tool Calling 精準度測試")
    print("=" * 70)
    print()
    print("Nemotron API: integrate.api.nvidia.com")
    print(f"Ollama URL: {OLLAMA_BASE_URL}")
    print()

    all_results = []

    for i, tc in enumerate(TEST_CASES, 1):
        prompt = tc["prompt"]
        # Only mark the preview with an ellipsis when it was actually cut.
        preview = prompt if len(prompt) <= 60 else f"{prompt[:60]}..."
        print(f"[{i}/{len(TEST_CASES)}] {tc['id']}: {tc['description']}")
        print(f"    Prompt: {preview}")
        print(f"    Expected: {tc['expected_tool']}")

        all_results.extend(await run_single_test(tc))
        print()

    # ------------------------------------------------------------------
    # Summary statistics
    # ------------------------------------------------------------------
    print("=" * 70)
    print("📊 統計結果")
    print("=" * 70)
    print()

    # Per-model counters keyed by the model label stored in each result.
    models = {}
    for r in all_results:
        stats = models.setdefault(
            r.model, {"success": 0, "total": 0, "latency": [], "errors": 0}
        )
        stats["total"] += 1
        if r.success:
            stats["success"] += 1
        if r.error:
            stats["errors"] += 1
        if r.latency_ms > 0:
            stats["latency"].append(r.latency_ms)

    print(f"{'Model':<25} {'Accuracy':<12} {'Avg Latency':<15} {'Errors':<10}")
    print("-" * 62)
    for model, stats in models.items():
        acc = _accuracy_pct(stats)
        avg_lat = _mean(stats["latency"])
        print(f"{model:<25} {acc:>6.1f}%      {avg_lat:>8.0f}ms       {stats['errors']}")

    print()

    # Detailed error report
    errors = [r for r in all_results if r.error]
    if errors:
        print("=" * 70)
        print("⚠️ 錯誤詳情")
        print("=" * 70)
        for r in errors:
            print(f"  [{r.test_id}] {r.model}: {r.error[:80]}")

    # Recommendation
    print()
    print("=" * 70)
    print("💡 建議")
    print("=" * 70)

    # These labels must match the ones stored in the results.
    nemotron_stats = models.get("Nemotron-mini-4B", {})
    ollama_stats = models.get("Ollama-Qwen2.5-7B", {})

    nem_acc = _accuracy_pct(nemotron_stats)
    oll_acc = _accuracy_pct(ollama_stats)

    if nem_acc > oll_acc:
        print(f"✅ Nemotron Tool Calling 精準度較高 ({nem_acc:.0f}% vs {oll_acc:.0f}%)")
        print("   建議: 將 Nemotron 作為 Tool Calling 任務的首選模型")
    elif oll_acc > nem_acc:
        print(f"⚠️ Ollama 精準度較高 ({oll_acc:.0f}% vs {nem_acc:.0f}%)")
        print("   建議: 繼續使用 Ollama，Nemotron 可作為備援")
    else:
        print(f"📊 兩者精準度相近 ({nem_acc:.0f}% vs {oll_acc:.0f}%)")
        print("   建議: 考慮延遲和成本選擇")

    # _mean tolerates a missing or empty latency list, so a model with no
    # latency samples reports 0 instead of raising ZeroDivisionError.
    nem_lat = _mean(nemotron_stats.get("latency", []))
    oll_lat = _mean(ollama_stats.get("latency", []))

    print()
    if nem_lat < oll_lat:
        print(f"⚡ Nemotron 延遲較低 ({nem_lat:.0f}ms vs {oll_lat:.0f}ms)")
    else:
        print(f"🏠 Ollama 延遲較低 ({oll_lat:.0f}ms vs {nem_lat:.0f}ms) - 本地優勢")

    print()
    print("測試完成！")


# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())