#!/usr/bin/env python3 """ Nemotron Tool Calling 精準度測試 比較 Nemotron vs Ollama (Qwen) 的 Tool Calling 能力 使用方式: export NVIDIA_API_KEY=nvapi-xxxx python test_nemotron_tool_calling.py 建立者: Claude Code 日期: 2026-03-28 (台北時間) """ import asyncio import json import os import time from dataclasses import dataclass try: import httpx except ImportError: print("請安裝 httpx: pip install httpx") exit(1) # ============================================================================ # 配置 # ============================================================================ NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY") OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://192.168.0.188:11434") if not NVIDIA_API_KEY: print("❌ 請設定 NVIDIA_API_KEY 環境變數") print(" export NVIDIA_API_KEY=nvapi-xxxx") exit(1) # ============================================================================ # Tool 定義 (K8s SRE 場景) # ============================================================================ TOOLS = [ { "type": "function", "function": { "name": "kubectl_get", "description": "Get Kubernetes resources (pods, deployments, services, etc.)", "parameters": { "type": "object", "properties": { "resource": { "type": "string", "enum": ["pods", "deployments", "services", "nodes", "events"], "description": "Resource type to query" }, "namespace": { "type": "string", "description": "Kubernetes namespace (default: awoooi-prod)" }, "name": { "type": "string", "description": "Specific resource name (optional)" } }, "required": ["resource"] } } }, { "type": "function", "function": { "name": "restart_deployment", "description": "Restart a Kubernetes deployment by rolling restart", "parameters": { "type": "object", "properties": { "deployment": { "type": "string", "description": "Deployment name" }, "namespace": { "type": "string", "description": "Kubernetes namespace" } }, "required": ["deployment", "namespace"] } } }, { "type": "function", "function": { "name": "scale_deployment", "description": "Scale a Kubernetes deployment to specified replicas", "parameters": { "type": "object", "properties": { "deployment": {"type": "string"}, "namespace": {"type": "string"}, "replicas": {"type": "integer", "minimum": 0, "maximum": 10} }, "required": ["deployment", "namespace", "replicas"] } } }, { "type": "function", "function": { "name": "get_logs", "description": "Get logs from a Kubernetes pod", "parameters": { "type": "object", "properties": { "pod": {"type": "string"}, "namespace": {"type": "string"}, "tail": {"type": "integer", "description": "Number of lines (default: 100)"}, "container": {"type": "string", "description": "Container name (optional)"} }, "required": ["pod", "namespace"] } } }, { "type": "function", "function": { "name": "send_alert", "description": "Send alert notification via Telegram", "parameters": { "type": "object", "properties": { "severity": {"type": "string", "enum": ["info", "warning", "critical"]}, "message": {"type": "string"}, "incident_id": {"type": "string"} }, "required": ["severity", "message"] } } } ] # ============================================================================ # 測試案例 # ============================================================================ TEST_CASES = [ { "id": "TC001", "description": "簡單查詢 - 列出所有 pods", "prompt": "Show me all pods in awoooi-prod namespace", "expected_tool": "kubectl_get", }, { "id": "TC002", "description": "重啟服務", "prompt": "The API is not responding, please restart the awoooi-api deployment in awoooi-prod", "expected_tool": "restart_deployment", }, { "id": "TC003", "description": "擴展副本", "prompt": "We're getting high traffic, scale awoooi-web deployment to 3 replicas in awoooi-prod namespace", "expected_tool": "scale_deployment", }, { "id": "TC004", "description": "查看日誌", "prompt": "Get the last 50 lines of logs from awoooi-api-abc123 pod in awoooi-prod", "expected_tool": "get_logs", }, { "id": "TC005", "description": "發送告警", "prompt": "Send a critical alert with message 'Database connection failed' for incident INC-2026-001", "expected_tool": "send_alert", }, { "id": "TC006", "description": "繁體中文指令", "prompt": "請幫我重啟 awoooi-worker 這個 deployment,namespace 是 awoooi-prod", "expected_tool": "restart_deployment", }, { "id": "TC007", "description": "複合理解", "prompt": "The web frontend is showing 502 errors. First, check if the API pods are running in awoooi-prod.", "expected_tool": "kubectl_get", }, ] # ============================================================================ # API 客戶端 # ============================================================================ @dataclass class TestResult: model: str test_id: str description: str success: bool tool_called: str | None params: dict | None latency_ms: float error: str | None = None raw_response: str | None = None async def call_nemotron(prompt: str, model: str = "nvidia/nemotron-mini-4b-instruct") -> dict: """呼叫 NVIDIA NIM API""" async with httpx.AsyncClient(timeout=60) as client: start = time.time() try: response = await client.post( "https://integrate.api.nvidia.com/v1/chat/completions", headers={ "Content-Type": "application/json", "Authorization": f"Bearer {NVIDIA_API_KEY}" }, json={ "model": model, "messages": [ { "role": "system", "content": "You are an SRE assistant for AWOOOI AIOps platform. Use the provided tools to help with Kubernetes operations. Always use tools when the user requests an action." }, {"role": "user", "content": prompt} ], "tools": TOOLS, "tool_choice": "auto", "temperature": 0.1, "max_tokens": 512 } ) latency = (time.time() - start) * 1000 response.raise_for_status() return {"data": response.json(), "latency_ms": latency, "error": None} except Exception as e: latency = (time.time() - start) * 1000 return {"data": None, "latency_ms": latency, "error": str(e)} async def call_ollama(prompt: str, model: str = "qwen2.5:7b-instruct") -> dict: """呼叫本地 Ollama (JSON 模式模擬 Tool Calling)""" async with httpx.AsyncClient(timeout=120) as client: start = time.time() try: # Ollama 不原生支援 Tool Calling,用 JSON 模式模擬 tool_prompt = f"""Based on this user request, determine which tool to call and with what parameters. User Request: {prompt} Available tools: 1. kubectl_get - Get K8s resources (params: resource, namespace, name) 2. restart_deployment - Restart a deployment (params: deployment, namespace) 3. scale_deployment - Scale replicas (params: deployment, namespace, replicas) 4. get_logs - Get pod logs (params: pod, namespace, tail, container) 5. send_alert - Send Telegram alert (params: severity, message, incident_id) Respond ONLY with a JSON object in this exact format: {{"tool": "tool_name", "params": {{"key": "value"}}}} """ response = await client.post( f"{OLLAMA_BASE_URL}/api/chat", json={ "model": model, "messages": [ {"role": "user", "content": tool_prompt} ], "stream": False, "format": "json", "options": { "temperature": 0.1 } } ) latency = (time.time() - start) * 1000 response.raise_for_status() return {"data": response.json(), "latency_ms": latency, "error": None} except Exception as e: latency = (time.time() - start) * 1000 return {"data": None, "latency_ms": latency, "error": str(e)} def parse_nemotron_response(response: dict) -> tuple: """解析 Nemotron 回應""" try: choices = response.get("choices", []) if not choices: return (None, {}, "No choices in response") message = choices[0].get("message", {}) # 檢查 tool_calls if message.get("tool_calls"): tool_call = message["tool_calls"][0] tool_name = tool_call["function"]["name"] try: params = json.loads(tool_call["function"]["arguments"]) except Exception: params = {} return (tool_name, params, None) # 如果沒有 tool_calls,回傳 content content = message.get("content", "") return (None, {}, f"No tool call, content: {content[:100]}") except Exception as e: return (None, {}, str(e)) def parse_ollama_response(response: dict) -> tuple: """解析 Ollama 回應""" try: content = response.get("message", {}).get("content", "{}") parsed = json.loads(content) return (parsed.get("tool"), parsed.get("params", {}), None) except Exception as e: return (None, {}, str(e)) # ============================================================================ # 測試執行 # ============================================================================ async def run_single_test(test_case: dict) -> list: """執行單一測試案例""" results = [] prompt = test_case["prompt"] # 測試 Nemotron print(" Testing Nemotron...", end=" ", flush=True) resp = await call_nemotron(prompt) if resp["error"]: results.append(TestResult( model="Nemotron-mini-4B", test_id=test_case["id"], description=test_case["description"], success=False, tool_called=None, params=None, latency_ms=resp["latency_ms"], error=resp["error"] )) print("❌ Error") else: tool, params, error = parse_nemotron_response(resp["data"]) success = tool == test_case["expected_tool"] raw = None try: raw = json.dumps(resp["data"].get("choices", [{}])[0].get("message", {}), indent=2)[:200] except Exception: pass results.append(TestResult( model="Nemotron-mini-4B", test_id=test_case["id"], description=test_case["description"], success=success, tool_called=tool, params=params, latency_ms=resp["latency_ms"], error=error, raw_response=raw )) status = "✅" if success else "❌" print(f"{status} {tool} ({resp['latency_ms']:.0f}ms)") # 測試 Ollama print(" Testing Ollama...", end=" ", flush=True) resp = await call_ollama(prompt) if resp["error"]: results.append(TestResult( model="Ollama-Qwen2.5-7B", test_id=test_case["id"], description=test_case["description"], success=False, tool_called=None, params=None, latency_ms=resp["latency_ms"], error=resp["error"] )) print(f"❌ Error: {resp['error'][:50]}") else: tool, params, error = parse_ollama_response(resp["data"]) success = tool == test_case["expected_tool"] results.append(TestResult( model="Ollama-Qwen2.5-7B", test_id=test_case["id"], description=test_case["description"], success=success, tool_called=tool, params=params, latency_ms=resp["latency_ms"], error=error )) status = "✅" if success else "❌" print(f"{status} {tool} ({resp['latency_ms']:.0f}ms)") return results async def main(): """主測試流程""" print("=" * 70) print("🧪 Nemotron vs Ollama Tool Calling 精準度測試") print("=" * 70) print() print("Nemotron API: integrate.api.nvidia.com") print(f"Ollama URL: {OLLAMA_BASE_URL}") print() all_results = [] for i, tc in enumerate(TEST_CASES, 1): print(f"[{i}/{len(TEST_CASES)}] {tc['id']}: {tc['description']}") print(f" Prompt: {tc['prompt'][:60]}...") print(f" Expected: {tc['expected_tool']}") results = await run_single_test(tc) all_results.extend(results) print() # ======================================================================== # 統計結果 # ======================================================================== print("=" * 70) print("📊 統計結果") print("=" * 70) print() models = {} for r in all_results: if r.model not in models: models[r.model] = {"success": 0, "total": 0, "latency": [], "errors": 0} models[r.model]["total"] += 1 if r.success: models[r.model]["success"] += 1 if r.error: models[r.model]["errors"] += 1 if r.latency_ms > 0: models[r.model]["latency"].append(r.latency_ms) print(f"{'Model':<25} {'Accuracy':<12} {'Avg Latency':<15} {'Errors':<10}") print("-" * 62) for model, stats in models.items(): acc = stats["success"] / stats["total"] * 100 if stats["total"] > 0 else 0 avg_lat = sum(stats["latency"]) / len(stats["latency"]) if stats["latency"] else 0 print(f"{model:<25} {acc:>6.1f}% {avg_lat:>8.0f}ms {stats['errors']}") print() # 詳細錯誤報告 errors = [r for r in all_results if r.error] if errors: print("=" * 70) print("⚠️ 錯誤詳情") print("=" * 70) for r in errors: print(f" [{r.test_id}] {r.model}: {r.error[:80]}") # 推薦 print() print("=" * 70) print("💡 建議") print("=" * 70) nemotron_stats = models.get("Nemotron-mini-4B", {}) ollama_stats = models.get("Ollama-Qwen2.5-7B", {}) nem_acc = nemotron_stats.get("success", 0) / nemotron_stats.get("total", 1) * 100 oll_acc = ollama_stats.get("success", 0) / ollama_stats.get("total", 1) * 100 if nem_acc > oll_acc: print(f"✅ Nemotron Tool Calling 精準度較高 ({nem_acc:.0f}% vs {oll_acc:.0f}%)") print(" 建議: 將 Nemotron 作為 Tool Calling 任務的首選模型") elif oll_acc > nem_acc: print(f"⚠️ Ollama 精準度較高 ({oll_acc:.0f}% vs {nem_acc:.0f}%)") print(" 建議: 繼續使用 Ollama,Nemotron 可作為備援") else: print(f"📊 兩者精準度相近 ({nem_acc:.0f}% vs {oll_acc:.0f}%)") print(" 建議: 考慮延遲和成本選擇") nem_lat = sum(nemotron_stats.get("latency", [0])) / len(nemotron_stats.get("latency", [1])) oll_lat = sum(ollama_stats.get("latency", [0])) / len(ollama_stats.get("latency", [1])) print() if nem_lat < oll_lat: print(f"⚡ Nemotron 延遲較低 ({nem_lat:.0f}ms vs {oll_lat:.0f}ms)") else: print(f"🏠 Ollama 延遲較低 ({oll_lat:.0f}ms vs {nem_lat:.0f}ms) - 本地優勢") print() print("測試完成!") if __name__ == "__main__": asyncio.run(main())