Files
awoooi/apps/api/scripts/test_nemotron_tool_calling.py
Your Name d441f70693
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
CD Pipeline / tests (push) Successful in 1m2s
CD Pipeline / build-and-deploy (push) Successful in 9m2s
CD Pipeline / post-deploy-checks (push) Successful in 1m15s
fix(ai): add 188 ollama retirement gate
2026-05-06 14:55:21 +08:00

496 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Nemotron Tool Calling 精準度測試
比較 Nemotron vs Ollama (Qwen) 的 Tool Calling 能力
使用方式:
export NVIDIA_API_KEY=nvapi-xxxx
python test_nemotron_tool_calling.py
建立者: Claude Code
日期: 2026-03-28 (台北時間)
"""
import asyncio
import json
import os
import time
from dataclasses import dataclass
try:
import httpx
except ImportError:
print("請安裝 httpx: pip install httpx")
exit(1)
# ============================================================================
# Configuration
# ============================================================================
# API key sent as the Bearer token to the NVIDIA endpoint; required.
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
# Base URL of the Ollama server; overridable through the environment.
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://192.168.0.110:11435")

if not NVIDIA_API_KEY:
    print("❌ 請設定 NVIDIA_API_KEY 環境變數")
    print(" export NVIDIA_API_KEY=nvapi-xxxx")
    # The bare ``exit`` helper is injected by the ``site`` module for
    # interactive sessions and is missing under ``python -S``; raising
    # SystemExit terminates with the same status code in every environment.
    raise SystemExit(1)
# ============================================================================
# Tool definitions (K8s SRE scenario)
# ============================================================================
# Function-calling tool schemas passed as the "tools" field of the Nemotron
# chat-completions request. Each entry declares one callable function with a
# JSON-Schema description of its parameters.
TOOLS = [
    # Read-only lookup of cluster resources.
    {
        "type": "function",
        "function": {
            "name": "kubectl_get",
            "description": "Get Kubernetes resources (pods, deployments, services, etc.)",
            "parameters": {
                "type": "object",
                "properties": {
                    "resource": {
                        "type": "string",
                        "enum": ["pods", "deployments", "services", "nodes", "events"],
                        "description": "Resource type to query"
                    },
                    "namespace": {
                        "type": "string",
                        "description": "Kubernetes namespace (default: awoooi-prod)"
                    },
                    "name": {
                        "type": "string",
                        "description": "Specific resource name (optional)"
                    }
                },
                "required": ["resource"]
            }
        }
    },
    # Rolling restart of a deployment.
    {
        "type": "function",
        "function": {
            "name": "restart_deployment",
            "description": "Restart a Kubernetes deployment by rolling restart",
            "parameters": {
                "type": "object",
                "properties": {
                    "deployment": {
                        "type": "string",
                        "description": "Deployment name"
                    },
                    "namespace": {
                        "type": "string",
                        "description": "Kubernetes namespace"
                    }
                },
                "required": ["deployment", "namespace"]
            }
        }
    },
    # Change the replica count of a deployment (schema bounds it to 0-10).
    {
        "type": "function",
        "function": {
            "name": "scale_deployment",
            "description": "Scale a Kubernetes deployment to specified replicas",
            "parameters": {
                "type": "object",
                "properties": {
                    "deployment": {"type": "string"},
                    "namespace": {"type": "string"},
                    "replicas": {"type": "integer", "minimum": 0, "maximum": 10}
                },
                "required": ["deployment", "namespace", "replicas"]
            }
        }
    },
    # Fetch log lines from a pod, optionally for a single container.
    {
        "type": "function",
        "function": {
            "name": "get_logs",
            "description": "Get logs from a Kubernetes pod",
            "parameters": {
                "type": "object",
                "properties": {
                    "pod": {"type": "string"},
                    "namespace": {"type": "string"},
                    "tail": {"type": "integer", "description": "Number of lines (default: 100)"},
                    "container": {"type": "string", "description": "Container name (optional)"}
                },
                "required": ["pod", "namespace"]
            }
        }
    },
    # Send a notification; incident_id is optional.
    {
        "type": "function",
        "function": {
            "name": "send_alert",
            "description": "Send alert notification via Telegram",
            "parameters": {
                "type": "object",
                "properties": {
                    "severity": {"type": "string", "enum": ["info", "warning", "critical"]},
                    "message": {"type": "string"},
                    "incident_id": {"type": "string"}
                },
                "required": ["severity", "message"]
            }
        }
    }
]
# ============================================================================
# Test cases
# ============================================================================
# Each case pairs a natural-language prompt with the name of the tool the
# model is expected to select. Only the tool name is scored; the arguments
# are recorded but not checked.
TEST_CASES = [
    {
        "id": "TC001",
        "description": "簡單查詢 - 列出所有 pods",
        "prompt": "Show me all pods in awoooi-prod namespace",
        "expected_tool": "kubectl_get",
    },
    {
        "id": "TC002",
        "description": "重啟服務",
        "prompt": "The API is not responding, please restart the awoooi-api deployment in awoooi-prod",
        "expected_tool": "restart_deployment",
    },
    {
        "id": "TC003",
        "description": "擴展副本",
        "prompt": "We're getting high traffic, scale awoooi-web deployment to 3 replicas in awoooi-prod namespace",
        "expected_tool": "scale_deployment",
    },
    {
        "id": "TC004",
        "description": "查看日誌",
        "prompt": "Get the last 50 lines of logs from awoooi-api-abc123 pod in awoooi-prod",
        "expected_tool": "get_logs",
    },
    {
        "id": "TC005",
        "description": "發送告警",
        "prompt": "Send a critical alert with message 'Database connection failed' for incident INC-2026-001",
        "expected_tool": "send_alert",
    },
    {
        "id": "TC006",
        "description": "繁體中文指令",
        # The two clauses must be separated; a fused "deploymentnamespace"
        # token would hand the model a corrupted request.
        "prompt": "請幫我重啟 awoooi-worker 這個 deployment，namespace 是 awoooi-prod",
        "expected_tool": "restart_deployment",
    },
    {
        "id": "TC007",
        "description": "複合理解",
        "prompt": "The web frontend is showing 502 errors. First, check if the API pods are running in awoooi-prod.",
        "expected_tool": "kubectl_get",
    },
]
# ============================================================================
# API clients
# ============================================================================
@dataclass
class TestResult:
    """Outcome of running one test case against one model."""

    model: str  # display label of the model that was exercised
    test_id: str  # "id" of the test case
    description: str  # "description" of the test case
    success: bool  # True when the selected tool equals the expected tool
    tool_called: str | None  # tool name chosen by the model, if any
    params: dict | None  # arguments the model supplied for the tool
    latency_ms: float  # request duration in milliseconds
    error: str | None = None  # request or parse failure description
    raw_response: str | None = None  # truncated dump of the response message
async def call_nemotron(prompt: str, model: str = "nvidia/nemotron-mini-4b-instruct") -> dict:
    """Send ``prompt`` to the NVIDIA chat-completions endpoint with ``TOOLS``.

    Args:
        prompt: User request, forwarded as the ``user`` message.
        model: Model identifier placed in the request body.

    Returns:
        A dict with three keys: ``data`` (decoded JSON body, or ``None`` on
        failure), ``latency_ms`` (elapsed request time in milliseconds) and
        ``error`` (``None`` on success, otherwise a non-empty description).
        Request failures are reported through ``error`` instead of raising.
    """
    async with httpx.AsyncClient(timeout=60) as client:
        # perf_counter is monotonic, so the measured latency cannot be skewed
        # by wall-clock adjustments the way time.time() can.
        start = time.perf_counter()
        try:
            response = await client.post(
                "https://integrate.api.nvidia.com/v1/chat/completions",
                headers={
                    "Content-Type": "application/json",
                    "Authorization": f"Bearer {NVIDIA_API_KEY}"
                },
                json={
                    "model": model,
                    "messages": [
                        {
                            "role": "system",
                            "content": "You are an SRE assistant for AWOOOI AIOps platform. Use the provided tools to help with Kubernetes operations. Always use tools when the user requests an action."
                        },
                        {"role": "user", "content": prompt}
                    ],
                    "tools": TOOLS,
                    "tool_choice": "auto",
                    "temperature": 0.1,
                    "max_tokens": 512
                }
            )
            # Latency covers the round trip only, not status/JSON handling.
            latency = (time.perf_counter() - start) * 1000
            response.raise_for_status()
            return {"data": response.json(), "latency_ms": latency, "error": None}
        except Exception as e:
            latency = (time.perf_counter() - start) * 1000
            # Some exceptions stringify to "" (timeouts commonly do). Callers
            # test ``error`` for truthiness, so fall back to the class name to
            # keep a failed call from looking like a success.
            return {"data": None, "latency_ms": latency, "error": str(e) or type(e).__name__}
async def call_ollama(prompt: str, model: str = "qwen2.5:7b-instruct") -> dict:
    """Ask the Ollama server which tool to call, using JSON-mode output.

    The tool list is described in plain text inside the prompt and the model
    is asked to answer with a ``{"tool": ..., "params": ...}`` JSON object.

    Args:
        prompt: User request embedded into the tool-selection prompt.
        model: Ollama model tag placed in the request body.

    Returns:
        A dict with three keys: ``data`` (decoded JSON body, or ``None`` on
        failure), ``latency_ms`` (elapsed request time in milliseconds) and
        ``error`` (``None`` on success, otherwise a non-empty description).
        Request failures are reported through ``error`` instead of raising.
    """
    async with httpx.AsyncClient(timeout=120) as client:
        # perf_counter is monotonic, so the measured latency cannot be skewed
        # by wall-clock adjustments the way time.time() can.
        start = time.perf_counter()
        try:
            # Tool calling is emulated here: the tools are listed in the
            # prompt and the reply is constrained with Ollama's JSON format.
            tool_prompt = f"""Based on this user request, determine which tool to call and with what parameters.
User Request: {prompt}
Available tools:
1. kubectl_get - Get K8s resources (params: resource, namespace, name)
2. restart_deployment - Restart a deployment (params: deployment, namespace)
3. scale_deployment - Scale replicas (params: deployment, namespace, replicas)
4. get_logs - Get pod logs (params: pod, namespace, tail, container)
5. send_alert - Send Telegram alert (params: severity, message, incident_id)
Respond ONLY with a JSON object in this exact format:
{{"tool": "tool_name", "params": {{"key": "value"}}}}
"""
            response = await client.post(
                f"{OLLAMA_BASE_URL}/api/chat",
                json={
                    "model": model,
                    "messages": [
                        {"role": "user", "content": tool_prompt}
                    ],
                    "stream": False,
                    "format": "json",
                    "options": {
                        "temperature": 0.1
                    }
                }
            )
            # Latency covers the round trip only, not status/JSON handling.
            latency = (time.perf_counter() - start) * 1000
            response.raise_for_status()
            return {"data": response.json(), "latency_ms": latency, "error": None}
        except Exception as e:
            latency = (time.perf_counter() - start) * 1000
            # Some exceptions stringify to "" (timeouts commonly do). Callers
            # test ``error`` for truthiness, so fall back to the class name to
            # keep a failed call from looking like a success.
            return {"data": None, "latency_ms": latency, "error": str(e) or type(e).__name__}
def parse_nemotron_response(response: dict) -> tuple:
    """Extract the first tool call from a chat-completions response body.

    Args:
        response: Decoded JSON body of the chat-completions response.

    Returns:
        A ``(tool_name, params, error)`` tuple. When a tool call is present,
        ``tool_name`` is the function name, ``params`` its decoded arguments
        (``{}`` if they cannot be decoded into an object) and ``error`` is
        ``None``. Otherwise ``tool_name`` is ``None``, ``params`` is ``{}``
        and ``error`` explains what was missing.
    """
    try:
        choices = response.get("choices", [])
        if not choices:
            return (None, {}, "No choices in response")
        # "message" may be present but null; treat that like an empty message.
        message = choices[0].get("message") or {}

        tool_calls = message.get("tool_calls")
        if tool_calls:
            function = tool_calls[0]["function"]
            tool_name = function["name"]
            raw_args = function.get("arguments")
            if isinstance(raw_args, dict):
                # Arguments already decoded; use them as they are.
                params = raw_args
            else:
                try:
                    params = json.loads(raw_args)
                except (TypeError, ValueError):
                    # Undecodable arguments still count as a tool selection.
                    params = {}
                if not isinstance(params, dict):
                    params = {}
            return (tool_name, params, None)

        # No tool call: report the start of the text reply instead. The
        # content can be null, which must not break the slice below.
        content = message.get("content") or ""
        return (None, {}, f"No tool call, content: {content[:100]}")
    except (AttributeError, IndexError, KeyError, TypeError) as e:
        # The payload did not have the expected structure.
        return (None, {}, str(e))
def parse_ollama_response(response: dict) -> tuple:
    """Extract the selected tool from an Ollama JSON-mode chat response.

    The message content is expected to be a JSON object of the form
    ``{"tool": "<name>", "params": {...}}``.

    Args:
        response: Decoded JSON body of the Ollama chat response.

    Returns:
        A ``(tool_name, params, error)`` tuple. On success ``error`` is
        ``None`` and ``params`` is always a dict. When the content is not
        valid JSON, is not a JSON object, or names no tool, ``tool_name`` is
        ``None``, ``params`` is ``{}`` and ``error`` describes the problem.
    """
    try:
        message = response.get("message") or {}
        content = message.get("content", "{}")
        parsed = json.loads(content)
    except (AttributeError, TypeError, ValueError) as e:
        # Malformed payload or content that is not valid JSON.
        return (None, {}, str(e))

    if not isinstance(parsed, dict):
        return (None, {}, f"Expected a JSON object, got {type(parsed).__name__}")

    tool = parsed.get("tool")
    if not tool:
        # Mirror the Nemotron parser: a reply without a tool is an error.
        return (None, {}, "No tool in response")

    params = parsed.get("params")
    if not isinstance(params, dict):
        # "params" may be missing or null; normalise to an empty mapping.
        params = {}
    return (tool, params, None)
# ============================================================================
# Test execution
# ============================================================================
def _build_result(model: str, test_case: dict, latency_ms: float, **fields) -> "TestResult":
    """Create a TestResult for ``model`` pre-filled from ``test_case``."""
    return TestResult(
        model=model,
        test_id=test_case["id"],
        description=test_case["description"],
        latency_ms=latency_ms,
        **fields,
    )


async def run_single_test(test_case: dict) -> list:
    """Run one test case against Nemotron and then Ollama.

    Progress is printed to stdout while the case runs.

    Args:
        test_case: Mapping with ``id``, ``description``, ``prompt`` and
            ``expected_tool`` keys.

    Returns:
        A list with two ``TestResult`` objects: Nemotron first, then Ollama.
        A result is successful when the tool picked by the model equals
        ``expected_tool``.
    """
    results = []
    prompt = test_case["prompt"]
    expected = test_case["expected_tool"]

    # --- Nemotron ----------------------------------------------------------
    print(" Testing Nemotron...", end=" ", flush=True)
    resp = await call_nemotron(prompt)
    # A missing body is a failure even if the error text came back empty.
    if resp["error"] or resp["data"] is None:
        results.append(_build_result(
            "Nemotron-mini-4B", test_case, resp["latency_ms"],
            success=False,
            tool_called=None,
            params=None,
            error=resp["error"] or "No response data",
        ))
        print("❌ Error")
    else:
        tool, params, error = parse_nemotron_response(resp["data"])
        success = tool == expected
        # Best-effort snapshot of the message for debugging; a payload with
        # an unexpected shape simply leaves it unset.
        try:
            raw = json.dumps(resp["data"].get("choices", [{}])[0].get("message", {}), indent=2)[:200]
        except (AttributeError, IndexError, KeyError, TypeError, ValueError):
            raw = None
        results.append(_build_result(
            "Nemotron-mini-4B", test_case, resp["latency_ms"],
            success=success,
            tool_called=tool,
            params=params,
            error=error,
            raw_response=raw,
        ))
        # Distinct markers so a pass and a fail are distinguishable.
        status = "✅" if success else "❌"
        print(f"{status} {tool} ({resp['latency_ms']:.0f}ms)")

    # --- Ollama ------------------------------------------------------------
    print(" Testing Ollama...", end=" ", flush=True)
    resp = await call_ollama(prompt)
    if resp["error"] or resp["data"] is None:
        message = resp["error"] or "No response data"
        results.append(_build_result(
            "Ollama-Qwen2.5-7B", test_case, resp["latency_ms"],
            success=False,
            tool_called=None,
            params=None,
            error=message,
        ))
        print(f"❌ Error: {message[:50]}")
    else:
        tool, params, error = parse_ollama_response(resp["data"])
        success = tool == expected
        results.append(_build_result(
            "Ollama-Qwen2.5-7B", test_case, resp["latency_ms"],
            success=success,
            tool_called=tool,
            params=params,
            error=error,
        ))
        status = "✅" if success else "❌"
        print(f"{status} {tool} ({resp['latency_ms']:.0f}ms)")

    return results
def _mean(values: list) -> float:
    """Return the arithmetic mean of ``values``, or 0 when it is empty."""
    return sum(values) / len(values) if values else 0


def _accuracy(stats: dict) -> float:
    """Return the success percentage for one model's stats (0 if no runs)."""
    total = stats.get("total", 0)
    return stats.get("success", 0) / total * 100 if total else 0


async def main():
    """Run every test case against both models and print a comparison.

    Prints per-case progress, a per-model table (accuracy, average latency,
    error count), the list of errors, and a recommendation based on which
    model was more accurate and which was faster.
    """
    print("=" * 70)
    print("🧪 Nemotron vs Ollama Tool Calling 精準度測試")
    print("=" * 70)
    print()
    print("Nemotron API: integrate.api.nvidia.com")
    print(f"Ollama URL: {OLLAMA_BASE_URL}")
    print()

    all_results = []
    for i, tc in enumerate(TEST_CASES, 1):
        print(f"[{i}/{len(TEST_CASES)}] {tc['id']}: {tc['description']}")
        print(f" Prompt: {tc['prompt'][:60]}...")
        print(f" Expected: {tc['expected_tool']}")
        all_results.extend(await run_single_test(tc))
        print()

    # ========================================================================
    # Summary statistics
    # ========================================================================
    print("=" * 70)
    print("📊 統計結果")
    print("=" * 70)
    print()

    # Aggregate per model label, in order of first appearance.
    models = {}
    for r in all_results:
        stats = models.setdefault(
            r.model, {"success": 0, "total": 0, "latency": [], "errors": 0}
        )
        stats["total"] += 1
        if r.success:
            stats["success"] += 1
        if r.error:
            stats["errors"] += 1
        if r.latency_ms > 0:
            stats["latency"].append(r.latency_ms)

    print(f"{'Model':<25} {'Accuracy':<12} {'Avg Latency':<15} {'Errors':<10}")
    print("-" * 62)
    for model, stats in models.items():
        acc = _accuracy(stats)
        avg_lat = _mean(stats["latency"])
        print(f"{model:<25} {acc:>6.1f}% {avg_lat:>8.0f}ms {stats['errors']}")
    print()

    # Detailed error report
    errors = [r for r in all_results if r.error]
    if errors:
        print("=" * 70)
        print("⚠️ 錯誤詳情")
        print("=" * 70)
        for r in errors:
            print(f" [{r.test_id}] {r.model}: {r.error[:80]}")

    # Recommendation
    print()
    print("=" * 70)
    print("💡 建議")
    print("=" * 70)
    nemotron_stats = models.get("Nemotron-mini-4B", {})
    ollama_stats = models.get("Ollama-Qwen2.5-7B", {})
    nem_acc = _accuracy(nemotron_stats)
    oll_acc = _accuracy(ollama_stats)
    if nem_acc > oll_acc:
        print(f"✅ Nemotron Tool Calling 精準度較高 ({nem_acc:.0f}% vs {oll_acc:.0f}%)")
        print(" 建議: 將 Nemotron 作為 Tool Calling 任務的首選模型")
    elif oll_acc > nem_acc:
        print(f"⚠️ Ollama 精準度較高 ({oll_acc:.0f}% vs {nem_acc:.0f}%)")
        print(" 建議: 繼續使用 Ollama，Nemotron 可作為備援")
    else:
        print(f"📊 兩者精準度相近 ({nem_acc:.0f}% vs {oll_acc:.0f}%)")
        print(" 建議: 考慮延遲和成本選擇")

    # _mean tolerates an empty sample list, so a model with stats but no
    # positive latency samples no longer causes a ZeroDivisionError here.
    nem_lat = _mean(nemotron_stats.get("latency", []))
    oll_lat = _mean(ollama_stats.get("latency", []))
    print()
    if nem_lat < oll_lat:
        print(f"⚡ Nemotron 延遲較低 ({nem_lat:.0f}ms vs {oll_lat:.0f}ms)")
    else:
        print(f"🏠 Ollama 延遲較低 ({oll_lat:.0f}ms vs {nem_lat:.0f}ms) - 本地優勢")
    print()
    print("測試完成!")
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())