清理工作: - .gitignore 新增 playwright-report/ 和 test-results/ 排除 - 保留 phase19/ 參考截圖目錄 kube-state-metrics 告警擴充 (P3): - CronJobLastRunFailed: Job 執行失敗 - DaemonSetMissingPods: DaemonSet 缺少 Pod - StatefulSetReplicasMismatch: StatefulSet 副本不足 - ContainerWaiting: ImagePullBackOff/CrashLoopBackOff 偵測 - PDBViolation: PDB 健康 Pod 數不足 - NodeUnschedulable: 節點標記為不可排程 新增: - apps/api/scripts/test_nemotron_tool_calling.py (E2E 比較測試) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
497 lines
17 KiB
Python
497 lines
17 KiB
Python
#!/usr/bin/env python3
"""
Nemotron tool-calling accuracy test.

Compares the tool-calling ability of Nemotron against Ollama (Qwen).

Usage:
    export NVIDIA_API_KEY=nvapi-xxxx
    python test_nemotron_tool_calling.py

Author: Claude Code
Date: 2026-03-28 (Taipei time)
"""
|
||
|
||
import os
|
||
import json
|
||
import asyncio
|
||
import time
|
||
from dataclasses import dataclass
|
||
from typing import Optional
|
||
|
||
try:
|
||
import httpx
|
||
except ImportError:
|
||
print("請安裝 httpx: pip install httpx")
|
||
exit(1)
|
||
|
||
# ============================================================================
|
||
# 配置
|
||
# ============================================================================
|
||
|
||
# API key for the NVIDIA endpoint; read from the environment, never hard-coded.
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
# Base URL of the Ollama server; overridable through the environment.
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://192.168.0.188:11434")

if not NVIDIA_API_KEY:
    print("❌ 請設定 NVIDIA_API_KEY 環境變數")
    print(" export NVIDIA_API_KEY=nvapi-xxxx")
    # `exit()` is a helper injected by the `site` module for interactive use and
    # is not guaranteed to exist; raising SystemExit works in every interpreter.
    raise SystemExit(1)
|
||
|
||
# ============================================================================
|
||
# Tool 定義 (K8s SRE 場景)
|
||
# ============================================================================
|
||
|
||
def _function_tool(name, description, properties, required):
    """Wrap one function schema in the ``{"type": "function", ...}`` envelope."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required,
            },
        },
    }


# Function-tool schemas for the K8s SRE scenario.
TOOLS = [
    _function_tool(
        "kubectl_get",
        "Get Kubernetes resources (pods, deployments, services, etc.)",
        {
            "resource": {
                "type": "string",
                "enum": ["pods", "deployments", "services", "nodes", "events"],
                "description": "Resource type to query",
            },
            "namespace": {
                "type": "string",
                "description": "Kubernetes namespace (default: awoooi-prod)",
            },
            "name": {
                "type": "string",
                "description": "Specific resource name (optional)",
            },
        },
        ["resource"],
    ),
    _function_tool(
        "restart_deployment",
        "Restart a Kubernetes deployment by rolling restart",
        {
            "deployment": {
                "type": "string",
                "description": "Deployment name",
            },
            "namespace": {
                "type": "string",
                "description": "Kubernetes namespace",
            },
        },
        ["deployment", "namespace"],
    ),
    _function_tool(
        "scale_deployment",
        "Scale a Kubernetes deployment to specified replicas",
        {
            "deployment": {"type": "string"},
            "namespace": {"type": "string"},
            "replicas": {"type": "integer", "minimum": 0, "maximum": 10},
        },
        ["deployment", "namespace", "replicas"],
    ),
    _function_tool(
        "get_logs",
        "Get logs from a Kubernetes pod",
        {
            "pod": {"type": "string"},
            "namespace": {"type": "string"},
            "tail": {"type": "integer", "description": "Number of lines (default: 100)"},
            "container": {"type": "string", "description": "Container name (optional)"},
        },
        ["pod", "namespace"],
    ),
    _function_tool(
        "send_alert",
        "Send alert notification via Telegram",
        {
            "severity": {"type": "string", "enum": ["info", "warning", "critical"]},
            "message": {"type": "string"},
            "incident_id": {"type": "string"},
        },
        ["severity", "message"],
    ),
]
|
||
|
||
# ============================================================================
|
||
# 測試案例
|
||
# ============================================================================
|
||
|
||
# One row per scenario: (id, description, prompt, expected tool name).
_CASE_ROWS = (
    (
        "TC001",
        "簡單查詢 - 列出所有 pods",
        "Show me all pods in awoooi-prod namespace",
        "kubectl_get",
    ),
    (
        "TC002",
        "重啟服務",
        "The API is not responding, please restart the awoooi-api deployment in awoooi-prod",
        "restart_deployment",
    ),
    (
        "TC003",
        "擴展副本",
        "We're getting high traffic, scale awoooi-web deployment to 3 replicas in awoooi-prod namespace",
        "scale_deployment",
    ),
    (
        "TC004",
        "查看日誌",
        "Get the last 50 lines of logs from awoooi-api-abc123 pod in awoooi-prod",
        "get_logs",
    ),
    (
        "TC005",
        "發送告警",
        "Send a critical alert with message 'Database connection failed' for incident INC-2026-001",
        "send_alert",
    ),
    (
        "TC006",
        "繁體中文指令",
        "請幫我重啟 awoooi-worker 這個 deployment,namespace 是 awoooi-prod",
        "restart_deployment",
    ),
    (
        "TC007",
        "複合理解",
        "The web frontend is showing 502 errors. First, check if the API pods are running in awoooi-prod.",
        "kubectl_get",
    ),
)

# Expand the rows into the dict shape used by the rest of the script.
TEST_CASES = [
    {
        "id": case_id,
        "description": description,
        "prompt": prompt,
        "expected_tool": expected_tool,
    }
    for case_id, description, prompt, expected_tool in _CASE_ROWS
]
|
||
|
||
# ============================================================================
|
||
# API 客戶端
|
||
# ============================================================================
|
||
|
||
@dataclass
class TestResult:
    """Outcome of running one test case against one model."""

    # The class name starts with "Test", so pytest would try to collect it as a
    # test class; this un-annotated class attribute opts out and is not a
    # dataclass field.
    __test__ = False

    model: str  # Display label of the model under test.
    test_id: str  # Identifier of the test case.
    description: str  # Description of the test case.
    success: bool  # True when the called tool matched the expected tool.
    tool_called: Optional[str]  # Tool name extracted from the reply, if any.
    params: Optional[dict]  # Tool arguments extracted from the reply, if any.
    latency_ms: float  # Request duration in milliseconds.
    error: Optional[str] = None  # Request or parse error message, if any.
    raw_response: Optional[str] = None  # Truncated raw reply kept for debugging.
|
||
|
||
|
||
async def call_nemotron(prompt: str, model: str = "nvidia/nemotron-mini-4b-instruct") -> dict:
    """Send one tool-calling chat completion request to the NVIDIA API.

    Args:
        prompt: User message to send.
        model: Model identifier placed in the request body.

    Returns:
        A dict with three keys: ``data`` (the decoded JSON body, or ``None`` on
        failure), ``latency_ms`` (elapsed milliseconds) and ``error`` (``None``
        on success, otherwise a non-empty description). Exceptions raised while
        sending, status-checking or decoding are caught and reported through
        ``error`` instead of propagating.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {NVIDIA_API_KEY}"
    }
    payload = {
        "model": model,
        "messages": [
            {
                "role": "system",
                "content": "You are an SRE assistant for AWOOOI AIOps platform. Use the provided tools to help with Kubernetes operations. Always use tools when the user requests an action."
            },
            {"role": "user", "content": prompt}
        ],
        "tools": TOOLS,
        "tool_choice": "auto",
        "temperature": 0.1,
        "max_tokens": 512
    }

    async with httpx.AsyncClient(timeout=60) as client:
        # perf_counter is monotonic, so the measured interval cannot be skewed
        # by wall-clock adjustments the way time.time() can.
        start = time.perf_counter()
        try:
            response = await client.post(
                "https://integrate.api.nvidia.com/v1/chat/completions",
                headers=headers,
                json=payload,
            )
            latency = (time.perf_counter() - start) * 1000
            response.raise_for_status()
            return {"data": response.json(), "latency_ms": latency, "error": None}
        except Exception as e:
            latency = (time.perf_counter() - start) * 1000
            # An exception can stringify to "", which callers would read as
            # "no error"; fall back to repr() so the error is always truthy.
            return {"data": None, "latency_ms": latency, "error": str(e) or repr(e)}
|
||
|
||
|
||
async def call_ollama(prompt: str, model: str = "qwen2.5:7b-instruct") -> dict:
    """Ask an Ollama model to pick a tool, emulating tool calling via JSON mode.

    Args:
        prompt: User request embedded into the tool-selection prompt.
        model: Ollama model name placed in the request body.

    Returns:
        A dict with three keys: ``data`` (the decoded JSON body, or ``None`` on
        failure), ``latency_ms`` (elapsed milliseconds) and ``error`` (``None``
        on success, otherwise a non-empty description). Exceptions raised while
        sending, status-checking or decoding are caught and reported through
        ``error`` instead of propagating.
    """
    # No ``tools`` field is sent: the tool list is described in the prompt and
    # the model is asked to answer with a JSON object. The prompt is built
    # before the timer starts so it does not count towards latency.
    tool_prompt = f"""Based on this user request, determine which tool to call and with what parameters.

User Request: {prompt}

Available tools:
1. kubectl_get - Get K8s resources (params: resource, namespace, name)
2. restart_deployment - Restart a deployment (params: deployment, namespace)
3. scale_deployment - Scale replicas (params: deployment, namespace, replicas)
4. get_logs - Get pod logs (params: pod, namespace, tail, container)
5. send_alert - Send Telegram alert (params: severity, message, incident_id)

Respond ONLY with a JSON object in this exact format:
{{"tool": "tool_name", "params": {{"key": "value"}}}}
"""
    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": tool_prompt}
        ],
        "stream": False,
        "format": "json",
        "options": {
            "temperature": 0.1
        }
    }

    async with httpx.AsyncClient(timeout=120) as client:
        # perf_counter is monotonic, so the measured interval cannot be skewed
        # by wall-clock adjustments the way time.time() can.
        start = time.perf_counter()
        try:
            response = await client.post(f"{OLLAMA_BASE_URL}/api/chat", json=payload)
            latency = (time.perf_counter() - start) * 1000
            response.raise_for_status()
            return {"data": response.json(), "latency_ms": latency, "error": None}
        except Exception as e:
            latency = (time.perf_counter() - start) * 1000
            # An exception can stringify to "", which callers would read as
            # "no error"; fall back to repr() so the error is always truthy.
            return {"data": None, "latency_ms": latency, "error": str(e) or repr(e)}
|
||
|
||
|
||
def parse_nemotron_response(response: dict) -> tuple:
    """Extract the first tool call from a chat-completion response body.

    Args:
        response: Decoded JSON body containing a ``choices`` list.

    Returns:
        A ``(tool_name, params, error)`` tuple. When a tool call is present,
        ``tool_name`` is its function name, ``params`` is the decoded argument
        object (``{}`` if the arguments are missing, malformed or not a JSON
        object) and ``error`` is ``None``. Otherwise ``tool_name`` is ``None``,
        ``params`` is ``{}`` and ``error`` explains why.
    """
    try:
        choices = response.get("choices", [])
        if not choices:
            return (None, {}, "No choices in response")

        # "message" may be present but null; treat that like an empty message.
        message = choices[0].get("message") or {}

        tool_calls = message.get("tool_calls")
        if tool_calls:
            function = tool_calls[0]["function"]
            tool_name = function["name"]
            raw_args = function.get("arguments")

            params = {}
            if isinstance(raw_args, dict):
                # Arguments were already decoded into an object.
                params = raw_args
            elif isinstance(raw_args, str):
                try:
                    decoded = json.loads(raw_args)
                except ValueError:
                    # Malformed argument JSON still counts as a tool call.
                    decoded = None
                if isinstance(decoded, dict):
                    params = decoded
            return (tool_name, params, None)

        # No tool call: report the text reply. "content" can be null, which
        # must not break the slice below.
        content = message.get("content") or ""
        return (None, {}, f"No tool call, content: {content[:100]}")

    except (AttributeError, LookupError, TypeError) as e:
        # Structurally unexpected payload (wrong types or missing keys).
        return (None, {}, str(e))
|
||
|
||
|
||
def parse_ollama_response(response: dict) -> tuple:
    """Extract the tool choice from an Ollama chat response body.

    The model's reply text (``message.content``) is expected to be a JSON
    object of the form ``{"tool": ..., "params": {...}}``.

    Args:
        response: Decoded JSON body containing a ``message`` object.

    Returns:
        A ``(tool_name, params, error)`` tuple. ``params`` is always a dict
        (``{}`` when absent or not an object). ``error`` is ``None`` when the
        reply decoded to a JSON object, otherwise a description of the failure.
    """
    try:
        content = response.get("message", {}).get("content", "{}")
        parsed = json.loads(content)
    except (AttributeError, TypeError, ValueError) as e:
        # Wrong payload shape, non-string content, or invalid JSON text.
        return (None, {}, str(e))

    if not isinstance(parsed, dict):
        # Valid JSON, but not the object shape the prompt asked for.
        return (None, {}, f"Expected a JSON object, got {type(parsed).__name__}")

    params = parsed.get("params")
    if not isinstance(params, dict):
        # Covers a missing key as well as an explicit null.
        params = {}
    return (parsed.get("tool"), params, None)
|
||
|
||
|
||
# ============================================================================
|
||
# 測試執行
|
||
# ============================================================================
|
||
|
||
async def _run_model_test(test_case, display_name, model_label, call_model,
                          parse_response, capture_raw=False):
    """Run one test case against one model and return its TestResult."""
    print(f" Testing {display_name}...", end=" ", flush=True)
    resp = await call_model(test_case["prompt"])

    # A missing payload is a failure even if the error string is empty.
    if resp["error"] or resp["data"] is None:
        failure = resp["error"] or "No response data"
        print(f"❌ Error: {failure[:50]}")
        return TestResult(
            model=model_label,
            test_id=test_case["id"],
            description=test_case["description"],
            success=False,
            tool_called=None,
            params=None,
            latency_ms=resp["latency_ms"],
            error=failure,
        )

    tool, params, error = parse_response(resp["data"])
    success = tool == test_case["expected_tool"]

    raw = None
    if capture_raw:
        # The snippet is debugging aid only; an unexpected payload shape must
        # not fail the test run, so only shape/serialisation errors are ignored.
        try:
            raw = json.dumps(
                resp["data"].get("choices", [{}])[0].get("message", {}), indent=2
            )[:200]
        except (AttributeError, LookupError, TypeError, ValueError):
            raw = None

    status = "✅" if success else "❌"
    print(f"{status} {tool} ({resp['latency_ms']:.0f}ms)")
    return TestResult(
        model=model_label,
        test_id=test_case["id"],
        description=test_case["description"],
        success=success,
        tool_called=tool,
        params=params,
        latency_ms=resp["latency_ms"],
        error=error,
        raw_response=raw,
    )


async def run_single_test(test_case: dict) -> list:
    """Run a single test case against Nemotron, then Ollama.

    Args:
        test_case: Dict with ``id``, ``description``, ``prompt`` and
            ``expected_tool`` keys.

    Returns:
        A two-element list of TestResult objects: Nemotron first, Ollama
        second. A result is successful when the parsed tool name equals
        ``expected_tool``; only the Nemotron result carries a raw-response
        snippet.
    """
    nemotron_result = await _run_model_test(
        test_case,
        "Nemotron",
        "Nemotron-mini-4B",
        call_nemotron,
        parse_nemotron_response,
        capture_raw=True,
    )
    ollama_result = await _run_model_test(
        test_case,
        "Ollama",
        "Ollama-Qwen2.5-7B",
        call_ollama,
        parse_ollama_response,
    )
    return [nemotron_result, ollama_result]
|
||
|
||
|
||
def _accuracy(stats: dict) -> float:
    """Return the success rate in percent, or 0 when nothing was counted."""
    total = stats.get("total", 0)
    return stats.get("success", 0) / total * 100 if total else 0.0


def _avg_latency(stats: dict) -> float:
    """Return the mean recorded latency in ms, or 0 when none was recorded."""
    samples = stats.get("latency") or []
    return sum(samples) / len(samples) if samples else 0.0


async def main():
    """Run every test case against both models and print a comparison report.

    The report contains per-test progress lines, a per-model summary table
    (accuracy, average latency, error count), the individual error messages,
    and a recommendation based on accuracy and latency.
    """
    banner = "=" * 70

    print(banner)
    print("🧪 Nemotron vs Ollama Tool Calling 精準度測試")
    print(banner)
    print()
    print("Nemotron API: integrate.api.nvidia.com")
    print(f"Ollama URL: {OLLAMA_BASE_URL}")
    print()

    all_results = []
    for i, tc in enumerate(TEST_CASES, 1):
        print(f"[{i}/{len(TEST_CASES)}] {tc['id']}: {tc['description']}")
        print(f" Prompt: {tc['prompt'][:60]}...")
        print(f" Expected: {tc['expected_tool']}")

        all_results.extend(await run_single_test(tc))
        print()

    # ------------------------------------------------------------------
    # Summary statistics
    # ------------------------------------------------------------------
    print(banner)
    print("📊 統計結果")
    print(banner)
    print()

    models = {}
    for r in all_results:
        stats = models.setdefault(
            r.model, {"success": 0, "total": 0, "latency": [], "errors": 0}
        )
        stats["total"] += 1
        if r.success:
            stats["success"] += 1
        if r.error:
            stats["errors"] += 1
        if r.latency_ms > 0:
            stats["latency"].append(r.latency_ms)

    print(f"{'Model':<25} {'Accuracy':<12} {'Avg Latency':<15} {'Errors':<10}")
    print("-" * 62)
    for model, stats in models.items():
        print(
            f"{model:<25} {_accuracy(stats):>6.1f}% "
            f"{_avg_latency(stats):>8.0f}ms {stats['errors']}"
        )

    print()

    # Detailed error report
    errors = [r for r in all_results if r.error]
    if errors:
        print(banner)
        print("⚠️ 錯誤詳情")
        print(banner)
        for r in errors:
            print(f" [{r.test_id}] {r.model}: {r.error[:80]}")

    # Recommendation
    print()
    print(banner)
    print("💡 建議")
    print(banner)

    nemotron_stats = models.get("Nemotron-mini-4B", {})
    ollama_stats = models.get("Ollama-Qwen2.5-7B", {})

    nem_acc = _accuracy(nemotron_stats)
    oll_acc = _accuracy(ollama_stats)

    if nem_acc > oll_acc:
        print(f"✅ Nemotron Tool Calling 精準度較高 ({nem_acc:.0f}% vs {oll_acc:.0f}%)")
        print(" 建議: 將 Nemotron 作為 Tool Calling 任務的首選模型")
    elif oll_acc > nem_acc:
        print(f"⚠️ Ollama 精準度較高 ({oll_acc:.0f}% vs {nem_acc:.0f}%)")
        print(" 建議: 繼續使用 Ollama,Nemotron 可作為備援")
    else:
        print(f"📊 兩者精準度相近 ({nem_acc:.0f}% vs {oll_acc:.0f}%)")
        print(" 建議: 考慮延遲和成本選擇")

    # The shared helper guards against an empty latency list, which would
    # otherwise divide by zero here.
    nem_lat = _avg_latency(nemotron_stats)
    oll_lat = _avg_latency(ollama_stats)

    print()
    if nem_lat < oll_lat:
        print(f"⚡ Nemotron 延遲較低 ({nem_lat:.0f}ms vs {oll_lat:.0f}ms)")
    else:
        print(f"🏠 Ollama 延遲較低 ({oll_lat:.0f}ms vs {nem_lat:.0f}ms) - 本地優勢")

    print()
    print("測試完成!")


if __name__ == "__main__":
    asyncio.run(main())
|