chore: 清理 Playwright 產物 + kube-state-metrics 告警擴充
清理工作: - .gitignore 新增 playwright-report/ 和 test-results/ 排除 - 保留 phase19/ 參考截圖目錄 kube-state-metrics 告警擴充 (P3): - CronJobLastRunFailed: Job 執行失敗 - DaemonSetMissingPods: DaemonSet 缺少 Pod - StatefulSetReplicasMismatch: StatefulSet 副本不足 - ContainerWaiting: ImagePullBackOff/CrashLoopBackOff 偵測 - PDBViolation: PDB 健康 Pod 數不足 - NodeUnschedulable: 節點標記為不可排程 新增: - apps/api/scripts/test_nemotron_tool_calling.py (E2E 比較測試) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
6
.gitignore
vendored
6
.gitignore
vendored
@@ -17,6 +17,12 @@ build/
|
||||
dist/
|
||||
.turbo/
|
||||
|
||||
# Playwright 測試產物 (動態生成,不需版本控制)
|
||||
**/playwright-report/
|
||||
**/test-results/
|
||||
# 保留 Phase 19 參考截圖
|
||||
!apps/web/test-results/phase19/
|
||||
|
||||
# Python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
|
||||
496
apps/api/scripts/test_nemotron_tool_calling.py
Normal file
496
apps/api/scripts/test_nemotron_tool_calling.py
Normal file
@@ -0,0 +1,496 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Nemotron Tool Calling 精準度測試
|
||||
比較 Nemotron vs Ollama (Qwen) 的 Tool Calling 能力
|
||||
|
||||
使用方式:
|
||||
export NVIDIA_API_KEY=nvapi-xxxx
|
||||
python test_nemotron_tool_calling.py
|
||||
|
||||
建立者: Claude Code
|
||||
日期: 2026-03-28 (台北時間)
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import asyncio
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
# httpx is the only third-party dependency; fail fast with an install hint.
try:
    import httpx
except ImportError:
    print("請安裝 httpx: pip install httpx")
    # `exit()` is a helper injected by the `site` module for interactive use
    # and may be missing (e.g. `python -S`); SystemExit is always available.
    raise SystemExit(1)
|
||||
|
||||
# ============================================================================
|
||||
# 配置
|
||||
# ============================================================================
|
||||
|
||||
# Credentials / endpoints are read from the environment once, at import time.
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
# Strip a trailing slash so f"{OLLAMA_BASE_URL}/api/chat" never becomes
# ".../​/api/chat" when the variable is set as "http://host:11434/".
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://192.168.0.188:11434").rstrip("/")

# The NVIDIA key is mandatory: abort before any test is attempted.
if not NVIDIA_API_KEY:
    print("❌ 請設定 NVIDIA_API_KEY 環境變數")
    print(" export NVIDIA_API_KEY=nvapi-xxxx")
    # SystemExit instead of the site-provided `exit()` helper, which is meant
    # for the interactive interpreter and is not guaranteed to exist.
    raise SystemExit(1)
|
||||
|
||||
# ============================================================================
|
||||
# Tool 定義 (K8s SRE 場景)
|
||||
# ============================================================================
|
||||
|
||||
def _function_tool(name, description, properties, required):
    """Wrap one function schema in the ``{"type": "function", ...}`` envelope."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required,
            },
        },
    }


# Tool schemas sent with every request in the ``tools`` field
# (Kubernetes SRE scenario).
TOOLS = [
    _function_tool(
        "kubectl_get",
        "Get Kubernetes resources (pods, deployments, services, etc.)",
        {
            "resource": {
                "type": "string",
                "enum": ["pods", "deployments", "services", "nodes", "events"],
                "description": "Resource type to query",
            },
            "namespace": {
                "type": "string",
                "description": "Kubernetes namespace (default: awoooi-prod)",
            },
            "name": {
                "type": "string",
                "description": "Specific resource name (optional)",
            },
        },
        ["resource"],
    ),
    _function_tool(
        "restart_deployment",
        "Restart a Kubernetes deployment by rolling restart",
        {
            "deployment": {
                "type": "string",
                "description": "Deployment name",
            },
            "namespace": {
                "type": "string",
                "description": "Kubernetes namespace",
            },
        },
        ["deployment", "namespace"],
    ),
    _function_tool(
        "scale_deployment",
        "Scale a Kubernetes deployment to specified replicas",
        {
            "deployment": {"type": "string"},
            "namespace": {"type": "string"},
            "replicas": {"type": "integer", "minimum": 0, "maximum": 10},
        },
        ["deployment", "namespace", "replicas"],
    ),
    _function_tool(
        "get_logs",
        "Get logs from a Kubernetes pod",
        {
            "pod": {"type": "string"},
            "namespace": {"type": "string"},
            "tail": {"type": "integer", "description": "Number of lines (default: 100)"},
            "container": {"type": "string", "description": "Container name (optional)"},
        },
        ["pod", "namespace"],
    ),
    _function_tool(
        "send_alert",
        "Send alert notification via Telegram",
        {
            "severity": {"type": "string", "enum": ["info", "warning", "critical"]},
            "message": {"type": "string"},
            "incident_id": {"type": "string"},
        },
        ["severity", "message"],
    ),
]
|
||||
|
||||
# ============================================================================
|
||||
# 測試案例
|
||||
# ============================================================================
|
||||
|
||||
# Each row is (id, description, prompt, expected_tool); rows are expanded into
# the dict form the runner consumes.
_TEST_CASE_FIELDS = ("id", "description", "prompt", "expected_tool")

TEST_CASES = [
    dict(zip(_TEST_CASE_FIELDS, row))
    for row in (
        (
            "TC001",
            "簡單查詢 - 列出所有 pods",
            "Show me all pods in awoooi-prod namespace",
            "kubectl_get",
        ),
        (
            "TC002",
            "重啟服務",
            "The API is not responding, please restart the awoooi-api deployment in awoooi-prod",
            "restart_deployment",
        ),
        (
            "TC003",
            "擴展副本",
            "We're getting high traffic, scale awoooi-web deployment to 3 replicas in awoooi-prod namespace",
            "scale_deployment",
        ),
        (
            "TC004",
            "查看日誌",
            "Get the last 50 lines of logs from awoooi-api-abc123 pod in awoooi-prod",
            "get_logs",
        ),
        (
            "TC005",
            "發送告警",
            "Send a critical alert with message 'Database connection failed' for incident INC-2026-001",
            "send_alert",
        ),
        (
            "TC006",
            "繁體中文指令",
            "請幫我重啟 awoooi-worker 這個 deployment,namespace 是 awoooi-prod",
            "restart_deployment",
        ),
        (
            "TC007",
            "複合理解",
            "The web frontend is showing 502 errors. First, check if the API pods are running in awoooi-prod.",
            "kubectl_get",
        ),
    )
]
|
||||
|
||||
# ============================================================================
|
||||
# API 客戶端
|
||||
# ============================================================================
|
||||
|
||||
@dataclass
class TestResult:
    """Outcome of running one test case against one model."""

    # Display label of the model under test; also the grouping key in the report.
    model: str
    # Test case identifier and its human-readable description.
    test_id: str
    description: str
    # True when the tool the model chose equals the expected tool.
    success: bool
    # Tool name and arguments extracted from the reply (None on transport error).
    tool_called: Optional[str]
    params: Optional[dict]
    # Request latency in milliseconds.
    latency_ms: float
    # Transport or parse error message, if any.
    error: Optional[str] = None
    # Truncated dump of the raw reply message, kept for debugging.
    raw_response: Optional[str] = None
|
||||
|
||||
|
||||
async def call_nemotron(prompt: str, model: str = "nvidia/nemotron-mini-4b-instruct") -> dict:
    """Send one chat-completion request with tool definitions to the NVIDIA NIM API.

    Args:
        prompt: User message to send.
        model: Model identifier passed in the request body.

    Returns:
        A dict with three keys: ``data`` (the decoded JSON body, or None on
        failure), ``latency_ms`` (elapsed milliseconds around the POST) and
        ``error`` (None on success, otherwise a non-empty message).
    """
    async with httpx.AsyncClient(timeout=60) as client:
        # perf_counter is monotonic, so the latency figure cannot be skewed by
        # wall-clock adjustments the way time.time() can.
        start = time.perf_counter()
        try:
            response = await client.post(
                "https://integrate.api.nvidia.com/v1/chat/completions",
                headers={
                    "Content-Type": "application/json",
                    "Authorization": f"Bearer {NVIDIA_API_KEY}",
                },
                json={
                    "model": model,
                    "messages": [
                        {
                            "role": "system",
                            "content": "You are an SRE assistant for AWOOOI AIOps platform. Use the provided tools to help with Kubernetes operations. Always use tools when the user requests an action.",
                        },
                        {"role": "user", "content": prompt},
                    ],
                    "tools": TOOLS,
                    "tool_choice": "auto",
                    "temperature": 0.1,
                    "max_tokens": 512,
                },
            )
            latency = (time.perf_counter() - start) * 1000
            response.raise_for_status()
            return {"data": response.json(), "latency_ms": latency, "error": None}
        except Exception as e:
            # Deliberate catch-all: failures are reported as data so one bad
            # request does not abort the whole comparison run.
            latency = (time.perf_counter() - start) * 1000
            # Some exceptions stringify to "", which callers would treat as
            # "no error"; fall back to repr() so the message is never empty.
            return {"data": None, "latency_ms": latency, "error": str(e) or repr(e)}
|
||||
|
||||
|
||||
async def call_ollama(prompt: str, model: str = "qwen2.5:7b-instruct") -> dict:
    """Ask a local Ollama model which tool to call, using JSON-mode output.

    Args:
        prompt: User request to classify into a tool call.
        model: Ollama model tag passed in the request body.

    Returns:
        A dict with three keys: ``data`` (the decoded JSON body, or None on
        failure), ``latency_ms`` (elapsed milliseconds around the POST) and
        ``error`` (None on success, otherwise a non-empty message).
    """
    async with httpx.AsyncClient(timeout=120) as client:
        # perf_counter is monotonic, so the latency figure cannot be skewed by
        # wall-clock adjustments the way time.time() can.
        start = time.perf_counter()
        try:
            # Tool calling is emulated here: the tools are described in the
            # prompt and the model is forced to answer with a JSON object.
            tool_prompt = f"""Based on this user request, determine which tool to call and with what parameters.

User Request: {prompt}

Available tools:
1. kubectl_get - Get K8s resources (params: resource, namespace, name)
2. restart_deployment - Restart a deployment (params: deployment, namespace)
3. scale_deployment - Scale replicas (params: deployment, namespace, replicas)
4. get_logs - Get pod logs (params: pod, namespace, tail, container)
5. send_alert - Send Telegram alert (params: severity, message, incident_id)

Respond ONLY with a JSON object in this exact format:
{{"tool": "tool_name", "params": {{"key": "value"}}}}
"""
            response = await client.post(
                f"{OLLAMA_BASE_URL}/api/chat",
                json={
                    "model": model,
                    "messages": [
                        {"role": "user", "content": tool_prompt},
                    ],
                    "stream": False,
                    "format": "json",
                    "options": {
                        "temperature": 0.1,
                    },
                },
            )
            latency = (time.perf_counter() - start) * 1000
            response.raise_for_status()
            return {"data": response.json(), "latency_ms": latency, "error": None}
        except Exception as e:
            # Deliberate catch-all: failures are reported as data so one bad
            # request does not abort the whole comparison run.
            latency = (time.perf_counter() - start) * 1000
            # Some exceptions stringify to "", which callers would treat as
            # "no error"; fall back to repr() so the message is never empty.
            return {"data": None, "latency_ms": latency, "error": str(e) or repr(e)}
|
||||
|
||||
|
||||
def parse_nemotron_response(response: dict) -> tuple:
    """Extract the first tool call from a chat-completion style response.

    Args:
        response: Decoded JSON body containing a ``choices`` list.

    Returns:
        A ``(tool_name, params, error)`` tuple. On success ``error`` is None.
        When no tool call is present ``tool_name`` is None, ``params`` is an
        empty dict and ``error`` describes why.
    """
    try:
        choices = response.get("choices", [])
        if not choices:
            return (None, {}, "No choices in response")

        message = choices[0].get("message", {})

        tool_calls = message.get("tool_calls")
        if tool_calls:
            function = tool_calls[0]["function"]
            tool_name = function["name"]
            raw_args = function.get("arguments")
            if isinstance(raw_args, dict):
                # Arguments already decoded: use them instead of dropping them.
                params = raw_args
            else:
                try:
                    params = json.loads(raw_args)
                except (TypeError, ValueError):
                    # Missing or malformed arguments: the tool choice is still
                    # reported, just without parameters.
                    params = {}
                if not isinstance(params, dict):
                    params = {}
            return (tool_name, params, None)

        # No tool call: surface the start of the text reply instead.
        # `content` may be present but null, hence `or ""` rather than a
        # .get() default, which only covers a missing key.
        content = message.get("content") or ""
        return (None, {}, f"No tool call, content: {content[:100]}")

    except Exception as e:
        # Any unexpected response shape is reported as a parse error.
        return (None, {}, str(e))
|
||||
|
||||
|
||||
def parse_ollama_response(response: dict) -> tuple:
    """Decode the JSON tool choice carried in an Ollama chat reply.

    Args:
        response: Decoded JSON body; the tool choice is read from
            ``response["message"]["content"]`` as a JSON string.

    Returns:
        A ``(tool_name, params, error)`` tuple. ``error`` is None when the
        content decoded cleanly, otherwise the exception text.
    """
    try:
        message = response.get("message", {})
        # A missing content field decodes to an empty object.
        payload = json.loads(message.get("content", "{}"))
        tool_name = payload.get("tool")
        arguments = payload.get("params", {})
    except Exception as exc:
        return (None, {}, str(exc))
    return (tool_name, arguments, None)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# 測試執行
|
||||
# ============================================================================
|
||||
|
||||
def _score_response(model_label, test_case, resp, parse, include_raw=False):
    """Turn one call_* result into a TestResult and print its status line."""
    common = {
        "model": model_label,
        "test_id": test_case["id"],
        "description": test_case["description"],
        "latency_ms": resp["latency_ms"],
    }

    # Treat the call as failed when there is no payload, even if the error
    # message happens to be an empty string.
    if resp["error"] is not None or resp["data"] is None:
        error = resp["error"] or "Unknown error"
        print(f"❌ Error: {error[:50]}")
        return TestResult(success=False, tool_called=None, params=None, error=error, **common)

    tool, params, error = parse(resp["data"])
    success = tool == test_case["expected_tool"]

    raw = None
    if include_raw:
        try:
            raw = json.dumps(resp["data"].get("choices", [{}])[0].get("message", {}), indent=2)[:200]
        except (AttributeError, IndexError, TypeError, ValueError):
            # The raw dump is diagnostic only; an odd payload must not fail the test.
            pass

    status = "✅" if success else "❌"
    print(f"{status} {tool} ({resp['latency_ms']:.0f}ms)")
    return TestResult(
        success=success,
        tool_called=tool,
        params=params,
        error=error,
        raw_response=raw,
        **common,
    )


async def run_single_test(test_case: dict) -> list:
    """Run one test case against Nemotron and then Ollama.

    Args:
        test_case: Dict with ``id``, ``description``, ``prompt`` and
            ``expected_tool`` keys.

    Returns:
        A list of two TestResult objects: Nemotron first, Ollama second.
    """
    prompt = test_case["prompt"]
    results = []

    # Nemotron
    print(f" Testing Nemotron...", end=" ", flush=True)
    resp = await call_nemotron(prompt)
    results.append(
        _score_response("Nemotron-mini-4B", test_case, resp, parse_nemotron_response, include_raw=True)
    )

    # Ollama
    print(f" Testing Ollama...", end=" ", flush=True)
    resp = await call_ollama(prompt)
    results.append(
        _score_response("Ollama-Qwen2.5-7B", test_case, resp, parse_ollama_response)
    )

    return results
|
||||
|
||||
|
||||
async def main():
    """Run every test case against both models and print a comparison report.

    The report has a per-model accuracy / average-latency table, a list of
    every result that carries an error message, and a short recommendation
    based on accuracy and latency.
    """

    def _mean(values):
        """Average of ``values``; 0 when there is nothing to average."""
        return sum(values) / len(values) if values else 0

    banner = "=" * 70

    print(banner)
    print("🧪 Nemotron vs Ollama Tool Calling 精準度測試")
    print(banner)
    print()
    print("Nemotron API: integrate.api.nvidia.com")
    print(f"Ollama URL: {OLLAMA_BASE_URL}")
    print()

    all_results = []

    for i, tc in enumerate(TEST_CASES, 1):
        print(f"[{i}/{len(TEST_CASES)}] {tc['id']}: {tc['description']}")
        print(f" Prompt: {tc['prompt'][:60]}...")
        print(f" Expected: {tc['expected_tool']}")

        results = await run_single_test(tc)
        all_results.extend(results)
        print()

    # ------------------------------------------------------------------
    # Summary statistics
    # ------------------------------------------------------------------
    print(banner)
    print("📊 統計結果")
    print(banner)
    print()

    models = {}
    for r in all_results:
        stats = models.setdefault(
            r.model, {"success": 0, "total": 0, "latency": [], "errors": 0}
        )
        stats["total"] += 1
        if r.success:
            stats["success"] += 1
        if r.error:
            stats["errors"] += 1
        if r.latency_ms > 0:
            stats["latency"].append(r.latency_ms)

    print(f"{'Model':<25} {'Accuracy':<12} {'Avg Latency':<15} {'Errors':<10}")
    print("-" * 62)
    for model, stats in models.items():
        acc = stats["success"] / stats["total"] * 100 if stats["total"] > 0 else 0
        avg_lat = _mean(stats["latency"])
        print(f"{model:<25} {acc:>6.1f}% {avg_lat:>8.0f}ms {stats['errors']}")

    print()

    # Detailed error report
    errors = [r for r in all_results if r.error]
    if errors:
        print(banner)
        print("⚠️ 錯誤詳情")
        print(banner)
        for r in errors:
            print(f" [{r.test_id}] {r.model}: {r.error[:80]}")

    # Recommendation
    print()
    print(banner)
    print("💡 建議")
    print(banner)

    nemotron_stats = models.get("Nemotron-mini-4B", {})
    ollama_stats = models.get("Ollama-Qwen2.5-7B", {})

    nem_acc = nemotron_stats.get("success", 0) / nemotron_stats.get("total", 1) * 100
    oll_acc = ollama_stats.get("success", 0) / ollama_stats.get("total", 1) * 100

    if nem_acc > oll_acc:
        print(f"✅ Nemotron Tool Calling 精準度較高 ({nem_acc:.0f}% vs {oll_acc:.0f}%)")
        print(" 建議: 將 Nemotron 作為 Tool Calling 任務的首選模型")
    elif oll_acc > nem_acc:
        print(f"⚠️ Ollama 精準度較高 ({oll_acc:.0f}% vs {nem_acc:.0f}%)")
        print(" 建議: 繼續使用 Ollama,Nemotron 可作為備援")
    else:
        print(f"📊 兩者精準度相近 ({nem_acc:.0f}% vs {oll_acc:.0f}%)")
        print(" 建議: 考慮延遲和成本選擇")

    # _mean() guards the case where a model has a stats entry but an empty
    # latency list, which would otherwise divide by zero.
    nem_lat = _mean(nemotron_stats.get("latency", []))
    oll_lat = _mean(ollama_stats.get("latency", []))

    print()
    if nem_lat < oll_lat:
        print(f"⚡ Nemotron 延遲較低 ({nem_lat:.0f}ms vs {oll_lat:.0f}ms)")
    else:
        print(f"🏠 Ollama 延遲較低 ({oll_lat:.0f}ms vs {nem_lat:.0f}ms) - 本地優勢")

    print()
    print("測試完成!")


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -1,127 +0,0 @@
|
||||
# Phase 19 Omni-Terminal 正式環境驗證報告
|
||||
|
||||
> **測試日期**: 2026-03-28 18:29 (台北時間)
|
||||
> **測試者**: Claude Code (首席架構師)
|
||||
> **環境**: 正式環境 (https://awoooi.wooo.work)
|
||||
> **Commit**: `7b9b0c4` (原測試) → 已修正測試路徑
|
||||
|
||||
---
|
||||
|
||||
## 一、測試摘要
|
||||
|
||||
| 項目 | 結果 |
|
||||
|------|------|
|
||||
| **測試總數** | 11 |
|
||||
| **通過** | 11 ✅ |
|
||||
| **失敗** | 0 |
|
||||
| **執行時間** | 22.2 秒 |
|
||||
| **截圖數量** | 10 張 |
|
||||
|
||||
---
|
||||
|
||||
## 二、問題修復紀錄
|
||||
|
||||
### 2.1 修復項目
|
||||
|
||||
| 問題 | 優先級 | 狀態 | 說明 |
|
||||
|------|--------|------|------|
|
||||
| `/incidents` 404 | P0 | ✅ 已修復 | 測試路徑錯誤,改為 `/action-logs` |
|
||||
| Header「已斷線」 | P1 | ✅ 非問題 | 正常 SSE 狀態顯示 |
|
||||
| Under Construction 頁面 | P1 | ✅ 非問題 | Phase 7.0 防禦性路由佔位設計 |
|
||||
|
||||
### 2.2 修正內容
|
||||
|
||||
```typescript
|
||||
// 修正前 (錯誤)
|
||||
test('02-Incidents事件', async ({ page }) => {
|
||||
await page.goto(`${BASE_URL}/zh-TW/incidents`) // 路由不存在
|
||||
})
|
||||
|
||||
// 修正後 (正確)
|
||||
test('02-ActionLogs行動日誌', async ({ page }) => {
|
||||
await page.goto(`${BASE_URL}/zh-TW/action-logs`) // 正確路由
|
||||
})
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 三、頁面驗證
|
||||
|
||||
### 3.1 截圖清單
|
||||
|
||||
| 頁面 | 截圖 | 檔案大小 |
|
||||
|------|------|----------|
|
||||
| 首頁 Dashboard | [01-dashboard.png](01-dashboard.png) | 463 KB |
|
||||
| Action Logs 行動日誌 | [02-action-logs.png](02-action-logs.png) | 59 KB |
|
||||
| Authorizations 簽核 | [03-authorizations.png](03-authorizations.png) | 33 KB |
|
||||
| Errors 錯誤追蹤 | [04-errors.png](04-errors.png) | 73 KB |
|
||||
| Knowledge Base 知識庫 | [05-knowledge-base.png](05-knowledge-base.png) | 33 KB |
|
||||
| Settings 設定 | [06-settings.png](06-settings.png) | 33 KB |
|
||||
| Mobile 響應式 | [08-mobile.png](08-mobile.png) | 159 KB |
|
||||
| Tablet 響應式 | [09-tablet.png](09-tablet.png) | 426 KB |
|
||||
| English 英文版 | [10-english.png](10-english.png) | 410 KB |
|
||||
| Demo 頁面 | [11-demo.png](11-demo.png) | 1.6 MB |
|
||||
|
||||
### 3.2 響應式設計
|
||||
|
||||
| 裝置 | 解析度 | 狀態 |
|
||||
|------|--------|------|
|
||||
| Desktop | 1920x1080 | ✅ |
|
||||
| Tablet | 768x1024 | ✅ |
|
||||
| Mobile | 375x812 | ✅ |
|
||||
|
||||
### 3.3 國際化 (i18n)
|
||||
|
||||
| 語系 | URL | 狀態 |
|
||||
|------|-----|------|
|
||||
| 繁體中文 | /zh-TW | ✅ |
|
||||
| English | /en | ✅ |
|
||||
|
||||
---
|
||||
|
||||
## 四、UI/UX 審查結果
|
||||
|
||||
### 4.1 正常行為確認
|
||||
|
||||
| 項目 | 說明 | 結論 |
|
||||
|------|------|------|
|
||||
| Header「已斷線」 | SSE 未連接時的正確狀態顯示 | ✅ 設計如此 |
|
||||
| 授權中心 Under Construction | Phase 7.0 防禦性路由佔位 | ✅ 設計如此 |
|
||||
| 知識殿堂 Under Construction | Phase 7.0 防禦性路由佔位 | ✅ 設計如此 |
|
||||
| 系統設定 Under Construction | Phase 7.0 防禦性路由佔位 | ✅ 設計如此 |
|
||||
|
||||
### 4.2 實際路由確認
|
||||
|
||||
| 路由 | 存在 | 說明 |
|
||||
|------|------|------|
|
||||
| `/` (Dashboard) | ✅ | 主儀表板 |
|
||||
| `/action-logs` | ✅ | 行動日誌 |
|
||||
| `/authorizations` | ✅ | 授權中心 (Under Construction) |
|
||||
| `/errors` | ✅ | 錯誤追蹤 |
|
||||
| `/knowledge-base` | ✅ | 知識殿堂 (Under Construction) |
|
||||
| `/settings` | ✅ | 系統設定 (Under Construction) |
|
||||
| `/demo` | ✅ | Demo 頁面 |
|
||||
|
||||
---
|
||||
|
||||
## 五、結論
|
||||
|
||||
### ✅ Phase 19 正式環境驗證通過
|
||||
|
||||
- **所有頁面** 可正常存取
|
||||
- **API** 健康檢查通過
|
||||
- **響應式設計** 三種裝置尺寸驗證通過
|
||||
- **國際化** 繁中/英文版本正常
|
||||
- **測試路徑錯誤** 已修復
|
||||
|
||||
### 驗收簽核
|
||||
|
||||
| 項目 | 簽核 |
|
||||
|------|------|
|
||||
| 首席架構師 | Claude Code ✅ |
|
||||
| 測試日期 | 2026-03-28 18:29 |
|
||||
| 修復完成 | 測試路徑錯誤已修正 |
|
||||
|
||||
---
|
||||
|
||||
**Generated by Claude Code (首席架構師)**
|
||||
@@ -5,17 +5,17 @@
|
||||
|
||||
---
|
||||
|
||||
## 📍 當前狀態 (2026-03-29 09:25 台北)
|
||||
## 📍 當前狀態 (2026-03-29 01:35 台北)
|
||||
|
||||
| 項目 | 狀態 |
|
||||
|------|------|
|
||||
| **當前 Phase** | ✅ **Phase 20 Nemotron Tool Calling (P1 修復完成)** |
|
||||
| **當前 Phase** | ✅ **Phase 19.6 測試收尾 + P1-P3 全部完成** |
|
||||
| **Day** | Day 12 |
|
||||
| **K3s 版本** | v1.34.5+k3s1 (mon + mon1) |
|
||||
| **叢集健康** | ✅ **所有 Pod 正常運行** |
|
||||
| **K3s 優化** | ✅ **全部完成 + P2/P3 + PSS** |
|
||||
| **K3s 優化** | ✅ **全部完成 + P1-P3 + PSS** |
|
||||
| **K-MON** | ✅ **監控整合** (VIP/Velero/SignOz/Sentry 告警) |
|
||||
| **K3 HPA** | ✅ **API/Web 2-4 自動擴展** (CPU 13%/21%) |
|
||||
| **K3 HPA** | ✅ **API/Web 2-6 自動擴展** (P3 maxReplicas 調升) |
|
||||
| **K4 Kured** | ✅ **自動重啟 (02:00-04:00 維護窗口)** |
|
||||
| **K4 Descheduler** | ✅ **負載均衡 (每 2 小時, threshold 30%)** |
|
||||
| **K4.3 PSS** | ✅ **Pod Security Standards (6 Namespace labels)** 🆕 |
|
||||
@@ -49,7 +49,7 @@
|
||||
|
||||
---
|
||||
|
||||
### ✅ 2026-03-29 Phase 19.6 測試收尾 + P1/P2 改進 (Day 12 01:00) 🆕
|
||||
### ✅ 2026-03-29 Phase 19.6 測試收尾 + P1-P3 改進 (Day 12 01:30) 🆕
|
||||
|
||||
| 項目 | 內容 | 狀態 |
|
||||
|------|------|------|
|
||||
@@ -79,18 +79,21 @@
|
||||
|
||||
---
|
||||
|
||||
### ✅ 2026-03-29 Phase 20 Nemotron P1 修復完成 (Day 12 09:20) 🆕
|
||||
### ✅ 2026-03-29 Phase 20 Nemotron P1+P2 全部完成 (Day 12 10:30) 🆕
|
||||
|
||||
| 項目 | 內容 | 狀態 |
|
||||
|------|------|------|
|
||||
| **ADR-036** | Nemotron Tool Calling 整合 | ✅ **已實作** |
|
||||
| **NvidiaProvider** | Tool Calling + HITL 保護 | ✅ **完成** |
|
||||
| **測試驗證** | tests/test_nvidia_provider.py | ✅ **15/15 PASSED** |
|
||||
| **測試驗證** | tests/test_nvidia_provider.py | ✅ **25/25 PASSED** |
|
||||
| **CD 部署** | CD #23689363463 | ✅ **成功** |
|
||||
| **Tool Calling 驗證** | restart_pod 測試 | ✅ **正確解析** |
|
||||
| **首席架構師審查** | 82/100 → 86/100 | ✅ **P1 已修復** |
|
||||
| **Langfuse 整合** | LangfuseTraceContext | ✅ **P1-1 修復** |
|
||||
| **OTEL Tracing** | start_as_current_span | ✅ **P1-2 修復** |
|
||||
| **首席架構師審查** | 82/100 → 86/100 → 90/100 | ✅ **P1+P2 修復** |
|
||||
| **P1-1 Langfuse** | LangfuseTraceContext | ✅ **修復** |
|
||||
| **P1-2 OTEL** | start_as_current_span | ✅ **修復** |
|
||||
| **P2-1 Protocol** | INvidiaProvider (@runtime_checkable) | ✅ **修復** |
|
||||
| **P2-2 邊界測試** | 15 → 25 測試案例 | ✅ **修復** |
|
||||
| **P2-3 model_registry** | NVIDIA + tool_calling_fallback_order | ✅ **修復** |
|
||||
|
||||
**驗證結果** (2026-03-29 08:51):
|
||||
```
|
||||
|
||||
@@ -225,3 +225,78 @@ groups:
|
||||
annotations:
|
||||
summary: "⚠️ TLS 探測失敗"
|
||||
description: "無法連線到 {{ $labels.instance }} 進行 TLS 檢查"
|
||||
|
||||
# ===== kube-state-metrics 擴充告警 (P3 2026-03-29) =====
|
||||
- name: kube_state_extended
|
||||
rules:
|
||||
# Job 執行失敗 (涵蓋 awoooi-prod 內所有 Job,包含 CronJob 產生的 Job)
|
||||
- alert: CronJobLastRunFailed
|
||||
expr: kube_job_status_failed{namespace="awoooi-prod"} > 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
team: ops
|
||||
component: cronjob
|
||||
annotations:
|
||||
summary: "⚠️ CronJob 執行失敗"
|
||||
description: "Job {{ $labels.job_name }} 執行失敗"
|
||||
|
||||
# DaemonSet 缺少 Pod
|
||||
- alert: DaemonSetMissingPods
|
||||
expr: kube_daemonset_status_number_unavailable{namespace=~"awoooi-prod|kube-system|velero"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
team: ops
|
||||
component: daemonset
|
||||
annotations:
|
||||
summary: "⚠️ DaemonSet 缺少 Pod"
|
||||
description: "{{ $labels.daemonset }} 缺少 {{ $value }} 個 Pod"
|
||||
|
||||
# StatefulSet 副本不足
|
||||
- alert: StatefulSetReplicasMismatch
|
||||
expr: kube_statefulset_status_replicas_ready{namespace="awoooi-prod"} != kube_statefulset_replicas{namespace="awoooi-prod"}
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
team: ops
|
||||
component: statefulset
|
||||
annotations:
|
||||
summary: "⚠️ StatefulSet 副本不足"
|
||||
description: "{{ $labels.statefulset }} 預期副本數與就緒數不符"
|
||||
|
||||
# 容器長時間等待 (ImagePullBackOff/CrashLoopBackOff/ErrImagePull)
|
||||
- alert: ContainerWaiting
|
||||
expr: kube_pod_container_status_waiting_reason{namespace="awoooi-prod", reason=~"ImagePullBackOff|CrashLoopBackOff|ErrImagePull"} == 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
team: ops
|
||||
component: container
|
||||
annotations:
|
||||
summary: "⚠️ 容器等待中"
|
||||
description: "{{ $labels.pod }}/{{ $labels.container }} 處於 {{ $labels.reason }} 狀態"
|
||||
|
||||
# PDB 違規 (可用 Pod 數低於 minAvailable)
|
||||
- alert: PDBViolation
|
||||
expr: kube_poddisruptionbudget_status_current_healthy{namespace="awoooi-prod"} < kube_poddisruptionbudget_status_desired_healthy{namespace="awoooi-prod"}
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
team: ops
|
||||
component: pdb
|
||||
annotations:
|
||||
summary: "⚠️ PDB 違規"
|
||||
description: "{{ $labels.poddisruptionbudget }} 健康 Pod 數低於期望值"
|
||||
|
||||
# 節點被標記為不可排程 (cordon / spec.unschedulable,排程問題偵測)
|
||||
- alert: NodeUnschedulable
|
||||
expr: kube_node_spec_unschedulable == 1
|
||||
for: 30m
|
||||
labels:
|
||||
severity: info
|
||||
team: ops
|
||||
component: node
|
||||
annotations:
|
||||
summary: "ℹ️ 節點標記為不可排程"
|
||||
description: "節點 {{ $labels.node }} 已被標記為 cordon/unschedulable 超過 30 分鐘"
|
||||
|
||||
Reference in New Issue
Block a user