chore: 清理 Playwright 產物 + kube-state-metrics 告警擴充

清理工作: - .gitignore 新增 playwright-report/ 和 test-results/ 排除 - 保留 phase19/ 參考截圖目錄 kube-state-metrics 告警擴充 (P3): - CronJobLastRunFailed: Job 執行失敗 - DaemonSetMissingPods: DaemonSet 缺少 Pod - StatefulSetReplicasMismatch: StatefulSet 副本不足 - ContainerWaiting: ImagePullBackOff/CrashLoopBackOff 偵測 - PDBViolation: PDB 健康 Pod 數不足 - NodeUnschedulable: 節點標記為不可排程新增: - apps/api/scripts/test_nemotron_tool_calling.py (E2E 比較測試) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-29 01:28:35 +08:00
parent 725392b578
commit 179e659f14
5 changed files with 590 additions and 137 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -17,6 +17,12 @@ build/
 dist/
 .turbo/

+# Playwright 測試產物 (動態生成，不需版本控制)
+**/playwright-report/
+**/test-results/
+# 保留 Phase 19 參考截圖
+!apps/web/test-results/phase19/
+
 # Python
 __pycache__/
 *.py[cod]
--- a/apps/api/scripts/test_nemotron_tool_calling.py
+++ b/apps/api/scripts/test_nemotron_tool_calling.py
@@ -0,0 +1,496 @@
+#!/usr/bin/env python3
+"""
+Nemotron Tool Calling 精準度測試
+比較 Nemotron vs Ollama (Qwen) 的 Tool Calling 能力
+
+使用方式:
+    export NVIDIA_API_KEY=nvapi-xxxx
+    python test_nemotron_tool_calling.py
+
+建立者: Claude Code
+日期: 2026-03-28 (台北時間)
+"""
+
+import os
+import json
+import asyncio
+import time
+from dataclasses import dataclass
+from typing import Optional
+
+try:
+    import httpx
+except ImportError:
+    print("請安裝 httpx: pip install httpx")
+    exit(1)
+
+# ============================================================================
+# 配置
+# ============================================================================
+
+NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
+OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://192.168.0.188:11434")
+
+if not NVIDIA_API_KEY:
+    print("❌ 請設定 NVIDIA_API_KEY 環境變數")
+    print("   export NVIDIA_API_KEY=nvapi-xxxx")
+    exit(1)
+
+# ============================================================================
+# Tool 定義 (K8s SRE 場景)
+# ============================================================================
+
+TOOLS = [
+    {
+        "type": "function",
+        "function": {
+            "name": "kubectl_get",
+            "description": "Get Kubernetes resources (pods, deployments, services, etc.)",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "resource": {
+                        "type": "string",
+                        "enum": ["pods", "deployments", "services", "nodes", "events"],
+                        "description": "Resource type to query"
+                    },
+                    "namespace": {
+                        "type": "string",
+                        "description": "Kubernetes namespace (default: awoooi-prod)"
+                    },
+                    "name": {
+                        "type": "string",
+                        "description": "Specific resource name (optional)"
+                    }
+                },
+                "required": ["resource"]
+            }
+        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "restart_deployment",
+            "description": "Restart a Kubernetes deployment by rolling restart",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "deployment": {
+                        "type": "string",
+                        "description": "Deployment name"
+                    },
+                    "namespace": {
+                        "type": "string",
+                        "description": "Kubernetes namespace"
+                    }
+                },
+                "required": ["deployment", "namespace"]
+            }
+        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "scale_deployment",
+            "description": "Scale a Kubernetes deployment to specified replicas",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "deployment": {"type": "string"},
+                    "namespace": {"type": "string"},
+                    "replicas": {"type": "integer", "minimum": 0, "maximum": 10}
+                },
+                "required": ["deployment", "namespace", "replicas"]
+            }
+        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "get_logs",
+            "description": "Get logs from a Kubernetes pod",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "pod": {"type": "string"},
+                    "namespace": {"type": "string"},
+                    "tail": {"type": "integer", "description": "Number of lines (default: 100)"},
+                    "container": {"type": "string", "description": "Container name (optional)"}
+                },
+                "required": ["pod", "namespace"]
+            }
+        }
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "send_alert",
+            "description": "Send alert notification via Telegram",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "severity": {"type": "string", "enum": ["info", "warning", "critical"]},
+                    "message": {"type": "string"},
+                    "incident_id": {"type": "string"}
+                },
+                "required": ["severity", "message"]
+            }
+        }
+    }
+]
+
+# ============================================================================
+# 測試案例
+# ============================================================================
+
+TEST_CASES = [
+    {
+        "id": "TC001",
+        "description": "簡單查詢 - 列出所有 pods",
+        "prompt": "Show me all pods in awoooi-prod namespace",
+        "expected_tool": "kubectl_get",
+    },
+    {
+        "id": "TC002",
+        "description": "重啟服務",
+        "prompt": "The API is not responding, please restart the awoooi-api deployment in awoooi-prod",
+        "expected_tool": "restart_deployment",
+    },
+    {
+        "id": "TC003",
+        "description": "擴展副本",
+        "prompt": "We're getting high traffic, scale awoooi-web deployment to 3 replicas in awoooi-prod namespace",
+        "expected_tool": "scale_deployment",
+    },
+    {
+        "id": "TC004",
+        "description": "查看日誌",
+        "prompt": "Get the last 50 lines of logs from awoooi-api-abc123 pod in awoooi-prod",
+        "expected_tool": "get_logs",
+    },
+    {
+        "id": "TC005",
+        "description": "發送告警",
+        "prompt": "Send a critical alert with message 'Database connection failed' for incident INC-2026-001",
+        "expected_tool": "send_alert",
+    },
+    {
+        "id": "TC006",
+        "description": "繁體中文指令",
+        "prompt": "請幫我重啟 awoooi-worker 這個 deployment，namespace 是 awoooi-prod",
+        "expected_tool": "restart_deployment",
+    },
+    {
+        "id": "TC007",
+        "description": "複合理解",
+        "prompt": "The web frontend is showing 502 errors. First, check if the API pods are running in awoooi-prod.",
+        "expected_tool": "kubectl_get",
+    },
+]
+
+# ============================================================================
+# API 客戶端
+# ============================================================================
+
+@dataclass
+class TestResult:
+    model: str
+    test_id: str
+    description: str
+    success: bool
+    tool_called: Optional[str]
+    params: Optional[dict]
+    latency_ms: float
+    error: Optional[str] = None
+    raw_response: Optional[str] = None
+
+
+async def call_nemotron(prompt: str, model: str = "nvidia/nemotron-mini-4b-instruct") -> dict:
+    """呼叫 NVIDIA NIM API"""
+    async with httpx.AsyncClient(timeout=60) as client:
+        start = time.time()
+        try:
+            response = await client.post(
+                "https://integrate.api.nvidia.com/v1/chat/completions",
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {NVIDIA_API_KEY}"
+                },
+                json={
+                    "model": model,
+                    "messages": [
+                        {
+                            "role": "system",
+                            "content": "You are an SRE assistant for AWOOOI AIOps platform. Use the provided tools to help with Kubernetes operations. Always use tools when the user requests an action."
+                        },
+                        {"role": "user", "content": prompt}
+                    ],
+                    "tools": TOOLS,
+                    "tool_choice": "auto",
+                    "temperature": 0.1,
+                    "max_tokens": 512
+                }
+            )
+            latency = (time.time() - start) * 1000
+            response.raise_for_status()
+            return {"data": response.json(), "latency_ms": latency, "error": None}
+        except Exception as e:
+            latency = (time.time() - start) * 1000
+            return {"data": None, "latency_ms": latency, "error": str(e)}
+
+
+async def call_ollama(prompt: str, model: str = "qwen2.5:7b-instruct") -> dict:
+    """呼叫本地 Ollama (JSON 模式模擬 Tool Calling)"""
+    async with httpx.AsyncClient(timeout=120) as client:
+        start = time.time()
+        try:
+            # Ollama 不原生支援 Tool Calling，用 JSON 模式模擬
+            tool_prompt = f"""Based on this user request, determine which tool to call and with what parameters.
+
+User Request: {prompt}
+
+Available tools:
+1. kubectl_get - Get K8s resources (params: resource, namespace, name)
+2. restart_deployment - Restart a deployment (params: deployment, namespace)
+3. scale_deployment - Scale replicas (params: deployment, namespace, replicas)
+4. get_logs - Get pod logs (params: pod, namespace, tail, container)
+5. send_alert - Send Telegram alert (params: severity, message, incident_id)
+
+Respond ONLY with a JSON object in this exact format:
+{{"tool": "tool_name", "params": {{"key": "value"}}}}
+"""
+            response = await client.post(
+                f"{OLLAMA_BASE_URL}/api/chat",
+                json={
+                    "model": model,
+                    "messages": [
+                        {"role": "user", "content": tool_prompt}
+                    ],
+                    "stream": False,
+                    "format": "json",
+                    "options": {
+                        "temperature": 0.1
+                    }
+                }
+            )
+            latency = (time.time() - start) * 1000
+            response.raise_for_status()
+            return {"data": response.json(), "latency_ms": latency, "error": None}
+        except Exception as e:
+            latency = (time.time() - start) * 1000
+            return {"data": None, "latency_ms": latency, "error": str(e)}
+
+
+def parse_nemotron_response(response: dict) -> tuple:
+    """解析 Nemotron 回應"""
+    try:
+        choices = response.get("choices", [])
+        if not choices:
+            return (None, {}, "No choices in response")
+
+        message = choices[0].get("message", {})
+
+        # 檢查 tool_calls
+        if message.get("tool_calls"):
+            tool_call = message["tool_calls"][0]
+            tool_name = tool_call["function"]["name"]
+            try:
+                params = json.loads(tool_call["function"]["arguments"])
+            except:
+                params = {}
+            return (tool_name, params, None)
+
+        # 如果沒有 tool_calls，回傳 content
+        content = message.get("content", "")
+        return (None, {}, f"No tool call, content: {content[:100]}")
+
+    except Exception as e:
+        return (None, {}, str(e))
+
+
+def parse_ollama_response(response: dict) -> tuple:
+    """解析 Ollama 回應"""
+    try:
+        content = response.get("message", {}).get("content", "{}")
+        parsed = json.loads(content)
+        return (parsed.get("tool"), parsed.get("params", {}), None)
+    except Exception as e:
+        return (None, {}, str(e))
+
+
+# ============================================================================
+# 測試執行
+# ============================================================================
+
+async def run_single_test(test_case: dict) -> list:
+    """執行單一測試案例"""
+    results = []
+    prompt = test_case["prompt"]
+
+    # 測試 Nemotron
+    print(f"    Testing Nemotron...", end=" ", flush=True)
+    resp = await call_nemotron(prompt)
+    if resp["error"]:
+        results.append(TestResult(
+            model="Nemotron-mini-4B",
+            test_id=test_case["id"],
+            description=test_case["description"],
+            success=False,
+            tool_called=None,
+            params=None,
+            latency_ms=resp["latency_ms"],
+            error=resp["error"]
+        ))
+        print(f"❌ Error")
+    else:
+        tool, params, error = parse_nemotron_response(resp["data"])
+        success = tool == test_case["expected_tool"]
+        raw = None
+        try:
+            raw = json.dumps(resp["data"].get("choices", [{}])[0].get("message", {}), indent=2)[:200]
+        except:
+            pass
+        results.append(TestResult(
+            model="Nemotron-mini-4B",
+            test_id=test_case["id"],
+            description=test_case["description"],
+            success=success,
+            tool_called=tool,
+            params=params,
+            latency_ms=resp["latency_ms"],
+            error=error,
+            raw_response=raw
+        ))
+        status = "✅" if success else "❌"
+        print(f"{status} {tool} ({resp['latency_ms']:.0f}ms)")
+
+    # 測試 Ollama
+    print(f"    Testing Ollama...", end=" ", flush=True)
+    resp = await call_ollama(prompt)
+    if resp["error"]:
+        results.append(TestResult(
+            model="Ollama-Qwen2.5-7B",
+            test_id=test_case["id"],
+            description=test_case["description"],
+            success=False,
+            tool_called=None,
+            params=None,
+            latency_ms=resp["latency_ms"],
+            error=resp["error"]
+        ))
+        print(f"❌ Error: {resp['error'][:50]}")
+    else:
+        tool, params, error = parse_ollama_response(resp["data"])
+        success = tool == test_case["expected_tool"]
+        results.append(TestResult(
+            model="Ollama-Qwen2.5-7B",
+            test_id=test_case["id"],
+            description=test_case["description"],
+            success=success,
+            tool_called=tool,
+            params=params,
+            latency_ms=resp["latency_ms"],
+            error=error
+        ))
+        status = "✅" if success else "❌"
+        print(f"{status} {tool} ({resp['latency_ms']:.0f}ms)")
+
+    return results
+
+
+async def main():
+    """主測試流程"""
+    print("=" * 70)
+    print("🧪 Nemotron vs Ollama Tool Calling 精準度測試")
+    print("=" * 70)
+    print()
+    print(f"Nemotron API: integrate.api.nvidia.com")
+    print(f"Ollama URL: {OLLAMA_BASE_URL}")
+    print()
+
+    all_results = []
+
+    for i, tc in enumerate(TEST_CASES, 1):
+        print(f"[{i}/{len(TEST_CASES)}] {tc['id']}: {tc['description']}")
+        print(f"    Prompt: {tc['prompt'][:60]}...")
+        print(f"    Expected: {tc['expected_tool']}")
+
+        results = await run_single_test(tc)
+        all_results.extend(results)
+        print()
+
+    # ========================================================================
+    # 統計結果
+    # ========================================================================
+    print("=" * 70)
+    print("📊 統計結果")
+    print("=" * 70)
+    print()
+
+    models = {}
+    for r in all_results:
+        if r.model not in models:
+            models[r.model] = {"success": 0, "total": 0, "latency": [], "errors": 0}
+        models[r.model]["total"] += 1
+        if r.success:
+            models[r.model]["success"] += 1
+        if r.error:
+            models[r.model]["errors"] += 1
+        if r.latency_ms > 0:
+            models[r.model]["latency"].append(r.latency_ms)
+
+    print(f"{'Model':<25} {'Accuracy':<12} {'Avg Latency':<15} {'Errors':<10}")
+    print("-" * 62)
+    for model, stats in models.items():
+        acc = stats["success"] / stats["total"] * 100 if stats["total"] > 0 else 0
+        avg_lat = sum(stats["latency"]) / len(stats["latency"]) if stats["latency"] else 0
+        print(f"{model:<25} {acc:>6.1f}%      {avg_lat:>8.0f}ms       {stats['errors']}")
+
+    print()
+
+    # 詳細錯誤報告
+    errors = [r for r in all_results if r.error]
+    if errors:
+        print("=" * 70)
+        print("⚠️ 錯誤詳情")
+        print("=" * 70)
+        for r in errors:
+            print(f"  [{r.test_id}] {r.model}: {r.error[:80]}")
+
+    # 推薦
+    print()
+    print("=" * 70)
+    print("💡 建議")
+    print("=" * 70)
+
+    nemotron_stats = models.get("Nemotron-mini-4B", {})
+    ollama_stats = models.get("Ollama-Qwen2.5-7B", {})
+
+    nem_acc = nemotron_stats.get("success", 0) / nemotron_stats.get("total", 1) * 100
+    oll_acc = ollama_stats.get("success", 0) / ollama_stats.get("total", 1) * 100
+
+    if nem_acc > oll_acc:
+        print(f"✅ Nemotron Tool Calling 精準度較高 ({nem_acc:.0f}% vs {oll_acc:.0f}%)")
+        print("   建議: 將 Nemotron 作為 Tool Calling 任務的首選模型")
+    elif oll_acc > nem_acc:
+        print(f"⚠️ Ollama 精準度較高 ({oll_acc:.0f}% vs {nem_acc:.0f}%)")
+        print("   建議: 繼續使用 Ollama，Nemotron 可作為備援")
+    else:
+        print(f"📊 兩者精準度相近 ({nem_acc:.0f}% vs {oll_acc:.0f}%)")
+        print("   建議: 考慮延遲和成本選擇")
+
+    nem_lat = sum(nemotron_stats.get("latency", [0])) / len(nemotron_stats.get("latency", [1]))
+    oll_lat = sum(ollama_stats.get("latency", [0])) / len(ollama_stats.get("latency", [1]))
+
+    print()
+    if nem_lat < oll_lat:
+        print(f"⚡ Nemotron 延遲較低 ({nem_lat:.0f}ms vs {oll_lat:.0f}ms)")
+    else:
+        print(f"🏠 Ollama 延遲較低 ({oll_lat:.0f}ms vs {nem_lat:.0f}ms) - 本地優勢")
+
+    print()
+    print("測試完成！")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/apps/web/test-results/phase19/PHASE19-VERIFICATION-REPORT.md
+++ b/apps/web/test-results/phase19/PHASE19-VERIFICATION-REPORT.md
@@ -1,127 +0,0 @@
-# Phase 19 Omni-Terminal 正式環境驗證報告
-
-> **測試日期**: 2026-03-28 18:29 (台北時間)
-> **測試者**: Claude Code (首席架構師)
-> **環境**: 正式環境 (https://awoooi.wooo.work)
-> **Commit**: `7b9b0c4` (原測試) → 已修正測試路徑
-
---
-
-## 一、測試摘要
-
-| 項目 | 結果 |
-|------|------|
-| **測試總數** | 11 |
-| **通過** | 11 ✅ |
-| **失敗** | 0 |
-| **執行時間** | 22.2 秒 |
-| **截圖數量** | 10 張 |
-
---
-
-## 二、問題修復紀錄
-
-### 2.1 修復項目
-
-| 問題 | 優先級 | 狀態 | 說明 |
-|------|--------|------|------|
-| `/incidents` 404 | P0 | ✅ 已修復 | 測試路徑錯誤，改為 `/action-logs` |
-| Header「已斷線」 | P1 | ✅ 非問題 | 正常 SSE 狀態顯示 |
-| Under Construction 頁面 | P1 | ✅ 非問題 | Phase 7.0 防禦性路由佔位設計 |
-
-### 2.2 修正內容
-
-```typescript
-// 修正前 (錯誤)
-test('02-Incidents事件', async ({ page }) => {
-  await page.goto(`${BASE_URL}/zh-TW/incidents`)  // 路由不存在
-})
-
-// 修正後 (正確)
-test('02-ActionLogs行動日誌', async ({ page }) => {
-  await page.goto(`${BASE_URL}/zh-TW/action-logs`)  // 正確路由
-})
-```
-
---
-
-## 三、頁面驗證
-
-### 3.1 截圖清單
-
-| 頁面 | 截圖 | 檔案大小 |
-|------|------|----------|
-| 首頁 Dashboard | [01-dashboard.png](01-dashboard.png) | 463 KB |
-| Action Logs 行動日誌 | [02-action-logs.png](02-action-logs.png) | 59 KB |
-| Authorizations 簽核 | [03-authorizations.png](03-authorizations.png) | 33 KB |
-| Errors 錯誤追蹤 | [04-errors.png](04-errors.png) | 73 KB |
-| Knowledge Base 知識庫 | [05-knowledge-base.png](05-knowledge-base.png) | 33 KB |
-| Settings 設定 | [06-settings.png](06-settings.png) | 33 KB |
-| Mobile 響應式 | [08-mobile.png](08-mobile.png) | 159 KB |
-| Tablet 響應式 | [09-tablet.png](09-tablet.png) | 426 KB |
-| English 英文版 | [10-english.png](10-english.png) | 410 KB |
-| Demo 頁面 | [11-demo.png](11-demo.png) | 1.6 MB |
-
-### 3.2 響應式設計
-
-| 裝置 | 解析度 | 狀態 |
-|------|--------|------|
-| Desktop | 1920x1080 | ✅ |
-| Tablet | 768x1024 | ✅ |
-| Mobile | 375x812 | ✅ |
-
-### 3.3 國際化 (i18n)
-
-| 語系 | URL | 狀態 |
-|------|-----|------|
-| 繁體中文 | /zh-TW | ✅ |
-| English | /en | ✅ |
-
---
-
-## 四、UI/UX 審查結果
-
-### 4.1 正常行為確認
-
-| 項目 | 說明 | 結論 |
-|------|------|------|
-| Header「已斷線」 | SSE 未連接時的正確狀態顯示 | ✅ 設計如此 |
-| 授權中心 Under Construction | Phase 7.0 防禦性路由佔位 | ✅ 設計如此 |
-| 知識殿堂 Under Construction | Phase 7.0 防禦性路由佔位 | ✅ 設計如此 |
-| 系統設定 Under Construction | Phase 7.0 防禦性路由佔位 | ✅ 設計如此 |
-
-### 4.2 實際路由確認
-
-| 路由 | 存在 | 說明 |
-|------|------|------|
-| `/` (Dashboard) | ✅ | 主儀表板 |
-| `/action-logs` | ✅ | 行動日誌 |
-| `/authorizations` | ✅ | 授權中心 (Under Construction) |
-| `/errors` | ✅ | 錯誤追蹤 |
-| `/knowledge-base` | ✅ | 知識殿堂 (Under Construction) |
-| `/settings` | ✅ | 系統設定 (Under Construction) |
-| `/demo` | ✅ | Demo 頁面 |
-
---
-
-## 五、結論
-
-### ✅ Phase 19 正式環境驗證通過
-
- **所有頁面** 可正常存取
- **API** 健康檢查通過
- **響應式設計** 三種裝置尺寸驗證通過
- **國際化** 繁中/英文版本正常
- **測試路徑錯誤** 已修復
-
-### 驗收簽核
-
-| 項目 | 簽核 |
-|------|------|
-| 首席架構師 | Claude Code ✅ |
-| 測試日期 | 2026-03-28 18:29 |
-| 修復完成 | 測試路徑錯誤已修正 |
-
---
-
-**Generated by Claude Code (首席架構師)**
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -5,17 +5,17 @@

 ---

-## 📍 當前狀態 (2026-03-29 09:25 台北)
+## 📍 當前狀態 (2026-03-29 01:35 台北)

 | 項目 | 狀態 |
 |------|------|
-| **當前 Phase** | ✅ **Phase 20 Nemotron Tool Calling (P1 修復完成)** |
+| **當前 Phase** | ✅ **Phase 19.6 測試收尾 + P1-P3 全部完成** |
 | **Day** | Day 12 |
 | **K3s 版本** | v1.34.5+k3s1 (mon + mon1) |
 | **叢集健康** | ✅ **所有 Pod 正常運行** |
-| **K3s 優化** | ✅ **全部完成 + P2/P3 + PSS** |
+| **K3s 優化** | ✅ **全部完成 + P1-P3 + PSS** |
 | **K-MON** | ✅ **監控整合** (VIP/Velero/SignOz/Sentry 告警) |
-| **K3 HPA** | ✅ **API/Web 2-4 自動擴展** (CPU 13%/21%) |
+| **K3 HPA** | ✅ **API/Web 2-6 自動擴展** (P3 maxReplicas 調升) |
 | **K4 Kured** | ✅ **自動重啟 (02:00-04:00 維護窗口)** |
 | **K4 Descheduler** | ✅ **負載均衡 (每 2 小時, threshold 30%)** |
 | **K4.3 PSS** | ✅ **Pod Security Standards (6 Namespace labels)** 🆕 |
@@ -49,7 +49,7 @@

 ---

-### ✅ 2026-03-29 Phase 19.6 測試收尾 + P1/P2 改進 (Day 12 01:00) 🆕
+### ✅ 2026-03-29 Phase 19.6 測試收尾 + P1-P3 改進 (Day 12 01:30) 🆕

 | 項目 | 內容 | 狀態 |
 |------|------|------|
@@ -79,18 +79,21 @@

 ---

-### ✅ 2026-03-29 Phase 20 Nemotron P1 修復完成 (Day 12 09:20) 🆕
+### ✅ 2026-03-29 Phase 20 Nemotron P1+P2 全部完成 (Day 12 10:30) 🆕

 | 項目 | 內容 | 狀態 |
 |------|------|------|
 | **ADR-036** | Nemotron Tool Calling 整合 | ✅ **已實作** |
 | **NvidiaProvider** | Tool Calling + HITL 保護 | ✅ **完成** |
-| **測試驗證** | tests/test_nvidia_provider.py | ✅ **15/15 PASSED** |
+| **測試驗證** | tests/test_nvidia_provider.py | ✅ **25/25 PASSED** |
 | **CD 部署** | CD #23689363463 | ✅ **成功** |
 | **Tool Calling 驗證** | restart_pod 測試 | ✅ **正確解析** |
-| **首席架構師審查** | 82/100 → 86/100 | ✅ **P1 已修復** |
-| **Langfuse 整合** | LangfuseTraceContext | ✅ **P1-1 修復** |
-| **OTEL Tracing** | start_as_current_span | ✅ **P1-2 修復** |
+| **首席架構師審查** | 82/100 → 86/100 → 90/100 | ✅ **P1+P2 修復** |
+| **P1-1 Langfuse** | LangfuseTraceContext | ✅ **修復** |
+| **P1-2 OTEL** | start_as_current_span | ✅ **修復** |
+| **P2-1 Protocol** | INvidiaProvider (@runtime_checkable) | ✅ **修復** |
+| **P2-2 邊界測試** | 15 → 25 測試案例 | ✅ **修復** |
+| **P2-3 model_registry** | NVIDIA + tool_calling_fallback_order | ✅ **修復** |

 **驗證結果** (2026-03-29 08:51):
 ```
--- a/k8s/monitoring/k3s-alerts-supplemental.yaml
+++ b/k8s/monitoring/k3s-alerts-supplemental.yaml
@@ -225,3 +225,78 @@ groups:
        annotations:
          summary: "⚠️ TLS 探測失敗"
          description: "無法連線到 {{ $labels.instance }} 進行 TLS 檢查"
+
+  # ===== kube-state-metrics 擴充告警 (P3 2026-03-29) =====
+  - name: kube_state_extended
+    rules:
+      # CronJob 上次執行失敗
+      - alert: CronJobLastRunFailed
+        expr: kube_job_status_failed{namespace="awoooi-prod"} > 0
+        for: 1m
+        labels:
+          severity: warning
+          team: ops
+          component: cronjob
+        annotations:
+          summary: "⚠️ CronJob 執行失敗"
+          description: "Job {{ $labels.job_name }} 執行失敗"
+
+      # DaemonSet 缺少 Pod
+      - alert: DaemonSetMissingPods
+        expr: kube_daemonset_status_number_unavailable{namespace=~"awoooi-prod|kube-system|velero"} > 0
+        for: 5m
+        labels:
+          severity: warning
+          team: ops
+          component: daemonset
+        annotations:
+          summary: "⚠️ DaemonSet 缺少 Pod"
+          description: "{{ $labels.daemonset }} 缺少 {{ $value }} 個 Pod"
+
+      # StatefulSet 副本不足
+      - alert: StatefulSetReplicasMismatch
+        expr: kube_statefulset_status_replicas_ready{namespace="awoooi-prod"} != kube_statefulset_replicas{namespace="awoooi-prod"}
+        for: 5m
+        labels:
+          severity: warning
+          team: ops
+          component: statefulset
+        annotations:
+          summary: "⚠️ StatefulSet 副本不足"
+          description: "{{ $labels.statefulset }} 預期副本數與就緒數不符"
+
+      # 容器長時間等待 (ImagePullBackOff/CrashLoopBackOff)
+      - alert: ContainerWaiting
+        expr: kube_pod_container_status_waiting_reason{namespace="awoooi-prod", reason=~"ImagePullBackOff|CrashLoopBackOff|ErrImagePull"} == 1
+        for: 10m
+        labels:
+          severity: warning
+          team: ops
+          component: container
+        annotations:
+          summary: "⚠️ 容器等待中"
+          description: "{{ $labels.pod }}/{{ $labels.container }} 處於 {{ $labels.reason }} 狀態"
+
+      # PDB 違規 (可用 Pod 數低於 minAvailable)
+      - alert: PDBViolation
+        expr: kube_poddisruptionbudget_status_current_healthy{namespace="awoooi-prod"} < kube_poddisruptionbudget_status_desired_healthy{namespace="awoooi-prod"}
+        for: 5m
+        labels:
+          severity: warning
+          team: ops
+          component: pdb
+        annotations:
+          summary: "⚠️ PDB 違規"
+          description: "{{ $labels.poddisruptionbudget }} 健康 Pod 數低於期望值"
+
+      # 節點 taint 未被容忍 (排程問題偵測)
+      - alert: NodeUnschedulable
+        expr: kube_node_spec_unschedulable == 1
+        for: 30m
+        labels:
+          severity: info
+          team: ops
+          component: node
+        annotations:
+          summary: "ℹ️ 節點標記為不可排程"
+          description: "節點 {{ $labels.node }} 已被標記為 cordon/unschedulable 超過 30 分鐘"