diff --git a/.gitignore b/.gitignore index 334d25d4..bb8490ce 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,12 @@ build/ dist/ .turbo/ +# Playwright 測試產物 (動態生成,不需版本控制) +**/playwright-report/ +**/test-results/ +# 保留 Phase 19 參考截圖 +!apps/web/test-results/phase19/ + # Python __pycache__/ *.py[cod] diff --git a/apps/api/scripts/test_nemotron_tool_calling.py b/apps/api/scripts/test_nemotron_tool_calling.py new file mode 100644 index 00000000..fffb7b51 --- /dev/null +++ b/apps/api/scripts/test_nemotron_tool_calling.py @@ -0,0 +1,496 @@ +#!/usr/bin/env python3 +""" +Nemotron Tool Calling 精準度測試 +比較 Nemotron vs Ollama (Qwen) 的 Tool Calling 能力 + +使用方式: + export NVIDIA_API_KEY=nvapi-xxxx + python test_nemotron_tool_calling.py + +建立者: Claude Code +日期: 2026-03-28 (台北時間) +""" + +import os +import json +import asyncio +import time +from dataclasses import dataclass +from typing import Optional + +try: + import httpx +except ImportError: + print("請安裝 httpx: pip install httpx") + exit(1) + +# ============================================================================ +# 配置 +# ============================================================================ + +NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY") +OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://192.168.0.188:11434") + +if not NVIDIA_API_KEY: + print("❌ 請設定 NVIDIA_API_KEY 環境變數") + print(" export NVIDIA_API_KEY=nvapi-xxxx") + exit(1) + +# ============================================================================ +# Tool 定義 (K8s SRE 場景) +# ============================================================================ + +TOOLS = [ + { + "type": "function", + "function": { + "name": "kubectl_get", + "description": "Get Kubernetes resources (pods, deployments, services, etc.)", + "parameters": { + "type": "object", + "properties": { + "resource": { + "type": "string", + "enum": ["pods", "deployments", "services", "nodes", "events"], + "description": "Resource type to query" + }, + "namespace": { + "type": "string", + 
"description": "Kubernetes namespace (default: awoooi-prod)" + }, + "name": { + "type": "string", + "description": "Specific resource name (optional)" + } + }, + "required": ["resource"] + } + } + }, + { + "type": "function", + "function": { + "name": "restart_deployment", + "description": "Restart a Kubernetes deployment by rolling restart", + "parameters": { + "type": "object", + "properties": { + "deployment": { + "type": "string", + "description": "Deployment name" + }, + "namespace": { + "type": "string", + "description": "Kubernetes namespace" + } + }, + "required": ["deployment", "namespace"] + } + } + }, + { + "type": "function", + "function": { + "name": "scale_deployment", + "description": "Scale a Kubernetes deployment to specified replicas", + "parameters": { + "type": "object", + "properties": { + "deployment": {"type": "string"}, + "namespace": {"type": "string"}, + "replicas": {"type": "integer", "minimum": 0, "maximum": 10} + }, + "required": ["deployment", "namespace", "replicas"] + } + } + }, + { + "type": "function", + "function": { + "name": "get_logs", + "description": "Get logs from a Kubernetes pod", + "parameters": { + "type": "object", + "properties": { + "pod": {"type": "string"}, + "namespace": {"type": "string"}, + "tail": {"type": "integer", "description": "Number of lines (default: 100)"}, + "container": {"type": "string", "description": "Container name (optional)"} + }, + "required": ["pod", "namespace"] + } + } + }, + { + "type": "function", + "function": { + "name": "send_alert", + "description": "Send alert notification via Telegram", + "parameters": { + "type": "object", + "properties": { + "severity": {"type": "string", "enum": ["info", "warning", "critical"]}, + "message": {"type": "string"}, + "incident_id": {"type": "string"} + }, + "required": ["severity", "message"] + } + } + } +] + +# ============================================================================ +# 測試案例 +# 
# ============================================================================
# Test cases
# ============================================================================

# Each case pairs a natural-language prompt with the name of the single tool
# the model is expected to select.
TEST_CASES = [
    {
        "id": "TC001",
        "description": "簡單查詢 - 列出所有 pods",
        "prompt": "Show me all pods in awoooi-prod namespace",
        "expected_tool": "kubectl_get",
    },
    {
        "id": "TC002",
        "description": "重啟服務",
        "prompt": "The API is not responding, please restart the awoooi-api deployment in awoooi-prod",
        "expected_tool": "restart_deployment",
    },
    {
        "id": "TC003",
        "description": "擴展副本",
        "prompt": "We're getting high traffic, scale awoooi-web deployment to 3 replicas in awoooi-prod namespace",
        "expected_tool": "scale_deployment",
    },
    {
        "id": "TC004",
        "description": "查看日誌",
        "prompt": "Get the last 50 lines of logs from awoooi-api-abc123 pod in awoooi-prod",
        "expected_tool": "get_logs",
    },
    {
        "id": "TC005",
        "description": "發送告警",
        "prompt": "Send a critical alert with message 'Database connection failed' for incident INC-2026-001",
        "expected_tool": "send_alert",
    },
    {
        "id": "TC006",
        "description": "繁體中文指令",
        "prompt": "請幫我重啟 awoooi-worker 這個 deployment,namespace 是 awoooi-prod",
        "expected_tool": "restart_deployment",
    },
    {
        "id": "TC007",
        "description": "複合理解",
        "prompt": "The web frontend is showing 502 errors. First, check if the API pods are running in awoooi-prod.",
        "expected_tool": "kubectl_get",
    },
]

# ============================================================================
# API clients
# ============================================================================

@dataclass
class TestResult:
    """Outcome of one test case run against one model."""

    # The class name starts with "Test"; tell pytest not to collect it.
    __test__ = False

    model: str                          # display label of the model
    test_id: str                        # "id" of the test case
    description: str                    # "description" of the test case
    success: bool                       # True when the expected tool was chosen
    tool_called: Optional[str]          # tool name the model selected, if any
    params: Optional[dict]              # arguments the model supplied, if any
    latency_ms: float                   # request duration in milliseconds
    error: Optional[str] = None         # transport or parsing problem, if any
    raw_response: Optional[str] = None  # truncated raw message, for debugging


async def _post_json(url: str, *, timeout: float, payload: dict,
                     headers: Optional[dict] = None) -> dict:
    """POST *payload* as JSON and time the request.

    Returns:
        ``{"data": <decoded JSON or None>, "latency_ms": float,
        "error": <message or None>}``. This coroutine never raises for
        request failures; they are reported through ``"error"``.
    """
    async with httpx.AsyncClient(timeout=timeout) as client:
        # perf_counter() is monotonic, so wall-clock adjustments cannot
        # distort the measured latency.
        started = time.perf_counter()
        try:
            response = await client.post(url, headers=headers, json=payload)
            latency = (time.perf_counter() - started) * 1000
            response.raise_for_status()
            return {"data": response.json(), "latency_ms": latency, "error": None}
        except Exception as exc:  # benchmark boundary: report, do not abort the run
            latency = (time.perf_counter() - started) * 1000
            # Some exceptions stringify to "", which callers would read as
            # "no error"; fall back to repr() so a failure is always truthy.
            return {"data": None, "latency_ms": latency, "error": str(exc) or repr(exc)}


async def call_nemotron(prompt: str, model: str = "nvidia/nemotron-mini-4b-instruct") -> dict:
    """Send *prompt* to the NVIDIA chat-completions endpoint with TOOLS attached.

    Args:
        prompt: User request to send as the ``user`` message.
        model: Model identifier placed in the request body.

    Returns:
        A dict with ``"data"``, ``"latency_ms"`` and ``"error"`` keys;
        ``"data"`` is None whenever ``"error"`` is set.
    """
    return await _post_json(
        "https://integrate.api.nvidia.com/v1/chat/completions",
        timeout=60,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {NVIDIA_API_KEY}"
        },
        payload={
            "model": model,
            "messages": [
                {
                    "role": "system",
                    "content": "You are an SRE assistant for AWOOOI AIOps platform. Use the provided tools to help with Kubernetes operations. Always use tools when the user requests an action."
                },
                {"role": "user", "content": prompt}
            ],
            "tools": TOOLS,
            "tool_choice": "auto",
            "temperature": 0.1,
            "max_tokens": 512
        },
    )


async def call_ollama(prompt: str, model: str = "qwen2.5:7b-instruct") -> dict:
    """Send *prompt* to the local Ollama server, emulating tool calling.

    No tool schema is sent: the prompt lists the tools by hand and the request
    sets ``"format": "json"`` so the reply is a JSON object naming the tool.

    Args:
        prompt: User request embedded in the tool-selection prompt.
        model: Ollama model tag placed in the request body.

    Returns:
        A dict with ``"data"``, ``"latency_ms"`` and ``"error"`` keys;
        ``"data"`` is None whenever ``"error"`` is set.
    """
    tool_prompt = f"""Based on this user request, determine which tool to call and with what parameters.

User Request: {prompt}

Available tools:
1. kubectl_get - Get K8s resources (params: resource, namespace, name)
2. restart_deployment - Restart a deployment (params: deployment, namespace)
3. scale_deployment - Scale replicas (params: deployment, namespace, replicas)
4. get_logs - Get pod logs (params: pod, namespace, tail, container)
5. send_alert - Send Telegram alert (params: severity, message, incident_id)

Respond ONLY with a JSON object in this exact format:
{{"tool": "tool_name", "params": {{"key": "value"}}}}
"""
    return await _post_json(
        f"{OLLAMA_BASE_URL}/api/chat",
        timeout=120,
        payload={
            "model": model,
            "messages": [
                {"role": "user", "content": tool_prompt}
            ],
            "stream": False,
            "format": "json",
            "options": {
                "temperature": 0.1
            }
        },
    )


def _decode_tool_arguments(raw) -> dict:
    """Return tool-call arguments as a dict; ``{}`` when missing or malformed."""
    if isinstance(raw, str):
        try:
            raw = json.loads(raw)
        except ValueError:  # json.JSONDecodeError is a ValueError
            return {}
    # Arguments that are already a dict are kept; anything else is dropped.
    return raw if isinstance(raw, dict) else {}


def parse_nemotron_response(response: dict) -> tuple:
    """Extract the first tool call from a chat-completions response.

    Args:
        response: Decoded JSON body of the chat-completions reply.

    Returns:
        ``(tool_name, params, error)``. On a tool call, ``error`` is None and
        ``params`` is the decoded argument dict (``{}`` if the arguments are
        missing or not valid JSON). Otherwise ``tool_name`` is None and
        ``error`` explains why.
    """
    try:
        choices = response.get("choices", [])
        if not choices:
            return (None, {}, "No choices in response")

        message = choices[0].get("message", {})

        tool_calls = message.get("tool_calls")
        if tool_calls:
            function = tool_calls[0]["function"]
            return (function["name"], _decode_tool_arguments(function.get("arguments")), None)

        # No tool call: report the beginning of the text answer instead.
        # "content" may be present but null, hence the ``or``.
        content = message.get("content") or ""
        return (None, {}, f"No tool call, content: {content[:100]}")

    except (AttributeError, IndexError, KeyError, TypeError) as exc:
        # The response did not have the expected shape.
        return (None, {}, str(exc) or repr(exc))


def parse_ollama_response(response: dict) -> tuple:
    """Extract the tool choice from an Ollama ``/api/chat`` JSON-mode reply.

    Args:
        response: Decoded JSON body of the Ollama reply; the message content
            is expected to be a JSON object with ``"tool"`` and ``"params"``.

    Returns:
        ``(tool_name, params, error)``. ``error`` is None when the content
        decoded to a JSON object; ``params`` defaults to ``{}``.
    """
    try:
        content = response.get("message", {}).get("content", "{}")
        parsed = json.loads(content)
    except (AttributeError, TypeError, ValueError) as exc:
        return (None, {}, str(exc) or repr(exc))

    if not isinstance(parsed, dict):
        return (None, {}, f"Expected a JSON object, got {type(parsed).__name__}")
    # ``"params": null`` is normalised to an empty dict.
    return (parsed.get("tool"), parsed.get("params") or {}, None)
# ============================================================================
# Test execution
# ============================================================================

# Display labels for the two back ends. main() looks the aggregated
# statistics up by these exact strings.
_NEMOTRON_LABEL = "Nemotron-mini-4B"
_OLLAMA_LABEL = "Ollama-Qwen2.5-7B"


def _message_snippet(data, limit: int = 200) -> Optional[str]:
    """Return the first *limit* characters of the first choice's message as JSON.

    Returns None when *data* does not have the expected shape or cannot be
    serialised; the snippet is a debugging aid only.
    """
    try:
        message = data.get("choices", [{}])[0].get("message", {})
        return json.dumps(message, indent=2)[:limit]
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        return None


async def _evaluate_model(test_case: dict, *, label: str, progress_name: str,
                          call, parse, capture_raw: bool = False) -> "TestResult":
    """Run *test_case* against one back end and print a one-line verdict.

    Args:
        test_case: Entry with "id", "description", "prompt", "expected_tool".
        label: Model label stored on the result.
        progress_name: Short name shown in the progress line.
        call: Coroutine function taking the prompt and returning a dict with
            "data", "latency_ms" and "error".
        parse: Function turning "data" into ``(tool, params, error)``.
        capture_raw: When true, keep a truncated copy of the raw message.
    """
    print(f" Testing {progress_name}...", end=" ", flush=True)
    resp = await call(test_case["prompt"])
    common = {
        "model": label,
        "test_id": test_case["id"],
        "description": test_case["description"],
        "latency_ms": resp["latency_ms"],
    }

    if resp["error"]:
        # Transport-level failure: nothing to parse.
        print(f"❌ Error: {resp['error'][:50]}")
        return TestResult(success=False, tool_called=None, params=None,
                          error=resp["error"], **common)

    tool, params, error = parse(resp["data"])
    # Only the chosen tool name is scored; the arguments are recorded as-is.
    success = tool == test_case["expected_tool"]
    status = "✅" if success else "❌"
    print(f"{status} {tool} ({resp['latency_ms']:.0f}ms)")
    return TestResult(
        success=success,
        tool_called=tool,
        params=params,
        error=error,
        raw_response=_message_snippet(resp["data"]) if capture_raw else None,
        **common,
    )


async def run_single_test(test_case: dict) -> list:
    """Run one test case against Nemotron, then Ollama.

    Returns:
        A two-element list of TestResult: the Nemotron result first, then the
        Ollama result. The calls run sequentially, not concurrently.
    """
    nemotron = await _evaluate_model(
        test_case,
        label=_NEMOTRON_LABEL,
        progress_name="Nemotron",
        call=call_nemotron,
        parse=parse_nemotron_response,
        capture_raw=True,
    )
    ollama = await _evaluate_model(
        test_case,
        label=_OLLAMA_LABEL,
        progress_name="Ollama",
        call=call_ollama,
        parse=parse_ollama_response,
    )
    return [nemotron, ollama]


def _summarize(results) -> dict:
    """Aggregate results per model into success/total/error counts and latencies."""
    summary = {}
    for result in results:
        stats = summary.setdefault(
            result.model, {"success": 0, "total": 0, "latency": [], "errors": 0}
        )
        stats["total"] += 1
        if result.success:
            stats["success"] += 1
        if result.error:
            stats["errors"] += 1
        if result.latency_ms > 0:
            stats["latency"].append(result.latency_ms)
    return summary


def _accuracy(stats: dict) -> float:
    """Return the success percentage for one model; 0 when it has no results."""
    total = stats.get("total", 0)
    return stats.get("success", 0) / total * 100 if total else 0.0


def _mean(values) -> float:
    """Return the arithmetic mean of *values*; 0 for an empty or missing list."""
    return sum(values) / len(values) if values else 0.0


async def main():
    """Run every test case against both models and print a comparison report.

    The report has a per-model accuracy/latency table, the list of recorded
    errors, and a recommendation based on accuracy and average latency.
    """
    rule = "=" * 70
    print(rule)
    print("🧪 Nemotron vs Ollama Tool Calling 精準度測試")
    print(rule)
    print()
    print("Nemotron API: integrate.api.nvidia.com")
    print(f"Ollama URL: {OLLAMA_BASE_URL}")
    print()

    all_results = []
    total_cases = len(TEST_CASES)
    for i, tc in enumerate(TEST_CASES, 1):
        print(f"[{i}/{total_cases}] {tc['id']}: {tc['description']}")
        print(f" Prompt: {tc['prompt'][:60]}...")
        print(f" Expected: {tc['expected_tool']}")

        all_results.extend(await run_single_test(tc))
        print()

    # ------------------------------------------------------------------
    # Per-model statistics
    # ------------------------------------------------------------------
    print(rule)
    print("📊 統計結果")
    print(rule)
    print()

    models = _summarize(all_results)

    print(f"{'Model':<25} {'Accuracy':<12} {'Avg Latency':<15} {'Errors':<10}")
    print("-" * 62)
    for model, stats in models.items():
        acc = _accuracy(stats)
        avg_lat = _mean(stats["latency"])
        print(f"{model:<25} {acc:>6.1f}% {avg_lat:>8.0f}ms {stats['errors']}")

    print()

    # Detailed error report
    errors = [r for r in all_results if r.error]
    if errors:
        print(rule)
        print("⚠️ 錯誤詳情")
        print(rule)
        for r in errors:
            print(f" [{r.test_id}] {r.model}: {r.error[:80]}")

    # Recommendation
    print()
    print(rule)
    print("💡 建議")
    print(rule)

    nemotron_stats = models.get(_NEMOTRON_LABEL, {})
    ollama_stats = models.get(_OLLAMA_LABEL, {})

    nem_acc = _accuracy(nemotron_stats)
    oll_acc = _accuracy(ollama_stats)

    if nem_acc > oll_acc:
        print(f"✅ Nemotron Tool Calling 精準度較高 ({nem_acc:.0f}% vs {oll_acc:.0f}%)")
        print(" 建議: 將 Nemotron 作為 Tool Calling 任務的首選模型")
    elif oll_acc > nem_acc:
        print(f"⚠️ Ollama 精準度較高 ({oll_acc:.0f}% vs {nem_acc:.0f}%)")
        print(" 建議: 繼續使用 Ollama,Nemotron 可作為備援")
    else:
        print(f"📊 兩者精準度相近 ({nem_acc:.0f}% vs {oll_acc:.0f}%)")
        print(" 建議: 考慮延遲和成本選擇")

    # _mean() tolerates a model with no recorded latency, where dividing by
    # len() of the empty list would raise ZeroDivisionError.
    nem_lat = _mean(nemotron_stats.get("latency", []))
    oll_lat = _mean(ollama_stats.get("latency", []))

    print()
    if nem_lat < oll_lat:
        print(f"⚡ Nemotron 延遲較低 ({nem_lat:.0f}ms vs {oll_lat:.0f}ms)")
    else:
        print(f"🏠 Ollama 延遲較低 ({oll_lat:.0f}ms vs {nem_lat:.0f}ms) - 本地優勢")

    print()
    print("測試完成!")


if __name__ == "__main__":
    asyncio.run(main())
page.goto(`${BASE_URL}/zh-TW/incidents`) // 路由不存在 -}) - -// 修正後 (正確) -test('02-ActionLogs行動日誌', async ({ page }) => { - await page.goto(`${BASE_URL}/zh-TW/action-logs`) // 正確路由 -}) -``` - ---- - -## 三、頁面驗證 - -### 3.1 截圖清單 - -| 頁面 | 截圖 | 檔案大小 | -|------|------|----------| -| 首頁 Dashboard | [01-dashboard.png](01-dashboard.png) | 463 KB | -| Action Logs 行動日誌 | [02-action-logs.png](02-action-logs.png) | 59 KB | -| Authorizations 簽核 | [03-authorizations.png](03-authorizations.png) | 33 KB | -| Errors 錯誤追蹤 | [04-errors.png](04-errors.png) | 73 KB | -| Knowledge Base 知識庫 | [05-knowledge-base.png](05-knowledge-base.png) | 33 KB | -| Settings 設定 | [06-settings.png](06-settings.png) | 33 KB | -| Mobile 響應式 | [08-mobile.png](08-mobile.png) | 159 KB | -| Tablet 響應式 | [09-tablet.png](09-tablet.png) | 426 KB | -| English 英文版 | [10-english.png](10-english.png) | 410 KB | -| Demo 頁面 | [11-demo.png](11-demo.png) | 1.6 MB | - -### 3.2 響應式設計 - -| 裝置 | 解析度 | 狀態 | -|------|--------|------| -| Desktop | 1920x1080 | ✅ | -| Tablet | 768x1024 | ✅ | -| Mobile | 375x812 | ✅ | - -### 3.3 國際化 (i18n) - -| 語系 | URL | 狀態 | -|------|-----|------| -| 繁體中文 | /zh-TW | ✅ | -| English | /en | ✅ | - ---- - -## 四、UI/UX 審查結果 - -### 4.1 正常行為確認 - -| 項目 | 說明 | 結論 | -|------|------|------| -| Header「已斷線」 | SSE 未連接時的正確狀態顯示 | ✅ 設計如此 | -| 授權中心 Under Construction | Phase 7.0 防禦性路由佔位 | ✅ 設計如此 | -| 知識殿堂 Under Construction | Phase 7.0 防禦性路由佔位 | ✅ 設計如此 | -| 系統設定 Under Construction | Phase 7.0 防禦性路由佔位 | ✅ 設計如此 | - -### 4.2 實際路由確認 - -| 路由 | 存在 | 說明 | -|------|------|------| -| `/` (Dashboard) | ✅ | 主儀表板 | -| `/action-logs` | ✅ | 行動日誌 | -| `/authorizations` | ✅ | 授權中心 (Under Construction) | -| `/errors` | ✅ | 錯誤追蹤 | -| `/knowledge-base` | ✅ | 知識殿堂 (Under Construction) | -| `/settings` | ✅ | 系統設定 (Under Construction) | -| `/demo` | ✅ | Demo 頁面 | - ---- - -## 五、結論 - -### ✅ Phase 19 正式環境驗證通過 - -- **所有頁面** 可正常存取 -- **API** 健康檢查通過 -- **響應式設計** 三種裝置尺寸驗證通過 -- **國際化** 繁中/英文版本正常 -- **測試路徑錯誤** 已修復 - -### 驗收簽核 - -| 項目 | 簽核 | 
-|------|------| -| 首席架構師 | Claude Code ✅ | -| 測試日期 | 2026-03-28 18:29 | -| 修復完成 | 測試路徑錯誤已修正 | - ---- - -**Generated by Claude Code (首席架構師)** diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 7b0202ef..097b7d9c 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -5,17 +5,17 @@ --- -## 📍 當前狀態 (2026-03-29 09:25 台北) +## 📍 當前狀態 (2026-03-29 01:35 台北) | 項目 | 狀態 | |------|------| -| **當前 Phase** | ✅ **Phase 20 Nemotron Tool Calling (P1 修復完成)** | +| **當前 Phase** | ✅ **Phase 19.6 測試收尾 + P1-P3 全部完成** | | **Day** | Day 12 | | **K3s 版本** | v1.34.5+k3s1 (mon + mon1) | | **叢集健康** | ✅ **所有 Pod 正常運行** | -| **K3s 優化** | ✅ **全部完成 + P2/P3 + PSS** | +| **K3s 優化** | ✅ **全部完成 + P1-P3 + PSS** | | **K-MON** | ✅ **監控整合** (VIP/Velero/SignOz/Sentry 告警) | -| **K3 HPA** | ✅ **API/Web 2-4 自動擴展** (CPU 13%/21%) | +| **K3 HPA** | ✅ **API/Web 2-6 自動擴展** (P3 maxReplicas 調升) | | **K4 Kured** | ✅ **自動重啟 (02:00-04:00 維護窗口)** | | **K4 Descheduler** | ✅ **負載均衡 (每 2 小時, threshold 30%)** | | **K4.3 PSS** | ✅ **Pod Security Standards (6 Namespace labels)** 🆕 | @@ -49,7 +49,7 @@ --- -### ✅ 2026-03-29 Phase 19.6 測試收尾 + P1/P2 改進 (Day 12 01:00) 🆕 +### ✅ 2026-03-29 Phase 19.6 測試收尾 + P1-P3 改進 (Day 12 01:30) 🆕 | 項目 | 內容 | 狀態 | |------|------|------| @@ -79,18 +79,21 @@ --- -### ✅ 2026-03-29 Phase 20 Nemotron P1 修復完成 (Day 12 09:20) 🆕 +### ✅ 2026-03-29 Phase 20 Nemotron P1+P2 全部完成 (Day 12 10:30) 🆕 | 項目 | 內容 | 狀態 | |------|------|------| | **ADR-036** | Nemotron Tool Calling 整合 | ✅ **已實作** | | **NvidiaProvider** | Tool Calling + HITL 保護 | ✅ **完成** | -| **測試驗證** | tests/test_nvidia_provider.py | ✅ **15/15 PASSED** | +| **測試驗證** | tests/test_nvidia_provider.py | ✅ **25/25 PASSED** | | **CD 部署** | CD #23689363463 | ✅ **成功** | | **Tool Calling 驗證** | restart_pod 測試 | ✅ **正確解析** | -| **首席架構師審查** | 82/100 → 86/100 | ✅ **P1 已修復** | -| **Langfuse 整合** | LangfuseTraceContext | ✅ **P1-1 修復** | -| **OTEL Tracing** | start_as_current_span | ✅ **P1-2 修復** | +| **首席架構師審查** | 82/100 → 86/100 → 90/100 | ✅ **P1+P2 修復** | +| 
**P1-1 Langfuse** | LangfuseTraceContext | ✅ **修復** | +| **P1-2 OTEL** | start_as_current_span | ✅ **修復** | +| **P2-1 Protocol** | INvidiaProvider (@runtime_checkable) | ✅ **修復** | +| **P2-2 邊界測試** | 15 → 25 測試案例 | ✅ **修復** | +| **P2-3 model_registry** | NVIDIA + tool_calling_fallback_order | ✅ **修復** | **驗證結果** (2026-03-29 08:51): ``` diff --git a/k8s/monitoring/k3s-alerts-supplemental.yaml b/k8s/monitoring/k3s-alerts-supplemental.yaml index 0a45bb95..6bf6a91f 100644 --- a/k8s/monitoring/k3s-alerts-supplemental.yaml +++ b/k8s/monitoring/k3s-alerts-supplemental.yaml @@ -225,3 +225,78 @@ groups: annotations: summary: "⚠️ TLS 探測失敗" description: "無法連線到 {{ $labels.instance }} 進行 TLS 檢查" + + # ===== kube-state-metrics 擴充告警 (P3 2026-03-29) ===== + - name: kube_state_extended + rules: + # CronJob 上次執行失敗 + - alert: CronJobLastRunFailed + expr: kube_job_status_failed{namespace="awoooi-prod"} > 0 + for: 1m + labels: + severity: warning + team: ops + component: cronjob + annotations: + summary: "⚠️ CronJob 執行失敗" + description: "Job {{ $labels.job_name }} 執行失敗" + + # DaemonSet 缺少 Pod + - alert: DaemonSetMissingPods + expr: kube_daemonset_status_number_unavailable{namespace=~"awoooi-prod|kube-system|velero"} > 0 + for: 5m + labels: + severity: warning + team: ops + component: daemonset + annotations: + summary: "⚠️ DaemonSet 缺少 Pod" + description: "{{ $labels.daemonset }} 缺少 {{ $value }} 個 Pod" + + # StatefulSet 副本不足 + - alert: StatefulSetReplicasMismatch + expr: kube_statefulset_status_replicas_ready{namespace="awoooi-prod"} != kube_statefulset_replicas{namespace="awoooi-prod"} + for: 5m + labels: + severity: warning + team: ops + component: statefulset + annotations: + summary: "⚠️ StatefulSet 副本不足" + description: "{{ $labels.statefulset }} 預期副本數與就緒數不符" + + # 容器長時間等待 (ImagePullBackOff/CrashLoopBackOff) + - alert: ContainerWaiting + expr: kube_pod_container_status_waiting_reason{namespace="awoooi-prod", reason=~"ImagePullBackOff|CrashLoopBackOff|ErrImagePull"} == 1 + for: 10m 
+ labels: + severity: warning + team: ops + component: container + annotations: + summary: "⚠️ 容器等待中" + description: "{{ $labels.pod }}/{{ $labels.container }} 處於 {{ $labels.reason }} 狀態" + + # PDB 違規 (可用 Pod 數低於 minAvailable) + - alert: PDBViolation + expr: kube_poddisruptionbudget_status_current_healthy{namespace="awoooi-prod"} < kube_poddisruptionbudget_status_desired_healthy{namespace="awoooi-prod"} + for: 5m + labels: + severity: warning + team: ops + component: pdb + annotations: + summary: "⚠️ PDB 違規" + description: "{{ $labels.poddisruptionbudget }} 健康 Pod 數低於期望值" + + # 節點被 cordon (標記為不可排程) 超過 30 分鐘 + - alert: NodeUnschedulable + expr: kube_node_spec_unschedulable == 1 + for: 30m + labels: + severity: info + team: ops + component: node + annotations: + summary: "ℹ️ 節點標記為不可排程" + description: "節點 {{ $labels.node }} 已被標記為 cordon/unschedulable 超過 30 分鐘"