chore: 清理 Playwright 產物 + kube-state-metrics 告警擴充

清理工作:
- .gitignore 新增 playwright-report/ 和 test-results/ 排除
- 保留 phase19/ 參考截圖目錄

kube-state-metrics 告警擴充 (P3):
- CronJobLastRunFailed: Job 執行失敗
- DaemonSetMissingPods: DaemonSet 缺少 Pod
- StatefulSetReplicasMismatch: StatefulSet 副本不足
- ContainerWaiting: ImagePullBackOff/CrashLoopBackOff 偵測
- PDBViolation: PDB 健康 Pod 數不足
- NodeUnschedulable: 節點標記為不可排程

新增:
- apps/api/scripts/test_nemotron_tool_calling.py (E2E 比較測試)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-03-29 01:28:35 +08:00
parent 725392b578
commit 179e659f14
5 changed files with 590 additions and 137 deletions

6
.gitignore vendored
View File

@@ -17,6 +17,12 @@ build/
dist/
.turbo/
# Playwright test artifacts (generated on every run, not version controlled)
**/playwright-report/
# Ignore the *contents* of test-results/ rather than the directory itself:
# git cannot re-include a path whose parent directory is excluded, so the
# negation below would be silently ignored with a plain `**/test-results/`.
**/test-results/*
# Keep the Phase 19 reference screenshots
!apps/web/test-results/phase19/
# Python
__pycache__/
*.py[cod]

View File

@@ -0,0 +1,496 @@
#!/usr/bin/env python3
"""
Nemotron Tool Calling 精準度測試
比較 Nemotron vs Ollama (Qwen) 的 Tool Calling 能力
使用方式:
export NVIDIA_API_KEY=nvapi-xxxx
python test_nemotron_tool_calling.py
建立者: Claude Code
日期: 2026-03-28 (台北時間)
"""
import os
import json
import asyncio
import time
from dataclasses import dataclass
from typing import Optional
try:
    import httpx
except ImportError:
    print("請安裝 httpx: pip install httpx")
    # `exit` is a helper injected by the `site` module for interactive use and
    # is not guaranteed to exist (e.g. `python -S`); raise SystemExit directly.
    raise SystemExit(1)

# ============================================================================
# Configuration
# ============================================================================
# API key for the NVIDIA endpoint; the script refuses to start without it.
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
# Base URL of the Ollama server, overridable through the environment.
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://192.168.0.188:11434")

if not NVIDIA_API_KEY:
    print("❌ 請設定 NVIDIA_API_KEY 環境變數")
    print(" export NVIDIA_API_KEY=nvapi-xxxx")
    raise SystemExit(1)
# ============================================================================
# Tool 定義 (K8s SRE 場景)
# ============================================================================
def _function_tool(name, description, properties, required):
    """Build one tool entry in the {"type": "function", "function": {...}} shape.

    Args:
        name: Tool name exposed to the model.
        description: Human-readable description of the tool.
        properties: Mapping of parameter name to its JSON-schema fragment.
        required: Names of the parameters that must be supplied.

    Returns:
        A dict with the same key order as the hand-written literals used
        previously (type, function -> name, description, parameters).
    """
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required,
            },
        },
    }


# Tool catalogue sent with every Nemotron request (Kubernetes SRE scenario).
TOOLS = [
    _function_tool(
        "kubectl_get",
        "Get Kubernetes resources (pods, deployments, services, etc.)",
        {
            "resource": {
                "type": "string",
                "enum": ["pods", "deployments", "services", "nodes", "events"],
                "description": "Resource type to query",
            },
            "namespace": {
                "type": "string",
                "description": "Kubernetes namespace (default: awoooi-prod)",
            },
            "name": {
                "type": "string",
                "description": "Specific resource name (optional)",
            },
        },
        ["resource"],
    ),
    _function_tool(
        "restart_deployment",
        "Restart a Kubernetes deployment by rolling restart",
        {
            "deployment": {
                "type": "string",
                "description": "Deployment name",
            },
            "namespace": {
                "type": "string",
                "description": "Kubernetes namespace",
            },
        },
        ["deployment", "namespace"],
    ),
    _function_tool(
        "scale_deployment",
        "Scale a Kubernetes deployment to specified replicas",
        {
            "deployment": {"type": "string"},
            "namespace": {"type": "string"},
            "replicas": {"type": "integer", "minimum": 0, "maximum": 10},
        },
        ["deployment", "namespace", "replicas"],
    ),
    _function_tool(
        "get_logs",
        "Get logs from a Kubernetes pod",
        {
            "pod": {"type": "string"},
            "namespace": {"type": "string"},
            "tail": {"type": "integer", "description": "Number of lines (default: 100)"},
            "container": {"type": "string", "description": "Container name (optional)"},
        },
        ["pod", "namespace"],
    ),
    _function_tool(
        "send_alert",
        "Send alert notification via Telegram",
        {
            "severity": {"type": "string", "enum": ["info", "warning", "critical"]},
            "message": {"type": "string"},
            "incident_id": {"type": "string"},
        },
        ["severity", "message"],
    ),
]
# ============================================================================
# 測試案例
# ============================================================================
# Each case pairs a natural-language prompt with the single tool the model is
# expected to pick; only the tool name is scored, not its parameters.
TEST_CASES = [
    {
        "id": "TC001",
        "description": "簡單查詢 - 列出所有 pods",
        "prompt": "Show me all pods in awoooi-prod namespace",
        "expected_tool": "kubectl_get",
    },
    {
        "id": "TC002",
        "description": "重啟服務",
        "prompt": "The API is not responding, please restart the awoooi-api deployment in awoooi-prod",
        "expected_tool": "restart_deployment",
    },
    {
        "id": "TC003",
        "description": "擴展副本",
        "prompt": "We're getting high traffic, scale awoooi-web deployment to 3 replicas in awoooi-prod namespace",
        "expected_tool": "scale_deployment",
    },
    {
        "id": "TC004",
        "description": "查看日誌",
        "prompt": "Get the last 50 lines of logs from awoooi-api-abc123 pod in awoooi-prod",
        "expected_tool": "get_logs",
    },
    {
        "id": "TC005",
        "description": "發送告警",
        "prompt": "Send a critical alert with message 'Database connection failed' for incident INC-2026-001",
        "expected_tool": "send_alert",
    },
    {
        "id": "TC006",
        "description": "繁體中文指令",
        # The clause separator was missing ("deploymentnamespace"), which fused
        # the two clauses into one token and blurred the request.
        "prompt": "請幫我重啟 awoooi-worker 這個 deployment,namespace 是 awoooi-prod",
        "expected_tool": "restart_deployment",
    },
    {
        "id": "TC007",
        "description": "複合理解",
        "prompt": "The web frontend is showing 502 errors. First, check if the API pods are running in awoooi-prod.",
        "expected_tool": "kubectl_get",
    },
]
# ============================================================================
# API 客戶端
# ============================================================================
@dataclass
class TestResult:
    """Outcome of running one test case against one model."""

    # Display label of the model that was exercised.
    model: str
    # Identifier and description copied from the test case.
    test_id: str
    description: str
    # True when the tool picked by the model equals the expected tool.
    success: bool
    # Tool name and parameters extracted from the reply (None when unavailable).
    tool_called: Optional[str]
    params: Optional[dict]
    # Elapsed request time in milliseconds.
    latency_ms: float
    # Request or parsing error message, if any.
    error: Optional[str] = None
    # Truncated dump of the raw reply message, kept for debugging.
    raw_response: Optional[str] = None
async def call_nemotron(prompt: str, model: str = "nvidia/nemotron-mini-4b-instruct") -> dict:
    """Send one tool-calling chat request to the NVIDIA chat-completions endpoint.

    Args:
        prompt: Text sent as the ``user`` message.
        model: Model identifier placed in the request body.

    Returns:
        A dict with three keys: ``data`` (decoded JSON body, or ``None`` on
        failure), ``latency_ms`` (elapsed request time in milliseconds) and
        ``error`` (``None`` on success, otherwise the stringified exception).
        Failures raised by the request or by ``raise_for_status()`` are
        reported through ``error`` instead of being propagated.
    """
    async with httpx.AsyncClient(timeout=60) as client:
        # perf_counter() is monotonic, so the measured interval cannot be
        # skewed by wall-clock adjustments the way time.time() can.
        start = time.perf_counter()
        try:
            response = await client.post(
                "https://integrate.api.nvidia.com/v1/chat/completions",
                headers={
                    "Content-Type": "application/json",
                    "Authorization": f"Bearer {NVIDIA_API_KEY}",
                },
                json={
                    "model": model,
                    "messages": [
                        {
                            "role": "system",
                            "content": "You are an SRE assistant for AWOOOI AIOps platform. Use the provided tools to help with Kubernetes operations. Always use tools when the user requests an action.",
                        },
                        {"role": "user", "content": prompt},
                    ],
                    "tools": TOOLS,
                    "tool_choice": "auto",
                    "temperature": 0.1,
                    "max_tokens": 512,
                },
            )
            # Latency is taken before status validation so it reflects the
            # round trip even for non-success responses.
            latency = (time.perf_counter() - start) * 1000
            response.raise_for_status()
            return {"data": response.json(), "latency_ms": latency, "error": None}
        except Exception as e:
            latency = (time.perf_counter() - start) * 1000
            return {"data": None, "latency_ms": latency, "error": str(e)}
async def call_ollama(prompt: str, model: str = "qwen2.5:7b-instruct") -> dict:
    """Ask the Ollama server which tool to call, using JSON-formatted output.

    Tool calling is emulated here: the model receives a textual list of the
    available tools and is asked to answer with a JSON object naming the tool
    and its parameters (``format="json"``).

    Args:
        prompt: The user request embedded in the instruction prompt.
        model: Ollama model tag placed in the request body.

    Returns:
        A dict with three keys: ``data`` (decoded JSON body, or ``None`` on
        failure), ``latency_ms`` (elapsed request time in milliseconds) and
        ``error`` (``None`` on success, otherwise the stringified exception).
        Failures raised by the request or by ``raise_for_status()`` are
        reported through ``error`` instead of being propagated.
    """
    async with httpx.AsyncClient(timeout=120) as client:
        # perf_counter() is monotonic, so the measured interval cannot be
        # skewed by wall-clock adjustments the way time.time() can.
        start = time.perf_counter()
        try:
            # Built from explicit lines so the prompt text does not depend on
            # how this function body happens to be indented.
            tool_prompt = (
                "Based on this user request, determine which tool to call and with what parameters.\n"
                f"User Request: {prompt}\n"
                "Available tools:\n"
                "1. kubectl_get - Get K8s resources (params: resource, namespace, name)\n"
                "2. restart_deployment - Restart a deployment (params: deployment, namespace)\n"
                "3. scale_deployment - Scale replicas (params: deployment, namespace, replicas)\n"
                "4. get_logs - Get pod logs (params: pod, namespace, tail, container)\n"
                "5. send_alert - Send Telegram alert (params: severity, message, incident_id)\n"
                "Respond ONLY with a JSON object in this exact format:\n"
                '{"tool": "tool_name", "params": {"key": "value"}}\n'
            )
            response = await client.post(
                f"{OLLAMA_BASE_URL}/api/chat",
                json={
                    "model": model,
                    "messages": [
                        {"role": "user", "content": tool_prompt},
                    ],
                    "stream": False,
                    "format": "json",
                    "options": {
                        "temperature": 0.1,
                    },
                },
            )
            # Latency is taken before status validation so it reflects the
            # round trip even for non-success responses.
            latency = (time.perf_counter() - start) * 1000
            response.raise_for_status()
            return {"data": response.json(), "latency_ms": latency, "error": None}
        except Exception as e:
            latency = (time.perf_counter() - start) * 1000
            return {"data": None, "latency_ms": latency, "error": str(e)}
def parse_nemotron_response(response: dict) -> tuple:
    """Extract the first tool call from a chat-completions style response.

    Args:
        response: Decoded JSON body containing a ``choices`` list.

    Returns:
        A ``(tool_name, params, error)`` tuple. On a tool call, ``tool_name``
        is the function name, ``params`` the decoded arguments (``{}`` when
        they are missing or not valid JSON) and ``error`` is ``None``.
        Otherwise ``tool_name`` is ``None``, ``params`` is ``{}`` and
        ``error`` describes why no tool call was found.
    """
    try:
        choices = response.get("choices", [])
        if not choices:
            return (None, {}, "No choices in response")
        message = choices[0].get("message", {})

        # Only the first tool call is considered.
        tool_calls = message.get("tool_calls")
        if tool_calls:
            function = tool_calls[0]["function"]
            tool_name = function["name"]
            try:
                params = json.loads(function["arguments"])
            except (KeyError, TypeError, ValueError):
                # Missing, non-string or malformed arguments: keep the tool
                # name and fall back to empty parameters.
                params = {}
            return (tool_name, params, None)

        # No tool call: report the start of the text reply. `content` may be
        # present but null, so normalise it before slicing.
        content = message.get("content") or ""
        return (None, {}, f"No tool call, content: {content[:100]}")
    except Exception as e:
        # Any unexpected response shape is reported rather than raised.
        return (None, {}, str(e))
def parse_ollama_response(response: dict) -> tuple:
    """Decode the JSON tool decision carried in an Ollama chat reply.

    Args:
        response: Decoded JSON body whose ``message.content`` holds a JSON
            string with ``tool`` and ``params`` keys.

    Returns:
        A ``(tool_name, params, error)`` tuple. ``error`` is ``None`` when the
        content decoded cleanly; otherwise the first two items are ``None``
        and ``{}`` and ``error`` is the stringified exception.
    """
    try:
        raw_json = response.get("message", {}).get("content", "{}")
        decision = json.loads(raw_json)
        tool_name = decision.get("tool")
        tool_params = decision.get("params", {})
    except Exception as exc:
        return (None, {}, str(exc))
    return (tool_name, tool_params, None)
# ============================================================================
# 測試執行
# ============================================================================
def _error_result(model_label: str, test_case: dict, resp: dict) -> "TestResult":
    """Build the failed TestResult recorded when the request itself errored."""
    return TestResult(
        model=model_label,
        test_id=test_case["id"],
        description=test_case["description"],
        success=False,
        tool_called=None,
        params=None,
        latency_ms=resp["latency_ms"],
        error=resp["error"],
    )


def _status_icon(success: bool) -> str:
    """Return the pass/fail marker printed next to each model result."""
    return "✅" if success else "❌"


async def run_single_test(test_case: dict) -> list:
    """Run one test case against Nemotron and then Ollama.

    Progress is printed as each model is queried. A result counts as a
    success when the tool chosen by the model equals
    ``test_case["expected_tool"]``; parameters are recorded but not scored.

    Args:
        test_case: Dict with ``id``, ``description``, ``prompt`` and
            ``expected_tool`` keys.

    Returns:
        A list of two ``TestResult`` objects: Nemotron first, Ollama second.
    """
    results = []
    prompt = test_case["prompt"]

    # ---- Nemotron ----
    print(" Testing Nemotron...", end=" ", flush=True)
    resp = await call_nemotron(prompt)
    if resp["error"]:
        results.append(_error_result("Nemotron-mini-4B", test_case, resp))
        print("❌ Error")
    else:
        tool, params, error = parse_nemotron_response(resp["data"])
        success = tool == test_case["expected_tool"]

        # Best-effort debug snapshot of the reply message; an unexpected
        # response shape must not fail the test run.
        raw = None
        try:
            message = resp["data"].get("choices", [{}])[0].get("message", {})
            raw = json.dumps(message, indent=2)[:200]
        except (AttributeError, IndexError, KeyError, TypeError, ValueError):
            raw = None

        results.append(TestResult(
            model="Nemotron-mini-4B",
            test_id=test_case["id"],
            description=test_case["description"],
            success=success,
            tool_called=tool,
            params=params,
            latency_ms=resp["latency_ms"],
            error=error,
            raw_response=raw,
        ))
        print(f"{_status_icon(success)} {tool} ({resp['latency_ms']:.0f}ms)")

    # ---- Ollama ----
    print(" Testing Ollama...", end=" ", flush=True)
    resp = await call_ollama(prompt)
    if resp["error"]:
        results.append(_error_result("Ollama-Qwen2.5-7B", test_case, resp))
        print(f"❌ Error: {resp['error'][:50]}")
    else:
        tool, params, error = parse_ollama_response(resp["data"])
        success = tool == test_case["expected_tool"]
        results.append(TestResult(
            model="Ollama-Qwen2.5-7B",
            test_id=test_case["id"],
            description=test_case["description"],
            success=success,
            tool_called=tool,
            params=params,
            latency_ms=resp["latency_ms"],
            error=error,
        ))
        print(f"{_status_icon(success)} {tool} ({resp['latency_ms']:.0f}ms)")

    return results
def _mean(values) -> float:
    """Return the arithmetic mean of *values*, or 0 when it is empty."""
    return sum(values) / len(values) if values else 0


def _accuracy(stats: dict) -> float:
    """Return the success percentage for one model (0 when it has no runs)."""
    total = stats.get("total", 0)
    return stats.get("success", 0) / total * 100 if total else 0


def _collect_model_stats(results) -> dict:
    """Group results by model into success/total/latency/errors counters."""
    models = {}
    for r in results:
        stats = models.setdefault(
            r.model, {"success": 0, "total": 0, "latency": [], "errors": 0}
        )
        stats["total"] += 1
        if r.success:
            stats["success"] += 1
        if r.error:
            stats["errors"] += 1
        if r.latency_ms > 0:
            stats["latency"].append(r.latency_ms)
    return models


async def main():
    """Run every test case against both models and print a comparison report.

    The report contains a per-model table (accuracy, average latency, error
    count), the list of recorded errors, and a recommendation based on which
    model has the higher accuracy and the lower average latency.
    """
    print("=" * 70)
    print("🧪 Nemotron vs Ollama Tool Calling 精準度測試")
    print("=" * 70)
    print()
    print("Nemotron API: integrate.api.nvidia.com")
    print(f"Ollama URL: {OLLAMA_BASE_URL}")
    print()

    all_results = []
    for i, tc in enumerate(TEST_CASES, 1):
        print(f"[{i}/{len(TEST_CASES)}] {tc['id']}: {tc['description']}")
        print(f" Prompt: {tc['prompt'][:60]}...")
        print(f" Expected: {tc['expected_tool']}")
        all_results.extend(await run_single_test(tc))
        print()

    # ------------------------------------------------------------------
    # Summary table
    # ------------------------------------------------------------------
    print("=" * 70)
    print("📊 統計結果")
    print("=" * 70)
    print()
    models = _collect_model_stats(all_results)
    print(f"{'Model':<25} {'Accuracy':<12} {'Avg Latency':<15} {'Errors':<10}")
    print("-" * 62)
    for model, stats in models.items():
        acc = _accuracy(stats)
        avg_lat = _mean(stats["latency"])
        print(f"{model:<25} {acc:>6.1f}% {avg_lat:>8.0f}ms {stats['errors']}")
    print()

    # Detailed error report
    errors = [r for r in all_results if r.error]
    if errors:
        print("=" * 70)
        print("⚠️ 錯誤詳情")
        print("=" * 70)
        for r in errors:
            print(f" [{r.test_id}] {r.model}: {r.error[:80]}")

    # Recommendation
    print()
    print("=" * 70)
    print("💡 建議")
    print("=" * 70)
    nemotron_stats = models.get("Nemotron-mini-4B", {})
    ollama_stats = models.get("Ollama-Qwen2.5-7B", {})
    nem_acc = _accuracy(nemotron_stats)
    oll_acc = _accuracy(ollama_stats)
    if nem_acc > oll_acc:
        print(f"✅ Nemotron Tool Calling 精準度較高 ({nem_acc:.0f}% vs {oll_acc:.0f}%)")
        print(" 建議: 將 Nemotron 作為 Tool Calling 任務的首選模型")
    elif oll_acc > nem_acc:
        print(f"⚠️ Ollama 精準度較高 ({oll_acc:.0f}% vs {nem_acc:.0f}%)")
        # The separator between the two clauses was missing ("OllamaNemotron").
        print(" 建議: 繼續使用 Ollama,Nemotron 可作為備援")
    else:
        print(f"📊 兩者精準度相近 ({nem_acc:.0f}% vs {oll_acc:.0f}%)")
        print(" 建議: 考慮延遲和成本選擇")

    # _mean() guards the case where a model is present but recorded no
    # latency samples, which previously divided by zero.
    nem_lat = _mean(nemotron_stats.get("latency", []))
    oll_lat = _mean(ollama_stats.get("latency", []))
    print()
    if nem_lat < oll_lat:
        print(f"⚡ Nemotron 延遲較低 ({nem_lat:.0f}ms vs {oll_lat:.0f}ms)")
    else:
        print(f"🏠 Ollama 延遲較低 ({oll_lat:.0f}ms vs {nem_lat:.0f}ms) - 本地優勢")
    print()
    print("測試完成!")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,127 +0,0 @@
# Phase 19 Omni-Terminal 正式環境驗證報告
> **測試日期**: 2026-03-28 18:29 (台北時間)
> **測試者**: Claude Code (首席架構師)
> **環境**: 正式環境 (https://awoooi.wooo.work)
> **Commit**: `7b9b0c4` (原測試) → 已修正測試路徑
---
## 一、測試摘要
| 項目 | 結果 |
|------|------|
| **測試總數** | 11 |
| **通過** | 11 ✅ |
| **失敗** | 0 |
| **執行時間** | 22.2 秒 |
| **截圖數量** | 10 張 |
---
## 二、問題修復紀錄
### 2.1 修復項目
| 問題 | 優先級 | 狀態 | 說明 |
|------|--------|------|------|
| `/incidents` 404 | P0 | ✅ 已修復 | 測試路徑錯誤,改為 `/action-logs` |
| Header「已斷線」 | P1 | ✅ 非問題 | 正常 SSE 狀態顯示 |
| Under Construction 頁面 | P1 | ✅ 非問題 | Phase 7.0 防禦性路由佔位設計 |
### 2.2 修正內容
```typescript
// 修正前 (錯誤)
test('02-Incidents事件', async ({ page }) => {
await page.goto(`${BASE_URL}/zh-TW/incidents`) // 路由不存在
})
// 修正後 (正確)
test('02-ActionLogs行動日誌', async ({ page }) => {
await page.goto(`${BASE_URL}/zh-TW/action-logs`) // 正確路由
})
```
---
## 三、頁面驗證
### 3.1 截圖清單
| 頁面 | 截圖 | 檔案大小 |
|------|------|----------|
| 首頁 Dashboard | [01-dashboard.png](01-dashboard.png) | 463 KB |
| Action Logs 行動日誌 | [02-action-logs.png](02-action-logs.png) | 59 KB |
| Authorizations 簽核 | [03-authorizations.png](03-authorizations.png) | 33 KB |
| Errors 錯誤追蹤 | [04-errors.png](04-errors.png) | 73 KB |
| Knowledge Base 知識庫 | [05-knowledge-base.png](05-knowledge-base.png) | 33 KB |
| Settings 設定 | [06-settings.png](06-settings.png) | 33 KB |
| Mobile 響應式 | [08-mobile.png](08-mobile.png) | 159 KB |
| Tablet 響應式 | [09-tablet.png](09-tablet.png) | 426 KB |
| English 英文版 | [10-english.png](10-english.png) | 410 KB |
| Demo 頁面 | [11-demo.png](11-demo.png) | 1.6 MB |
### 3.2 響應式設計
| 裝置 | 解析度 | 狀態 |
|------|--------|------|
| Desktop | 1920x1080 | ✅ |
| Tablet | 768x1024 | ✅ |
| Mobile | 375x812 | ✅ |
### 3.3 國際化 (i18n)
| 語系 | URL | 狀態 |
|------|-----|------|
| 繁體中文 | /zh-TW | ✅ |
| English | /en | ✅ |
---
## 四、UI/UX 審查結果
### 4.1 正常行為確認
| 項目 | 說明 | 結論 |
|------|------|------|
| Header「已斷線」 | SSE 未連接時的正確狀態顯示 | ✅ 設計如此 |
| 授權中心 Under Construction | Phase 7.0 防禦性路由佔位 | ✅ 設計如此 |
| 知識殿堂 Under Construction | Phase 7.0 防禦性路由佔位 | ✅ 設計如此 |
| 系統設定 Under Construction | Phase 7.0 防禦性路由佔位 | ✅ 設計如此 |
### 4.2 實際路由確認
| 路由 | 存在 | 說明 |
|------|------|------|
| `/` (Dashboard) | ✅ | 主儀表板 |
| `/action-logs` | ✅ | 行動日誌 |
| `/authorizations` | ✅ | 授權中心 (Under Construction) |
| `/errors` | ✅ | 錯誤追蹤 |
| `/knowledge-base` | ✅ | 知識殿堂 (Under Construction) |
| `/settings` | ✅ | 系統設定 (Under Construction) |
| `/demo` | ✅ | Demo 頁面 |
---
## 五、結論
### ✅ Phase 19 正式環境驗證通過
- **所有頁面** 可正常存取
- **API** 健康檢查通過
- **響應式設計** 三種裝置尺寸驗證通過
- **國際化** 繁中/英文版本正常
- **測試路徑錯誤** 已修復
### 驗收簽核
| 項目 | 簽核 |
|------|------|
| 首席架構師 | Claude Code ✅ |
| 測試日期 | 2026-03-28 18:29 |
| 修復完成 | 測試路徑錯誤已修正 |
---
**Generated by Claude Code (首席架構師)**

View File

@@ -5,17 +5,17 @@
---
## 📍 當前狀態 (2026-03-29 09:25 台北)
## 📍 當前狀態 (2026-03-29 01:35 台北)
| 項目 | 狀態 |
|------|------|
| **當前 Phase** | ✅ **Phase 20 Nemotron Tool Calling (P1 修復完成)** |
| **當前 Phase** | ✅ **Phase 19.6 測試收尾 + P1-P3 全部完成** |
| **Day** | Day 12 |
| **K3s 版本** | v1.34.5+k3s1 (mon + mon1) |
| **叢集健康** | ✅ **所有 Pod 正常運行** |
| **K3s 優化** | ✅ **全部完成 + P2/P3 + PSS** |
| **K3s 優化** | ✅ **全部完成 + P1-P3 + PSS** |
| **K-MON** | ✅ **監控整合** (VIP/Velero/SignOz/Sentry 告警) |
| **K3 HPA** | ✅ **API/Web 2-4 自動擴展** (CPU 13%/21%) |
| **K3 HPA** | ✅ **API/Web 2-6 自動擴展** (P3 maxReplicas 調升) |
| **K4 Kured** | ✅ **自動重啟 (02:00-04:00 維護窗口)** |
| **K4 Descheduler** | ✅ **負載均衡 (每 2 小時, threshold 30%)** |
| **K4.3 PSS** | ✅ **Pod Security Standards (6 Namespace labels)** 🆕 |
@@ -49,7 +49,7 @@
---
### ✅ 2026-03-29 Phase 19.6 測試收尾 + P1/P2 改進 (Day 12 01:00) 🆕
### ✅ 2026-03-29 Phase 19.6 測試收尾 + P1-P3 改進 (Day 12 01:30) 🆕
| 項目 | 內容 | 狀態 |
|------|------|------|
@@ -79,18 +79,21 @@
---
### ✅ 2026-03-29 Phase 20 Nemotron P1 修復完成 (Day 12 09:20) 🆕
### ✅ 2026-03-29 Phase 20 Nemotron P1+P2 全部完成 (Day 12 10:30) 🆕
| 項目 | 內容 | 狀態 |
|------|------|------|
| **ADR-036** | Nemotron Tool Calling 整合 | ✅ **已實作** |
| **NvidiaProvider** | Tool Calling + HITL 保護 | ✅ **完成** |
| **測試驗證** | tests/test_nvidia_provider.py | ✅ **15/15 PASSED** |
| **測試驗證** | tests/test_nvidia_provider.py | ✅ **25/25 PASSED** |
| **CD 部署** | CD #23689363463 | ✅ **成功** |
| **Tool Calling 驗證** | restart_pod 測試 | ✅ **正確解析** |
| **首席架構師審查** | 82/100 → 86/100 | ✅ **P1 修復** |
| **Langfuse 整合** | LangfuseTraceContext | ✅ **P1-1 修復** |
| **OTEL Tracing** | start_as_current_span | ✅ **P1-2 修復** |
| **首席架構師審查** | 82/100 → 86/100 → 90/100 | ✅ **P1+P2 修復** |
| **P1-1 Langfuse** | LangfuseTraceContext | ✅ **修復** |
| **P1-2 OTEL** | start_as_current_span | ✅ **修復** |
| **P2-1 Protocol** | INvidiaProvider (@runtime_checkable) | ✅ **修復** |
| **P2-2 邊界測試** | 15 → 25 測試案例 | ✅ **修復** |
| **P2-3 model_registry** | NVIDIA + tool_calling_fallback_order | ✅ **修復** |
**驗證結果** (2026-03-29 08:51):
```

View File

@@ -225,3 +225,78 @@ groups:
annotations:
summary: "⚠️ TLS 探測失敗"
description: "無法連線到 {{ $labels.instance }} 進行 TLS 檢查"
# ===== kube-state-metrics 擴充告警 (P3 2026-03-29) =====
- name: kube_state_extended
rules:
# CronJob 上次執行失敗
- alert: CronJobLastRunFailed
expr: kube_job_status_failed{namespace="awoooi-prod"} > 0
for: 1m
labels:
severity: warning
team: ops
component: cronjob
annotations:
summary: "⚠️ CronJob 執行失敗"
description: "Job {{ $labels.job_name }} 執行失敗"
# DaemonSet 缺少 Pod
- alert: DaemonSetMissingPods
expr: kube_daemonset_status_number_unavailable{namespace=~"awoooi-prod|kube-system|velero"} > 0
for: 5m
labels:
severity: warning
team: ops
component: daemonset
annotations:
summary: "⚠️ DaemonSet 缺少 Pod"
description: "{{ $labels.daemonset }} 缺少 {{ $value }} 個 Pod"
# StatefulSet 副本不足
- alert: StatefulSetReplicasMismatch
expr: kube_statefulset_status_replicas_ready{namespace="awoooi-prod"} != kube_statefulset_replicas{namespace="awoooi-prod"}
for: 5m
labels:
severity: warning
team: ops
component: statefulset
annotations:
summary: "⚠️ StatefulSet 副本不足"
description: "{{ $labels.statefulset }} 預期副本數與就緒數不符"
# 容器長時間等待 (ImagePullBackOff/CrashLoopBackOff)
- alert: ContainerWaiting
expr: kube_pod_container_status_waiting_reason{namespace="awoooi-prod", reason=~"ImagePullBackOff|CrashLoopBackOff|ErrImagePull"} == 1
for: 10m
labels:
severity: warning
team: ops
component: container
annotations:
summary: "⚠️ 容器等待中"
description: "{{ $labels.pod }}/{{ $labels.container }} 處於 {{ $labels.reason }} 狀態"
# PDB 違規 (可用 Pod 數低於 minAvailable)
- alert: PDBViolation
expr: kube_poddisruptionbudget_status_current_healthy{namespace="awoooi-prod"} < kube_poddisruptionbudget_status_desired_healthy{namespace="awoooi-prod"}
for: 5m
labels:
severity: warning
team: ops
component: pdb
annotations:
summary: "⚠️ PDB 違規"
description: "{{ $labels.poddisruptionbudget }} 健康 Pod 數低於期望值"
# 節點被 cordon / 標記為不可排程 (spec.unschedulable) 超過 30 分鐘
- alert: NodeUnschedulable
expr: kube_node_spec_unschedulable == 1
for: 30m
labels:
severity: info
team: ops
component: node
annotations:
summary: " 節點標記為不可排程"
description: "節點 {{ $labels.node }} 已被標記為 cordon/unschedulable 超過 30 分鐘"