diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index fad40c41..c56b8f2f 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -1377,13 +1377,19 @@ async def alertmanager_webhook( # ========================================================================== # 新告警 - LLM 分析 # ========================================================================== + # 2026-04-16 ogt + Claude Sonnet 4.6: 修復 — alertname 置頂,LLM 才能知道是什麼告警 + # 舊版 alertname 埋在 labels 中,alert_type 永遠是 "custom" + # → LLM 全部輸出「重啟 AWOOOI 服務」(見 INC-20260416-C365D0 postgres 磁碟告警事故) alert_context = { + "alertname": alertname, # 主要識別符 — LLM 必讀 + "alert_category": alert_category, # kubernetes/database/storage/host_resource/ssl_cert "alert_type": alert_type, "severity": severity, "source": "alertmanager", "target_resource": target_resource, "namespace": namespace, "message": message, + "annotations": dict(alert.annotations) if alert.annotations else {}, "metrics": {}, "labels": alert.labels, } diff --git a/apps/api/src/core/prompts.py b/apps/api/src/core/prompts.py index ea9d7a82..aaf8f637 100644 --- a/apps/api/src/core/prompts.py +++ b/apps/api/src/core/prompts.py @@ -103,6 +103,23 @@ For each optimization suggestion, provide EXECUTABLE kubectl commands: } ``` +## 🔑 Alert-Specific Analysis Rules (CRITICAL — read alertname first) +The `alertname` field is your PRIMARY signal. 
Use it to determine the problem type and appropriate action: + +| Alert category / alertname pattern | suggested_action | kubectl_command guidance | +|-------------------------------------|-----------------|--------------------------| +| contains "Disk", "Storage", "PVC", "Volume" | NO_ACTION | `kubectl exec <pod> -- df -h` or `kubectl get pvc -n <namespace>` | +| contains "Postgres", "MySQL", "Redis", "DB", "Database" | NO_ACTION | `kubectl exec <pod> -- psql` or `kubectl logs <pod>` | +| contains "CrashLoop", "OOMKilled", "Pod" | DELETE_POD or RESTART_DEPLOYMENT | `kubectl delete pod <pod> -n <namespace>` | +| contains "CPU", "Memory", "Resource" | TUNE_RESOURCES or SCALE_DEPLOYMENT | `kubectl top pod -n <namespace>` or HPA command | +| contains "Node", "NodeNotReady" | NO_ACTION | `kubectl describe node <node>` | +| contains "SSL", "Certificate", "Cert" | NO_ACTION | `kubectl get certificate -n <namespace>` | +| alert_category = "database" | NO_ACTION | DB investigation commands only | +| alert_category = "storage" | NO_ACTION | `kubectl get pvc`, `kubectl exec <pod> -- df -h` | + +**NEVER** use `kubectl rollout restart deployment/awoooi-prod` for database, storage, or network alerts. +Make `action_title` describe the ACTUAL problem from alertname (not generic "自動修復 AWOOOI 服務"). + ## 🔥 Short Example: High CPU -> SCALE_DEPLOYMENT, HPA, risk_level=medium Please carefully justify your confidence between 0.0 and 1.0 (e.g. 0.82) based on symptoms and metrics. @@ -138,16 +155,26 @@ OPENCLAW_TEST_PROMPT = """你是 AWOOOI AIOps 平台的智慧助手 OpenClaw。 NEMOTRON_SYSTEM_PROMPT = """# OpenClaw Lightweight (Nemo-4B Optimized) You are an SRE AI. Analyze the alert and respond with ONLY valid JSON. +## CRITICAL: Read alertname first +The `alertname` field tells you what kind of problem this is. 
Use it: +- "Disk/Storage/PVC/Volume" → suggested_action=NO_ACTION, kubectl_command="kubectl get pvc" or "kubectl exec <pod> -- df -h" +- "Postgres/MySQL/Redis/DB/Database" → suggested_action=NO_ACTION, DB investigation commands +- "CrashLoop/OOM/Pod" → suggested_action=DELETE_POD or RESTART_DEPLOYMENT +- "CPU/Memory/Resource" → suggested_action=TUNE_RESOURCES or SCALE_DEPLOYMENT +- "SSL/Cert" → suggested_action=NO_ACTION +NEVER use "kubectl rollout restart deployment/awoooi-prod" for database or storage alerts. +Make action_title describe the ACTUAL problem (not generic "自動修復 AWOOOI 服務"). + ## Required JSON Schema: { "confidence": <0.0-1.0>, "reasoning": "簡短理由 (繁體中文)", "primary_responsibility": "FE|BE|INFRA|DB|COLLAB", "risk_level": "low|medium|critical", - "action_title": "操作標題 (繁體中文)", - "description": "根因分析 (繁體中文)", - "suggested_action": "RESTART_DEPLOYMENT|DELETE_POD|SCALE_DEPLOYMENT|NO_ACTION", - "kubectl_command": "kubectl 指令", + "action_title": "操作標題,必須反映 alertname 的實際問題 (繁體中文)", + "description": "根因分析,說明 alertname 代表的問題及建議調查步驟 (繁體中文)", + "suggested_action": "RESTART_DEPLOYMENT|DELETE_POD|SCALE_DEPLOYMENT|TUNE_RESOURCES|NO_ACTION", + "kubectl_command": "針對此告警類型的 kubectl 指令", "target_resource": "目標資源", "namespace": "K8s namespace", "blast_radius": {"affected_pods": 1, "estimated_downtime": "~30s"} diff --git a/apps/api/src/services/openclaw.py b/apps/api/src/services/openclaw.py index 6a07282c..0b01441a 100644 --- a/apps/api/src/services/openclaw.py +++ b/apps/api/src/services/openclaw.py @@ -741,10 +741,15 @@ class OpenClawService: 2026-03-29 ogt: 加入 Token/Cost 追蹤 """ # 生成快取鍵 (基於 prompt + alert_context hash) + # 2026-04-16 ogt + Claude Sonnet 4.6: 修復 — alertname 才是主要識別符 + # 舊版用 alert_type:target_resource → 不同告警 (e.g. 
PostgreSQLDiskGrowth vs PodCrashLoop) + # 在 alert_type="custom" 時共用同一快取鍵 → 全部回傳相同 LLM 結果 context_hash = "" if alert_context: - # 使用告警類型 + 目標資源作為上下文 hash - context_hash = f"{alert_context.get('alert_type', '')}:{alert_context.get('target_resource', '')}" + # alertname 優先;無 alertname 時 fallback 到 alert_type + _alertname = alert_context.get("alertname") or alert_context.get("alert_type", "") + _target = alert_context.get("target_resource", "") + context_hash = f"{_alertname}:{_target}" cache_key = self._generate_cache_key(prompt, context_hash)