From a258d877676722b7c6cedf1399404c065b329f06 Mon Sep 17 00:00:00 2001 From: OG T Date: Thu, 16 Apr 2026 19:56:07 +0800 Subject: [PATCH] =?UTF-8?q?fix(webhooks+prompts):=20=E4=BF=AE=E5=BE=A9=20L?= =?UTF-8?q?LM=20=E5=B0=8D=E6=89=80=E6=9C=89=E5=91=8A=E8=AD=A6=E4=B8=80?= =?UTF-8?q?=E5=BE=8B=E8=BC=B8=E5=87=BA=E3=80=8C=E9=87=8D=E5=95=9F=20AWOOOI?= =?UTF-8?q?=20=E6=9C=8D=E5=8B=99=E3=80=8D=E7=9A=84=E6=A0=B9=E6=9C=AC?= =?UTF-8?q?=E5=95=8F=E9=A1=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 根因 (INC-20260416-C365D0 postgres 磁碟告警事故): 1. alert_context 中 alertname 埋在 labels 深處,LLM 看到 alert_type="custom" → 不知道是什麼告警 2. 快取鍵用 alert_type:target_resource → 不同 alertname 共用同一快取 → 全部回傳第一個 LLM 結果 3. 系統 Prompt 無 alert-category 指導 → LLM 永遠輸出 kubectl rollout restart 修復: - webhooks.py: alert_context 置頂加入 alertname + alert_category + annotations - openclaw.py: 快取鍵改用 alertname:target_resource(告警名稱才是主要識別符) - prompts.py: OPENCLAW_SYSTEM_PROMPT + NEMOTRON_SYSTEM_PROMPT 加入 Alert-Specific Analysis Rules database/storage 告警 → NO_ACTION + 調查指令;K8s 告警 → 對應重啟指令 禁止對非 K8s 告警輸出 kubectl rollout restart deployment/awoooi-prod 2026-04-16 ogt + Claude Sonnet 4.6(亞太) Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/api/v1/webhooks.py | 6 ++++++ apps/api/src/core/prompts.py | 35 +++++++++++++++++++++++++++---- apps/api/src/services/openclaw.py | 9 ++++++-- 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index fad40c41..c56b8f2f 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -1377,13 +1377,19 @@ async def alertmanager_webhook( # ========================================================================== # 新告警 - LLM 分析 # ========================================================================== + # 2026-04-16 ogt + Claude Sonnet 4.6: 修復 — alertname 置頂,LLM 才能知道是什麼告警 + # 舊版 alertname 埋在 labels 中,alert_type 永遠是 "custom" + # → LLM 全部輸出「重啟 AWOOOI 
服務」(見 INC-20260416-C365D0 postgres 磁碟告警事故) alert_context = { + "alertname": alertname, # 主要識別符 — LLM 必讀 + "alert_category": alert_category, # kubernetes/database/storage/host_resource/ssl_cert "alert_type": alert_type, "severity": severity, "source": "alertmanager", "target_resource": target_resource, "namespace": namespace, "message": message, + "annotations": dict(alert.annotations) if alert.annotations else {}, "metrics": {}, "labels": alert.labels, } diff --git a/apps/api/src/core/prompts.py b/apps/api/src/core/prompts.py index ea9d7a82..aaf8f637 100644 --- a/apps/api/src/core/prompts.py +++ b/apps/api/src/core/prompts.py @@ -103,6 +103,23 @@ For each optimization suggestion, provide EXECUTABLE kubectl commands: } ``` +## 🔑 Alert-Specific Analysis Rules (CRITICAL — read alertname first) +The `alertname` field is your PRIMARY signal. Use it to determine the problem type and appropriate action: + +| Alert category / alertname pattern | suggested_action | kubectl_command guidance | +|-------------------------------------|-----------------|--------------------------| +| contains "Disk", "Storage", "PVC", "Volume" | NO_ACTION | `kubectl exec <pod> -- df -h` or `kubectl get pvc -n <namespace>` | +| contains "Postgres", "MySQL", "Redis", "DB", "Database" | NO_ACTION | `kubectl exec <pod> -- psql` or `kubectl logs <pod>` | +| contains "CrashLoop", "OOMKilled", "Pod" | DELETE_POD or RESTART_DEPLOYMENT | `kubectl delete pod <pod> -n <namespace>` | +| contains "CPU", "Memory", "Resource" | TUNE_RESOURCES or SCALE_DEPLOYMENT | `kubectl top pod -n <namespace>` or HPA command | +| contains "Node", "NodeNotReady" | NO_ACTION | `kubectl describe node <node>` | +| contains "SSL", "Certificate", "Cert" | NO_ACTION | `kubectl get certificate -n <namespace>` | +| alert_category = "database" | NO_ACTION | DB investigation commands only | +| alert_category = "storage" | NO_ACTION | `kubectl get pvc`, `kubectl exec <pod> -- df -h` | + +**NEVER** use `kubectl rollout restart deployment/awoooi-prod` for database, storage, or network alerts. 
+Make `action_title` describe the ACTUAL problem from alertname (not generic "自動修復 AWOOOI 服務"). + ## 🔥 Short Example: High CPU -> SCALE_DEPLOYMENT, HPA, risk_level=medium Please carefully justify your confidence between 0.0 and 1.0 (e.g. 0.82) based on symptoms and metrics. @@ -138,16 +155,26 @@ OPENCLAW_TEST_PROMPT = """你是 AWOOOI AIOps 平台的智慧助手 OpenClaw。 NEMOTRON_SYSTEM_PROMPT = """# OpenClaw Lightweight (Nemo-4B Optimized) You are an SRE AI. Analyze the alert and respond with ONLY valid JSON. +## CRITICAL: Read alertname first +The `alertname` field tells you what kind of problem this is. Use it: +- "Disk/Storage/PVC/Volume" → suggested_action=NO_ACTION, kubectl_command="kubectl get pvc" or "kubectl exec <pod> -- df -h" +- "Postgres/MySQL/Redis/DB/Database" → suggested_action=NO_ACTION, DB investigation commands +- "CrashLoop/OOM/Pod" → suggested_action=DELETE_POD or RESTART_DEPLOYMENT +- "CPU/Memory/Resource" → suggested_action=TUNE_RESOURCES or SCALE_DEPLOYMENT +- "SSL/Cert" → suggested_action=NO_ACTION +NEVER use "kubectl rollout restart deployment/awoooi-prod" for database or storage alerts. +Make action_title describe the ACTUAL problem (not generic "自動修復 AWOOOI 服務"). 
+ ## Required JSON Schema: { "confidence": <float 0.0-1.0>, "reasoning": "簡短理由 (繁體中文)", "primary_responsibility": "FE|BE|INFRA|DB|COLLAB", "risk_level": "low|medium|critical", - "action_title": "操作標題 (繁體中文)", - "description": "根因分析 (繁體中文)", - "suggested_action": "RESTART_DEPLOYMENT|DELETE_POD|SCALE_DEPLOYMENT|NO_ACTION", - "kubectl_command": "kubectl 指令", + "action_title": "操作標題,必須反映 alertname 的實際問題 (繁體中文)", + "description": "根因分析,說明 alertname 代表的問題及建議調查步驟 (繁體中文)", + "suggested_action": "RESTART_DEPLOYMENT|DELETE_POD|SCALE_DEPLOYMENT|TUNE_RESOURCES|NO_ACTION", + "kubectl_command": "針對此告警類型的 kubectl 指令", "target_resource": "目標資源", "namespace": "K8s namespace", "blast_radius": {"affected_pods": 1, "estimated_downtime": "~30s"} diff --git a/apps/api/src/services/openclaw.py b/apps/api/src/services/openclaw.py index 6a07282c..0b01441a 100644 --- a/apps/api/src/services/openclaw.py +++ b/apps/api/src/services/openclaw.py @@ -741,10 +741,15 @@ class OpenClawService: 2026-03-29 ogt: 加入 Token/Cost 追蹤 """ # 生成快取鍵 (基於 prompt + alert_context hash) + # 2026-04-16 ogt + Claude Sonnet 4.6: 修復 — alertname 才是主要識別符 + # 舊版用 alert_type:target_resource → 不同告警 (e.g. PostgreSQLDiskGrowth vs PodCrashLoop) + # 在 alert_type="custom" 時共用同一快取鍵 → 全部回傳相同 LLM 結果 context_hash = "" if alert_context: - # 使用告警類型 + 目標資源作為上下文 hash - context_hash = f"{alert_context.get('alert_type', '')}:{alert_context.get('target_resource', '')}" + # alertname 優先;無 alertname 時 fallback 到 alert_type + _alertname = alert_context.get("alertname") or alert_context.get("alert_type", "") + _target = alert_context.get("target_resource", "") + context_hash = f"{_alertname}:{_target}" cache_key = self._generate_cache_key(prompt, context_hash)