From 7e9448f6d024a501c9ebf6f770c6abaa139ba77a Mon Sep 17 00:00:00 2001 From: OG T Date: Sat, 18 Apr 2026 23:26:08 +0800 Subject: [PATCH] =?UTF-8?q?fix(openclaw):=20=E5=B9=BB=E8=A6=BA=20deploymen?= =?UTF-8?q?t=20=E5=90=8D=E9=9B=99=E5=B1=A4=E9=98=B2=E7=A6=A6=20=E2=80=94?= =?UTF-8?q?=20Prompt=20+=20Python=20validator?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2026-04-18 晚(台北時區)— ogt + Claude Opus 4.7 (1M) 生產事件 (approval f763bedf, 22:58): - Alert: KubePodCrashLooping, labels.deployment="awoooi-api" - NEMOTRON 雖收 inventory "awoooi-api, awoooi-web, awoooi-worker" 仍輸出 kubectl_command="kubectl rollout restart deployment/awoooi-prod" (把 namespace 誤當 deployment 名) - 執行結果: "Deployment 'awoooi-prod' not found in namespace 'awoooi-prod'" ## Layer 1: NEMOTRON_SYSTEM_PROMPT 強化 (prompts.py) 新增「🔒 DEPLOYMENT NAME RULE (STRICTLY ENFORCED)」區塊: - namespace NEVER is a deployment name - "awoooi-prod" 是 NAMESPACE,不可寫 deployment/awoooi-prod - 若有 inventory,deployment 必須 exact match - 優先用 labels.deployment,unknown → NO_ACTION ## Layer 2: Python 後驗證 (openclaw.py:1322+) LLM 回應解析後 regex 抽出 deployment 名,對照 _k8s_inventory: - 在清單內 → 通過 - 不在清單內 → 降級: * kubectl_command → "kubectl get deploy -n {ns}"(純調查) * suggested_action → NO_ACTION * target_resource → "unknown(hallucinated)" * confidence → 0.0 * description 加註 [安全降級] 並列出合法 inventory - log 'openclaw_deployment_hallucination_detected' 記錄 效果: 就算 LLM 無視 prompt,Python 層也會擋下。 破壞性 kubectl 絕不執行於不存在的 deployment。 Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/api/src/core/prompts.py | 11 ++++++++- apps/api/src/services/openclaw.py | 40 +++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/apps/api/src/core/prompts.py b/apps/api/src/core/prompts.py index 49720213..66f94c3b 100644 --- a/apps/api/src/core/prompts.py +++ b/apps/api/src/core/prompts.py @@ -155,6 +155,15 @@ OPENCLAW_TEST_PROMPT = """你是 AWOOOI AIOps 平台的智慧助手 OpenClaw。 NEMOTRON_SYSTEM_PROMPT = """# 
OpenClaw Lightweight (Nemo-4B Optimized) You are an SRE AI. Analyze the alert and respond with ONLY valid JSON. +## 🔒 DEPLOYMENT NAME RULE (STRICTLY ENFORCED) +- `namespace` is NEVER a deployment name. +- "awoooi-prod" is a NAMESPACE, NOT a deployment. NEVER write `deployment/awoooi-prod`. +- When "叢集實際資源清單" is provided, `target_resource` and deployment in + `kubectl_command` MUST match one of those names exactly. +- If alert has `labels.deployment`, prefer it over guessing. +- Unknown target → suggested_action=NO_ACTION, kubectl_command= + "kubectl get deploy -n " (investigation only). + ## CRITICAL: Read alertname first The `alertname` field tells you what kind of problem this is. Use it: - "Disk/Storage/PVC/Volume" → suggested_action=NO_ACTION, kubectl_command="kubectl get pvc" or "kubectl exec -- df -h" @@ -162,7 +171,7 @@ The `alertname` field tells you what kind of problem this is. Use it: - "CrashLoop/OOM/Pod" → suggested_action=DELETE_POD or RESTART_DEPLOYMENT - "CPU/Memory/Resource" → suggested_action=TUNE_RESOURCES or SCALE_DEPLOYMENT - "SSL/Cert" → suggested_action=NO_ACTION -NEVER use "kubectl rollout restart deployment/awoooi-prod" for database or storage alerts. +NEVER use "kubectl rollout restart deployment/awoooi-prod" (that is the NAMESPACE, not a deployment). Make action_title describe the ACTUAL problem (not generic "自動修復 AWOOOI 服務"). 
## Required JSON Schema: diff --git a/apps/api/src/services/openclaw.py b/apps/api/src/services/openclaw.py index a4ae9fc0..9966f743 100644 --- a/apps/api/src/services/openclaw.py +++ b/apps/api/src/services/openclaw.py @@ -1321,6 +1321,46 @@ Trace URL: {signoz_trace_url} # 解析結果 result = self._parse_analysis_result(raw_response) + # 2026-04-18 ogt + Claude Opus 4.7: 幻覺 deployment 名偵測與降級 (Checkpoint-3) + # 根因: NEMOTRON 即使 prompt 有 inventory 仍會拿 namespace "awoooi-prod" 當 deployment 名 + # → 執行時 kubectl rollout restart deployment/awoooi-prod → "not found" + # 修復: LLM 回應後 Python 驗證 kubectl_command 中的 deployment 名是否在 inventory + # 不在 → 降級為 NO_ACTION + 改成調查 kubectl get deploy(無破壞,只排查) + if result and _k8s_inventory: + _inventory_names = {n.strip() for n in _k8s_inventory.split(",") if n.strip()} + _kcmd = (result.kubectl_command or "").lower() + import re as _re + _m = _re.search(r"deployment[/\s]+([a-z0-9][a-z0-9-]*)", _kcmd) + if _m: + _deploy_guess = _m.group(1) + if _deploy_guess not in _inventory_names: + logger.warning( + "openclaw_deployment_hallucination_detected", + hallucinated=_deploy_guess, + inventory=sorted(_inventory_names), + original_kubectl_cmd=result.kubectl_command, + original_action=result.suggested_action.value if hasattr(result.suggested_action, 'value') else str(result.suggested_action), + ) + # 降級為安全調查動作,不執行破壞性操作 + result.kubectl_command = f"kubectl get deploy -n {_k8s_ns}" + result.target_resource = "unknown(hallucinated)" + # Pydantic enum 處理: 若不能直接賦 str 就跳過 + try: + from src.models.openclaw_schema import SuggestedAction as _SA + result.suggested_action = _SA.NO_ACTION + except Exception: + pass + result.description = ( + f"[安全降級] 原 LLM 建議的 deployment '{_deploy_guess}' 不在叢集 inventory " + f"({', '.join(sorted(_inventory_names))})。" + f"已降級為純調查動作,請手動確認實際問題資源。" + ) + # 信心度歸零 + try: + result.confidence = 0.0 + except Exception: + pass + if result: logger.info( "openclaw_analysis_complete",