diff --git a/apps/api/src/services/openclaw.py b/apps/api/src/services/openclaw.py index e8f00cd1..a4ae9fc0 100644 --- a/apps/api/src/services/openclaw.py +++ b/apps/api/src/services/openclaw.py @@ -1262,19 +1262,33 @@ class OpenClawService: Trace URL: {signoz_trace_url} """ + # Step 0.5: 擷取 K8s 叢集真實資源清單(Checkpoint-2 webhook path) + # 2026-04-17 ogt + Claude Sonnet 4.6: 防止 NemoTron 幻覺 deployment/awoooi-service + # 根因:webhook path 沒有叢集上下文 → LLM 盲猜資源名稱 → kubectl not found → trust 0 永遠 + # 修復:每次分析前先拉真實 Deployment 清單,注入 prompt 強制 LLM 對齊 + _k8s_ns = alert_context.get("namespace", "awoooi-prod") + _k8s_inventory = await _fetch_k8s_inventory_for_openclaw(namespace=_k8s_ns) + k8s_section = ( + f"\n\n## 🔒 叢集實際資源清單({_k8s_ns})\n" + f"kubectl_command 與 target_resource **必須**從以下名稱選擇,不可自行編造:\n" + f"{_k8s_inventory}\n" + if _k8s_inventory + else "\n\n## ⚠️ 無法取得叢集清單,target_resource 請依 alertname 推斷,勿編造。\n" + ) + # 格式化告警為 Prompt (2026-03-31 ogt: 強力截斷以符合 NVIDIA 4K 限制) # 優先保留 System Prompt,截斷 Alert Data - available_len = 3500 - len(OPENCLAW_SYSTEM_PROMPT) - len(signoz_context) + available_len = 3500 - len(OPENCLAW_SYSTEM_PROMPT) - len(signoz_context) - len(k8s_section) if available_len < 500: # 如果 SignOz 太長,也截斷它 signoz_context = signoz_context[:500] + "... (truncated)" - available_len = 3500 - len(OPENCLAW_SYSTEM_PROMPT) - len(signoz_context) + available_len = 3500 - len(OPENCLAW_SYSTEM_PROMPT) - len(signoz_context) - len(k8s_section) alert_json = json.dumps(alert_context, ensure_ascii=False, indent=2) if len(alert_json) > available_len: alert_json = alert_json[:available_len] + "... 
async def _fetch_k8s_inventory_for_openclaw(
    namespace: str = "awoooi-prod",
    timeout_sec: float = 3.0,
) -> str:
    """
    Return the names of the Deployments/StatefulSets present in a namespace.

    The result is meant to be injected into the alert-analysis prompt so the
    LLM picks ``target_resource`` / ``kubectl_command`` from resources that
    actually exist instead of inventing names.

    Only a read-only ``kubectl get`` is executed; the cluster is never
    modified. The call is best-effort: every failure mode (invalid namespace,
    ``kubectl`` missing, timeout, empty output) yields ``""`` so the caller's
    main flow is never interrupted.

    Args:
        namespace: Kubernetes namespace to list. It may originate from alert
            payload data, so it is validated as an RFC 1123 label and passed
            to ``kubectl`` as a discrete argument (never through a shell).
        timeout_sec: Maximum number of seconds to wait for ``kubectl``.

    Returns:
        A comma-separated string such as ``"awoooi-api, awoooi-web"``, or
        ``""`` when the inventory could not be obtained.
    """
    import asyncio as _asyncio
    import re as _re
    import structlog as _structlog

    _logger = _structlog.get_logger(__name__)

    # Namespaces are RFC 1123 labels: lowercase alphanumerics and '-', at most
    # 63 characters, starting and ending with an alphanumeric. Rejecting
    # anything else keeps attacker-controlled text out of the command line.
    if not isinstance(namespace, str) or not _re.fullmatch(
        r"[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?", namespace
    ):
        _logger.warning(
            "k8s_inventory_failed_openclaw",
            namespace=str(namespace),
            error="invalid namespace name",
        )
        return ""

    try:
        # Argument-vector form: no shell is involved, so the namespace cannot
        # be interpreted as shell syntax. stderr is discarded, matching the
        # previous `2>/dev/null` behaviour.
        proc = await _asyncio.create_subprocess_exec(
            "kubectl",
            "get",
            "deployments,statefulsets",
            "-n",
            namespace,
            "-o",
            "jsonpath={.items[*].metadata.name}",
            stdout=_asyncio.subprocess.PIPE,
            stderr=_asyncio.subprocess.DEVNULL,
        )
        try:
            stdout, _ = await _asyncio.wait_for(
                proc.communicate(), timeout=timeout_sec
            )
        except _asyncio.TimeoutError:
            try:
                proc.kill()
            except ProcessLookupError:
                # The process exited between the timeout and the kill.
                pass
            # Reap the child so it does not linger as a zombie.
            await proc.wait()
            _logger.warning("k8s_inventory_timeout_openclaw", namespace=namespace)
            return ""

        # jsonpath emits whitespace-separated names; split() already strips
        # and drops empty tokens, and joining an empty list yields "".
        names = (stdout or b"").decode("utf-8", errors="replace").split()
        return ", ".join(names)
    except Exception as _e:
        # Best-effort boundary: e.g. kubectl not installed or spawn failure.
        _logger.warning(
            "k8s_inventory_failed_openclaw", namespace=namespace, error=str(_e)
        )
        return ""