From 7f200aff5f7d42a2d4339f9b46a158e85cd3e781 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 28 Apr 2026 15:05:02 +0800 Subject: [PATCH] =?UTF-8?q?fix(solver):=20=E6=B3=A8=E5=85=A5=E5=91=8A?= =?UTF-8?q?=E8=AD=A6=20labels=20=E8=AE=93=20params=20=E6=A8=A1=E6=9D=BF?= =?UTF-8?q?=E5=A1=AB=E5=85=85=E7=9C=9F=E5=AF=A6=E5=80=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 根因:Solver LLM 不知道 namespace/pod/deployment/instance 真實值, recommended_actions.params 模板({labels.namespace} 等)填不出來 → Telegram 顯示 kubectl scale deployment --replicas=(空白) 修復: - solver.run() 加 incident_labels 參數 - _build_prompt() 把 labels 顯式列出給 LLM 參考 - orchestrator 從 snapshot.alert_info.labels 取出後傳入 Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/agents/solver_agent.py | 18 +++++++++++++++++- apps/api/src/services/agent_orchestrator.py | 4 +++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/apps/api/src/agents/solver_agent.py b/apps/api/src/agents/solver_agent.py index 2a800cbc..384822fc 100644 --- a/apps/api/src/agents/solver_agent.py +++ b/apps/api/src/agents/solver_agent.py @@ -452,6 +452,7 @@ class SolverAgent(BaseAgent): self, diagnosis: DiagnosisReport, timeout_sec: float = 0.0, # noqa: ARG002 — 已廢棄,保留簽名相容性 + incident_labels: dict | None = None, # 2026-04-28: 告警 labels 注入 prompt ) -> ActionPlan: """ 根據診斷報告產出修復計畫。 @@ -459,10 +460,12 @@ class SolverAgent(BaseAgent): Args: diagnosis: Diagnostician 輸出 timeout_sec: 已廢棄 (2026-04-16 ogt) — LLM 等完整回應,真實異常才降級 + incident_labels: 原始告警 labels(用於 params 模板填充,如 namespace/pod/instance) Returns: ActionPlan(真實異常時 degraded=True,recommended_actions=[],不假造) """ + self._incident_labels = incident_labels or {} start_ms = int(time.monotonic() * 1000) # 若 Diagnostician 已棄權,Solver 也應棄權(無論降級假設是否存在) @@ -521,6 +524,7 @@ class SolverAgent(BaseAgent): "confidence": top.confidence, "k8s_inventory": _k8s_inventory, "mcp_registry": mcp_registry, + "incident_labels": getattr(self, "_incident_labels", {}), }) # 2026-04-16 ogt + Claude Sonnet 4.6: 傳遞 hypothesis 結構化資料給 OPENCLAW_NEMO @@ -661,12 +665,24 @@ class SolverAgent(BaseAgent): except (TypeError, ValueError): _confidence_pct = "0%" + # 2026-04-28 Claude Sonnet 4.6: 把告警 labels 注入 prompt + # 根因:LLM 不知道真實 namespace/pod/deployment/instance,params 模板填不出來 + # 修復:把 incident_labels 顯式列出,LLM 直接用真實值填 params + _incident_labels: dict = context.get("incident_labels") or {} + if _incident_labels: + _labels_lines = "\n".join( + f" {k}: {v}" for k, v in sorted(_incident_labels.items()) if v + ) + _labels_section = f"\n告警 Labels(params 模板可直接引用):\n{_labels_lines}\n" + else: + _labels_section = "" + return f"""你是 AWOOOI SRE 系統的軍師 Agent,專職修復方案設計。 根因假設:{_safe_hypothesis} 告警類別:{_safe_category} 診斷信心:{_confidence_pct} -{_inventory_section}{_non_k8s_warning}{_mcp_section} +{_labels_section}{_inventory_section}{_non_k8s_warning}{_mcp_section} 你的工作:依照根因假設,提出 1-3 個針對性修復方案,同時輸出 0-3 個結構化 recommended_actions。 ⚠️ 核心規則:修復方案必須對應根因,禁止無腦重啟 diff --git a/apps/api/src/services/agent_orchestrator.py b/apps/api/src/services/agent_orchestrator.py index 780a4d29..35d3b5d8 100644 --- a/apps/api/src/services/agent_orchestrator.py +++ b/apps/api/src/services/agent_orchestrator.py @@ -194,7 +194,9 @@ async def _debate( # ── Step 2: Solver ───────────────────────────────────────────────────── solver = get_solver_agent() - plan = await solver.run(diagnosis) + # 2026-04-28: 把告警 labels 傳入,讓 Solver params 模板能填真實值 + _alert_labels = (snapshot.alert_info or {}).get("labels", {}) if snapshot.alert_info else {} + plan = await solver.run(diagnosis, incident_labels=_alert_labels) await _record_turn( session_id=session_id, incident_id=incident_id,