diff --git a/apps/api/src/agents/solver_agent.py b/apps/api/src/agents/solver_agent.py
index 2702527d..ab733551 100644
--- a/apps/api/src/agents/solver_agent.py
+++ b/apps/api/src/agents/solver_agent.py
@@ -105,10 +105,17 @@ class SolverAgent(BaseAgent):
                 vote=AgentVote.ABSTAIN,
             )
 
+        # 2026-04-17 ogt + Claude Sonnet 4.6 (Checkpoint-2 environment awareness):
+        # Root cause: with no cluster context the LLM "blind-guessed" resource names → awooiii-api (three i's) → K8s not found
+        # Fix: fetch the real Deployment list before generating commands and inject it into the prompt so the LLM aligns with real names
+        # Failure is harmless: kubectl timeout or refusal → _k8s_inventory is empty → the prompt still works, just without the name lock
+        _k8s_inventory = await _fetch_k8s_inventory(namespace="awoooi-prod")
+
         prompt = self._build_prompt({
             "hypothesis": top.description,
             "category": top.category,
             "confidence": top.confidence,
+            "k8s_inventory": _k8s_inventory,
         })
 
         # 2026-04-16 ogt + Claude Sonnet 4.6: 傳遞 hypothesis 結構化資料給 OPENCLAW_NEMO
@@ -150,12 +157,20 @@ class SolverAgent(BaseAgent):
         # → auto_approve Condition 1c 拒絕(無 kubectl 關鍵字)
         # → blast_radius_calculator 永遠不被調用(fill rate = 0%)
         # 修復:要求 action 必須是真實 kubectl 命令,並提供正確範例
+        # 2026-04-17 ogt + Claude Sonnet 4.6 (Checkpoint-2): inject the cluster's actual Deployment list
+        # The LLM must pick resource names from this list and must not invent its own
+        _inventory = context.get("k8s_inventory", "")
+        _inventory_section = (
+            f"\n🔒 叢集實際 Deployment 清單(awoooi-prod)— 必須從此清單選擇資源名稱:\n{_inventory}\n"
+            if _inventory
+            else "\n⚠️ 無法取得叢集清單,請謹慎填寫資源名稱。\n"
+        )
         return f"""你是 AWOOOI SRE 系統的軍師 Agent,專職修復方案設計。
 
 根因假設:{context.get("hypothesis", "")}
 告警類別:{context.get("category", "")}
 診斷信心:{context.get("confidence", 0.0):.0%}
-
+{_inventory_section}
 你的工作:為此根因提出 1-3 個修復候選方案。
 每個方案必須評估:
 - blast_radius(0-100):影響範圍(越高 = 風險越大)
@@ -221,6 +236,76 @@ blast_radius 參考:
 # Helpers
 # ─────────────────────────────────────────────────────────────────────────────
 
+async def _fetch_k8s_inventory(namespace: str = "awoooi-prod", timeout_sec: float = 5.0) -> str:
+    """
+    Fetch the names of the Deployments/StatefulSets present in a namespace, for the Solver prompt.
+
+    2026-04-17 ogt + Claude Sonnet 4.6 (Checkpoint-2 environment awareness):
+    - Queries the cluster's real resources before kubectl commands are generated, so the
+      LLM does not hallucinate resource names (e.g. awooiii-api)
+    - Timeout or failure → returns "" (the caller degrades to warning mode; the Solver
+      main flow is not interrupted)
+    - Runs a read-only `kubectl get` only; the cluster is not modified
+
+    kubectl is launched from an argument vector rather than a shell command string, so
+    `namespace` is never parsed by a shell and a timeout kill targets kubectl itself.
+
+    Args:
+        namespace: Namespace whose Deployments/StatefulSets are listed.
+        timeout_sec: Seconds to wait for kubectl before killing it.
+
+    Returns:
+        A string of the form "awoooi-api, awoooi-web, postgres, ..."; "" on timeout,
+        non-zero kubectl exit status, empty output, or any other failure.
+    """
+    import asyncio as _asyncio
+
+    argv = [
+        "kubectl", "get", "deployments,statefulsets",
+        "-n", namespace,
+        "-o", "jsonpath={.items[*].metadata.name}",
+    ]
+    try:
+        proc = await _asyncio.create_subprocess_exec(
+            *argv,
+            stdout=_asyncio.subprocess.PIPE,
+            # Diagnostics are not used; DEVNULL replaces the former `2>/dev/null`.
+            stderr=_asyncio.subprocess.DEVNULL,
+        )
+        try:
+            stdout, _ = await _asyncio.wait_for(proc.communicate(), timeout=timeout_sec)
+        except _asyncio.TimeoutError:
+            proc.kill()
+            await proc.wait()  # reap the killed child instead of leaving it unreaped
+            logger.warning("k8s_inventory_timeout", namespace=namespace, timeout_sec=timeout_sec)
+            return ""
+
+        if proc.returncode != 0:
+            # A failed query may still have printed a partial list; handing that to the
+            # LLM as the authoritative inventory would be worse than handing it none.
+            logger.warning(
+                "k8s_inventory_failed",
+                namespace=namespace,
+                error=f"kubectl exited with status {proc.returncode}",
+            )
+            return ""
+
+        # jsonpath output is whitespace-separated; turn it into a readable comma list.
+        names = (stdout or b"").decode("utf-8", errors="replace").split()
+        if not names:
+            return ""
+
+        inventory = ", ".join(names)
+        logger.debug("k8s_inventory_fetched", namespace=namespace, count=len(names))
+        return inventory
+
+    except Exception as _e:
+        # Deliberate best-effort boundary (kubectl missing, spawn failure, ...): the
+        # inventory is optional, so log and fall back to "".
+        logger.warning("k8s_inventory_failed", namespace=namespace, error=str(_e))
+        return ""
+
+
 def _extract_candidates(parsed: dict[str, Any]) -> list[CandidateAction]:
     """從 LLM 解析結果提取候選方案(按信心降序)。
 
diff --git a/apps/api/src/services/approval_execution.py b/apps/api/src/services/approval_execution.py
index e0ddb8ac..b34d490b 100644
--- a/apps/api/src/services/approval_execution.py
+++ b/apps/api/src/services/approval_execution.py
@@ -107,7 +107,7 @@ class ApprovalExecutionService:
         # 瞬態錯誤 → 可重試
         return any(kw in lower for kw in cls._TRANSIENT_ERROR_KEYWORDS)
 
-    async def execute_approved_action(self, approval: ApprovalRequest) -> None:
+    async def execute_approved_action(self, approval: ApprovalRequest) -> bool:
         """
         背景執行已批准的操作
 
@@ -115,8 +115,16 @@ class ApprovalExecutionService:
         Phase 5: 執行後更新資料庫狀態
         Phase 6: 執行後發送通知 (Post-Execution Hook)
 
+        2026-04-17 ogt + Claude Sonnet 4.6: return a bool indicating whether the K8s execution succeeded
+        Root cause: this used to return None → the decision_manager.py auto-execute path could not learn the result
+                    → it always passed success=True to _push_auto_repair_result → false-success broadcast
+        Fix: return result.success so the caller decides the Telegram message itself
+
         Args:
             approval: 已批准的授權請求
+
+        Returns:
+            bool: True = K8s execution succeeded, False = execution failed (including parse failure)
         """
         from src.services.notifications import ExecutionStatus
 
@@ -164,7 +172,7 @@ class ApprovalExecutionService:
                     error_message="Could not parse operation type",
                 )
             )
-            return
+            return False  # parse failure → nothing was executed
 
         # ADR-076 Task 3: 執行失敗重試機制
         # 瞬態錯誤 (connection refused, timeout 等) 自動重試,最多 MAX_RETRY 次
@@ -317,6 +325,8 @@ class ApprovalExecutionService:
             except Exception as _resolve_e:
                 logger.warning("incident_resolve_after_execution_failed", error=str(_resolve_e))
 
+            return True  # K8s execution succeeded
+
         else:
             logger.error(
                 "background_execution_failed",
@@ -379,6 +389,7 @@ class ApprovalExecutionService:
                     approval_id=str(approval.id),
                     timeout_sec=30.0,
                 )
+        return False  # K8s execution failed
 
     async def _push_execution_result_to_alert(
         self,
diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py
index d022f708..aa3bb4f2 100644
--- a/apps/api/src/services/decision_manager.py
+++ b/apps/api/src/services/decision_manager.py
@@ -2040,23 +2040,29 @@ class DecisionManager:
                                    incident_id=incident.incident_id, error=str(_mb_err))
 
                 # 執行
+                # 2026-04-17 ogt + Claude Sonnet 4.6 (Checkpoint-1 false-success fix):
+                # Old bug: execute_approved_action returned None → this path always passed success=True to
+                # _push_auto_repair_result → Telegram showed ✅ "auto-repair completed" even when K8s rejected the command
+                # Fix: execute_approved_action now returns a bool, which is passed through to the notification function
                 executor = ApprovalExecutionService()
-                await executor.execute_approved_action(approval)
+                _exec_success = await executor.execute_approved_action(approval)
 
                 # 更新狀態
                 token.state = DecisionState.COMPLETED
                 token.proposal_data["auto_executed"] = True
+                token.proposal_data["exec_success"] = _exec_success
                 await self._save_token(token)
 
                 logger.info(
                     "auto_execute_completed",
                     incident_id=incident.incident_id,
                     action=approval.action,
+                    exec_success=_exec_success,
                 )
 
-                # 2026-04-09 Claude Sonnet 4.6: 執行成功 → 發 Telegram 結果通知
+                # 2026-04-09 Claude Sonnet 4.6: execution finished → send the Telegram result notification (on success and on failure)
                 _fire_and_forget(
-                    _push_auto_repair_result(incident, action, success=True)
+                    _push_auto_repair_result(incident, action, success=_exec_success)
                 )
 
             except Exception as e: