fix(solver+execution): Checkpoint-1 假成功修復 + Checkpoint-2 K8s 環境感知
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 10m55s
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 10m55s
## Checkpoint-1: 假成功根治
- `approval_execution.py`: `execute_approved_action` 改返回 `bool`(原返回 `None`,呼叫端無法判斷 K8s 是否接受指令)
- `decision_manager.py` auto-execute 路徑: 用 `_exec_success` 取代硬編 `success=True`
- 修復: K8s 拒絕指令時正確發 ❌,而非「✅ 自動修復完成」

## Checkpoint-2: K8s 環境感知 (Inventory Pre-flight)
- `solver_agent.py`: 新增 `_fetch_k8s_inventory()` — 生成 kubectl 指令前先執行 `kubectl get deployments,statefulsets -n awoooi-prod`,將真實名稱清單注入 Solver prompt,LLM 必須從清單選擇,防止幻覺(例如 `awooiii-api`,三個 i)
- 超時 5s 或失敗 → 返回 `""`,prompt 顯示警示但不中斷主流程

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -105,10 +105,17 @@ class SolverAgent(BaseAgent):
|
||||
vote=AgentVote.ABSTAIN,
|
||||
)
|
||||
|
||||
# 2026-04-17 ogt + Claude Sonnet 4.6 (Checkpoint-2 環境感知):
|
||||
# 根因:LLM 在無叢集上下文時「盲猜」資源名稱 → awooiii-api(三個 i)→ K8s not found
|
||||
# 修復:生成指令前先拉取實際 Deployment 清單,注入 prompt 讓 LLM 對齊真實名稱
|
||||
# 失敗無害:kubectl 超時或拒絕 → _k8s_inventory 為空 → prompt 仍正常但無鎖定效果
|
||||
_k8s_inventory = await _fetch_k8s_inventory(namespace="awoooi-prod")
|
||||
|
||||
prompt = self._build_prompt({
|
||||
"hypothesis": top.description,
|
||||
"category": top.category,
|
||||
"confidence": top.confidence,
|
||||
"k8s_inventory": _k8s_inventory,
|
||||
})
|
||||
|
||||
# 2026-04-16 ogt + Claude Sonnet 4.6: 傳遞 hypothesis 結構化資料給 OPENCLAW_NEMO
|
||||
@@ -150,12 +157,20 @@ class SolverAgent(BaseAgent):
|
||||
# → auto_approve Condition 1c 拒絕(無 kubectl 關鍵字)
|
||||
# → blast_radius_calculator 永遠不被調用(fill rate = 0%)
|
||||
# 修復:要求 action 必須是真實 kubectl 命令,並提供正確範例
|
||||
# 2026-04-17 ogt + Claude Sonnet 4.6 (Checkpoint-2): 注入 K8s 實際 Deployment 清單
|
||||
# LLM 必須從此清單選擇資源名稱,不可自行編造
|
||||
_inventory = context.get("k8s_inventory", "")
|
||||
_inventory_section = (
|
||||
f"\n🔒 叢集實際 Deployment 清單(awoooi-prod)— 必須從此清單選擇資源名稱:\n{_inventory}\n"
|
||||
if _inventory
|
||||
else "\n⚠️ 無法取得叢集清單,請謹慎填寫資源名稱。\n"
|
||||
)
|
||||
return f"""你是 AWOOOI SRE 系統的軍師 Agent,專職修復方案設計。
|
||||
|
||||
根因假設:{context.get("hypothesis", "")}
|
||||
告警類別:{context.get("category", "")}
|
||||
診斷信心:{context.get("confidence", 0.0):.0%}
|
||||
|
||||
{_inventory_section}
|
||||
你的工作:為此根因提出 1-3 個修復候選方案。
|
||||
每個方案必須評估:
|
||||
- blast_radius(0-100):影響範圍(越高 = 風險越大)
|
||||
@@ -221,6 +236,48 @@ blast_radius 參考:
|
||||
# Helpers
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def _fetch_k8s_inventory(namespace: str = "awoooi-prod", timeout_sec: float = 5.0) -> str:
    """
    Fetch the names of the Deployments/StatefulSets that exist in a namespace.

    The result is injected into the Solver prompt so the LLM picks resource
    names from the real cluster inventory instead of inventing them
    (e.g. the hallucinated ``awooiii-api``).

    Only a read-only ``kubectl get`` is executed; the cluster is never modified.
    The function is best-effort: any timeout or failure yields ``""`` so the
    caller can degrade to its "inventory unavailable" warning without
    interrupting the main Solver flow.

    Args:
        namespace: Kubernetes namespace to list resources from.
        timeout_sec: Maximum number of seconds to wait for kubectl to finish.

    Returns:
        A comma-separated string such as ``"awoooi-api, awoooi-web, postgres"``,
        or ``""`` when kubectl times out, fails, or returns no resources.
    """
    import asyncio as _asyncio

    try:
        # Argument-list form (no shell): `namespace` can never be interpreted
        # as shell syntax, and the jsonpath expression needs no quoting.
        proc = await _asyncio.create_subprocess_exec(
            "kubectl",
            "get",
            "deployments,statefulsets",
            "-n",
            namespace,
            "-o",
            "jsonpath={.items[*].metadata.name}",
            stdout=_asyncio.subprocess.PIPE,
            # stderr is never inspected, so discard it instead of buffering it.
            stderr=_asyncio.subprocess.DEVNULL,
        )
        try:
            stdout, _ = await _asyncio.wait_for(proc.communicate(), timeout=timeout_sec)
        except _asyncio.TimeoutError:
            try:
                proc.kill()
            except ProcessLookupError:
                # The process exited between the timeout firing and kill().
                pass
            # Reap the child so a hung kubectl does not linger as a zombie.
            await proc.wait()
            logger.warning("k8s_inventory_timeout", namespace=namespace, timeout_sec=timeout_sec)
            return ""

        if proc.returncode != 0:
            # A failing kubectl may still have printed a partial list; treat it
            # as a failure rather than presenting incomplete data as the
            # authoritative inventory.
            logger.warning(
                "k8s_inventory_failed",
                namespace=namespace,
                error=f"kubectl exited with code {proc.returncode}",
            )
            return ""

        raw = (stdout or b"").decode("utf-8", errors="replace").strip()
        if not raw:
            return ""

        # jsonpath output is whitespace-separated; str.split() already drops
        # empty tokens, so the names can be joined directly.
        names = raw.split()
        inventory = ", ".join(names)
        logger.debug("k8s_inventory_fetched", namespace=namespace, count=len(names))
        return inventory

    except Exception as _e:
        # Best-effort boundary: kubectl missing, permission errors, etc. must
        # not break the Solver; fall back to the "no inventory" mode.
        logger.warning("k8s_inventory_failed", namespace=namespace, error=str(_e))
        return ""
|
||||
|
||||
|
||||
def _extract_candidates(parsed: dict[str, Any]) -> list[CandidateAction]:
|
||||
"""從 LLM 解析結果提取候選方案(按信心降序)。
|
||||
|
||||
|
||||
@@ -107,7 +107,7 @@ class ApprovalExecutionService:
|
||||
# 瞬態錯誤 → 可重試
|
||||
return any(kw in lower for kw in cls._TRANSIENT_ERROR_KEYWORDS)
|
||||
|
||||
async def execute_approved_action(self, approval: ApprovalRequest) -> None:
|
||||
async def execute_approved_action(self, approval: ApprovalRequest) -> bool:
|
||||
"""
|
||||
背景執行已批准的操作
|
||||
|
||||
@@ -115,8 +115,16 @@ class ApprovalExecutionService:
|
||||
Phase 5: 執行後更新資料庫狀態
|
||||
Phase 6: 執行後發送通知 (Post-Execution Hook)
|
||||
|
||||
2026-04-17 ogt + Claude Sonnet 4.6: 返回 bool 表示 K8s 執行成功與否
|
||||
根本原因: 原本返回 None → decision_manager.py auto-execute 路徑無法得知結果
|
||||
→ 永遠傳 success=True 給 _push_auto_repair_result → 假成功廣播
|
||||
修復: 返回 result.success,讓呼叫端自行決定 Telegram 訊息
|
||||
|
||||
Args:
|
||||
approval: 已批准的授權請求
|
||||
|
||||
Returns:
|
||||
bool: True = K8s 執行成功,False = 執行失敗(含解析失敗)
|
||||
"""
|
||||
from src.services.notifications import ExecutionStatus
|
||||
|
||||
@@ -164,7 +172,7 @@ class ApprovalExecutionService:
|
||||
error_message="Could not parse operation type",
|
||||
)
|
||||
)
|
||||
return
|
||||
return False # 解析失敗 → 執行未發生
|
||||
|
||||
# ADR-076 Task 3: 執行失敗重試機制
|
||||
# 瞬態錯誤 (connection refused, timeout 等) 自動重試,最多 MAX_RETRY 次
|
||||
@@ -317,6 +325,8 @@ class ApprovalExecutionService:
|
||||
except Exception as _resolve_e:
|
||||
logger.warning("incident_resolve_after_execution_failed", error=str(_resolve_e))
|
||||
|
||||
return True # K8s 執行成功
|
||||
|
||||
else:
|
||||
logger.error(
|
||||
"background_execution_failed",
|
||||
@@ -379,6 +389,7 @@ class ApprovalExecutionService:
|
||||
approval_id=str(approval.id),
|
||||
timeout_sec=30.0,
|
||||
)
|
||||
return False # K8s 執行失敗
|
||||
|
||||
async def _push_execution_result_to_alert(
|
||||
self,
|
||||
|
||||
@@ -2040,23 +2040,29 @@ class DecisionManager:
|
||||
incident_id=incident.incident_id, error=str(_mb_err))
|
||||
|
||||
# 執行
|
||||
# 2026-04-17 ogt + Claude Sonnet 4.6 (Checkpoint-1 假成功修復):
|
||||
# 舊 bug: execute_approved_action 返回 None → 此處永遠傳 success=True 給
|
||||
# _push_auto_repair_result → Telegram 顯示 ✅ 自動修復完成,即使 K8s 拒絕了指令
|
||||
# 修復: execute_approved_action 現在返回 bool,正確透傳給通知函數
|
||||
executor = ApprovalExecutionService()
|
||||
await executor.execute_approved_action(approval)
|
||||
_exec_success = await executor.execute_approved_action(approval)
|
||||
|
||||
# 更新狀態
|
||||
token.state = DecisionState.COMPLETED
|
||||
token.proposal_data["auto_executed"] = True
|
||||
token.proposal_data["exec_success"] = _exec_success
|
||||
await self._save_token(token)
|
||||
|
||||
logger.info(
|
||||
"auto_execute_completed",
|
||||
incident_id=incident.incident_id,
|
||||
action=approval.action,
|
||||
exec_success=_exec_success,
|
||||
)
|
||||
|
||||
# 2026-04-09 Claude Sonnet 4.6: 執行成功 → 發 Telegram 結果通知
|
||||
# 2026-04-09 Claude Sonnet 4.6: 執行完成 → 發 Telegram 結果通知(成功或失敗皆發)
|
||||
_fire_and_forget(
|
||||
_push_auto_repair_result(incident, action, success=True)
|
||||
_push_auto_repair_result(incident, action, success=_exec_success)
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user