diff --git a/apps/api/src/agents/solver_agent.py b/apps/api/src/agents/solver_agent.py index 868173a6..5567843a 100644 --- a/apps/api/src/agents/solver_agent.py +++ b/apps/api/src/agents/solver_agent.py @@ -10,11 +10,14 @@ AWOOOI AIOps Phase 2 — Solver Agent(軍師) 1. 每個 Hypothesis 至少產 1 個 CandidateAction 2. blast_radius 評分影響 Reviewer 的審查嚴格度 3. blast_radius > 50 → Reviewer 必須 request_revision -4. 熔斷降級:LLM 失敗 → rule-based mock(基於 category 推 RESTART 為兜底動作) +4. 熔斷降級:LLM 失敗 → graceful degraded(candidates=[], recommended_actions=[], degraded=True) + 禁止 rule-based mock / hardcode RESTART 兜底(北極星 §1.1 修復多樣性 ≥ 40%,禁止寫死規則) 5. Solver 不直接觸碰執行層(Coordinator 的工作) +6. recommended_actions:結構化 MCP 動作清單,供 B3 Telegram 按鈕動態生成 ADR-082: Phase 2 多 Agent 協作 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 2 初始建立 +2026-04-27 Claude Sonnet 4.6: B1 — Solver 結構化動作 (北極星 §1.1 修復多樣性 ≥ 40%) """ from __future__ import annotations @@ -24,9 +27,11 @@ import hashlib import os import re import time +from pathlib import Path from typing import Any import structlog +import yaml from src.agents.base import BaseAgent from src.agents.protocol import ( @@ -35,6 +40,7 @@ from src.agents.protocol import ( AgentVote, CandidateAction, DiagnosisReport, + RecommendedAction, ) from src.observability.agent_step_metrics import observe_agent_step from src.services.sanitization_service import sanitize @@ -62,7 +68,7 @@ PHASE2_STEP_TIMEOUT_SEC = AGENT_SOLVER_TIMEOUT_SEC # 1. 分隔符改為顯式 [ ](ASCII 空格),明確排除 \n\r\t\f\v # 2. 字元類改為 [A-Za-z0-9 \-=./:_,@](顯式空格,非 \s) # 3. 有界 quantifier {1,500} 防止無界回溯 -# 4. re.ASCII 旗標禁用 Unicode 空白匹配(如   等不可見字元) +# 4. 
def _load_mcp_tool_registry() -> dict[str, dict[str, Any]]:
    """Load the MCP action registry from ``callback_action_spec.yaml``.

    The spec file at ``_CALLBACK_SPEC_PATH`` is read and parsed on every call,
    so the registry reflects the file's current contents rather than a
    hardcoded list.

    Loading is best-effort and never raises:

    * If the file cannot be read or parsed, or its top level / ``actions``
      section is not a mapping, a ``mcp_registry_load_failed`` warning is
      logged and ``{}`` is returned.
    * If a single action entry (or its ``mcp`` section) is not a mapping, only
      that entry is skipped with a ``mcp_registry_entry_skipped`` warning; the
      remaining entries are still returned. Previously one malformed entry
      discarded the whole registry.

    Returns:
        ``{action_name: {"provider", "tool", "risk", "label", "emoji",
        "params", "description"}}``, or ``{}`` when nothing could be loaded.
    """
    # Keep the try body limited to the I/O + parse step. The broad except is a
    # deliberate best-effort boundary: a broken spec must not abort the Solver.
    try:
        with _CALLBACK_SPEC_PATH.open(encoding="utf-8") as f:
            # safe_load: never construct arbitrary Python objects from YAML.
            spec = yaml.safe_load(f)
    except Exception as e:
        logger.warning("mcp_registry_load_failed", path=str(_CALLBACK_SPEC_PATH), error=str(e))
        return {}

    # An empty YAML document parses to None; any non-mapping top level is unusable.
    if not isinstance(spec, dict):
        logger.warning(
            "mcp_registry_load_failed",
            path=str(_CALLBACK_SPEC_PATH),
            error=f"top-level YAML is {type(spec).__name__}, expected a mapping",
        )
        return {}

    actions = spec.get("actions") or {}
    if not isinstance(actions, dict):
        logger.warning(
            "mcp_registry_load_failed",
            path=str(_CALLBACK_SPEC_PATH),
            error=f"'actions' is {type(actions).__name__}, expected a mapping",
        )
        return {}

    registry: dict[str, dict[str, Any]] = {}
    for name, action in actions.items():
        if not isinstance(action, dict):
            logger.warning(
                "mcp_registry_entry_skipped",
                action=str(name),
                reason=f"entry is {type(action).__name__}, expected a mapping",
            )
            continue

        mcp = action.get("mcp") or {}
        if not isinstance(mcp, dict):
            logger.warning(
                "mcp_registry_entry_skipped",
                action=str(name),
                reason=f"'mcp' is {type(mcp).__name__}, expected a mapping",
            )
            continue

        registry[name] = {
            "provider": mcp.get("provider", ""),
            "tool": mcp.get("tool", ""),
            # NOTE(review): a missing risk falls back to "low" — confirm that
            # this fail-open default is intended.
            "risk": action.get("risk", "low"),
            "label": action.get("label", name),
            "emoji": action.get("emoji", ""),
            "params": mcp.get("params") or {},
            "description": action.get("description", ""),
        }
    return registry
RecommendedAction | None: + """ + 驗證單一 recommended_action LLM 輸出是否符合 RecommendedAction schema。 + + 2026-04-27 Claude Sonnet 4.6: B1 — schema 驗證,不合規則 graceful skip + - mcp_provider 必須在 _VALID_MCP_PROVIDERS 清單 + - risk 必須在 _VALID_RISK_LEVELS 清單 + - 任何欄位缺失 → 返回 None,由呼叫端記 warning 並 skip + + Returns: + RecommendedAction — 驗證通過 + None — 不符 schema,呼叫端 skip 此項(不假造) + """ + if not isinstance(raw, dict): + return None + + name = str(raw.get("name", "")).strip() + label = str(raw.get("label", "")).strip() + emoji = str(raw.get("emoji", "")).strip() + mcp_provider = str(raw.get("mcp_provider", "")).strip().lower() + mcp_tool = str(raw.get("mcp_tool", "")).strip() + risk = str(raw.get("risk", "")).strip().lower() + reasoning = str(raw.get("reasoning", "")).strip() + params_raw = raw.get("params", {}) + + # 必填欄位非空 + if not name or not label or not mcp_provider or not mcp_tool or not risk: + return None + + # provider 白名單驗證 + if mcp_provider not in _VALID_MCP_PROVIDERS: + logger.warning( + "solver_recommended_action_invalid_provider", + name=name, + mcp_provider=mcp_provider, + valid=sorted(_VALID_MCP_PROVIDERS), + ) + return None + + # risk 白名單驗證 + if risk not in _VALID_RISK_LEVELS: + logger.warning( + "solver_recommended_action_invalid_risk", + name=name, + risk=risk, + valid=sorted(_VALID_RISK_LEVELS), + ) + return None + + # params 型別保護(必須是 dict[str, str]) + if not isinstance(params_raw, dict): + params: dict[str, str] = {} + else: + params = {str(k): str(v) for k, v in params_raw.items()} + + return RecommendedAction( + name=name[:64], + label=label[:80], + emoji=emoji[:8], + mcp_provider=mcp_provider, # type: ignore[arg-type] # 已驗過在 Literal 範圍 + mcp_tool=mcp_tool[:80], + params=params, + risk=risk, # type: ignore[arg-type] # 已驗過在 Literal 範圍 + reasoning=reasoning[:400], + ) + + +def _extract_recommended_actions(parsed: dict[str, Any]) -> list[RecommendedAction]: + """ + 從 LLM 解析結果提取 recommended_actions(按 schema 驗證)。 + + 2026-04-27 Claude Sonnet 4.6: B1 — schema 驗證 + 
graceful degraded + - LLM 輸出不合 schema → 記 warning + skip(不假造) + - 最多取 3 個(每假設 0-3 個動作) + - 返回空列表 = 降級,呼叫端設 degraded=True + + Returns: + list[RecommendedAction](0-3 個,驗證通過的項目) + """ + raw_list = parsed.get("recommended_actions", []) + if not isinstance(raw_list, list): + return [] + + result: list[RecommendedAction] = [] + for idx, raw in enumerate(raw_list[:5]): # 最多處理 5 個,取前 3 個通過驗證的 + action = _validate_recommended_action(raw) + if action is None: + logger.warning( + "solver_recommended_action_schema_invalid", + index=idx, + raw=str(raw)[:200], + reason="欄位缺失或不符白名單,skip(不假造)", + ) + continue + result.append(action) + if len(result) >= 3: + break + + return result + + class SolverAgent(BaseAgent): """ Solver Agent — 修復方案軍師 @@ -133,7 +281,7 @@ class SolverAgent(BaseAgent): timeout_sec: 已廢棄 (2026-04-16 ogt) — LLM 等完整回應,真實異常才降級 Returns: - ActionPlan(真實異常時 degraded=True) + ActionPlan(真實異常時 degraded=True,recommended_actions=[],不假造) """ start_ms = int(time.monotonic() * 1000) @@ -146,6 +294,7 @@ class SolverAgent(BaseAgent): latency_ms=0, vote=AgentVote.ABSTAIN, degraded=diagnosis.degraded, + recommended_actions=[], ) try: @@ -154,6 +303,7 @@ class SolverAgent(BaseAgent): logger.info( "solver_done", candidates=len(plan.candidates), + recommended_actions=len(plan.recommended_actions), vote=plan.vote, latency_ms=plan.latency_ms, ) @@ -173,6 +323,7 @@ class SolverAgent(BaseAgent): diagnosis_report=diagnosis, latency_ms=0, vote=AgentVote.ABSTAIN, + recommended_actions=[], ) # 2026-04-17 ogt + Claude Sonnet 4.6 (Checkpoint-2 環境感知): @@ -181,11 +332,15 @@ class SolverAgent(BaseAgent): # 失敗無害:kubectl 超時或拒絕 → _k8s_inventory 為空 → prompt 仍正常但無鎖定效果 _k8s_inventory = await _fetch_k8s_inventory(namespace="awoooi-prod") + # 2026-04-27 Claude Sonnet 4.6: B1 — 動態載入 MCP tool 清單注入 prompt + mcp_registry = _load_mcp_tool_registry() + prompt = self._build_prompt({ "hypothesis": top.description, "category": top.category, "confidence": top.confidence, "k8s_inventory": _k8s_inventory, + 
"mcp_registry": mcp_registry, }) # 2026-04-16 ogt + Claude Sonnet 4.6: 傳遞 hypothesis 結構化資料給 OPENCLAW_NEMO @@ -227,6 +382,16 @@ class SolverAgent(BaseAgent): parsed = self._parse_response(sanitize(response_text, "solver_output")) candidates = _extract_candidates(parsed) + # 2026-04-27 Claude Sonnet 4.6: B1 — 提取 recommended_actions(schema 驗證) + # LLM 輸出不合 schema → graceful skip(log warn + empty actions),不假造 + recommended_actions = _extract_recommended_actions(parsed) + if not recommended_actions: + logger.info( + "solver_recommended_actions_empty", + reason="LLM 未輸出合法 recommended_actions 或 schema 驗證失敗", + snapshot_id=diagnosis.evidence_snapshot_id, + ) + # 2026-04-25 ogt + Claude Sonnet 4.6: 非 K8s target 後置過濾(P0 修復) # 根因:GiteaMemoryPressure 告警觸發 Solver → LLM 生成 "kubectl scale deployment gitea" # Gitea 在主機 docker-compose,不在 awoooi-prod namespace → 執行必然失敗 @@ -247,6 +412,7 @@ class SolverAgent(BaseAgent): diagnosis_report=diagnosis, latency_ms=0, vote=AgentVote.APPROVE, + recommended_actions=recommended_actions, ) def _build_prompt(self, context: dict[str, Any]) -> str: @@ -258,6 +424,7 @@ class SolverAgent(BaseAgent): # 修復:要求 action 必須是真實 kubectl 命令,並提供正確範例 # 2026-04-17 ogt + Claude Sonnet 4.6 (Checkpoint-2): 注入 K8s 實際 Deployment 清單 # LLM 必須從此清單選擇資源名稱,不可自行編造 + # 2026-04-27 Claude Sonnet 4.6: B1 — 注入 MCP tool 清單 + recommended_actions 要求 _inventory = context.get("k8s_inventory", "") _inventory_section = ( f"\n🔒 叢集實際 Deployment 清單(awoooi-prod)— 必須從此清單選擇資源名稱:\n{_inventory}\n" @@ -276,16 +443,40 @@ class SolverAgent(BaseAgent): if _inventory else "" ) + + # 2026-04-27 Claude Sonnet 4.6: B1 — 動態 MCP tool 清單注入 + # 只注入 provider/tool/risk/label,避免 params 模板混淆 LLM + mcp_registry: dict[str, dict[str, Any]] = context.get("mcp_registry") or {} + if mcp_registry: + _mcp_lines = [] + for action_name, info in mcp_registry.items(): + _mcp_lines.append( + f" - name: {action_name}" + f" | provider: {info['provider']}" + f" | tool: {info['tool']}" + f" | risk: {info['risk']}" + f" | 
label: {info['label']}" + ) + _mcp_section = ( + "\n可用 MCP actions(請從此清單選擇 recommended_actions):\n" + + "\n".join(_mcp_lines) + + "\n" + ) + else: + _mcp_section = "\n(MCP action 清單暫不可用,recommended_actions 可留空陣列)\n" + return f"""你是 AWOOOI SRE 系統的軍師 Agent,專職修復方案設計。 根因假設:{context.get("hypothesis", "")} 告警類別:{context.get("category", "")} 診斷信心:{context.get("confidence", 0.0):.0%} -{_inventory_section}{_non_k8s_warning} -你的工作:為此根因提出 1-3 個修復候選方案。 -每個方案必須評估: -- blast_radius(0-100):影響範圍(越高 = 風險越大) -- rollback_cost(0-100):回滾難度(越高 = 越難還原) +{_inventory_section}{_non_k8s_warning}{_mcp_section} +你的工作:為此根因提出 1-3 個修復候選方案,同時輸出 0-3 個結構化 recommended_actions。 + +candidates 格式規則: +- action 欄位必須是真實的 kubectl 命令(不可用自然語言描述) +- 目標資源格式:deployment/,命名空間統一用 awoooi-prod +- 每個方案必須評估 blast_radius(0-100 影響範圍)和 rollback_cost(0-100 回滾難度) blast_radius 參考: - kubectl rollout restart deployment = 10 @@ -295,8 +486,12 @@ blast_radius 參考: - kubectl delete deployment = 75 - kubectl delete pvc = 95 -🔴 關鍵規則:action 欄位必須是真實的 kubectl 命令,不可用自然語言描述。 -目標資源格式:deployment/,命名空間統一用 awoooi-prod。 +recommended_actions 規則(北極星 §1.1 修復多樣性): +- 不要全部是 restart 類動作,至少 1 個是查看 log 或診斷類(低風險) +- mcp_provider 必須是以下之一:k8s | ssh | prometheus | signoz | database | internal +- risk 必須是以下之一:low | medium | high | critical +- critical risk 的動作必須在 reasoning 說明原因 +- params 中可使用模板:{{labels.namespace}} / {{labels.pod}} / {{incident_id}} 以 JSON 回覆: {{ @@ -307,6 +502,35 @@ blast_radius 參考: "rollback_cost": 5, "confidence": 0.8, "rationale": "重啟可清除 OOM 導致的記憶體碎片化" + }}, + {{ + "action": "kubectl top pods -n awoooi-prod --sort-by=memory", + "blast_radius": 0, + "rollback_cost": 0, + "confidence": 0.9, + "rationale": "先確認哪個 Pod 記憶體使用最高再決定操作" + }} + ], + "recommended_actions": [ + {{ + "name": "check_pod_logs", + "label": "查 Pod Log", + "emoji": "📋", + "mcp_provider": "k8s", + "mcp_tool": "k8s_get_pod_logs", + "params": {{"namespace": "awoooi-prod", "pod": "{{labels.pod}}", "tail_lines": "50"}}, + "risk": "low", + "reasoning": "先查 log 確認 OOM 
根因,避免盲目重啟" + }}, + {{ + "name": "k8s_restart", + "label": "重啟", + "emoji": "🔄", + "mcp_provider": "k8s", + "mcp_tool": "kubectl_restart", + "params": {{"namespace": "awoooi-prod", "deployment": "{{labels.deployment}}"}}, + "risk": "medium", + "reasoning": "確認 OOM 後重啟清除記憶體碎片" }} ] }}""" @@ -323,23 +547,29 @@ blast_radius 參考: latency_ms: int, reason: str = "unknown", ) -> ActionPlan: - """熔斷降級:rule-based mock(依 category 推 RESTART 兜底)""" - category = diagnosis.top_hypothesis.category if diagnosis.top_hypothesis else "Unknown" - fallback_action = _default_action_for_category(category) + """ + 熔斷降級:graceful degraded(candidates=[], recommended_actions=[], degraded=True) + + 2026-04-27 Claude Sonnet 4.6: B1 — 砍掉 rule-based mock RESTART 兜底 + 舊行為:LLM 失敗 → category-based RESTART mock(hardcode 違反北極星 §1.1) + 新行為:candidates=[],recommended_actions=[],degraded=True,明確標記降級 + - 不假造任何動作(Coordinator 會因 candidates=[] 進入人工審核路徑) + - Telegram 顯示「AI 降級,請人工研判」而非錯誤的 hardcode 建議 + + 北極星 §1.1「禁止寫死規則」— 降級不等於兜底假造,是誠實的能力邊界聲明。 + """ + logger.warning( + "solver_degraded", + reason=reason, + snapshot_id=diagnosis.evidence_snapshot_id if diagnosis else None, + ) return ActionPlan( - candidates=[ - CandidateAction( - action=fallback_action, - blast_radius=20, - rollback_cost=5, - confidence=0.2, - rationale=f"[降級] LLM 分析失敗({reason}),使用類別 {category} 的預設兜底動作", - ) - ], + candidates=[], diagnosis_report=diagnosis, latency_ms=latency_ms, vote=AgentVote.DEGRADED, degraded=True, + recommended_actions=[], ) @@ -627,7 +857,7 @@ def _extract_candidates(parsed: dict[str, Any]) -> list[CandidateAction]: rationale=f"[語意合成] Nemo 建議「{action_title[:80]}」→ 轉為 kubectl 指令", )] - # 缺乏資源名稱或無法合成 → return [](交由 _degraded_plan 輸出 category-based 調查指令) + # 缺乏資源名稱或無法合成 → return [](交由 _degraded_plan 輸出 empty actions) if not _target and any(w in _at_lower for w in ("rollback", "undo", "restart", "重啟", "回滾", "還原", "重新啟動")): logger.warning( "solver_synthesis_insufficient_context", @@ -672,42 +902,6 @@ def 
_extract_candidates(parsed: dict[str, Any]) -> list[CandidateAction]: return candidates -def _default_action_for_category(category: str) -> str: - """降級時的預設調查指令 — 必須是真實 kubectl/ssh 命令(調查優先,不執行破壞性操作) - - 2026-04-17 ogt + Claude Sonnet 4.6: 改為真實 kubectl 指令 - 2026-04-24 ogt + Claude Sonnet 4.6: 擴展非 K8s 類別(ClickHouse/主機磁碟/DB) - 根因:SentryClickHouseMemoryPressure/HostDiskUsageHigh 類別不符任何 K8s 關鍵字 - → 全部 fallback 到 "kubectl get pods"(無意義診斷指令) - 修復:加入 clickhouse/database/sentry/host/node/infra 類別映射 - """ - category_lower = category.lower() - # K8s workload 層 - if "pod" in category_lower or "kube" in category_lower or "crash" in category_lower: - return "kubectl get pods -n awoooi-prod -o wide" - if "cpu" in category_lower or "load" in category_lower: - return "kubectl top pods -n awoooi-prod --sort-by=cpu" - if "memory" in category_lower or "oom" in category_lower: - return "kubectl top pods -n awoooi-prod --sort-by=memory" - if "network" in category_lower or "connect" in category_lower: - return "kubectl get services -n awoooi-prod" - if "disk" in category_lower or "storage" in category_lower or "pvc" in category_lower: - return "kubectl exec -n awoooi-prod deployment/postgresql -- df -h" - # 外部服務層(非 K8s — 唯讀診斷) - if "clickhouse" in category_lower or "sentry" in category_lower: - return "kubectl get pods -n awoooi-prod -l app=sentry -o wide" - if "database" in category_lower or "postgres" in category_lower or "redis" in category_lower: - return "kubectl get pods -n awoooi-prod -l tier=database -o wide" - if "rollback" in category_lower or "deploy" in category_lower or "version" in category_lower: - return "kubectl rollout history deployment -n awoooi-prod" - if "latency" in category_lower or "slow" in category_lower or "timeout" in category_lower: - return "kubectl top pods -n awoooi-prod --sort-by=cpu" - # 主機層(host/node/infra — 調查指令,kubectl 只查 node 資訊) - if "host" in category_lower or "node" in category_lower or "infra" in category_lower: - return "kubectl describe nodes 
| grep -A5 'Conditions\\|Allocatable'" - return "kubectl get pods -n awoooi-prod -o wide" - - def compute_input_hash(diagnosis: DiagnosisReport) -> str: """計算 Solver 輸入的 fingerprint。""" key = diagnosis.evidence_snapshot_id + (