diff --git a/apps/api/alert_rules.yaml b/apps/api/alert_rules.yaml index a715a8e0..7d32d0cd 100644 --- a/apps/api/alert_rules.yaml +++ b/apps/api/alert_rules.yaml @@ -149,14 +149,18 @@ rules: - HostClockSkewDetected - HostClockNotSynchronising response: - action_title: "⚠️ 主機告警 — 需 SSH 人工排查" - description: "⚠️ 主機層告警(node_exporter)。此告警源自主機資源,無法透過 kubectl 自動修復。請 SSH 登入主機排查根因:top / htop / df -h / journalctl -xe。" - suggested_action: NO_ACTION - kubectl_command: "" + action_title: "🔍 主機自動診斷 — SSH 收集根因" + description: "主機層告警(node_exporter)。自動 SSH 登入主機執行診斷指令,收集 CPU/記憶體/磁碟資訊後回報。" + # 2026-04-27 Claude Sonnet 4.6: 從 NO_ACTION 改為自動 SSH 診斷 + # 根因:SSH_MCP_ALLOWED_HOSTS 空白導致全部降為人工審核(飛輪完全停轉) + # 修復:補 SSH_MCP_ALLOWED_HOSTS 白名單 + 改為自動診斷指令(收集不修改,安全) + # 診斷原則:只收集資訊,不做任何改動 → risk=low 且不在 _DESTRUCTIVE_PATTERNS 清單 + suggested_action: SSH_DIAGNOSE + kubectl_command: "ssh {host} 'echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -15; echo \"=== MEMORY ===\"; free -h; echo \"=== DISK ===\"; df -h; echo \"=== LOAD ===\"; uptime'" estimated_downtime: "N/A" risk: low responsibility: INFRA - reasoning: "[規則匹配] 主機層資源告警無法自動修復,需人工登入確認高負載/高記憶體/磁碟根因後決策。禁止 kubectl restart(node_exporter 不是 K8s 服務)。" + reasoning: "[規則匹配] 主機層資源告警,自動 SSH 執行診斷指令(只讀,不修改),收集根因資訊後推送 Telegram 讓 SRE 決策。" - id: high_cpu priority: 40 diff --git a/apps/api/src/plugins/mcp/providers/ssh_provider.py b/apps/api/src/plugins/mcp/providers/ssh_provider.py index 0237daa0..58477b56 100644 --- a/apps/api/src/plugins/mcp/providers/ssh_provider.py +++ b/apps/api/src/plugins/mcp/providers/ssh_provider.py @@ -443,6 +443,15 @@ class SSHProvider(MCPToolProvider): def _build_command(self, tool_name: str, params: dict) -> str: # 所有接受用戶字串的工具,必須先通過 _validate_param() 白名單驗證 + if tool_name == "ssh_diagnose": + # 2026-04-27 Claude Sonnet 4.6: 主機告警自動診斷 — 只讀,不修改任何狀態 + return ( + "echo '=== CPU TOP ===' && ps aux --sort=-%cpu | head -15 && " + "echo '=== MEMORY ===' && free -h && " + "echo '=== DISK ===' && df -h && " + "echo '=== LOAD ===' && 
uptime" + ) + if tool_name == "ssh_get_top_processes": return "ps aux --sort=-%cpu | head -15" diff --git a/apps/api/src/services/auto_approve.py b/apps/api/src/services/auto_approve.py index f99c072a..0a52f549 100644 --- a/apps/api/src/services/auto_approve.py +++ b/apps/api/src/services/auto_approve.py @@ -301,11 +301,19 @@ class AutoApprovePolicy: # P1-2 改用 NO_EXECUTABLE_ACTION(避免污染 KM 飛輪學習資料) _raw_action = proposal_data.get("action", "") or "" _kubectl_cmd = proposal_data.get("kubectl_command", "") or "" - _has_kubectl = "kubectl" in _raw_action.lower() or "kubectl" in _kubectl_cmd.lower() - if not _has_kubectl: + # 2026-04-27 Claude Sonnet 4.6: 擴充可執行指令識別,加入 SSH 診斷路徑 + # 根因:_has_kubectl 只認 kubectl,SSH 診斷指令(主機告警)被全部攔截 → 飛輪停轉 + # 修復:ssh {host} '...' 格式也是可執行指令,允許走 _ssh_execute() 路徑 + _has_executable = ( + "kubectl" in _raw_action.lower() + or "kubectl" in _kubectl_cmd.lower() + or _raw_action.lower().strip().startswith("ssh ") + or _kubectl_cmd.lower().strip().startswith("ssh ") + ) + if not _has_executable: return self._reject( reason=AutoApproveReason.NO_EXECUTABLE_ACTION, - detail=f"Action '{_raw_action[:60] or _kubectl_cmd[:60]}' is natural language — no kubectl command, requires human review", + detail=f"Action '{_raw_action[:60] or _kubectl_cmd[:60]}' is natural language — no kubectl/ssh command, requires human review", risk_level=risk_level, trust_score=trust_score, confidence=confidence, diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index b8021782..85316bc5 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -3101,6 +3101,9 @@ class DecisionManager: return # 解析 SSH tool + params + # 2026-04-27 Claude Sonnet 4.6: 加入主機診斷路徑 + # 根因:只支援 docker/systemctl restart,主機告警 ssh {host} '...' 
格式全降級人工 + # 修復:識別 ssh_diagnose 模式,路由到 ssh_diagnose 工具 _action_lower = action.lower().strip() if _action_lower.startswith("docker restart"): _tool = "docker_restart" @@ -3108,6 +3111,9 @@ class DecisionManager: elif _action_lower.startswith("systemctl restart"): _tool = "service_restart" _service = target + elif _action_lower.startswith("ssh ") and ("ps aux" in _action_lower or "top" in _action_lower or "free" in _action_lower or "df -h" in _action_lower or "uptime" in _action_lower): + # 主機診斷指令:自動收集 CPU/記憶體/磁碟,不修改任何狀態 + _tool = "ssh_diagnose" else: logger.info( "ssh_execute_unknown_action", @@ -3125,8 +3131,9 @@ class DecisionManager: params: dict = {"host": _host} if _tool == "docker_restart": params["container"] = _container - else: + elif _tool == "service_restart": params["service"] = _service + # ssh_diagnose: 只需 host,無額外 params try: result = await self._ssh.execute(tool_name=_tool, parameters=params) diff --git a/k8s/awoooi-prod/04-configmap.yaml b/k8s/awoooi-prod/04-configmap.yaml index e96e91a8..981a715a 100644 --- a/k8s/awoooi-prod/04-configmap.yaml +++ b/k8s/awoooi-prod/04-configmap.yaml @@ -110,6 +110,10 @@ data: # SSH_MCP_ENABLED=true 需確認 ssh-mcp-key Secret 已建立且 188 已加 authorized_keys SSH_MCP_ENABLED: "true" SSH_MCP_KNOWN_HOSTS_FILE: "/etc/ssh-mcp/known_hosts" + # 2026-04-27 Claude Sonnet 4.6: 授權主機告警 SSH 自動執行白名單 + # 根因:SSH_MCP_ALLOWED_HOSTS 未設定 → _ssh_execute() 攔截 → 主機告警全部降級人工審核 + # 四台主機:110(DevOps金庫/wooo), 120(K3s-1/wooo), 121(K3s-2/wooo), 188(AI中心/ollama) + SSH_MCP_ALLOWED_HOSTS: "192.168.0.110,192.168.0.120,192.168.0.121,192.168.0.188" # MCP Phase 3 (2026-04-11 Claude Sonnet 4.6): ArgoCD + Sentry MCP 啟用 # ARGOCD_API_TOKEN 在 Secrets 中配置 ARGOCD_MCP_ENABLED: "true"