fix(ssh-auto): 主機告警 SSH 自動診斷授權(HostHighCpuLoad 不再卡人工審核)
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 9m7s
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 9m7s
根因:SSH_MCP_ALLOWED_HOSTS 未設定 → _ssh_execute() 全部攔截
+ auto_approve 只認 kubectl 不認 ssh → 主機告警永遠降級人工
修復:
- ConfigMap: 補 SSH_MCP_ALLOWED_HOSTS 四主機白名單
- alert_rules: HostHighCpuLoad 等從 NO_ACTION 改為 ssh_diagnose 指令
- auto_approve: _has_executable 加入 ssh 開頭識別
- decision_manager: _ssh_execute() 加入 ssh_diagnose 路由
- ssh_provider: 新增 ssh_diagnose tool(ps aux + free -h + df -h + uptime,只讀)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -149,14 +149,18 @@ rules:
|
||||
- HostClockSkewDetected
|
||||
- HostClockNotSynchronising
|
||||
response:
|
||||
action_title: "⚠️ 主機告警 — 需 SSH 人工排查"
|
||||
description: "⚠️ 主機層告警(node_exporter)。此告警源自主機資源,無法透過 kubectl 自動修復。請 SSH 登入主機排查根因:top / htop / df -h / journalctl -xe。"
|
||||
suggested_action: NO_ACTION
|
||||
kubectl_command: ""
|
||||
action_title: "🔍 主機自動診斷 — SSH 收集根因"
|
||||
description: "主機層告警(node_exporter)。自動 SSH 登入主機執行診斷指令,收集 CPU/記憶體/磁碟資訊後回報。"
|
||||
# 2026-04-27 Claude Sonnet 4.6: 從 NO_ACTION 改為自動 SSH 診斷
|
||||
# 根因:SSH_MCP_ALLOWED_HOSTS 空白導致全部降為人工審核(飛輪完全停轉)
|
||||
# 修復:補 SSH_MCP_ALLOWED_HOSTS 白名單 + 改為自動診斷指令(收集不修改,安全)
|
||||
# 診斷原則:只收集資訊,不做任何改動 → risk=low 且不在 _DESTRUCTIVE_PATTERNS 清單
|
||||
suggested_action: SSH_DIAGNOSE
|
||||
kubectl_command: "ssh {host} 'echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -15; echo \"=== MEMORY ===\"; free -h; echo \"=== DISK ===\"; df -h; echo \"=== LOAD ===\"; uptime'"
|
||||
estimated_downtime: "N/A"
|
||||
risk: low
|
||||
responsibility: INFRA
|
||||
reasoning: "[規則匹配] 主機層資源告警無法自動修復,需人工登入確認高負載/高記憶體/磁碟根因後決策。禁止 kubectl restart(node_exporter 不是 K8s 服務)。"
|
||||
reasoning: "[規則匹配] 主機層資源告警,自動 SSH 執行診斷指令(只讀,不修改),收集根因資訊後推送 Telegram 讓 SRE 決策。"
|
||||
|
||||
- id: high_cpu
|
||||
priority: 40
|
||||
|
||||
@@ -443,6 +443,15 @@ class SSHProvider(MCPToolProvider):
|
||||
|
||||
def _build_command(self, tool_name: str, params: dict) -> str:
|
||||
# 所有接受用戶字串的工具,必須先通過 _validate_param() 白名單驗證
|
||||
if tool_name == "ssh_diagnose":
|
||||
# 2026-04-27 Claude Sonnet 4.6: 主機告警自動診斷 — 只讀,不修改任何狀態
|
||||
return (
|
||||
"echo '=== CPU TOP ===' && ps aux --sort=-%cpu | head -15 && "
|
||||
"echo '=== MEMORY ===' && free -h && "
|
||||
"echo '=== DISK ===' && df -h && "
|
||||
"echo '=== LOAD ===' && uptime"
|
||||
)
|
||||
|
||||
if tool_name == "ssh_get_top_processes":
|
||||
return "ps aux --sort=-%cpu | head -15"
|
||||
|
||||
|
||||
@@ -301,11 +301,19 @@ class AutoApprovePolicy:
|
||||
# P1-2 改用 NO_EXECUTABLE_ACTION(避免污染 KM 飛輪學習資料)
|
||||
_raw_action = proposal_data.get("action", "") or ""
|
||||
_kubectl_cmd = proposal_data.get("kubectl_command", "") or ""
|
||||
_has_kubectl = "kubectl" in _raw_action.lower() or "kubectl" in _kubectl_cmd.lower()
|
||||
if not _has_kubectl:
|
||||
# 2026-04-27 Claude Sonnet 4.6: 擴充可執行指令識別,加入 SSH 診斷路徑
|
||||
# 根因:_has_kubectl 只認 kubectl,SSH 診斷指令(主機告警)被全部攔截 → 飛輪停轉
|
||||
# 修復:ssh {host} '...' 格式也是可執行指令,允許走 _ssh_execute() 路徑
|
||||
_has_executable = (
|
||||
"kubectl" in _raw_action.lower()
|
||||
or "kubectl" in _kubectl_cmd.lower()
|
||||
or _raw_action.lower().strip().startswith("ssh ")
|
||||
or _kubectl_cmd.lower().strip().startswith("ssh ")
|
||||
)
|
||||
if not _has_executable:
|
||||
return self._reject(
|
||||
reason=AutoApproveReason.NO_EXECUTABLE_ACTION,
|
||||
detail=f"Action '{_raw_action[:60] or _kubectl_cmd[:60]}' is natural language — no kubectl command, requires human review",
|
||||
detail=f"Action '{_raw_action[:60] or _kubectl_cmd[:60]}' is natural language — no kubectl/ssh command, requires human review",
|
||||
risk_level=risk_level,
|
||||
trust_score=trust_score,
|
||||
confidence=confidence,
|
||||
|
||||
@@ -3101,6 +3101,9 @@ class DecisionManager:
|
||||
return
|
||||
|
||||
# 解析 SSH tool + params
|
||||
# 2026-04-27 Claude Sonnet 4.6: 加入主機診斷路徑
|
||||
# 根因:只支援 docker/systemctl restart,主機告警 ssh {host} '...' 格式全降級人工
|
||||
# 修復:識別 ssh {host} '...' 診斷指令,路由到 ssh_diagnose tool(一次收集 CPU/記憶體/磁碟/負載)
|
||||
_action_lower = action.lower().strip()
|
||||
if _action_lower.startswith("docker restart"):
|
||||
_tool = "docker_restart"
|
||||
@@ -3108,6 +3111,9 @@ class DecisionManager:
|
||||
elif _action_lower.startswith("systemctl restart"):
|
||||
_tool = "service_restart"
|
||||
_service = target
|
||||
elif _action_lower.startswith("ssh ") and ("ps aux" in _action_lower or "top" in _action_lower or "free" in _action_lower or "df -h" in _action_lower or "uptime" in _action_lower):
|
||||
# 主機診斷指令:自動收集 CPU/記憶體/磁碟,不修改任何狀態
|
||||
_tool = "ssh_diagnose"
|
||||
else:
|
||||
logger.info(
|
||||
"ssh_execute_unknown_action",
|
||||
@@ -3125,8 +3131,9 @@ class DecisionManager:
|
||||
params: dict = {"host": _host}
|
||||
if _tool == "docker_restart":
|
||||
params["container"] = _container
|
||||
else:
|
||||
elif _tool == "service_restart":
|
||||
params["service"] = _service
|
||||
# ssh_diagnose: 只需 host,無額外 params
|
||||
|
||||
try:
|
||||
result = await self._ssh.execute(tool_name=_tool, parameters=params)
|
||||
|
||||
@@ -110,6 +110,10 @@ data:
|
||||
# SSH_MCP_ENABLED=true 需確認 ssh-mcp-key Secret 已建立且 188 已加 authorized_keys
|
||||
SSH_MCP_ENABLED: "true"
|
||||
SSH_MCP_KNOWN_HOSTS_FILE: "/etc/ssh-mcp/known_hosts"
|
||||
# 2026-04-27 Claude Sonnet 4.6: 授權主機告警 SSH 自動執行白名單
|
||||
# 根因:SSH_MCP_ALLOWED_HOSTS 未設定 → _ssh_execute() 攔截 → 主機告警全部降級人工審核
|
||||
# 四台主機:110(DevOps金庫/wooo), 120(K3s-1/wooo), 121(K3s-2/wooo), 188(AI中心/ollama)
|
||||
SSH_MCP_ALLOWED_HOSTS: "192.168.0.110,192.168.0.120,192.168.0.121,192.168.0.188"
|
||||
# MCP Phase 3 (2026-04-11 Claude Sonnet 4.6): ArgoCD + Sentry MCP 啟用
|
||||
# ARGOCD_API_TOKEN 在 Secrets 中配置
|
||||
ARGOCD_MCP_ENABLED: "true"
|
||||
|
||||
Reference in New Issue
Block a user