From 607358c4dd835e0e3aa6a5e983bb4077e712f105 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Sat, 2 May 2026 11:59:50 +0800
Subject: [PATCH] fix(approval): route SSH actions through SSHProvider on manual approve

parse_operation_from_action only knew kubectl and Chinese restart phrases, so any "ssh host '...'" action approved via Telegram fell through to "Could not parse operation type" and reported a fake failure even though the LLM had proposed a valid host repair.

Adds OperationType.SSH_HOST, makes the parser detect ssh prefixes (with optional flags / user@host) before kubectl patterns, and routes the SSH_HOST branch in approval_execution.execute_in_background through SSHProvider with the same tool keywords decision_manager uses (ssh_docker_prune / ssh_docker_restart / ssh_systemctl_restart / ssh_diagnose). Unroutable SSH actions now fail loudly with a descriptive error instead of silently breaking.

Trigger: 2026-05-02 incidents INC-20260502-D6D0B7 / E12EE4 / 557055 were approved by the user but the executor reported "Could not parse" and left the alerts pending.

Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/services/approval_execution.py | 135 +++++++++++++++++++- apps/api/src/services/executor.py | 3 + apps/api/src/services/operation_parser.py | 24 ++-- apps/api/tests/test_operation_parser_ssh.py | 68 ++++++++++ docs/LOGBOOK.md | 23 ++++ 5 files changed, 240 insertions(+), 13 deletions(-) create mode 100644 apps/api/tests/test_operation_parser_ssh.py diff --git a/apps/api/src/services/approval_execution.py b/apps/api/src/services/approval_execution.py index f4141aa4..06b063b5 100644 --- a/apps/api/src/services/approval_execution.py +++ b/apps/api/src/services/approval_execution.py @@ -295,11 +295,27 @@ class ApprovalExecutionService: executor = get_executor() attempt = 1 # 重試計數(INVESTIGATE 路徑不進入重試迴圈,保持 1) - # 2026-04-24 ogt + Claude Sonnet 4.6: Gate 11 修復 — INVESTIGATE 唯讀查詢 - # 根因:INVESTIGATE 不在 executor.execute_with_audit 的 switch,走 else → success=False - # 修法:偵測到 INVESTIGATE 類型,直接呼叫 execute_kubectl_command(approval.action) - # 唯讀指令無需重試迴圈(失敗即失敗,不會有 transient error 改善空間) - if operation_type == OperationType.INVESTIGATE: + # 2026-05-02 ogt + Claude Sonnet 4.6: 主機 SSH 操作分支 + # 根因:手動批准 ssh action 時 parser 只懂 kubectl,回 None → 「Could not parse」假失敗 + # 修法:偵測到 SSH_HOST 類型,走 SSHProvider 而非 K8s executor + if operation_type == OperationType.SSH_HOST: + result = await self._execute_ssh_host_action( + approval=approval, + host=resource_name or "", + ) + logger.info( + "background_execution_ssh_host", + approval_id=str(approval.id), + action=approval.action, + host=resource_name, + success=result.success, + message=result.message, + ) + elif operation_type == OperationType.INVESTIGATE: + # 2026-04-24 ogt + Claude Sonnet 4.6: Gate 11 修復 — INVESTIGATE 唯讀查詢 + # 根因:INVESTIGATE 不在 executor.execute_with_audit 的 switch,走 else → success=False + # 修法:偵測到 INVESTIGATE 類型,直接呼叫 execute_kubectl_command(approval.action) + # 唯讀指令無需重試迴圈(失敗即失敗,不會有 transient error 改善空間) result = await executor.execute_kubectl_command( command=approval.action, 
timeout_sec=30, @@ -594,6 +610,115 @@ class ApprovalExecutionService: ) return False # K8s 執行失敗 + async def _execute_ssh_host_action( + self, + approval: ApprovalRequest, + host: str, + ) -> "ExecutionResult": + """ + 執行 SSH 主機 action(手動批准路徑專用) + + 2026-05-02 ogt + Claude Sonnet 4.6: 修補手動批准 SSH action 卡住的 bug + 根因:parse_operation_from_action 只懂 kubectl,approval_execution 走 K8s executor 拒收 + 修法:偵測 SSH_HOST 後改走 SSHProvider,行為與 decision_manager._ssh_execute 對齊 + + action 解析邏輯: + - "docker prune" / "docker image prune" / "docker volume prune" → ssh_docker_prune + - "docker restart " → ssh_docker_restart + - "systemctl restart " → ssh_systemctl_restart + - "ps aux" / "df -h" / "free -h" / "top" / "uptime" / 'echo' / 'ls -lah' → ssh_diagnose + - 其他:回傳失敗,提示 LLM 改寫 action + """ + from src.services.executor import ExecutionResult + + start = time.time() + action = approval.action or "" + action_lower = action.lower().strip() + + # 路由 SSH MCP tool(與 decision_manager._ssh_execute 對齊) + params: dict = {"host": host} + tool_name: str | None = None + + if "docker" in action_lower and "prune" in action_lower: + tool_name = "ssh_docker_prune" + params["trust_score"] = 0.85 + elif "docker restart" in action_lower: + tool_name = "ssh_docker_restart" + # 嘗試萃取 container name + import re as _re + m = _re.search(r"docker\s+restart\s+([a-z0-9._-]+)", action_lower) + if m: + params["container_name"] = m.group(1) + params["trust_score"] = 0.85 + else: + tool_name = None # 沒抓到 container 名稱,降級 + elif "systemctl restart" in action_lower: + tool_name = "ssh_systemctl_restart" + import re as _re + m = _re.search(r"systemctl\s+restart\s+([a-z0-9._-]+)", action_lower) + if m: + params["service"] = m.group(1) + params["trust_score"] = 0.85 + else: + tool_name = None + elif any(kw in action_lower for kw in ("ps aux", "df -h", "free -h", "top ", "uptime", "echo ", "ls -")): + # 主機診斷類(合 ssh_diagnose 一鍵收集) + tool_name = "ssh_diagnose" + + if tool_name is None: + duration_ms = int((time.time() - start) 
* 1000) + err = f"SSH action 無法路由到 SSH MCP tool: {action[:120]}" + logger.warning( + "ssh_host_action_unrouted", + approval_id=str(approval.id), + action=action, + host=host, + ) + return ExecutionResult( + success=False, + message="SSH action unrouted", + operation_type=OperationType.SSH_HOST, + target_resource=host, + namespace="host", + duration_ms=duration_ms, + error=err, + ) + + # 呼叫 SSH MCP Provider + from src.plugins.mcp.providers.ssh_provider import SSHProvider + provider = SSHProvider() + try: + mcp_result = await provider.execute(tool_name=tool_name, parameters=params) + duration_ms = int((time.time() - start) * 1000) + success = bool(mcp_result.success) + return ExecutionResult( + success=success, + message=f"ssh_mcp:{tool_name} {'ok' if success else 'failed'}", + operation_type=OperationType.SSH_HOST, + target_resource=host, + namespace="host", + duration_ms=duration_ms, + k8s_response={"tool": tool_name, "output": mcp_result.output} if success else None, + error=None if success else (mcp_result.error or "ssh_mcp execution failed"), + ) + except Exception as e: + duration_ms = int((time.time() - start) * 1000) + logger.warning( + "ssh_host_action_exception", + approval_id=str(approval.id), + tool=tool_name, + error=str(e), + ) + return ExecutionResult( + success=False, + message="ssh_mcp exception", + operation_type=OperationType.SSH_HOST, + target_resource=host, + namespace="host", + duration_ms=duration_ms, + error=str(e), + ) + async def _push_execution_result_to_alert( self, approval: ApprovalRequest, diff --git a/apps/api/src/services/executor.py b/apps/api/src/services/executor.py index 4104b08e..382308fe 100644 --- a/apps/api/src/services/executor.py +++ b/apps/api/src/services/executor.py @@ -49,6 +49,9 @@ class OperationType(str, Enum): SCALE_DEPLOYMENT = "SCALE_DEPLOYMENT" # 2026-04-24 ogt + Claude Sonnet 4.6: Gate 11 修復 — 唯讀查詢類型(kubectl get/top/describe/logs) INVESTIGATE = "INVESTIGATE" + # 2026-05-02 ogt + Claude Sonnet 4.6: 主機 SSH 
操作(docker / systemctl / 診斷) + # 走 SSH MCP Provider 而非 K8s executor;解 approval 後 SSH action 卡在「Could not parse」的 bug + SSH_HOST = "SSH_HOST" # ============================================================================= diff --git a/apps/api/src/services/operation_parser.py b/apps/api/src/services/operation_parser.py index a604a212..92685f42 100644 --- a/apps/api/src/services/operation_parser.py +++ b/apps/api/src/services/operation_parser.py @@ -56,19 +56,14 @@ def parse_operation_from_action(action: str) -> ParsedOperation: - kubectl 指令: "kubectl delete pod nginx-xxx -n production" - 英文自然語言: "Restart deployment api-backend" - 中文自然語言: "重新啟動 awoooi-worker 服務" + - SSH 指令: "ssh 192.168.0.110 'docker prune ...'" / "ssh wooo@192.168.0.110 ..." Examples: "kubectl delete pod nginx-xxx -n production" → ParsedOperation(DELETE_POD, "nginx-xxx", "production") - "Restart deployment api-backend" - → ParsedOperation(RESTART_DEPLOYMENT, "api-backend", "default") - - "Scale deployment web-frontend to 5 replicas" - → ParsedOperation(SCALE_DEPLOYMENT, "web-frontend", "default") - - "重新啟動 awoooi-worker 服務" - → ParsedOperation(RESTART_DEPLOYMENT, "awoooi-worker", "default") + "ssh 192.168.0.110 'docker image prune -a -f'" + → ParsedOperation(SSH_HOST, "192.168.0.110", "host") Args: action: 操作指令字串 @@ -78,6 +73,19 @@ def parse_operation_from_action(action: str) -> ParsedOperation: """ action_lower = action.lower() + # 2026-05-02 ogt + Claude Sonnet 4.6: SSH host 操作識別 + # 根因:approval_execution 把 SSH action 丟進 kubectl parser → 全部 None → 「Could not parse」 + # 修法:第一順位偵測 ssh,回 SSH_HOST,approval_execution 走 SSHProvider + # 支援 "ssh host '...'" / "ssh user@host ..." / "ssh -o opt host ..." 
+ ssh_match = re.search( + r"^\s*ssh\s+(?:-[a-zA-Z]\s*\S+\s+)*(?:[a-zA-Z][\w-]*@)?([a-zA-Z0-9][\w.-]*)", + action_lower, + ) + if ssh_match: + host = ssh_match.group(1) + # namespace 欄位借用為「host_class」,下游不必改 + return ParsedOperation(OperationType.SSH_HOST, host, "host") + # 2026-04-24 ogt + Claude Sonnet 4.6: Gate 11 修復 — 唯讀指令識別(INVESTIGATE) # 根因:parse_operation_from_action 完全不認識 kubectl get/top/describe/logs → 回 None → 執行失敗 # 修法:優先匹配唯讀指令,回傳 OperationType.INVESTIGATE(零衝擊,blast_radius score=1) diff --git a/apps/api/tests/test_operation_parser_ssh.py b/apps/api/tests/test_operation_parser_ssh.py new file mode 100644 index 00000000..d04337aa --- /dev/null +++ b/apps/api/tests/test_operation_parser_ssh.py @@ -0,0 +1,68 @@ +""" +operation_parser SSH 識別測試 +============================= +2026-05-02 ogt + Claude Sonnet 4.6: 修補手動批准 SSH action 卡死的 bug + +驗證: +- ssh host '...' → SSH_HOST + host +- ssh user@host ... → SSH_HOST + host +- ssh -o opt host ... → SSH_HOST + host +- 既有 kubectl / 中文 / restart 路徑不被 SSH 規則誤抓 +""" + +from __future__ import annotations + +from src.services.executor import OperationType +from src.services.operation_parser import parse_operation_from_action + + +class TestSSHRecognition: + def test_ssh_with_bare_ip(self): + p = parse_operation_from_action("ssh 192.168.0.110 'docker image prune -a -f'") + assert p.operation_type == OperationType.SSH_HOST + assert p.resource_name == "192.168.0.110" + + def test_ssh_with_user_at_host(self): + p = parse_operation_from_action("ssh wooo@192.168.0.110 'df -h'") + assert p.operation_type == OperationType.SSH_HOST + assert p.resource_name == "192.168.0.110" + + def test_ssh_with_options(self): + p = parse_operation_from_action( + "ssh -o StrictHostKeyChecking=no wooo@192.168.0.188 'systemctl restart ollama'" + ) + assert p.operation_type == OperationType.SSH_HOST + assert p.resource_name == "192.168.0.188" + + def test_ssh_with_named_alias(self): + # alias 也要當 host 處理(後端 host whitelist 會再擋) + p = 
parse_operation_from_action("ssh backup 'ls -lah /home/ollama/backup'") + assert p.operation_type == OperationType.SSH_HOST + assert p.resource_name == "backup" + + def test_ssh_with_diagnostic_combo(self): + p = parse_operation_from_action( + "ssh 192.168.0.110 'echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -15'" + ) + assert p.operation_type == OperationType.SSH_HOST + assert p.resource_name == "192.168.0.110" + + +class TestSSHDoesNotBreakExistingRoutes: + def test_kubectl_rollout_still_works(self): + p = parse_operation_from_action("kubectl rollout restart deployment/awoooi-api") + assert p.operation_type == OperationType.RESTART_DEPLOYMENT + assert p.resource_name == "awoooi-api" + + def test_chinese_restart_still_works(self): + p = parse_operation_from_action("重新啟動 awoooi-worker 服務") + assert p.operation_type == OperationType.RESTART_DEPLOYMENT + assert p.resource_name == "awoooi-worker" + + def test_kubectl_get_routes_to_investigate(self): + p = parse_operation_from_action("kubectl get pods -n awoooi-prod") + assert p.operation_type == OperationType.INVESTIGATE + + def test_unrelated_text_still_returns_none(self): + p = parse_operation_from_action("(未設)") + assert p.operation_type is None diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 56292502..cf39e60f 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -6,6 +6,29 @@ --- +## 2026-05-02 | 手動批准路徑 SSH action 解析修補 + +承接同日早上 docker prune 飛輪部署後,使用者反饋仍有 incident 點「批准」後執行失敗。AOL 顯示 `Could not parse operation type`,根因是 `parse_operation_from_action` 只懂 kubectl 與中文重啟,不認識 `ssh ...` action,所有 SSH 修復動作從 K8s executor 退場。 + +### 完成 +- `OperationType` 新增 `SSH_HOST`,與 K8s 操作類型區隔。 +- `parse_operation_from_action` 在所有 kubectl/中文 pattern 之前先匹配 `ssh [-flags] [user@]host ...`,回 `(SSH_HOST, host, "host")`。 +- `approval_execution.execute_in_background` 新增 `SSH_HOST` 分支,呼叫 `_execute_ssh_host_action`: + - 含 docker prune → `ssh_docker_prune`(trust_score=0.85) + - docker restart → `ssh_docker_restart` + - systemctl 
restart → `ssh_systemctl_restart` + - 診斷類(ps aux / df -h / free -h / top / uptime / echo / ls) → `ssh_diagnose` + - 其他:失敗並記 unrouted,避免靜默假成功 +- 全部走 SSHProvider,沿用同一套 host 白名單 + trust_score 守衛。 + +### 驗證 +- `pytest tests/test_operation_parser_ssh.py` → 9 passed。 +- `pytest tests/ -k "operation_parser or action_parser or approval or executor or ssh"` → 160 passed, 2 skipped, 0 failed。 +- `python3 -m py_compile` 三檔通過。 + +### 後續待辦 +- f45598b5 + 本 commit 部署到 production 後,重新觸發 INC-20260502-D6D0B7 / E12EE4 / 557055 類事件,確認手動批准路徑能成功執行 SSH 動作。 + ## 2026-05-02 | Telegram 告警噴爆事故閉環 + Docker prune 飛輪補完 承接昨晚到今早 Telegram 告警量爆增(峰值 53/hr)事故。根因是 `ssh-mcp-key` Secret 的 `known_hosts` 欄位是 0 bytes 空檔,asyncssh 拒絕所有 SSH,導致 110 磁碟告警的 auto_repair 全部走「Host key is not trusted → emergency_channel → Telegram」路徑無限重試。