fix(approval): route SSH actions through SSHProvider on manual approve

parse_operation_from_action only knew kubectl and Chinese restart phrases,
so any "ssh host '...'" action approved via Telegram fell through to
"Could not parse operation type" and reported a fake failure even though
the LLM had proposed a valid host repair.

Adds OperationType.SSH_HOST, makes the parser detect ssh prefixes (with
optional flags / user@host) before kubectl patterns, and routes the
SSH_HOST branch in approval_execution.execute_in_background through
SSHProvider with the same tool keywords decision_manager uses
(ssh_docker_prune / ssh_docker_restart / ssh_systemctl_restart /
ssh_diagnose). Unroutable SSH actions now fail loudly with a descriptive
error instead of silently breaking.

Trigger: 2026-05-02 incidents INC-20260502-D6D0B7 / E12EE4 / 557055
were approved by the user but executor reported "Could not parse" and
left the alerts pending.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Your Name
2026-05-02 11:59:50 +08:00
parent 3156ff1c69
commit 607358c4dd
5 changed files with 240 additions and 13 deletions

View File

@@ -295,11 +295,27 @@ class ApprovalExecutionService:
executor = get_executor()
attempt = 1 # 重試計數INVESTIGATE 路徑不進入重試迴圈,保持 1
# 2026-04-24 ogt + Claude Sonnet 4.6: Gate 11 修復 — INVESTIGATE 唯讀查詢
# 根因:INVESTIGATE 不在 executor.execute_with_audit 的 switch走 else → success=False
# 修法:偵測到 INVESTIGATE 類型,直接呼叫 execute_kubectl_command(approval.action)
# 唯讀指令無需重試迴圈(失敗即失敗,不會有 transient error 改善空間)
if operation_type == OperationType.INVESTIGATE:
# 2026-05-02 ogt + Claude Sonnet 4.6: 主機 SSH 操作分支
# 根因:手動批准 ssh action 時 parser 只懂 kubectl回 None → 「Could not parse」假失敗
# 修法:偵測到 SSH_HOST 類型,走 SSHProvider 而非 K8s executor
if operation_type == OperationType.SSH_HOST:
result = await self._execute_ssh_host_action(
approval=approval,
host=resource_name or "",
)
logger.info(
"background_execution_ssh_host",
approval_id=str(approval.id),
action=approval.action,
host=resource_name,
success=result.success,
message=result.message,
)
elif operation_type == OperationType.INVESTIGATE:
# 2026-04-24 ogt + Claude Sonnet 4.6: Gate 11 修復 — INVESTIGATE 唯讀查詢
# 根因INVESTIGATE 不在 executor.execute_with_audit 的 switch走 else → success=False
# 修法:偵測到 INVESTIGATE 類型,直接呼叫 execute_kubectl_command(approval.action)
# 唯讀指令無需重試迴圈(失敗即失敗,不會有 transient error 改善空間)
result = await executor.execute_kubectl_command(
command=approval.action,
timeout_sec=30,
@@ -594,6 +610,115 @@ class ApprovalExecutionService:
)
return False # K8s 執行失敗
async def _execute_ssh_host_action(
    self,
    approval: ApprovalRequest,
    host: str,
) -> "ExecutionResult":
    """
    Execute a manually approved SSH host action through SSHProvider.

    2026-05-02 ogt + Claude Sonnet 4.6: fix for manually approved SSH actions
    getting stuck.
    Root cause: parse_operation_from_action only understood kubectl, so the
    approval path handed SSH actions to the K8s executor, which rejected them.
    Fix: once SSH_HOST is detected, go through SSHProvider instead; routing is
    meant to stay aligned with decision_manager._ssh_execute.

    Action routing (first match wins):
    - "docker" together with "prune" (docker prune / image prune / volume
      prune)                                   -> ssh_docker_prune
    - "docker restart <name>"                  -> ssh_docker_restart
    - "systemctl restart <svc>"                -> ssh_systemctl_restart
    - "ps aux" / "df -h" / "free -h" / "top" / "uptime" / "echo" / "ls -..."
                                               -> ssh_diagnose
    - anything else: return a failure so the action can be rewritten.

    Diagnostic keywords are matched as whole commands, so "docker stop x",
    "systemctl stop nginx" or a host called "desktop" are no longer mistaken
    for "top" and silently reported as a successful diagnosis.

    Args:
        approval: The approved request; ``approval.action`` carries the raw
            "ssh ..." command text and ``approval.id`` is used for logging.
        host: Target host extracted by the parser. An empty host fails
            immediately instead of being sent to the provider.

    Returns:
        An ExecutionResult with operation_type SSH_HOST and namespace "host".
        ``success`` mirrors the provider result; unroutable actions and
        provider exceptions are reported as failures with ``error`` set.
    """
    import re

    from src.services.executor import ExecutionResult

    # Monotonic clock: a wall-clock adjustment must not skew duration_ms.
    started = time.monotonic()
    action = approval.action or ""
    action_lower = action.lower().strip()

    def _elapsed_ms() -> int:
        return int((time.monotonic() - started) * 1000)

    def _failure(message: str, error: str) -> "ExecutionResult":
        return ExecutionResult(
            success=False,
            message=message,
            operation_type=OperationType.SSH_HOST,
            target_resource=host,
            namespace="host",
            duration_ms=_elapsed_ms(),
            error=error,
        )

    if not host or not host.strip():
        # Without a target there is nothing the provider could connect to.
        logger.warning(
            "ssh_host_action_missing_host",
            approval_id=str(approval.id),
            action=action,
        )
        return _failure(
            "SSH action unrouted",
            f"SSH action has no target host: {action[:120]}",
        )

    # Route to an SSH MCP tool (aligned with decision_manager._ssh_execute).
    params: dict[str, object] = {"host": host}
    tool_name: str | None = None

    # Names must start with an alphanumeric character so that a CLI flag
    # ("docker restart -t 5 web") is never mistaken for the target name.
    name_group = r"([A-Za-z0-9][A-Za-z0-9._-]*)"
    # Whole-command match for read-only diagnostics; the lookbehind/lookahead
    # keep "stop", "desktop" or "top-server" from matching "top".
    diagnose_pattern = (
        r"(?<![\w-])"
        r"(?:ps\s+aux|df\s+-h|free\s+-h|top(?![\w.-])|uptime(?![\w.-])"
        r"|echo\s|ls\s+-)"
    )

    if "docker" in action_lower and "prune" in action_lower:
        tool_name = "ssh_docker_prune"
        params["trust_score"] = 0.85
    elif re.search(r"\bdocker\s+restart\b", action_lower):
        # Extract from the original text: container names are case-sensitive.
        found = re.search(
            r"\bdocker\s+restart\s+" + name_group, action, re.IGNORECASE
        )
        if found:
            tool_name = "ssh_docker_restart"
            params["container_name"] = found.group(1)
            params["trust_score"] = 0.85
        # No container name captured: leave tool_name as None (unrouted).
    elif re.search(r"\bsystemctl\s+restart\b", action_lower):
        # Unit names are case-sensitive as well, so use the original text.
        found = re.search(
            r"\bsystemctl\s+restart\s+" + name_group, action, re.IGNORECASE
        )
        if found:
            tool_name = "ssh_systemctl_restart"
            params["service"] = found.group(1)
            params["trust_score"] = 0.85
    elif re.search(diagnose_pattern, action_lower):
        # Host diagnostics are all served by the one-shot ssh_diagnose tool.
        tool_name = "ssh_diagnose"

    if tool_name is None:
        logger.warning(
            "ssh_host_action_unrouted",
            approval_id=str(approval.id),
            action=action,
            host=host,
        )
        return _failure(
            "SSH action unrouted",
            f"SSH action 無法路由到 SSH MCP tool: {action[:120]}",
        )

    # Call the SSH MCP provider.
    from src.plugins.mcp.providers.ssh_provider import SSHProvider

    provider = SSHProvider()
    try:
        mcp_result = await provider.execute(tool_name=tool_name, parameters=params)
        success = bool(mcp_result.success)
        return ExecutionResult(
            success=success,
            message=f"ssh_mcp:{tool_name} {'ok' if success else 'failed'}",
            operation_type=OperationType.SSH_HOST,
            target_resource=host,
            namespace="host",
            duration_ms=_elapsed_ms(),
            k8s_response={"tool": tool_name, "output": mcp_result.output} if success else None,
            error=None if success else (mcp_result.error or "ssh_mcp execution failed"),
        )
    except Exception as e:
        # Boundary handler: any provider failure becomes a failed result so
        # the approval flow can record it instead of crashing.
        logger.warning(
            "ssh_host_action_exception",
            approval_id=str(approval.id),
            tool=tool_name,
            error=str(e),
        )
        return _failure("ssh_mcp exception", str(e))
async def _push_execution_result_to_alert(
self,
approval: ApprovalRequest,

View File

@@ -49,6 +49,9 @@ class OperationType(str, Enum):
SCALE_DEPLOYMENT = "SCALE_DEPLOYMENT"
# 2026-04-24 ogt + Claude Sonnet 4.6: Gate 11 修復 — 唯讀查詢類型kubectl get/top/describe/logs
INVESTIGATE = "INVESTIGATE"
# 2026-05-02 ogt + Claude Sonnet 4.6: 主機 SSH 操作docker / systemctl / 診斷)
# 走 SSH MCP Provider 而非 K8s executor解 approval 後 SSH action 卡在「Could not parse」的 bug
SSH_HOST = "SSH_HOST"
# =============================================================================

View File

@@ -56,19 +56,14 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
- kubectl 指令: "kubectl delete pod nginx-xxx -n production"
- 英文自然語言: "Restart deployment api-backend"
- 中文自然語言: "重新啟動 awoooi-worker 服務"
- SSH 指令: "ssh 192.168.0.110 'docker prune ...'" / "ssh wooo@192.168.0.110 ..."
Examples:
"kubectl delete pod nginx-xxx -n production"
→ ParsedOperation(DELETE_POD, "nginx-xxx", "production")
"Restart deployment api-backend"
→ ParsedOperation(RESTART_DEPLOYMENT, "api-backend", "default")
"Scale deployment web-frontend to 5 replicas"
→ ParsedOperation(SCALE_DEPLOYMENT, "web-frontend", "default")
"重新啟動 awoooi-worker 服務"
→ ParsedOperation(RESTART_DEPLOYMENT, "awoooi-worker", "default")
"ssh 192.168.0.110 'docker image prune -a -f'"
→ ParsedOperation(SSH_HOST, "192.168.0.110", "host")
Args:
action: 操作指令字串
@@ -78,6 +73,19 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
"""
action_lower = action.lower()
# 2026-05-02 ogt + Claude Sonnet 4.6: SSH host 操作識別
# 根因approval_execution 把 SSH action 丟進 kubectl parser → 全部 None → 「Could not parse」
# 修法:第一順位偵測 ssh回 SSH_HOSTapproval_execution 走 SSHProvider
# 支援 "ssh host '...'" / "ssh user@host ..." / "ssh -o opt host ..."
ssh_match = re.search(
r"^\s*ssh\s+(?:-[a-zA-Z]\s*\S+\s+)*(?:[a-zA-Z][\w-]*@)?([a-zA-Z0-9][\w.-]*)",
action_lower,
)
if ssh_match:
host = ssh_match.group(1)
# namespace 欄位借用為「host_class」下游不必改
return ParsedOperation(OperationType.SSH_HOST, host, "host")
# 2026-04-24 ogt + Claude Sonnet 4.6: Gate 11 修復 — 唯讀指令識別INVESTIGATE
# 根因parse_operation_from_action 完全不認識 kubectl get/top/describe/logs → 回 None → 執行失敗
# 修法:優先匹配唯讀指令,回傳 OperationType.INVESTIGATE零衝擊blast_radius score=1

View File

@@ -0,0 +1,68 @@
"""
operation_parser SSH 識別測試
=============================
2026-05-02 ogt + Claude Sonnet 4.6: 修補手動批准 SSH action 卡死的 bug
驗證:
- ssh host '...' → SSH_HOST + host
- ssh user@host ... → SSH_HOST + host
- ssh -o opt host ... → SSH_HOST + host
- 既有 kubectl / 中文 / restart 路徑不被 SSH 規則誤抓
"""
from __future__ import annotations
from src.services.executor import OperationType
from src.services.operation_parser import parse_operation_from_action
class TestSSHRecognition:
    """Check that ``ssh ...`` actions parse to SSH_HOST with the right host."""

    def test_ssh_with_bare_ip(self):
        """A bare IP target is returned as the resource name."""
        parsed = parse_operation_from_action(
            "ssh 192.168.0.110 'docker image prune -a -f'"
        )
        assert (parsed.operation_type, parsed.resource_name) == (
            OperationType.SSH_HOST,
            "192.168.0.110",
        )

    def test_ssh_with_user_at_host(self):
        """The ``user@`` prefix is dropped; only the host is kept."""
        parsed = parse_operation_from_action("ssh wooo@192.168.0.110 'df -h'")
        assert (parsed.operation_type, parsed.resource_name) == (
            OperationType.SSH_HOST,
            "192.168.0.110",
        )

    def test_ssh_with_options(self):
        """A leading ``-o`` option does not hide the host."""
        command = (
            "ssh -o StrictHostKeyChecking=no wooo@192.168.0.188 'systemctl restart ollama'"
        )
        parsed = parse_operation_from_action(command)
        assert (parsed.operation_type, parsed.resource_name) == (
            OperationType.SSH_HOST,
            "192.168.0.188",
        )

    def test_ssh_with_named_alias(self):
        """An alias is treated as a host too."""
        # The original note says the backend host whitelist blocks unknown
        # aliases afterwards, so the parser itself does not filter them.
        parsed = parse_operation_from_action("ssh backup 'ls -lah /home/ollama/backup'")
        assert (parsed.operation_type, parsed.resource_name) == (
            OperationType.SSH_HOST,
            "backup",
        )

    def test_ssh_with_diagnostic_combo(self):
        """A multi-command diagnostic payload still resolves to the host."""
        command = (
            "ssh 192.168.0.110 'echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -15'"
        )
        parsed = parse_operation_from_action(command)
        assert (parsed.operation_type, parsed.resource_name) == (
            OperationType.SSH_HOST,
            "192.168.0.110",
        )
class TestSSHDoesNotBreakExistingRoutes:
    """Regression checks: the SSH rule must not capture non-SSH actions."""

    def test_kubectl_rollout_still_works(self):
        """``kubectl rollout restart`` keeps mapping to RESTART_DEPLOYMENT."""
        parsed = parse_operation_from_action(
            "kubectl rollout restart deployment/awoooi-api"
        )
        assert (parsed.operation_type, parsed.resource_name) == (
            OperationType.RESTART_DEPLOYMENT,
            "awoooi-api",
        )

    def test_chinese_restart_still_works(self):
        """The Chinese restart phrase keeps mapping to RESTART_DEPLOYMENT."""
        parsed = parse_operation_from_action("重新啟動 awoooi-worker 服務")
        assert (parsed.operation_type, parsed.resource_name) == (
            OperationType.RESTART_DEPLOYMENT,
            "awoooi-worker",
        )

    def test_kubectl_get_routes_to_investigate(self):
        """Read-only ``kubectl get`` keeps mapping to INVESTIGATE."""
        parsed = parse_operation_from_action("kubectl get pods -n awoooi-prod")
        assert parsed.operation_type == OperationType.INVESTIGATE

    def test_unrelated_text_still_returns_none(self):
        """Text that is no recognised command yields no operation type."""
        parsed = parse_operation_from_action("(未設)")
        assert parsed.operation_type is None

View File

@@ -6,6 +6,29 @@
---
## 2026-05-02 | 手動批准路徑 SSH action 解析修補
承接同日早上 docker prune 飛輪部署後,使用者反饋仍有 incident 點「批准」後執行失敗。AOL 顯示 `Could not parse operation type`,根因是 `parse_operation_from_action` 只懂 kubectl 與中文重啟,不認識 `ssh ...` action,所有 SSH 修復動作從 K8s executor 退場。
### 完成
- `OperationType` 新增 `SSH_HOST`,與 K8s 操作類型區隔。
- `parse_operation_from_action` 在所有 kubectl/中文 pattern 之前先匹配 `ssh [-flags] [user@]host ...`,回 `(SSH_HOST, host, "host")`
- `approval_execution.execute_in_background` 新增 `SSH_HOST` 分支,呼叫 `_execute_ssh_host_action`
- 含 docker prune → `ssh_docker_prune`(trust_score=0.85)
- docker restart → `ssh_docker_restart`
- systemctl restart → `ssh_systemctl_restart`
- 診斷類(ps aux / df -h / free -h / top / uptime / echo / ls)→ `ssh_diagnose`
- 其他:失敗並記 unrouted,避免靜默假成功
- 全部走 SSHProvider,沿用同一套 host 白名單 + trust_score 守衛。
### 驗證
- `pytest tests/test_operation_parser_ssh.py` → 9 passed。
- `pytest tests/ -k "operation_parser or action_parser or approval or executor or ssh"` → 160 passed, 2 skipped, 0 failed。
- `python3 -m py_compile` 三檔通過。
### 後續待辦
- f45598b5 + 本 commit 部署到 production 後,重新觸發 INC-20260502-D6D0B7 / E12EE4 / 557055 類事件,確認手動批准路徑能成功執行 SSH 動作。
## 2026-05-02 | Telegram 告警噴爆事故閉環 + Docker prune 飛輪補完
承接昨晚到今早 Telegram 告警量爆增(峰值 53/hr)事故。根因是 `ssh-mcp-key` Secret 的 `known_hosts` 欄位是 0 bytes 空檔,asyncssh 拒絕所有 SSH,導致 110 磁碟告警的 auto_repair 全部走「Host key is not trusted → emergency_channel → Telegram」路徑無限重試。