fix(approval): route SSH actions through SSHProvider on manual approve
parse_operation_from_action only knew kubectl and Chinese restart phrases, so any "ssh host '...'" action approved via Telegram fell through to "Could not parse operation type" and reported a fake failure even though the LLM had proposed a valid host repair.

This commit adds OperationType.SSH_HOST, makes the parser detect ssh prefixes (with optional flags / user@host) before the kubectl patterns, and routes the SSH_HOST branch in approval_execution.execute_in_background through SSHProvider with the same tool keywords decision_manager uses (ssh_docker_prune / ssh_docker_restart / ssh_systemctl_restart / ssh_diagnose). Unroutable SSH actions now fail loudly with a descriptive error instead of silently breaking.

Trigger: the 2026-05-02 incidents INC-20260502-D6D0B7 / E12EE4 / 557055 were approved by the user, but the executor reported "Could not parse" and left the alerts pending.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -295,11 +295,27 @@ class ApprovalExecutionService:
|
||||
executor = get_executor()
|
||||
attempt = 1 # 重試計數(INVESTIGATE 路徑不進入重試迴圈,保持 1)
|
||||
|
||||
# 2026-04-24 ogt + Claude Sonnet 4.6: Gate 11 修復 — INVESTIGATE 唯讀查詢
|
||||
# 根因:INVESTIGATE 不在 executor.execute_with_audit 的 switch,走 else → success=False
|
||||
# 修法:偵測到 INVESTIGATE 類型,直接呼叫 execute_kubectl_command(approval.action)
|
||||
# 唯讀指令無需重試迴圈(失敗即失敗,不會有 transient error 改善空間)
|
||||
if operation_type == OperationType.INVESTIGATE:
|
||||
# 2026-05-02 ogt + Claude Sonnet 4.6: 主機 SSH 操作分支
|
||||
# 根因:手動批准 ssh action 時 parser 只懂 kubectl,回 None → 「Could not parse」假失敗
|
||||
# 修法:偵測到 SSH_HOST 類型,走 SSHProvider 而非 K8s executor
|
||||
if operation_type == OperationType.SSH_HOST:
|
||||
result = await self._execute_ssh_host_action(
|
||||
approval=approval,
|
||||
host=resource_name or "",
|
||||
)
|
||||
logger.info(
|
||||
"background_execution_ssh_host",
|
||||
approval_id=str(approval.id),
|
||||
action=approval.action,
|
||||
host=resource_name,
|
||||
success=result.success,
|
||||
message=result.message,
|
||||
)
|
||||
elif operation_type == OperationType.INVESTIGATE:
|
||||
# 2026-04-24 ogt + Claude Sonnet 4.6: Gate 11 修復 — INVESTIGATE 唯讀查詢
|
||||
# 根因:INVESTIGATE 不在 executor.execute_with_audit 的 switch,走 else → success=False
|
||||
# 修法:偵測到 INVESTIGATE 類型,直接呼叫 execute_kubectl_command(approval.action)
|
||||
# 唯讀指令無需重試迴圈(失敗即失敗,不會有 transient error 改善空間)
|
||||
result = await executor.execute_kubectl_command(
|
||||
command=approval.action,
|
||||
timeout_sec=30,
|
||||
@@ -594,6 +610,115 @@ class ApprovalExecutionService:
|
||||
)
|
||||
return False # K8s 執行失敗
|
||||
|
||||
async def _execute_ssh_host_action(
    self,
    approval: ApprovalRequest,
    host: str,
) -> "ExecutionResult":
    """
    Execute an approved SSH host action through SSHProvider.

    Used only by the manual-approval path. The approved action text (for
    example ``ssh 192.168.0.110 'docker image prune -a -f'``) is mapped onto
    one SSH MCP tool; the routing is intended to mirror
    decision_manager._ssh_execute.

    Routing rules, evaluated in this order:
      - standalone ``docker`` and ``prune`` tokens  -> ssh_docker_prune
      - ``docker restart <name>``                   -> ssh_docker_restart
      - ``systemctl restart <unit>``                -> ssh_systemctl_restart
      - ps aux / df -h / free -h / top / uptime / echo / ls -  -> ssh_diagnose
      - anything else: a failed result, so the action can be rewritten

    Args:
        approval: The approved request; only ``id`` and ``action`` are read.
        host: Target host as extracted by the operation parser.

    Returns:
        An ExecutionResult with ``operation_type=SSH_HOST`` and
        ``namespace="host"``. Missing hosts, unroutable actions, provider
        failures and provider exceptions are all reported as
        ``success=False`` with ``error`` populated; nothing is raised for
        those cases.
    """
    # Function-scope imports are kept from the original implementation
    # (presumably to avoid import cycles -- TODO confirm).
    import re as _re

    from src.services.executor import ExecutionResult

    # monotonic() is immune to wall-clock adjustments while measuring durations.
    started = time.monotonic()
    action = approval.action or ""
    action_lower = action.lower().strip()
    # Trust score forwarded to SSHProvider for the mutating tools
    # (value unchanged from the original implementation).
    approved_trust_score = 0.85

    def _elapsed_ms() -> int:
        """Milliseconds elapsed since this method started."""
        return int((time.monotonic() - started) * 1000)

    def _failed(message: str, error: str) -> "ExecutionResult":
        """Build the failure result shared by every error path."""
        return ExecutionResult(
            success=False,
            message=message,
            operation_type=OperationType.SSH_HOST,
            target_resource=host,
            namespace="host",
            duration_ms=_elapsed_ms(),
            error=error,
        )

    def _mentions(word: str) -> bool:
        """True when `word` appears as a standalone token in the action.

        Tokens glued to a name by letters, digits, '_', '.' or '-' do not
        count, so hosts or containers called e.g. "docker-host" or
        "prune-job" cannot trigger a route by accident.
        """
        pattern = rf"(?<![\w.-]){_re.escape(word)}(?![\w.-])"
        return _re.search(pattern, action_lower) is not None

    # The caller passes `resource_name or ""`; never hand an empty host on.
    if not host:
        logger.warning(
            "ssh_host_action_missing_host",
            approval_id=str(approval.id),
            action=action,
        )
        return _failed(
            "SSH host missing",
            f"SSH action has no target host: {action[:120]}",
        )

    # Route to an SSH MCP tool.
    params: dict = {"host": host}
    tool_name: str | None = None

    if _mentions("docker") and _mentions("prune"):
        tool_name = "ssh_docker_prune"
        params["trust_score"] = approved_trust_score
    elif "docker restart" in action_lower:
        # Match against the ORIGINAL action: container names are
        # case-sensitive. The first character must be alphanumeric so a CLI
        # flag ("docker restart -t 10 x") is never taken for a name; such
        # actions fall through to the unrouted failure below.
        m = _re.search(
            r"docker\s+restart\s+([A-Za-z0-9][A-Za-z0-9_.-]*)",
            action,
            _re.IGNORECASE,
        )
        if m:
            tool_name = "ssh_docker_restart"
            params["container_name"] = m.group(1)
            params["trust_score"] = approved_trust_score
    elif "systemctl restart" in action_lower:
        # Unit names are case-sensitive and may be template instances
        # ("getty@tty1"), so '@' and ':' are part of the name.
        m = _re.search(
            r"systemctl\s+restart\s+([A-Za-z0-9][A-Za-z0-9_.@:-]*)",
            action,
            _re.IGNORECASE,
        )
        if m:
            tool_name = "ssh_systemctl_restart"
            params["service"] = m.group(1)
            params["trust_score"] = approved_trust_score
    elif _re.search(
        r"(?<![\w-])(?:ps aux|df -h|free -h|top |uptime|echo |ls -)",
        action_lower,
    ):
        # Host diagnostics, collected in one shot by ssh_diagnose. The
        # look-behind stops "top " from matching inside "stop ".
        # NOTE(review): any action containing a standalone "echo " still
        # lands here, even when it also contains other commands.
        tool_name = "ssh_diagnose"

    if tool_name is None:
        err = f"SSH action 無法路由到 SSH MCP tool: {action[:120]}"
        logger.warning(
            "ssh_host_action_unrouted",
            approval_id=str(approval.id),
            action=action,
            host=host,
        )
        return _failed("SSH action unrouted", err)

    # Call the SSH MCP provider.
    from src.plugins.mcp.providers.ssh_provider import SSHProvider

    try:
        # Constructed inside the try so a provider setup error is reported
        # as a failed execution instead of escaping the background task.
        provider = SSHProvider()
        mcp_result = await provider.execute(tool_name=tool_name, parameters=params)
    except Exception as e:
        # Boundary handler: every provider error becomes a failure result.
        logger.warning(
            "ssh_host_action_exception",
            approval_id=str(approval.id),
            tool=tool_name,
            error=str(e),
        )
        return _failed("ssh_mcp exception", str(e))

    success = bool(mcp_result.success)
    return ExecutionResult(
        success=success,
        message=f"ssh_mcp:{tool_name} {'ok' if success else 'failed'}",
        operation_type=OperationType.SSH_HOST,
        target_resource=host,
        namespace="host",
        duration_ms=_elapsed_ms(),
        k8s_response={"tool": tool_name, "output": mcp_result.output} if success else None,
        error=None if success else (mcp_result.error or "ssh_mcp execution failed"),
    )
|
||||
|
||||
async def _push_execution_result_to_alert(
|
||||
self,
|
||||
approval: ApprovalRequest,
|
||||
|
||||
@@ -49,6 +49,9 @@ class OperationType(str, Enum):
|
||||
SCALE_DEPLOYMENT = "SCALE_DEPLOYMENT"
|
||||
# 2026-04-24 ogt + Claude Sonnet 4.6: Gate 11 修復 — 唯讀查詢類型(kubectl get/top/describe/logs)
|
||||
INVESTIGATE = "INVESTIGATE"
|
||||
# 2026-05-02 ogt + Claude Sonnet 4.6: 主機 SSH 操作(docker / systemctl / 診斷)
|
||||
# 走 SSH MCP Provider 而非 K8s executor;解 approval 後 SSH action 卡在「Could not parse」的 bug
|
||||
SSH_HOST = "SSH_HOST"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
|
||||
@@ -56,19 +56,14 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
|
||||
- kubectl 指令: "kubectl delete pod nginx-xxx -n production"
|
||||
- 英文自然語言: "Restart deployment api-backend"
|
||||
- 中文自然語言: "重新啟動 awoooi-worker 服務"
|
||||
- SSH 指令: "ssh 192.168.0.110 'docker prune ...'" / "ssh wooo@192.168.0.110 ..."
|
||||
|
||||
Examples:
|
||||
"kubectl delete pod nginx-xxx -n production"
|
||||
→ ParsedOperation(DELETE_POD, "nginx-xxx", "production")
|
||||
|
||||
"Restart deployment api-backend"
|
||||
→ ParsedOperation(RESTART_DEPLOYMENT, "api-backend", "default")
|
||||
|
||||
"Scale deployment web-frontend to 5 replicas"
|
||||
→ ParsedOperation(SCALE_DEPLOYMENT, "web-frontend", "default")
|
||||
|
||||
"重新啟動 awoooi-worker 服務"
|
||||
→ ParsedOperation(RESTART_DEPLOYMENT, "awoooi-worker", "default")
|
||||
"ssh 192.168.0.110 'docker image prune -a -f'"
|
||||
→ ParsedOperation(SSH_HOST, "192.168.0.110", "host")
|
||||
|
||||
Args:
|
||||
action: 操作指令字串
|
||||
@@ -78,6 +73,19 @@ def parse_operation_from_action(action: str) -> ParsedOperation:
|
||||
"""
|
||||
action_lower = action.lower()
|
||||
|
||||
# 2026-05-02 ogt + Claude Sonnet 4.6: SSH host 操作識別
|
||||
# 根因:approval_execution 把 SSH action 丟進 kubectl parser → 全部 None → 「Could not parse」
|
||||
# 修法:第一順位偵測 ssh,回 SSH_HOST,approval_execution 走 SSHProvider
|
||||
# 支援 "ssh host '...'" / "ssh user@host ..." / "ssh -o opt host ..."
|
||||
ssh_match = re.search(
|
||||
r"^\s*ssh\s+(?:-[a-zA-Z]\s*\S+\s+)*(?:[a-zA-Z][\w-]*@)?([a-zA-Z0-9][\w.-]*)",
|
||||
action_lower,
|
||||
)
|
||||
if ssh_match:
|
||||
host = ssh_match.group(1)
|
||||
# namespace 欄位借用為「host_class」,下游不必改
|
||||
return ParsedOperation(OperationType.SSH_HOST, host, "host")
|
||||
|
||||
# 2026-04-24 ogt + Claude Sonnet 4.6: Gate 11 修復 — 唯讀指令識別(INVESTIGATE)
|
||||
# 根因:parse_operation_from_action 完全不認識 kubectl get/top/describe/logs → 回 None → 執行失敗
|
||||
# 修法:優先匹配唯讀指令,回傳 OperationType.INVESTIGATE(零衝擊,blast_radius score=1)
|
||||
|
||||
68
apps/api/tests/test_operation_parser_ssh.py
Normal file
68
apps/api/tests/test_operation_parser_ssh.py
Normal file
@@ -0,0 +1,68 @@
|
||||
"""
|
||||
operation_parser SSH 識別測試
|
||||
=============================
|
||||
2026-05-02 ogt + Claude Sonnet 4.6: 修補手動批准 SSH action 卡死的 bug
|
||||
|
||||
驗證:
|
||||
- ssh host '...' → SSH_HOST + host
|
||||
- ssh user@host ... → SSH_HOST + host
|
||||
- ssh -o opt host ... → SSH_HOST + host
|
||||
- 既有 kubectl / 中文 / restart 路徑不被 SSH 規則誤抓
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from src.services.executor import OperationType
|
||||
from src.services.operation_parser import parse_operation_from_action
|
||||
|
||||
|
||||
class TestSSHRecognition:
    """Actions starting with ``ssh`` must parse to SSH_HOST plus the target host."""

    @staticmethod
    def _assert_ssh_host(action: str, expected_host: str) -> None:
        """Parse `action` and check both the operation type and the host."""
        parsed = parse_operation_from_action(action)
        assert parsed.operation_type == OperationType.SSH_HOST
        assert parsed.resource_name == expected_host

    def test_ssh_with_bare_ip(self):
        """A bare IP address is taken as the host."""
        self._assert_ssh_host(
            "ssh 192.168.0.110 'docker image prune -a -f'",
            "192.168.0.110",
        )

    def test_ssh_with_user_at_host(self):
        """The ``user@`` prefix is dropped from the host."""
        self._assert_ssh_host("ssh wooo@192.168.0.110 'df -h'", "192.168.0.110")

    def test_ssh_with_options(self):
        """``-o`` options before the host are skipped."""
        self._assert_ssh_host(
            "ssh -o StrictHostKeyChecking=no wooo@192.168.0.188 'systemctl restart ollama'",
            "192.168.0.188",
        )

    def test_ssh_with_named_alias(self):
        """A named alias also counts as a host (the backend host whitelist filters later)."""
        self._assert_ssh_host("ssh backup 'ls -lah /home/ollama/backup'", "backup")

    def test_ssh_with_diagnostic_combo(self):
        """A compound diagnostic command still resolves to the same host."""
        self._assert_ssh_host(
            "ssh 192.168.0.110 'echo \"=== CPU TOP ===\"; ps aux --sort=-%cpu | head -15'",
            "192.168.0.110",
        )
|
||||
|
||||
|
||||
class TestSSHDoesNotBreakExistingRoutes:
    """The SSH rule must not capture kubectl, Chinese, or unrelated actions."""

    def test_kubectl_rollout_still_works(self):
        """``kubectl rollout restart`` keeps mapping to RESTART_DEPLOYMENT."""
        parsed = parse_operation_from_action(
            "kubectl rollout restart deployment/awoooi-api"
        )
        assert parsed.operation_type == OperationType.RESTART_DEPLOYMENT
        assert parsed.resource_name == "awoooi-api"

    def test_chinese_restart_still_works(self):
        """The Chinese restart phrase keeps mapping to RESTART_DEPLOYMENT."""
        parsed = parse_operation_from_action("重新啟動 awoooi-worker 服務")
        assert parsed.operation_type == OperationType.RESTART_DEPLOYMENT
        assert parsed.resource_name == "awoooi-worker"

    def test_kubectl_get_routes_to_investigate(self):
        """Read-only kubectl commands keep mapping to INVESTIGATE."""
        parsed = parse_operation_from_action("kubectl get pods -n awoooi-prod")
        assert parsed.operation_type == OperationType.INVESTIGATE

    def test_unrelated_text_still_returns_none(self):
        """Text that matches no pattern keeps an operation type of None."""
        parsed = parse_operation_from_action("(未設)")
        assert parsed.operation_type is None
|
||||
@@ -6,6 +6,29 @@
|
||||
|
||||
---
|
||||
|
||||
## 2026-05-02 | 手動批准路徑 SSH action 解析修補
|
||||
|
||||
承接同日早上 docker prune 飛輪部署後,使用者反饋仍有 incident 點「批准」後執行失敗。AOL 顯示 `Could not parse operation type`,根因是 `parse_operation_from_action` 只懂 kubectl 與中文重啟,不認識 `ssh ...` action,所有 SSH 修復動作從 K8s executor 退場。
|
||||
|
||||
### 完成
|
||||
- `OperationType` 新增 `SSH_HOST`,與 K8s 操作類型區隔。
|
||||
- `parse_operation_from_action` 在所有 kubectl/中文 pattern 之前先匹配 `ssh [-flags] [user@]host ...`,回 `(SSH_HOST, host, "host")`。
|
||||
- `approval_execution.execute_in_background` 新增 `SSH_HOST` 分支,呼叫 `_execute_ssh_host_action`:
|
||||
- 含 docker prune → `ssh_docker_prune`(trust_score=0.85)
|
||||
- docker restart → `ssh_docker_restart`
|
||||
- systemctl restart → `ssh_systemctl_restart`
|
||||
- 診斷類(ps aux / df -h / free -h / top / uptime / echo / ls) → `ssh_diagnose`
|
||||
- 其他:失敗並記 unrouted,避免靜默假成功
|
||||
- 全部走 SSHProvider,沿用同一套 host 白名單 + trust_score 守衛。
|
||||
|
||||
### 驗證
|
||||
- `pytest tests/test_operation_parser_ssh.py` → 9 passed。
|
||||
- `pytest tests/ -k "operation_parser or action_parser or approval or executor or ssh"` → 160 passed, 2 skipped, 0 failed。
|
||||
- `python3 -m py_compile` 三檔通過。
|
||||
|
||||
### 後續待辦
|
||||
- f45598b5 + 本 commit 部署到 production 後,重新觸發 INC-20260502-D6D0B7 / E12EE4 / 557055 類事件,確認手動批准路徑能成功執行 SSH 動作。
|
||||
|
||||
## 2026-05-02 | Telegram 告警噴爆事故閉環 + Docker prune 飛輪補完
|
||||
|
||||
承接昨晚到今早 Telegram 告警量爆增(峰值 53/hr)事故。根因是 `ssh-mcp-key` Secret 的 `known_hosts` 欄位是 0 bytes 空檔,asyncssh 拒絕所有 SSH,導致 110 磁碟告警的 auto_repair 全部走「Host key is not trusted → emergency_channel → Telegram」路徑無限重試。
|
||||
|
||||
Reference in New Issue
Block a user