fix(mcp): harden ssh provider connection params

2026-05-06 21:51:38 +08:00
parent 150f17b219
commit 8396d37275
3 changed files with 84 additions and 5 deletions
--- a/apps/api/src/plugins/mcp/providers/ssh_provider.py
+++ b/apps/api/src/plugins/mcp/providers/ssh_provider.py
@@ -58,6 +58,7 @@ logger = structlog.get_logger(__name__)

 SSH_KEY_PATH = "/run/secrets/ssh_mcp_key"
 SSH_USER = "wooo"
+SSH_PORT = 22
 DEFAULT_HOST_USERS = {
    # AI/Web host is operated by the ollama account in the current topology.
    "192.168.0.188": "ollama",
@@ -104,6 +105,29 @@ def _validate_param(key: str, value: str) -> str:
    # tail / port / lines 由呼叫方 int() 轉換，不需字串白名單
    return value

+
+def _normalize_ssh_host(value: str) -> str:
+    """
+    Reduce a host label to a bare host name/address before it enters asyncssh.
+
+    Prometheus labels often arrive as ``192.168.0.110:9100``. That port is the
+    exporter port, not SSH, so a trailing numeric ``:port`` is dropped, as are
+    a leading ``ssh://``, a ``user@`` prefix and ``[...]`` brackets. This keeps
+    a string port from reaching asyncssh and failing with ``%d format``.
+    """
+    host = (value or "").strip()  # falsy input (None, "") becomes ""
+    if host.startswith("ssh://"):
+        host = host.removeprefix("ssh://")
+    if "@" in host:
+        host = host.rsplit("@", 1)[1]  # keep only what follows the last "@"
+    if host.startswith("[") and "]" in host:
+        return host[1:host.index("]")]  # bracketed form, e.g. "[::1]:22" -> "::1"
+    if host.count(":") == 1:  # exactly one colon: candidate "host:port"
+        maybe_host, maybe_port = host.rsplit(":", 1)
+        if maybe_port.isdigit():
+            return maybe_host
+    return host  # anything else (incl. multi-colon strings) passes through
+
 # 群組 A（只讀）
 GROUP_A_TOOLS = {
    "ssh_diagnose",
@@ -375,7 +399,7 @@ class SSHProvider(MCPToolProvider):
                error=f"Unknown tool: {tool_name}",
            )

-        host = parameters.get("host", "")
+        host = _normalize_ssh_host(str(parameters.get("host", "")))

        # 守衛 2: 允許的 host
        if host not in self._allowed_hosts():
@@ -604,7 +628,7 @@ class SSHProvider(MCPToolProvider):
            raise RuntimeError(
                "asyncssh is not installed. "
                "Add 'asyncssh' to pyproject.toml dependencies."
-            )
+            ) from None

        import os
        if not os.path.exists(SSH_KEY_PATH):
@@ -625,11 +649,13 @@ class SSHProvider(MCPToolProvider):

        async with asyncssh.connect(
            host,
+            port=SSH_PORT,
            username=username or SSH_USER,
            client_keys=[SSH_KEY_PATH],
            known_hosts=known_hosts_path,  # None = 跳過驗證（內網），或指定文件路徑
-            connect_timeout=timeout,
+            config=None,  # 禁止讀取使用者 ssh config，避免 Port 字串污染 asyncssh
+            connect_timeout=float(timeout),
        ) as conn:
            # Bug 根因：asyncssh 模組沒有頂層 run()；應呼叫 conn.run()（2026-04-24 Claude Sonnet 4.6）
-            result = await conn.run(cmd, timeout=timeout, check=False)
+            result = await conn.run(cmd, timeout=float(timeout), check=False)
            return (result.stdout or ""), (result.stderr or "")
--- a/apps/api/tests/test_ssh_provider_tools.py
+++ b/apps/api/tests/test_ssh_provider_tools.py
@@ -1,6 +1,6 @@
 import pytest

-from src.plugins.mcp.providers.ssh_provider import SSHProvider
+from src.plugins.mcp.providers.ssh_provider import SSHProvider, _normalize_ssh_host


@pytest.mark.asyncio
@@ -19,3 +19,37 @@ def test_ssh_provider_uses_ollama_user_for_188():

    assert provider._ssh_user_for_host("192.168.0.188") == "ollama"
    assert provider._ssh_user_for_host("192.168.0.110") == "wooo"
+
+
+@pytest.mark.parametrize(
+    "raw,expected",
+    [
+        ("192.168.0.110:9100", "192.168.0.110"),  # numeric port suffix dropped
+        ("wooo@192.168.0.110", "192.168.0.110"),  # user@ prefix dropped
+        ("ssh://wooo@192.168.0.110:22", "192.168.0.110"),  # scheme, user and port dropped
+        ("192.168.0.188", "192.168.0.188"),  # bare address left unchanged
+    ],
+)
+def test_normalize_ssh_host_strips_exporter_ports_and_users(raw, expected):  # one case per row above
+    assert _normalize_ssh_host(raw) == expected
+
+
+@pytest.mark.asyncio
+async def test_ssh_execute_normalizes_host_before_allowed_check(monkeypatch):
+    provider = SSHProvider()
+    captured = {}  # arguments the stubbed _ssh_exec receives
+
+    async def fake_ssh_exec(host, cmd, timeout, username=None):
+        captured["host"] = host
+        captured["timeout"] = timeout
+        captured["username"] = username
+        return "ok", ""
+
+    monkeypatch.setattr(provider, "_allowed_hosts", lambda: ["192.168.0.110"])  # allow-list holds only the port-less address
+    monkeypatch.setattr(provider, "_ssh_exec", fake_ssh_exec)  # stub so no real SSH connection is attempted
+
+    result = await provider.execute("ssh_diagnose", {"host": "192.168.0.110:9100"})  # label still carries the exporter port
+
+    assert result.success is True
+    assert captured["host"] == "192.168.0.110"  # the stub saw the normalized host
+    assert isinstance(captured["timeout"], int)  # execute() hands _ssh_exec an int timeout
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -1,3 +1,22 @@
+## 2026-05-06 | SSH MCP 連線參數硬化，修復 `%d format` 導致主機診斷全失敗
+
+**背景**：SRE 戰情室與 production log 顯示 host-layer MCP 工具（`ssh_get_top_processes`、`ssh_get_swap_info`、`ssh_diagnose` 等）全數失敗，錯誤為 `%d format: a real number is required, not str`。這讓主機告警無法取得感官證據，後續 AI 只能降級，並在 Telegram 中重複出現「AI 自動修復失敗，已升級人工介入」。
+
+**根因**：
+- 錯誤發生在 `asyncssh` 連線層，不是 Telegram formatter。
+- SSH Provider 未明確指定 SSH port，且未停用使用者 ssh config；若 host label 或 config 帶入字串型 port，`asyncssh` 會在內部 `%d` 格式化時拋出 `TypeError`（即上述 `%d format: a real number is required, not str` 錯誤）。
+- Prometheus `instance` 類 label 常見格式是 `192.168.0.110:9100`，該 port 是 exporter port，不是 SSH port。
+
+**本次修補**：
+- SSH Provider 新增 host 正規化，支援移除 `user@`、`ssh://` 與 `:9100` exporter port。
+- `asyncssh.connect()` 明確指定 `port=SSH_PORT`（22）、`config=None`、`connect_timeout=float(timeout)`；`conn.run()` 的 `timeout` 也同樣轉為 `float(timeout)`。
+- 新增 regression tests，鎖定 `192.168.0.110:9100` 會被正規化成 `192.168.0.110` 後才進入 provider 執行。
+
+**驗證**：
+- `python -m py_compile apps/api/src/plugins/mcp/providers/ssh_provider.py apps/api/tests/test_ssh_provider_tools.py`
+- `pytest tests/test_ssh_provider_tools.py tests/test_decision_manager_docker_prune_routing.py tests/test_operation_parser_ssh.py -q` → 20 passed。
+- `ruff check src/plugins/mcp/providers/ssh_provider.py tests/test_ssh_provider_tools.py` → All checks passed。
+
 ## 2026-05-06 | Incident 列表改回純讀，停止前端輪詢觸發 AI 推理

 **背景**：部署 AwoooP 首頁後，production log 顯示載入 `/zh-TW/awooop` 期間會打 `GET /api/v1/incidents`，接著出現 `phase24_ai_router_used provider=ollama` 與 GCP-A Ollama 推理耗時約 55 秒。這代表列表查詢仍會背景啟動 AI 決策，導致前端輪詢佔用 GCP Ollama 推理槽，極端情況下也可能 fallback 到 Gemini 產生成本。