diff --git a/apps/api/src/plugins/mcp/providers/ssh_provider.py b/apps/api/src/plugins/mcp/providers/ssh_provider.py index f21b1f8b..3dac99f4 100644 --- a/apps/api/src/plugins/mcp/providers/ssh_provider.py +++ b/apps/api/src/plugins/mcp/providers/ssh_provider.py @@ -58,6 +58,7 @@ logger = structlog.get_logger(__name__) SSH_KEY_PATH = "/run/secrets/ssh_mcp_key" SSH_USER = "wooo" +SSH_PORT = 22 DEFAULT_HOST_USERS = { # AI/Web host is operated by the ollama account in the current topology. "192.168.0.188": "ollama", @@ -104,6 +105,29 @@ def _validate_param(key: str, value: str) -> str: # tail / port / lines 由呼叫方 int() 轉換,不需字串白名單 return value + +def _normalize_ssh_host(value: str) -> str: + """ + Normalize host labels before they enter asyncssh. + + Prometheus labels often arrive as ``192.168.0.110:9100``. That port is the + exporter port, not SSH. The SSH provider must connect to the host on the + platform SSH port, otherwise asyncssh can receive a stringly port from + config/labels and fail with ``%d format`` before the tool even runs. + """ + host = (value or "").strip() + if host.startswith("ssh://"): + host = host.removeprefix("ssh://") + if "@" in host: + host = host.rsplit("@", 1)[1] + if host.startswith("[") and "]" in host: + return host[1:host.index("]")] + if host.count(":") == 1: + maybe_host, maybe_port = host.rsplit(":", 1) + if maybe_port.isdigit(): + return maybe_host + return host + # 群組 A(只讀) GROUP_A_TOOLS = { "ssh_diagnose", @@ -375,7 +399,7 @@ class SSHProvider(MCPToolProvider): error=f"Unknown tool: {tool_name}", ) - host = parameters.get("host", "") + host = _normalize_ssh_host(str(parameters.get("host", ""))) # 守衛 2: 允許的 host if host not in self._allowed_hosts(): @@ -604,7 +628,7 @@ class SSHProvider(MCPToolProvider): raise RuntimeError( "asyncssh is not installed. " "Add 'asyncssh' to pyproject.toml dependencies." - ) + ) from None import os if not os.path.exists(SSH_KEY_PATH): @@ -625,11 +649,13 @@ class SSHProvider(MCPToolProvider): async with asyncssh.connect( host, + port=SSH_PORT, username=username or SSH_USER, client_keys=[SSH_KEY_PATH], known_hosts=known_hosts_path, # None = 跳過驗證(內網),或指定文件路徑 - connect_timeout=timeout, + config=None, # 禁止讀取使用者 ssh config,避免 Port 字串污染 asyncssh + connect_timeout=float(timeout), ) as conn: # Bug 根因:asyncssh 模組沒有頂層 run();應呼叫 conn.run()(2026-04-24 Claude Sonnet 4.6) - result = await conn.run(cmd, timeout=timeout, check=False) + result = await conn.run(cmd, timeout=float(timeout), check=False) return (result.stdout or ""), (result.stderr or "") diff --git a/apps/api/tests/test_ssh_provider_tools.py b/apps/api/tests/test_ssh_provider_tools.py index 8f9de198..55f380df 100644 --- a/apps/api/tests/test_ssh_provider_tools.py +++ b/apps/api/tests/test_ssh_provider_tools.py @@ -1,6 +1,6 @@ import pytest -from src.plugins.mcp.providers.ssh_provider import SSHProvider +from src.plugins.mcp.providers.ssh_provider import SSHProvider, _normalize_ssh_host @pytest.mark.asyncio @@ -19,3 +19,37 @@ def test_ssh_provider_uses_ollama_user_for_188(): assert provider._ssh_user_for_host("192.168.0.188") == "ollama" assert provider._ssh_user_for_host("192.168.0.110") == "wooo" + + +@pytest.mark.parametrize( + "raw,expected", + [ + ("192.168.0.110:9100", "192.168.0.110"), + ("wooo@192.168.0.110", "192.168.0.110"), + ("ssh://wooo@192.168.0.110:22", "192.168.0.110"), + ("192.168.0.188", "192.168.0.188"), + ], +) +def test_normalize_ssh_host_strips_exporter_ports_and_users(raw, expected): + assert _normalize_ssh_host(raw) == expected + + +@pytest.mark.asyncio +async def test_ssh_execute_normalizes_host_before_allowed_check(monkeypatch): + provider = SSHProvider() + captured = {} + + async def fake_ssh_exec(host, cmd, timeout, username=None): + captured["host"] = host + captured["timeout"] = timeout + captured["username"] = username + return "ok", "" + + monkeypatch.setattr(provider, "_allowed_hosts", lambda: ["192.168.0.110"]) + monkeypatch.setattr(provider, "_ssh_exec", fake_ssh_exec) + + result = await provider.execute("ssh_diagnose", {"host": "192.168.0.110:9100"}) + + assert result.success is True + assert captured["host"] == "192.168.0.110" + assert isinstance(captured["timeout"], int) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 65948691..b6fd8120 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,22 @@ +## 2026-05-06 | SSH MCP 連線參數硬化,修復 `%d format` 導致主機診斷全失敗 + +**背景**:SRE 戰情室與 production log 顯示 host-layer MCP 工具(`ssh_get_top_processes`、`ssh_get_swap_info`、`ssh_diagnose` 等)全數失敗,錯誤為 `%d format: a real number is required, not str`。這讓主機告警無法取得感官證據,後續 AI 只能降級,並在 Telegram 中重複出現「AI 自動修復失敗,已升級人工介入」。 + +**根因**: +- 錯誤發生在 `asyncssh` 連線層,不是 Telegram formatter。 +- SSH Provider 未明確指定 SSH port,且未停用使用者 ssh config;若 host label 或 config 帶入字串型 port,`asyncssh` 會在內部 `%d` 格式化時爆炸。 +- Prometheus `instance` 類 label 常見格式是 `192.168.0.110:9100`,該 port 是 exporter port,不是 SSH port。 + +**本次修補**: +- SSH Provider 新增 host 正規化,支援移除 `user@`、`ssh://` 與 `:9100` exporter port。 +- `asyncssh.connect()` 明確指定 `port=22`、`config=None`、`connect_timeout=float(timeout)`。 +- 新增 regression tests,鎖定 `192.168.0.110:9100` 會被正規化成 `192.168.0.110` 後才進入 provider 執行。 + +**驗證**: +- `python -m py_compile apps/api/src/plugins/mcp/providers/ssh_provider.py apps/api/tests/test_ssh_provider_tools.py` +- `pytest tests/test_ssh_provider_tools.py tests/test_decision_manager_docker_prune_routing.py tests/test_operation_parser_ssh.py -q` → 20 passed。 +- `ruff check src/plugins/mcp/providers/ssh_provider.py tests/test_ssh_provider_tools.py` → All checks passed。 + ## 2026-05-06 | Incident 列表改回純讀,停止前端輪詢觸發 AI 推理 **背景**:部署 AwoooP 首頁後,production log 顯示載入 `/zh-TW/awooop` 期間會打 `GET /api/v1/incidents`,接著出現 `phase24_ai_router_used provider=ollama` 與 GCP-A Ollama 推理耗時約 55 秒。這代表列表查詢仍會背景啟動 AI 決策,導致前端輪詢佔用 GCP Ollama 推理槽,極端情況下也可能 fallback 到 Gemini 產生成本。