fix(mcp): harden ssh provider connection params
This commit is contained in:
@@ -58,6 +58,7 @@ logger = structlog.get_logger(__name__)
|
||||
|
||||
SSH_KEY_PATH = "/run/secrets/ssh_mcp_key"
|
||||
SSH_USER = "wooo"
|
||||
SSH_PORT = 22
|
||||
DEFAULT_HOST_USERS = {
|
||||
# AI/Web host is operated by the ollama account in the current topology.
|
||||
"192.168.0.188": "ollama",
|
||||
@@ -104,6 +105,29 @@ def _validate_param(key: str, value: str) -> str:
|
||||
# tail / port / lines 由呼叫方 int() 轉換,不需字串白名單
|
||||
return value
|
||||
|
||||
|
||||
def _normalize_ssh_host(value: str) -> str:
|
||||
"""
|
||||
Normalize host labels before they enter asyncssh.
|
||||
|
||||
Prometheus labels often arrive as ``192.168.0.110:9100``. That port is the
|
||||
exporter port, not SSH. The SSH provider must connect to the host on the
|
||||
platform SSH port, otherwise asyncssh can receive a stringly port from
|
||||
config/labels and fail with ``%d format`` before the tool even runs.
|
||||
"""
|
||||
host = (value or "").strip()
|
||||
if host.startswith("ssh://"):
|
||||
host = host.removeprefix("ssh://")
|
||||
if "@" in host:
|
||||
host = host.rsplit("@", 1)[1]
|
||||
if host.startswith("[") and "]" in host:
|
||||
return host[1:host.index("]")]
|
||||
if host.count(":") == 1:
|
||||
maybe_host, maybe_port = host.rsplit(":", 1)
|
||||
if maybe_port.isdigit():
|
||||
return maybe_host
|
||||
return host
|
||||
|
||||
# 群組 A(只讀)
|
||||
GROUP_A_TOOLS = {
|
||||
"ssh_diagnose",
|
||||
@@ -375,7 +399,7 @@ class SSHProvider(MCPToolProvider):
|
||||
error=f"Unknown tool: {tool_name}",
|
||||
)
|
||||
|
||||
host = parameters.get("host", "")
|
||||
host = _normalize_ssh_host(str(parameters.get("host", "")))
|
||||
|
||||
# 守衛 2: 允許的 host
|
||||
if host not in self._allowed_hosts():
|
||||
@@ -604,7 +628,7 @@ class SSHProvider(MCPToolProvider):
|
||||
raise RuntimeError(
|
||||
"asyncssh is not installed. "
|
||||
"Add 'asyncssh' to pyproject.toml dependencies."
|
||||
)
|
||||
) from None
|
||||
|
||||
import os
|
||||
if not os.path.exists(SSH_KEY_PATH):
|
||||
@@ -625,11 +649,13 @@ class SSHProvider(MCPToolProvider):
|
||||
|
||||
async with asyncssh.connect(
|
||||
host,
|
||||
port=SSH_PORT,
|
||||
username=username or SSH_USER,
|
||||
client_keys=[SSH_KEY_PATH],
|
||||
known_hosts=known_hosts_path, # None = 跳過驗證(內網),或指定文件路徑
|
||||
connect_timeout=timeout,
|
||||
config=None, # 禁止讀取使用者 ssh config,避免 Port 字串污染 asyncssh
|
||||
connect_timeout=float(timeout),
|
||||
) as conn:
|
||||
# Bug 根因:asyncssh 模組沒有頂層 run();應呼叫 conn.run()(2026-04-24 Claude Sonnet 4.6)
|
||||
result = await conn.run(cmd, timeout=timeout, check=False)
|
||||
result = await conn.run(cmd, timeout=float(timeout), check=False)
|
||||
return (result.stdout or ""), (result.stderr or "")
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import pytest
|
||||
|
||||
from src.plugins.mcp.providers.ssh_provider import SSHProvider
|
||||
from src.plugins.mcp.providers.ssh_provider import SSHProvider, _normalize_ssh_host
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -19,3 +19,37 @@ def test_ssh_provider_uses_ollama_user_for_188():
|
||||
|
||||
assert provider._ssh_user_for_host("192.168.0.188") == "ollama"
|
||||
assert provider._ssh_user_for_host("192.168.0.110") == "wooo"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "raw,expected",
    [
        # Prometheus-style ``instance`` label carrying the exporter port.
        ("192.168.0.110:9100", "192.168.0.110"),
        # ``user@host`` form.
        ("wooo@192.168.0.110", "192.168.0.110"),
        # Full ssh:// URL with user and port.
        ("ssh://wooo@192.168.0.110:22", "192.168.0.110"),
        # Already-bare host passes through unchanged.
        ("192.168.0.188", "192.168.0.188"),
        # Surrounding whitespace is stripped.
        ("  192.168.0.110  ", "192.168.0.110"),
        # Bracketed IPv6 literal with a port.
        ("[fe80::1]:22", "fe80::1"),
        # Bare IPv6 literal: its colons must not be mistaken for a port.
        ("fe80::1", "fe80::1"),
        # A non-numeric suffix is not treated as a port.
        ("host:abc", "host:abc"),
    ],
)
def test_normalize_ssh_host_strips_exporter_ports_and_users(raw, expected):
    """Each raw host label reduces to the bare host that asyncssh should receive."""
    assert _normalize_ssh_host(raw) == expected
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_ssh_execute_normalizes_host_before_allowed_check(monkeypatch):
    """An exporter-style ``host:port`` label must pass the allow-list as the bare host."""
    ssh_provider = SSHProvider()
    seen = {}

    async def fake_ssh_exec(host, cmd, timeout, username=None):
        # Record what the provider hands to the SSH layer instead of connecting.
        seen.update(host=host, timeout=timeout, username=username)
        return "ok", ""

    # Only the bare IP is allowed, so the call succeeds only if normalization
    # happens before the allow-list guard.
    monkeypatch.setattr(ssh_provider, "_allowed_hosts", lambda: ["192.168.0.110"])
    monkeypatch.setattr(ssh_provider, "_ssh_exec", fake_ssh_exec)

    outcome = await ssh_provider.execute(
        "ssh_diagnose",
        {"host": "192.168.0.110:9100"},
    )

    assert outcome.success is True
    assert seen["host"] == "192.168.0.110"
    assert isinstance(seen["timeout"], int)
|
||||
|
||||
@@ -1,3 +1,22 @@
|
||||
## 2026-05-06 | SSH MCP 連線參數硬化,修復 `%d format` 導致主機診斷全失敗
|
||||
|
||||
**背景**:SRE 戰情室與 production log 顯示 host-layer MCP 工具(`ssh_get_top_processes`、`ssh_get_swap_info`、`ssh_diagnose` 等)全數失敗,錯誤為 `%d format: a real number is required, not str`。這讓主機告警無法取得感官證據,後續 AI 只能降級,並在 Telegram 中重複出現「AI 自動修復失敗,已升級人工介入」。
|
||||
|
||||
**根因**:
|
||||
- 錯誤發生在 `asyncssh` 連線層,不是 Telegram formatter。
|
||||
- SSH Provider 未明確指定 SSH port,且未停用使用者 ssh config;若 host label 或 config 帶入字串型 port,`asyncssh` 會在內部 `%d` 格式化時爆炸。
|
||||
- Prometheus `instance` 類 label 常見格式是 `192.168.0.110:9100`,該 port 是 exporter port,不是 SSH port。
|
||||
|
||||
**本次修補**:
|
||||
- SSH Provider 新增 host 正規化,支援移除 `user@`、`ssh://` 與 `:9100` exporter port。
|
||||
- `asyncssh.connect()` 明確指定 `port=22`、`config=None`、`connect_timeout=float(timeout)`。
|
||||
- 新增 regression tests,鎖定 `192.168.0.110:9100` 會被正規化成 `192.168.0.110` 後才進入 provider 執行。
|
||||
|
||||
**驗證**:
|
||||
- `python -m py_compile apps/api/src/plugins/mcp/providers/ssh_provider.py apps/api/tests/test_ssh_provider_tools.py`
|
||||
- `pytest tests/test_ssh_provider_tools.py tests/test_decision_manager_docker_prune_routing.py tests/test_operation_parser_ssh.py -q` → 20 passed。
|
||||
- `ruff check src/plugins/mcp/providers/ssh_provider.py tests/test_ssh_provider_tools.py` → All checks passed。
|
||||
|
||||
## 2026-05-06 | Incident 列表改回純讀,停止前端輪詢觸發 AI 推理
|
||||
|
||||
**背景**:部署 AwoooP 首頁後,production log 顯示載入 `/zh-TW/awooop` 期間會打 `GET /api/v1/incidents`,接著出現 `phase24_ai_router_used provider=ollama` 與 GCP-A Ollama 推理耗時約 55 秒。這代表列表查詢仍會背景啟動 AI 決策,導致前端輪詢佔用 GCP Ollama 推理槽,極端情況下也可能 fallback 到 Gemini 產生成本。
|
||||
|
||||
Reference in New Issue
Block a user