fix(mcp): harden ssh provider connection params
All checks were successful
Code Review / ai-code-review (push) Successful in 11s
CD Pipeline / tests (push) Successful in 59s
CD Pipeline / build-and-deploy (push) Successful in 3m20s
CD Pipeline / post-deploy-checks (push) Successful in 1m17s

This commit is contained in:
Your Name
2026-05-06 21:51:38 +08:00
parent 150f17b219
commit 8396d37275
3 changed files with 84 additions and 5 deletions

View File

@@ -58,6 +58,7 @@ logger = structlog.get_logger(__name__)
# Private key file used for SSH; it is checked for existence before connecting
# and passed to asyncssh as the only entry in ``client_keys``.
SSH_KEY_PATH = "/run/secrets/ssh_mcp_key"
# Fallback login user, used when the caller supplies no per-host username.
SSH_USER = "wooo"
# SSH port handed explicitly to ``asyncssh.connect()`` as an int.
SSH_PORT = 22
DEFAULT_HOST_USERS = {
# AI/Web host is operated by the ollama account in the current topology.
"192.168.0.188": "ollama",
@@ -104,6 +105,29 @@ def _validate_param(key: str, value: str) -> str:
# tail / port / lines 由呼叫方 int() 轉換,不需字串白名單
return value
def _normalize_ssh_host(value: str) -> str:
"""
Normalize host labels before they enter asyncssh.
Prometheus labels often arrive as ``192.168.0.110:9100``. That port is the
exporter port, not SSH. The SSH provider must connect to the host on the
platform SSH port, otherwise asyncssh can receive a stringly port from
config/labels and fail with ``%d format`` before the tool even runs.
"""
host = (value or "").strip()
if host.startswith("ssh://"):
host = host.removeprefix("ssh://")
if "@" in host:
host = host.rsplit("@", 1)[1]
if host.startswith("[") and "]" in host:
return host[1:host.index("]")]
if host.count(":") == 1:
maybe_host, maybe_port = host.rsplit(":", 1)
if maybe_port.isdigit():
return maybe_host
return host
# 群組 A只讀
GROUP_A_TOOLS = {
"ssh_diagnose",
@@ -375,7 +399,7 @@ class SSHProvider(MCPToolProvider):
error=f"Unknown tool: {tool_name}",
)
host = parameters.get("host", "")
host = _normalize_ssh_host(str(parameters.get("host", "")))
# 守衛 2: 允許的 host
if host not in self._allowed_hosts():
@@ -604,7 +628,7 @@ class SSHProvider(MCPToolProvider):
raise RuntimeError(
"asyncssh is not installed. "
"Add 'asyncssh' to pyproject.toml dependencies."
)
) from None
import os
if not os.path.exists(SSH_KEY_PATH):
@@ -625,11 +649,13 @@ class SSHProvider(MCPToolProvider):
async with asyncssh.connect(
host,
port=SSH_PORT,
username=username or SSH_USER,
client_keys=[SSH_KEY_PATH],
known_hosts=known_hosts_path, # None = 跳過驗證(內網),或指定文件路徑
connect_timeout=timeout,
config=None, # 禁止讀取使用者 ssh config避免 Port 字串污染 asyncssh
connect_timeout=float(timeout),
) as conn:
# Bug 根因asyncssh 模組沒有頂層 run();應呼叫 conn.run()2026-04-24 Claude Sonnet 4.6
result = await conn.run(cmd, timeout=timeout, check=False)
result = await conn.run(cmd, timeout=float(timeout), check=False)
return (result.stdout or ""), (result.stderr or "")

View File

@@ -1,6 +1,6 @@
import pytest
from src.plugins.mcp.providers.ssh_provider import SSHProvider
from src.plugins.mcp.providers.ssh_provider import SSHProvider, _normalize_ssh_host
@pytest.mark.asyncio
@@ -19,3 +19,37 @@ def test_ssh_provider_uses_ollama_user_for_188():
assert provider._ssh_user_for_host("192.168.0.188") == "ollama"
assert provider._ssh_user_for_host("192.168.0.110") == "wooo"
@pytest.mark.parametrize(
    "raw,expected",
    [
        # Prometheus-style label: the exporter port must be dropped.
        ("192.168.0.110:9100", "192.168.0.110"),
        # user@host form.
        ("wooo@192.168.0.110", "192.168.0.110"),
        # Full URL with scheme, user and port.
        ("ssh://wooo@192.168.0.110:22", "192.168.0.110"),
        # Already-clean host is returned unchanged.
        ("192.168.0.188", "192.168.0.188"),
        # Bracketed IPv6 literal with a port: brackets and port are removed.
        ("[fe80::1]:22", "fe80::1"),
        # Bare IPv6 has several colons and must not be split as host:port.
        ("fe80::1", "fe80::1"),
        # Surrounding whitespace is trimmed before parsing.
        ("  192.168.0.110:9100  ", "192.168.0.110"),
        # A non-numeric suffix is not a port, so the value is left alone.
        ("192.168.0.110:metrics", "192.168.0.110:metrics"),
    ],
)
def test_normalize_ssh_host_strips_exporter_ports_and_users(raw, expected):
    """Check that each raw host label normalizes to the expected bare host."""
    assert _normalize_ssh_host(raw) == expected
@pytest.mark.asyncio
async def test_ssh_execute_normalizes_host_before_allowed_check(monkeypatch):
    """A ``host:port`` label passes the allow-list once the port is stripped.

    ``_allowed_hosts`` and ``_ssh_exec`` are replaced on the provider instance
    so no real SSH connection is attempted; the stub records the arguments it
    receives for inspection.
    """
    seen = {}

    async def record_ssh_exec(host, cmd, timeout, username=None):
        # Remember what the provider would have handed to the real SSH call.
        seen.update(host=host, timeout=timeout, username=username)
        return "ok", ""

    provider = SSHProvider()
    monkeypatch.setattr(provider, "_allowed_hosts", lambda: ["192.168.0.110"])
    monkeypatch.setattr(provider, "_ssh_exec", record_ssh_exec)

    outcome = await provider.execute(
        "ssh_diagnose",
        {"host": "192.168.0.110:9100"},
    )

    assert outcome.success is True
    assert seen["host"] == "192.168.0.110"
    assert isinstance(seen["timeout"], int)

View File

@@ -1,3 +1,22 @@
## 2026-05-06 | SSH MCP 連線參數硬化,修復 `%d format` 導致主機診斷全失敗
**背景**：SRE 戰情室與 production log 顯示 host-layer MCP 工具（`ssh_get_top_processes`、`ssh_get_swap_info`、`ssh_diagnose` 等）全數失敗，錯誤為 `%d format: a real number is required, not str`。這讓主機告警無法取得感官證據，後續 AI 只能降級，並在 Telegram 中重複出現「AI 自動修復失敗，已升級人工介入」。
**根因**
- 錯誤發生在 `asyncssh` 連線層,不是 Telegram formatter。
- SSH Provider 未明確指定 SSH port，且未停用使用者 ssh config；若 host label 或 config 帶入字串型 port，`asyncssh` 會在內部 `%d` 格式化時爆炸。
- Prometheus `instance` 類 label 常見格式是 `192.168.0.110:9100`，該 port 是 exporter port，不是 SSH port。
**本次修補**
- SSH Provider 新增 host 正規化，支援移除 `user@`、`ssh://`、`:9100` exporter port。
- `asyncssh.connect()` 明確指定 `port=22`、`config=None`、`connect_timeout=float(timeout)`。
- 新增 regression tests，鎖定 `192.168.0.110:9100` 會被正規化成 `192.168.0.110` 後才進入 provider 執行。
**驗證**
- `python -m py_compile apps/api/src/plugins/mcp/providers/ssh_provider.py apps/api/tests/test_ssh_provider_tools.py`
- `pytest tests/test_ssh_provider_tools.py tests/test_decision_manager_docker_prune_routing.py tests/test_operation_parser_ssh.py -q` → 20 passed。
- `ruff check src/plugins/mcp/providers/ssh_provider.py tests/test_ssh_provider_tools.py` → All checks passed。
## 2026-05-06 | Incident 列表改回純讀,停止前端輪詢觸發 AI 推理
**背景**：部署 AwoooP 首頁後，production log 顯示載入 `/zh-TW/awooop` 期間會打 `GET /api/v1/incidents`，接著出現 `phase24_ai_router_used provider=ollama` 與 GCP-A Ollama 推理耗時約 55 秒。這代表列表查詢仍會背景啟動 AI 決策，導致前端輪詢佔用 GCP Ollama 推理槽，極端情況下也可能 fallback 到 Gemini 產生成本。