feat(api): HostRepairAgent — SSH 主機層修復 (Task 11)
- host_repair_agent.py: layer路由、command injection防護、asyncio SSH執行 - 測試: 12 cases 全通過 (routing/sanitize/success/fail/timeout/denied) - SSH key: /etc/repair-ssh/id_ed25519 (K8s secret mount) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
134
apps/api/src/services/host_repair_agent.py
Normal file
134
apps/api/src/services/host_repair_agent.py
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
"""
|
||||||
|
src/services/host_repair_agent.py
|
||||||
|
Host Repair Agent — 透過 SSH 執行主機層修復
|
||||||
|
2026-04-05 Claude Code: Sprint 3 Host Auto-Repair
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import re
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)

# SSH connection settings, keyed by layer name. Each entry holds the target
# host, the login user, and the path of the private key used to connect.
LAYER_SSH_CONFIG: dict[str, dict] = {
    "docker-110": {
        "host": "192.168.0.110",
        "user": "wooo",
        "key_path": "/etc/repair-ssh/id_ed25519",
    },
    # "docker-188" and "systemd-188" point at the same host, user and key.
    "docker-188": {
        "host": "192.168.0.188",
        "user": "ollama",
        "key_path": "/etc/repair-ssh/id_ed25519",
    },
    "systemd-188": {
        "host": "192.168.0.188",
        "user": "ollama",
        "key_path": "/etc/repair-ssh/id_ed25519",
    },
}
|
||||||
|
|
||||||
|
# Component name rule: lowercase alphanumerics and hyphens, 1-31 characters,
# starting with an alphanumeric. The pattern is anchored with \Z rather than $
# because $ also matches just before a trailing newline, which would let a
# value such as "sentry\n" pass validation.
_COMPONENT_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,30}\Z")

SSH_TIMEOUT = 60  # seconds
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class HostRepairResult:
    """Outcome of a single host repair attempt."""

    success: bool  # True when the repair was reported as successful
    layer: str  # layer name the repair was requested for
    component: str  # component name the repair was requested for
    output: str = ""  # raw command output, when the command ran to completion
    error: str = ""  # failure description; empty on success
|
||||||
|
|
||||||
|
|
||||||
|
def get_ssh_config_for_layer(layer: str) -> dict:
    """Return the SSH connection settings registered for ``layer``.

    Layers whose name starts with ``k8s`` are not repaired over SSH and are
    rejected up front.

    Raises:
        ValueError: if ``layer`` is a k8s layer, or is not present in
            ``LAYER_SSH_CONFIG``.
    """
    # The bare name "k8s" is covered by the prefix test as well.
    if layer.startswith("k8s"):
        raise ValueError(f"Layer '{layer}' uses kubectl, not SSH")

    ssh_settings = LAYER_SSH_CONFIG.get(layer)
    if ssh_settings is not None:
        return ssh_settings
    raise ValueError(f"Unknown layer: '{layer}'")
|
||||||
|
|
||||||
|
|
||||||
|
def build_repair_command(component: str) -> str:
    """Build the ``repair:<component>`` command string, rejecting unsafe names.

    The component name is validated against ``_COMPONENT_RE`` to prevent
    command injection. ``fullmatch`` is used so that the whole string must
    satisfy the pattern; a ``$``-anchored ``match`` would also accept a name
    followed by a trailing newline.

    Args:
        component: name of the component to repair.

    Returns:
        The command string ``repair:<component>``.

    Raises:
        ValueError: if ``component`` is not a string or does not satisfy the
            component naming rule.
    """
    # Non-string input is reported as ValueError too, so callers that only
    # handle ValueError do not see a TypeError from the regex engine.
    if not isinstance(component, str) or not _COMPONENT_RE.fullmatch(component):
        raise ValueError(f"Invalid component name: '{component}'")
    return f"repair:{component}"
|
||||||
|
|
||||||
|
|
||||||
|
class HostRepairAgent:
    """Run host-level repair commands over SSH."""

    async def repair(self, layer: str, component: str) -> HostRepairResult:
        """Run the repair for ``component`` on ``layer`` and report the outcome.

        This method does not raise for validation, timeout or SSH failures;
        they are all reported through the returned ``HostRepairResult``.

        Args:
            layer: layer name used to look up the SSH target.
            component: component name to repair.

        Returns:
            A ``HostRepairResult``. ``success`` is True only when the command
            output starts with ``REPAIR_OK:``; otherwise ``error`` carries the
            failure description.
        """
        try:
            config = get_ssh_config_for_layer(layer)
            command = build_repair_command(component)
        except ValueError as e:
            return HostRepairResult(
                success=False,
                layer=layer,
                component=component,
                error=str(e),
            )

        try:
            output = await self._ssh_execute(
                host=config["host"],
                user=config["user"],
                command=command,
            )
        except asyncio.TimeoutError:
            return HostRepairResult(
                success=False,
                layer=layer,
                component=component,
                error=f"SSH timeout after {SSH_TIMEOUT}s",
            )
        except Exception as e:
            # Some exceptions have an empty message; fall back to the type
            # name so a failed result never carries an empty error.
            return HostRepairResult(
                success=False,
                layer=layer,
                component=component,
                error=str(e) or type(e).__name__,
            )

        success = output.startswith("REPAIR_OK:")
        if success:
            error = ""
        else:
            error = output or "Empty response from repair command"
        return HostRepairResult(
            success=success,
            layer=layer,
            component=component,
            output=output,
            error=error,
        )

    async def _ssh_execute(self, host: str, user: str, command: str) -> str:
        """Run ``command`` on ``user@host`` via the ``ssh`` client.

        Args:
            host: target host address.
            user: login user on the target host.
            command: command string passed to ssh as a single argument.

        Returns:
            The decoded, stripped stdout of the ssh process.

        Raises:
            asyncio.TimeoutError: if starting the process or waiting for it
                to finish exceeds ``SSH_TIMEOUT`` seconds.
            RuntimeError: if ssh exits non-zero without writing anything to
                stdout (e.g. connection or authentication failure).
        """
        default_key = "/etc/repair-ssh/id_ed25519"
        # Use the key of the first configured layer matching this host/user
        # pair, or the default key when none matches.
        key_path = next(
            (
                cfg.get("key_path", default_key)
                for cfg in LAYER_SSH_CONFIG.values()
                if cfg["host"] == host and cfg["user"] == user
            ),
            default_key,
        )

        proc = await asyncio.wait_for(
            asyncio.create_subprocess_exec(
                "ssh",
                "-i", key_path,
                # NOTE(review): host key verification is disabled, which
                # allows man-in-the-middle attacks. Consider provisioning a
                # known_hosts file and enabling strict checking.
                "-o", "StrictHostKeyChecking=no",
                "-o", "BatchMode=yes",
                "-o", f"ConnectTimeout={SSH_TIMEOUT}",
                f"{user}@{host}",
                command,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            ),
            timeout=SSH_TIMEOUT,
        )
        try:
            stdout, stderr = await asyncio.wait_for(
                proc.communicate(), timeout=SSH_TIMEOUT
            )
        except (asyncio.TimeoutError, asyncio.CancelledError):
            # wait_for only cancels communicate(); the ssh process itself
            # keeps running unless it is killed and reaped here.
            if proc.returncode is None:
                try:
                    proc.kill()
                except ProcessLookupError:
                    pass  # the process exited on its own in the meantime
                await proc.wait()
            raise

        # errors="replace" keeps undecodable bytes from raising.
        output = stdout.decode(errors="replace").strip()
        if proc.returncode != 0 and not output:
            # ssh failed before the remote command produced any output;
            # report stderr so the caller gets a usable error message.
            detail = stderr.decode(errors="replace").strip()
            raise RuntimeError(
                f"ssh exited with code {proc.returncode}: "
                f"{detail or 'no stderr output'}"
            )
        logger.info("SSH repair %s@%s %s → %s", user, host, command, output)
        return output
|
||||||
130
apps/api/tests/test_host_repair_agent.py
Normal file
130
apps/api/tests/test_host_repair_agent.py
Normal file
@@ -0,0 +1,130 @@
|
|||||||
|
"""
|
||||||
|
tests/test_host_repair_agent.py
|
||||||
|
Host Repair Agent 單元測試
|
||||||
|
不需要實際 SSH 連線 — 測試路由邏輯和命令組裝
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import pytest
|
||||||
|
from unittest.mock import AsyncMock, patch
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# 測試 HostRepairConfig 路由
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
class TestHostRepairConfig:
    """Routing checks for get_ssh_config_for_layer."""

    def test_layer_docker_110_routes_to_110(self):
        from src.services.host_repair_agent import get_ssh_config_for_layer

        target = get_ssh_config_for_layer("docker-110")
        assert (target["user"], target["host"]) == ("wooo", "192.168.0.110")

    def test_layer_docker_188_routes_to_188(self):
        from src.services.host_repair_agent import get_ssh_config_for_layer

        target = get_ssh_config_for_layer("docker-188")
        assert (target["user"], target["host"]) == ("ollama", "192.168.0.188")

    def test_layer_systemd_188_routes_to_188(self):
        from src.services.host_repair_agent import get_ssh_config_for_layer

        target = get_ssh_config_for_layer("systemd-188")
        assert (target["user"], target["host"]) == ("ollama", "192.168.0.188")

    def test_unknown_layer_raises(self):
        from src.services.host_repair_agent import get_ssh_config_for_layer

        expect_error = pytest.raises(ValueError, match="Unknown layer")
        with expect_error:
            get_ssh_config_for_layer("unknown-layer")

    def test_k8s_layer_raises(self):
        """The k8s layer does not go through SSH, so the lookup must raise."""
        from src.services.host_repair_agent import get_ssh_config_for_layer

        expect_error = pytest.raises(ValueError, match="kubectl")
        with expect_error:
            get_ssh_config_for_layer("k8s")
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# 測試 SSH 命令組裝
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
class TestSSHCommandBuilding:
    """Checks for build_repair_command output and input validation."""

    def test_repair_command_format(self):
        from src.services.host_repair_agent import build_repair_command

        assert build_repair_command("sentry") == "repair:sentry"

    def test_repair_command_component_sanitized(self):
        """A command-injection attempt must be rejected."""
        from src.services.host_repair_agent import build_repair_command

        malicious = "sentry; rm -rf /"
        with pytest.raises(ValueError, match="Invalid component"):
            build_repair_command(malicious)

    def test_repair_command_valid_components(self):
        from src.services.host_repair_agent import build_repair_command

        names = (
            "sentry",
            "harbor",
            "gitea",
            "openclaw",
            "gitea-runner",
            "alertmanager",
            "redis",
            "nginx",
        )
        built = [build_repair_command(name) for name in names]
        assert built == [f"repair:{name}" for name in names]
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# 測試 HostRepairAgent.repair() 路由
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
class TestHostRepairAgent:
    """Checks for HostRepairAgent.repair() with the SSH call replaced by a mock."""

    @pytest.mark.asyncio
    async def test_repair_success_returns_ok(self):
        from src.services.host_repair_agent import HostRepairAgent

        repairer = HostRepairAgent()
        fake_ssh = AsyncMock(return_value="REPAIR_OK:sentry")
        with patch.object(repairer, "_ssh_execute", fake_ssh):
            outcome = await repairer.repair(layer="docker-110", component="sentry")

        assert outcome.success is True
        assert outcome.component == "sentry"
        assert outcome.layer == "docker-110"
        fake_ssh.assert_called_once_with(
            host="192.168.0.110",
            user="wooo",
            command="repair:sentry",
        )

    @pytest.mark.asyncio
    async def test_repair_fail_returns_failure(self):
        from src.services.host_repair_agent import HostRepairAgent

        repairer = HostRepairAgent()
        fake_ssh = AsyncMock(return_value="REPAIR_FAIL:harbor:exit_1")
        with patch.object(repairer, "_ssh_execute", fake_ssh):
            outcome = await repairer.repair(layer="docker-110", component="harbor")

        assert outcome.success is False
        assert "REPAIR_FAIL" in outcome.error

    @pytest.mark.asyncio
    async def test_repair_ssh_timeout_returns_failure(self):
        from src.services.host_repair_agent import HostRepairAgent

        repairer = HostRepairAgent()
        fake_ssh = AsyncMock(side_effect=asyncio.TimeoutError())
        with patch.object(repairer, "_ssh_execute", fake_ssh):
            outcome = await repairer.repair(layer="docker-110", component="sentry")

        assert outcome.success is False
        assert "timeout" in outcome.error.lower()

    @pytest.mark.asyncio
    async def test_repair_denied_returns_failure(self):
        from src.services.host_repair_agent import HostRepairAgent

        repairer = HostRepairAgent()
        fake_ssh = AsyncMock(
            return_value="REPAIR_DENIED:unknown_component:badcomponent"
        )
        with patch.object(repairer, "_ssh_execute", fake_ssh):
            outcome = await repairer.repair(
                layer="docker-110", component="badcomponent"
            )

        assert outcome.success is False
|
||||||
Reference in New Issue
Block a user