From 1a654aa37d02ff7dbe0d970e28f8f979e771f654 Mon Sep 17 00:00:00 2001
From: OG T
Date: Mon, 6 Apr 2026 14:22:54 +0800
Subject: [PATCH] =?UTF-8?q?feat(api):=20HostRepairAgent=20=E4=B8=89?=
 =?UTF-8?q?=E6=A2=9D=E5=9F=B7=E8=A1=8C=E8=B7=AF=E5=BE=91=20+=20known=5Fhos?=
 =?UTF-8?q?ts=20+=20Ansible=20=E7=99=BD=E5=90=8D=E5=96=AE=20(Sprint=203=20?=
 =?UTF-8?q?T3)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6

---
 apps/api/src/services/host_repair_agent.py | 148 ++++++++++++++++++++-
 apps/api/tests/test_host_repair_agent.py   |  80 ++++++++++++
 2 files changed, 227 insertions(+), 1 deletion(-)

diff --git a/apps/api/src/services/host_repair_agent.py b/apps/api/src/services/host_repair_agent.py
index 8cbcfa01..24208368 100644
--- a/apps/api/src/services/host_repair_agent.py
+++ b/apps/api/src/services/host_repair_agent.py
@@ -5,6 +5,7 @@ Host Repair Agent — 透過 SSH 執行主機層修復
 2026-04-05 Claude Code: C1 修正 — key_path 直接傳入 _ssh_execute,不反查
 """
 import asyncio
+import os
 import re
 import logging
 from dataclasses import dataclass
@@ -52,6 +53,30 @@ _SUPPORTED_SCHEMES = {"openclaw", "ansible", "ssh"}
 _SHELL_METACHAR_RE = re.compile(r'[;&|`&]|\$\(')
 _MAX_COMMAND_LEN = 512
 
+# Ansible control-node settings — read from env/ConfigMap
+# 2026-04-06 Claude Code: Sprint 3 T3
+ANSIBLE_CONTROL_HOST = os.environ.get("ANSIBLE_CONTROL_NODE_HOST", "192.168.0.188")
+ANSIBLE_CONTROL_USER = os.environ.get("ANSIBLE_CONTROL_NODE_USER", "ollama")
+ANSIBLE_PLAYBOOKS_PATH = os.environ.get("ANSIBLE_PLAYBOOKS_PATH", "~/openclaw-v5/ansible/playbooks")
+KNOWN_HOSTS_PATH = "/etc/repair-ssh/known_hosts"
+
+
+def validate_ansible_playbook(playbook_name: str) -> None:
+    """
+    Verify that the playbook name is whitelisted, blocking path traversal.
+    The whitelist is read from the ANSIBLE_PLAYBOOK_WHITELIST env var (injected via ConfigMap).
+
+    Raises:
+        ValueError: the playbook is not in the whitelist
+    """
+    whitelist_raw = os.environ.get("ANSIBLE_PLAYBOOK_WHITELIST", "")
+    allowed = {p.strip() for p in whitelist_raw.split(",") if p.strip()}
+    if "/" in playbook_name or ".." in playbook_name or playbook_name not in allowed:
+        raise ValueError(
+            f"Security Block: '{playbook_name}' not in allowed whitelist. "
+            f"Allowed: {sorted(allowed)}"
+        )
+
 
 def parse_uri_command(command: str) -> SshCommandURI:
     """
@@ -164,6 +189,126 @@ class HostRepairAgent:
             error="" if success else output,
         )
 
+    async def repair_by_uri(self, command: str, approved: bool = False) -> HostRepairResult:
+        """
+        Route the command to the execution path that matches its URI scheme.
+        2026-04-06 Claude Code: Sprint 3 T3
+        """
+        try:
+            uri = parse_uri_command(command)
+        except ValueError as e:
+            return HostRepairResult(success=False, layer="", component="", error=str(e))
+
+        if uri.scheme == "openclaw":
+            return await self._execute_openclaw(uri.host_or_layer, uri.payload)
+
+        if uri.scheme == "ansible":
+            try:
+                validate_ansible_playbook(uri.payload)
+            except ValueError as e:
+                return HostRepairResult(success=False, layer="ansible", component=uri.payload, error=str(e))
+            return await self._execute_ansible(uri.host_or_layer, uri.payload)
+
+        if uri.scheme == "ssh":
+            if not approved:
+                return HostRepairResult(
+                    success=False,
+                    layer="ssh",
+                    component=uri.payload,
+                    error="ssh:// scheme requires_approval=True — must be explicitly approved",
+                )
+            try:
+                validate_shell_safety(uri.payload)
+            except ValueError as e:
+                return HostRepairResult(success=False, layer="ssh", component=uri.payload, error=str(e))
+            return await self._execute_ssh_direct(uri.host_or_layer, uri.payload)
+
+        return HostRepairResult(success=False, layer="", component="", error=f"Unhandled scheme: {uri.scheme}")
+
+    async def _execute_openclaw(self, layer: str, component: str) -> HostRepairResult:
+        """openclaw:// — delegate to the existing repair(layer, component) logic"""
+        return await self.repair(layer=layer, component=component)
+
+    async def _execute_ansible(self, _control_host: str, playbook_name: str) -> HostRepairResult:
+        """
+        ansible:// — SSH to the control node and run ansible-playbook.
+        2026-04-06 Claude Code: Sprint 3 T3
+        Note: always uses the configured control node and ignores the host in the URI (security design)
+        """
+        host = ANSIBLE_CONTROL_HOST
+        user = ANSIBLE_CONTROL_USER
+        playbook_path = f"{ANSIBLE_PLAYBOOKS_PATH}/{playbook_name}"
+        ssh_command = f"ansible-playbook {playbook_path}"
+
+        try:
+            output = await self._ssh_execute(
+                host=host,
+                user=user,
+                key_path="/etc/repair-ssh/id_ed25519",
+                command=ssh_command,
+            )
+        except asyncio.TimeoutError:
+            return HostRepairResult(
+                success=False, layer="ansible", component=playbook_name,
+                error=f"Ansible SSH timeout after {SSH_TIMEOUT}s",
+            )
+        except Exception as e:
+            return HostRepairResult(
+                success=False, layer="ansible", component=playbook_name,
+                error=str(e),
+            )
+
+        # The PLAY RECAP prints "ok=N" even when tasks failed, so "ok=" alone
+        # cannot signal success: also require that no host reports a non-zero
+        # failed= or unreachable= count.
+        recap_failed = re.search(r"\b(?:failed|unreachable)=[1-9]", output) is not None
+        success = not recap_failed and ("REPAIR_OK" in output or "ok=" in output)
+        return HostRepairResult(
+            success=success,
+            layer="ansible",
+            component=playbook_name,
+            output=output,
+            error="" if success else output,
+        )
+
+    async def _execute_ssh_direct(self, host_user: str, command: str) -> HostRepairResult:
+        """
+        ssh:// — run an SSH command directly (requires explicit approved=True).
+        host_user format: "wooo@192.168.0.110"
+        2026-04-06 Claude Code: Sprint 3 T3
+        """
+        user, sep, host = host_user.partition("@")
+        # Reject a missing/empty user or host, and a leading "-" that ssh would
+        # parse as a command-line option instead of a destination.
+        if not sep or not user or not host or host_user.startswith("-"):
+            return HostRepairResult(
+                success=False, layer="ssh", component=command,
+                error=f"Invalid host_user format '{host_user}' (expected user@host)",
+            )
+        try:
+            output = await self._ssh_execute(
+                host=host,
+                user=user,
+                key_path="/etc/repair-ssh/id_ed25519",
+                command=command,
+            )
+        except asyncio.TimeoutError:
+            return HostRepairResult(
+                success=False, layer="ssh", component=command,
+                error=f"SSH timeout after {SSH_TIMEOUT}s",
+            )
+        except Exception as e:
+            return HostRepairResult(success=False, layer="ssh", component=command, error=str(e))
+
+        success = not output.startswith("ERROR")
+        return HostRepairResult(
+            success=success,
+            layer="ssh",
+            component=command,
+            output=output,
+            error="" if success else output,
+        )
+
     async def _ssh_execute(self, host: str, user: str, key_path: str, command: str) -> str:
         """執行 SSH 命令,回傳 stdout。key_path 由呼叫方傳入,不反查。"""
         import time
@@ -172,7 +317,8 @@ class HostRepairAgent:
             asyncio.create_subprocess_exec(
                 "ssh",
                 "-i", key_path,
-                "-o", "StrictHostKeyChecking=no",
+                "-o", "StrictHostKeyChecking=yes",
+                "-o", f"UserKnownHostsFile={KNOWN_HOSTS_PATH}",
                 "-o", "BatchMode=yes",
                 "-o", f"ConnectTimeout={SSH_TIMEOUT}",
                 f"{user}@{host}",
diff --git a/apps/api/tests/test_host_repair_agent.py b/apps/api/tests/test_host_repair_agent.py
index 08512827..a3530fac 100644
--- a/apps/api/tests/test_host_repair_agent.py
+++ b/apps/api/tests/test_host_repair_agent.py
@@ -209,3 +209,83 @@ class TestValidateShellSafety:
         from src.services.host_repair_agent import validate_shell_safety
         with pytest.raises(ValueError, match="too long"):
             validate_shell_safety("a" * 513)
+
+
+import os
+from unittest.mock import patch, AsyncMock
+
+
+class TestAnsibleWhitelist:
+    def test_allowed_playbook_passes(self):
+        from src.services.host_repair_agent import validate_ansible_playbook
+        with patch.dict(os.environ, {"ANSIBLE_PLAYBOOK_WHITELIST": "vacuum_postgres.yml,clear_redis_cache.yml"}):
+            validate_ansible_playbook("vacuum_postgres.yml")  # must not raise
+
+    def test_disallowed_playbook_raises(self):
+        from src.services.host_repair_agent import validate_ansible_playbook
+        with patch.dict(os.environ, {"ANSIBLE_PLAYBOOK_WHITELIST": "vacuum_postgres.yml"}):
+            with pytest.raises(ValueError, match="not in allowed whitelist"):
+                validate_ansible_playbook("evil_script.sh")
+
+    def test_path_traversal_blocked(self):
+        from src.services.host_repair_agent import validate_ansible_playbook
+        with patch.dict(os.environ, {"ANSIBLE_PLAYBOOK_WHITELIST": "vacuum_postgres.yml"}):
+            with pytest.raises(ValueError, match="not in allowed whitelist"):
+                validate_ansible_playbook("../../../etc/passwd")
+
+
+class TestRepairByUri:
+    @pytest.mark.asyncio
+    async def test_openclaw_scheme_calls_repair(self):
+        from src.services.host_repair_agent import HostRepairAgent, HostRepairResult
+        agent = HostRepairAgent()
+        with patch.object(agent, "_execute_openclaw", new_callable=AsyncMock) as mock_oc:
+            mock_oc.return_value = HostRepairResult(success=True, layer="docker-110", component="sentry", output="REPAIR_OK:sentry")
+            result = await agent.repair_by_uri("openclaw://docker-110/sentry")
+        assert result.success is True
+        mock_oc.assert_awaited_once_with("docker-110", "sentry")
+
+    @pytest.mark.asyncio
+    async def test_ansible_scheme_calls_ansible(self):
+        from src.services.host_repair_agent import HostRepairAgent, HostRepairResult
+        agent = HostRepairAgent()
+        with patch.object(agent, "_execute_ansible", new_callable=AsyncMock) as mock_ans, \
+             patch.dict(os.environ, {"ANSIBLE_PLAYBOOK_WHITELIST": "vacuum_postgres.yml"}):
+            mock_ans.return_value = HostRepairResult(success=True, layer="ansible", component="vacuum_postgres.yml", output="REPAIR_OK:ansible")
+            result = await agent.repair_by_uri("ansible://192.168.0.188/vacuum_postgres.yml")
+        assert result.success is True
+        mock_ans.assert_awaited_once_with("192.168.0.188", "vacuum_postgres.yml")
+
+    @pytest.mark.asyncio
+    async def test_ssh_scheme_blocked_without_approval_flag(self):
+        from src.services.host_repair_agent import HostRepairAgent
+        agent = HostRepairAgent()
+        result = await agent.repair_by_uri("ssh://wooo@192.168.0.110/docker ps")
+        assert result.success is False
+        assert "requires_approval" in result.error
+
+    @pytest.mark.asyncio
+    async def test_invalid_uri_returns_failure(self):
+        from src.services.host_repair_agent import HostRepairAgent
+        agent = HostRepairAgent()
+        result = await agent.repair_by_uri("bad-format")
+        assert result.success is False
+        assert "Unsupported scheme" in result.error
+
+    @pytest.mark.asyncio
+    async def test_ansible_failed_recap_is_not_success(self):
+        from src.services.host_repair_agent import HostRepairAgent
+        agent = HostRepairAgent()
+        recap = "PLAY RECAP\nnode : ok=2 changed=0 unreachable=0 failed=1"
+        with patch.object(agent, "_ssh_execute", new_callable=AsyncMock, return_value=recap):
+            result = await agent._execute_ansible("ignored", "vacuum_postgres.yml")
+        assert result.success is False
+
+    @pytest.mark.asyncio
+    async def test_ssh_direct_rejects_option_like_host_user(self):
+        from src.services.host_repair_agent import HostRepairAgent
+        agent = HostRepairAgent()
+        with patch.object(agent, "_ssh_execute", new_callable=AsyncMock) as mock_ssh:
+            result = await agent._execute_ssh_direct("-oProxyCommand=id@192.168.0.110", "docker ps")
+        assert result.success is False
+        mock_ssh.assert_not_awaited()