C1: _ssh_execute 直接接收 key_path 參數,不反查 LAYER_SSH_CONFIG
C2: PlaybookService.create() proxy,Router 不再穿透呼叫 _repository
C3: CD Step 1b sed 替換 IMAGE_TAG_PLACEHOLDER,消除失敗中斷風險
M3: repair-bot 110/188 regex 統一 [a-z0-9][a-z0-9-]{0,30},禁止底線
m1: defaultMode 0400 加八進位說明注釋
m2: _ssh_execute 用 deadline 計算剩餘 timeout
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
135 lines
4.2 KiB
Python
135 lines
4.2 KiB
Python
"""
|
||
src/services/host_repair_agent.py
|
||
Host Repair Agent — 透過 SSH 執行主機層修復
|
||
2026-04-05 Claude Code: Sprint 3 Host Auto-Repair
|
||
2026-04-05 Claude Code: C1 修正 — key_path 直接傳入 _ssh_execute,不反查
|
||
"""
|
||
import asyncio
|
||
import re
|
||
import logging
|
||
from dataclasses import dataclass
|
||
|
||
logger = logging.getLogger(__name__)

# SSH connection settings, keyed by repair layer name.
# Each entry carries the target host, the login user and the private key file.
# Every layer currently authenticates with the same key file, so the path is
# written once in the comprehension; only host and user vary per layer.
# "docker-188" and "systemd-188" point at the same host and user.
LAYER_SSH_CONFIG: dict[str, dict] = {
    layer_name: {
        "host": address,
        "user": login,
        "key_path": "/etc/repair-ssh/id_ed25519",
    }
    for layer_name, address, login in (
        ("docker-110", "192.168.0.110", "wooo"),
        ("docker-188", "192.168.0.188", "ollama"),
        ("systemd-188", "192.168.0.188", "ollama"),
    )
}
|
||
|
||
# Component 名稱規則: 小寫英數 + 連字符,1-31 字元
|
||
_COMPONENT_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,30}$")
|
||
|
||
SSH_TIMEOUT = 60 # seconds
|
||
|
||
|
||
@dataclass
class HostRepairResult:
    """Outcome of a single host repair attempt.

    ``success``, ``layer`` and ``component`` are required; ``output`` and
    ``error`` default to the empty string.
    """

    # True when the repair was reported as successful.
    success: bool
    # Layer name the repair was requested for (e.g. "docker-110").
    layer: str
    # Component name the repair was requested for.
    component: str
    # Text returned by the repair command, if any.
    output: str = ""
    # Failure description; left empty on success.
    error: str = ""
|
||
|
||
|
||
def get_ssh_config_for_layer(layer: str) -> dict:
    """Look up the SSH connection settings registered for ``layer``.

    Args:
        layer: Repair layer name, e.g. ``"docker-110"``.

    Returns:
        The settings dict stored in ``LAYER_SSH_CONFIG`` for that layer
        (the stored object itself, not a copy).

    Raises:
        ValueError: If the layer name starts with ``"k8s"`` (those layers are
            handled with kubectl, not SSH) or is not present in
            ``LAYER_SSH_CONFIG``.
    """
    # A bare "k8s" also starts with "k8s", so one prefix test covers both.
    if layer.startswith("k8s"):
        raise ValueError(f"Layer '{layer}' uses kubectl, not SSH")

    ssh_settings = LAYER_SSH_CONFIG.get(layer)
    if ssh_settings is not None:
        return ssh_settings
    raise ValueError(f"Unknown layer: '{layer}'")
|
||
|
||
|
||
def build_repair_command(component: str) -> str:
    """Build the repair command string for ``component``, rejecting unsafe names.

    The name is validated against ``_COMPONENT_RE`` before it is embedded in
    the command, so that no unexpected characters reach the remote side.

    Args:
        component: Component name to repair.

    Returns:
        The string ``"repair:<component>"``.

    Raises:
        ValueError: If ``component`` is not a string or does not match
            ``_COMPONENT_RE`` in its entirety.
    """
    # Non-string input is reported as ValueError (not TypeError from the regex
    # engine) so callers that handle invalid names with `except ValueError`
    # also cover this case.
    # fullmatch() is used instead of match(): with a `$`-terminated pattern,
    # match() accepts a name followed by a trailing newline.
    if not isinstance(component, str) or _COMPONENT_RE.fullmatch(component) is None:
        raise ValueError(f"Invalid component name: '{component}'")
    return f"repair:{component}"
|
||
|
||
|
||
class HostRepairAgent:
    """Run host-level repair commands on remote machines over SSH."""

    @staticmethod
    def _failure(layer: str, component: str, error: str) -> HostRepairResult:
        """Build a failed ``HostRepairResult`` carrying ``error``."""
        return HostRepairResult(
            success=False,
            layer=layer,
            component=component,
            error=error,
        )

    async def repair(self, layer: str, component: str) -> HostRepairResult:
        """Run the repair for ``component`` on ``layer`` and return the result.

        Validation errors, timeouts and SSH failures are reported through the
        returned ``HostRepairResult`` (``success=False`` with ``error`` set)
        rather than raised.

        Args:
            layer: Layer name used to look up the SSH connection settings.
            component: Component name; validated before being sent.

        Returns:
            A ``HostRepairResult``. ``success`` is True only when the remote
            output starts with ``"REPAIR_OK:"``.
        """
        try:
            config = get_ssh_config_for_layer(layer)
            command = build_repair_command(component)
        except ValueError as e:
            return self._failure(layer, component, str(e))

        try:
            output = await self._ssh_execute(
                host=config["host"],
                user=config["user"],
                key_path=config["key_path"],
                command=command,
            )
        except asyncio.TimeoutError:
            logger.warning("SSH repair timed out for %s/%s", layer, component)
            return self._failure(layer, component, f"SSH timeout after {SSH_TIMEOUT}s")
        except Exception as e:
            # Boundary handler: convert any failure into a result object, but
            # keep the traceback in the log so it is not silently lost.
            logger.exception("SSH repair failed for %s/%s", layer, component)
            # Some exceptions stringify to ""; fall back to the type name so
            # a failed result always carries a diagnostic.
            return self._failure(layer, component, str(e) or type(e).__name__)

        success = output.startswith("REPAIR_OK:")
        if success:
            error = ""
        else:
            # Never report a failure with an empty error message.
            error = output or "Repair command returned no output"
        return HostRepairResult(
            success=success,
            layer=layer,
            component=component,
            output=output,
            error=error,
        )

    async def _ssh_execute(self, host: str, user: str, key_path: str, command: str) -> str:
        """Run ``command`` on ``user@host`` via the ``ssh`` client and return stdout.

        ``key_path`` is supplied by the caller; this method does not look it
        up from ``LAYER_SSH_CONFIG``. Spawning the process and collecting its
        output share one ``SSH_TIMEOUT`` budget (with a floor of one second
        for the collection phase).

        Args:
            host: Remote host name or address.
            user: Remote login user.
            key_path: Path of the private key file passed to ``ssh -i``.
            command: Command string passed to ssh as the remote command.

        Returns:
            The remote command's stdout, decoded and stripped.

        Raises:
            asyncio.TimeoutError: If the time budget is exhausted.
            RuntimeError: If ssh exits with a non-zero status and produced no
                stdout; the message includes the decoded stderr.
        """
        import time

        deadline = time.monotonic() + SSH_TIMEOUT
        proc = await asyncio.wait_for(
            asyncio.create_subprocess_exec(
                "ssh",
                "-i", key_path,
                # NOTE(security): host key verification is disabled here, which
                # leaves the connection open to man-in-the-middle attacks.
                # Left unchanged because enabling it requires the target host
                # keys to be provisioned; review before tightening.
                "-o", "StrictHostKeyChecking=no",
                # Fail instead of prompting for a password/passphrase.
                "-o", "BatchMode=yes",
                "-o", f"ConnectTimeout={SSH_TIMEOUT}",
                f"{user}@{host}",
                command,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            ),
            timeout=SSH_TIMEOUT,
        )
        try:
            # Spend only what is left of the overall budget on the output.
            remaining = max(1.0, deadline - time.monotonic())
            stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=remaining)
        finally:
            # On timeout or cancellation the child is still running; kill and
            # reap it so no ssh process is left behind. After a normal
            # communicate() the return code is set and this is a no-op.
            if proc.returncode is None:
                try:
                    proc.kill()
                except ProcessLookupError:
                    # The process exited between the check and the kill.
                    pass
                await proc.wait()

        # Undecodable bytes are replaced rather than raising, so unexpected
        # remote output cannot mask the actual repair result.
        output = stdout.decode(errors="replace").strip()
        if not output and proc.returncode != 0:
            # ssh itself failed (e.g. connection or authentication error):
            # surface stderr instead of returning an empty string.
            detail = stderr.decode(errors="replace").strip() or "no error output"
            raise RuntimeError(f"ssh exited with status {proc.returncode}: {detail}")

        logger.info("SSH repair %s@%s %s → %s", user, host, command, output)
        return output
|