Files
awoooi/apps/api/src/services/host_repair_agent.py
OG T 4b24ecd67f fix(sprint3): 首席架構師 Review C1/C2/C3/M3/m1 修正
C1: _ssh_execute 直接接收 key_path 參數,不反查 LAYER_SSH_CONFIG
C2: PlaybookService.create() proxy,Router 不再穿透呼叫 _repository
C3: CD Step 1b sed 替換 IMAGE_TAG_PLACEHOLDER,消除失敗中斷風險
M3: repair-bot 110/188 regex 統一 [a-z0-9][a-z0-9-]{0,30},禁止底線
m1: defaultMode 0400 加八進位說明注釋
m2: _ssh_execute 用 deadline 計算剩餘 timeout

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 13:07:59 +08:00

135 lines
4.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
src/services/host_repair_agent.py
Host Repair Agent — 透過 SSH 執行主機層修復
2026-04-05 Claude Code: Sprint 3 Host Auto-Repair
2026-04-05 Claude Code: C1 修正 — key_path 直接傳入 _ssh_execute,不反查
"""
import asyncio
import re
import logging
from dataclasses import dataclass
logger = logging.getLogger(__name__)
# SSH connection settings, keyed by repair layer name. Every entry holds the
# target host, the login user and the private key file; all layers currently
# share the same key file.
LAYER_SSH_CONFIG: dict[str, dict] = {
    layer_name: {
        "host": address,
        "user": login,
        "key_path": "/etc/repair-ssh/id_ed25519",
    }
    for layer_name, address, login in (
        ("docker-110", "192.168.0.110", "wooo"),
        ("docker-188", "192.168.0.188", "ollama"),
        ("systemd-188", "192.168.0.188", "ollama"),
    )
}

# Component name rule: lowercase letters, digits and hyphens, starting with a
# letter or digit, 1-31 characters in total.
_COMPONENT_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,30}$")

# SSH timeout, in seconds.
SSH_TIMEOUT = 60
@dataclass
class HostRepairResult:
    """Outcome of one host-level repair attempt."""

    # Whether the repair succeeded.
    success: bool
    # Layer name the repair was requested for.
    layer: str
    # Component name the repair was requested for.
    component: str
    # Text returned by the repair command; empty when not provided.
    output: str = ""
    # Failure description; empty when not provided.
    error: str = ""
def get_ssh_config_for_layer(layer: str) -> dict:
    """Return the SSH connection settings registered for ``layer``.

    Args:
        layer: Repair layer name, looked up in ``LAYER_SSH_CONFIG``.

    Returns:
        The settings dict stored in ``LAYER_SSH_CONFIG`` for that layer.

    Raises:
        ValueError: If ``layer`` starts with "k8s" (those layers use kubectl,
            not SSH) or is not present in ``LAYER_SSH_CONFIG``.
    """
    # The prefix test also covers the exact name "k8s".
    if layer.startswith("k8s"):
        raise ValueError(f"Layer '{layer}' uses kubectl, not SSH")

    settings = LAYER_SSH_CONFIG.get(layer)
    if settings is not None:
        return settings
    raise ValueError(f"Unknown layer: '{layer}'")
def build_repair_command(component: str) -> str:
    """Build the remote repair command string for ``component``.

    The component name is validated against ``_COMPONENT_RE`` before it is
    embedded in the command, so that no unexpected characters can reach the
    remote side (command-injection guard).

    Args:
        component: Name of the component to repair.

    Returns:
        The command string ``repair:<component>``.

    Raises:
        ValueError: If ``component`` is not a string or does not match
            ``_COMPONENT_RE`` in its entirety.
    """
    # fullmatch() rather than match(): with match(), the "$" at the end of the
    # pattern also matches just before a trailing newline, so a name such as
    # "nginx" followed by a newline would pass validation.
    # The isinstance check turns a non-string argument into the same
    # ValueError instead of a TypeError raised by the regex engine.
    if not isinstance(component, str) or _COMPONENT_RE.fullmatch(component) is None:
        raise ValueError(f"Invalid component name: '{component}'")
    return f"repair:{component}"
class HostRepairAgent:
    """Run host-level repair commands over SSH."""

    async def repair(self, layer: str, component: str) -> HostRepairResult:
        """Run the repair for ``component`` on ``layer`` and report the outcome.

        Validation errors (``ValueError``), timeouts and any exception raised
        by the SSH call are converted into a failed ``HostRepairResult``
        instead of being propagated.

        Args:
            layer: Repair layer name used to look up the SSH settings.
            component: Name of the component to repair.

        Returns:
            A ``HostRepairResult``; ``success`` is True only when the command
            output starts with ``REPAIR_OK:``.
        """
        try:
            config = get_ssh_config_for_layer(layer)
            command = build_repair_command(component)
        except ValueError as e:
            return HostRepairResult(
                success=False,
                layer=layer,
                component=component,
                error=str(e),
            )

        try:
            output = await self._ssh_execute(
                host=config["host"],
                user=config["user"],
                key_path=config["key_path"],
                command=command,
            )
        except asyncio.TimeoutError:
            return HostRepairResult(
                success=False,
                layer=layer,
                component=component,
                error=f"SSH timeout after {SSH_TIMEOUT}s",
            )
        except Exception as e:
            # Some exceptions carry no message; fall back to repr() so the
            # result never reports a failure with an empty error.
            return HostRepairResult(
                success=False,
                layer=layer,
                component=component,
                error=str(e) or repr(e),
            )

        success = output.startswith("REPAIR_OK:")
        return HostRepairResult(
            success=success,
            layer=layer,
            component=component,
            output=output,
            error="" if success else output,
        )

    async def _ssh_execute(self, host: str, user: str, key_path: str, command: str) -> str:
        """Run ``command`` on ``user@host`` through the ``ssh`` client.

        ``key_path`` is supplied by the caller; it is not looked up here.

        Args:
            host: Target host.
            user: Login user.
            key_path: Path of the private key passed to ``ssh -i``.
            command: Command string sent to the remote side.

        Returns:
            The decoded, stripped stdout of the ssh process.

        Raises:
            asyncio.TimeoutError: If spawning ssh or waiting for it to finish
                exceeds the ``SSH_TIMEOUT`` budget.
            RuntimeError: If ssh exits with a non-zero status and produced no
                stdout; the message carries the exit status and stderr.
        """
        import time

        # One deadline is shared by process creation and the wait for output.
        deadline = time.monotonic() + SSH_TIMEOUT
        proc = await asyncio.wait_for(
            asyncio.create_subprocess_exec(
                "ssh",
                "-i", key_path,
                # NOTE(review): host key verification is disabled here, which
                # leaves the connection open to man-in-the-middle attacks;
                # consider a pinned known_hosts file.
                "-o", "StrictHostKeyChecking=no",
                "-o", "BatchMode=yes",
                "-o", f"ConnectTimeout={SSH_TIMEOUT}",
                f"{user}@{host}",
                command,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            ),
            timeout=SSH_TIMEOUT,
        )
        # Always leave at least one second for the process to report back.
        remaining = max(1.0, deadline - time.monotonic())
        try:
            stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=remaining)
        finally:
            # returncode is still None only when communicate() did not finish
            # (timeout or cancellation); kill and reap the child so no ssh
            # process is left behind.
            if proc.returncode is None:
                try:
                    proc.kill()
                except ProcessLookupError:
                    pass
                await proc.wait()

        # errors="replace": undecodable bytes must not abort the repair report.
        output = stdout.decode(errors="replace").strip()
        err_text = stderr.decode(errors="replace").strip()
        logger.info("SSH repair %s@%s %s -> %s", user, host, command, output)
        if proc.returncode != 0:
            logger.warning(
                "SSH repair %s@%s %s exited with status %s: %s",
                user, host, command, proc.returncode, err_text,
            )
            # With no stdout the caller would otherwise get an empty error;
            # surface the exit status and stderr instead.
            if not output:
                raise RuntimeError(
                    f"SSH exited with status {proc.returncode}: {err_text}"
                )
        return output