fix: DOCKER_RESTART 改走 SSH 跳板(110→188),修復 AIOps AutoHeal 閉環
All checks were successful
CD Pipeline / deploy (push) Successful in 1m16s

根本原因:scheduler 容器內無 Docker socket,直接執行 docker restart 失敗。
修正:使用 SSHJumpExecutor(wooo@110 → ollama@188)透過跳板執行 docker restart。
SSH key:/app/config/autoheal_id_ed25519(rw mount 已存在)。
同步關閉 9 筆 2026-04-19 過期 DNS_FAIL incidents(根因已由網路修復解決)。

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
ogt
2026-04-20 20:19:46 +08:00
parent 34620b7b04
commit b2803c90be

View File

@@ -306,28 +306,45 @@ class AutoHealService:
"action": "DOCKER_RESTART",
"message": "Playbook missing 'container' in action_params",
}
# 安全:命令為靜態 list,container 通過驗證正則
try:
safe_container = re.sub(r'[^a-zA-Z0-9._-]', '', container)
if safe_container != container:
raise ValueError(f"Container name contains unsafe chars: {container!r}")
cmd = self._DOCKER_RESTART_CMD + [safe_container]
result = subprocess.run(
cmd, capture_output=True, text=True, timeout=60
)
success = result.returncode == 0
safe_container = re.sub(r'[^a-zA-Z0-9._-]', '', container)
if safe_container != container:
return {"success": False, "action": "DOCKER_RESTART",
"message": f"Container name contains unsafe chars: {container!r}"}
# 透過 SSH 跳板(110→188)執行 docker restart(ADR-013 §DOCKER_RESTART)
# 容器內無 Docker socket,必須 SSH 到宿主機執行
key_path = os.path.join(os.path.dirname(__file__), '..', 'config', 'autoheal_id_ed25519')
key_path = os.path.normpath(key_path)
if not os.path.exists(key_path):
logger.warning("[AutoHeal] SSH key 不存在: %s,降級為 ALERT_ONLY", key_path)
return {
"success": success,
"success": False,
"action": "DOCKER_RESTART",
"message": (
f"Restarted {safe_container}"
if success else
f"Restart failed: {result.stderr[:200]}"
),
"message": f"SSH key 不存在: {key_path},請確認 config/autoheal_id_ed25519 已掛載",
}
executor = SSHJumpExecutor(
jump_host="192.168.0.110",
jump_user="wooo",
jump_key_path=key_path,
jump_connect_timeout=10,
jump_command_timeout=60,
)
try:
result = executor.execute_command(
target_host="192.168.0.188",
target_user="ollama",
command=["docker", "restart", safe_container],
)
success = result.get("success", False)
msg = (
f"容器 {safe_container} 重啟成功SSH 跳板)"
if success else
f"容器重啟失敗: {result.get('stderr','')[:200]}"
)
return {"success": success, "action": "DOCKER_RESTART", "message": msg}
except Exception as e:
logger.error("[AutoHeal] DOCKER_RESTART failed: %s", e)
return {"success": False, "action": "DOCKER_RESTART", "message": str(e)}
logger.error("[AutoHeal] DOCKER_RESTART SSH 失敗: %s", e)
return {"success": False, "action": "DOCKER_RESTART", "message": f"SSH 執行例外: {e}"}
if action_type == "SSH_CMD":
# SSH_CMD:命令必須以 list 形式存在 action_params['argv']