fix: DOCKER_RESTART 改走 SSH 跳板(110→188),修復 AIOps AutoHeal 閉環
All checks were successful
CD Pipeline / deploy (push) Successful in 1m16s
All checks were successful
CD Pipeline / deploy (push) Successful in 1m16s
根本原因:scheduler 容器內沒有 Docker socket,因此直接執行 docker restart 會失敗。
修正:改用 SSHJumpExecutor(wooo@110 → ollama@188),透過跳板機執行 docker restart。
SSH key:/app/config/autoheal_id_ed25519(rw mount 已存在)。
同步關閉 9 筆 2026-04-19 已過期的 DNS_FAIL incidents(根因已由網路修復解決)。

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -306,28 +306,45 @@ class AutoHealService:
|
||||
"action": "DOCKER_RESTART",
|
||||
"message": "Playbook missing 'container' in action_params",
|
||||
}
|
||||
# 安全:命令為靜態 list,container 通過驗證正則
|
||||
try:
|
||||
safe_container = re.sub(r'[^a-zA-Z0-9._-]', '', container)
|
||||
if safe_container != container:
|
||||
raise ValueError(f"Container name contains unsafe chars: {container!r}")
|
||||
cmd = self._DOCKER_RESTART_CMD + [safe_container]
|
||||
result = subprocess.run(
|
||||
cmd, capture_output=True, text=True, timeout=60
|
||||
)
|
||||
success = result.returncode == 0
|
||||
safe_container = re.sub(r'[^a-zA-Z0-9._-]', '', container)
|
||||
if safe_container != container:
|
||||
return {"success": False, "action": "DOCKER_RESTART",
|
||||
"message": f"Container name contains unsafe chars: {container!r}"}
|
||||
|
||||
# 透過 SSH 跳板(110→188)執行 docker restart(ADR-013 §DOCKER_RESTART)
|
||||
# 容器內無 Docker socket,必須 SSH 到宿主機執行
|
||||
key_path = os.path.join(os.path.dirname(__file__), '..', 'config', 'autoheal_id_ed25519')
|
||||
key_path = os.path.normpath(key_path)
|
||||
if not os.path.exists(key_path):
|
||||
logger.warning("[AutoHeal] SSH key 不存在: %s,降級為 ALERT_ONLY", key_path)
|
||||
return {
|
||||
"success": success,
|
||||
"success": False,
|
||||
"action": "DOCKER_RESTART",
|
||||
"message": (
|
||||
f"Restarted {safe_container}"
|
||||
if success else
|
||||
f"Restart failed: {result.stderr[:200]}"
|
||||
),
|
||||
"message": f"SSH key 不存在: {key_path},請確認 config/autoheal_id_ed25519 已掛載",
|
||||
}
|
||||
executor = SSHJumpExecutor(
|
||||
jump_host="192.168.0.110",
|
||||
jump_user="wooo",
|
||||
jump_key_path=key_path,
|
||||
jump_connect_timeout=10,
|
||||
jump_command_timeout=60,
|
||||
)
|
||||
try:
|
||||
result = executor.execute_command(
|
||||
target_host="192.168.0.188",
|
||||
target_user="ollama",
|
||||
command=["docker", "restart", safe_container],
|
||||
)
|
||||
success = result.get("success", False)
|
||||
msg = (
|
||||
f"容器 {safe_container} 重啟成功(SSH 跳板)"
|
||||
if success else
|
||||
f"容器重啟失敗: {result.get('stderr','')[:200]}"
|
||||
)
|
||||
return {"success": success, "action": "DOCKER_RESTART", "message": msg}
|
||||
except Exception as e:
|
||||
logger.error("[AutoHeal] DOCKER_RESTART failed: %s", e)
|
||||
return {"success": False, "action": "DOCKER_RESTART", "message": str(e)}
|
||||
logger.error("[AutoHeal] DOCKER_RESTART SSH 失敗: %s", e)
|
||||
return {"success": False, "action": "DOCKER_RESTART", "message": f"SSH 執行例外: {e}"}
|
||||
|
||||
if action_type == "SSH_CMD":
|
||||
# SSH_CMD:命令必須以 list 形式存在 action_params['argv']
|
||||
|
||||
Reference in New Issue
Block a user