From b2803c90be8469703a7bef257f5da010a90552ce Mon Sep 17 00:00:00 2001 From: ogt Date: Mon, 20 Apr 2026 20:19:46 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20DOCKER=5FRESTART=20=E6=94=B9=E8=B5=B0=20?= =?UTF-8?q?SSH=20=E8=B7=B3=E6=9D=BF=EF=BC=88110=E2=86=92188=EF=BC=89?= =?UTF-8?q?=EF=BC=8C=E4=BF=AE=E5=BE=A9=20AIOps=20AutoHeal=20=E9=96=89?= =?UTF-8?q?=E7=92=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 根本原因:scheduler 容器內無 Docker socket,直接執行 docker restart 失敗。 修正:使用 SSHJumpExecutor(wooo@110 → ollama@188)透過跳板執行 docker restart。 SSH key:/app/config/autoheal_id_ed25519(rw mount 已存在)。 同步關閉 9 筆 2026-04-19 過期 DNS_FAIL incidents(根因已由網路修復解決)。 Co-Authored-By: Claude Sonnet 4.6 --- services/auto_heal_service.py | 53 +++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/services/auto_heal_service.py b/services/auto_heal_service.py index c6628dd..175e004 100644 --- a/services/auto_heal_service.py +++ b/services/auto_heal_service.py @@ -306,28 +306,45 @@ class AutoHealService: "action": "DOCKER_RESTART", "message": "Playbook missing 'container' in action_params", } - # 安全:命令為靜態 list,container 通過驗證正則 - try: - safe_container = re.sub(r'[^a-zA-Z0-9._-]', '', container) - if safe_container != container: - raise ValueError(f"Container name contains unsafe chars: {container!r}") - cmd = self._DOCKER_RESTART_CMD + [safe_container] - result = subprocess.run( - cmd, capture_output=True, text=True, timeout=60 - ) - success = result.returncode == 0 + safe_container = re.sub(r'[^a-zA-Z0-9._-]', '', container) + if safe_container != container: + return {"success": False, "action": "DOCKER_RESTART", + "message": f"Container name contains unsafe chars: {container!r}"} + + # 透過 SSH 跳板(110→188)執行 docker restart(ADR-013 §DOCKER_RESTART) + # 容器內無 Docker socket,必須 SSH 到宿主機執行 + key_path = os.path.join(os.path.dirname(__file__), '..', 'config', 'autoheal_id_ed25519') + key_path = os.path.normpath(key_path) + if not os.path.exists(key_path): + logger.warning("[AutoHeal] SSH key 不存在: %s,降級為 ALERT_ONLY", key_path) return { - "success": success, + "success": False, "action": "DOCKER_RESTART", - "message": ( - f"Restarted {safe_container}" - if success else - f"Restart failed: {result.stderr[:200]}" - ), + "message": f"SSH key 不存在: {key_path},請確認 config/autoheal_id_ed25519 已掛載", } + executor = SSHJumpExecutor( + jump_host="192.168.0.110", + jump_user="wooo", + jump_key_path=key_path, + jump_connect_timeout=10, + jump_command_timeout=60, + ) + try: + result = executor.execute_command( + target_host="192.168.0.188", + target_user="ollama", + command=["docker", "restart", safe_container], + ) + success = result.get("success", False) + msg = ( + f"容器 {safe_container} 重啟成功(SSH 跳板)" + if success else + f"容器重啟失敗: {result.get('stderr','')[:200]}" + ) + return {"success": success, "action": "DOCKER_RESTART", "message": msg} except Exception as e: - logger.error("[AutoHeal] DOCKER_RESTART failed: %s", e) - return {"success": False, "action": "DOCKER_RESTART", "message": str(e)} + logger.error("[AutoHeal] DOCKER_RESTART SSH 失敗: %s", e) + return {"success": False, "action": "DOCKER_RESTART", "message": f"SSH 執行例外: {e}"} if action_type == "SSH_CMD": # SSH_CMD:命令必須以 list 形式存在 action_params['argv']