Files
ewoooc/services/auto_heal_service.py
ogt b2803c90be
All checks were successful
CD Pipeline / deploy (push) Successful in 1m16s
fix: DOCKER_RESTART 改走 SSH 跳板(110→188),修復 AIOps AutoHeal 閉環
根本原因:scheduler 容器內無 Docker socket,直接執行 docker restart 失敗。
修正:使用 SSHJumpExecutor(wooo@110 → ollama@188)透過跳板執行 docker restart。
SSH key:/app/config/autoheal_id_ed25519(rw mount 已存在)。
同步關閉 9 筆 2026-04-19 過期 DNS_FAIL incidents(根因已由網路修復解決)。

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-20 20:19:46 +08:00

393 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
auto_heal_service.py
ADR-013 AIOps auto-heal service.

SSHJumpExecutor: safely runs remote commands through an SSH jump host.
AutoHealService: PlayBook-driven auto-heal entry point.
"""
import atexit
import logging
import os
import re
import shlex
import subprocess
import tempfile
from typing import Dict, Any, List, Optional
logger = logging.getLogger(__name__)

# ── Safe patterns used for input validation ─────────────────────────────────
# Hostname / IPv4 literal: letters, digits, hyphen and dot (RFC 1123 style).
# The first and last characters must be alphanumeric, so a value can never
# start with "-" and be parsed by ssh as an option (e.g. -oProxyCommand=...).
# The pattern is anchored with \Z instead of $ because $ also matches just
# before a trailing newline, which would let "host\n" through.
_HOST_RE = re.compile(r'^[a-zA-Z0-9]([a-zA-Z0-9.\-]{0,253}[a-zA-Z0-9])?\Z')

# User name: letters, digits, underscore, dot, hyphen (POSIX-style login
# name), 1-32 characters, never starting with "-" or ".".
_USER_RE = re.compile(r'^[a-zA-Z0-9_][a-zA-Z0-9._-]{0,31}\Z')

# PlayBook action_type allowlist: anything outside this set is rejected
# before any action is executed.
ALLOWED_ACTION_TYPES = frozenset({
    'DOCKER_RESTART',
    'WAIT_RETRY',
    'ALERT_ONLY',
    'SSH_CMD',
})
def _validate_host(host: str) -> str:
    """Return *host* unchanged if it is a safe hostname / IPv4 literal.

    Guards against SSH option injection (for example a "host" such as
    ``-oProxyCommand=...``) by requiring the whole value to match
    ``_HOST_RE``.

    Raises:
        ValueError: if *host* is empty, not a string, or does not match.
    """
    # fullmatch (not match) so a trailing newline cannot slip past a
    # "$"-anchored pattern; the isinstance guard turns a non-string into the
    # documented ValueError instead of a TypeError from the regex engine.
    if not isinstance(host, str) or not _HOST_RE.fullmatch(host):
        raise ValueError(f"Invalid host: {host!r}")
    return host
def _validate_user(user: str) -> str:
    """Return *user* unchanged if it is a safe Unix user name.

    Raises:
        ValueError: if *user* is empty, not a string, or does not match
            ``_USER_RE`` in full.
    """
    # fullmatch (not match) so a trailing newline cannot slip past a
    # "$"-anchored pattern; non-strings are rejected with ValueError too.
    if not isinstance(user, str) or not _USER_RE.fullmatch(user):
        raise ValueError(f"Invalid user: {user!r}")
    return user
class SSHJumpExecutor:
    """Run commands on a remote host through an SSH jump host (``ssh -J``).

    Security notes:
        - ``jump_host`` / ``target_host`` and both user names go through
          ``_validate_host`` / ``_validate_user`` so they can never be
          parsed as SSH options.
        - ``command`` must be a list, never a string.  ssh(1) joins all
          trailing arguments with spaces and hands the result to the remote
          login shell, so every element is additionally quoted with
          ``shlex.quote`` to stay one literal word on the far side.
        - ``--`` is inserted before the command so nothing after it is
          parsed as an SSH option.
        - Inline key material is written to an owner-only (0600) temporary
          file that is removed by ``close()`` or at interpreter exit.

    NOTE(review): ``StrictHostKeyChecking=no`` disables host-key
    verification; consider a pinned known_hosts file instead.
    NOTE(review): per ssh(1), command-line options such as ``-i`` generally
    apply to the destination host, not to jump hosts -- confirm the jump hop
    can authenticate (agent / ssh_config) in the deployment.
    """

    def __init__(
        self,
        jump_host: str,
        jump_user: str,
        jump_key_path: Optional[str] = None,
        jump_key_data: Optional[str] = None,
        jump_port: int = 22,
        jump_connect_timeout: int = 5,
        jump_command_timeout: int = 60,
    ):
        """Validate and store the jump-host connection settings.

        Args:
            jump_host: Jump host name or IPv4 address (validated).
            jump_user: Login name on the jump host (validated).
            jump_key_path: Path to a private key file, if any.
            jump_key_data: Private key contents; when given it is written to
                a temporary file that takes precedence over jump_key_path.
            jump_port: SSH port of the jump host.
            jump_connect_timeout: ssh ConnectTimeout, in seconds.
            jump_command_timeout: Overall subprocess timeout, in seconds.

        Raises:
            ValueError: on an invalid host, user, or port.
        """
        self.jump_host = _validate_host(jump_host)
        self.jump_user = _validate_user(jump_user)
        self.jump_key_path = jump_key_path
        self.jump_key_data = jump_key_data
        self.jump_port = int(jump_port)
        if not 1 <= self.jump_port <= 65535:
            raise ValueError(f"Invalid jump_port: {jump_port!r}")
        self.jump_connect_timeout = int(jump_connect_timeout)
        self.jump_command_timeout = int(jump_command_timeout)
        self._tmp_key_path: Optional[str] = None
        if self.jump_key_data:
            self._tmp_key_path = self._write_temp_key(self.jump_key_data)

    @staticmethod
    def _remove_quietly(path: str) -> None:
        """Delete *path*, ignoring the case where it is already gone."""
        try:
            os.unlink(path)
        except OSError:
            # Best-effort cleanup: the file may already have been removed.
            pass

    @staticmethod
    def _write_temp_key(key_data: str) -> str:
        """Write key material to a 0600 temp file and return its path.

        The file is registered for removal at interpreter exit.  A trailing
        newline is appended when missing, because OpenSSH refuses to load
        key files that do not end with one.
        """
        fd, tmp_path = tempfile.mkstemp(prefix="ssh_key_")
        try:
            # fdopen takes ownership of fd and writes the whole payload
            # (a bare os.write may stop short).
            with os.fdopen(fd, "w", encoding="utf-8", newline="\n") as handle:
                handle.write(key_data)
                if not key_data.endswith("\n"):
                    handle.write("\n")
            os.chmod(tmp_path, 0o600)
        except BaseException:
            # Never leave a partially written private key behind.
            SSHJumpExecutor._remove_quietly(tmp_path)
            raise
        atexit.register(SSHJumpExecutor._remove_quietly, tmp_path)
        return tmp_path

    def close(self) -> None:
        """Remove the temporary key file now instead of at interpreter exit."""
        if self._tmp_key_path:
            self._remove_quietly(self._tmp_key_path)
            self._tmp_key_path = None

    def _make_env(self) -> Dict[str, str]:
        """Return the subprocess environment with interactive prompts disabled."""
        env = dict(os.environ)
        env["SSH_ASKPASS"] = "echo"
        env["DISPLAY"] = ""
        return env

    def _build_ssh_base_cmd(self) -> List[str]:
        """Build ``ssh`` plus the common options (no hosts, no command)."""
        base = [
            "ssh",
            "-o", "StrictHostKeyChecking=no",
            "-o", "BatchMode=yes",
            "-o", f"ConnectTimeout={self.jump_connect_timeout}",
            "-o", "ServerAliveInterval=15",
            "-o", "ServerAliveCountMax=3",
        ]
        key_path = self._tmp_key_path or self.jump_key_path
        if key_path:
            base.extend(["-i", key_path])
        return base

    def _build_argv(
        self,
        target_host: str,
        target_user: str,
        command: List[str],
        target_port: Optional[int] = None,
    ) -> List[str]:
        """Assemble the full local argv; inputs must already be validated."""
        argv = self._build_ssh_base_cmd()
        # The jump host's port belongs in the -J spec: a plain "-p" applies
        # to the final destination, not to the jump host.
        argv.extend(["-J", f"{self.jump_user}@{self.jump_host}:{self.jump_port}"])
        if target_port is not None:
            argv.extend(["-p", str(int(target_port))])
        argv.append(f"{target_user}@{target_host}")
        argv.append("--")  # stop SSH option parsing
        # ssh re-joins these words and passes them to the remote shell, so
        # each one is quoted to survive that parse as a single literal word.
        argv.extend(shlex.quote(part) for part in command)
        return argv

    def execute_command(
        self,
        target_host: str,
        target_user: str,
        command: List[str],
        target_port: Optional[int] = None,
    ) -> Dict[str, Any]:
        """Run *command* on the target host via the jump host.

        Args:
            target_host: Target host name or IPv4 address (validated).
            target_user: Login name on the target host (validated).
            command: Command and arguments as a list, e.g.
                ``['docker', 'restart', 'momo-app']``.  Strings are rejected.
            target_port: Optional SSH port of the target host; when omitted
                ssh's own default applies.

        Returns:
            Dict with ``success``, ``exit_code``, ``stdout``, ``stderr`` and
            ``command`` (the list as passed in).  Timeouts and other runtime
            failures are reported with ``success=False`` and ``exit_code=-1``.

        Raises:
            TypeError: if *command* is a string.
            ValueError: if *command* is empty or host/user are invalid.
        """
        if isinstance(command, str):
            raise TypeError(
                "command must be a list, not a string. "
                "Passing a string risks remote shell injection."
            )
        if not command:
            raise ValueError("command list cannot be empty")
        target_host = _validate_host(target_host)
        target_user = _validate_user(target_user)

        try:
            # Built inside the try so a malformed element (e.g. a non-string)
            # is still reported through the failure dict below.
            full_cmd = self._build_argv(
                target_host, target_user, command, target_port
            )
            result = subprocess.run(
                full_cmd,
                capture_output=True,
                text=True,
                timeout=self.jump_command_timeout,
                env=self._make_env(),
            )
        except subprocess.TimeoutExpired:
            return {
                "success": False,
                "exit_code": -1,
                "stdout": "",
                "stderr": "SSH command timed out",
                "command": command,
            }
        except Exception as exc:
            logger.warning("SSH jump execution failed: %s", exc, exc_info=True)
            return {
                "success": False,
                "exit_code": -1,
                "stdout": "",
                "stderr": str(exc),
                "command": command,
            }
        return {
            "success": result.returncode == 0,
            "exit_code": result.returncode,
            "stdout": result.stdout.strip(),
            "stderr": result.stderr.strip(),
            "command": command,
        }
class AutoHealService:
    """ADR-013 PlayBook-driven auto-heal service.

    Supported action types (see ``ALLOWED_ACTION_TYPES``):
        DOCKER_RESTART -- restart a Docker container on the Docker host,
                          reached over the SSH jump host.
        WAIT_RETRY     -- report a wait period; performs no system action.
        ALERT_ONLY     -- report a message; performs no system action.
        SSH_CMD        -- run the playbook's ``argv`` list on ``host`` over a
                          direct SSH connection.
    """

    # Fixed command prefix for DOCKER_RESTART, so a playbook can only supply
    # the container name and never the command itself.
    _DOCKER_RESTART_CMD = ["docker", "restart"]

    # SSH route used by DOCKER_RESTART: jump host first, then the Docker host.
    _JUMP_HOST = "192.168.0.110"
    _JUMP_USER = "wooo"
    _DOCKER_HOST = "192.168.0.188"
    _DOCKER_USER = "ollama"

    # WAIT_RETRY defaults / upper bound, in minutes.
    _DEFAULT_WAIT_MINUTES = 5
    _MAX_WAIT_MINUTES = 30

    # Container names must start with an alphanumeric character, so a value
    # such as "--help" can never be read by docker as an option.
    _CONTAINER_RE = re.compile(r'[a-zA-Z0-9][a-zA-Z0-9._-]*')

    def handle_exception(
        self,
        error_type: str,
        context: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Look up the playbook for *error_type* and run its action.

        Args:
            error_type: Error type string, e.g. ``'resource_pressure'``.
            context: Optional trigger context (logged; passed on to the
                action executor).

        Returns:
            Result dict with ``success``, ``action`` and ``message``.
        """
        context = context or {}
        logger.info(
            "[AutoHeal] handle_exception: error_type=%s context=%s",
            error_type, context
        )

        playbook = self._find_playbook(error_type)
        if not playbook:
            logger.info("[AutoHeal] No matching playbook for: %s", error_type)
            return {
                "success": False,
                "action": None,
                "message": f"No playbook matched for error_type={error_type}",
            }

        action_type = playbook.get("action_type", "")
        if action_type not in ALLOWED_ACTION_TYPES:
            logger.warning(
                "[AutoHeal] Playbook action_type not in allowlist: %s", action_type
            )
            return {
                "success": False,
                "action": action_type,
                "message": f"action_type '{action_type}' is not allowed",
            }
        return self._execute_action(action_type, playbook, context)

    def _find_playbook(self, error_type: str) -> Optional[Dict[str, Any]]:
        """Return the first active playbook for *error_type* as a dict.

        Returns None when nothing matches or when the lookup fails; failures
        are logged with a traceback rather than raised.
        """
        try:
            # Imported lazily so this module can be loaded without the
            # database layer being importable.
            from database.manager import get_session
            from database.autoheal_models import Playbook
            session = get_session()
            try:
                pb = (
                    session.query(Playbook)
                    .filter(
                        Playbook.error_type == error_type,
                        Playbook.is_active.is_(True),
                    )
                    .first()
                )
                if pb:
                    return {
                        "id": pb.id,
                        "name": pb.name,
                        "action_type": pb.action_type,
                        "action_params": pb.get_action_params(),
                        "max_retries": pb.max_retries,
                        "cooldown_min": pb.cooldown_min,
                    }
            finally:
                session.close()
        except Exception as e:
            logger.error("[AutoHeal] Playbook lookup failed: %s", e, exc_info=True)
        return None

    def _execute_action(
        self,
        action_type: str,
        playbook: Dict[str, Any],
        context: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Dispatch *action_type* to its handler and return the result dict.

        ``context`` is accepted for interface compatibility; none of the
        current handlers read it.
        """
        # action_params may be missing or None; treat both as "no params".
        params = playbook.get("action_params") or {}
        if not isinstance(params, dict):
            return {
                "success": False,
                "action": action_type,
                "message": "Playbook action_params must be a mapping",
            }

        handlers = {
            "WAIT_RETRY": self._do_wait_retry,
            "ALERT_ONLY": self._do_alert_only,
            "DOCKER_RESTART": self._do_docker_restart,
            "SSH_CMD": self._do_ssh_cmd,
        }
        handler = handlers.get(action_type)
        if handler is None:
            return {
                "success": False,
                "action": action_type,
                "message": f"Unhandled action_type: {action_type}",
            }
        return handler(playbook, params)

    def _do_wait_retry(
        self, playbook: Dict[str, Any], params: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Report the wait period for WAIT_RETRY (does not sleep itself)."""
        try:
            wait_min = int(params.get("wait_minutes", self._DEFAULT_WAIT_MINUTES))
        except (TypeError, ValueError, OverflowError):
            # A malformed value falls back to the default instead of crashing.
            wait_min = self._DEFAULT_WAIT_MINUTES
        wait_min = max(0, min(wait_min, self._MAX_WAIT_MINUTES))
        return {
            "success": True,
            "action": "WAIT_RETRY",
            "message": f"Waiting {wait_min} min before retry (playbook: {playbook.get('name')})",
        }

    def _do_alert_only(
        self, playbook: Dict[str, Any], params: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Return the playbook's alert message without performing any action."""
        return {
            "success": True,
            "action": "ALERT_ONLY",
            "message": params.get("message", "Alert sent"),
        }

    def _do_docker_restart(
        self, playbook: Dict[str, Any], params: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Restart ``params['container']`` on the Docker host via the jump host."""
        container = params.get("container")
        if not container:
            return {
                "success": False,
                "action": "DOCKER_RESTART",
                "message": "Playbook missing 'container' in action_params",
            }
        if not isinstance(container, str) or not self._CONTAINER_RE.fullmatch(container):
            return {"success": False, "action": "DOCKER_RESTART",
                    "message": f"Container name contains unsafe chars: {container!r}"}

        # There is no Docker socket inside this container, so the restart is
        # run on the host over SSH (ADR-013, DOCKER_RESTART).
        key_path = os.path.normpath(
            os.path.join(os.path.dirname(__file__), '..', 'config', 'autoheal_id_ed25519')
        )
        if not os.path.exists(key_path):
            logger.warning("[AutoHeal] SSH key 不存在: %s,降級為 ALERT_ONLY", key_path)
            return {
                "success": False,
                "action": "DOCKER_RESTART",
                "message": f"SSH key 不存在: {key_path},請確認 config/autoheal_id_ed25519 已掛載",
            }

        try:
            executor = SSHJumpExecutor(
                jump_host=self._JUMP_HOST,
                jump_user=self._JUMP_USER,
                jump_key_path=key_path,
                jump_connect_timeout=10,
                jump_command_timeout=60,
            )
            result = executor.execute_command(
                target_host=self._DOCKER_HOST,
                target_user=self._DOCKER_USER,
                command=[*self._DOCKER_RESTART_CMD, container],
            )
        except Exception as e:
            logger.error("[AutoHeal] DOCKER_RESTART SSH 失敗: %s", e)
            return {"success": False, "action": "DOCKER_RESTART", "message": f"SSH 執行例外: {e}"}

        success = result.get("success", False)
        if success:
            msg = f"容器 {container} 重啟成功(SSH 跳板)"
        else:
            msg = f"容器重啟失敗: {result.get('stderr','')[:200]}"
        return {"success": success, "action": "DOCKER_RESTART", "message": msg}

    def _do_ssh_cmd(
        self, playbook: Dict[str, Any], params: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Run ``params['argv']`` on ``params['host']`` over direct SSH (no jump host)."""
        argv = params.get("argv")
        if not isinstance(argv, list) or not argv:
            return {
                "success": False,
                "action": "SSH_CMD",
                "message": "Playbook SSH_CMD requires action_params.argv (list)",
            }
        if not all(isinstance(part, str) for part in argv):
            return {
                "success": False,
                "action": "SSH_CMD",
                "message": "Playbook SSH_CMD argv entries must all be strings",
            }

        host = params.get("host", "")
        user = params.get("user", "ollama")
        try:
            _validate_host(host)
            _validate_user(user)
        except (TypeError, ValueError) as e:
            return {"success": False, "action": "SSH_CMD", "message": str(e)}

        # ssh joins the trailing words and hands them to the remote shell, so
        # each argv entry is quoted to reach the remote side as one literal
        # word instead of being re-parsed (";", "|", "$(...)", ...).
        ssh_cmd = [
            "ssh",
            "-o", "StrictHostKeyChecking=no",
            "-o", "BatchMode=yes",
            "-o", "ConnectTimeout=10",
            f"{user}@{host}",
            "--",
            *(shlex.quote(part) for part in argv),
        ]
        try:
            result = subprocess.run(
                ssh_cmd, capture_output=True, text=True, timeout=60
            )
        except Exception as e:
            logger.warning("[AutoHeal] SSH_CMD failed: %s", e, exc_info=True)
            return {"success": False, "action": "SSH_CMD", "message": str(e)}
        return {
            "success": result.returncode == 0,
            "action": "SSH_CMD",
            "message": result.stdout.strip() or result.stderr.strip(),
        }