All checks were successful
CD Pipeline / deploy (push) Successful in 1m16s
根本原因:scheduler 容器內無 Docker socket,直接執行 docker restart 失敗。 修正:使用 SSHJumpExecutor(wooo@110 → ollama@188)透過跳板執行 docker restart。 SSH key:/app/config/autoheal_id_ed25519(rw mount 已存在)。 同步關閉 9 筆 2026-04-19 過期 DNS_FAIL incidents(根因已由網路修復解決)。 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
393 lines
14 KiB
Python
393 lines
14 KiB
Python
"""
|
||
auto_heal_service.py
|
||
ADR-013 AIOps 自動修復服務。
|
||
|
||
SSHJumpExecutor:通過跳板機安全執行遠端命令。
|
||
AutoHealService:PlayBook 驅動的自動修復主服務。
|
||
"""
|
||
import atexit
import logging
import os
import re
import shlex
import subprocess
import tempfile
from typing import Dict, Any, List, Optional
|
||
|
||
logger = logging.getLogger(__name__)

# ── Safe regular expressions for input validation ───────────────────────────
# Both patterns are anchored with \Z rather than $: in Python, $ also matches
# just before a trailing newline, so "host\n" would otherwise be accepted.

# Hostname / IPv4 literal: letters, digits, hyphen and dot; the first and last
# characters must be alphanumeric, so a value can never start with "-" and be
# mistaken for an ssh option.
_HOST_RE = re.compile(r'^[a-zA-Z0-9]([a-zA-Z0-9.\-]{0,253}[a-zA-Z0-9])?\Z')

# Username: first character is a letter, digit or underscore; the remaining
# (up to 31) characters may also be dot or hyphen.
_USER_RE = re.compile(r'^[a-zA-Z0-9_][a-zA-Z0-9._-]{0,31}\Z')

# Allowlist of playbook action_type values (blocks arbitrary action names
# coming from playbook records).
ALLOWED_ACTION_TYPES = frozenset({
    'DOCKER_RESTART',
    'WAIT_RETRY',
    'ALERT_ONLY',
    'SSH_CMD',
})
|
||
|
||
|
||
def _validate_host(host: str) -> str:
    """Return *host* unchanged if it is a safe hostname or IPv4 literal.

    Guards against SSH option injection (e.g. ``-o ProxyCommand=...``) by
    only accepting values that match ``_HOST_RE`` in full.

    Args:
        host: Hostname or IP address to check.

    Returns:
        The same string that was passed in.

    Raises:
        ValueError: If *host* is not a string, is empty, or does not match.
    """
    # fullmatch() requires the whole string to be consumed, so a value with a
    # trailing newline is rejected even if the pattern is anchored with "$".
    # The isinstance() check turns non-string input (e.g. a number read from
    # playbook parameters) into ValueError instead of a TypeError from `re`.
    if not isinstance(host, str) or not _HOST_RE.fullmatch(host):
        raise ValueError(f"Invalid host: {host!r}")
    return host
|
||
|
||
|
||
def _validate_user(user: str) -> str:
    """Return *user* unchanged if it is an acceptable Unix user name.

    Args:
        user: User name to check against ``_USER_RE``.

    Returns:
        The same string that was passed in.

    Raises:
        ValueError: If *user* is not a string, is empty, or does not match.
    """
    # fullmatch() requires the whole string to be consumed, so a value with a
    # trailing newline is rejected even if the pattern is anchored with "$".
    # Non-string input is reported as ValueError rather than a TypeError
    # raised from inside `re`.
    if not isinstance(user, str) or not _USER_RE.fullmatch(user):
        raise ValueError(f"Invalid user: {user!r}")
    return user
|
||
|
||
|
||
class SSHJumpExecutor:
    """Run commands on a remote host through an SSH jump host.

    Security notes:
    - ``jump_host`` / ``target_host`` and both user names are validated before
      they are placed on the ``ssh`` command line, so they cannot be taken for
      options (e.g. ``-oProxyCommand=...``).
    - ``command`` must be a list of arguments, never a string. ``ssh`` joins
      everything after the destination with spaces and hands the result to the
      remote user's shell, so each element is shell-quoted before it is sent;
      the remote side then sees exactly one word per list element.
    - ``--`` precedes the remote command so ``ssh`` stops option parsing there.
    - Inline key material is written to a private temporary file that is
      removed when the interpreter exits.

    NOTE(review): per ssh(1), options given on the command line (``-i``,
    ``-o ...``) apply to the final destination and not to the ``-J`` hop; the
    jump-host connection takes its settings from ssh_config. Confirm that the
    deployment configures the jump host accordingly.
    """

    def __init__(
        self,
        jump_host: str,
        jump_user: str,
        jump_key_path: Optional[str] = None,
        jump_key_data: Optional[str] = None,
        jump_port: int = 22,
        jump_connect_timeout: int = 5,
        jump_command_timeout: int = 60,
    ):
        """Validate and store the jump-host settings.

        Args:
            jump_host: Hostname or IP of the jump host (validated).
            jump_user: Login name on the jump host (validated).
            jump_key_path: Path of a private key file passed to ``ssh -i``.
            jump_key_data: Private key contents; when given, they are written
                to a temporary file which takes precedence over
                ``jump_key_path``.
            jump_port: SSH port of the jump host.
            jump_connect_timeout: Value for ssh's ``ConnectTimeout`` (seconds).
            jump_command_timeout: Timeout for the whole ``ssh`` subprocess
                (seconds).

        Raises:
            ValueError: If ``jump_host`` or ``jump_user`` fails validation.
        """
        self.jump_host = _validate_host(jump_host)
        self.jump_user = _validate_user(jump_user)
        self.jump_key_path = jump_key_path
        self.jump_key_data = jump_key_data
        self.jump_port = int(jump_port)
        self.jump_connect_timeout = int(jump_connect_timeout)
        self.jump_command_timeout = int(jump_command_timeout)
        self._tmp_key_path: Optional[str] = None

        if self.jump_key_data:
            self._tmp_key_path = self._write_temp_key(self.jump_key_data)

    @staticmethod
    def _remove_file(path: str) -> None:
        """Delete *path*, ignoring the case where it is already gone."""
        try:
            os.unlink(path)
        except FileNotFoundError:
            pass

    @staticmethod
    def _write_temp_key(key_data: str) -> str:
        """Write a private key to a temporary file and return the file's path.

        The file is created by ``tempfile.mkstemp``, which makes it readable
        and writable by the creating user only. Removal of the file is
        registered with ``atexit``.

        Args:
            key_data: Private key text.

        Returns:
            Path of the temporary key file.
        """
        # Key files that lack a final newline are a common cause of OpenSSH
        # "invalid format" load errors, so make sure one is present.
        payload = key_data if key_data.endswith("\n") else key_data + "\n"
        fd, tmp_path = tempfile.mkstemp(prefix="ssh_key_")
        try:
            # The file object owns fd from here on and writes the full buffer
            # (a bare os.write() may write only part of it).
            with os.fdopen(fd, "wb") as handle:
                handle.write(payload.encode())
        except Exception:
            # Do not leave (partial) key material behind on failure.
            SSHJumpExecutor._remove_file(tmp_path)
            raise
        atexit.register(SSHJumpExecutor._remove_file, tmp_path)
        return tmp_path

    @staticmethod
    def _quote_remote_command(command: List[str]) -> str:
        """Join *command* into one shell-safe string for the remote side.

        ``ssh`` concatenates its trailing arguments with spaces and the remote
        shell re-parses the result; quoting every element keeps shell
        metacharacters inside an argument from being interpreted.
        """
        return " ".join(shlex.quote(part) for part in command)

    def _jump_spec(self) -> str:
        """Return the ``-J`` argument: ``user@host:port`` of the jump host."""
        return f"{self.jump_user}@{self.jump_host}:{self.jump_port}"

    def _make_env(self) -> Dict[str, str]:
        """Return a copy of the process environment for the ssh subprocess.

        ``SSH_ASKPASS`` is set to ``echo`` and ``DISPLAY`` is emptied.
        NOTE(review): presumably so that ssh can never start an interactive
        password helper — confirm.
        """
        env = dict(os.environ)
        env["SSH_ASKPASS"] = "echo"
        env["DISPLAY"] = ""
        return env

    def _build_ssh_base_cmd(self) -> List[str]:
        """Build ``ssh`` plus its options (no jump spec, destination or command).

        The jump host's port is deliberately not passed as ``-p`` here: ``-p``
        sets the port of the final destination, so the jump port is carried in
        the ``-J`` specification instead (see ``_jump_spec``).
        """
        base = [
            "ssh",
            # NOTE(review): host key verification is disabled, which leaves
            # the connection open to man-in-the-middle attacks; consider
            # pinning known hosts.
            "-o", "StrictHostKeyChecking=no",
            "-o", "BatchMode=yes",
            "-o", f"ConnectTimeout={self.jump_connect_timeout}",
            "-o", "ServerAliveInterval=15",
            "-o", "ServerAliveCountMax=3",
        ]
        # A key written from inline data takes precedence over a key path.
        key_path = self._tmp_key_path or self.jump_key_path
        if key_path:
            base.extend(["-i", key_path])
        return base

    def execute_command(
        self,
        target_host: str,
        target_user: str,
        command: List[str],  # a LIST; strings are rejected
    ) -> Dict[str, Any]:
        """Run *command* on the target host through the jump host.

        Args:
            target_host: Hostname or IP of the target (validated).
            target_user: Login name on the target (validated).
            command: Command and arguments as a list, e.g.
                ``['docker', 'restart', 'momo-app']``. Each element is
                shell-quoted so it reaches the remote command as one word.

        Returns:
            A dict with ``success`` (exit code was 0), ``exit_code``,
            ``stdout``, ``stderr`` (both stripped) and ``command`` (the list
            that was passed in). On timeout or any other failure to run ssh,
            ``success`` is False, ``exit_code`` is -1 and ``stderr`` carries
            the reason.

        Raises:
            TypeError: If *command* is a string.
            ValueError: If *command* is empty, or the host/user is invalid.
        """
        if isinstance(command, str):
            raise TypeError(
                "command must be a list, not a string. "
                "Passing a string risks remote shell injection."
            )
        if not command:
            raise ValueError("command list cannot be empty")

        target_host = _validate_host(target_host)
        target_user = _validate_user(target_user)

        try:
            full_cmd = self._build_ssh_base_cmd()
            full_cmd.extend([
                "-J", self._jump_spec(),
                f"{target_user}@{target_host}",
                "--",  # stop ssh option parsing
                self._quote_remote_command(command),
            ])
            result = subprocess.run(
                full_cmd,
                capture_output=True,
                text=True,
                timeout=self.jump_command_timeout,
                env=self._make_env(),
            )
            return {
                "success": result.returncode == 0,
                "exit_code": result.returncode,
                "stdout": result.stdout.strip(),
                "stderr": result.stderr.strip(),
                "command": command,
            }
        except subprocess.TimeoutExpired:
            return {
                "success": False,
                "exit_code": -1,
                "stdout": "",
                "stderr": "SSH command timed out",
                "command": command,
            }
        except Exception as exc:
            logger.warning("SSH jump execution failed: %s", exc, exc_info=True)
            return {
                "success": False,
                "exit_code": -1,
                "stdout": "",
                "stderr": str(exc),
                "command": command,
            }
|
||
|
||
|
||
class AutoHealService:
    """ADR-013 playbook-driven auto-heal service.

    Supported action types (see ``ALLOWED_ACTION_TYPES``):
        DOCKER_RESTART - restart a Docker container on the Docker host via SSH
        WAIT_RETRY     - report a wait-then-retry decision (no system operation)
        ALERT_ONLY     - return the playbook's alert message, execute nothing
        SSH_CMD        - run the argv list stored in the playbook over SSH
    """

    # Fixed command prefix for DOCKER_RESTART; only a validated container name
    # is appended, so a playbook cannot supply an arbitrary command.
    _DOCKER_RESTART_CMD = ["docker", "restart"]

    # Accepted container names: must start with an alphanumeric character, so
    # a value such as "--help" can never be read by docker as an option.
    _CONTAINER_NAME_RE = re.compile(r'[a-zA-Z0-9][a-zA-Z0-9_.-]*')

    # WAIT_RETRY bounds (minutes).
    _DEFAULT_WAIT_MINUTES = 5
    _MAX_WAIT_MINUTES = 30

    # SSH route used by DOCKER_RESTART: jump host, then the Docker host.
    _JUMP_HOST = "192.168.0.110"
    _JUMP_USER = "wooo"
    _DOCKER_HOST = "192.168.0.188"
    _DOCKER_USER = "ollama"

    def handle_exception(
        self,
        error_type: str,
        context: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Look up the playbook for *error_type* and run its action.

        Args:
            error_type: Error type string (e.g. ``'resource_pressure'``).
            context: Optional trigger context; it is logged and passed on to
                ``_execute_action``.

        Returns:
            Result dict with ``success``, ``action`` and ``message``.
        """
        context = context or {}
        logger.info(
            "[AutoHeal] handle_exception: error_type=%s context=%s",
            error_type, context
        )

        playbook = self._find_playbook(error_type)
        if not playbook:
            logger.info("[AutoHeal] No matching playbook for: %s", error_type)
            return {
                "success": False,
                "action": None,
                "message": f"No playbook matched for error_type={error_type}",
            }

        action_type = playbook.get("action_type", "")
        if action_type not in ALLOWED_ACTION_TYPES:
            logger.warning(
                "[AutoHeal] Playbook action_type not in allowlist: %s", action_type
            )
            return {
                "success": False,
                "action": action_type,
                "message": f"action_type '{action_type}' is not allowed",
            }

        return self._execute_action(action_type, playbook, context)

    def _find_playbook(self, error_type: str) -> Optional[Dict[str, Any]]:
        """Return the first active playbook for *error_type* as a plain dict.

        Returns None when nothing matches or when the lookup fails (the
        failure is logged, not raised).
        """
        try:
            # Imported here rather than at module level, as in the original.
            from database.manager import get_session
            from database.autoheal_models import Playbook

            session = get_session()
            try:
                pb = (
                    session.query(Playbook)
                    .filter(
                        Playbook.error_type == error_type,
                        Playbook.is_active.is_(True),
                    )
                    .first()
                )
                if pb:
                    # Copy the fields out so the result stays usable after
                    # the session is closed.
                    return {
                        "id": pb.id,
                        "name": pb.name,
                        "action_type": pb.action_type,
                        "action_params": pb.get_action_params(),
                        "max_retries": pb.max_retries,
                        "cooldown_min": pb.cooldown_min,
                    }
            finally:
                session.close()
        except Exception as e:
            logger.error("[AutoHeal] Playbook lookup failed: %s", e, exc_info=True)
        return None

    def _execute_action(
        self,
        action_type: str,
        playbook: Dict[str, Any],
        context: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Dispatch *action_type* to its handler and return the result dict.

        Args:
            action_type: One of the supported action types.
            playbook: Playbook dict; ``action_params`` and ``name`` are read.
            context: Trigger context (currently not used by any handler).

        Returns:
            Result dict with ``success``, ``action`` and ``message``.
        """
        # action_params may be missing, None, or not a mapping at all.
        params = playbook.get("action_params") or {}
        if not isinstance(params, dict):
            params = {}

        handlers = {
            "WAIT_RETRY": self._do_wait_retry,
            "ALERT_ONLY": self._do_alert_only,
            "DOCKER_RESTART": self._do_docker_restart,
            "SSH_CMD": self._do_ssh_cmd,
        }
        handler = handlers.get(action_type)
        if handler is None:
            return {
                "success": False,
                "action": action_type,
                "message": f"Unhandled action_type: {action_type}",
            }
        return handler(playbook, params)

    def _do_wait_retry(self, playbook: Dict[str, Any], params: Dict[str, Any]) -> Dict[str, Any]:
        """Report how long to wait before a retry; performs no waiting itself."""
        try:
            wait_min = int(params.get("wait_minutes", self._DEFAULT_WAIT_MINUTES))
        except (TypeError, ValueError):
            # Unusable value in the playbook: fall back to the default.
            wait_min = self._DEFAULT_WAIT_MINUTES
        # Clamp to [0, _MAX_WAIT_MINUTES].
        wait_min = max(0, min(wait_min, self._MAX_WAIT_MINUTES))
        return {
            "success": True,
            "action": "WAIT_RETRY",
            "message": f"Waiting {wait_min} min before retry (playbook: {playbook['name']})",
        }

    def _do_alert_only(self, playbook: Dict[str, Any], params: Dict[str, Any]) -> Dict[str, Any]:
        """Return the playbook's alert message without executing anything."""
        return {
            "success": True,
            "action": "ALERT_ONLY",
            "message": params.get("message", "Alert sent"),
        }

    def _do_docker_restart(self, playbook: Dict[str, Any], params: Dict[str, Any]) -> Dict[str, Any]:
        """Restart the playbook's container on the Docker host via the jump host."""
        container = params.get("container")
        if not container:
            return {
                "success": False,
                "action": "DOCKER_RESTART",
                "message": "Playbook missing 'container' in action_params",
            }
        if not isinstance(container, str) or not self._CONTAINER_NAME_RE.fullmatch(container):
            return {"success": False, "action": "DOCKER_RESTART",
                    "message": f"Container name contains unsafe chars: {container!r}"}

        # The restart runs over the SSH jump route (ADR-013 §DOCKER_RESTART):
        # this process's container has no Docker socket, so the command must
        # be executed on the Docker host itself.
        key_path = os.path.join(os.path.dirname(__file__), '..', 'config', 'autoheal_id_ed25519')
        key_path = os.path.normpath(key_path)
        if not os.path.exists(key_path):
            logger.warning("[AutoHeal] SSH key 不存在: %s,降級為 ALERT_ONLY", key_path)
            return {
                "success": False,
                "action": "DOCKER_RESTART",
                "message": f"SSH key 不存在: {key_path},請確認 config/autoheal_id_ed25519 已掛載",
            }
        executor = SSHJumpExecutor(
            jump_host=self._JUMP_HOST,
            jump_user=self._JUMP_USER,
            jump_key_path=key_path,
            jump_connect_timeout=10,
            jump_command_timeout=60,
        )
        try:
            result = executor.execute_command(
                target_host=self._DOCKER_HOST,
                target_user=self._DOCKER_USER,
                command=[*self._DOCKER_RESTART_CMD, container],
            )
            success = result.get("success", False)
            msg = (
                f"容器 {container} 重啟成功(SSH 跳板)"
                if success else
                f"容器重啟失敗: {result.get('stderr','')[:200]}"
            )
            return {"success": success, "action": "DOCKER_RESTART", "message": msg}
        except Exception as e:
            logger.error("[AutoHeal] DOCKER_RESTART SSH 失敗: %s", e)
            return {"success": False, "action": "DOCKER_RESTART", "message": f"SSH 執行例外: {e}"}

    def _do_ssh_cmd(self, playbook: Dict[str, Any], params: Dict[str, Any]) -> Dict[str, Any]:
        """Run ``action_params['argv']`` on ``action_params['host']`` over SSH.

        The connection is direct (no jump host). NOTE(review): no allowlist of
        commands is enforced here; whatever argv the playbook stores is run.
        """
        argv = params.get("argv")
        if not isinstance(argv, list) or not argv:
            return {
                "success": False,
                "action": "SSH_CMD",
                "message": "Playbook SSH_CMD requires action_params.argv (list)",
            }
        if not all(isinstance(part, str) for part in argv):
            return {
                "success": False,
                "action": "SSH_CMD",
                "message": "Playbook SSH_CMD argv entries must all be strings",
            }
        host = params.get("host", "")
        user = params.get("user", "ollama")
        try:
            _validate_host(host)
            _validate_user(user)
        except (TypeError, ValueError) as e:
            return {"success": False, "action": "SSH_CMD", "message": str(e)}

        # ssh joins its trailing arguments with spaces and the remote shell
        # re-parses them, so each argv element is quoted to reach the remote
        # command as exactly one word.
        remote_command = " ".join(shlex.quote(part) for part in argv)
        ssh_cmd = [
            "ssh",
            # NOTE(review): host key verification is disabled here.
            "-o", "StrictHostKeyChecking=no",
            "-o", "BatchMode=yes",
            "-o", "ConnectTimeout=10",
            f"{user}@{host}",
            "--",
            remote_command,
        ]
        try:
            result = subprocess.run(
                ssh_cmd, capture_output=True, text=True, timeout=60
            )
            return {
                "success": result.returncode == 0,
                "action": "SSH_CMD",
                "message": result.stdout.strip() or result.stderr.strip(),
            }
        except Exception as e:
            return {"success": False, "action": "SSH_CMD", "message": str(e)}
|