diff --git a/services/aider_heal_executor.py b/services/aider_heal_executor.py index a986699..5365042 100644 --- a/services/aider_heal_executor.py +++ b/services/aider_heal_executor.py @@ -2,15 +2,15 @@ services/aider_heal_executor.py ADR-014: Autonomous Code Heal Pipeline -透過 SSH 在 110 主機執行 Aider,自動修復 momo-pro repo 的程式碼問題, -修復後直接 git push,觸發 Gitea CD Pipeline 部署。 +通过 SSH 在 110 主机执行 Aider,自动修复 momo-pro repo 的程式碼问题, +修复后直接 git push,触发 Gitea CD Pipeline 部署。 -安全護欄: - L1 - 檔案白名單(只改 services/ routes/ database/ 內 .py) - L2 - diff 限制(>50 行 → 拒絕,不 push) - L3 - 每小時最多 5 次 CODE_FIX - L4 - health check 失敗 → 自動 git revert + push - L5 - Telegram 通知每次修復結果(成功/失敗/回滾) +安全护拦: + L1 - 文件白名单(只改 services/ routes/ database/ 内 .py) + L2 - diff 限制(>50 行 → 拒绝,不 push) + L3 - 每小时最多 5 次 CODE_FIX + L4 - health check 失败 → 自动 git revert + push + L5 - Telegram 通知每次修复结果(成功/失败/回滚) """ import os @@ -18,81 +18,95 @@ import re import time import subprocess import threading +import shlex import requests from datetime import datetime, timedelta -from typing import Optional +from typing import Optional, Dict, Any, List +from pathlib import Path + from services.logger_manager import SystemLogger logger = SystemLogger("AiderHealExecutor").get_logger() -# ── 設定 ────────────────────────────────────────────────────────────────────── -HEAL_SSH_HOST = "192.168.0.110" -HEAL_SSH_USER = "wooo" -HEAL_SSH_KEY = os.getenv("DEPLOY_SSH_KEY_PATH", "/root/.ssh/id_deploy") +# ── 配置 ────────────────────────────────────────────────────────────────────── +HEAL_SSH_HOST: str = os.getenv("HEAL_SSH_HOST", "192.168.0.110") +HEAL_SSH_USER: str = os.getenv("HEAL_SSH_USER", "wooo") +HEAL_SSH_KEY_DEFAULT = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "config", "autoheal_id_ed25519")) +HEAL_SSH_KEY = os.getenv("DEPLOY_SSH_KEY_PATH", HEAL_SSH_KEY_DEFAULT) +HEAL_SSH_PORT: int = int(os.getenv("HEAL_SSH_PORT", "22")) -REPO_PATH_110 = os.getenv("AIDER_REPO_PATH", "/home/wooo/ewoooc") -GITEA_REMOTE = "origin" -HEALTH_CHECK_URL = os.getenv("MOMO_BASE_URL", "https://mo.wooo.work") + "/health" - -GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "") -AIDER_MODEL = os.getenv("AIDER_MODEL", "gemini/gemini-2.0-flash") - -MAX_DIFF_LINES = int(os.getenv("AIDER_MAX_DIFF_LINES", "50")) -MAX_HOURLY_FIX = int(os.getenv("AIDER_MAX_HOURLY_FIX", "5")) - -TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "") -TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "") - -# 允許 Aider 修改的路徑(正則) -ALLOWED_FILE_PATTERN = re.compile( - r'^(services|routes|database)/[a-zA-Z0-9_]+\.py$' +REPO_PATH_110: str = os.getenv("AIDER_REPO_PATH", "/home/wooo/ewoooc") +GITEA_REMOTE: str = "origin" +HEALTH_CHECK_URL: str = ( + os.getenv("MOMO_BASE_URL", "https://mo.wooo.work").rstrip("/") + "/health" ) -# ── 速率計數器(執行緒安全) ──────────────────────────────────────────────── -_lock = threading.Lock() -_fix_history: list[datetime] = [] +OLLAMA_API_BASE: str = os.getenv("OLLAMA_API_BASE", "http://192.168.0.111:11434") +AIDER_MODEL: str = os.getenv("AIDER_MODEL", "ollama/qwen2.5-coder:7b") + +MAX_DIFF_LINES: int = int(os.getenv("AIDER_MAX_DIFF_LINES", "50")) +MAX_HOURLY_FIX: int = int(os.getenv("AIDER_MAX_HOURLY_FIX", "5")) + +TELEGRAM_BOT_TOKEN: str = os.getenv("TELEGRAM_BOT_TOKEN", "") +TELEGRAM_CHAT_ID: str = os.getenv("TELEGRAM_CHAT_ID", "") + +# 允许 Aider 修改的路径(正则) +ALLOWED_FILE_PATTERN = re.compile( + r"^(services|routes|database)/[a-zA-Z0-9_]+\.py$" +) + +# ── 速率控制(线程安全) ───────────────────────────────────────────────────── +_lock: threading.Lock = threading.Lock() +_fix_history: List[float] = [] +_last_host_reset: float = time.monotonic() -def _check_rate_limit() -> bool: - """回傳 True 表示尚未超限,可執行修復。""" - now = datetime.utcnow() - cutoff = now - timedelta(hours=1) +def _enforce_rate_limit() -> bool: + """ + 每小时最多 MAX_HOURLY_FIX 次修复。 + 使用单调时钟避免系统时间跳变影响。 + """ + global _last_host_reset, _fix_history + now = time.monotonic() + with _lock: - global _fix_history - _fix_history = [t for t in _fix_history if t > cutoff] + # 每小时重置一次计数(基于单调时钟的近似小时窗口) + if now - _last_host_reset > 3600.0: + _fix_history.clear() + _last_host_reset = now + if len(_fix_history) >= MAX_HOURLY_FIX: return False + _fix_history.append(now) return True -def _notify_telegram(msg: str): - """發送 Telegram 通知(非阻塞,忽略失敗)""" - if not TELEGRAM_BOT_TOKEN or not TELEGRAM_CHAT_ID: - return - try: - requests.post( - f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage", - json={"chat_id": TELEGRAM_CHAT_ID, "text": msg, "parse_mode": "HTML"}, - timeout=5 - ) - except Exception: - pass - - -def _ssh_run(cmd: str, timeout: int = 60) -> tuple[int, str, str]: - """在 110 主機執行指令,回傳 (returncode, stdout, stderr)""" - full_cmd = [ - "ssh", - "-i", HEAL_SSH_KEY, - "-o", "StrictHostKeyChecking=no", - "-o", "ConnectTimeout=10", - f"{HEAL_SSH_USER}@{HEAL_SSH_HOST}", - cmd - ] +def _ssh_exec( + cmd: str, + cwd: Optional[str] = None, + timeout: int = 60, + check: bool = True, +) -> tuple[int, str, str]: + """ + 在远程主机执行命令(通过 SSH)。 + 返回 (returncode, stdout, stderr) + """ + safe_cmd = cmd.replace('"', '\\"').replace("`", "\\`").replace("$", "\\$") + full_cmd = ( + f"ssh -p {HEAL_SSH_PORT} -i {shlex.quote(HEAL_SSH_KEY)} " + f"-o StrictHostKeyChecking=no " + f"-o ConnectTimeout=10 " + f"{HEAL_SSH_USER}@{HEAL_SSH_HOST} {shlex.quote(safe_cmd)}" + ) try: result = subprocess.run( - full_cmd, capture_output=True, text=True, timeout=timeout + full_cmd, + shell=True, + capture_output=True, + text=True, + cwd=cwd, + timeout=timeout, ) return result.returncode, result.stdout.strip(), result.stderr.strip() except subprocess.TimeoutExpired: @@ -101,87 +115,138 @@ def _ssh_run(cmd: str, timeout: int = 60) -> tuple[int, str, str]: return -1, "", str(e) -def _health_check(retries: int = 6, interval: int = 10) -> bool: - """等待健康檢查通過,最多 retries * interval 秒""" - for i in range(retries): - try: - r = requests.get(HEALTH_CHECK_URL, timeout=10) - if r.status_code == 200: - return True - except Exception: - pass - if i < retries - 1: - time.sleep(interval) +def _http_get_json(url: str, timeout: int = 10) -> Optional[Dict[str, Any]]: + try: + resp = requests.get(url, timeout=timeout) + if resp.status_code == 200: + return resp.json() + except Exception: + pass + return None + + +def _wait_for_health( + url: str, + timeout_seconds: int = 120, + interval_seconds: int = 10, +) -> bool: + """ + 持续轮询健康检查,直到成功或超时。 + """ + deadline = time.monotonic() + timeout_seconds + while time.monotonic() < deadline: + data = _http_get_json(url) + if data and data.get("status") == "ok": + return True + time.sleep(interval_seconds) return False +def _notify_telegram(message_html: str) -> None: + """非阻塞通知,失败静默忽略。""" + if not TELEGRAM_BOT_TOKEN or not TELEGRAM_CHAT_ID: + return + try: + requests.post( + f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage", + json={"chat_id": TELEGRAM_CHAT_ID, "text": message_html, "parse_mode": "HTML"}, + timeout=5, + ) + except Exception: + pass + + +def _git_cmd( + repo_path: str, + args: List[str], + timeout: int = 30, + check: bool = True, +) -> tuple[int, str, str]: + """在 repo_path 下执行 git 命令。""" + return _ssh_exec( + f"cd {shlex.quote(repo_path)} && git " + " ".join(shlex.quote(a) for a in args), + cwd=repo_path, + timeout=timeout, + check=check, + ) + + def execute_code_fix( error_type: str, error_message: str, target_file: str, - context: dict | None = None, -) -> dict: + context: Optional[dict] = None, +) -> Dict[str, Any]: """ - 主要入口:針對指定檔案執行 Aider 自動修復並推版。 + 主要入口:针对指定文件执行 Aider 自动修复并推版。 - Args: - error_type: 錯誤類型(如 'ImportError', 'RuntimeError') - error_message: 完整錯誤訊息(來自容器 log) - target_file: 相對於 repo root 的檔案路徑(如 'services/pchome_crawler.py') - context: 額外上下文字典(可選) - - Returns: - { - 'success': bool, - 'action': 'CODE_FIX', - 'message': str, - 'commit_sha': str | None, - 'reverted': bool, - } + 返回结构: + { + 'success': bool, + 'action': 'CODE_FIX', + 'message': str, + 'commit_sha': str | None, + 'reverted': bool, + } """ ts = datetime.now().strftime("%Y/%m/%d %H:%M:%S") - ctx = context or {} + ctx: Dict[str, Any] = context or {} + repo = Path(REPO_PATH_110).expanduser() - # L1:檔案白名單 + # L1:文件白名单 if not ALLOWED_FILE_PATTERN.match(target_file): - reason = f"[AiderHeal] 檔案不在白名單: {target_file}" - logger.warning(reason) - return {"success": False, "action": "CODE_FIX", - "message": reason, "commit_sha": None, "reverted": False} + reason = f"[AiderHeal] 文件不在白名单: {target_file}" + logger.warning("event=heal_reject reason=%s file=%s", reason, target_file) + return { + "success": False, + "action": "CODE_FIX", + "message": reason, + "commit_sha": None, + "reverted": False, + } # L3:速率限制 - if not _check_rate_limit(): - reason = f"[AiderHeal] 每小時上限 {MAX_HOURLY_FIX} 次,跳過" - logger.warning(reason) - return {"success": False, "action": "CODE_FIX", - "message": reason, "commit_sha": None, "reverted": False} + if not _enforce_rate_limit(): + reason = f"[AiderHeal] 每小时上限 {MAX_HOURLY_FIX} 次,跳过" + logger.warning("event=rate_limit file=%s", target_file) + return { + "success": False, + "action": "CODE_FIX", + "message": reason, + "commit_sha": None, + "reverted": False, + } _notify_telegram( - f"🔧 AiderHeal 啟動\n" - f"├ 錯誤類型: {error_type}\n" - f"├ 目標檔案: {target_file}\n" - f"└ 時間: {ts}" + f"🔧 AiderHeal 启动\n" + f"├ 错误类型: {error_type}\n" + f"├ 目标文件: {target_file}\n" + f"└ 时间: {ts}" ) - logger.info("[AiderHeal] 開始修復: %s → %s", error_type, target_file) + logger.info("event=heal_start error_type=%s file=%s", error_type, target_file) - # ── Step 1:準備 repo(在 110 上)────────────────────────────────────────── + # ── Step 1:准备 repo(在 110 上) ──────────────────────────────────────── setup_cmds = ( f"cd {REPO_PATH_110} && " f"git fetch {GITEA_REMOTE} main 2>&1 && " f"git reset --hard {GITEA_REMOTE}/main 2>&1 && " f"git stash 2>&1 || true" ) - rc, out, err = _ssh_run(setup_cmds, timeout=30) + rc, out, err = _ssh_exec(setup_cmds, timeout=30) if rc != 0: - msg = f"[AiderHeal] git 準備失敗: {err or out}" - logger.error(msg) - _notify_telegram(f"❌ AiderHeal 失敗(git 準備)\n{msg}") - return {"success": False, "action": "CODE_FIX", - "message": msg, "commit_sha": None, "reverted": False} + msg = f"[AiderHeal] git 准备失败: {err or out}" + logger.error("event=setup_failed error=%s", msg) + _notify_telegram(f"❌ AiderHeal 失败(git 准备)\n{msg}") + return { + "success": False, + "action": "CODE_FIX", + "message": msg, + "commit_sha": None, + "reverted": False, + } - # ── Step 2:組裝 Aider 指令 ──────────────────────────────────────────────── - # 截斷 error_message,避免 shell 注入問題 - safe_error = error_message[:500].replace('"', "'").replace('`', "'").replace('$', '') + # ── Step 2:构造 Aider 指令 ─────────────────────────────────────────────── + safe_error = error_message[:500].replace('"', "'").replace("`", "'").replace("$", "") instruction = ( f"Fix the following {error_type} in this file. " f"Only fix what is necessary, do not refactor or add features. " @@ -190,44 +255,62 @@ def execute_code_fix( aider_cmd = ( f"cd {REPO_PATH_110} && " - f"GEMINI_API_KEY={GEMINI_API_KEY} " + f"PATH=/home/wooo/.local/bin:$PATH OLLAMA_API_BASE={OLLAMA_API_BASE} " f"aider --model {AIDER_MODEL} " f"--yes-always --no-git " f'--message "{instruction}" ' - f"{target_file} 2>&1" + f"{shlex.quote(target_file)} 2>&1" ) - logger.info("[AiderHeal] 執行 aider on 110...") - rc, aider_out, aider_err = _ssh_run(aider_cmd, timeout=180) - logger.info("[AiderHeal] aider 輸出: %s", (aider_out or aider_err)[:300]) + logger.info("event=aider_exec file=%s", target_file) + rc, aider_out, aider_err = _ssh_exec(aider_cmd, timeout=180) + logger.debug("event=aider_output snippet=%s", (aider_out or aider_err)[:300]) - # ── Step 3:diff 行數檢查(L2 護欄)─────────────────────────────────────── - diff_cmd = f"cd {REPO_PATH_110} && git diff --unified=0 | wc -l" - rc2, diff_lines_str, _ = _ssh_run(diff_cmd) - diff_lines = int(diff_lines_str.strip()) if diff_lines_str.strip().isdigit() else 999 + # ── Step 3:diff 评估(L2 护拦) ───────────────────────────────────────── + # 使用 git diff --numstat 获取有意义的变更行数(增加+删除) + numstat_cmd = ( + f"cd {REPO_PATH_110} && " + f"git diff --numstat HEAD 2>&1 | awk '{{added+=$1; deleted+=$2}} END{{print added+deleted}}'" + ) + rc2, diff_lines_str, _ = _ssh_exec(numstat_cmd, timeout=10) + diff_lines = int(diff_lines_str.strip()) if rc2 == 0 and diff_lines_str.strip().isdigit() else 0 if diff_lines == 0: - msg = f"[AiderHeal] Aider 未產生任何修改(diff=0行),可能已自動解決或模型失效" - logger.warning(msg) - _notify_telegram(f"⚠️ AiderHeal:無修改產生\n{target_file}") - return {"success": False, "action": "CODE_FIX", - "message": msg, "commit_sha": None, "reverted": False} + msg = "[AiderHeal] Aider 未产生任何修改(diff=0),可能已自动解决或模型失效" + logger.warning("event=no_diff file=%s", target_file) + _notify_telegram(f"⚠️ AiderHeal:无修改产生\n{target_file}") + return { + "success": False, + "action": "CODE_FIX", + "message": msg, + "commit_sha": None, + "reverted": False, + } if diff_lines > MAX_DIFF_LINES: - # 改動太大,丟棄並升級告警 - _ssh_run(f"cd {REPO_PATH_110} && git checkout -- .", timeout=10) - msg = (f"[AiderHeal] diff 超出限制 {diff_lines}>{MAX_DIFF_LINES} 行," - f"已丟棄,需人工介入") - logger.warning(msg) - _notify_telegram( - f"⚠️ AiderHeal:diff 過大,需人工審核\n" - f"├ 檔案: {target_file}\n" - f"├ diff: {diff_lines} 行(上限 {MAX_DIFF_LINES})\n" - f"└ 錯誤: {error_type}" + # 改动太大,丢弃并告警 + _, _, _ = _ssh_exec( + f"cd {REPO_PATH_110} && git checkout -- . 2>&1", timeout=10 ) - return {"success": False, "action": "CODE_FIX", - "message": msg, "commit_sha": None, "reverted": False} + msg = ( + f"[AiderHeal] diff 超出限制 {diff_lines} > {MAX_DIFF_LINES} 行," + f"已丢弃,需人工介入" + ) + logger.warning("event=diff_too_large file=%s diff_lines=%d", target_file, diff_lines) + _notify_telegram( + f"⚠️ AiderHeal:diff 过大,需人工审核\n" + f"├ 文件: {target_file}\n" + f"├ diff: {diff_lines} 行(上限 {MAX_DIFF_LINES})\n" + f"└ 错误: {error_type}" + ) + return { + "success": False, + "action": "CODE_FIX", + "message": msg, + "commit_sha": None, + "reverted": False, + } - # ── Step 4:git commit + push ────────────────────────────────────────────── + # ── Step 4:提交并推送 ─────────────────────────────────────────────────── fix_msg = ( f"fix(autoheal): [{error_type}] auto-fix {target_file}\n\n" f"Triggered by AiderHealExecutor (ADR-014)\n" @@ -235,72 +318,92 @@ def execute_code_fix( ) commit_cmd = ( f"cd {REPO_PATH_110} && " - f'git add {target_file} && ' - f'git commit -m "{fix_msg}" 2>&1 && ' + f'git add {shlex.quote(target_file)} && ' + f'git commit -m {shlex.quote(fix_msg)} 2>&1 && ' f"git push {GITEA_REMOTE} main 2>&1" ) - rc3, commit_out, commit_err = _ssh_run(commit_cmd, timeout=30) + rc3, commit_out, commit_err = _ssh_exec(commit_cmd, timeout=30) - # 取得 commit SHA - sha_cmd = f"cd {REPO_PATH_110} && git rev-parse --short HEAD" - _, commit_sha, _ = _ssh_run(sha_cmd) + # 获取最新的 commit SHA(从 push 后的 HEAD 获取,更可靠) + _, commit_sha, _ = _git_cmd(REPO_PATH_110, ["log", "-1", "--format=%H"], timeout=10) commit_sha = commit_sha.strip() or "unknown" if rc3 != 0: - msg = f"[AiderHeal] git push 失敗: {commit_err or commit_out}" - logger.error(msg) - _notify_telegram(f"❌ AiderHeal git push 失敗\n{msg}") - return {"success": False, "action": "CODE_FIX", - "message": msg, "commit_sha": None, "reverted": False} + msg = f"[AiderHeal] git push 失败: {commit_err or commit_out}" + logger.error("event=push_failed error=%s", msg) + _notify_telegram(f"❌ AiderHeal git push 失败\n{msg}") + return { + "success": False, + "action": "CODE_FIX", + "message": msg, + "commit_sha": None, + "reverted": False, + } - logger.info("[AiderHeal] push 成功,commit=%s,等待健康檢查...", commit_sha) + logger.info("event=push_ok commit=%s", commit_sha) _notify_telegram( f"🚀 AiderHeal push 完成\n" f"├ commit: {commit_sha}\n" - f"├ 檔案: {target_file}\n" - f"└ 等待健康檢查..." + f"├ 文件: {target_file}\n" + f"└ 等待健康检查..." ) - # ── Step 5:健康檢查(L4 護欄)──────────────────────────────────────────── - time.sleep(20) # 等 CD 部署啟動 - healthy = _health_check(retries=6, interval=10) + # ── Step 5:健康检查(L4 护拦) ────────────────────────────────────────── + time.sleep(10) # 给部署一点启动缓冲 + healthy = _wait_for_health(HEALTH_CHECK_URL, timeout_seconds=120, interval_seconds=10) if healthy: - msg = f"[AiderHeal] 修復成功並部署完成: {target_file} ({commit_sha})" - logger.info(msg) + msg = f"[AiderHeal] 修复成功并部署完成: {target_file} ({commit_sha})" + logger.info("event=heal_success commit=%s file=%s", commit_sha, target_file) _notify_telegram( - f"✅ AiderHeal 修復完成\n" - f"├ 錯誤: {error_type}\n" - f"├ 檔案: {target_file}\n" + f"✅ AiderHeal 修复完成\n" + f"├ 错误: {error_type}\n" + f"├ 文件: {target_file}\n" f"├ commit: {commit_sha}\n" f"└ diff: {diff_lines} 行" ) - return {"success": True, "action": "CODE_FIX", - "message": msg, "commit_sha": commit_sha, "reverted": False} + return { + "success": True, + "action": "CODE_FIX", + "message": msg, + "commit_sha": commit_sha, + "reverted": False, + } - # ── Step 6:健康檢查失敗 → 自動 revert(L4 護欄)───────────────────────── - logger.error("[AiderHeal] 健康檢查失敗,執行自動 revert...") - revert_cmd = ( + # ── Step 6:健康检查失败 → 自动 revert(L4 护拦) ───────────────────────── + logger.error("event=health_check_failed commit=%s", commit_sha) + _, revert_out, revert_err = _ssh_exec( f"cd {REPO_PATH_110} && " f"git revert --no-edit {commit_sha} 2>&1 && " - f"git push {GITEA_REMOTE} main 2>&1" + f"git push {GITEA_REMOTE} main 2>&1", + timeout=30, ) - rc4, rev_out, rev_err = _ssh_run(revert_cmd, timeout=30) - if rc4 == 0: - _, revert_sha, _ = _ssh_run(sha_cmd) - revert_sha = revert_sha.strip() - msg = f"[AiderHeal] 健康檢查失敗,已自動 revert: {commit_sha} → {revert_sha}" - logger.warning(msg) + _, revert_sha, _ = _git_cmd(REPO_PATH_110, ["log", "-1", "--format=%H"], timeout=10) + revert_sha = revert_sha.strip() or "unknown" + + if "error" not in revert_out.lower() and "error" not in revert_err.lower(): + msg = ( + f"[AiderHeal] 健康检查失败,已自动回滚: " + f"{commit_sha} → {revert_sha}" + ) + logger.warning("event=reverted commit=%s to=%s", commit_sha, revert_sha) _notify_telegram( - f"🔄 AiderHeal 自動回滾\n" + f"🔄 AiderHeal 自动回滚\n" f"├ 原 commit: {commit_sha}\n" - f"├ 回滾 commit: {revert_sha}\n" + f"├ 回滚 commit: {revert_sha}\n" f"└ 需人工排查: {error_type} in {target_file}" ) else: - msg = f"[AiderHeal] revert 失敗!需立即人工介入: {rev_err}" - logger.critical(msg) - _notify_telegram(f"🚨 AiderHeal revert 失敗!請立即人工介入\n{msg}") + msg = f"[AiderHeal] 回滚失败!需立即人工介入: {revert_err}" + logger.critical("event=revert_failed commit=%s error=%s", commit_sha, revert_err) + _notify_telegram( + f"🚨 AiderHeal 回滚失败!请立即人工介入\n{msg}" + ) - return {"success": False, "action": "CODE_FIX", - "message": msg, "commit_sha": commit_sha, "reverted": rc4 == 0} + return { + "success": False, + "action": "CODE_FIX", + "message": msg, + "commit_sha": commit_sha, + "reverted": True, + } diff --git a/services/auto_heal_service.py b/services/auto_heal_service.py index 7c3f7b1..1bca61b 100644 --- a/services/auto_heal_service.py +++ b/services/auto_heal_service.py @@ -307,14 +307,17 @@ class AutoHealService: "action": "DOCKER_RESTART", "message": "Playbook missing 'container' in action_params", } - safe_container = re.sub(r'[^a-zA-Z0-9._-]', '', container) + safe_container = re.sub(r"[^a-zA-Z0-9._-]", "", container) if safe_container != container: - return {"success": False, "action": "DOCKER_RESTART", - "message": f"Container name contains unsafe chars: {container!r}"} + return { + "success": False, + "action": "DOCKER_RESTART", + "message": f"Container name contains unsafe chars: {container!r}", + } # 透過 SSH 跳板(110→188)執行 docker restart(ADR-013 §DOCKER_RESTART) # 容器內無 Docker socket,必須 SSH 到宿主機執行 - key_path = os.path.join(os.path.dirname(__file__), '..', 'config', 'autoheal_id_ed25519') + key_path = os.path.join(os.path.dirname(__file__), "..", "config", "autoheal_id_ed25519") key_path = os.path.normpath(key_path) if not os.path.exists(key_path): logger.warning("[AutoHeal] SSH key 不存在: %s,降級為 ALERT_ONLY", key_path) @@ -388,7 +391,7 @@ class AutoHealService: if action_type == "CODE_FIX": # ADR-014: 透過 Aider 自動修覆程式碼並推版 - target_file = params.get("target_file", "") + target_file = context.get("target_file", "") error_type = context.get("error_type", "UnknownError") error_message = context.get("error_message", "") if not target_file: diff --git a/services/elephant_alpha_autonomous_engine.py b/services/elephant_alpha_autonomous_engine.py index 0caf89a..de0267b 100644 --- a/services/elephant_alpha_autonomous_engine.py +++ b/services/elephant_alpha_autonomous_engine.py @@ -307,7 +307,8 @@ class ElephantAlphaAutonomousEngine: async def _check_code_exception_trigger(self, trigger: AutonomousTrigger) -> bool: """ADR-014: 掃描容器 log 抓取 Python Traceback""" - import subprocess + import os + from services.auto_heal_service import SSHJumpExecutor containers = trigger.conditions.get("scan_containers", ["momo-pro-system", "momo-scheduler"]) error_ptns = trigger.conditions.get("error_patterns", ["Traceback", "ImportError"]) @@ -315,12 +316,36 @@ class ElephantAlphaAutonomousEngine: error_context = [] target_file = "" + # ADR-013 機制:容器內無 docker socket,需透過 SSH 執行 + key_path = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "config", "autoheal_id_ed25519")) + if not os.path.exists(key_path): + logger.warning("[ElephantAlpha] SSH key %s 不存在,無法掃描 docker logs", key_path) + return False + + executor = SSHJumpExecutor( + jump_host="192.168.0.110", + jump_user="wooo", + jump_key_path=key_path, + jump_connect_timeout=5, + jump_command_timeout=15, + ) + for c in containers: try: # 只掃描最近 5 分鐘的 log - cmd = ["docker", "logs", "--since", "5m", c] - result = subprocess.run(cmd, capture_output=True, text=True, timeout=5) - out = result.stdout + "\n" + result.stderr + result = await asyncio.get_event_loop().run_in_executor( + None, + executor.execute_command, + "192.168.0.188", + "ollama", + ["docker", "logs", "--since", "5m", c] + ) + + if not result.get("success"): + logger.debug("Failed to scan log for %s via SSH", c) + continue + + out = result.get("stdout", "") + "\n" + result.get("stderr", "") # 簡單找 Traceback if "Traceback (most recent call last):" in out: @@ -347,7 +372,7 @@ class ElephantAlphaAutonomousEngine: break # 只抓第一個錯誤 except Exception as e: - logger.debug(f"Failed to scan log for {c}: {e}") + logger.debug(f"Failed to exec SSH scan for {c}: {e}") if has_error and error_context: # 暫存到 trigger class 中供後續 _handle 使用