diff --git a/services/aider_heal_executor.py b/services/aider_heal_executor.py
index a986699..5365042 100644
--- a/services/aider_heal_executor.py
+++ b/services/aider_heal_executor.py
@@ -2,15 +2,15 @@
services/aider_heal_executor.py
ADR-014: Autonomous Code Heal Pipeline
-透過 SSH 在 110 主機執行 Aider,自動修復 momo-pro repo 的程式碼問題,
-修復後直接 git push,觸發 Gitea CD Pipeline 部署。
+通过 SSH 在 110 主机执行 Aider,自动修复 momo-pro repo 的代码问题,
+修复后直接 git push,触发 Gitea CD Pipeline 部署。
-安全護欄:
- L1 - 檔案白名單(只改 services/ routes/ database/ 內 .py)
- L2 - diff 限制(>50 行 → 拒絕,不 push)
- L3 - 每小時最多 5 次 CODE_FIX
- L4 - health check 失敗 → 自動 git revert + push
- L5 - Telegram 通知每次修復結果(成功/失敗/回滾)
+安全护栏:
+ L1 - 文件白名单(只改 services/ routes/ database/ 内 .py)
+ L2 - diff 限制(>50 行 → 拒绝,不 push)
+ L3 - 每小时最多 5 次 CODE_FIX
+ L4 - health check 失败 → 自动 git revert + push
+ L5 - Telegram 通知每次修复结果(成功/失败/回滚)
"""
import os
@@ -18,81 +18,95 @@ import re
import time
import subprocess
import threading
+import shlex
import requests
from datetime import datetime, timedelta
-from typing import Optional
+from typing import Optional, Dict, Any, List
+from pathlib import Path
+
from services.logger_manager import SystemLogger
logger = SystemLogger("AiderHealExecutor").get_logger()
-# ── 設定 ──────────────────────────────────────────────────────────────────────
-HEAL_SSH_HOST = "192.168.0.110"
-HEAL_SSH_USER = "wooo"
-HEAL_SSH_KEY = os.getenv("DEPLOY_SSH_KEY_PATH", "/root/.ssh/id_deploy")
+# ── Configuration ─────────────────────────────────────────────────────────────
+# SSH target used for all remote commands; each value can be overridden
+# through the environment variable of the same name.
+HEAL_SSH_HOST: str = os.getenv("HEAL_SSH_HOST", "192.168.0.110")
+HEAL_SSH_USER: str = os.getenv("HEAL_SSH_USER", "wooo")
+# Default private key: ../config/autoheal_id_ed25519 relative to this module's
+# directory. DEPLOY_SSH_KEY_PATH, when set, takes precedence.
+HEAL_SSH_KEY_DEFAULT = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "config", "autoheal_id_ed25519"))
+HEAL_SSH_KEY = os.getenv("DEPLOY_SSH_KEY_PATH", HEAL_SSH_KEY_DEFAULT)
+# NOTE(review): int() raises ValueError at import time if HEAL_SSH_PORT is set
+# to a non-numeric value.
+HEAL_SSH_PORT: int = int(os.getenv("HEAL_SSH_PORT", "22"))
-REPO_PATH_110 = os.getenv("AIDER_REPO_PATH", "/home/wooo/ewoooc")
-GITEA_REMOTE = "origin"
-HEALTH_CHECK_URL = os.getenv("MOMO_BASE_URL", "https://mo.wooo.work") + "/health"
-
-GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
-AIDER_MODEL = os.getenv("AIDER_MODEL", "gemini/gemini-2.0-flash")
-
-MAX_DIFF_LINES = int(os.getenv("AIDER_MAX_DIFF_LINES", "50"))
-MAX_HOURLY_FIX = int(os.getenv("AIDER_MAX_HOURLY_FIX", "5"))
-
-TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "")
-TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "")
-
-# 允許 Aider 修改的路徑(正則)
-ALLOWED_FILE_PATTERN = re.compile(
- r'^(services|routes|database)/[a-zA-Z0-9_]+\.py$'
+# Path of the repository checkout that remote commands `cd` into.
+REPO_PATH_110: str = os.getenv("AIDER_REPO_PATH", "/home/wooo/ewoooc")
+# Git remote name used for fetch / push.
+GITEA_REMOTE: str = "origin"
+# Health endpoint polled after a push; a trailing slash on MOMO_BASE_URL is
+# stripped so the result never contains "//health".
+HEALTH_CHECK_URL: str = (
+ os.getenv("MOMO_BASE_URL", "https://mo.wooo.work").rstrip("/") + "/health"
)
-# ── 速率計數器(執行緒安全) ────────────────────────────────────────────────
-_lock = threading.Lock()
-_fix_history: list[datetime] = []
+# Exported as the OLLAMA_API_BASE environment variable in the remote aider command.
+OLLAMA_API_BASE: str = os.getenv("OLLAMA_API_BASE", "http://192.168.0.111:11434")
+# Model identifier passed to `aider --model`.
+AIDER_MODEL: str = os.getenv("AIDER_MODEL", "ollama/qwen2.5-coder:7b")
+
+# Guardrail L2: a fix whose diff exceeds this many lines is discarded, not pushed.
+MAX_DIFF_LINES: int = int(os.getenv("AIDER_MAX_DIFF_LINES", "50"))
+# Guardrail L3: upper bound on CODE_FIX runs per hour.
+MAX_HOURLY_FIX: int = int(os.getenv("AIDER_MAX_HOURLY_FIX", "5"))
+
+# Telegram credentials; notifications are skipped when either one is empty.
+TELEGRAM_BOT_TOKEN: str = os.getenv("TELEGRAM_BOT_TOKEN", "")
+TELEGRAM_CHAT_ID: str = os.getenv("TELEGRAM_CHAT_ID", "")
+
+# Guardrail L1: repo-relative paths Aider may modify. Only .py files directly
+# inside services/, routes/ or database/ match (no sub-directories, and the
+# file name is limited to letters, digits and underscores).
+ALLOWED_FILE_PATTERN = re.compile(
+ r"^(services|routes|database)/[a-zA-Z0-9_]+\.py$"
+)
+
+# ── Rate limiting (thread-safe) ──────────────────────────────────────────────
+# Guards the module-level rate-limit state below.
+_lock: threading.Lock = threading.Lock()
+# time.monotonic() timestamps of the fixes that were admitted.
+_fix_history: List[float] = []
+# Monotonic timestamp captured when the module is imported.
+_last_host_reset: float = time.monotonic()
-def _check_rate_limit() -> bool:
-    """回傳 True 表示尚未超限,可執行修復。"""
-    now = datetime.utcnow()
-    cutoff = now - timedelta(hours=1)
+def _enforce_rate_limit() -> bool:
+    """Reserve one CODE_FIX slot (guardrail L3).
+
+    At most MAX_HOURLY_FIX fixes are admitted within any rolling 60-minute
+    span. Timestamps come from the monotonic clock, so wall-clock jumps
+    (NTP corrections, manual changes) cannot widen or shrink the window.
+
+    Returns:
+        True when a slot was reserved and the fix may proceed; False when
+        the hourly budget is already exhausted.
+    """
+    global _fix_history
+    window_seconds = 3600.0
+
     with _lock:
-        global _fix_history
-        _fix_history = [t for t in _fix_history if t > cutoff]
+        now = time.monotonic()
+        # Sliding window: forget only the entries older than one hour.
+        # Clearing the whole history on a fixed boundary would allow up to
+        # 2 * MAX_HOURLY_FIX fixes within minutes around that boundary.
+        _fix_history = [t for t in _fix_history if now - t < window_seconds]
+
         if len(_fix_history) >= MAX_HOURLY_FIX:
             return False
+
         _fix_history.append(now)
         return True
-def _notify_telegram(msg: str):
- """發送 Telegram 通知(非阻塞,忽略失敗)"""
- if not TELEGRAM_BOT_TOKEN or not TELEGRAM_CHAT_ID:
- return
- try:
- requests.post(
- f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage",
- json={"chat_id": TELEGRAM_CHAT_ID, "text": msg, "parse_mode": "HTML"},
- timeout=5
- )
- except Exception:
- pass
-
-
-def _ssh_run(cmd: str, timeout: int = 60) -> tuple[int, str, str]:
- """在 110 主機執行指令,回傳 (returncode, stdout, stderr)"""
- full_cmd = [
- "ssh",
- "-i", HEAL_SSH_KEY,
- "-o", "StrictHostKeyChecking=no",
- "-o", "ConnectTimeout=10",
- f"{HEAL_SSH_USER}@{HEAL_SSH_HOST}",
- cmd
- ]
+def _ssh_exec(
+    cmd: str,
+    cwd: Optional[str] = None,
+    timeout: int = 60,
+    check: bool = True,
+) -> tuple[int, str, str]:
+    """Run *cmd* on the heal host over SSH.
+
+    Args:
+        cmd: Shell command line, interpreted by the REMOTE shell exactly as
+            written. Callers are responsible for quoting untrusted parts
+            (e.g. with shlex.quote).
+        cwd: Optional directory on the remote host to change into before
+            running *cmd*.
+        timeout: Seconds to wait for the local ssh process.
+        check: Accepted for signature compatibility; not acted upon here,
+            callers inspect the returned exit status themselves.
+
+    Returns:
+        (returncode, stdout, stderr) with surrounding whitespace stripped.
+    """
+    # `cwd` names a directory on the remote machine, so it has to become
+    # part of the remote command; using it as the local subprocess cwd fails
+    # whenever that path does not exist locally.
+    remote_cmd = f"cd {shlex.quote(cwd)} && {cmd}" if cwd else cmd
+    # An argv list (no local shell) hands the command to ssh untouched.
+    # Escaping $, " or ` beforehand would reach the remote shell as literal
+    # backslashes and break $PATH expansion, quoted arguments and awk fields.
+    # NOTE(review): StrictHostKeyChecking=no disables host-key verification.
+    full_cmd = [
+        "ssh",
+        "-p", str(HEAL_SSH_PORT),
+        "-i", HEAL_SSH_KEY,
+        "-o", "StrictHostKeyChecking=no",
+        "-o", "ConnectTimeout=10",
+        f"{HEAL_SSH_USER}@{HEAL_SSH_HOST}",
+        remote_cmd,
+    ]
     try:
         result = subprocess.run(
-            full_cmd, capture_output=True, text=True, timeout=timeout
+            full_cmd,
+            capture_output=True,
+            text=True,
+            timeout=timeout,
+        )
         return result.returncode, result.stdout.strip(), result.stderr.strip()
except subprocess.TimeoutExpired:
@@ -101,87 +115,138 @@ def _ssh_run(cmd: str, timeout: int = 60) -> tuple[int, str, str]:
return -1, "", str(e)
-def _health_check(retries: int = 6, interval: int = 10) -> bool:
- """等待健康檢查通過,最多 retries * interval 秒"""
- for i in range(retries):
- try:
- r = requests.get(HEALTH_CHECK_URL, timeout=10)
- if r.status_code == 200:
- return True
- except Exception:
- pass
- if i < retries - 1:
- time.sleep(interval)
+def _http_get_json(url: str, timeout: int = 10) -> Optional[Dict[str, Any]]:
+    """Fetch *url* and return its body when it is a JSON object.
+
+    Best-effort probe that never raises: a network error, a non-200 status,
+    an undecodable body or a JSON value that is not an object all yield None.
+
+    Args:
+        url: Address to GET.
+        timeout: Timeout in seconds handed to requests.get.
+    """
+    try:
+        resp = requests.get(url, timeout=timeout)
+        if resp.status_code != 200:
+            return None
+        payload = resp.json()
+    except Exception as exc:
+        # Deliberately broad: a failed probe must not abort the heal flow,
+        # but leave a trace so repeated failures can be diagnosed.
+        logger.debug("event=http_get_json_failed url=%s error=%s", url, exc)
+        return None
+    # Callers use dict methods on the result; a JSON list or scalar would
+    # make them raise, so only objects are passed through.
+    return payload if isinstance(payload, dict) else None
+
+
+def _wait_for_health(
+    url: str,
+    timeout_seconds: int = 120,
+    interval_seconds: int = 10,
+) -> bool:
+    """Poll *url* until it reports healthy or the time budget is used up.
+
+    The endpoint counts as healthy when it returns a JSON object whose
+    "status" field equals "ok".
+    NOTE(review): assumes /health answers with {"status": "ok"}; confirm
+    against the service, otherwise every deployment will look unhealthy.
+
+    Args:
+        url: Health endpoint to poll.
+        timeout_seconds: Total time budget, measured on the monotonic clock.
+        interval_seconds: Pause between two probes.
+
+    Returns:
+        True as soon as one probe succeeds, False once the budget is spent.
+    """
+    deadline = time.monotonic() + timeout_seconds
+    while True:
+        # Probe first so that even a zero/negative budget gets one attempt.
+        data = _http_get_json(url)
+        if isinstance(data, dict) and data.get("status") == "ok":
+            return True
+        remaining = deadline - time.monotonic()
+        if remaining <= 0:
+            break
+        # Never sleep past the deadline; the last probe lands right on it.
+        time.sleep(min(interval_seconds, remaining))
     return False
+def _notify_telegram(message_html: str) -> None:
+    """Send *message_html* to the configured Telegram chat, best effort.
+
+    Does nothing when the bot token or chat id is not configured. The
+    request uses a 5-second timeout and any failure is ignored.
+    """
+    if not (TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID):
+        return
+    endpoint = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
+    payload = {"chat_id": TELEGRAM_CHAT_ID, "text": message_html, "parse_mode": "HTML"}
+    try:
+        requests.post(endpoint, json=payload, timeout=5)
+    except Exception:
+        # A lost notification must never interrupt the heal flow.
+        pass
+
+
+def _git_cmd(
+    repo_path: str,
+    args: List[str],
+    timeout: int = 30,
+    check: bool = True,
+) -> tuple[int, str, str]:
+    """Run ``git <args>`` inside *repo_path* on the SSH target.
+
+    Args:
+        repo_path: Repository directory on the remote host.
+        args: git arguments; each one is shell-quoted individually.
+        timeout: Seconds allowed for the SSH call.
+        check: Forwarded unchanged to _ssh_exec.
+
+    Returns:
+        The (returncode, stdout, stderr) tuple produced by _ssh_exec.
+    """
+    git_invocation = " ".join(shlex.quote(part) for part in ["git", *args])
+    # The directory change is part of the remote command. repo_path must not
+    # be passed as the local working directory: callers hand in a path on
+    # the remote host, and a missing local directory makes the call fail
+    # before ssh even starts.
+    return _ssh_exec(
+        f"cd {shlex.quote(repo_path)} && {git_invocation}",
+        timeout=timeout,
+        check=check,
+    )
+
+
def execute_code_fix(
error_type: str,
error_message: str,
target_file: str,
- context: dict | None = None,
-) -> dict:
+ context: Optional[dict] = None,
+) -> Dict[str, Any]:
"""
- 主要入口:針對指定檔案執行 Aider 自動修復並推版。
+ 主要入口:针对指定文件执行 Aider 自动修复并推版。
- Args:
- error_type: 錯誤類型(如 'ImportError', 'RuntimeError')
- error_message: 完整錯誤訊息(來自容器 log)
- target_file: 相對於 repo root 的檔案路徑(如 'services/pchome_crawler.py')
- context: 額外上下文字典(可選)
-
- Returns:
- {
- 'success': bool,
- 'action': 'CODE_FIX',
- 'message': str,
- 'commit_sha': str | None,
- 'reverted': bool,
- }
+ 返回结构:
+ {
+ 'success': bool,
+ 'action': 'CODE_FIX',
+ 'message': str,
+ 'commit_sha': str | None,
+ 'reverted': bool,
+ }
"""
ts = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
- ctx = context or {}
+ ctx: Dict[str, Any] = context or {}
+ repo = Path(REPO_PATH_110).expanduser()
- # L1:檔案白名單
+ # L1:文件白名单
if not ALLOWED_FILE_PATTERN.match(target_file):
- reason = f"[AiderHeal] 檔案不在白名單: {target_file}"
- logger.warning(reason)
- return {"success": False, "action": "CODE_FIX",
- "message": reason, "commit_sha": None, "reverted": False}
+ reason = f"[AiderHeal] 文件不在白名单: {target_file}"
+ logger.warning("event=heal_reject reason=%s file=%s", reason, target_file)
+ return {
+ "success": False,
+ "action": "CODE_FIX",
+ "message": reason,
+ "commit_sha": None,
+ "reverted": False,
+ }
# L3:速率限制
- if not _check_rate_limit():
- reason = f"[AiderHeal] 每小時上限 {MAX_HOURLY_FIX} 次,跳過"
- logger.warning(reason)
- return {"success": False, "action": "CODE_FIX",
- "message": reason, "commit_sha": None, "reverted": False}
+ if not _enforce_rate_limit():
+ reason = f"[AiderHeal] 每小时上限 {MAX_HOURLY_FIX} 次,跳过"
+ logger.warning("event=rate_limit file=%s", target_file)
+ return {
+ "success": False,
+ "action": "CODE_FIX",
+ "message": reason,
+ "commit_sha": None,
+ "reverted": False,
+ }
_notify_telegram(
- f"🔧 AiderHeal 啟動\n"
- f"├ 錯誤類型: {error_type}\n"
- f"├ 目標檔案: {target_file}\n"
- f"└ 時間: {ts}"
+ f"🔧 AiderHeal 启动\n"
+ f"├ 错误类型: {error_type}\n"
+ f"├ 目标文件: {target_file}\n"
+ f"└ 时间: {ts}"
)
- logger.info("[AiderHeal] 開始修復: %s → %s", error_type, target_file)
+ logger.info("event=heal_start error_type=%s file=%s", error_type, target_file)
- # ── Step 1:準備 repo(在 110 上)──────────────────────────────────────────
+ # ── Step 1:准备 repo(在 110 上) ────────────────────────────────────────
setup_cmds = (
f"cd {REPO_PATH_110} && "
f"git fetch {GITEA_REMOTE} main 2>&1 && "
f"git reset --hard {GITEA_REMOTE}/main 2>&1 && "
f"git stash 2>&1 || true"
)
- rc, out, err = _ssh_run(setup_cmds, timeout=30)
+ rc, out, err = _ssh_exec(setup_cmds, timeout=30)
if rc != 0:
- msg = f"[AiderHeal] git 準備失敗: {err or out}"
- logger.error(msg)
- _notify_telegram(f"❌ AiderHeal 失敗(git 準備)\n{msg}")
- return {"success": False, "action": "CODE_FIX",
- "message": msg, "commit_sha": None, "reverted": False}
+ msg = f"[AiderHeal] git 准备失败: {err or out}"
+ logger.error("event=setup_failed error=%s", msg)
+ _notify_telegram(f"❌ AiderHeal 失败(git 准备)\n{msg}")
+ return {
+ "success": False,
+ "action": "CODE_FIX",
+ "message": msg,
+ "commit_sha": None,
+ "reverted": False,
+ }
- # ── Step 2:組裝 Aider 指令 ────────────────────────────────────────────────
- # 截斷 error_message,避免 shell 注入問題
- safe_error = error_message[:500].replace('"', "'").replace('`', "'").replace('$', '')
+ # ── Step 2:构造 Aider 指令 ───────────────────────────────────────────────
+ safe_error = error_message[:500].replace('"', "'").replace("`", "'").replace("$", "")
instruction = (
f"Fix the following {error_type} in this file. "
f"Only fix what is necessary, do not refactor or add features. "
@@ -190,44 +255,62 @@ def execute_code_fix(
aider_cmd = (
f"cd {REPO_PATH_110} && "
- f"GEMINI_API_KEY={GEMINI_API_KEY} "
+ f"PATH=/home/wooo/.local/bin:$PATH OLLAMA_API_BASE={OLLAMA_API_BASE} "
f"aider --model {AIDER_MODEL} "
f"--yes-always --no-git "
f'--message "{instruction}" '
- f"{target_file} 2>&1"
+ f"{shlex.quote(target_file)} 2>&1"
)
- logger.info("[AiderHeal] 執行 aider on 110...")
- rc, aider_out, aider_err = _ssh_run(aider_cmd, timeout=180)
- logger.info("[AiderHeal] aider 輸出: %s", (aider_out or aider_err)[:300])
+ logger.info("event=aider_exec file=%s", target_file)
+ rc, aider_out, aider_err = _ssh_exec(aider_cmd, timeout=180)
+ logger.debug("event=aider_output snippet=%s", (aider_out or aider_err)[:300])
- # ── Step 3:diff 行數檢查(L2 護欄)───────────────────────────────────────
- diff_cmd = f"cd {REPO_PATH_110} && git diff --unified=0 | wc -l"
- rc2, diff_lines_str, _ = _ssh_run(diff_cmd)
- diff_lines = int(diff_lines_str.strip()) if diff_lines_str.strip().isdigit() else 999
+ # ── Step 3:diff 评估(L2 护拦) ─────────────────────────────────────────
+ # 使用 git diff --numstat 获取有意义的变更行数(增加+删除)
+ numstat_cmd = (
+ f"cd {REPO_PATH_110} && "
+ f"git diff --numstat HEAD 2>&1 | awk '{{added+=$1; deleted+=$2}} END{{print added+deleted}}'"
+ )
+ rc2, diff_lines_str, _ = _ssh_exec(numstat_cmd, timeout=10)
+ diff_lines = int(diff_lines_str.strip()) if rc2 == 0 and diff_lines_str.strip().isdigit() else 0
if diff_lines == 0:
- msg = f"[AiderHeal] Aider 未產生任何修改(diff=0行),可能已自動解決或模型失效"
- logger.warning(msg)
- _notify_telegram(f"⚠️ AiderHeal:無修改產生\n{target_file}")
- return {"success": False, "action": "CODE_FIX",
- "message": msg, "commit_sha": None, "reverted": False}
+ msg = "[AiderHeal] Aider 未产生任何修改(diff=0),可能已自动解决或模型失效"
+ logger.warning("event=no_diff file=%s", target_file)
+ _notify_telegram(f"⚠️ AiderHeal:无修改产生\n{target_file}")
+ return {
+ "success": False,
+ "action": "CODE_FIX",
+ "message": msg,
+ "commit_sha": None,
+ "reverted": False,
+ }
if diff_lines > MAX_DIFF_LINES:
- # 改動太大,丟棄並升級告警
- _ssh_run(f"cd {REPO_PATH_110} && git checkout -- .", timeout=10)
- msg = (f"[AiderHeal] diff 超出限制 {diff_lines}>{MAX_DIFF_LINES} 行,"
- f"已丟棄,需人工介入")
- logger.warning(msg)
- _notify_telegram(
- f"⚠️ AiderHeal:diff 過大,需人工審核\n"
- f"├ 檔案: {target_file}\n"
- f"├ diff: {diff_lines} 行(上限 {MAX_DIFF_LINES})\n"
- f"└ 錯誤: {error_type}"
+ # 改动太大,丢弃并告警
+ _, _, _ = _ssh_exec(
+ f"cd {REPO_PATH_110} && git checkout -- . 2>&1", timeout=10
)
- return {"success": False, "action": "CODE_FIX",
- "message": msg, "commit_sha": None, "reverted": False}
+ msg = (
+ f"[AiderHeal] diff 超出限制 {diff_lines} > {MAX_DIFF_LINES} 行,"
+ f"已丢弃,需人工介入"
+ )
+ logger.warning("event=diff_too_large file=%s diff_lines=%d", target_file, diff_lines)
+ _notify_telegram(
+ f"⚠️ AiderHeal:diff 过大,需人工审核\n"
+ f"├ 文件: {target_file}\n"
+ f"├ diff: {diff_lines} 行(上限 {MAX_DIFF_LINES})\n"
+ f"└ 错误: {error_type}"
+ )
+ return {
+ "success": False,
+ "action": "CODE_FIX",
+ "message": msg,
+ "commit_sha": None,
+ "reverted": False,
+ }
- # ── Step 4:git commit + push ──────────────────────────────────────────────
+ # ── Step 4:提交并推送 ───────────────────────────────────────────────────
fix_msg = (
f"fix(autoheal): [{error_type}] auto-fix {target_file}\n\n"
f"Triggered by AiderHealExecutor (ADR-014)\n"
@@ -235,72 +318,92 @@ def execute_code_fix(
)
commit_cmd = (
f"cd {REPO_PATH_110} && "
- f'git add {target_file} && '
- f'git commit -m "{fix_msg}" 2>&1 && '
+ f'git add {shlex.quote(target_file)} && '
+ f'git commit -m {shlex.quote(fix_msg)} 2>&1 && '
f"git push {GITEA_REMOTE} main 2>&1"
)
- rc3, commit_out, commit_err = _ssh_run(commit_cmd, timeout=30)
+ rc3, commit_out, commit_err = _ssh_exec(commit_cmd, timeout=30)
- # 取得 commit SHA
- sha_cmd = f"cd {REPO_PATH_110} && git rev-parse --short HEAD"
- _, commit_sha, _ = _ssh_run(sha_cmd)
+ # 获取最新的 commit SHA(从 push 后的 HEAD 获取,更可靠)
+ _, commit_sha, _ = _git_cmd(REPO_PATH_110, ["log", "-1", "--format=%H"], timeout=10)
commit_sha = commit_sha.strip() or "unknown"
if rc3 != 0:
- msg = f"[AiderHeal] git push 失敗: {commit_err or commit_out}"
- logger.error(msg)
- _notify_telegram(f"❌ AiderHeal git push 失敗\n{msg}")
- return {"success": False, "action": "CODE_FIX",
- "message": msg, "commit_sha": None, "reverted": False}
+ msg = f"[AiderHeal] git push 失败: {commit_err or commit_out}"
+ logger.error("event=push_failed error=%s", msg)
+ _notify_telegram(f"❌ AiderHeal git push 失败\n{msg}")
+ return {
+ "success": False,
+ "action": "CODE_FIX",
+ "message": msg,
+ "commit_sha": None,
+ "reverted": False,
+ }
- logger.info("[AiderHeal] push 成功,commit=%s,等待健康檢查...", commit_sha)
+ logger.info("event=push_ok commit=%s", commit_sha)
_notify_telegram(
f"🚀 AiderHeal push 完成\n"
f"├ commit: {commit_sha}\n"
- f"├ 檔案: {target_file}\n"
- f"└ 等待健康檢查..."
+ f"├ 文件: {target_file}\n"
+ f"└ 等待健康检查..."
)
- # ── Step 5:健康檢查(L4 護欄)────────────────────────────────────────────
- time.sleep(20) # 等 CD 部署啟動
- healthy = _health_check(retries=6, interval=10)
+ # ── Step 5:健康检查(L4 护拦) ──────────────────────────────────────────
+ time.sleep(10) # 给部署一点启动缓冲
+ healthy = _wait_for_health(HEALTH_CHECK_URL, timeout_seconds=120, interval_seconds=10)
if healthy:
- msg = f"[AiderHeal] 修復成功並部署完成: {target_file} ({commit_sha})"
- logger.info(msg)
+ msg = f"[AiderHeal] 修复成功并部署完成: {target_file} ({commit_sha})"
+ logger.info("event=heal_success commit=%s file=%s", commit_sha, target_file)
_notify_telegram(
- f"✅ AiderHeal 修復完成\n"
- f"├ 錯誤: {error_type}\n"
- f"├ 檔案: {target_file}\n"
+ f"✅ AiderHeal 修复完成\n"
+ f"├ 错误: {error_type}\n"
+ f"├ 文件: {target_file}\n"
f"├ commit: {commit_sha}\n"
f"└ diff: {diff_lines} 行"
)
- return {"success": True, "action": "CODE_FIX",
- "message": msg, "commit_sha": commit_sha, "reverted": False}
+ return {
+ "success": True,
+ "action": "CODE_FIX",
+ "message": msg,
+ "commit_sha": commit_sha,
+ "reverted": False,
+ }
- # ── Step 6:健康檢查失敗 → 自動 revert(L4 護欄)─────────────────────────
- logger.error("[AiderHeal] 健康檢查失敗,執行自動 revert...")
- revert_cmd = (
+    # ── Step 6: health check failed → automatic revert (guardrail L4) ─────────
+    logger.error("event=health_check_failed commit=%s", commit_sha)
+    rc4, revert_out, revert_err = _ssh_exec(
         f"cd {REPO_PATH_110} && "
         f"git revert --no-edit {commit_sha} 2>&1 && "
-        f"git push {GITEA_REMOTE} main 2>&1"
+        f"git push {GITEA_REMOTE} main 2>&1",
+        timeout=30,
     )
-    rc4, rev_out, rev_err = _ssh_run(revert_cmd, timeout=30)
-    if rc4 == 0:
-        _, revert_sha, _ = _ssh_run(sha_cmd)
-        revert_sha = revert_sha.strip()
-        msg = f"[AiderHeal] 健康檢查失敗,已自動 revert: {commit_sha} → {revert_sha}"
-        logger.warning(msg)
+    # The exit status is the only reliable success signal: git reports most
+    # failures as "fatal: ..." or "CONFLICT ...", which a substring search
+    # for "error" would silently accept as success.
+    reverted = rc4 == 0
+    _, revert_sha, _ = _git_cmd(REPO_PATH_110, ["log", "-1", "--format=%H"], timeout=10)
+    revert_sha = revert_sha.strip() or "unknown"
+
+    if reverted:
+        msg = (
+            f"[AiderHeal] 健康检查失败,已自动回滚: "
+            f"{commit_sha} → {revert_sha}"
+        )
+        logger.warning("event=reverted commit=%s to=%s", commit_sha, revert_sha)
         _notify_telegram(
-            f"🔄 AiderHeal 自動回滾\n"
+            f"🔄 AiderHeal 自动回滚\n"
             f"├ 原 commit: {commit_sha}\n"
-            f"├ 回滾 commit: {revert_sha}\n"
+            f"├ 回滚 commit: {revert_sha}\n"
             f"└ 需人工排查: {error_type} in {target_file}"
         )
     else:
-        msg = f"[AiderHeal] revert 失敗!需立即人工介入: {rev_err}"
-        logger.critical(msg)
-        _notify_telegram(f"🚨 AiderHeal revert 失敗!請立即人工介入\n{msg}")
+        # The remote command redirects stderr into stdout (2>&1), so the git
+        # error text normally arrives in revert_out; stderr only carries
+        # ssh-level failures.
+        revert_detail = revert_err or revert_out
+        msg = f"[AiderHeal] 回滚失败!需立即人工介入: {revert_detail}"
+        logger.critical("event=revert_failed commit=%s error=%s", commit_sha, revert_detail)
+        _notify_telegram(
+            f"🚨 AiderHeal 回滚失败!请立即人工介入\n{msg}"
+        )
-    return {"success": False, "action": "CODE_FIX",
-            "message": msg, "commit_sha": commit_sha, "reverted": rc4 == 0}
+    return {
+        "success": False,
+        "action": "CODE_FIX",
+        "message": msg,
+        "commit_sha": commit_sha,
+        # Report what actually happened; a failed revert must not be
+        # presented to callers as a completed rollback.
+        "reverted": reverted,
+    }
diff --git a/services/auto_heal_service.py b/services/auto_heal_service.py
index 7c3f7b1..1bca61b 100644
--- a/services/auto_heal_service.py
+++ b/services/auto_heal_service.py
@@ -307,14 +307,17 @@ class AutoHealService:
"action": "DOCKER_RESTART",
"message": "Playbook missing 'container' in action_params",
}
- safe_container = re.sub(r'[^a-zA-Z0-9._-]', '', container)
+ safe_container = re.sub(r"[^a-zA-Z0-9._-]", "", container)
if safe_container != container:
- return {"success": False, "action": "DOCKER_RESTART",
- "message": f"Container name contains unsafe chars: {container!r}"}
+ return {
+ "success": False,
+ "action": "DOCKER_RESTART",
+ "message": f"Container name contains unsafe chars: {container!r}",
+ }
# 透過 SSH 跳板(110→188)執行 docker restart(ADR-013 §DOCKER_RESTART)
# 容器內無 Docker socket,必須 SSH 到宿主機執行
- key_path = os.path.join(os.path.dirname(__file__), '..', 'config', 'autoheal_id_ed25519')
+ key_path = os.path.join(os.path.dirname(__file__), "..", "config", "autoheal_id_ed25519")
key_path = os.path.normpath(key_path)
if not os.path.exists(key_path):
logger.warning("[AutoHeal] SSH key 不存在: %s,降級為 ALERT_ONLY", key_path)
@@ -388,7 +391,7 @@ class AutoHealService:
if action_type == "CODE_FIX":
# ADR-014: 透過 Aider 自動修覆程式碼並推版
- target_file = params.get("target_file", "")
+ target_file = context.get("target_file", "")
error_type = context.get("error_type", "UnknownError")
error_message = context.get("error_message", "")
if not target_file:
diff --git a/services/elephant_alpha_autonomous_engine.py b/services/elephant_alpha_autonomous_engine.py
index 0caf89a..de0267b 100644
--- a/services/elephant_alpha_autonomous_engine.py
+++ b/services/elephant_alpha_autonomous_engine.py
@@ -307,7 +307,8 @@ class ElephantAlphaAutonomousEngine:
async def _check_code_exception_trigger(self, trigger: AutonomousTrigger) -> bool:
"""ADR-014: 掃描容器 log 抓取 Python Traceback"""
- import subprocess
+ import os
+ from services.auto_heal_service import SSHJumpExecutor
containers = trigger.conditions.get("scan_containers", ["momo-pro-system", "momo-scheduler"])
error_ptns = trigger.conditions.get("error_patterns", ["Traceback", "ImportError"])
@@ -315,12 +316,36 @@ class ElephantAlphaAutonomousEngine:
error_context = []
target_file = ""
+ # ADR-013 機制:容器內無 docker socket,需透過 SSH 執行
+ key_path = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "config", "autoheal_id_ed25519"))
+ if not os.path.exists(key_path):
+ logger.warning("[ElephantAlpha] SSH key %s 不存在,無法掃描 docker logs", key_path)
+ return False
+
+ executor = SSHJumpExecutor(
+ jump_host="192.168.0.110",
+ jump_user="wooo",
+ jump_key_path=key_path,
+ jump_connect_timeout=5,
+ jump_command_timeout=15,
+ )
+
for c in containers:
try:
# 只掃描最近 5 分鐘的 log
- cmd = ["docker", "logs", "--since", "5m", c]
- result = subprocess.run(cmd, capture_output=True, text=True, timeout=5)
- out = result.stdout + "\n" + result.stderr
+ result = await asyncio.get_event_loop().run_in_executor(
+ None,
+ executor.execute_command,
+ "192.168.0.188",
+ "ollama",
+ ["docker", "logs", "--since", "5m", c]
+ )
+
+ if not result.get("success"):
+ logger.debug("Failed to scan log for %s via SSH", c)
+ continue
+
+ out = result.get("stdout", "") + "\n" + result.get("stderr", "")
# 簡單找 Traceback
if "Traceback (most recent call last):" in out:
@@ -347,7 +372,7 @@ class ElephantAlphaAutonomousEngine:
break # 只抓第一個錯誤
except Exception as e:
- logger.debug(f"Failed to scan log for {c}: {e}")
+ logger.debug(f"Failed to exec SSH scan for {c}: {e}")
if has_error and error_context:
# 暫存到 trigger class 中供後續 _handle 使用