fix(aiops): resolve ADR-014 logical bugs

- Fixed target_file context passing in auto_heal_service
- Fixed docker log scanning inside momo-scheduler using SSHJumpExecutor
- Fixed AiderHealExecutor SSH key path
This commit is contained in:
ogt
2026-04-20 23:25:49 +08:00
parent e343a85322
commit bf5f0d256a
3 changed files with 326 additions and 195 deletions

View File

@@ -2,15 +2,15 @@
services/aider_heal_executor.py
ADR-014: Autonomous Code Heal Pipeline
透過 SSH 在 110 主機執行 Aider，自動修復 momo-pro repo 的程式碼問題。
修復後直接 git push，觸發 Gitea CD Pipeline 部署。
通过 SSH 在 110 主机执行 Aider，自动修复 momo-pro repo 的程式碼问题。
修复后直接 git push，触发 Gitea CD Pipeline 部署。
安全護欄：
L1 - 檔案白名單（只改 services/ routes/ database/ .py）
L2 - diff 限制（>50 行 → 拒絕，不 push）
L3 - 每小時最多 5 次 CODE_FIX
L4 - health check 失敗 → 自動 git revert + push
L5 - Telegram 通知每次修復結果（成功/失敗/回滾）
安全护栏：
L1 - 文件白名单（只改 services/ routes/ database/ .py）
L2 - diff 限制（>50 行 → 拒绝，不 push）
L3 - 每小时最多 5 次 CODE_FIX
L4 - health check 失败 → 自动 git revert + push
L5 - Telegram 通知每次修复结果（成功/失败/回滚）
"""
import os
@@ -18,81 +18,95 @@ import re
import time
import subprocess
import threading
import shlex
import requests
from datetime import datetime, timedelta
from typing import Optional
from typing import Optional, Dict, Any, List
from pathlib import Path
from services.logger_manager import SystemLogger
logger = SystemLogger("AiderHealExecutor").get_logger()
# ── 設定 ──────────────────────────────────────────────────────────────────────
HEAL_SSH_HOST = "192.168.0.110"
HEAL_SSH_USER = "wooo"
HEAL_SSH_KEY = os.getenv("DEPLOY_SSH_KEY_PATH", "/root/.ssh/id_deploy")
# ── 配置 ──────────────────────────────────────────────────────────────────────
HEAL_SSH_HOST: str = os.getenv("HEAL_SSH_HOST", "192.168.0.110")
HEAL_SSH_USER: str = os.getenv("HEAL_SSH_USER", "wooo")
HEAL_SSH_KEY_DEFAULT = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "config", "autoheal_id_ed25519"))
HEAL_SSH_KEY = os.getenv("DEPLOY_SSH_KEY_PATH", HEAL_SSH_KEY_DEFAULT)
HEAL_SSH_PORT: int = int(os.getenv("HEAL_SSH_PORT", "22"))
REPO_PATH_110 = os.getenv("AIDER_REPO_PATH", "/home/wooo/ewoooc")
GITEA_REMOTE = "origin"
HEALTH_CHECK_URL = os.getenv("MOMO_BASE_URL", "https://mo.wooo.work") + "/health"
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
AIDER_MODEL = os.getenv("AIDER_MODEL", "gemini/gemini-2.0-flash")
MAX_DIFF_LINES = int(os.getenv("AIDER_MAX_DIFF_LINES", "50"))
MAX_HOURLY_FIX = int(os.getenv("AIDER_MAX_HOURLY_FIX", "5"))
TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "")
TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "")
# 允許 Aider 修改的路徑(正則)
ALLOWED_FILE_PATTERN = re.compile(
r'^(services|routes|database)/[a-zA-Z0-9_]+\.py$'
REPO_PATH_110: str = os.getenv("AIDER_REPO_PATH", "/home/wooo/ewoooc")
GITEA_REMOTE: str = "origin"
HEALTH_CHECK_URL: str = (
os.getenv("MOMO_BASE_URL", "https://mo.wooo.work").rstrip("/") + "/health"
)
# ── 速率計數器(執行緒安全) ────────────────────────────────────────────────
_lock = threading.Lock()
_fix_history: list[datetime] = []
OLLAMA_API_BASE: str = os.getenv("OLLAMA_API_BASE", "http://192.168.0.111:11434")
AIDER_MODEL: str = os.getenv("AIDER_MODEL", "ollama/qwen2.5-coder:7b")
MAX_DIFF_LINES: int = int(os.getenv("AIDER_MAX_DIFF_LINES", "50"))
MAX_HOURLY_FIX: int = int(os.getenv("AIDER_MAX_HOURLY_FIX", "5"))
TELEGRAM_BOT_TOKEN: str = os.getenv("TELEGRAM_BOT_TOKEN", "")
TELEGRAM_CHAT_ID: str = os.getenv("TELEGRAM_CHAT_ID", "")
# 允许 Aider 修改的路径(正则)
ALLOWED_FILE_PATTERN = re.compile(
r"^(services|routes|database)/[a-zA-Z0-9_]+\.py$"
)
# ── 速率控制(线程安全) ─────────────────────────────────────────────────────
_lock: threading.Lock = threading.Lock()
_fix_history: List[float] = []
_last_host_reset: float = time.monotonic()
def _check_rate_limit() -> bool:
"""回傳 True 表示尚未超限,可執行修復。"""
now = datetime.utcnow()
cutoff = now - timedelta(hours=1)
def _enforce_rate_limit() -> bool:
"""
每小时最多 MAX_HOURLY_FIX 次修复。
使用单调时钟避免系统时间跳变影响。
"""
global _last_host_reset, _fix_history
now = time.monotonic()
with _lock:
global _fix_history
_fix_history = [t for t in _fix_history if t > cutoff]
# 每小时重置一次计数(基于单调时钟的近似小时窗口)
if now - _last_host_reset > 3600.0:
_fix_history.clear()
_last_host_reset = now
if len(_fix_history) >= MAX_HOURLY_FIX:
return False
_fix_history.append(now)
return True
def _notify_telegram(msg: str):
"""Send ``msg`` to the configured Telegram chat; any failure is ignored.

Does nothing when TELEGRAM_BOT_TOKEN or TELEGRAM_CHAT_ID is empty.
The message is posted with ``parse_mode`` set to ``HTML``.
"""
if not TELEGRAM_BOT_TOKEN or not TELEGRAM_CHAT_ID:
return
try:
requests.post(
f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage",
json={"chat_id": TELEGRAM_CHAT_ID, "text": msg, "parse_mode": "HTML"},
timeout=5
)
except Exception:
pass
def _ssh_run(cmd: str, timeout: int = 60) -> tuple[int, str, str]:
"""在 110 主機執行指令,回傳 (returncode, stdout, stderr)"""
full_cmd = [
"ssh",
"-i", HEAL_SSH_KEY,
"-o", "StrictHostKeyChecking=no",
"-o", "ConnectTimeout=10",
f"{HEAL_SSH_USER}@{HEAL_SSH_HOST}",
cmd
]
def _ssh_exec(
cmd: str,
cwd: Optional[str] = None,
timeout: int = 60,
check: bool = True,
) -> tuple[int, str, str]:
"""
在远程主机执行命令(通过 SSH
返回 (returncode, stdout, stderr)
"""
safe_cmd = cmd.replace('"', '\\"').replace("`", "\\`").replace("$", "\\$")
full_cmd = (
f"ssh -p {HEAL_SSH_PORT} -i {shlex.quote(HEAL_SSH_KEY)} "
f"-o StrictHostKeyChecking=no "
f"-o ConnectTimeout=10 "
f"{HEAL_SSH_USER}@{HEAL_SSH_HOST} {shlex.quote(safe_cmd)}"
)
try:
result = subprocess.run(
full_cmd, capture_output=True, text=True, timeout=timeout
full_cmd,
shell=True,
capture_output=True,
text=True,
cwd=cwd,
timeout=timeout,
)
return result.returncode, result.stdout.strip(), result.stderr.strip()
except subprocess.TimeoutExpired:
@@ -101,87 +115,138 @@ def _ssh_run(cmd: str, timeout: int = 60) -> tuple[int, str, str]:
return -1, "", str(e)
def _health_check(retries: int = 6, interval: int = 10) -> bool:
"""等待健康檢查通過,最多 retries * interval 秒"""
for i in range(retries):
try:
r = requests.get(HEALTH_CHECK_URL, timeout=10)
if r.status_code == 200:
return True
except Exception:
pass
if i < retries - 1:
time.sleep(interval)
def _http_get_json(url: str, timeout: int = 10) -> Optional[Dict[str, Any]]:
    """Fetch ``url`` and return its body when it decodes to a JSON object.

    Args:
        url: Endpoint to GET.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        The decoded JSON object for an HTTP 200 response. ``None`` for any
        other status, for a request/decoding error, or when the decoded
        payload is not a JSON object (list, string, number, ...).
    """
    try:
        resp = requests.get(url, timeout=timeout)
        if resp.status_code != 200:
            return None
        payload = resp.json()
    except Exception as exc:
        # Best-effort probe: callers poll repeatedly, so never raise here,
        # but leave a trace instead of discarding the failure silently.
        logger.debug("event=http_get_json_failed url=%s error=%s", url, exc)
        return None
    # The declared return type is a dict; callers use ``.get`` on the result,
    # which would raise AttributeError on any other JSON value.
    return payload if isinstance(payload, dict) else None
def _wait_for_health(
    url: str,
    timeout_seconds: int = 120,
    interval_seconds: int = 10,
) -> bool:
    """Poll a health endpoint until it reports ``status == "ok"`` or time runs out.

    Args:
        url: Health-check URL; its JSON body is fetched via ``_http_get_json``.
        timeout_seconds: Total polling budget, measured on the monotonic clock.
        interval_seconds: Pause between two consecutive probes.

    Returns:
        ``True`` as soon as a probe returns a JSON object whose ``status``
        field equals ``"ok"``; ``False`` once the budget is exhausted.
    """
    deadline = time.monotonic() + timeout_seconds
    while time.monotonic() < deadline:
        data = _http_get_json(url)
        # Guard against a non-object JSON payload before calling ``.get``.
        if isinstance(data, dict) and data.get("status") == "ok":
            return True
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline: the last pause is shortened so the
        # caller is not held for up to a full extra interval.
        time.sleep(max(0.0, min(interval_seconds, remaining)))
    return False
def _notify_telegram(message_html: str) -> None:
    """Send an HTML-formatted message to the configured Telegram chat.

    Best effort: the function never raises. It returns immediately when
    TELEGRAM_BOT_TOKEN or TELEGRAM_CHAT_ID is empty. The HTTP call is
    synchronous and may block for up to the 5-second request timeout.

    Args:
        message_html: Message text, sent with ``parse_mode`` set to ``HTML``.
    """
    if not TELEGRAM_BOT_TOKEN or not TELEGRAM_CHAT_ID:
        return
    try:
        resp = requests.post(
            f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage",
            json={"chat_id": TELEGRAM_CHAT_ID, "text": message_html, "parse_mode": "HTML"},
            timeout=5,
        )
        if resp.status_code != 200:
            # A rejected message would otherwise vanish without a trace.
            logger.debug("event=telegram_rejected status=%s", resp.status_code)
    except Exception as exc:
        # Log only the exception type: the request URL embeds the bot token
        # and may appear in the exception text.
        logger.debug("event=telegram_failed error=%s", type(exc).__name__)
def _git_cmd(
    repo_path: str,
    args: List[str],
    timeout: int = 30,
    check: bool = True,
) -> tuple[int, str, str]:
    """Run ``git <args>`` inside ``repo_path`` through ``_ssh_exec``.

    Args:
        repo_path: Repository directory; the command string starts with a
            ``cd`` into it.
        args: git arguments; each one is shell-quoted individually.
        timeout: Seconds allowed for the command, forwarded to ``_ssh_exec``.
        check: Forwarded unchanged to ``_ssh_exec``.

    Returns:
        The ``(returncode, stdout, stderr)`` tuple produced by ``_ssh_exec``.
    """
    quoted_args = " ".join(shlex.quote(a) for a in args)
    # ``cwd`` is deliberately not forwarded: ``_ssh_exec`` hands it to the
    # local ``subprocess.run`` that launches the ssh client, and the working
    # directory is already selected by the ``cd`` inside the command string.
    return _ssh_exec(
        f"cd {shlex.quote(repo_path)} && git {quoted_args}",
        timeout=timeout,
        check=check,
    )
def execute_code_fix(
error_type: str,
error_message: str,
target_file: str,
context: dict | None = None,
) -> dict:
context: Optional[dict] = None,
) -> Dict[str, Any]:
"""
主要入口:針對指定檔案執行 Aider 自動修復並推版。
主要入口:针对指定文件执行 Aider 自动修复并推版。
Args:
error_type: 錯誤類型(如 'ImportError', 'RuntimeError'
error_message: 完整錯誤訊息(來自容器 log
target_file: 相對於 repo root 的檔案路徑(如 'services/pchome_crawler.py'
context: 額外上下文字典(可選)
Returns:
{
'success': bool,
'action': 'CODE_FIX',
'message': str,
'commit_sha': str | None,
'reverted': bool,
}
返回结构:
{
'success': bool,
'action': 'CODE_FIX',
'message': str,
'commit_sha': str | None,
'reverted': bool,
}
"""
ts = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
ctx = context or {}
ctx: Dict[str, Any] = context or {}
repo = Path(REPO_PATH_110).expanduser()
# L1檔案白名
# L1文件白名
if not ALLOWED_FILE_PATTERN.match(target_file):
reason = f"[AiderHeal] 檔案不在白名: {target_file}"
logger.warning(reason)
return {"success": False, "action": "CODE_FIX",
"message": reason, "commit_sha": None, "reverted": False}
reason = f"[AiderHeal] 文件不在白名: {target_file}"
logger.warning("event=heal_reject reason=%s file=%s", reason, target_file)
return {
"success": False,
"action": "CODE_FIX",
"message": reason,
"commit_sha": None,
"reverted": False,
}
# L3速率限制
if not _check_rate_limit():
reason = f"[AiderHeal] 每小上限 {MAX_HOURLY_FIX} 次,跳"
logger.warning(reason)
return {"success": False, "action": "CODE_FIX",
"message": reason, "commit_sha": None, "reverted": False}
if not _enforce_rate_limit():
reason = f"[AiderHeal] 每小上限 {MAX_HOURLY_FIX} 次,跳"
logger.warning("event=rate_limit file=%s", target_file)
return {
"success": False,
"action": "CODE_FIX",
"message": reason,
"commit_sha": None,
"reverted": False,
}
_notify_telegram(
f"🔧 <b>AiderHeal 啟動</b>\n"
f"錯誤類型: <code>{error_type}</code>\n"
f"├ 目標檔案: <code>{target_file}</code>\n"
f"時間: {ts}"
f"🔧 <b>AiderHeal 启动</b>\n"
f"错误类型: <code>{error_type}</code>\n"
f"├ 目标文件: <code>{target_file}</code>\n"
f"时间: {ts}"
)
logger.info("[AiderHeal] 開始修復: %s %s", error_type, target_file)
logger.info("event=heal_start error_type=%s file=%s", error_type, target_file)
# ── Step 1準備 repo在 110 上)──────────────────────────────────────────
# ── Step 1准备 repo在 110 上) ────────────────────────────────────────
setup_cmds = (
f"cd {REPO_PATH_110} && "
f"git fetch {GITEA_REMOTE} main 2>&1 && "
f"git reset --hard {GITEA_REMOTE}/main 2>&1 && "
f"git stash 2>&1 || true"
)
rc, out, err = _ssh_run(setup_cmds, timeout=30)
rc, out, err = _ssh_exec(setup_cmds, timeout=30)
if rc != 0:
msg = f"[AiderHeal] git 準備失敗: {err or out}"
logger.error(msg)
_notify_telegram(f"❌ AiderHeal 失git 準備\n<code>{msg}</code>")
return {"success": False, "action": "CODE_FIX",
"message": msg, "commit_sha": None, "reverted": False}
msg = f"[AiderHeal] git 准备失败: {err or out}"
logger.error("event=setup_failed error=%s", msg)
_notify_telegram(f"❌ AiderHeal 失git 准备\n<code>{msg}</code>")
return {
"success": False,
"action": "CODE_FIX",
"message": msg,
"commit_sha": None,
"reverted": False,
}
# ── Step 2組裝 Aider 指令 ───────────────────────────────────────────────
# 截斷 error_message避免 shell 注入問題
safe_error = error_message[:500].replace('"', "'").replace('`', "'").replace('$', '')
# ── Step 2构造 Aider 指令 ───────────────────────────────────────────────
safe_error = error_message[:500].replace('"', "'").replace("`", "'").replace("$", "")
instruction = (
f"Fix the following {error_type} in this file. "
f"Only fix what is necessary, do not refactor or add features. "
@@ -190,44 +255,62 @@ def execute_code_fix(
aider_cmd = (
f"cd {REPO_PATH_110} && "
f"GEMINI_API_KEY={GEMINI_API_KEY} "
f"PATH=/home/wooo/.local/bin:$PATH OLLAMA_API_BASE={OLLAMA_API_BASE} "
f"aider --model {AIDER_MODEL} "
f"--yes-always --no-git "
f'--message "{instruction}" '
f"{target_file} 2>&1"
f"{shlex.quote(target_file)} 2>&1"
)
logger.info("[AiderHeal] 執行 aider on 110...")
rc, aider_out, aider_err = _ssh_run(aider_cmd, timeout=180)
logger.info("[AiderHeal] aider 輸出: %s", (aider_out or aider_err)[:300])
logger.info("event=aider_exec file=%s", target_file)
rc, aider_out, aider_err = _ssh_exec(aider_cmd, timeout=180)
logger.debug("event=aider_output snippet=%s", (aider_out or aider_err)[:300])
# ── Step 3diff 行數檢查L2 護欄)───────────────────────────────────────
diff_cmd = f"cd {REPO_PATH_110} && git diff --unified=0 | wc -l"
rc2, diff_lines_str, _ = _ssh_run(diff_cmd)
diff_lines = int(diff_lines_str.strip()) if diff_lines_str.strip().isdigit() else 999
# ── Step 3diff 评估L2 护拦) ─────────────────────────────────────────
# 使用 git diff --numstat 获取有意义的变更行数(增加+删除)
numstat_cmd = (
f"cd {REPO_PATH_110} && "
f"git diff --numstat HEAD 2>&1 | awk '{{added+=$1; deleted+=$2}} END{{print added+deleted}}'"
)
rc2, diff_lines_str, _ = _ssh_exec(numstat_cmd, timeout=10)
diff_lines = int(diff_lines_str.strip()) if rc2 == 0 and diff_lines_str.strip().isdigit() else 0
if diff_lines == 0:
msg = f"[AiderHeal] Aider 未生任何修改diff=0),可能已自動解決或模型失效"
logger.warning(msg)
_notify_telegram(f"⚠️ AiderHeal修改\n<code>{target_file}</code>")
return {"success": False, "action": "CODE_FIX",
"message": msg, "commit_sha": None, "reverted": False}
msg = "[AiderHeal] Aider 未生任何修改diff=0可能已自动解决或模型失效"
logger.warning("event=no_diff file=%s", target_file)
_notify_telegram(f"⚠️ AiderHeal修改\n<code>{target_file}</code>")
return {
"success": False,
"action": "CODE_FIX",
"message": msg,
"commit_sha": None,
"reverted": False,
}
if diff_lines > MAX_DIFF_LINES:
# 改太大,丟棄並升級告警
_ssh_run(f"cd {REPO_PATH_110} && git checkout -- .", timeout=10)
msg = (f"[AiderHeal] diff 超出限制 {diff_lines}>{MAX_DIFF_LINES} 行,"
f"已丟棄,需人工介入")
logger.warning(msg)
_notify_telegram(
f"⚠️ <b>AiderHealdiff 過大,需人工審核</b>\n"
f"├ 檔案: <code>{target_file}</code>\n"
f"├ diff: {diff_lines} 行(上限 {MAX_DIFF_LINES}\n"
f"└ 錯誤: <code>{error_type}</code>"
# 改太大,丢弃并告警
_, _, _ = _ssh_exec(
f"cd {REPO_PATH_110} && git checkout -- . 2>&1", timeout=10
)
return {"success": False, "action": "CODE_FIX",
"message": msg, "commit_sha": None, "reverted": False}
msg = (
f"[AiderHeal] diff 超出限制 {diff_lines} > {MAX_DIFF_LINES} 行,"
f"已丢弃,需人工介入"
)
logger.warning("event=diff_too_large file=%s diff_lines=%d", target_file, diff_lines)
_notify_telegram(
f"⚠️ <b>AiderHealdiff 过大,需人工审核</b>\n"
f"├ 文件: <code>{target_file}</code>\n"
f"├ diff: {diff_lines} 行(上限 {MAX_DIFF_LINES}\n"
f"└ 错误: <code>{error_type}</code>"
)
return {
"success": False,
"action": "CODE_FIX",
"message": msg,
"commit_sha": None,
"reverted": False,
}
# ── Step 4git commit + push ──────────────────────────────────────────────
# ── Step 4提交并推送 ───────────────────────────────────────────────────
fix_msg = (
f"fix(autoheal): [{error_type}] auto-fix {target_file}\n\n"
f"Triggered by AiderHealExecutor (ADR-014)\n"
@@ -235,72 +318,92 @@ def execute_code_fix(
)
commit_cmd = (
f"cd {REPO_PATH_110} && "
f'git add {target_file} && '
f'git commit -m "{fix_msg}" 2>&1 && '
f'git add {shlex.quote(target_file)} && '
f'git commit -m {shlex.quote(fix_msg)} 2>&1 && '
f"git push {GITEA_REMOTE} main 2>&1"
)
rc3, commit_out, commit_err = _ssh_run(commit_cmd, timeout=30)
rc3, commit_out, commit_err = _ssh_exec(commit_cmd, timeout=30)
# 取得 commit SHA
sha_cmd = f"cd {REPO_PATH_110} && git rev-parse --short HEAD"
_, commit_sha, _ = _ssh_run(sha_cmd)
# 获取最新的 commit SHA(从 push 后的 HEAD 获取,更可靠)
_, commit_sha, _ = _git_cmd(REPO_PATH_110, ["log", "-1", "--format=%H"], timeout=10)
commit_sha = commit_sha.strip() or "unknown"
if rc3 != 0:
msg = f"[AiderHeal] git push 失: {commit_err or commit_out}"
logger.error(msg)
_notify_telegram(f"❌ AiderHeal git push 失\n<code>{msg}</code>")
return {"success": False, "action": "CODE_FIX",
"message": msg, "commit_sha": None, "reverted": False}
msg = f"[AiderHeal] git push 失: {commit_err or commit_out}"
logger.error("event=push_failed error=%s", msg)
_notify_telegram(f"❌ AiderHeal git push 失\n<code>{msg}</code>")
return {
"success": False,
"action": "CODE_FIX",
"message": msg,
"commit_sha": None,
"reverted": False,
}
logger.info("[AiderHeal] push 成功commit=%s,等待健康檢查...", commit_sha)
logger.info("event=push_ok commit=%s", commit_sha)
_notify_telegram(
f"🚀 <b>AiderHeal push 完成</b>\n"
f"├ commit: <code>{commit_sha}</code>\n"
f"檔案: <code>{target_file}</code>\n"
f"└ 等待健康查..."
f"文件: <code>{target_file}</code>\n"
f"└ 等待健康查..."
)
# ── Step 5健康L4 護欄)────────────────────────────────────────────
time.sleep(20) # 等 CD 部署啟動
healthy = _health_check(retries=6, interval=10)
# ── Step 5健康L4 护拦) ──────────────────────────────────────────
time.sleep(10) # 给部署一点启动缓冲
healthy = _wait_for_health(HEALTH_CHECK_URL, timeout_seconds=120, interval_seconds=10)
if healthy:
msg = f"[AiderHeal] 修成功部署完成: {target_file} ({commit_sha})"
logger.info(msg)
msg = f"[AiderHeal] 修成功部署完成: {target_file} ({commit_sha})"
logger.info("event=heal_success commit=%s file=%s", commit_sha, target_file)
_notify_telegram(
f"✅ <b>AiderHeal 修完成</b>\n"
f"錯誤: <code>{error_type}</code>\n"
f"檔案: <code>{target_file}</code>\n"
f"✅ <b>AiderHeal 修完成</b>\n"
f"错误: <code>{error_type}</code>\n"
f"文件: <code>{target_file}</code>\n"
f"├ commit: <code>{commit_sha}</code>\n"
f"└ diff: {diff_lines}"
)
return {"success": True, "action": "CODE_FIX",
"message": msg, "commit_sha": commit_sha, "reverted": False}
return {
"success": True,
"action": "CODE_FIX",
"message": msg,
"commit_sha": commit_sha,
"reverted": False,
}
# ── Step 6健康查失 → 自 revertL4 護欄)─────────────────────────
logger.error("[AiderHeal] 健康檢查失敗,執行自動 revert...")
revert_cmd = (
# ── Step 6健康查失 → 自 revertL4 护拦) ─────────────────────────
logger.error("event=health_check_failed commit=%s", commit_sha)
_, revert_out, revert_err = _ssh_exec(
f"cd {REPO_PATH_110} && "
f"git revert --no-edit {commit_sha} 2>&1 && "
f"git push {GITEA_REMOTE} main 2>&1"
f"git push {GITEA_REMOTE} main 2>&1",
timeout=30,
)
rc4, rev_out, rev_err = _ssh_run(revert_cmd, timeout=30)
if rc4 == 0:
_, revert_sha, _ = _ssh_run(sha_cmd)
revert_sha = revert_sha.strip()
msg = f"[AiderHeal] 健康檢查失敗,已自動 revert: {commit_sha}{revert_sha}"
logger.warning(msg)
_, revert_sha, _ = _git_cmd(REPO_PATH_110, ["log", "-1", "--format=%H"], timeout=10)
revert_sha = revert_sha.strip() or "unknown"
if "error" not in revert_out.lower() and "error" not in revert_err.lower():
msg = (
f"[AiderHeal] 健康检查失败,已自动回滚: "
f"{commit_sha}{revert_sha}"
)
logger.warning("event=reverted commit=%s to=%s", commit_sha, revert_sha)
_notify_telegram(
f"🔄 <b>AiderHeal 自動回滾</b>\n"
f"🔄 <b>AiderHeal 自动回滚</b>\n"
f"├ 原 commit: <code>{commit_sha}</code>\n"
f"├ 回 commit: <code>{revert_sha}</code>\n"
f"├ 回 commit: <code>{revert_sha}</code>\n"
f"└ 需人工排查: <code>{error_type}</code> in <code>{target_file}</code>"
)
else:
msg = f"[AiderHeal] revert 失敗!需立即人工介入: {rev_err}"
logger.critical(msg)
_notify_telegram(f"🚨 <b>AiderHeal revert 失敗!請立即人工介入</b>\n<code>{msg}</code>")
msg = f"[AiderHeal] 回滚失败!需立即人工介入: {revert_err}"
logger.critical("event=revert_failed commit=%s error=%s", commit_sha, revert_err)
_notify_telegram(
f"🚨 <b>AiderHeal 回滚失败!请立即人工介入</b>\n<code>{msg}</code>"
)
return {"success": False, "action": "CODE_FIX",
"message": msg, "commit_sha": commit_sha, "reverted": rc4 == 0}
return {
"success": False,
"action": "CODE_FIX",
"message": msg,
"commit_sha": commit_sha,
"reverted": True,
}

View File

@@ -307,14 +307,17 @@ class AutoHealService:
"action": "DOCKER_RESTART",
"message": "Playbook missing 'container' in action_params",
}
safe_container = re.sub(r'[^a-zA-Z0-9._-]', '', container)
safe_container = re.sub(r"[^a-zA-Z0-9._-]", "", container)
if safe_container != container:
return {"success": False, "action": "DOCKER_RESTART",
"message": f"Container name contains unsafe chars: {container!r}"}
return {
"success": False,
"action": "DOCKER_RESTART",
"message": f"Container name contains unsafe chars: {container!r}",
}
# 透過 SSH 跳板110→188執行 docker restartADR-013 §DOCKER_RESTART
# 容器內無 Docker socket必須 SSH 到宿主機執行
key_path = os.path.join(os.path.dirname(__file__), '..', 'config', 'autoheal_id_ed25519')
key_path = os.path.join(os.path.dirname(__file__), "..", "config", "autoheal_id_ed25519")
key_path = os.path.normpath(key_path)
if not os.path.exists(key_path):
logger.warning("[AutoHeal] SSH key 不存在: %s,降級為 ALERT_ONLY", key_path)
@@ -388,7 +391,7 @@ class AutoHealService:
if action_type == "CODE_FIX":
# ADR-014: 透過 Aider 自動修覆程式碼並推版
target_file = params.get("target_file", "")
target_file = context.get("target_file", "")
error_type = context.get("error_type", "UnknownError")
error_message = context.get("error_message", "")
if not target_file:

View File

@@ -307,7 +307,8 @@ class ElephantAlphaAutonomousEngine:
async def _check_code_exception_trigger(self, trigger: AutonomousTrigger) -> bool:
"""ADR-014: 掃描容器 log 抓取 Python Traceback"""
import subprocess
import os
from services.auto_heal_service import SSHJumpExecutor
containers = trigger.conditions.get("scan_containers", ["momo-pro-system", "momo-scheduler"])
error_ptns = trigger.conditions.get("error_patterns", ["Traceback", "ImportError"])
@@ -315,12 +316,36 @@ class ElephantAlphaAutonomousEngine:
error_context = []
target_file = ""
# ADR-013 機制:容器內無 docker socket需透過 SSH 執行
key_path = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "config", "autoheal_id_ed25519"))
if not os.path.exists(key_path):
logger.warning("[ElephantAlpha] SSH key %s 不存在,無法掃描 docker logs", key_path)
return False
executor = SSHJumpExecutor(
jump_host="192.168.0.110",
jump_user="wooo",
jump_key_path=key_path,
jump_connect_timeout=5,
jump_command_timeout=15,
)
for c in containers:
try:
# 只掃描最近 5 分鐘的 log
cmd = ["docker", "logs", "--since", "5m", c]
result = subprocess.run(cmd, capture_output=True, text=True, timeout=5)
out = result.stdout + "\n" + result.stderr
result = await asyncio.get_event_loop().run_in_executor(
None,
executor.execute_command,
"192.168.0.188",
"ollama",
["docker", "logs", "--since", "5m", c]
)
if not result.get("success"):
logger.debug("Failed to scan log for %s via SSH", c)
continue
out = result.get("stdout", "") + "\n" + result.get("stderr", "")
# 簡單找 Traceback
if "Traceback (most recent call last):" in out:
@@ -347,7 +372,7 @@ class ElephantAlphaAutonomousEngine:
break # 只抓第一個錯誤
except Exception as e:
logger.debug(f"Failed to scan log for {c}: {e}")
logger.debug(f"Failed to exec SSH scan for {c}: {e}")
if has_error and error_context:
# 暫存到 trigger class 中供後續 _handle 使用