476 lines
18 KiB
Python
476 lines
18 KiB
Python
"""
|
||
services/aider_heal_executor.py
|
||
ADR-020: Autonomous Code Heal Pipeline(Code Review 全自動修復端到端執行器)
|
||
|
||
透過 SSH 在 110 主機執行 Aider,自動修復 momo-pro repo 的程式碼問題,
|
||
修復後直接 git push,觸發 Gitea CD Pipeline 部署。
|
||
|
||
分支策略:直推 main,依賴 CD pipeline 健康檢查與 git revert 作回滾安全網。
|
||
(不採 PR 流程,呼應 ADR-020「全自動修復、無人工審查門檻」精神)
|
||
|
||
安全護欄:
|
||
L1 - 檔案白名單(只改 services/ routes/ database/ 內 .py)
|
||
L2 - diff 限制(>50 行 → 拒絕,不 push)
|
||
L3 - 每小時最多 5 次 CODE_FIX
|
||
L4 - health check 失敗 → 自動 git revert + push
|
||
L5 - Telegram 通知每次修復結果(成功/失敗/回滾)
|
||
"""
|
||
|
||
import json
|
||
import os
|
||
import re
|
||
import time
|
||
import threading
|
||
import shlex
|
||
import requests
|
||
from datetime import datetime, timedelta
|
||
from typing import Optional, Dict, Any, List
|
||
from pathlib import Path
|
||
|
||
from services.logger_manager import SystemLogger
|
||
from utils.ssh_helper import run_ssh_command
|
||
|
||
logger = SystemLogger("AiderHealExecutor").get_logger()
|
||
|
||
# ── Configuration ─────────────────────────────────────────────────────────────
# SSH target used for every remote command; each value can be overridden
# through the environment variable named in the corresponding getenv call.
HEAL_SSH_HOST: str = os.getenv("HEAL_SSH_HOST", "192.168.0.110")
HEAL_SSH_USER: str = os.getenv("HEAL_SSH_USER", "wooo")
# Default private key location: <directory of this file>/../config/autoheal_id_ed25519
HEAL_SSH_KEY_DEFAULT = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "config", "autoheal_id_ed25519"))
# NOTE: the key override is read from DEPLOY_SSH_KEY_PATH (not a HEAL_* variable).
HEAL_SSH_KEY = os.getenv("DEPLOY_SSH_KEY_PATH", HEAL_SSH_KEY_DEFAULT)
# int() runs at import time, so a non-numeric HEAL_SSH_PORT raises ValueError on import.
HEAL_SSH_PORT: int = int(os.getenv("HEAL_SSH_PORT", "22"))

# Path of the git working copy on the remote host that Aider edits.
REPO_PATH_110: str = os.getenv("AIDER_REPO_PATH", "/home/wooo/ewoooc")
# Name of the git remote used for fetch / push.
GITEA_REMOTE: str = "origin"
# Health endpoint polled after a push: MOMO_BASE_URL (trailing "/" removed) + "/health".
HEALTH_CHECK_URL: str = (
    os.getenv("MOMO_BASE_URL", "https://mo.wooo.work").rstrip("/") + "/health"
)
|
||
|
||
# ADR-027 Phase 2 N2:OLLAMA_API_BASE 改 lazy resolve(GCP 優先 / 111 備援)。
|
||
# 注意:本變數透過 SSH 傳入 110 上的 Aider CLI 執行環境(見 execute_code_fix 內 aider_cmd 的 OLLAMA_API_BASE=...),
|
||
# 所以每次 execute_code_fix 啟動時才需要值;此處僅作為「無 env 時的預設」。
|
||
# 顯式 env 設定者(運維/.env)優先使用,符合向下相容。
|
||
def _default_ollama_api_base() -> str:
    """Resolve the Ollama API base URL handed to the Aider CLI.

    Resolution order:
      1. a non-empty ``OLLAMA_API_BASE`` environment variable;
      2. ``services.ollama_service.resolve_ollama_host()`` (imported lazily,
         on each call);
      3. the hard-coded intranet address, used when step 2 raises for any
         reason (including the import itself failing).
    """
    api_base = os.getenv("OLLAMA_API_BASE")
    if not api_base:
        try:
            from services.ollama_service import resolve_ollama_host

            api_base = resolve_ollama_host()
        except Exception:
            # Last-resort fallback: keep the historical intranet default.
            api_base = "http://192.168.0.111:11434"
    return api_base
|
||
|
||
|
||
# NOTE: this module-level attribute is kept for backward compatibility
# (tests / monkey-patching); execute_code_fix re-evaluates the value on every
# run instead of relying on this import-time snapshot.
OLLAMA_API_BASE: str = _default_ollama_api_base()
# Model identifier passed to `aider --model`.
AIDER_MODEL: str = os.getenv("AIDER_MODEL", "ollama/qwen2.5-coder:7b")

# Guardrail limits: maximum changed lines (added + deleted) accepted per fix,
# and maximum number of fixes allowed per hour.
MAX_DIFF_LINES: int = int(os.getenv("AIDER_MAX_DIFF_LINES", "50"))
MAX_HOURLY_FIX: int = int(os.getenv("AIDER_MAX_HOURLY_FIX", "5"))

TELEGRAM_BOT_TOKEN: str = os.getenv("TELEGRAM_BOT_TOKEN", "")
# TELEGRAM_CHAT_IDS is parsed as JSON and its first element is used as the
# chat id. If the parsed value is empty, or parsing/indexing raises, the
# TELEGRAM_CHAT_ID environment variable is used instead (default "").
_chat_ids_raw = os.getenv("TELEGRAM_CHAT_IDS", "[]")
try:
    _chat_ids_list = json.loads(_chat_ids_raw)
    TELEGRAM_CHAT_ID: str = str(_chat_ids_list[0]) if _chat_ids_list else os.getenv("TELEGRAM_CHAT_ID", "")
except Exception:
    TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "")

# Paths Aider is allowed to modify (regular expression): a .py file located
# directly under services/, routes/ or database/ whose base name consists
# only of ASCII letters, digits and underscores.
ALLOWED_FILE_PATTERN = re.compile(
    r"^(services|routes|database)/[a-zA-Z0-9_]+\.py$"
)
|
||
|
||
# ── Rate limiting (thread-safe) ───────────────────────────────────────────────
# Lock held by _enforce_rate_limit while it reads or mutates the state below.
_lock: threading.Lock = threading.Lock()
# time.monotonic() timestamps of the fixes that were admitted by the limiter.
_fix_history: List[float] = []
# Monotonic timestamp captured at import time (rate-limit bookkeeping).
_last_host_reset: float = time.monotonic()
|
||
|
||
|
||
def _enforce_rate_limit() -> bool:
    """Admit at most ``MAX_HOURLY_FIX`` fixes in any rolling 3600-second span.

    A sliding window is used: timestamps older than one hour are pruned on
    every call, so a burst just before and just after a window boundary can
    no longer add up to twice the configured limit. ``time.monotonic()`` is
    used so wall-clock adjustments do not affect the window.

    Returns:
        True when the caller may proceed (the attempt is recorded),
        False when the hourly quota is already used up.
    """
    with _lock:
        now = time.monotonic()
        window_start = now - 3600.0

        # Prune in place so any external reference to the list stays valid.
        _fix_history[:] = [stamp for stamp in _fix_history if stamp > window_start]

        if len(_fix_history) >= MAX_HOURLY_FIX:
            return False

        _fix_history.append(now)
        return True
|
||
|
||
|
||
def _ssh_exec(
    cmd: str,
    cwd: Optional[str] = None,
    timeout: int = 60,
    check: bool = True,
) -> tuple[int, str, str]:
    """Run ``cmd`` on the heal host through ``run_ssh_command``.

    The connection parameters come from the module-level ``HEAL_SSH_*``
    constants; the connect timeout is fixed at 10 seconds.

    Args:
        cmd: Command string forwarded as-is to ``run_ssh_command``.
        cwd: Optional working directory forwarded to ``run_ssh_command``.
        timeout: Command timeout in seconds.
        check: Accepted for signature compatibility; this function does not
            consult it, so a non-zero exit status never raises here.

    Returns:
        ``(returncode, stdout, stderr)`` taken from the helper's result.

    NOTE(review): how the command is quoted/executed is decided inside
    ``utils.ssh_helper.run_ssh_command`` — callers should still quote any
    untrusted value they embed in ``cmd``.
    """
    ssh_kwargs = {
        "host": HEAL_SSH_HOST,
        "user": HEAL_SSH_USER,
        "command": cmd,
        "port": HEAL_SSH_PORT,
        "key_path": HEAL_SSH_KEY,
        "connect_timeout": 10,
        "command_timeout": timeout,
        "cwd": cwd,
        "logger": logger,
    }
    completed = run_ssh_command(**ssh_kwargs)
    return completed.returncode, completed.stdout, completed.stderr
|
||
|
||
|
||
def _http_get_json(url: str, timeout: int = 10) -> Optional[Dict[str, Any]]:
    """GET ``url`` and return the decoded JSON object, or None on any failure.

    None is returned when the request raises, the status code is not 200,
    the body is not valid JSON, or the decoded JSON is not an object (dict).
    Failures are logged at debug level rather than silently discarded so a
    failing health probe can be diagnosed.

    Args:
        url: Address to request.
        timeout: Timeout in seconds passed to ``requests.get``.
    """
    try:
        resp = requests.get(url, timeout=timeout)
        if resp.status_code != 200:
            logger.debug("event=http_get_json_status url=%s status=%s", url, resp.status_code)
            return None
        payload = resp.json()
    except Exception as exc:
        # Best-effort probe: never propagate, but leave a trace for debugging.
        logger.debug("event=http_get_json_failed url=%s error=%s", url, exc)
        return None

    # Honour the declared return type: callers use dict methods on the result.
    return payload if isinstance(payload, dict) else None
|
||
|
||
|
||
def _wait_for_health(
    url: str,
    timeout_seconds: int = 120,
    interval_seconds: int = 10,
) -> bool:
    """Poll the health endpoint until it reports ``status == "ok"`` or time runs out.

    Args:
        url: Health endpoint to poll via ``_http_get_json``.
        timeout_seconds: Total time budget, measured with ``time.monotonic()``.
        interval_seconds: Pause between two polls.

    Returns:
        True as soon as a poll returns a JSON object whose ``"status"`` is
        ``"ok"``; False once the budget is exhausted (immediately when the
        budget is zero or negative).
    """
    deadline = time.monotonic() + timeout_seconds
    while time.monotonic() < deadline:
        data = _http_get_json(url)
        # Guard against non-dict payloads so an unexpected response shape
        # counts as "unhealthy" instead of raising AttributeError.
        if isinstance(data, dict) and data.get("status") == "ok":
            return True

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        time.sleep(min(interval_seconds, remaining))
    return False
|
||
|
||
|
||
def _notify_telegram(message_html: str) -> None:
    """Send a best-effort heal notification; every failure is swallowed.

    Nothing is sent unless both ``TELEGRAM_BOT_TOKEN`` and
    ``TELEGRAM_CHAT_ID`` are configured.

    ADR-019 Phase 5: the message is routed through
    ``services.event_router.dispatch_sync`` as an ``aider_heal_event`` with
    severity ``warning``. The summary carries the first 400 characters of
    the message and the payload carries the full text. Any exception
    (including a failed import) is ignored so callers are never affected.
    """
    telegram_configured = bool(TELEGRAM_BOT_TOKEN) and bool(TELEGRAM_CHAT_ID)
    if not telegram_configured:
        return

    try:
        from services.event_router import dispatch_sync

        heal_event = {
            "event_type": "aider_heal_event",
            "severity": "warning",
            "source": "AiderHealExecutor",
            "title": "Aider 自動修復通知",
            "summary": message_html[:400],
            "status": "heal_notification",
            "payload": {"raw_message_html": message_html},
        }
        dispatch_sync(event=heal_event, admin_chat_ids=[TELEGRAM_CHAT_ID])
    except Exception:
        # Deliberately silent: notification problems must not break healing.
        return
|
||
|
||
|
||
def _git_cmd(
    repo_path: str,
    args: List[str],
    timeout: int = 30,
    check: bool = True,
) -> tuple[int, str, str]:
    """Run ``git <args>`` inside ``repo_path`` on the remote host.

    The repository path and every argument are shell-quoted before being
    joined into the command string handed to ``_ssh_exec``.

    Args:
        repo_path: Repository directory; used both for the ``cd`` prefix and
            as the ``cwd`` forwarded to ``_ssh_exec``.
        args: git sub-command and its arguments, one list item per argument.
        timeout: Command timeout in seconds.
        check: Forwarded unchanged to ``_ssh_exec``.

    Returns:
        ``(returncode, stdout, stderr)`` as returned by ``_ssh_exec``.
    """
    quoted_repo = shlex.quote(repo_path)
    quoted_args = shlex.join(args)
    remote_command = f"cd {quoted_repo} && git " + quoted_args
    return _ssh_exec(remote_command, cwd=repo_path, timeout=timeout, check=check)
|
||
|
||
|
||
def _fix_result(
    success: bool,
    message: str,
    commit_sha: Optional[str] = None,
    reverted: bool = False,
) -> Dict[str, Any]:
    """Build the result dictionary returned by ``execute_code_fix``."""
    return {
        "success": success,
        "action": "CODE_FIX",
        "message": message,
        "commit_sha": commit_sha,
        "reverted": reverted,
    }


def execute_code_fix(
    error_type: str,
    error_message: str,
    target_file: str,
    context: Optional[dict] = None,
) -> Dict[str, Any]:
    """Run Aider on the remote host to fix ``target_file``, then push and verify.

    Pipeline:
      0. preflight: the remote repo path must contain a ``.git`` directory;
      1. whitelist check on ``target_file`` (guardrail L1);
      2. hourly rate limit (guardrail L3);
      3. sync the remote working copy to ``<remote>/main``;
      4. run Aider with an instruction built from the error;
      5. reject empty diffs and diffs larger than ``MAX_DIFF_LINES`` (L2);
      6. commit and push to ``main``;
      7. poll the health endpoint; on failure revert the commit and push (L4).
    Telegram notifications are sent along the way (L5).

    Args:
        error_type: Short error category; used in the Aider instruction,
            the commit subject and notifications.
        error_message: Error text; truncated to 500 characters and stripped
            of ``"``, backticks and ``$`` before use.
        target_file: Repo-relative path of the file to fix.
        context: Optional extra data; currently not used by this function.

    Returns:
        A dict with keys ``success`` (bool), ``action`` (always
        ``"CODE_FIX"``), ``message`` (str), ``commit_sha`` (str or None) and
        ``reverted`` (bool — True only when the rollback actually succeeded).
    """
    ts = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
    repo_q = shlex.quote(REPO_PATH_110)

    # L0: preflight — make sure the repo path exists on the remote host and
    # is a git repository, otherwise later steps could silently do nothing.
    rc_pre, _, _ = _ssh_exec(f"test -d {repo_q}/.git", timeout=10)
    if rc_pre != 0:
        msg = (
            f"[AiderHeal] preflight 失敗:110 主機上 {REPO_PATH_110} 不存在或不是 git repo。"
            f"請檢查 AIDER_REPO_PATH env / 在 110 上 git clone repo(見 ADR-020 SOP)"
        )
        logger.error("event=preflight_failed path=%s", REPO_PATH_110)
        _notify_telegram(
            f"🚨 <b>AiderHeal preflight 失敗</b>\n"
            f"├ 路徑:<code>{REPO_PATH_110}</code>\n"
            f"├ 主機:<code>{HEAL_SSH_HOST}</code>\n"
            f"└ 動作:請依 ADR-020 SOP 在 110 上 clone repo 並設好 push 權限"
        )
        return _fix_result(False, msg)

    # L1: file whitelist. fullmatch() is used because `$` in the pattern
    # would otherwise also accept a path that ends with a newline.
    if not ALLOWED_FILE_PATTERN.fullmatch(target_file):
        reason = f"[AiderHeal] 檔案不在白名單:{target_file}"
        logger.warning("event=heal_reject reason=%s file=%s", reason, target_file)
        return _fix_result(False, reason)

    # L3: rate limit.
    if not _enforce_rate_limit():
        reason = f"[AiderHeal] 每小時上限 {MAX_HOURLY_FIX} 次,跳過"
        logger.warning("event=rate_limit file=%s", target_file)
        return _fix_result(False, reason)

    _notify_telegram(
        f"🔧 <b>AiderHeal 啟動</b>\n"
        f"├ 錯誤類型:<code>{error_type}</code>\n"
        f"├ 目標檔案:<code>{target_file}</code>\n"
        f"└ 時間:{ts}"
    )
    logger.info("event=heal_start error_type=%s file=%s", error_type, target_file)

    # ── Step 1: prepare the repo on the remote host ──────────────────────────
    # `A && B && C && (D || true)`: only a failing `git stash` is tolerated;
    # a failure of cd / fetch / reset keeps its non-zero exit status.
    setup_cmds = (
        f"cd {repo_q} && "
        f"git fetch {GITEA_REMOTE} main 2>&1 && "
        f"git reset --hard {GITEA_REMOTE}/main 2>&1 && "
        f"(git stash 2>&1 || true)"
    )
    rc, out, err = _ssh_exec(setup_cmds, timeout=30)
    if rc != 0:
        msg = f"[AiderHeal] Git 準備失敗:{err or out}"
        logger.error("event=setup_failed error=%s", msg)
        _notify_telegram(f"❌ AiderHeal 失敗(Git 準備)\n<code>{msg}</code>")
        return _fix_result(False, msg)

    # ── Step 2: build and run the Aider command ──────────────────────────────
    safe_error = (
        (error_message or "")[:500].replace('"', "'").replace("`", "'").replace("$", "")
    )
    instruction = (
        f"Fix the following {error_type} in this file. "
        f"Only fix what is necessary, do not refactor or add features. "
        f"Error: {safe_error}"
    )

    # ADR-027 Phase 2 N2: re-evaluate the Ollama base on every run so a newly
    # triggered heal picks up the fallback host when the primary is down.
    ollama_api_base_runtime = _default_ollama_api_base()
    # Every interpolated value is shell-quoted: error_type / error_message
    # originate from runtime errors and must not be parsed by the remote shell.
    aider_cmd = (
        f"cd {repo_q} && "
        f"PATH=/home/wooo/.local/bin:$PATH "
        f"OLLAMA_API_BASE={shlex.quote(str(ollama_api_base_runtime))} "
        f"aider --model {shlex.quote(AIDER_MODEL)} "
        f"--yes-always --no-git "
        f"--message {shlex.quote(instruction)} "
        f"{shlex.quote(target_file)} 2>&1"
    )
    logger.info("event=aider_ollama_api_base host=%s", ollama_api_base_runtime)
    logger.info("event=aider_exec file=%s", target_file)
    # Aider's exit status is not used; the diff below decides the outcome.
    _, aider_out, aider_err = _ssh_exec(aider_cmd, timeout=180)
    logger.debug("event=aider_output snippet=%s", (aider_out or aider_err)[:300])

    # ── Step 3: diff evaluation (guardrail L2) ───────────────────────────────
    # `git diff --numstat` yields added/deleted counts; awk sums them.
    numstat_cmd = (
        f"cd {repo_q} && "
        f"git diff --numstat HEAD 2>&1 | awk '{{added+=$1; deleted+=$2}} END{{print added+deleted}}'"
    )
    rc2, diff_lines_str, _ = _ssh_exec(numstat_cmd, timeout=10)
    diff_text = diff_lines_str.strip()
    diff_lines = int(diff_text) if rc2 == 0 and diff_text.isdigit() else 0

    if diff_lines == 0:
        msg = "[AiderHeal] Aider 未產生任何修改(diff=0),可能已自動解決或模型失效"
        logger.warning("event=no_diff file=%s", target_file)
        _notify_telegram(f"⚠️ AiderHeal:無修改產生\n<code>{target_file}</code>")
        return _fix_result(False, msg)

    if diff_lines > MAX_DIFF_LINES:
        # Change is too large: discard the working-tree edits and alert.
        _ssh_exec(f"cd {repo_q} && git checkout -- . 2>&1", timeout=10)
        msg = (
            f"[AiderHeal] diff 超出限制 {diff_lines} > {MAX_DIFF_LINES} 行,"
            f"已丟棄,需人工介入"
        )
        logger.warning("event=diff_too_large file=%s diff_lines=%d", target_file, diff_lines)
        _notify_telegram(
            f"⚠️ <b>AiderHeal:diff 過大,需人工審核</b>\n"
            f"├ 檔案:<code>{target_file}</code>\n"
            f"├ diff:{diff_lines} 行(上限 {MAX_DIFF_LINES})\n"
            f"└ 錯誤:<code>{error_type}</code>"
        )
        return _fix_result(False, msg)

    # ── Step 4: commit and push ──────────────────────────────────────────────
    fix_msg = (
        f"fix(autoheal): [{error_type}] auto-fix {target_file}\n\n"
        f"Triggered by AiderHealExecutor (ADR-014)\n"
        f"Error: {safe_error[:200]}"
    )
    commit_cmd = (
        f"cd {repo_q} && "
        f"git add {shlex.quote(target_file)} && "
        f"git commit -m {shlex.quote(fix_msg)} 2>&1 && "
        f"git push {GITEA_REMOTE} main 2>&1"
    )
    rc3, commit_out, commit_err = _ssh_exec(commit_cmd, timeout=30)
    if rc3 != 0:
        msg = f"[AiderHeal] git push 失敗:{commit_err or commit_out}"
        logger.error("event=push_failed error=%s", msg)
        _notify_telegram(f"❌ AiderHeal git push 失敗\n<code>{msg}</code>")
        return _fix_result(False, msg)

    # Read the SHA of HEAD only after the push succeeded.
    _, commit_sha, _ = _git_cmd(REPO_PATH_110, ["log", "-1", "--format=%H"], timeout=10)
    commit_sha = commit_sha.strip() or "unknown"

    logger.info("event=push_ok commit=%s", commit_sha)
    _notify_telegram(
        f"🚀 <b>AiderHeal push 完成</b>\n"
        f"├ commit:<code>{commit_sha}</code>\n"
        f"├ 檔案:<code>{target_file}</code>\n"
        f"└ 等待健康檢查…"
    )

    # ── Step 5: health check (guardrail L4) ──────────────────────────────────
    time.sleep(10)  # give the deployment a short start-up buffer
    healthy = _wait_for_health(HEALTH_CHECK_URL, timeout_seconds=120, interval_seconds=10)

    if healthy:
        msg = f"[AiderHeal] 修復成功並部署完成:{target_file} ({commit_sha})"
        logger.info("event=heal_success commit=%s file=%s", commit_sha, target_file)
        _notify_telegram(
            f"✅ <b>AiderHeal 修復完成</b>\n"
            f"├ 錯誤:<code>{error_type}</code>\n"
            f"├ 檔案:<code>{target_file}</code>\n"
            f"├ commit:<code>{commit_sha}</code>\n"
            f"└ diff:{diff_lines} 行"
        )
        return _fix_result(True, msg, commit_sha=commit_sha)

    # ── Step 6: health check failed → automatic revert (guardrail L4) ────────
    logger.error("event=health_check_failed commit=%s", commit_sha)
    rc_revert, revert_out, revert_err = _ssh_exec(
        f"cd {repo_q} && "
        f"git revert --no-edit {shlex.quote(commit_sha)} 2>&1 && "
        f"git push {GITEA_REMOTE} main 2>&1",
        timeout=30,
    )

    # Judge the rollback by exit status. Searching the output for "error" is
    # unreliable: git echoes the reverted commit subject, which contains
    # error_type (e.g. "TypeError"), while real failures may only say "fatal:".
    if rc_revert == 0:
        _, revert_sha, _ = _git_cmd(REPO_PATH_110, ["log", "-1", "--format=%H"], timeout=10)
        revert_sha = revert_sha.strip() or "unknown"
        msg = (
            f"[AiderHeal] 健康檢查失敗,已自動回滾:"
            f"{commit_sha} → {revert_sha}"
        )
        logger.warning("event=reverted commit=%s to=%s", commit_sha, revert_sha)
        _notify_telegram(
            f"🔄 <b>AiderHeal 自動回滾</b>\n"
            f"├ 原 commit:<code>{commit_sha}</code>\n"
            f"├ 回滾 commit:<code>{revert_sha}</code>\n"
            f"└ 需人工排查:<code>{error_type}</code> in <code>{target_file}</code>"
        )
        return _fix_result(False, msg, commit_sha=commit_sha, reverted=True)

    # stderr is merged into stdout by `2>&1`, so fall back to stdout for detail.
    revert_detail = revert_err or revert_out
    msg = f"[AiderHeal] 回滾失敗!需立即人工介入:{revert_detail}"
    logger.critical("event=revert_failed commit=%s error=%s", commit_sha, revert_detail)
    _notify_telegram(
        f"🚨 <b>AiderHeal 回滾失敗!請立即人工介入</b>\n<code>{msg}</code>"
    )
    # The bad commit is still deployed: report reverted=False.
    return _fix_result(False, msg, commit_sha=commit_sha, reverted=False)
|