""" services/aider_heal_executor.py ADR-020: Autonomous Code Heal Pipeline(Code Review 全自動修復端到端執行器) 透過 SSH 在 110 主機執行 Aider,自動修復 momo-pro repo 的程式碼問題, 修復後直接 git push,觸發 Gitea CD Pipeline 部署。 分支策略:直推 main,依賴 CD pipeline 健康檢查與 git revert 作回滾安全網。 (不採 PR 流程,呼應 ADR-020「全自動修復、無人工審查門檻」精神) 安全護欄: L1 - 檔案白名單(只改 services/ routes/ database/ 內 .py) L2 - diff 限制(>50 行 → 拒絕,不 push) L3 - 每小時最多 5 次 CODE_FIX L4 - health check 失敗 → 自動 git revert + push L5 - Telegram 通知每次修復結果(成功/失敗/回滾) """ import json import os import re import time import threading import shlex import html import requests from datetime import datetime from typing import Optional, Dict, Any, List from services.logger_manager import SystemLogger from utils.ssh_helper import run_ssh_command logger = SystemLogger("AiderHealExecutor").get_logger() # ── 配置 ────────────────────────────────────────────────────────────────────── HEAL_SSH_HOST: str = os.getenv("HEAL_SSH_HOST", "192.168.0.110") HEAL_SSH_USER: str = os.getenv("HEAL_SSH_USER", "wooo") HEAL_SSH_KEY_DEFAULT = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "config", "autoheal_id_ed25519")) HEAL_SSH_KEY = os.getenv("DEPLOY_SSH_KEY_PATH", HEAL_SSH_KEY_DEFAULT) HEAL_SSH_PORT: int = int(os.getenv("HEAL_SSH_PORT", "22")) REPO_PATH_110: str = os.getenv("AIDER_REPO_PATH", "/home/wooo/ewoooc") GITEA_REMOTE: str = "origin" HEALTH_CHECK_URL: str = ( os.getenv("MOMO_BASE_URL", "https://mo.wooo.work").rstrip("/") + "/health" ) # ADR-027 Phase 2 N2:OLLAMA_API_BASE 改 lazy resolve(GCP 優先 / 111 備援)。 # 注意:本變數透過 SSH 傳入 110 上的 Aider CLI 執行環境(line 312 OLLAMA_API_BASE=...), # 所以每次 execute_code_fix 啟動時才需要值;此處僅作為「無 env 時的預設」。 # 顯式 env 設定者(運維/.env)只接受 GCP-A/GCP-B/111,避免自動修復流量繞過 ADR-028。 def _default_ollama_api_base() -> str: """Lazy 取得 Aider CLI 用的 Ollama API base,避免 import-time 寫死 111。""" try: from services.ollama_service import approved_ollama_env, resolve_ollama_host env_val = approved_ollama_env("OLLAMA_API_BASE") if env_val: return env_val return resolve_ollama_host() except Exception: # 兜底:保留原行為 — 內網 111 return "http://192.168.0.111:11434" # 注意:保留 module-level 屬性供向下相容(測試 / monkey-patch), # 但 execute_code_fix 內部會於每次執行時 re-evaluate 以避免 cache 失效。 OLLAMA_API_BASE: str = _default_ollama_api_base() AIDER_MODEL: str = os.getenv("AIDER_MODEL", "ollama/qwen2.5-coder:7b") MAX_DIFF_LINES: int = int(os.getenv("AIDER_MAX_DIFF_LINES", "50")) MAX_HOURLY_FIX: int = int(os.getenv("AIDER_MAX_HOURLY_FIX", "5")) TELEGRAM_BOT_TOKEN: str = os.getenv("TELEGRAM_BOT_TOKEN", "") _chat_ids_raw = os.getenv("TELEGRAM_CHAT_IDS", "[]") try: _chat_ids_list = json.loads(_chat_ids_raw) TELEGRAM_CHAT_ID: str = str(_chat_ids_list[0]) if _chat_ids_list else os.getenv("TELEGRAM_CHAT_ID", "") except Exception: TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "") # 允許 Aider 修改的路徑(正規表示式) # ADR-020 白名單允許 services/routes/database 底下的 Python 模組,含子目錄; # tests/docs/config 等檔案仍需人工處理,避免 Aider 以「修測試」掩蓋產品問題。 ALLOWED_FILE_PATTERN = re.compile( r"^(services|routes|database)/(?:[a-zA-Z0-9_]+/)*[a-zA-Z0-9_]+\.py$" ) # ── 速率控制(執行緒安全) ──────────────────────────────────────────────────── _lock: threading.Lock = threading.Lock() _fix_history: List[float] = [] _last_host_reset: float = time.monotonic() def _enforce_rate_limit() -> bool: """ 每小時最多 MAX_HOURLY_FIX 次修復。 使用單調時鐘避免系統時間跳變影響。 """ global _last_host_reset, _fix_history now = time.monotonic() with _lock: # 每小時重置一次計數(基於單調時鐘的近似小時窗口) if now - _last_host_reset > 3600.0: _fix_history.clear() _last_host_reset = now if len(_fix_history) >= MAX_HOURLY_FIX: return False _fix_history.append(now) return True def _ssh_exec( cmd: str, cwd: Optional[str] = None, timeout: int = 60, check: bool = True, ) -> tuple[int, str, str]: """ 在遠端主機執行命令(透過 SSH)。 返回 (returncode, stdout, stderr) 使用 list + shell=False 避免 shell injection, cmd_str 作為 SSH 的最後一個參數,由遠端 shell 負責解析。 """ result = run_ssh_command( host=HEAL_SSH_HOST, user=HEAL_SSH_USER, command=cmd, port=HEAL_SSH_PORT, key_path=HEAL_SSH_KEY, connect_timeout=10, command_timeout=timeout, cwd=cwd, logger=logger, ) return result.returncode, result.stdout, result.stderr def _http_get_json(url: str, timeout: int = 10) -> Optional[Dict[str, Any]]: try: resp = requests.get(url, timeout=timeout) if resp.status_code == 200: return resp.json() logger.debug("[AiderHeal] health probe returned status=%s url=%s", resp.status_code, url) except Exception: logger.debug("[AiderHeal] health probe request failed url=%s", url, exc_info=True) return None def _wait_for_health( url: str, timeout_seconds: int = 120, interval_seconds: int = 10, ) -> bool: """ 持續輪詢健康檢查,直到成功或超時。 """ deadline = time.monotonic() + timeout_seconds while time.monotonic() < deadline: data = _http_get_json(url) if data and str(data.get("status", "")).lower() in {"ok", "healthy"}: return True time.sleep(interval_seconds) return False def _notify_telegram(message_html: str) -> None: """ 非阻塞通知,失敗靜默忽略。 ADR-019 Phase 5: 改走 EventRouter 統一入口(event_type=aider_heal_event, severity=warning,會走 L0/L1 由 EventRouter 內部分流)。失敗仍靜默 pass, caller 行為不變。 """ if not TELEGRAM_BOT_TOKEN or not TELEGRAM_CHAT_ID: return try: from services.event_router import dispatch_sync dispatch_sync(event={ "event_type": "aider_heal_event", "severity": "warning", "source": "AiderHealExecutor", "title": "Aider 自動修復通知", "summary": message_html[:400], "status": "heal_notification", "payload": {"raw_message_html": message_html}, }, admin_chat_ids=[TELEGRAM_CHAT_ID]) except Exception: logger.warning("[AiderHeal] EventRouter notification dispatch failed", exc_info=True) def _git_cmd( repo_path: str, args: List[str], timeout: int = 30, check: bool = True, ) -> tuple[int, str, str]: """在 repo_path 下執行 git 命令。""" return _ssh_exec( f"cd {shlex.quote(repo_path)} && git " + " ".join(shlex.quote(a) for a in args), cwd=repo_path, timeout=timeout, check=check, ) def execute_code_fix( error_type: str, error_message: str, target_file: str, context: Optional[dict] = None, ) -> Dict[str, Any]: """ 主要入口:針對指定檔案執行 Aider 自動修復並推版。 返回結構: { 'success': bool, 'action': 'CODE_FIX', 'message': str, 'commit_sha': str | None, 'reverted': bool, } """ ts = datetime.now().strftime("%Y/%m/%d %H:%M:%S") ctx: Dict[str, Any] = context or {} # L1:檔案白名單。必須先於 110 preflight 執行,否則 tests/docs 等 # 本來就不能自動修的 finding 會被誤報成「110 repo 不存在」。 if not ALLOWED_FILE_PATTERN.match(target_file): reason = f"[AiderHeal] 檔案不在 ADR-020 自動修復白名單:{target_file}" logger.warning("event=heal_reject reason=path_not_allowed file=%s", target_file) _notify_telegram( f"⚠️ AiderHeal 已略過自動修復\n" f"├ 檔案:{html.escape(target_file[:200])}\n" f"├ 原因:不在 ADR-020 自動修復白名單(僅允許 services/routes/database 內 Python 檔案)\n" f"└ 動作:請人工確認 finding,或調整白名單後重跑 Code Review" ) return { "success": False, "action": "CODE_FIX", "message": reason, "commit_sha": None, "reverted": False, } # L0:preflight — 確認 110 上的 repo 路徑真的存在且是 git repo # 沒有這個檢查時,後續 cd $REPO_PATH 失敗會被 shell `|| true` 吞掉, # 導致整條 pipeline 走完卻 0 次 push,靜默 100% no-op(2026-05-03 實測) preflight_cmd = ( f"test -d {shlex.quote(REPO_PATH_110)} && " f"test -d {shlex.quote(REPO_PATH_110)}/.git && " f"cd {shlex.quote(REPO_PATH_110)} && " f"git rev-parse --is-inside-work-tree 2>&1" ) rc_pre, out_pre, err_pre = _ssh_exec(preflight_cmd, timeout=10) if rc_pre != 0: preflight_detail = (err_pre or out_pre or "").strip() if not preflight_detail: preflight_detail = "SSH 逾時、repo 路徑不存在,或目標不是 git repo" msg = ( f"[AiderHeal] preflight 失敗:110 主機上 {REPO_PATH_110} 不存在或不是 git repo。" f"請檢查 AIDER_REPO_PATH env / 在 110 上 git clone repo(見 ADR-020 SOP)。" f"detail={preflight_detail[:300]}" ) logger.error( "event=preflight_failed path=%s rc=%s stderr=%s stdout=%s", REPO_PATH_110, rc_pre, err_pre, out_pre, ) _notify_telegram( f"🚨 AiderHeal preflight 失敗\n" f"├ 路徑:{REPO_PATH_110}\n" f"├ 主機:{HEAL_SSH_HOST}\n" f"├ 細節:{html.escape(preflight_detail[:240])}\n" f"└ 動作:請依 ADR-020 SOP 在 110 上 clone repo 並設好 push 權限" ) return { "success": False, "action": "CODE_FIX", "message": msg, "commit_sha": None, "reverted": False, } # L3:速率限制 if not _enforce_rate_limit(): reason = f"[AiderHeal] 每小時上限 {MAX_HOURLY_FIX} 次,跳過" logger.warning("event=rate_limit file=%s", target_file) return { "success": False, "action": "CODE_FIX", "message": reason, "commit_sha": None, "reverted": False, } _notify_telegram( f"🔧 AiderHeal 啟動\n" f"├ 錯誤類型:{error_type}\n" f"├ 目標檔案:{target_file}\n" f"└ 時間:{ts}" ) logger.info("event=heal_start error_type=%s file=%s", error_type, target_file) # ── Step 1:準備 repo(在 110 上) ──────────────────────────────────────── # 注意:`A && B && C && (D || true)` 才能讓 stash 失敗時被吞、其他步驟失敗時保留 rc。 # 早期版本寫 `A && B && C && D || true`,shell 結合性等同 # `(A && B && C && D) || true`,cd 失敗整條 chain 被吞 rc=0,line 261 永不觸發。 setup_cmds = ( f"cd {shlex.quote(REPO_PATH_110)} && " f"git fetch {GITEA_REMOTE} main 2>&1 && " f"git reset --hard {GITEA_REMOTE}/main 2>&1 && " f"(git stash 2>&1 || true)" ) rc, out, err = _ssh_exec(setup_cmds, timeout=30) if rc != 0: msg = f"[AiderHeal] Git 準備失敗:{err or out}" logger.error("event=setup_failed error=%s", msg) _notify_telegram(f"❌ AiderHeal 失敗(Git 準備)\n{msg}") return { "success": False, "action": "CODE_FIX", "message": msg, "commit_sha": None, "reverted": False, } # ── Step 2:建構 Aider 指令 ─────────────────────────────────────────────── safe_error = error_message[:500].replace('"', "'").replace("`", "'").replace("$", "") instruction = ( f"Fix the following {error_type} in this file. " f"Only fix what is necessary, do not refactor or add features. " f"Error: {safe_error}" ) # ADR-027 Phase 2 N2:每次執行都 re-evaluate OLLAMA_API_BASE, # 確保 GCP 主機掛掉時新觸發的 heal 能拿到 fallback 值。 ollama_api_base_runtime = _default_ollama_api_base() aider_cmd = ( f"cd {shlex.quote(REPO_PATH_110)} && " f"PATH=/home/wooo/.local/bin:$PATH OLLAMA_API_BASE={ollama_api_base_runtime} " f"aider --model {AIDER_MODEL} " f"--yes-always --no-git " f'--message "{instruction}" ' f"{shlex.quote(target_file)} 2>&1" ) logger.info("event=aider_ollama_api_base host=%s", ollama_api_base_runtime) logger.info("event=aider_exec file=%s", target_file) rc, aider_out, aider_err = _ssh_exec(aider_cmd, timeout=180) logger.debug("event=aider_output snippet=%s", (aider_out or aider_err)[:300]) # ── Step 3:diff 評估(L2 護欄) ───────────────────────────────────────── # 使用 git diff --numstat 獲取有意義的變更行數(新增+刪除) numstat_cmd = ( f"cd {shlex.quote(REPO_PATH_110)} && " f"git diff --numstat HEAD 2>&1 | awk '{{added+=$1; deleted+=$2}} END{{print added+deleted}}'" ) rc2, diff_lines_str, _ = _ssh_exec(numstat_cmd, timeout=10) diff_lines = int(diff_lines_str.strip()) if rc2 == 0 and diff_lines_str.strip().isdigit() else 0 if diff_lines == 0: msg = "[AiderHeal] Aider 未產生任何修改(diff=0),可能已自動解決或模型失效" logger.warning("event=no_diff file=%s", target_file) _notify_telegram(f"⚠️ AiderHeal:無修改產生\n{target_file}") return { "success": False, "action": "CODE_FIX", "message": msg, "commit_sha": None, "reverted": False, } if diff_lines > MAX_DIFF_LINES: # 改動太大,丟棄並告警 _, _, _ = _ssh_exec( f"cd {shlex.quote(REPO_PATH_110)} && git checkout -- . 2>&1", timeout=10 ) msg = ( f"[AiderHeal] diff 超出限制 {diff_lines} > {MAX_DIFF_LINES} 行," f"已丟棄,需人工介入" ) logger.warning("event=diff_too_large file=%s diff_lines=%d", target_file, diff_lines) _notify_telegram( f"⚠️ AiderHeal:diff 過大,需人工審核\n" f"├ 檔案:{target_file}\n" f"├ diff:{diff_lines} 行(上限 {MAX_DIFF_LINES})\n" f"└ 錯誤:{error_type}" ) return { "success": False, "action": "CODE_FIX", "message": msg, "commit_sha": None, "reverted": False, } # ── Step 4:提交並推送 ─────────────────────────────────────────────────── fix_msg = ( f"fix(autoheal): [{error_type}] auto-fix {target_file}\n\n" f"Triggered by AiderHealExecutor (ADR-014)\n" f"Error: {safe_error[:200]}" ) commit_cmd = ( f"cd {shlex.quote(REPO_PATH_110)} && " f'git add {shlex.quote(target_file)} && ' f'git commit -m {shlex.quote(fix_msg)} 2>&1 && ' f"git push {GITEA_REMOTE} main 2>&1" ) rc3, commit_out, commit_err = _ssh_exec(commit_cmd, timeout=30) # 獲取最新的 commit SHA(從 push 後的 HEAD 獲取,更可靠) _, commit_sha, _ = _git_cmd(REPO_PATH_110, ["log", "-1", "--format=%H"], timeout=10) commit_sha = commit_sha.strip() or "unknown" if rc3 != 0: msg = f"[AiderHeal] git push 失敗:{commit_err or commit_out}" logger.error("event=push_failed error=%s", msg) _notify_telegram(f"❌ AiderHeal git push 失敗\n{msg}") return { "success": False, "action": "CODE_FIX", "message": msg, "commit_sha": None, "reverted": False, } logger.info("event=push_ok commit=%s", commit_sha) _notify_telegram( f"🚀 AiderHeal push 完成\n" f"├ commit:{commit_sha}\n" f"├ 檔案:{target_file}\n" f"└ 等待健康檢查…" ) # ── Step 5:健康檢查(L4 護欄) ────────────────────────────────────────── time.sleep(10) # 給部署一點啟動緩衝 healthy = _wait_for_health(HEALTH_CHECK_URL, timeout_seconds=120, interval_seconds=10) if healthy: msg = f"[AiderHeal] 修復成功並部署完成:{target_file} ({commit_sha})" logger.info("event=heal_success commit=%s file=%s", commit_sha, target_file) _notify_telegram( f"✅ AiderHeal 修復完成\n" f"├ 錯誤:{error_type}\n" f"├ 檔案:{target_file}\n" f"├ commit:{commit_sha}\n" f"└ diff:{diff_lines} 行" ) return { "success": True, "action": "CODE_FIX", "message": msg, "commit_sha": commit_sha, "reverted": False, } # ── Step 6:健康檢查失敗 → 自動 revert(L4 護欄) ───────────────────────── logger.error("event=health_check_failed commit=%s", commit_sha) _, revert_out, revert_err = _ssh_exec( f"cd {shlex.quote(REPO_PATH_110)} && " f"git revert --no-edit {shlex.quote(commit_sha)} 2>&1 && " f"git push {GITEA_REMOTE} main 2>&1", timeout=30, ) _, revert_sha, _ = _git_cmd(REPO_PATH_110, ["log", "-1", "--format=%H"], timeout=10) revert_sha = revert_sha.strip() or "unknown" if "error" not in revert_out.lower() and "error" not in revert_err.lower(): msg = ( f"[AiderHeal] 健康檢查失敗,已自動回滾:" f"{commit_sha} → {revert_sha}" ) logger.warning("event=reverted commit=%s to=%s", commit_sha, revert_sha) _notify_telegram( f"🔄 AiderHeal 自動回滾\n" f"├ 原 commit:{commit_sha}\n" f"├ 回滾 commit:{revert_sha}\n" f"└ 需人工排查:{error_type} in {target_file}" ) else: msg = f"[AiderHeal] 回滾失敗!需立即人工介入:{revert_err}" logger.critical("event=revert_failed commit=%s error=%s", commit_sha, revert_err) _notify_telegram( f"🚨 AiderHeal 回滾失敗!請立即人工介入\n{msg}" ) return { "success": False, "action": "CODE_FIX", "message": msg, "commit_sha": commit_sha, "reverted": True, }