Files
ewoooc/services/aider_heal_executor.py
OoO 078bf2683c fix(adr-027): Phase 2 — ADR-027 4 破洞修補 + 移除寫死 111
config.py — B1+B2 lazy resolve
- get_ollama_host() 取代 import-time freeze 的 OLLAMA_HOST
- get_embedding_host() 取代 EMBEDDING_HOST
- 主機切換時不需重啟 Python 進程

services/ollama_service.py — B3+B4 三主機級聯
- resolve_ollama_host(primary, secondary, fallback) 三主機級聯
  - Primary:   34.143.170.20 (SSD) — GCP 主主機
  - Secondary: 34.21.145.224 (SSD) — 同等效能備援
  - Fallback:  192.168.0.111 (HDD) — 最後一道防線
- _is_reachable: HTTP /api/version probe 取代 TCP socket(防 process 卡死假活)
- mark_unhealthy(host) 即時失效 cache,30s 內跳過該主機
- 14 unit tests 全綠

services/aider_heal_executor.py — N2
- 移除寫死 192.168.0.111,改用 get_ollama_host()
- AiderHeal 終於遵循 ADR-027 GCP 優先策略

Operation Ollama-First v5.0 / Phase 2 A6

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 23:05:11 +08:00

487 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
services/aider_heal_executor.py
ADR-020: Autonomous Code Heal Pipeline(Code Review 全自動修復端到端執行器)
透過 SSH 在 110 主機執行 Aider,自動修復 momo-pro repo 的程式碼問題,
修復後直接 git push,觸發 Gitea CD Pipeline 部署。
分支策略:直推 main,依賴 CD pipeline 健康檢查與 git revert 作回滾安全網。
(不採 PR 流程,呼應 ADR-020「全自動修復、無人工審查門檻」精神)
安全護欄:
L1 - 檔案白名單(只改 services/ routes/ database/ 內 .py)
L2 - diff 限制(>50 行 → 拒絕,不 push)
L3 - 每小時最多 5 次 CODE_FIX
L4 - health check 失敗 → 自動 git revert + push
L5 - Telegram 通知每次修復結果(成功/失敗/回滾)
"""
import json
import os
import re
import time
import subprocess
import threading
import shlex
import requests
from datetime import datetime, timedelta
from typing import Optional, Dict, Any, List
from pathlib import Path
from services.logger_manager import SystemLogger
logger = SystemLogger("AiderHealExecutor").get_logger()
# ── Configuration ─────────────────────────────────────────────────────────────
# SSH endpoint used by _ssh_exec; each value can be overridden through the
# environment variable of the same name.
HEAL_SSH_HOST: str = os.getenv("HEAL_SSH_HOST", "192.168.0.110")
HEAL_SSH_USER: str = os.getenv("HEAL_SSH_USER", "wooo")
# Default private key: ../config/autoheal_id_ed25519 relative to this module.
HEAL_SSH_KEY_DEFAULT = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "config", "autoheal_id_ed25519"))
# DEPLOY_SSH_KEY_PATH, when set, takes precedence over the default key path.
HEAL_SSH_KEY = os.getenv("DEPLOY_SSH_KEY_PATH", HEAL_SSH_KEY_DEFAULT)
HEAL_SSH_PORT: int = int(os.getenv("HEAL_SSH_PORT", "22"))
# Path of the repository checkout on the SSH host (used in remote `cd` commands).
REPO_PATH_110: str = os.getenv("AIDER_REPO_PATH", "/home/wooo/ewoooc")
# Git remote name that fixes are fetched from and pushed to.
GITEA_REMOTE: str = "origin"
# Health endpoint polled after a push: MOMO_BASE_URL with any trailing "/"
# removed, followed by "/health".
HEALTH_CHECK_URL: str = (
    os.getenv("MOMO_BASE_URL", "https://mo.wooo.work").rstrip("/") + "/health"
)
# ADR-027 Phase 2 N2:OLLAMA_API_BASE 改 lazy resolve(GCP 優先 / 111 備援)。
# 注意:本變數透過 SSH 傳入 110 上的 Aider CLI 執行環境(見 execute_code_fix 的 aider_cmd:OLLAMA_API_BASE=...),
# 所以每次 execute_code_fix 啟動時才需要值;此處僅作為「無 env 時的預設」。
# 顯式 env 設定者(運維/.env)優先使用,符合向下相容。
def _default_ollama_api_base() -> str:
    """Return the Ollama API base URL to hand to the Aider CLI.

    Resolution order:
      1. A non-empty ``OLLAMA_API_BASE`` environment variable.
      2. ``services.ollama_service.resolve_ollama_host()``, imported lazily so
         that nothing is resolved at module import time.
      3. The hard-coded LAN address, used when step 2 raises for any reason
         (including the import itself failing).
    """
    explicit = os.environ.get("OLLAMA_API_BASE")
    if explicit:
        return explicit
    try:
        from services.ollama_service import resolve_ollama_host as resolve
        resolved = resolve()
    except Exception:
        # Last resort: keep the historical behaviour of pointing at the LAN host.
        resolved = "http://192.168.0.111:11434"
    return resolved
# NOTE: the module-level attribute is kept for backward compatibility
# (tests / monkey-patching). execute_code_fix calls _default_ollama_api_base()
# again on every run instead of reusing this import-time value.
OLLAMA_API_BASE: str = _default_ollama_api_base()
# Model identifier passed to `aider --model`.
AIDER_MODEL: str = os.getenv("AIDER_MODEL", "ollama/qwen2.5-coder:7b")
# Guard-rail thresholds: largest accepted diff (added + deleted lines) and the
# number of fixes admitted per rate-limit window.
MAX_DIFF_LINES: int = int(os.getenv("AIDER_MAX_DIFF_LINES", "50"))
MAX_HOURLY_FIX: int = int(os.getenv("AIDER_MAX_HOURLY_FIX", "5"))
# Telegram credentials.
# TELEGRAM_CHAT_IDS is expected to hold a JSON array; its first entry is used.
# Anything else (invalid JSON, an empty array, or a JSON value that is not an
# array such as a quoted string) falls back to the single-valued
# TELEGRAM_CHAT_ID variable, so a string is never indexed character-wise.
TELEGRAM_BOT_TOKEN: str = os.getenv("TELEGRAM_BOT_TOKEN", "")
_chat_ids_raw = os.getenv("TELEGRAM_CHAT_IDS", "[]")
try:
    _chat_ids_list = json.loads(_chat_ids_raw)
except (TypeError, ValueError):
    # json.JSONDecodeError is a ValueError subclass.
    _chat_ids_list = []
TELEGRAM_CHAT_ID: str = (
    str(_chat_ids_list[0])
    if isinstance(_chat_ids_list, list) and _chat_ids_list
    else os.getenv("TELEGRAM_CHAT_ID", "")
)
# Paths Aider is allowed to modify (regular expression): a single .py file
# directly under services/, routes/ or database/, whose base name consists
# only of ASCII letters, digits and underscores (no sub-directories).
ALLOWED_FILE_PATTERN = re.compile(
    r"^(services|routes|database)/[a-zA-Z0-9_]+\.py$"
)
# ── Rate limiting (thread-safe) ───────────────────────────────────────────────
# _lock guards _fix_history and _last_host_reset, both mutated by
# _enforce_rate_limit.
_lock: threading.Lock = threading.Lock()
# time.monotonic() stamps of the fixes admitted in the current window.
_fix_history: List[float] = []
# time.monotonic() value marking the start of the current window.
_last_host_reset: float = time.monotonic()
def _enforce_rate_limit() -> bool:
    """Admit or reject one fix attempt under the hourly quota.

    The quota is a fixed window: once more than 3600 seconds (monotonic
    clock, so wall-clock jumps have no effect) have passed since the window
    started, the recorded attempts are dropped and a new window begins.
    Within a window at most ``MAX_HOURLY_FIX`` attempts are admitted.

    Returns:
        True when the attempt was admitted (and recorded), False when the
        quota of the current window is already used up.
    """
    global _last_host_reset, _fix_history
    stamp = time.monotonic()
    with _lock:
        window_expired = stamp - _last_host_reset > 3600.0
        if window_expired:
            # Start a fresh window with an empty history.
            _last_host_reset = stamp
            _fix_history.clear()
        admitted = len(_fix_history) < MAX_HOURLY_FIX
        if admitted:
            _fix_history.append(stamp)
    return admitted
def _ssh_exec(
    cmd: str,
    cwd: Optional[str] = None,
    timeout: int = 60,
    check: bool = True,
) -> tuple[int, str, str]:
    """Run ``cmd`` on the heal host over SSH and capture its output.

    The local ``ssh`` client is started from an argument list with
    ``shell=False``, so nothing is interpreted by a local shell. ``cmd`` is
    passed as the final ssh argument and is therefore parsed by the remote
    shell; callers must quote any untrusted fragment themselves.

    Args:
        cmd: Command line for the remote shell.
        cwd: Working directory of the *local* ssh process (forwarded to
            ``subprocess.run``); it has no effect on the remote side.
        timeout: Seconds before the local ssh process is abandoned.
        check: Accepted for signature compatibility; not consulted here.

    Returns:
        ``(returncode, stdout, stderr)`` with both streams stripped. A
        timeout or any failure to run ssh yields ``(-1, "", <reason>)``
        instead of raising.
    """
    # NOTE(review): StrictHostKeyChecking=no turns off host-key verification.
    ssh_options = ["-o", "StrictHostKeyChecking=no", "-o", "ConnectTimeout=10"]
    destination = f"{HEAL_SSH_USER}@{HEAL_SSH_HOST}"
    argv = [
        "ssh",
        "-p", str(HEAL_SSH_PORT),
        "-i", HEAL_SSH_KEY,
        *ssh_options,
        destination,
        cmd,
    ]
    try:
        completed = subprocess.run(
            argv,
            shell=False,
            capture_output=True,
            text=True,
            cwd=cwd,
            timeout=timeout,
        )
        outcome = (
            completed.returncode,
            completed.stdout.strip(),
            completed.stderr.strip(),
        )
    except subprocess.TimeoutExpired:
        outcome = (-1, "", f"SSH timeout after {timeout}s")
    except Exception as exc:
        outcome = (-1, "", str(exc))
    return outcome
def _http_get_json(url: str, timeout: int = 10) -> Optional[Dict[str, Any]]:
    """GET ``url`` and return its decoded JSON body on an HTTP 200 response.

    Any other status code, and any exception raised while requesting or
    decoding, results in ``None``; this function never raises.
    """
    try:
        response = requests.get(url, timeout=timeout)
        payload = response.json() if response.status_code == 200 else None
    except Exception:
        payload = None
    return payload
def _wait_for_health(
    url: str,
    timeout_seconds: int = 120,
    interval_seconds: int = 10,
) -> bool:
    """Poll ``url`` until it reports healthy or the time budget is spent.

    A probe counts as healthy when the JSON body is an object whose
    ``"status"`` equals ``"ok"``. A body of any other JSON type (list,
    string, number) is treated as "not healthy yet" rather than raising,
    so that the caller always gets a boolean and can proceed to its
    rollback step.

    Args:
        url: Health endpoint to query.
        timeout_seconds: Total polling budget, measured on the monotonic clock.
        interval_seconds: Pause between probes; the pause is shortened so it
            never runs past the deadline.

    Returns:
        True as soon as one probe is healthy, False once the budget is spent.
    """
    deadline = time.monotonic() + timeout_seconds
    while time.monotonic() < deadline:
        data = _http_get_json(url)
        if isinstance(data, dict) and data.get("status") == "ok":
            return True
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep beyond the deadline.
        time.sleep(min(interval_seconds, remaining))
    return False
def _notify_telegram(message_html: str) -> None:
    """Send a heal notification through the EventRouter; never raises.

    ADR-019 Phase 5: notifications go through the unified EventRouter entry
    point (``event_type=aider_heal_event``, ``severity=warning``) instead of
    calling Telegram directly. Nothing is sent when either
    ``TELEGRAM_BOT_TOKEN`` or ``TELEGRAM_CHAT_ID`` is empty.

    Delivery is best-effort: a failure is logged as a warning and otherwise
    ignored, so callers behave exactly as if the notification had been sent.

    NOTE(review): the original description calls this non-blocking, but the
    call used is ``dispatch_sync`` — confirm whether it can block the caller.

    Args:
        message_html: HTML-formatted message; the first 400 characters are
            used as the event summary and the full text is carried in the
            payload.
    """
    if not TELEGRAM_BOT_TOKEN or not TELEGRAM_CHAT_ID:
        return
    try:
        # Imported lazily so an unavailable router only disables notifications.
        from services.event_router import dispatch_sync
        dispatch_sync(event={
            "event_type": "aider_heal_event",
            "severity": "warning",
            "source": "AiderHealExecutor",
            "title": "Aider 自動修復通知",
            "summary": message_html[:400],
            "status": "heal_notification",
            "payload": {"raw_message_html": message_html},
        }, admin_chat_ids=[TELEGRAM_CHAT_ID])
    except Exception as exc:
        # Keep the failure visible without affecting the heal pipeline.
        logger.warning("event=notify_failed error=%s", exc)
def _git_cmd(
    repo_path: str,
    args: List[str],
    timeout: int = 30,
    check: bool = True,
) -> tuple[int, str, str]:
    """Run ``git <args>`` inside ``repo_path`` on the SSH host.

    ``repo_path`` is a path on the remote machine, so the directory change
    happens in the remote command line (``cd <repo_path> && git ...``). It is
    deliberately NOT passed as the local working directory of the ssh
    process: that directory need not exist locally, and a missing local cwd
    makes the process fail to start before ssh is ever invoked.

    Args:
        repo_path: Repository directory on the remote host.
        args: git arguments; each one is shell-quoted individually.
        timeout: Seconds allowed for the SSH call.
        check: Forwarded unchanged to ``_ssh_exec``.

    Returns:
        The ``(returncode, stdout, stderr)`` tuple produced by ``_ssh_exec``.
    """
    quoted_args = " ".join(shlex.quote(a) for a in args)
    return _ssh_exec(
        f"cd {shlex.quote(repo_path)} && git {quoted_args}",
        timeout=timeout,
        check=check,
    )
def _heal_result(
    success: bool,
    message: str,
    commit_sha: Optional[str] = None,
    reverted: bool = False,
) -> Dict[str, Any]:
    """Build the result dictionary returned by execute_code_fix."""
    return {
        "success": success,
        "action": "CODE_FIX",
        "message": message,
        "commit_sha": commit_sha,
        "reverted": reverted,
    }


def execute_code_fix(
    error_type: str,
    error_message: str,
    target_file: str,
    context: Optional[dict] = None,
) -> Dict[str, Any]:
    """Run one Aider auto-fix on ``target_file`` and push it to ``main``.

    Pipeline (all git/Aider commands run on the SSH host):
      0. Preflight: the repository path must exist and contain ``.git``.
      1. Whitelist: ``target_file`` must match ``ALLOWED_FILE_PATTERN``.
      2. Rate limit: ``_enforce_rate_limit`` must admit the attempt.
      3. Reset the checkout to ``<remote>/main`` and run Aider on the file.
      4. Reject the change when it is empty or larger than ``MAX_DIFF_LINES``.
      5. Commit the target file and push.
      6. Poll the health endpoint; on failure revert the commit and push.

    Args:
        error_type: Short error category; used in the prompt, the commit
            subject and notifications.
        error_message: Error text; truncated to 500 characters for the prompt.
        target_file: Repository-relative path of the file to fix.
        context: Accepted for interface compatibility; currently unused.

    Returns:
        A dict with the keys ``success`` (bool), ``action`` (always
        ``"CODE_FIX"``), ``message`` (str), ``commit_sha`` (str or None) and
        ``reverted`` (True only when the rollback commit was actually
        created and pushed).
    """
    ts = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
    repo_q = shlex.quote(REPO_PATH_110)

    # L0: preflight — make sure the remote repo path exists and is a git repo.
    # Without it a failing `cd` in later steps could be swallowed and the
    # whole pipeline would silently do nothing.
    rc_pre, _, _ = _ssh_exec(f"test -d {repo_q}/.git", timeout=10)
    if rc_pre != 0:
        msg = (
            f"[AiderHeal] preflight 失敗110 主機上 {REPO_PATH_110} 不存在或不是 git repo。"
            f"請檢查 AIDER_REPO_PATH env / 在 110 上 git clone repo見 ADR-020 SOP"
        )
        logger.error("event=preflight_failed path=%s", REPO_PATH_110)
        _notify_telegram(
            f"🚨 <b>AiderHeal preflight 失敗</b>\n"
            f"├ 路徑:<code>{REPO_PATH_110}</code>\n"
            f"├ 主機:<code>{HEAL_SSH_HOST}</code>\n"
            f"└ 動作:請依 ADR-020 SOP 在 110 上 clone repo 並設好 push 權限"
        )
        return _heal_result(False, msg)

    # L1: file whitelist.
    if not ALLOWED_FILE_PATTERN.match(target_file):
        reason = f"[AiderHeal] 檔案不在白名單:{target_file}"
        logger.warning("event=heal_reject reason=%s file=%s", reason, target_file)
        return _heal_result(False, reason)

    # L3: rate limit.
    if not _enforce_rate_limit():
        reason = f"[AiderHeal] 每小時上限 {MAX_HOURLY_FIX} 次,跳過"
        logger.warning("event=rate_limit file=%s", target_file)
        return _heal_result(False, reason)

    _notify_telegram(
        f"🔧 <b>AiderHeal 啟動</b>\n"
        f"├ 錯誤類型:<code>{error_type}</code>\n"
        f"├ 目標檔案:<code>{target_file}</code>\n"
        f"└ 時間:{ts}"
    )
    logger.info("event=heal_start error_type=%s file=%s", error_type, target_file)

    # ── Step 1: prepare the remote checkout ──────────────────────────────────
    # `A && B && C && (D || true)`: only a failing stash is tolerated; a
    # failure of cd/fetch/reset keeps its non-zero exit status so the check
    # right below can abort the run.
    setup_cmds = (
        f"cd {repo_q} && "
        f"git fetch {GITEA_REMOTE} main 2>&1 && "
        f"git reset --hard {GITEA_REMOTE}/main 2>&1 && "
        f"(git stash 2>&1 || true)"
    )
    rc, out, err = _ssh_exec(setup_cmds, timeout=30)
    if rc != 0:
        msg = f"[AiderHeal] Git 準備失敗:{err or out}"
        logger.error("event=setup_failed error=%s", msg)
        _notify_telegram(f"❌ AiderHeal 失敗Git 準備)\n<code>{msg}</code>")
        return _heal_result(False, msg)

    # ── Step 2: build and run the Aider command ──────────────────────────────
    # The character stripping is kept so the prompt and commit text stay the
    # same as before; shell safety now comes from shlex.quote below.
    safe_error = (
        (error_message or "")[:500]
        .replace('"', "'")
        .replace("`", "'")
        .replace("$", "")
    )
    instruction = (
        f"Fix the following {error_type} in this file. "
        f"Only fix what is necessary, do not refactor or add features. "
        f"Error: {safe_error}"
    )
    # ADR-027 Phase 2 N2: resolve the Ollama endpoint on every run so a heal
    # started after the primary host went down picks up the fallback.
    ollama_api_base_runtime = _default_ollama_api_base()
    # Every interpolated value is shell-quoted: error_type and error_message
    # come from the failing application and must never reach the remote shell
    # as syntax ($(...) substitution, a trailing backslash, quotes, ...).
    aider_cmd = (
        f"cd {repo_q} && "
        f"PATH=/home/wooo/.local/bin:$PATH "
        f"OLLAMA_API_BASE={shlex.quote(str(ollama_api_base_runtime))} "
        f"aider --model {shlex.quote(AIDER_MODEL)} "
        f"--yes-always --no-git "
        f"--message {shlex.quote(instruction)} "
        f"{shlex.quote(target_file)} 2>&1"
    )
    logger.info("event=aider_ollama_api_base host=%s", ollama_api_base_runtime)
    logger.info("event=aider_exec file=%s", target_file)
    # Aider's exit status is not used: the diff check below decides whether
    # anything useful was produced.
    _, aider_out, aider_err = _ssh_exec(aider_cmd, timeout=180)
    logger.debug("event=aider_output snippet=%s", (aider_out or aider_err)[:300])

    # ── Step 3: evaluate the diff (L2 guard rail) ────────────────────────────
    # git diff --numstat yields "added<TAB>deleted<TAB>path" per file; awk
    # sums added + deleted over all files.
    numstat_cmd = (
        f"cd {repo_q} && "
        f"git diff --numstat HEAD 2>&1 | awk '{{added+=$1; deleted+=$2}} END{{print added+deleted}}'"
    )
    rc2, diff_lines_str, _ = _ssh_exec(numstat_cmd, timeout=10)
    diff_text = diff_lines_str.strip()
    diff_lines = int(diff_text) if rc2 == 0 and diff_text.isdigit() else 0
    if diff_lines == 0:
        msg = "[AiderHeal] Aider 未產生任何修改diff=0可能已自動解決或模型失效"
        logger.warning("event=no_diff file=%s", target_file)
        _notify_telegram(f"⚠️ AiderHeal無修改產生\n<code>{target_file}</code>")
        return _heal_result(False, msg)
    if diff_lines > MAX_DIFF_LINES:
        # Change is too large: discard it and ask for a human.
        _ssh_exec(f"cd {repo_q} && git checkout -- . 2>&1", timeout=10)
        msg = (
            f"[AiderHeal] diff 超出限制 {diff_lines} > {MAX_DIFF_LINES} 行,"
            f"已丟棄,需人工介入"
        )
        logger.warning("event=diff_too_large file=%s diff_lines=%d", target_file, diff_lines)
        _notify_telegram(
            f"⚠️ <b>AiderHealdiff 過大,需人工審核</b>\n"
            f"├ 檔案:<code>{target_file}</code>\n"
            f"├ diff{diff_lines} 行(上限 {MAX_DIFF_LINES}\n"
            f"└ 錯誤:<code>{error_type}</code>"
        )
        return _heal_result(False, msg)

    # ── Step 4: commit and push ──────────────────────────────────────────────
    fix_msg = (
        f"fix(autoheal): [{error_type}] auto-fix {target_file}\n\n"
        f"Triggered by AiderHealExecutor (ADR-014)\n"
        f"Error: {safe_error[:200]}"
    )
    commit_cmd = (
        f"cd {repo_q} && "
        f"git add {shlex.quote(target_file)} && "
        f"git commit -m {shlex.quote(fix_msg)} 2>&1 && "
        f"git push {GITEA_REMOTE} main 2>&1"
    )
    rc3, commit_out, commit_err = _ssh_exec(commit_cmd, timeout=30)
    if rc3 != 0:
        msg = f"[AiderHeal] git push 失敗:{commit_err or commit_out}"
        logger.error("event=push_failed error=%s", msg)
        _notify_telegram(f"❌ AiderHeal git push 失敗\n<code>{msg}</code>")
        return _heal_result(False, msg)

    # SHA of the commit just pushed. The lookup runs entirely in the remote
    # command line; the remote repo path is not used as a local working
    # directory, which need not exist on this machine.
    sha_cmd = f"cd {repo_q} && git log -1 --format=%H"
    rc_sha, sha_out, _ = _ssh_exec(sha_cmd, timeout=10)
    commit_sha = (sha_out.strip() if rc_sha == 0 else "") or "unknown"
    logger.info("event=push_ok commit=%s", commit_sha)
    _notify_telegram(
        f"🚀 <b>AiderHeal push 完成</b>\n"
        f"├ commit<code>{commit_sha}</code>\n"
        f"├ 檔案:<code>{target_file}</code>\n"
        f"└ 等待健康檢查…"
    )

    # ── Step 5: health check (L4 guard rail) ─────────────────────────────────
    time.sleep(10)  # give the deployment a moment to start
    healthy = _wait_for_health(HEALTH_CHECK_URL, timeout_seconds=120, interval_seconds=10)
    if healthy:
        msg = f"[AiderHeal] 修復成功並部署完成:{target_file} ({commit_sha})"
        logger.info("event=heal_success commit=%s file=%s", commit_sha, target_file)
        _notify_telegram(
            f"✅ <b>AiderHeal 修復完成</b>\n"
            f"├ 錯誤:<code>{error_type}</code>\n"
            f"├ 檔案:<code>{target_file}</code>\n"
            f"├ commit<code>{commit_sha}</code>\n"
            f"└ diff{diff_lines}"
        )
        return _heal_result(True, msg, commit_sha=commit_sha)

    # ── Step 6: health check failed → automatic revert (L4 guard rail) ───────
    logger.error("event=health_check_failed commit=%s", commit_sha)
    # If the SHA lookup failed, revert HEAD: the checkout was reset and then
    # received exactly one commit in this run, so HEAD is that commit.
    revert_target = commit_sha if commit_sha != "unknown" else "HEAD"
    rc_revert, revert_out, revert_err = _ssh_exec(
        f"cd {repo_q} && "
        f"git revert --no-edit {shlex.quote(revert_target)} 2>&1 && "
        f"git push {GITEA_REMOTE} main 2>&1",
        timeout=30,
    )
    # Decide by exit status, not by searching the output for "error": git
    # reports hard failures as "fatal: ...", and a successful revert echoes
    # the reverted commit subject, which contains the error type
    # (e.g. "TypeError").
    reverted = rc_revert == 0
    if reverted:
        rc_rsha, rsha_out, _ = _ssh_exec(sha_cmd, timeout=10)
        revert_sha = (rsha_out.strip() if rc_rsha == 0 else "") or "unknown"
        msg = (
            f"[AiderHeal] 健康檢查失敗,已自動回滾:"
            f"{commit_sha} → {revert_sha}"
        )
        logger.warning("event=reverted commit=%s to=%s", commit_sha, revert_sha)
        _notify_telegram(
            f"🔄 <b>AiderHeal 自動回滾</b>\n"
            f"├ 原 commit<code>{commit_sha}</code>\n"
            f"├ 回滾 commit<code>{revert_sha}</code>\n"
            f"└ 需人工排查:<code>{error_type}</code> in <code>{target_file}</code>"
        )
    else:
        # stderr is merged into stdout by `2>&1`, so fall back to stdout for
        # the failure detail.
        revert_detail = revert_err or revert_out
        msg = f"[AiderHeal] 回滾失敗!需立即人工介入:{revert_detail}"
        logger.critical("event=revert_failed commit=%s error=%s", commit_sha, revert_detail)
        _notify_telegram(
            f"🚨 <b>AiderHeal 回滾失敗!請立即人工介入</b>\n<code>{msg}</code>"
        )
    return _heal_result(False, msg, commit_sha=commit_sha, reverted=reverted)