diff --git a/migrations/014_code_fix_playbook.sql b/migrations/014_code_fix_playbook.sql
new file mode 100644
index 0000000..2edc060
--- /dev/null
+++ b/migrations/014_code_fix_playbook.sql
@@ -0,0 +1,11 @@
+-- ADR-014: Aider Code Fix Playbook
+INSERT INTO aiops_playbook (name, description, trigger_pattern, action_type, action_params, is_active)
+VALUES (
+    'Auto-fix Python Runtime exceptions using Aider (ADR-014)',
+    'When Elephant Alpha detects python_exception with tracebacks in logs, trigger CODE_FIX',
+    'python_exception',
+    'CODE_FIX',
+    '{"max_diff_lines": 50, "require_health_check": true, "auto_revert_on_fail": true}'::jsonb,
+    true
+)
+ON CONFLICT DO NOTHING;
diff --git a/services/aider_heal_executor.py b/services/aider_heal_executor.py
new file mode 100644
index 0000000..a986699
--- /dev/null
+++ b/services/aider_heal_executor.py
@@ -0,0 +1,354 @@
+"""
+services/aider_heal_executor.py
+ADR-014: Autonomous Code Heal Pipeline
+
+Runs Aider over SSH on host 110 to auto-fix code problems in the momo-pro
+repo, then pushes the fix directly so the Gitea CD pipeline deploys it.
+
+Safety guardrails:
+  L1 - file allow-list (only .py files directly under services/ routes/ database/)
+  L2 - diff limit (more than MAX_DIFF_LINES lines -> rejected, nothing pushed)
+  L3 - at most MAX_HOURLY_FIX CODE_FIX runs per hour
+  L4 - failed health check -> automatic git revert + push
+  L5 - Telegram notification for every outcome (success / failure / rollback)
+"""
+
+import html
+import os
+import re
+import shlex
+import time
+import subprocess
+import threading
+import requests
+from datetime import datetime, timedelta, timezone
+from typing import Optional
+from services.logger_manager import SystemLogger
+
+logger = SystemLogger("AiderHealExecutor").get_logger()
+
+# ── Configuration ────────────────────────────────────────────────────────────
+HEAL_SSH_HOST = "192.168.0.110"
+HEAL_SSH_USER = "wooo"
+HEAL_SSH_KEY = os.getenv("DEPLOY_SSH_KEY_PATH", "/root/.ssh/id_deploy")
+
+REPO_PATH_110 = os.getenv("AIDER_REPO_PATH", "/home/wooo/ewoooc")
+GITEA_REMOTE = "origin"
+HEALTH_CHECK_URL = os.getenv("MOMO_BASE_URL", "https://mo.wooo.work") + "/health"
+
+GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
+AIDER_MODEL = os.getenv("AIDER_MODEL", "gemini/gemini-2.0-flash")
+
+MAX_DIFF_LINES = int(os.getenv("AIDER_MAX_DIFF_LINES", "50"))
+MAX_HOURLY_FIX = int(os.getenv("AIDER_MAX_HOURLY_FIX", "5"))
+
+TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "")
+TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "")
+
+# Paths Aider is allowed to modify (regex)
+ALLOWED_FILE_PATTERN = re.compile(
+    r'^(services|routes|database)/[a-zA-Z0-9_]+\.py$'
+)
+
+# ── Rate counter (thread-safe) ───────────────────────────────────────────────
+_lock = threading.Lock()
+_fix_history: list[datetime] = []
+
+
+def _check_rate_limit() -> bool:
+    """Reserve one slot in the hourly CODE_FIX budget.
+
+    Drops history entries older than one hour, then records this attempt
+    unless MAX_HOURLY_FIX attempts are already inside the window.
+
+    Returns:
+        True if the attempt was recorded and may proceed, False if the
+        hourly limit has been reached.
+    """
+    global _fix_history
+    now = datetime.now(timezone.utc)
+    cutoff = now - timedelta(hours=1)
+    with _lock:
+        _fix_history = [t for t in _fix_history if t > cutoff]
+        if len(_fix_history) >= MAX_HOURLY_FIX:
+            return False
+        _fix_history.append(now)
+        return True
+
+
+def _notify_telegram(msg: str):
+    """Send a best-effort Telegram notification.
+
+    Blocks for at most the 5 second request timeout. Failures are logged at
+    debug level and otherwise ignored.
+    """
+    if not TELEGRAM_BOT_TOKEN or not TELEGRAM_CHAT_ID:
+        return
+    # With parse_mode=HTML, Telegram rejects text containing a raw '<', '>'
+    # or '&' (e.g. "<module>" in a traceback), so escape the message first.
+    payload = {
+        "chat_id": TELEGRAM_CHAT_ID,
+        "text": html.escape(msg, quote=False),
+        "parse_mode": "HTML",
+    }
+    try:
+        requests.post(
+            f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage",
+            json=payload,
+            timeout=5
+        )
+    except Exception as exc:
+        # Log the exception type only: its message can embed the request URL,
+        # which contains the bot token.
+        logger.debug("[AiderHeal] Telegram notify failed: %s", type(exc).__name__)
+
+
+def _ssh_run(cmd: str, timeout: int = 60) -> tuple[int, str, str]:
+    """Run a shell command on host 110 over SSH.
+
+    Args:
+        cmd: Command line handed to the remote shell. Callers must quote any
+            untrusted text themselves (shlex.quote).
+        timeout: Seconds to wait for the ssh process.
+
+    Returns:
+        (returncode, stdout, stderr), both streams stripped. A timeout or a
+        local failure to run ssh yields returncode -1 with the reason in
+        stderr.
+    """
+    full_cmd = [
+        "ssh",
+        "-i", HEAL_SSH_KEY,
+        # NOTE(review): host key checking is disabled, which permits MITM on
+        # the deploy path; consider pinning the host key instead.
+        "-o", "StrictHostKeyChecking=no",
+        "-o", "ConnectTimeout=10",
+        f"{HEAL_SSH_USER}@{HEAL_SSH_HOST}",
+        cmd
+    ]
+    try:
+        result = subprocess.run(
+            full_cmd, capture_output=True, text=True, timeout=timeout
+        )
+        return result.returncode, result.stdout.strip(), result.stderr.strip()
+    except subprocess.TimeoutExpired:
+        return -1, "", f"SSH timeout after {timeout}s"
+    except Exception as e:
+        return -1, "", str(e)
+
+
+def _health_check(retries: int = 6, interval: int = 10) -> bool:
+    """Poll HEALTH_CHECK_URL until it answers HTTP 200.
+
+    Makes up to `retries` GET requests, sleeping `interval` seconds between
+    attempts; a request error counts as a failed attempt.
+
+    Returns:
+        True as soon as one response has status 200, otherwise False.
+    """
+    for i in range(retries):
+        try:
+            r = requests.get(HEALTH_CHECK_URL, timeout=10)
+            if r.status_code == 200:
+                return True
+        except Exception:
+            pass  # service not reachable yet; retry
+        if i < retries - 1:
+            time.sleep(interval)
+    return False
+
+
+def execute_code_fix(
+    error_type: str,
+    error_message: str,
+    target_file: str,
+    context: dict | None = None,
+) -> dict:
+    """
+    Main entry point: run an Aider auto-fix on one file and push the result.
+
+    Args:
+        error_type: Error type (e.g. 'ImportError', 'RuntimeError')
+        error_message: Full error message (from the container log)
+        target_file: Path relative to the repo root
+            (e.g. 'services/pchome_crawler.py')
+        context: Optional extra context dict (currently unused)
+
+    Returns:
+        {
+            'success': bool,
+            'action': 'CODE_FIX',
+            'message': str,
+            'commit_sha': str | None,
+            'reverted': bool,
+        }
+    """
+    ts = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
+
+    # L1: file allow-list. fullmatch() rather than match(): '$' also matches
+    # before a trailing newline, which would then reach the shell commands.
+    if not ALLOWED_FILE_PATTERN.fullmatch(target_file):
+        reason = f"[AiderHeal] 檔案不在白名單: {target_file}"
+        logger.warning(reason)
+        return {"success": False, "action": "CODE_FIX",
+                "message": reason, "commit_sha": None, "reverted": False}
+
+    # L3: rate limit
+    if not _check_rate_limit():
+        reason = f"[AiderHeal] 每小時上限 {MAX_HOURLY_FIX} 次,跳過"
+        logger.warning(reason)
+        return {"success": False, "action": "CODE_FIX",
+                "message": reason, "commit_sha": None, "reverted": False}
+
+    _notify_telegram(
+        f"🔧 AiderHeal 啟動\n"
+        f"├ 錯誤類型: {error_type}\n"
+        f"├ 目標檔案: {target_file}\n"
+        f"└ 時間: {ts}"
+    )
+    logger.info("[AiderHeal] 開始修復: %s → %s", error_type, target_file)
+
+    # ── Step 1: prepare the repo (on 110) ────────────────────────────────────
+    # The stash carries its own "|| true" inside a group: ungrouped, the
+    # "|| true" applies to the whole && chain and hides fetch/reset failures.
+    setup_cmds = (
+        f"cd {REPO_PATH_110} && "
+        f"git fetch {GITEA_REMOTE} main 2>&1 && "
+        f"git reset --hard {GITEA_REMOTE}/main 2>&1 && "
+        f"(git stash 2>&1 || true)"
+    )
+    rc, out, err = _ssh_run(setup_cmds, timeout=30)
+    if rc != 0:
+        msg = f"[AiderHeal] git 準備失敗: {err or out}"
+        logger.error(msg)
+        _notify_telegram(f"❌ AiderHeal 失敗(git 準備)\n{msg}")
+        return {"success": False, "action": "CODE_FIX",
+                "message": msg, "commit_sha": None, "reverted": False}
+
+    # ── Step 2: build the Aider command ──────────────────────────────────────
+    # Truncate the error text. Shell safety comes from shlex.quote below,
+    # which also covers error_type, backslashes and embedded newlines.
+    safe_error = error_message[:500]
+    instruction = (
+        f"Fix the following {error_type} in this file. "
+        f"Only fix what is necessary, do not refactor or add features. "
+        f"Error: {safe_error}"
+    )
+
+    aider_cmd = (
+        f"cd {REPO_PATH_110} && "
+        f"GEMINI_API_KEY={shlex.quote(GEMINI_API_KEY)} "
+        f"aider --model {shlex.quote(AIDER_MODEL)} "
+        f"--yes-always --no-git "
+        f"--message {shlex.quote(instruction)} "
+        f"{target_file} 2>&1"
+    )
+    logger.info("[AiderHeal] 執行 aider on 110...")
+    rc, aider_out, aider_err = _ssh_run(aider_cmd, timeout=180)
+    logger.info("[AiderHeal] aider 輸出: %s", (aider_out or aider_err)[:300])
+
+    # ── Step 3: diff size check (guardrail L2) ───────────────────────────────
+    diff_cmd = f"cd {REPO_PATH_110} && git diff --unified=0 | wc -l"
+    rc2, diff_lines_str, _ = _ssh_run(diff_cmd)
+    # An unreadable count is treated as oversized (999) so nothing is pushed.
+    diff_lines = int(diff_lines_str.strip()) if diff_lines_str.strip().isdigit() else 999
+
+    if diff_lines == 0:
+        msg = "[AiderHeal] Aider 未產生任何修改(diff=0行),可能已自動解決或模型失效"
+        logger.warning(msg)
+        _notify_telegram(f"⚠️ AiderHeal:無修改產生\n{target_file}")
+        return {"success": False, "action": "CODE_FIX",
+                "message": msg, "commit_sha": None, "reverted": False}
+
+    if diff_lines > MAX_DIFF_LINES:
+        # Change too large: discard it and escalate to a human
+        _ssh_run(f"cd {REPO_PATH_110} && git checkout -- .", timeout=10)
+        msg = (f"[AiderHeal] diff 超出限制 {diff_lines}>{MAX_DIFF_LINES} 行,"
+               f"已丟棄,需人工介入")
+        logger.warning(msg)
+        _notify_telegram(
+            f"⚠️ AiderHeal:diff 過大,需人工審核\n"
+            f"├ 檔案: {target_file}\n"
+            f"├ diff: {diff_lines} 行(上限 {MAX_DIFF_LINES})\n"
+            f"└ 錯誤: {error_type}"
+        )
+        return {"success": False, "action": "CODE_FIX",
+                "message": msg, "commit_sha": None, "reverted": False}
+
+    # ── Step 4: git commit + push ────────────────────────────────────────────
+    fix_msg = (
+        f"fix(autoheal): [{error_type}] auto-fix {target_file}\n\n"
+        f"Triggered by AiderHealExecutor (ADR-014)\n"
+        f"Error: {safe_error[:200]}"
+    )
+    commit_cmd = (
+        f"cd {REPO_PATH_110} && "
+        f"git add {target_file} && "
+        f"git commit -m {shlex.quote(fix_msg)} 2>&1 && "
+        f"git push {GITEA_REMOTE} main 2>&1"
+    )
+    rc3, commit_out, commit_err = _ssh_run(commit_cmd, timeout=30)
+
+    # Fetch the commit SHA
+    sha_cmd = f"cd {REPO_PATH_110} && git rev-parse --short HEAD"
+    _, commit_sha, _ = _ssh_run(sha_cmd)
+    commit_sha = commit_sha.strip() or "unknown"
+
+    if rc3 != 0:
+        msg = f"[AiderHeal] git push 失敗: {commit_err or commit_out}"
+        logger.error(msg)
+        _notify_telegram(f"❌ AiderHeal git push 失敗\n{msg}")
+        return {"success": False, "action": "CODE_FIX",
+                "message": msg, "commit_sha": None, "reverted": False}
+
+    logger.info("[AiderHeal] push 成功,commit=%s,等待健康檢查...", commit_sha)
+    _notify_telegram(
+        f"🚀 AiderHeal push 完成\n"
+        f"├ commit: {commit_sha}\n"
+        f"├ 檔案: {target_file}\n"
+        f"└ 等待健康檢查..."
+    )
+
+    # ── Step 5: health check (guardrail L4) ──────────────────────────────────
+    time.sleep(20)  # give the CD deployment time to start
+    healthy = _health_check(retries=6, interval=10)
+
+    if healthy:
+        msg = f"[AiderHeal] 修復成功並部署完成: {target_file} ({commit_sha})"
+        logger.info(msg)
+        _notify_telegram(
+            f"✅ AiderHeal 修復完成\n"
+            f"├ 錯誤: {error_type}\n"
+            f"├ 檔案: {target_file}\n"
+            f"├ commit: {commit_sha}\n"
+            f"└ diff: {diff_lines} 行"
+        )
+        return {"success": True, "action": "CODE_FIX",
+                "message": msg, "commit_sha": commit_sha, "reverted": False}
+
+    # ── Step 6: health check failed → automatic revert (guardrail L4) ────────
+    logger.error("[AiderHeal] 健康檢查失敗,執行自動 revert...")
+    revert_cmd = (
+        f"cd {REPO_PATH_110} && "
+        f"git revert --no-edit {commit_sha} 2>&1 && "
+        f"git push {GITEA_REMOTE} main 2>&1"
+    )
+    rc4, rev_out, rev_err = _ssh_run(revert_cmd, timeout=30)
+    if rc4 == 0:
+        _, revert_sha, _ = _ssh_run(sha_cmd)
+        revert_sha = revert_sha.strip()
+        msg = f"[AiderHeal] 健康檢查失敗,已自動 revert: {commit_sha} → {revert_sha}"
+        logger.warning(msg)
+        _notify_telegram(
+            f"🔄 AiderHeal 自動回滾\n"
+            f"├ 原 commit: {commit_sha}\n"
+            f"├ 回滾 commit: {revert_sha}\n"
+            f"└ 需人工排查: {error_type} in {target_file}"
+        )
+    else:
+        msg = f"[AiderHeal] revert 失敗!需立即人工介入: {rev_err}"
+        logger.critical(msg)
+        _notify_telegram(f"🚨 AiderHeal revert 失敗!請立即人工介入\n{msg}")
+
+    return {"success": False, "action": "CODE_FIX",
+            "message": msg, "commit_sha": commit_sha, "reverted": rc4 == 0}
diff --git a/services/auto_heal_service.py b/services/auto_heal_service.py
index 175e004..7c3f7b1 100644
--- a/services/auto_heal_service.py
+++ b/services/auto_heal_service.py
@@ -27,6 +27,7 @@ ALLOWED_ACTION_TYPES = frozenset({
     'WAIT_RETRY',
     'ALERT_ONLY',
     'SSH_CMD',
+    'CODE_FIX',  # ADR-014: Aider auto-fix
 })
 
 
@@ -385,6 +386,33 @@ class AutoHealService:
             except Exception as e:
                 return {"success": False, "action": "SSH_CMD", "message": str(e)}
 
+        if action_type == "CODE_FIX":
+            # ADR-014: auto-fix code with Aider and push the result.
+            # target_file is parsed from the traceback at runtime and arrives
+            # in `context`; the playbook's static action_params may override
+            # it but the ADR-014 playbook row does not carry one.
+            target_file = params.get("target_file") or context.get("target_file", "")
+            error_type = context.get("error_type", "UnknownError")
+            error_message = context.get("error_message", "")
+            if not target_file:
+                return {
+                    "success": False,
+                    "action": "CODE_FIX",
+                    "message": "Playbook CODE_FIX requires target_file in action_params or context",
+                }
+            try:
+                from services.aider_heal_executor import execute_code_fix
+                return execute_code_fix(
+                    error_type=error_type,
+                    error_message=error_message,
+                    target_file=target_file,
+                    context=context,
+                )
+            except Exception as e:
+                logger.error("[AutoHeal] CODE_FIX 失敗: %s", e)
+                return {"success": False, "action": "CODE_FIX",
+                        "message": f"CODE_FIX 例外: {e}"}
+
         return {
             "success": False,
             "action": action_type,
diff --git a/services/elephant_alpha_autonomous_engine.py b/services/elephant_alpha_autonomous_engine.py
index eed2681..0caf89a 100644
--- a/services/elephant_alpha_autonomous_engine.py
+++ b/services/elephant_alpha_autonomous_engine.py
@@ -57,6 +57,7 @@ _TRIGGER_ZH: Dict[str, str] = {
     "resource_optimization": "資源調配優化",
     "sales_anomaly": "銷售異常偵測",
     "ea_escalation": "EA 升級審核",
+    "code_exception": "程式碼異常偵測",  # ADR-014
 }
 
 _AGENT_ZH: Dict[str, str] = {
@@ -119,10 +120,11 @@ class ElephantAlphaAutonomousEngine:
 
     # 各 trigger 的 escalation cooldown(分鐘)
     ESCALATION_COOLDOWN: Dict[str, int] = {
-        "price_drop_alert": 30,  # 同一類型 30 分鐘只發一次
+        "price_drop_alert": 30,
         "market_opportunity": 60,
         "threat_escalation": 15,
         "resource_optimization": 60,
+        "code_exception": 5,  # ADR-014: re-check code errors every 5 minutes
     }
 
     DEFAULT_COOLDOWN_MIN = 30
@@ -171,7 +173,15 @@ class ElephantAlphaAutonomousEngine:
                 conditions={"system_load": "high", "queue_size": ">10"},
                 threshold=0.6,
                 enabled=True
-            )
+            ),
+            AutonomousTrigger(
+                trigger_type="code_exception",  # ADR-014
+                conditions={"scan_containers": ["momo-pro-system", "momo-scheduler"],
+                            "error_patterns": ["Traceback", "ImportError",
+                                               "RuntimeError", "ModuleNotFoundError"]},
+                threshold=1.0,  # fire on the first occurrence
+                enabled=True
+            ),
         ]
 
     async def start_autonomous_monitoring(self):
@@ -223,6 +233,8 @@ class ElephantAlphaAutonomousEngine:
             return await self._check_threat_escalation_trigger(trigger)
         elif trigger.trigger_type == "resource_optimization":
             return await self._check_resource_optimization_trigger(trigger)
+        elif trigger.trigger_type == "code_exception":  # ADR-014
+            return await self._check_code_exception_trigger(trigger)
         return False
 
     # ── Trigger checkers ──────────────────────────────────────────────
@@ -293,6 +305,74 @@ class ElephantAlphaAutonomousEngine:
         return (self._get_action_queue_size() > 10 or
                 self._get_system_load_percentage() > 80)
 
+    async def _check_code_exception_trigger(self, trigger: AutonomousTrigger) -> bool:
+        """ADR-014: scan recent container logs for a Python traceback.
+
+        For every container named in the trigger's "scan_containers" condition,
+        reads the last 5 minutes of ``docker logs`` and captures the first
+        traceback (its header line plus the next 14 lines). The findings are
+        stored on the trigger as ``_temp_error_msg`` / ``_temp_target_file`` for
+        ``_handle_code_exception_via_aider``.
+
+        Returns:
+            True if any scanned log contained a traceback, otherwise False.
+        """
+        import re
+        import subprocess
+
+        containers = trigger.conditions.get(
+            "scan_containers", ["momo-pro-system", "momo-scheduler"])
+        # Matches frames such as File "/app/services/xxx.py"
+        frame_re = re.compile(
+            r'File "([^"]*/(services|routes|database)/[^"]+\.py)"')
+        loop = asyncio.get_running_loop()
+
+        error_context = []
+        target_file = ""
+
+        for c in containers:
+            try:
+                # Only scan the last 5 minutes. docker runs in a worker thread
+                # so the blocking call cannot stall the event loop.
+                cmd = ["docker", "logs", "--since", "5m", c]
+                result = await loop.run_in_executor(
+                    None,
+                    lambda cmd=cmd: subprocess.run(
+                        cmd, capture_output=True, text=True, timeout=5),
+                )
+            except Exception as e:
+                logger.debug(f"Failed to scan log for {c}: {e}")
+                continue
+
+            lines = (result.stdout + "\n" + result.stderr).splitlines()
+            for i, line in enumerate(lines):
+                if "Traceback (most recent call last):" not in line:
+                    continue
+                err_str = "\n".join(lines[i:i + 15])  # header + next 14 lines
+                error_context.append(f"[{c}] {err_str}")
+
+                # Tracebacks list the most recent call last, so the last
+                # project frame is the one closest to the error. Only the
+                # first traceback that names a project file sets the target,
+                # so a later container cannot overwrite it.
+                frames = frame_re.findall(err_str)
+                if frames and not target_file:
+                    target_file = frames[-1][0]
+                    # Reduce to a repo-relative path
+                    if "/app/" in target_file:
+                        target_file = target_file.split("/app/", 1)[1]
+                    elif "momo-pro-system/" in target_file:
+                        target_file = target_file.split("momo-pro-system/", 1)[1]
+                break  # only the first traceback per container
+
+        if not error_context:
+            return False
+
+        # Stash on the trigger for the follow-up handler
+        trigger._temp_error_msg = "\n".join(error_context)
+        trigger._temp_target_file = target_file
+        return True
+
     # ── Decision execution ────────────────────────────────────────────
 
     async def _execute_autonomous_decision(self, trigger: AutonomousTrigger):
@@ -317,6 +397,12 @@ class ElephantAlphaAutonomousEngine:
             trigger.last_triggered = datetime.now()
             return
 
+        # ADR-014: code_exception → AiderHeal code fix loop
+        if trigger.trigger_type == "code_exception":
+            await self._handle_code_exception_via_aider(trigger)
+            trigger.last_triggered = datetime.now()
+            return
+
         context = await self._build_trigger_context(trigger)
         decision = await elephant_orchestrator.analyze_and_coordinate(context)
 
@@ -371,6 +457,42 @@ class ElephantAlphaAutonomousEngine:
         except Exception as e:
             logger.error(f"[ElephantAlpha] AutoHeal handoff failed: {e}")
 
+    async def _handle_code_exception_via_aider(self, trigger: AutonomousTrigger):
+        """ADR-014: code_exception → auto_heal_service.handle_exception (CODE_FIX)
+
+        Takes the error text and target file that the code_exception check
+        stored on the trigger and passes them to AutoHealService in a worker
+        thread. Hand-off failures are logged, not raised.
+        """
+        error_msg = getattr(trigger, '_temp_error_msg', 'Unknown Traceback')
+        target_file = getattr(trigger, '_temp_target_file', '')
+
+        # Basic filter: a traceback without a resolvable target file is skipped
+        if not target_file:
+            logger.warning("[ElephantAlpha] No target file parsed from traceback, skipping CODE_FIX")
+            return
+
+        try:
+            from services.auto_heal_service import AutoHealService
+            heal_service = AutoHealService()
+
+            # error_type 'python_exception' is expected to have a matching
+            # entry in the playbook table.
+            await asyncio.get_running_loop().run_in_executor(
+                None,
+                heal_service.handle_exception,
+                "python_exception",
+                {
+                    "error_type": "Python Traceback",
+                    "error_message": error_msg,
+                    "target_file": target_file,
+                    "source": "elephant_alpha_code_scan",
+                }
+            )
+            logger.info(f"[ElephantAlpha] Code exception handed off to AutoHealService for {target_file}")
+        except Exception as e:
+            logger.error(f"[ElephantAlpha] AutoHeal (CODE_FIX) handoff failed: {e}")
+
     async def _build_trigger_context(self, trigger: AutonomousTrigger) -> Dict[str, Any]:
         context = {
             "trigger_type": trigger.trigger_type,