From fb0dad22892edc1bb658367b3e07bb791b3a90d8 Mon Sep 17 00:00:00 2001 From: ogt Date: Sun, 19 Apr 2026 16:30:45 +0800 Subject: [PATCH] =?UTF-8?q?fix(ai-ops):=20AutoHeal=20=E4=B8=89=E9=A0=85?= =?UTF-8?q?=E4=BF=AE=E6=AD=A3=20+=20=E9=80=9A=E7=9F=A5=E6=A0=BC=E5=BC=8F?= =?UTF-8?q?=E9=87=8D=E8=A8=AD=E8=A8=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. SSH 金鑰:新增 _SSH_KEY_PATH(/app/config/autoheal_id_ed25519) paramiko key_filename 參數,支援 config 目錄 rw mount 無需重建容器 2. _create_incident:加入 refresh+expunge 避免 session.close() 後 incident.severity 等屬性 DetachedInstanceError 3. _write_heal_log fallback:補 duration_ms=duration_ms 原本 fallback HealLog() 沒設 duration_ms → None:.0f 觸發 TypeError 4. _notify_telegram 格式重設計 - success/failed/skipped 三種 header 差異化 - failed 時顯示人工介入指令 + Incident ID - 三段式分隔(標題 → PlayBook 動作 → 結論) - 移除「已沉澱至 KM」在 failed 時的誤導訊息 SSH 驗證:2026-04-19 16:30 實測 result=success duration=3110ms Co-Authored-By: Claude Sonnet 4.6 --- services/auto_heal_service.py | 48 +++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/services/auto_heal_service.py b/services/auto_heal_service.py index 1e23692..9f78eff 100644 --- a/services/auto_heal_service.py +++ b/services/auto_heal_service.py @@ -40,6 +40,8 @@ _JUMP_HOST = os.getenv("SSH_JUMP_HOST", "192.168.0.110") _JUMP_USER = os.getenv("SSH_JUMP_USER", "wooo") _TARGET_HOST = os.getenv("SSH_TARGET_HOST", "192.168.0.188") _TARGET_USER = os.getenv("SSH_TARGET_USER", "ollama") +# SSH 私鑰路徑:優先 env,fallback 到 config 目錄(rw mount,不需重建容器) +_SSH_KEY_PATH = os.getenv("SSH_KEY_PATH", "/app/config/autoheal_id_ed25519") # ─── 白名單允許執行的指令前綴 ──────────────────────────── _CMD_WHITELIST = [ @@ -117,9 +119,14 @@ def _execute_ssh_cmd(cmd: str) -> Tuple[bool, str]: try: import paramiko + import os as _os + key_path = _SSH_KEY_PATH if _os.path.isfile(_SSH_KEY_PATH) else None + key_kwargs = {"key_filename": key_path} if key_path else {} + jump = paramiko.SSHClient() jump.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - jump.connect(_JUMP_HOST, username=_JUMP_USER, timeout=10) + jump.connect(_JUMP_HOST, username=_JUMP_USER, timeout=10, + look_for_keys=True, **key_kwargs) # 透過跳板機建立隧道 transport = jump.get_transport() @@ -129,7 +136,8 @@ def _execute_ssh_cmd(cmd: str) -> Tuple[bool, str]: target = paramiko.SSHClient() target.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - target.connect(_TARGET_HOST, username=_TARGET_USER, sock=chan, timeout=15) + target.connect(_TARGET_HOST, username=_TARGET_USER, sock=chan, timeout=15, + look_for_keys=True, **key_kwargs) _stdin, stdout, stderr = target.exec_command(cmd, timeout=60) out = stdout.read().decode("utf-8", errors="replace").strip() @@ -224,6 +232,8 @@ class AutoHealService: ) session.add(incident) session.commit() + session.refresh(incident) + session.expunge(incident) sys_log.info(f"[AutoHeal] 建立 Incident id={incident.id} type={error_type}") return incident except Exception as e: @@ -379,7 +389,7 @@ class AutoHealService: except Exception as e: session.rollback() sys_log.error(f"[AutoHeal] write_heal_log 失敗: {e}") - return HealLog(result=result, action_detail=action_detail) + return HealLog(result=result, action_detail=action_detail, duration_ms=duration_ms) finally: session.close() @@ -422,19 +432,35 @@ class AutoHealService: # ── 步驟 7:Telegram 通知 ─────────────────────────── def _notify_telegram(self, incident: Incident, playbook: Playbook, heal_log: HealLog) -> None: - """推播修復結果通知""" - icon = {"success": "✅", "failed": "❌", "skipped": "⏭️"}.get(heal_log.result, "❓") sev_icon = {"P1": "🔴", "P2": "🟠", "P3": "🟡"}.get(incident.severity, "⚪") + result = heal_log.result + + if result == "success": + header = f"✅ [AIOps] 自動修復成功" + footer = f"💾 知識已沉澱至 KM" + elif result == "failed": + header = f"🚨 [AIOps] 自動修復失敗 — 需人工介入" + footer = ( + f"⚠️ 修復指令回傳錯誤,請登入 188 手動排查:\n" + f"docker restart {playbook.get_action_params().get('container', '?')}\n" + f"🆔 Incident #{incident.id}" + ) + else: + header = f"⏭️ [AIOps] 修復已略過" + footer = f"🆔 Incident #{incident.id}" msg = ( - f"{sev_icon} [EwoooC AIOps] 自動修復報告\n\n" - f"📌 任務:{incident.task_name}\n" - f"🚨 錯誤類型:{incident.error_type}\n" - f"📝 症狀:{incident.error_message[:200]}\n\n" + f"{sev_icon} {header}\n" + f"━━━━━━━━━━━━━━━━━━\n" + f"📌 {incident.task_name}\n" + f"🔖 {incident.error_type} · {incident.severity}\n" + f"📝 {incident.error_message[:180]}\n" + f"━━━━━━━━━━━━━━━━━━\n" f"🔧 PlayBook:{playbook.name}\n" f"⚙️ 動作:{heal_log.action_detail}\n" - f"{icon} 結果:{heal_log.result}({heal_log.duration_ms:.0f}ms)\n\n" - f"💾 已沉澱至 KM(auto_heal_playbook)" + f"⏱ 耗時:{heal_log.duration_ms:.0f}ms\n" + f"━━━━━━━━━━━━━━━━━━\n" + f"{footer}" ) _send_telegram(msg)