fix(ai-ops): AutoHeal 三項修正 + 通知格式重設計
All checks were successful
CD Pipeline / deploy (push) Successful in 1m19s
All checks were successful
CD Pipeline / deploy (push) Successful in 1m19s
1. SSH 金鑰:新增 _SSH_KEY_PATH(/app/config/autoheal_id_ed25519)
   paramiko key_filename 參數,支援 config 目錄 rw mount 無需重建容器

2. _create_incident:加入 refresh+expunge
   避免 session.close() 後 incident.severity 等屬性 DetachedInstanceError

3. _write_heal_log fallback:補 duration_ms=duration_ms
   原本 fallback HealLog() 沒設 duration_ms → None:.0f 觸發 TypeError

4. _notify_telegram 格式重設計
   - success/failed/skipped 三種 header 差異化
   - failed 時顯示人工介入指令 + Incident ID
   - 三段式分隔(標題 → PlayBook 動作 → 結論)
   - 移除「已沉澱至 KM」在 failed 時的誤導訊息

SSH 驗證:2026-04-19 16:30 實測 result=success duration=3110ms

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -40,6 +40,8 @@ _JUMP_HOST = os.getenv("SSH_JUMP_HOST", "192.168.0.110")
|
||||
_JUMP_USER = os.getenv("SSH_JUMP_USER", "wooo")
|
||||
_TARGET_HOST = os.getenv("SSH_TARGET_HOST", "192.168.0.188")
|
||||
_TARGET_USER = os.getenv("SSH_TARGET_USER", "ollama")
|
||||
# SSH 私鑰路徑:優先 env,fallback 到 config 目錄(rw mount,不需重建容器)
|
||||
_SSH_KEY_PATH = os.getenv("SSH_KEY_PATH", "/app/config/autoheal_id_ed25519")
|
||||
|
||||
# ─── 白名單允許執行的指令前綴 ────────────────────────────
|
||||
_CMD_WHITELIST = [
|
||||
@@ -117,9 +119,14 @@ def _execute_ssh_cmd(cmd: str) -> Tuple[bool, str]:
|
||||
|
||||
try:
|
||||
import paramiko
|
||||
import os as _os
|
||||
key_path = _SSH_KEY_PATH if _os.path.isfile(_SSH_KEY_PATH) else None
|
||||
key_kwargs = {"key_filename": key_path} if key_path else {}
|
||||
|
||||
jump = paramiko.SSHClient()
|
||||
jump.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
jump.connect(_JUMP_HOST, username=_JUMP_USER, timeout=10)
|
||||
jump.connect(_JUMP_HOST, username=_JUMP_USER, timeout=10,
|
||||
look_for_keys=True, **key_kwargs)
|
||||
|
||||
# 透過跳板機建立隧道
|
||||
transport = jump.get_transport()
|
||||
@@ -129,7 +136,8 @@ def _execute_ssh_cmd(cmd: str) -> Tuple[bool, str]:
|
||||
|
||||
target = paramiko.SSHClient()
|
||||
target.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
target.connect(_TARGET_HOST, username=_TARGET_USER, sock=chan, timeout=15)
|
||||
target.connect(_TARGET_HOST, username=_TARGET_USER, sock=chan, timeout=15,
|
||||
look_for_keys=True, **key_kwargs)
|
||||
|
||||
_stdin, stdout, stderr = target.exec_command(cmd, timeout=60)
|
||||
out = stdout.read().decode("utf-8", errors="replace").strip()
|
||||
@@ -224,6 +232,8 @@ class AutoHealService:
|
||||
)
|
||||
session.add(incident)
|
||||
session.commit()
|
||||
session.refresh(incident)
|
||||
session.expunge(incident)
|
||||
sys_log.info(f"[AutoHeal] 建立 Incident id={incident.id} type={error_type}")
|
||||
return incident
|
||||
except Exception as e:
|
||||
@@ -379,7 +389,7 @@ class AutoHealService:
|
||||
except Exception as e:
|
||||
session.rollback()
|
||||
sys_log.error(f"[AutoHeal] write_heal_log 失敗: {e}")
|
||||
return HealLog(result=result, action_detail=action_detail)
|
||||
return HealLog(result=result, action_detail=action_detail, duration_ms=duration_ms)
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
@@ -422,19 +432,35 @@ class AutoHealService:
|
||||
# ── 步驟 7:Telegram 通知 ───────────────────────────
|
||||
def _notify_telegram(self, incident: Incident, playbook: Playbook,
|
||||
heal_log: HealLog) -> None:
|
||||
"""推播修復結果通知"""
|
||||
icon = {"success": "✅", "failed": "❌", "skipped": "⏭️"}.get(heal_log.result, "❓")
|
||||
sev_icon = {"P1": "🔴", "P2": "🟠", "P3": "🟡"}.get(incident.severity, "⚪")
|
||||
result = heal_log.result
|
||||
|
||||
if result == "success":
|
||||
header = f"✅ <b>[AIOps] 自動修復成功</b>"
|
||||
footer = f"💾 知識已沉澱至 KM"
|
||||
elif result == "failed":
|
||||
header = f"🚨 <b>[AIOps] 自動修復失敗 — 需人工介入</b>"
|
||||
footer = (
|
||||
f"⚠️ 修復指令回傳錯誤,請登入 188 手動排查:\n"
|
||||
f"<code>docker restart {playbook.get_action_params().get('container', '?')}</code>\n"
|
||||
f"🆔 Incident #{incident.id}"
|
||||
)
|
||||
else:
|
||||
header = f"⏭️ <b>[AIOps] 修復已略過</b>"
|
||||
footer = f"🆔 Incident #{incident.id}"
|
||||
|
||||
msg = (
|
||||
f"{sev_icon} <b>[EwoooC AIOps] 自動修復報告</b>\n\n"
|
||||
f"📌 任務:<code>{incident.task_name}</code>\n"
|
||||
f"🚨 錯誤類型:<code>{incident.error_type}</code>\n"
|
||||
f"📝 症狀:{incident.error_message[:200]}\n\n"
|
||||
f"{sev_icon} {header}\n"
|
||||
f"━━━━━━━━━━━━━━━━━━\n"
|
||||
f"📌 <b>{incident.task_name}</b>\n"
|
||||
f"🔖 {incident.error_type} · {incident.severity}\n"
|
||||
f"📝 {incident.error_message[:180]}\n"
|
||||
f"━━━━━━━━━━━━━━━━━━\n"
|
||||
f"🔧 PlayBook:{playbook.name}\n"
|
||||
f"⚙️ 動作:<code>{heal_log.action_detail}</code>\n"
|
||||
f"{icon} 結果:<b>{heal_log.result}</b>({heal_log.duration_ms:.0f}ms)\n\n"
|
||||
f"💾 已沉澱至 KM(auto_heal_playbook)"
|
||||
f"⏱ 耗時:{heal_log.duration_ms:.0f}ms\n"
|
||||
f"━━━━━━━━━━━━━━━━━━\n"
|
||||
f"{footer}"
|
||||
)
|
||||
_send_telegram(msg)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user