fix(ai-ops): AutoHeal 三項修正 + 通知格式重設計
All checks were successful
CD Pipeline / deploy (push) Successful in 1m19s

1. SSH 金鑰:新增 _SSH_KEY_PATH(/app/config/autoheal_id_ed25519)
   paramiko key_filename 參數,支援 config 目錄 rw mount 無需重建容器

2. _create_incident:加入 refresh+expunge
   避免 session.close() 後存取 incident.severity 等屬性時觸發 DetachedInstanceError

3. _write_heal_log fallback:補 duration_ms=duration_ms
   原本 fallback HealLog() 沒設 duration_ms → 值為 None,以 :.0f 格式化時觸發 TypeError

4. _notify_telegram 格式重設計
   - success/failed/skipped 三種 header 差異化
   - failed 時顯示人工介入指令 + Incident ID
   - 三段式分隔(標題 → PlayBook 動作 → 結論)
   - 移除「已沉澱至 KM」在 failed 時的誤導訊息

SSH 驗證:2026-04-19 16:30 實測 result=success duration=3110ms

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
ogt
2026-04-19 16:30:45 +08:00
parent 352a99db58
commit fb0dad2289

View File

@@ -40,6 +40,8 @@ _JUMP_HOST = os.getenv("SSH_JUMP_HOST", "192.168.0.110")
_JUMP_USER = os.getenv("SSH_JUMP_USER", "wooo")
_TARGET_HOST = os.getenv("SSH_TARGET_HOST", "192.168.0.188")
_TARGET_USER = os.getenv("SSH_TARGET_USER", "ollama")
# SSH 私鑰路徑:優先 env,fallback 到 config 目錄(rw mount,不需重建容器)
_SSH_KEY_PATH = os.getenv("SSH_KEY_PATH", "/app/config/autoheal_id_ed25519")
# ─── 白名單允許執行的指令前綴 ────────────────────────────
_CMD_WHITELIST = [
@@ -117,9 +119,14 @@ def _execute_ssh_cmd(cmd: str) -> Tuple[bool, str]:
try:
import paramiko
import os as _os
key_path = _SSH_KEY_PATH if _os.path.isfile(_SSH_KEY_PATH) else None
key_kwargs = {"key_filename": key_path} if key_path else {}
jump = paramiko.SSHClient()
jump.set_missing_host_key_policy(paramiko.AutoAddPolicy())
jump.connect(_JUMP_HOST, username=_JUMP_USER, timeout=10)
jump.connect(_JUMP_HOST, username=_JUMP_USER, timeout=10,
look_for_keys=True, **key_kwargs)
# 透過跳板機建立隧道
transport = jump.get_transport()
@@ -129,7 +136,8 @@ def _execute_ssh_cmd(cmd: str) -> Tuple[bool, str]:
target = paramiko.SSHClient()
target.set_missing_host_key_policy(paramiko.AutoAddPolicy())
target.connect(_TARGET_HOST, username=_TARGET_USER, sock=chan, timeout=15)
target.connect(_TARGET_HOST, username=_TARGET_USER, sock=chan, timeout=15,
look_for_keys=True, **key_kwargs)
_stdin, stdout, stderr = target.exec_command(cmd, timeout=60)
out = stdout.read().decode("utf-8", errors="replace").strip()
@@ -224,6 +232,8 @@ class AutoHealService:
)
session.add(incident)
session.commit()
session.refresh(incident)
session.expunge(incident)
sys_log.info(f"[AutoHeal] 建立 Incident id={incident.id} type={error_type}")
return incident
except Exception as e:
@@ -379,7 +389,7 @@ class AutoHealService:
except Exception as e:
session.rollback()
sys_log.error(f"[AutoHeal] write_heal_log 失敗: {e}")
return HealLog(result=result, action_detail=action_detail)
return HealLog(result=result, action_detail=action_detail, duration_ms=duration_ms)
finally:
session.close()
@@ -422,19 +432,35 @@ class AutoHealService:
# ── 步驟 7:Telegram 通知 ───────────────────────────
def _notify_telegram(self, incident: Incident, playbook: Playbook,
heal_log: HealLog) -> None:
"""推播修復結果通知"""
icon = {"success": "", "failed": "", "skipped": "⏭️"}.get(heal_log.result, "")
sev_icon = {"P1": "🔴", "P2": "🟠", "P3": "🟡"}.get(incident.severity, "")
result = heal_log.result
if result == "success":
header = f"✅ <b>[AIOps] 自動修復成功</b>"
footer = f"💾 知識已沉澱至 KM"
elif result == "failed":
header = f"🚨 <b>[AIOps] 自動修復失敗 — 需人工介入</b>"
footer = (
f"⚠️ 修復指令回傳錯誤,請登入 188 手動排查:\n"
f"<code>docker restart {playbook.get_action_params().get('container', '?')}</code>\n"
f"🆔 Incident #{incident.id}"
)
else:
header = f"⏭️ <b>[AIOps] 修復已略過</b>"
footer = f"🆔 Incident #{incident.id}"
msg = (
f"{sev_icon} <b>[EwoooC AIOps] 自動修復報告</b>\n\n"
f"📌 任務:<code>{incident.task_name}</code>\n"
f"🚨 錯誤類型:<code>{incident.error_type}</code>\n"
f"📝 症狀:{incident.error_message[:200]}\n\n"
f"{sev_icon} {header}\n"
f"━━━━━━━━━━━━━━━━━━\n"
f"📌 <b>{incident.task_name}</b>\n"
f"🔖 {incident.error_type} · {incident.severity}\n"
f"📝 {incident.error_message[:180]}\n"
f"━━━━━━━━━━━━━━━━━━\n"
f"🔧 PlayBook{playbook.name}\n"
f"⚙️ 動作:<code>{heal_log.action_detail}</code>\n"
f"{icon} 結果:<b>{heal_log.result}</b>{heal_log.duration_ms:.0f}ms\n\n"
f"💾 已沉澱至 KMauto_heal_playbook"
f"⏱ 耗時:{heal_log.duration_ms:.0f}ms\n"
f"━━━━━━━━━━━━━━━━━━\n"
f"{footer}"
)
_send_telegram(msg)