Files
ewoooc/database/autoheal_models.py
ogt 77d3a1da48
Some checks failed
CD Pipeline / deploy (push) Failing after 3m24s
feat(ai-ops): ADR-013 AIOps 自動修復閉環完整實作
架構(Exception → Incident → PlayBook → Heal → KM → Telegram):

新增元件:
- database/autoheal_models.py: Incident/Playbook/HealLog 三張表 + 7 條種子 PlayBook
- migrations/013_autoheal.sql: 建表 DDL + 種子資料(冪等 INSERT)
- services/auto_heal_service.py: 核心引擎 7 步閉環
  - _classify_error: 8 類錯誤自動分類 (DNS_FAIL/DB_UNREACHABLE/OOM/...)
  - _match_playbook: error_type + keyword + 冷卻 + max_retries 保護
  - _execute_playbook: DOCKER_RESTART/SSH_CMD/ALERT_ONLY/WAIT_RETRY
  - _sink_to_km: 修復知識寫入 ai_insights (auto_heal_playbook)
  - SSH 白名單:僅允許 docker restart / compose restart / docker start

修改元件:
- database/manager.py: _init_autoheal_tables() 啟動時建表+種子 PlayBook
- scheduler.py: 3 個核心任務植入 handle_exception
  (run_auto_import_task / run_icaim_analysis_task / run_weekly_strategy_task)
- requirements.txt: paramiko(SSH 跳板;不可用時降級 subprocess+CLI ssh)

安全設計: CMD 白名單 + cooldown + max_retries escalation + DB 冪等 migration

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-19 16:03:49 +08:00

251 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
AIOps 自動修復資料庫模型 (ADR-013)
三張表:incidents / playbooks / heal_logs
構成「感知 → 匹配 → 執行 → 記錄」的完整閉環資料層
"""
import json
from sqlalchemy import (
Column, Integer, String, Text, Boolean, DateTime, Float, ForeignKey, Index
)
from datetime import datetime
from .models import Base
class Incident(Base):
    """Incident master table: one row per system anomaly event.

    The ``status`` column follows the lifecycle
    ``open`` -> ``healing`` -> ``resolved`` / ``escalated``.
    """

    __tablename__ = "incidents"
    __table_args__ = (
        Index("idx_incident_status_created", "status", "created_at"),
        Index("idx_incident_task_error", "task_name", "error_type"),
    )

    id = Column(Integer, primary_key=True)

    # --- Origin of the failure ---
    # Name of the task that failed, e.g. run_auto_import_task.
    task_name = Column(String(100), nullable=False, index=True)
    # Error category: DB_UNREACHABLE / DNS_FAIL / OOM / etc.
    error_type = Column(String(50), nullable=False, index=True)
    # Original exception message (short form).
    error_message = Column(Text, nullable=False)
    # Full traceback; may be large.
    error_traceback = Column(Text)

    # --- Severity and state ---
    # P1 / P2 / P3
    severity = Column(String(5), default="P2")
    # open / healing / resolved / escalated
    status = Column(String(20), default="open", index=True)

    # --- PlayBook association ---
    playbook_id = Column(Integer, ForeignKey("playbooks.id"), nullable=True)

    # --- Counters ---
    retry_count = Column(Integer, default=0)

    # --- Timestamps ---
    resolved_at = Column(DateTime, nullable=True)
    created_at = Column(DateTime, default=datetime.now)
    updated_at = Column(DateTime, default=datetime.now, onupdate=datetime.now)

    def to_dict(self) -> dict:
        """Return a plain-dict snapshot of this incident.

        ``resolved_at`` and ``created_at`` are rendered with ``isoformat()``
        (``None`` when unset). ``error_traceback`` and ``updated_at`` are not
        part of the snapshot.
        """
        scalar_fields = (
            "id",
            "task_name",
            "error_type",
            "error_message",
            "severity",
            "status",
            "playbook_id",
            "retry_count",
        )
        snapshot = {field: getattr(self, field) for field in scalar_fields}
        for stamp_field in ("resolved_at", "created_at"):
            moment = getattr(self, stamp_field)
            snapshot[stamp_field] = moment.isoformat() if moment else None
        return snapshot
class Playbook(Base):
    """PlayBook rule table: each row maps a kind of error to a remediation action.

    ``match_pattern`` stores a JSON array of keywords; a hit on ANY of them
    triggers the rule. ``action_params`` stores a JSON object holding the
    parameters the action needs.
    """

    __tablename__ = "playbooks"

    id = Column(Integer, primary_key=True)

    # --- Identification and classification ---
    name = Column(String(200), nullable=False, unique=True)  # human-readable name
    error_type = Column(String(50), nullable=False, index=True)  # must correspond to Incident.error_type
    match_pattern = Column(Text, nullable=False)  # JSON array, e.g. ["name resolution", "could not translate"]
    severity_min = Column(String(5), default="P3")  # minimum severity that triggers the rule

    # --- Action definition ---
    action_type = Column(String(30), nullable=False)  # SSH_CMD / DOCKER_RESTART / ALERT_ONLY / WAIT_RETRY
    action_params = Column(Text)  # JSON object, e.g. {"container": "momo-db", "cmd": "docker restart momo-db"}

    # --- Protection ---
    cooldown_min = Column(Integer, default=30)  # cooldown in minutes
    max_retries = Column(Integer, default=3)  # escalate once this limit is reached

    # --- State and statistics ---
    is_active = Column(Boolean, default=True, index=True)
    success_count = Column(Integer, default=0)  # historical success count (accumulated automatically)
    fail_count = Column(Integer, default=0)  # historical failure count (accumulated automatically)
    km_synced = Column(Boolean, default=False)  # whether this rule has already been synced to the KM
    created_at = Column(DateTime, default=datetime.now)
    updated_at = Column(DateTime, default=datetime.now, onupdate=datetime.now)

    def get_match_patterns(self) -> list:
        """Return ``match_pattern`` decoded as a Python list.

        An empty list is returned when the column is unset, is not valid JSON,
        or decodes to something other than a JSON array (e.g. ``null``, a bare
        string or an object), so callers can always iterate the result.
        """
        try:
            decoded = json.loads(self.match_pattern)
        except (TypeError, ValueError):
            # TypeError: value is None / not text.
            # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
            return []
        return decoded if isinstance(decoded, list) else []

    def get_action_params(self) -> dict:
        """Return ``action_params`` decoded as a Python dict.

        An empty dict is returned when the column is empty, is not valid JSON,
        or decodes to something other than a JSON object, so callers can
        always use mapping operations on the result.
        """
        if not self.action_params:
            return {}
        try:
            decoded = json.loads(self.action_params)
        except (TypeError, ValueError):
            return {}
        return decoded if isinstance(decoded, dict) else {}

    def to_dict(self) -> dict:
        """Return a plain-dict snapshot of this rule.

        ``match_pattern`` and ``action_params`` are returned already decoded
        (list / dict) rather than as their stored JSON text.
        """
        return {
            "id": self.id,
            "name": self.name,
            "error_type": self.error_type,
            "match_pattern": self.get_match_patterns(),
            "action_type": self.action_type,
            "action_params": self.get_action_params(),
            "cooldown_min": self.cooldown_min,
            "max_retries": self.max_retries,
            "is_active": self.is_active,
            "success_count": self.success_count,
            "fail_count": self.fail_count,
        }
class HealLog(Base):
    """Heal execution log: one row is written for every AutoHeal attempt.

    ``result`` is one of ``success`` / ``failed`` / ``skipped`` (skipped while
    in cooldown); the column default is ``pending``.
    """

    __tablename__ = "heal_logs"
    __table_args__ = (
        Index("idx_heal_log_incident", "incident_id", "created_at"),
    )

    id = Column(Integer, primary_key=True)
    incident_id = Column(
        Integer, ForeignKey("incidents.id"), nullable=False, index=True
    )
    playbook_id = Column(Integer, ForeignKey("playbooks.id"), nullable=True)

    # --- What was executed ---
    action_type = Column(String(30))
    # The command actually executed, or a description of the action.
    action_detail = Column(Text)
    # success / failed / skipped
    result = Column(String(20), default="pending", index=True)
    # Command output or error message.
    result_output = Column(Text)
    # Execution time in milliseconds.
    duration_ms = Column(Float, default=0)

    created_at = Column(DateTime, default=datetime.now)

    def to_dict(self) -> dict:
        """Return a plain-dict snapshot of this log row.

        ``created_at`` is rendered with ``isoformat()``, or ``None`` when unset.
        """
        copied_as_is = (
            "id",
            "incident_id",
            "playbook_id",
            "action_type",
            "action_detail",
            "result",
            "result_output",
            "duration_ms",
        )
        record = {column: getattr(self, column) for column in copied_as_is}
        if self.created_at:
            record["created_at"] = self.created_at.isoformat()
        else:
            record["created_at"] = None
        return record
# ─────────────────────────────────────────────────
# Default seed PlayBook data (inserted on first startup)
# ─────────────────────────────────────────────────
# Each rule is written with native Python values; ``match_pattern`` and
# ``action_params`` are serialized to JSON text in one place below, since
# both are stored as JSON strings.
# NOTE(review): the two OOM rules share error_type "OOM" and several
# patterns — confirm how the matcher chooses between them.
SEED_PLAYBOOKS = [
    {
        **rule,
        "match_pattern": json.dumps(rule["match_pattern"]),
        "action_params": json.dumps(rule["action_params"]),
    }
    for rule in (
        {
            "name": "Docker DNS 解析失敗修復",
            "error_type": "DNS_FAIL",
            "match_pattern": [
                "name resolution",
                "could not translate host name",
                "Temporary failure in name resolution",
            ],
            "severity_min": "P2",
            "action_type": "DOCKER_RESTART",
            "action_params": {"container": "momo-db"},
            "cooldown_min": 30,
            "max_retries": 3,
        },
        {
            "name": "DB 連線被拒修復",
            "error_type": "DB_UNREACHABLE",
            "match_pattern": [
                "connection refused",
                "Connection reset by peer",
                "could not connect to server",
            ],
            "severity_min": "P2",
            "action_type": "DOCKER_RESTART",
            "action_params": {"container": "momo-db", "compose": True},
            "cooldown_min": 30,
            "max_retries": 3,
        },
        {
            "name": "App OOM 自動重啟",
            "error_type": "OOM",
            "match_pattern": [
                "SIGKILL",
                "out of memory",
                "Worker was sent SIGKILL",
                "MemoryError",
            ],
            "severity_min": "P1",
            "action_type": "DOCKER_RESTART",
            "action_params": {"container": "momo-pro-system"},
            "cooldown_min": 60,
            "max_retries": 2,
        },
        {
            "name": "Scheduler OOM 自動重啟",
            "error_type": "OOM",
            "match_pattern": [
                "SIGKILL",
                "Worker was sent SIGKILL",
                "MemoryError",
            ],
            "severity_min": "P1",
            "action_type": "DOCKER_RESTART",
            "action_params": {"container": "momo-scheduler"},
            "cooldown_min": 60,
            "max_retries": 2,
        },
        {
            "name": "PostgreSQL SSL 連線中斷",
            "error_type": "SSL_FAIL",
            "match_pattern": [
                "SSL connection has been closed unexpectedly",
                "SSL SYSCALL error",
            ],
            "severity_min": "P2",
            "action_type": "DOCKER_RESTART",
            "action_params": {"container": "momo-pro-system"},
            "cooldown_min": 15,
            "max_retries": 3,
        },
        {
            "name": "Google Drive 認證失敗告警",
            "error_type": "AUTH_FAIL",
            "match_pattern": [
                "invalid_grant",
                "google_token.pickle",
                "Token has been expired or revoked",
            ],
            "severity_min": "P2",
            "action_type": "ALERT_ONLY",
            "action_params": {
                "message": "Google Drive OAuth Token 已過期,請人工重新認證。參閱 docs/guides/google_drive_setup.md",
            },
            "cooldown_min": 240,
            "max_retries": 1,
        },
        {
            "name": "爬蟲 HTTP 429 限流等待",
            "error_type": "CRAWLER_FAIL",
            "match_pattern": [
                "429 Too Many Requests",
                "rate limit",
                "Retry-After",
            ],
            "severity_min": "P3",
            "action_type": "WAIT_RETRY",
            "action_params": {"wait_minutes": 30},
            "cooldown_min": 30,
            "max_retries": 2,
        },
    )
]