#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
AIOps auto-heal database models (ADR-013).

Three tables -- incidents / playbooks / heal_logs -- form the complete
closed-loop data layer: detect -> match -> execute -> record.
"""
import json
from datetime import datetime

from sqlalchemy import (
    Column, Integer, String, Text, Boolean, DateTime, Float, ForeignKey, Index
)

from .models import Base


class Incident(Base):
    """
    Incident master table -- records every system anomaly event.

    Status lifecycle: open -> healing -> resolved / escalated
    """
    __tablename__ = "incidents"

    id = Column(Integer, primary_key=True)

    # Source information
    # e.g. run_auto_import_task
    task_name = Column(String(100), nullable=False, index=True)
    # DB_UNREACHABLE / DNS_FAIL / OOM / etc.
    error_type = Column(String(50), nullable=False, index=True)
    # Original exception message (short)
    error_message = Column(Text, nullable=False)
    # Full traceback (may be large)
    error_traceback = Column(Text)

    # Severity and status
    severity = Column(String(5), default="P2")  # P1 / P2 / P3
    # open / healing / resolved / escalated
    status = Column(String(20), default="open", index=True)

    # Playbook association
    playbook_id = Column(Integer, ForeignKey("playbooks.id"), nullable=True)

    # Counters
    retry_count = Column(Integer, default=0)

    # Timestamps
    # NOTE(review): datetime.now produces naive local-time values.
    resolved_at = Column(DateTime, nullable=True)
    created_at = Column(DateTime, default=datetime.now)
    updated_at = Column(DateTime, default=datetime.now, onupdate=datetime.now)

    __table_args__ = (
        Index("idx_incident_status_created", "status", "created_at"),
        Index("idx_incident_task_error", "task_name", "error_type"),
    )

    def to_dict(self) -> dict:
        """Return a JSON-friendly summary of the incident.

        ``error_traceback`` and ``updated_at`` are not included; datetimes
        are rendered with ``isoformat()`` or ``None`` when unset.
        """
        return {
            "id": self.id,
            "task_name": self.task_name,
            "error_type": self.error_type,
            "error_message": self.error_message,
            "severity": self.severity,
            "status": self.status,
            "playbook_id": self.playbook_id,
            "retry_count": self.retry_count,
            "resolved_at": self.resolved_at.isoformat() if self.resolved_at else None,
            "created_at": self.created_at.isoformat() if self.created_at else None,
        }


class Playbook(Base):
    """
    Playbook rule library -- each row is one rule that maps to a
    remediation action.

    match_pattern is a JSON array; ANY hit triggers the rule.
    action_params is a JSON object holding the parameters the action needs.
    """
    __tablename__ = "playbooks"

    id = Column(Integer, primary_key=True)

    # Identification and classification
    # Human-readable name
    name = Column(String(200), nullable=False, unique=True)
    # Must correspond to Incident.error_type
    error_type = Column(String(50), nullable=False, index=True)
    # JSON array: ["name resolution", "could not translate"]
    match_pattern = Column(Text, nullable=False)
    # Minimum severity that triggers the rule
    severity_min = Column(String(5), default="P3")

    # Action definition
    # SSH_CMD / DOCKER_RESTART / ALERT_ONLY / WAIT_RETRY
    action_type = Column(String(30), nullable=False)
    # JSON object: {"container": "momo-db", "cmd": "docker restart momo-db"}
    action_params = Column(Text)

    # Safeguards
    cooldown_min = Column(Integer, default=30)  # cooldown in minutes
    max_retries = Column(Integer, default=3)    # escalate once the limit is reached

    # State and statistics
    is_active = Column(Boolean, default=True, index=True)
    success_count = Column(Integer, default=0)  # historical successes (auto-accumulated)
    fail_count = Column(Integer, default=0)     # historical failures (auto-accumulated)
    km_synced = Column(Boolean, default=False)  # whether already synced into KM

    created_at = Column(DateTime, default=datetime.now)
    updated_at = Column(DateTime, default=datetime.now, onupdate=datetime.now)

    def get_match_patterns(self) -> list:
        """Return ``match_pattern`` decoded as a Python list.

        Returns an empty list when the stored value is missing, is not
        valid JSON, or decodes to something other than a JSON array.
        """
        try:
            patterns = json.loads(self.match_pattern)
        except (TypeError, ValueError, RecursionError):
            # TypeError: value is None / not text; ValueError: malformed JSON
            # (JSONDecodeError is a subclass); RecursionError: absurd nesting.
            return []
        # Valid JSON of the wrong shape (e.g. a bare string) must not leak
        # out: callers iterate the result as a list of patterns.
        return patterns if isinstance(patterns, list) else []

    def get_action_params(self) -> dict:
        """Return ``action_params`` decoded as a Python dict.

        Returns an empty dict when the stored value is empty, is not valid
        JSON, or decodes to something other than a JSON object.
        """
        if not self.action_params:
            return {}
        try:
            params = json.loads(self.action_params)
        except (TypeError, ValueError, RecursionError):
            return {}
        # Guard against valid JSON that is not an object (array, number...).
        return params if isinstance(params, dict) else {}

    def to_dict(self) -> dict:
        """Return a JSON-friendly view of the rule.

        ``match_pattern`` and ``action_params`` are emitted in decoded form
        via the two getters above.
        """
        return {
            "id": self.id,
            "name": self.name,
            "error_type": self.error_type,
            "match_pattern": self.get_match_patterns(),
            "action_type": self.action_type,
            "action_params": self.get_action_params(),
            "cooldown_min": self.cooldown_min,
            "max_retries": self.max_retries,
            "is_active": self.is_active,
            "success_count": self.success_count,
            "fail_count": self.fail_count,
        }


class HealLog(Base):
    """
    Heal execution log -- one row is written for every AutoHeal attempt.

    result: success / failed / skipped (in cooldown)
    """
    __tablename__ = "heal_logs"

    id = Column(Integer, primary_key=True)
    incident_id = Column(Integer, ForeignKey("incidents.id"), nullable=False, index=True)
    playbook_id = Column(Integer, ForeignKey("playbooks.id"), nullable=True)

    # Execution details
    action_type = Column(String(30))
    action_detail = Column(Text)  # actual command executed / description
    # success / failed / skipped (column default is "pending")
    result = Column(String(20), default="pending", index=True)
    result_output = Column(Text)  # command output / error message
    duration_ms = Column(Float, default=0)  # execution duration (ms)

    created_at = Column(DateTime, default=datetime.now)

    __table_args__ = (
        Index("idx_heal_log_incident", "incident_id", "created_at"),
    )

    def to_dict(self) -> dict:
        """Return a JSON-friendly view of the log row.

        ``created_at`` is rendered with ``isoformat()`` or ``None`` when unset.
        """
        return {
            "id": self.id,
            "incident_id": self.incident_id,
            "playbook_id": self.playbook_id,
            "action_type": self.action_type,
            "action_detail": self.action_detail,
            "result": self.result,
            "result_output": self.result_output,
            "duration_ms": self.duration_ms,
            "created_at": self.created_at.isoformat() if self.created_at else None,
        }


# ─────────────────────────────────────────────────
# Default seed Playbook data (inserted on first startup)
# ─────────────────────────────────────────────────
# match_pattern / action_params are pre-serialized with json.dumps because
# the corresponding Playbook columns are Text.
SEED_PLAYBOOKS = [
    {
        "name": "Docker DNS 解析失敗修復",
        "error_type": "DNS_FAIL",
        "match_pattern": json.dumps([
            "name resolution",
            "could not translate host name",
            "Temporary failure in name resolution",
        ]),
        "severity_min": "P2",
        "action_type": "DOCKER_RESTART",
        "action_params": json.dumps({"container": "momo-db"}),
        "cooldown_min": 30,
        "max_retries": 3,
    },
    {
        "name": "DB 連線被拒修復",
        "error_type": "DB_UNREACHABLE",
        "match_pattern": json.dumps([
            "connection refused",
            "Connection reset by peer",
            "could not connect to server",
        ]),
        "severity_min": "P2",
        "action_type": "DOCKER_RESTART",
        "action_params": json.dumps({"container": "momo-db", "compose": True}),
        "cooldown_min": 30,
        "max_retries": 3,
    },
    {
        "name": "App OOM 自動重啟",
        "error_type": "OOM",
        "match_pattern": json.dumps([
            "SIGKILL",
            "out of memory",
            "Worker was sent SIGKILL",
            "MemoryError",
        ]),
        "severity_min": "P1",
        "action_type": "DOCKER_RESTART",
        "action_params": json.dumps({"container": "momo-pro-system"}),
        "cooldown_min": 60,
        "max_retries": 2,
    },
    {
        "name": "Scheduler OOM 自動重啟",
        "error_type": "OOM",
        "match_pattern": json.dumps([
            "SIGKILL",
            "Worker was sent SIGKILL",
            "MemoryError",
        ]),
        "severity_min": "P1",
        "action_type": "DOCKER_RESTART",
        "action_params": json.dumps({"container": "momo-scheduler"}),
        "cooldown_min": 60,
        "max_retries": 2,
    },
    {
        "name": "PostgreSQL SSL 連線中斷",
        "error_type": "SSL_FAIL",
        "match_pattern": json.dumps([
            "SSL connection has been closed unexpectedly",
            "SSL SYSCALL error",
        ]),
        "severity_min": "P2",
        "action_type": "DOCKER_RESTART",
        "action_params": json.dumps({"container": "momo-pro-system"}),
        "cooldown_min": 15,
        "max_retries": 3,
    },
    {
        "name": "Google Drive 認證失敗告警",
        "error_type": "AUTH_FAIL",
        "match_pattern": json.dumps([
            "invalid_grant",
            "google_token.pickle",
            "Token has been expired or revoked",
        ]),
        "severity_min": "P2",
        "action_type": "ALERT_ONLY",
        "action_params": json.dumps({"message": "Google Drive OAuth Token 已過期,請人工重新認證。參閱 docs/guides/google_drive_setup.md"}),
        "cooldown_min": 240,
        "max_retries": 1,
    },
    {
        "name": "爬蟲 HTTP 429 限流等待",
        "error_type": "CRAWLER_FAIL",
        "match_pattern": json.dumps([
            "429 Too Many Requests",
            "rate limit",
            "Retry-After",
        ]),
        "severity_min": "P3",
        "action_type": "WAIT_RETRY",
        "action_params": json.dumps({"wait_minutes": 30}),
        "cooldown_min": 30,
        "max_retries": 2,
    },
]