feat(ai-ops): ADR-013 AIOps 自動修復閉環完整實作
Some checks failed
CD Pipeline / deploy (push) Failing after 3m24s
Some checks failed
CD Pipeline / deploy (push) Failing after 3m24s
架構(Exception → Incident → PlayBook → Heal → KM → Telegram):

新增元件:
- database/autoheal_models.py: Incident/Playbook/HealLog 三張表 + 7 條種子 PlayBook
- migrations/013_autoheal.sql: 建表 DDL + 種子資料(冪等 INSERT)
- services/auto_heal_service.py: 核心引擎 7 步閉環
  - _classify_error: 8 類錯誤自動分類 (DNS_FAIL/DB_UNREACHABLE/OOM/...)
  - _match_playbook: error_type + keyword + 冷卻 + max_retries 保護
  - _execute_playbook: DOCKER_RESTART/SSH_CMD/ALERT_ONLY/WAIT_RETRY
  - _sink_to_km: 修復知識寫入 ai_insights (auto_heal_playbook)
  - SSH 白名單:僅允許 docker restart / compose restart / docker start

修改元件:
- database/manager.py: _init_autoheal_tables() 啟動時建表+種子 PlayBook
- scheduler.py: 3 個核心任務植入 handle_exception (run_auto_import_task / run_icaim_analysis_task / run_weekly_strategy_task)
- requirements.txt: paramiko(SSH 跳板;不可用時降級 subprocess+CLI ssh)

安全設計: CMD 白名單 + cooldown + max_retries escalation + DB 冪等 migration

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
250
database/autoheal_models.py
Normal file
250
database/autoheal_models.py
Normal file
@@ -0,0 +1,250 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
AIOps 自動修復資料庫模型 (ADR-013)
|
||||
三張表:incidents / playbooks / heal_logs
|
||||
構成「感知 → 匹配 → 執行 → 記錄」的完整閉環資料層
|
||||
"""
|
||||
|
||||
import json
|
||||
from sqlalchemy import (
|
||||
Column, Integer, String, Text, Boolean, DateTime, Float, ForeignKey, Index
|
||||
)
|
||||
from datetime import datetime
|
||||
from .models import Base
|
||||
|
||||
|
||||
class Incident(Base):
    """Incident master table: one row per recorded system exception event.

    Status lifecycle: open -> healing -> resolved / escalated.
    """

    __tablename__ = "incidents"
    __table_args__ = (
        Index("idx_incident_status_created", "status", "created_at"),
        Index("idx_incident_task_error", "task_name", "error_type"),
    )

    id = Column(Integer, primary_key=True)

    # --- Origin of the incident ---
    # Name of the task that raised, e.g. run_auto_import_task.
    task_name = Column(String(100), nullable=False, index=True)
    # Error category, e.g. DB_UNREACHABLE / DNS_FAIL / OOM.
    error_type = Column(String(50), nullable=False, index=True)
    # Original exception message (short form).
    error_message = Column(Text, nullable=False)
    # Full traceback (may be large).
    error_traceback = Column(Text)

    # --- Severity and state ---
    # P1 / P2 / P3.
    severity = Column(String(5), default="P2")
    # open / healing / resolved / escalated.
    status = Column(String(20), default="open", index=True)

    # --- Playbook association (nullable) ---
    playbook_id = Column(Integer, ForeignKey("playbooks.id"), nullable=True)

    # --- Counters ---
    retry_count = Column(Integer, default=0)

    # --- Timestamps (datetime.now, i.e. naive local time) ---
    resolved_at = Column(DateTime, nullable=True)
    created_at = Column(DateTime, default=datetime.now)
    updated_at = Column(DateTime, default=datetime.now, onupdate=datetime.now)

    def to_dict(self) -> dict:
        """Return a plain-dict view of the incident.

        ``resolved_at`` and ``created_at`` are rendered as ISO-8601 strings
        (``None`` when unset). ``error_traceback`` and ``updated_at`` are not
        part of the output.
        """
        plain_fields = (
            "id",
            "task_name",
            "error_type",
            "error_message",
            "severity",
            "status",
            "playbook_id",
            "retry_count",
        )
        payload = {field: getattr(self, field) for field in plain_fields}

        resolved = self.resolved_at
        payload["resolved_at"] = resolved.isoformat() if resolved else None

        created = self.created_at
        payload["created_at"] = created.isoformat() if created else None
        return payload
|
||||
|
||||
|
||||
class Playbook(Base):
    """Playbook rule table: each row maps an error to a remediation action.

    ``match_pattern`` stores a JSON array of keywords; per the original design
    note a hit on ANY entry triggers the playbook (the matching itself is not
    implemented in this class). ``action_params`` stores a JSON object with
    the parameters needed to run the action.
    """

    __tablename__ = "playbooks"

    id = Column(Integer, primary_key=True)

    # --- Identification and classification ---
    # Human-readable name (unique).
    name = Column(String(200), nullable=False, unique=True)
    # Must correspond to Incident.error_type.
    error_type = Column(String(50), nullable=False, index=True)
    # JSON array, e.g. ["name resolution", "could not translate"].
    match_pattern = Column(Text, nullable=False)
    # Minimum severity that triggers the playbook.
    severity_min = Column(String(5), default="P3")

    # --- Action definition ---
    # SSH_CMD / DOCKER_RESTART / ALERT_ONLY / WAIT_RETRY.
    action_type = Column(String(30), nullable=False)
    # JSON object, e.g. {"container": "momo-db", "cmd": "docker restart momo-db"}.
    action_params = Column(Text)

    # --- Protection ---
    # Cooldown in minutes.
    cooldown_min = Column(Integer, default=30)
    # Escalate once this limit is reached.
    max_retries = Column(Integer, default=3)

    # --- State and statistics ---
    is_active = Column(Boolean, default=True, index=True)
    # Historical success count (accumulated automatically).
    success_count = Column(Integer, default=0)
    # Historical failure count (accumulated automatically).
    fail_count = Column(Integer, default=0)
    # Whether the playbook has already been written to KM.
    km_synced = Column(Boolean, default=False)

    created_at = Column(DateTime, default=datetime.now)
    updated_at = Column(DateTime, default=datetime.now, onupdate=datetime.now)

    @staticmethod
    def _load_json(raw, expected_type):
        """Decode ``raw`` as JSON and return it only if it is ``expected_type``.

        Returns an empty ``expected_type()`` when ``raw`` is empty/None, is not
        valid JSON, or decodes to a different JSON type. The type check matters
        because a valid-but-wrong value (e.g. a bare JSON string where an array
        is expected) would otherwise be handed to callers that iterate it.
        """
        if not raw:
            return expected_type()
        try:
            decoded = json.loads(raw)
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError; TypeError covers non-str input.
            return expected_type()
        return decoded if isinstance(decoded, expected_type) else expected_type()

    def get_match_patterns(self) -> list:
        """Return ``match_pattern`` as a Python list (``[]`` if unusable)."""
        return self._load_json(self.match_pattern, list)

    def get_action_params(self) -> dict:
        """Return ``action_params`` as a Python dict (``{}`` if unusable)."""
        return self._load_json(self.action_params, dict)

    def to_dict(self) -> dict:
        """Return a plain-dict view with the JSON columns already decoded."""
        return {
            "id": self.id,
            "name": self.name,
            "error_type": self.error_type,
            "match_pattern": self.get_match_patterns(),
            "action_type": self.action_type,
            "action_params": self.get_action_params(),
            "cooldown_min": self.cooldown_min,
            "max_retries": self.max_retries,
            "is_active": self.is_active,
            "success_count": self.success_count,
            "fail_count": self.fail_count,
        }
|
||||
|
||||
|
||||
class HealLog(Base):
    """Remediation execution log: one row per AutoHeal attempt.

    ``result`` is documented as success / failed / skipped (in cooldown);
    the column default is ``"pending"``.
    """

    __tablename__ = "heal_logs"
    __table_args__ = (
        Index("idx_heal_log_incident", "incident_id", "created_at"),
    )

    id = Column(Integer, primary_key=True)
    incident_id = Column(Integer, ForeignKey("incidents.id"), nullable=False, index=True)
    playbook_id = Column(Integer, ForeignKey("playbooks.id"), nullable=True)

    # --- What was executed ---
    action_type = Column(String(30))
    # The command actually executed, or a description of the action.
    action_detail = Column(Text)
    # success / failed / skipped.
    result = Column(String(20), default="pending", index=True)
    # Command output or error message.
    result_output = Column(Text)
    # Execution time in milliseconds.
    duration_ms = Column(Float, default=0)

    created_at = Column(DateTime, default=datetime.now)

    def to_dict(self) -> dict:
        """Return a plain-dict view; ``created_at`` becomes an ISO-8601 string or ``None``."""
        plain_fields = (
            "id",
            "incident_id",
            "playbook_id",
            "action_type",
            "action_detail",
            "result",
            "result_output",
            "duration_ms",
        )
        payload = {field: getattr(self, field) for field in plain_fields}

        created = self.created_at
        payload["created_at"] = created.isoformat() if created else None
        return payload
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────
# Default seed playbooks (inserted on first start-up)
# ─────────────────────────────────────────────────
# Each row below is:
#   (name, error_type, match keywords, severity_min,
#    action_type, action params, cooldown_min, max_retries)
# The keyword list and the params dict are serialized with json.dumps so the
# resulting dicts can be passed straight to Playbook(**seed).
SEED_PLAYBOOKS = [
    {
        "name": seed_name,
        "error_type": seed_error_type,
        "match_pattern": json.dumps(seed_keywords),
        "severity_min": seed_severity,
        "action_type": seed_action,
        "action_params": json.dumps(seed_params),
        "cooldown_min": seed_cooldown,
        "max_retries": seed_retries,
    }
    for (
        seed_name,
        seed_error_type,
        seed_keywords,
        seed_severity,
        seed_action,
        seed_params,
        seed_cooldown,
        seed_retries,
    ) in (
        (
            "Docker DNS 解析失敗修復",
            "DNS_FAIL",
            ["name resolution", "could not translate host name",
             "Temporary failure in name resolution"],
            "P2",
            "DOCKER_RESTART",
            {"container": "momo-db"},
            30,
            3,
        ),
        (
            "DB 連線被拒修復",
            "DB_UNREACHABLE",
            ["connection refused", "Connection reset by peer",
             "could not connect to server"],
            "P2",
            "DOCKER_RESTART",
            {"container": "momo-db", "compose": True},
            30,
            3,
        ),
        (
            "App OOM 自動重啟",
            "OOM",
            ["SIGKILL", "out of memory", "Worker was sent SIGKILL",
             "MemoryError"],
            "P1",
            "DOCKER_RESTART",
            {"container": "momo-pro-system"},
            60,
            2,
        ),
        (
            "Scheduler OOM 自動重啟",
            "OOM",
            ["SIGKILL", "Worker was sent SIGKILL", "MemoryError"],
            "P1",
            "DOCKER_RESTART",
            {"container": "momo-scheduler"},
            60,
            2,
        ),
        (
            "PostgreSQL SSL 連線中斷",
            "SSL_FAIL",
            ["SSL connection has been closed unexpectedly",
             "SSL SYSCALL error"],
            "P2",
            "DOCKER_RESTART",
            {"container": "momo-pro-system"},
            15,
            3,
        ),
        (
            "Google Drive 認證失敗告警",
            "AUTH_FAIL",
            ["invalid_grant", "google_token.pickle",
             "Token has been expired or revoked"],
            "P2",
            "ALERT_ONLY",
            {"message": "Google Drive OAuth Token 已過期,請人工重新認證。參閱 docs/guides/google_drive_setup.md"},
            240,
            1,
        ),
        (
            "爬蟲 HTTP 429 限流等待",
            "CRAWLER_FAIL",
            ["429 Too Many Requests", "rate limit", "Retry-After"],
            "P3",
            "WAIT_RETRY",
            {"wait_minutes": 30},
            30,
            2,
        ),
    )
]
|
||||
@@ -8,6 +8,7 @@ from .user_models import User, LoginHistory # noqa: F401 - 必須在 trend_mode
|
||||
from .edm_models import PromoProduct # V-Fix: 確保 EDM 模型被註冊,以便自動建表
|
||||
from .trend_models import TrendRecord, TrendKeyword, TrendAnalysis, WebSearchCache, TelegramUser # noqa: F401 - 趨勢資料表
|
||||
from .ai_models import AIGenerationHistory, AIInsight, AIUsageTracking, AIPromptTemplate # AI 記憶體與洞察模型
|
||||
from .autoheal_models import Incident, Playbook, HealLog # noqa: F401 - ADR-013 AIOps 自動修復表
|
||||
|
||||
# 🚩 導入優化後的日誌管理模組
|
||||
from services.logger_manager import SystemLogger
|
||||
@@ -60,6 +61,8 @@ class DatabaseManager:
|
||||
)
|
||||
self.Session = sessionmaker(bind=self.engine)
|
||||
sys_log.info(f"[Database] ✅ 使用 PostgreSQL 資料庫 (連線池已優化)")
|
||||
# ADR-013: 確保 AIOps 自動修復表存在並植入種子 PlayBook
|
||||
self._init_autoheal_tables()
|
||||
else:
|
||||
# SQLite 模式 - 向後相容
|
||||
if db_path is None:
|
||||
@@ -111,7 +114,44 @@ class DatabaseManager:
|
||||
sys_log.error(f"❌ 資料庫結構檢查失敗: {e}")
|
||||
finally:
|
||||
session.close()
|
||||
|
||||
|
||||
def _init_autoheal_tables(self):
    """ADR-013: make sure the three AIOps tables exist and seed the playbooks.

    Each missing table is created individually with
    ``Table.create(checkfirst=True)``. Tables are created parents-first
    (playbooks -> incidents -> heal_logs) because ``incidents`` has a foreign
    key to ``playbooks.id`` and ``heal_logs`` has foreign keys to both; creating
    a child before its parent fails on a fresh database.

    Seed playbooks are inserted only when the ``playbooks`` table is empty.

    Any failure is logged and swallowed so that start-up of the main
    application is not interrupted.
    """
    try:
        from .autoheal_models import Incident, Playbook, HealLog, SEED_PLAYBOOKS
        from sqlalchemy import inspect as sa_inspect

        existing_tables = set(sa_inspect(self.engine).get_table_names())

        # Order matters: referenced tables must exist before referencing ones.
        for model in (Playbook, Incident, HealLog):
            if model.__tablename__ in existing_tables:
                continue
            model.__table__.create(self.engine, checkfirst=True)
            sys_log.info(f"[Database] ✅ 建立 AIOps 表: {model.__tablename__}")

        # Seed the playbooks on first run only.
        session = self.get_session()
        try:
            count = session.query(Playbook).count()
            if count == 0:
                session.add_all([Playbook(**seed) for seed in SEED_PLAYBOOKS])
                session.commit()
                sys_log.info(f"[Database] ✅ 植入 {len(SEED_PLAYBOOKS)} 筆種子 PlayBook")
            else:
                sys_log.info(f"[Database] PlayBook 已有 {count} 筆,略過種子植入")
        except Exception as e:
            # Best effort: seeding problems must not break start-up.
            session.rollback()
            sys_log.warning(f"[Database] 種子 PlayBook 植入失敗: {e}")
        finally:
            session.close()

    except Exception as e:
        sys_log.error(f"[Database] _init_autoheal_tables 失敗 (不影響主程序): {e}")
|
||||
|
||||
def get_session(self):
|
||||
"""
|
||||
提供外部調用的 Session 實例。
|
||||
|
||||
Reference in New Issue
Block a user