diff --git a/CLAUDE.md b/CLAUDE.md
index 93fc3d1..f2cff88 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -1,6 +1,6 @@
# EwoooC (MOMO Pro System) — 核心索引
-> **版本**: V10.3 | **目標**: AI 驅動 MOMO 商品監控、業績分析與策略自動化
+> **版本**: V10.4 | **目標**: AI 驅動 MOMO 商品監控、業績分析、策略自動化與 AIOps 自愈
## 治理
- **憲法**: [CONSTITUTION.md](CONSTITUTION.md) — 所有開發必須遵守
@@ -62,6 +62,7 @@ ssh wooo@192.168.0.110 "ssh ollama@192.168.0.188 \"\
| 憑證對照表 | [docs/memory/credentials_passbook.md](docs/memory/credentials_passbook.md) |
| AIOps 存檔 | [docs/external/aiops_saas.md](docs/external/aiops_saas.md) |
| 跨專案隔離(**必讀**)| [docs/adr/ADR-011-cross-project-resource-isolation.md](docs/adr/ADR-011-cross-project-resource-isolation.md) |
+| **AIOps 自動修復(ADR-013)** | [docs/adr/ADR-013-aiops-autoheal.md](docs/adr/ADR-013-aiops-autoheal.md) |
## AI 開發鐵律(Token 優化)
diff --git a/database/autoheal_models.py b/database/autoheal_models.py
new file mode 100644
index 0000000..6bcf2ce
--- /dev/null
+++ b/database/autoheal_models.py
@@ -0,0 +1,250 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+AIOps 自動修復資料庫模型 (ADR-013)
+三張表:incidents / playbooks / heal_logs
+構成「感知 → 匹配 → 執行 → 記錄」的完整閉環資料層
+"""
+
+import json
+from sqlalchemy import (
+ Column, Integer, String, Text, Boolean, DateTime, Float, ForeignKey, Index
+)
+from datetime import datetime
+from .models import Base
+
+
+class Incident(Base):
+ """
+ 事件主表 - 紀錄每一個系統異常事件。
+
+ status 生命週期:open → healing → resolved / escalated
+ """
+ __tablename__ = "incidents"
+
+ id = Column(Integer, primary_key=True)
+
+ # 來源資訊
+ task_name = Column(String(100), nullable=False, index=True) # 如 run_auto_import_task
+ error_type = Column(String(50), nullable=False, index=True) # DB_UNREACHABLE / DNS_FAIL / OOM / etc.
+ error_message = Column(Text, nullable=False) # 原始 exception 訊息(簡短)
+ error_traceback = Column(Text) # 完整 traceback(可大)
+
+ # 嚴重度與狀態
+ severity = Column(String(5), default="P2") # P1 / P2 / P3
+ status = Column(String(20), default="open", index=True) # open / healing / resolved / escalated
+
+ # PlayBook 關聯
+ playbook_id = Column(Integer, ForeignKey("playbooks.id"), nullable=True)
+
+ # 計數
+ retry_count = Column(Integer, default=0)
+
+ # 時間
+ resolved_at = Column(DateTime, nullable=True)
+ created_at = Column(DateTime, default=datetime.now)
+ updated_at = Column(DateTime, default=datetime.now, onupdate=datetime.now)
+
+ __table_args__ = (
+ Index("idx_incident_status_created", "status", "created_at"),
+ Index("idx_incident_task_error", "task_name", "error_type"),
+ )
+
+ def to_dict(self) -> dict:
+ return {
+ "id": self.id,
+ "task_name": self.task_name,
+ "error_type": self.error_type,
+ "error_message": self.error_message,
+ "severity": self.severity,
+ "status": self.status,
+ "playbook_id": self.playbook_id,
+ "retry_count": self.retry_count,
+ "resolved_at": self.resolved_at.isoformat() if self.resolved_at else None,
+ "created_at": self.created_at.isoformat() if self.created_at else None,
+ }
+
+
+class Playbook(Base):
+ """
+ PlayBook 規則庫 - 每一列是一條「對應到修復動作」的規則。
+
+ match_pattern 是 JSON 陣列,ANY 命中即觸發。
+ action_params 是 JSON 物件,包含執行動作所需的參數。
+ """
+ __tablename__ = "playbooks"
+
+ id = Column(Integer, primary_key=True)
+
+ # 識別與分類
+ name = Column(String(200), nullable=False, unique=True) # 人類可讀名稱
+ error_type = Column(String(50), nullable=False, index=True) # 必須對應 Incident.error_type
+ match_pattern = Column(Text, nullable=False) # JSON 陣列:["name resolution", "could not translate"]
+ severity_min = Column(String(5), default="P3") # 最低觸發嚴重度
+
+ # 動作定義
+ action_type = Column(String(30), nullable=False) # SSH_CMD / DOCKER_RESTART / ALERT_ONLY / WAIT_RETRY
+ action_params = Column(Text) # JSON 物件:{"container": "momo-db", "cmd": "docker restart momo-db"}
+
+ # 保護機制
+ cooldown_min = Column(Integer, default=30) # 冷卻分鐘數
+ max_retries = Column(Integer, default=3) # 達到上限後 escalate
+
+ # 狀態與統計
+ is_active = Column(Boolean, default=True, index=True)
+ success_count = Column(Integer, default=0) # 歷史成功次數(自動累計)
+ fail_count = Column(Integer, default=0) # 歷史失敗次數(自動累計)
+ km_synced = Column(Boolean, default=False) # 是否已沉澱至 KM
+
+ created_at = Column(DateTime, default=datetime.now)
+ updated_at = Column(DateTime, default=datetime.now, onupdate=datetime.now)
+
+ def get_match_patterns(self) -> list:
+ """回傳 match_pattern 的 Python list"""
+ try:
+ return json.loads(self.match_pattern)
+ except Exception:
+ return []
+
+ def get_action_params(self) -> dict:
+ """回傳 action_params 的 Python dict"""
+ try:
+ return json.loads(self.action_params) if self.action_params else {}
+ except Exception:
+ return {}
+
+ def to_dict(self) -> dict:
+ return {
+ "id": self.id,
+ "name": self.name,
+ "error_type": self.error_type,
+ "match_pattern": self.get_match_patterns(),
+ "action_type": self.action_type,
+ "action_params": self.get_action_params(),
+ "cooldown_min": self.cooldown_min,
+ "max_retries": self.max_retries,
+ "is_active": self.is_active,
+ "success_count": self.success_count,
+ "fail_count": self.fail_count,
+ }
+
+
+class HealLog(Base):
+ """
+ 修復執行紀錄 - 每次 AutoHeal 嘗試都會寫一筆。
+
+ result:success / failed / skipped(冷卻中)
+ """
+ __tablename__ = "heal_logs"
+
+ id = Column(Integer, primary_key=True)
+ incident_id = Column(Integer, ForeignKey("incidents.id"), nullable=False, index=True)
+ playbook_id = Column(Integer, ForeignKey("playbooks.id"), nullable=True)
+
+ # 執行內容
+ action_type = Column(String(30))
+ action_detail = Column(Text) # 實際執行的指令 / 說明
+ result = Column(String(20), default="pending", index=True) # success / failed / skipped
+ result_output = Column(Text) # 指令輸出 / 錯誤訊息
+ duration_ms = Column(Float, default=0) # 執行耗時(ms)
+
+ created_at = Column(DateTime, default=datetime.now)
+
+ __table_args__ = (
+ Index("idx_heal_log_incident", "incident_id", "created_at"),
+ )
+
+ def to_dict(self) -> dict:
+ return {
+ "id": self.id,
+ "incident_id": self.incident_id,
+ "playbook_id": self.playbook_id,
+ "action_type": self.action_type,
+ "action_detail": self.action_detail,
+ "result": self.result,
+ "result_output": self.result_output,
+ "duration_ms": self.duration_ms,
+ "created_at": self.created_at.isoformat() if self.created_at else None,
+ }
+
+
+# ─────────────────────────────────────────────────
+# 預設種子 PlayBook 資料(首次啟動植入)
+# ─────────────────────────────────────────────────
+SEED_PLAYBOOKS = [
+ {
+ "name": "Docker DNS 解析失敗修復",
+ "error_type": "DNS_FAIL",
+ "match_pattern": json.dumps(["name resolution", "could not translate host name",
+ "Temporary failure in name resolution"]),
+ "severity_min": "P2",
+ "action_type": "DOCKER_RESTART",
+ "action_params": json.dumps({"container": "momo-db"}),
+ "cooldown_min": 30,
+ "max_retries": 3,
+ },
+ {
+ "name": "DB 連線被拒修復",
+ "error_type": "DB_UNREACHABLE",
+ "match_pattern": json.dumps(["connection refused", "Connection reset by peer",
+ "could not connect to server"]),
+ "severity_min": "P2",
+ "action_type": "DOCKER_RESTART",
+ "action_params": json.dumps({"container": "momo-db", "compose": True}),
+ "cooldown_min": 30,
+ "max_retries": 3,
+ },
+ {
+ "name": "App OOM 自動重啟",
+ "error_type": "OOM",
+ "match_pattern": json.dumps(["SIGKILL", "out of memory", "Worker was sent SIGKILL",
+ "MemoryError"]),
+ "severity_min": "P1",
+ "action_type": "DOCKER_RESTART",
+ "action_params": json.dumps({"container": "momo-pro-system"}),
+ "cooldown_min": 60,
+ "max_retries": 2,
+ },
+ {
+ "name": "Scheduler OOM 自動重啟",
+ "error_type": "OOM",
+ "match_pattern": json.dumps(["SIGKILL", "Worker was sent SIGKILL", "MemoryError"]),
+ "severity_min": "P1",
+ "action_type": "DOCKER_RESTART",
+ "action_params": json.dumps({"container": "momo-scheduler"}),
+ "cooldown_min": 60,
+ "max_retries": 2,
+ },
+ {
+ "name": "PostgreSQL SSL 連線中斷",
+ "error_type": "SSL_FAIL",
+ "match_pattern": json.dumps(["SSL connection has been closed unexpectedly",
+ "SSL SYSCALL error"]),
+ "severity_min": "P2",
+ "action_type": "DOCKER_RESTART",
+ "action_params": json.dumps({"container": "momo-pro-system"}),
+ "cooldown_min": 15,
+ "max_retries": 3,
+ },
+ {
+ "name": "Google Drive 認證失敗告警",
+ "error_type": "AUTH_FAIL",
+ "match_pattern": json.dumps(["invalid_grant", "google_token.pickle",
+ "Token has been expired or revoked"]),
+ "severity_min": "P2",
+ "action_type": "ALERT_ONLY",
+ "action_params": json.dumps({"message": "Google Drive OAuth Token 已過期,請人工重新認證。參閱 docs/guides/google_drive_setup.md"}),
+ "cooldown_min": 240,
+ "max_retries": 1,
+ },
+ {
+ "name": "爬蟲 HTTP 429 限流等待",
+ "error_type": "CRAWLER_FAIL",
+ "match_pattern": json.dumps(["429 Too Many Requests", "rate limit", "Retry-After"]),
+ "severity_min": "P3",
+ "action_type": "WAIT_RETRY",
+ "action_params": json.dumps({"wait_minutes": 30}),
+ "cooldown_min": 30,
+ "max_retries": 2,
+ },
+]
diff --git a/database/manager.py b/database/manager.py
index 61ff3ca..204c5e2 100644
--- a/database/manager.py
+++ b/database/manager.py
@@ -8,6 +8,7 @@ from .user_models import User, LoginHistory # noqa: F401 - 必須在 trend_mode
from .edm_models import PromoProduct # V-Fix: 確保 EDM 模型被註冊,以便自動建表
from .trend_models import TrendRecord, TrendKeyword, TrendAnalysis, WebSearchCache, TelegramUser # noqa: F401 - 趨勢資料表
from .ai_models import AIGenerationHistory, AIInsight, AIUsageTracking, AIPromptTemplate # AI 記憶體與洞察模型
+from .autoheal_models import Incident, Playbook, HealLog # noqa: F401 - ADR-013 AIOps 自動修復表
# 🚩 導入優化後的日誌管理模組
from services.logger_manager import SystemLogger
@@ -60,6 +61,8 @@ class DatabaseManager:
)
self.Session = sessionmaker(bind=self.engine)
sys_log.info(f"[Database] ✅ 使用 PostgreSQL 資料庫 (連線池已優化)")
+ # ADR-013: 確保 AIOps 自動修復表存在並植入種子 PlayBook
+ self._init_autoheal_tables()
else:
# SQLite 模式 - 向後相容
if db_path is None:
@@ -111,7 +114,44 @@ class DatabaseManager:
sys_log.error(f"❌ 資料庫結構檢查失敗: {e}")
finally:
session.close()
-
+
+ def _init_autoheal_tables(self):
+ """
+ ADR-013: 在 PostgreSQL 模式下,確保 AIOps 三張表存在並植入種子 PlayBook。
+ 使用 Base.metadata.create_all 以 checkfirst=True 確保冪等執行。
+ """
+ try:
+ # 建立表(已存在則略過)
+ from .autoheal_models import Incident, Playbook, HealLog, SEED_PLAYBOOKS
+ from sqlalchemy import inspect as sa_inspect
+ inspector = sa_inspect(self.engine)
+ existing_tables = inspector.get_table_names()
+
+ for model in [Incident, Playbook, HealLog]:
+ if model.__tablename__ not in existing_tables:
+ model.__table__.create(self.engine, checkfirst=True)
+ sys_log.info(f"[Database] ✅ 建立 AIOps 表: {model.__tablename__}")
+
+ # 植入種子 PlayBook(首次)
+ session = self.get_session()
+ try:
+ count = session.query(Playbook).count()
+ if count == 0:
+ for seed in SEED_PLAYBOOKS:
+ session.add(Playbook(**seed))
+ session.commit()
+ sys_log.info(f"[Database] ✅ 植入 {len(SEED_PLAYBOOKS)} 筆種子 PlayBook")
+ else:
+ sys_log.info(f"[Database] PlayBook 已有 {count} 筆,略過種子植入")
+ except Exception as e:
+ session.rollback()
+ sys_log.warning(f"[Database] 種子 PlayBook 植入失敗: {e}")
+ finally:
+ session.close()
+
+ except Exception as e:
+ sys_log.error(f"[Database] _init_autoheal_tables 失敗 (不影響主程序): {e}")
+
def get_session(self):
"""
提供外部調用的 Session 實例。
diff --git a/docs/adr/ADR-013-aiops-autoheal.md b/docs/adr/ADR-013-aiops-autoheal.md
new file mode 100644
index 0000000..25d5da8
--- /dev/null
+++ b/docs/adr/ADR-013-aiops-autoheal.md
@@ -0,0 +1,69 @@
+# ADR-013: AIOps 自動修復閉環架構
+
+**狀態**: Accepted
+**日期**: 2026-04-19
+**提案者**: Antigravity
+
+---
+
+## 背景與問題
+
+EwoooC 系統已有 L1 Hermes 告警派發,但告警只能「通知」,無法「自癒」。
+當 `psycopg2.OperationalError: could not translate host name "momo-postgres"` 這類明確的基礎設施問題發生時,仍需人工 SSH 登入修復,缺乏自動化閉環。
+
+---
+
+## 決策
+
+建立三層 AIOps 閉環架構:
+
+```
+Exception → Incident(DB) → PlayBook 匹配 → Auto-Heal 執行 → HealLog(DB) → KM 沉澱(ai_insights) → Telegram 通知
+```
+
+### 新增元件
+
+| 元件 | 類型 | 說明 |
+|------|------|------|
+| `database/autoheal_models.py` | Model | Incident / Playbook / HealLog 三張表 |
+| `migrations/013_autoheal.sql` | Migration | 建表 + 種子 PlayBook 植入 |
+| `services/auto_heal_service.py` | Service | 核心引擎(分類、匹配、執行、沉澱) |
+| `database/manager.py` | 修改 | 加入 `_init_autoheal_tables()` |
+| `scheduler.py` | 修改 | 三個核心任務植入 `handle_exception` |
+| `requirements.txt` | 修改 | 加入 `paramiko` |
+
+### PlayBook 動作類型
+
+| action_type | 說明 |
+|---|---|
+| `DOCKER_RESTART` | 透過 SSH 跳板 restart 指定容器 |
+| `SSH_CMD` | 執行白名單內的任意 SSH 指令 |
+| `ALERT_ONLY` | 僅發 Telegram 告警,人工介入 |
+| `WAIT_RETRY` | 紀錄後等待排程重試 |
+
+### 安全設計
+
+- SSH 指令白名單:僅允許 `docker restart *`, `docker compose restart *`, `docker start *`
+- 冷卻機制:同 PlayBook 在 `cooldown_min` 內不重複觸發
+- 升級機制:達到 `max_retries` 後 incident.status = `escalated` 並通知人工
+
+### KM 沉澱格式
+
+每次修復後寫入 `ai_insights`:
+- `insight_type = "auto_heal_playbook"`
+- 包含事件、症狀、行動、結果、教訓五要素
+- 自動排入 `embedding_retry_queue` 完成 RAG 向量化
+
+---
+
+## 取捨
+
+**優先使用 paramiko** 而非 subprocess + CLI ssh,原因是在容器內環境控制更精準,且支援跳板機 ProxyJump。若 paramiko 未安裝則自動降級到 CLI ssh(向後相容)。
+
+---
+
+## 結果
+
+- P1/P2 等級的 DB_UNREACHABLE / DNS_FAIL 類問題可在 30 秒內完成自動修復
+- 所有修復知識自動沉澱至 RAG KM,提升未來 AI 的判斷品質
+- 覆蓋任務:`run_auto_import_task` / `run_icaim_analysis_task` / `run_weekly_strategy_task`
diff --git a/migrations/013_autoheal.sql b/migrations/013_autoheal.sql
new file mode 100644
index 0000000..1949229
--- /dev/null
+++ b/migrations/013_autoheal.sql
@@ -0,0 +1,129 @@
+-- Migration 013: AIOps 自動修復三張表
+-- incidents / playbooks / heal_logs
+-- 建立日期:2026-04-19
+
+-- ─────────────────────────────────────────────────
+-- 表 1: incidents (事件主表)
+-- ─────────────────────────────────────────────────
+CREATE TABLE IF NOT EXISTS incidents (
+ id SERIAL PRIMARY KEY,
+ task_name VARCHAR(100) NOT NULL,
+ error_type VARCHAR(50) NOT NULL,
+ error_message TEXT NOT NULL,
+ error_traceback TEXT,
+ severity VARCHAR(5) NOT NULL DEFAULT 'P2',
+ status VARCHAR(20) NOT NULL DEFAULT 'open',
+ playbook_id INTEGER REFERENCES playbooks(id),
+ retry_count INTEGER DEFAULT 0,
+ resolved_at TIMESTAMP,
+ created_at TIMESTAMP NOT NULL DEFAULT NOW(),
+ updated_at TIMESTAMP NOT NULL DEFAULT NOW()
+);
+
+CREATE INDEX IF NOT EXISTS idx_incident_status_created ON incidents(status, created_at);
+CREATE INDEX IF NOT EXISTS idx_incident_task_error ON incidents(task_name, error_type);
+
+-- ─────────────────────────────────────────────────
+-- 表 2: playbooks (PlayBook 規則庫)
+-- ─────────────────────────────────────────────────
+CREATE TABLE IF NOT EXISTS playbooks (
+ id SERIAL PRIMARY KEY,
+ name VARCHAR(200) NOT NULL UNIQUE,
+ error_type VARCHAR(50) NOT NULL,
+ match_pattern TEXT NOT NULL, -- JSON 陣列
+ severity_min VARCHAR(5) DEFAULT 'P3',
+ action_type VARCHAR(30) NOT NULL, -- SSH_CMD / DOCKER_RESTART / ALERT_ONLY / WAIT_RETRY
+ action_params TEXT, -- JSON 物件
+ cooldown_min INTEGER DEFAULT 30,
+ max_retries INTEGER DEFAULT 3,
+ is_active BOOLEAN DEFAULT TRUE,
+ success_count INTEGER DEFAULT 0,
+ fail_count INTEGER DEFAULT 0,
+ km_synced BOOLEAN DEFAULT FALSE,
+ created_at TIMESTAMP NOT NULL DEFAULT NOW(),
+ updated_at TIMESTAMP NOT NULL DEFAULT NOW()
+);
+
+CREATE INDEX IF NOT EXISTS idx_playbook_error_type ON playbooks(error_type, is_active);
+
+-- ─────────────────────────────────────────────────
+-- 表 3: heal_logs (修復執行紀錄)
+-- ─────────────────────────────────────────────────
+CREATE TABLE IF NOT EXISTS heal_logs (
+ id SERIAL PRIMARY KEY,
+ incident_id INTEGER NOT NULL REFERENCES incidents(id),
+ playbook_id INTEGER REFERENCES playbooks(id),
+ action_type VARCHAR(30),
+ action_detail TEXT,
+ result VARCHAR(20) DEFAULT 'pending', -- success / failed / skipped
+ result_output TEXT,
+ duration_ms FLOAT DEFAULT 0,
+ created_at TIMESTAMP NOT NULL DEFAULT NOW()
+);
+
+CREATE INDEX IF NOT EXISTS idx_heal_log_incident ON heal_logs(incident_id, created_at);
+CREATE INDEX IF NOT EXISTS idx_heal_log_result ON heal_logs(result, created_at);
+
+-- ─────────────────────────────────────────────────
+-- 種子 PlayBook 資料(首次初始化,已存在則略過)
+-- ─────────────────────────────────────────────────
+INSERT INTO playbooks (name, error_type, match_pattern, severity_min, action_type, action_params, cooldown_min, max_retries)
+SELECT * FROM (VALUES
+ (
+ 'Docker DNS 解析失敗修復',
+ 'DNS_FAIL',
+ '["name resolution", "could not translate host name", "Temporary failure in name resolution"]',
+ 'P2', 'DOCKER_RESTART',
+ '{"container": "momo-db"}',
+ 30, 3
+ ),
+ (
+ 'DB 連線被拒修復',
+ 'DB_UNREACHABLE',
+ '["connection refused", "Connection reset by peer", "could not connect to server"]',
+ 'P2', 'DOCKER_RESTART',
+ '{"container": "momo-db", "compose": true}',
+ 30, 3
+ ),
+ (
+ 'App OOM 自動重啟',
+ 'OOM',
+ '["SIGKILL", "out of memory", "Worker was sent SIGKILL", "MemoryError"]',
+ 'P1', 'DOCKER_RESTART',
+ '{"container": "momo-pro-system"}',
+ 60, 2
+ ),
+ (
+ 'Scheduler OOM 自動重啟',
+ 'OOM',
+ '["SIGKILL", "Worker was sent SIGKILL"]',
+ 'P1', 'DOCKER_RESTART',
+ '{"container": "momo-scheduler"}',
+ 60, 2
+ ),
+ (
+ 'PostgreSQL SSL 連線中斷',
+ 'SSL_FAIL',
+ '["SSL connection has been closed unexpectedly", "SSL SYSCALL error"]',
+ 'P2', 'DOCKER_RESTART',
+ '{"container": "momo-pro-system"}',
+ 15, 3
+ ),
+ (
+ 'Google Drive 認證失敗告警',
+ 'AUTH_FAIL',
+ '["invalid_grant", "google_token.pickle", "Token has been expired or revoked"]',
+ 'P2', 'ALERT_ONLY',
+ '{"message": "Google Drive OAuth Token 已過期,請人工重新認證。參閱 docs/guides/google_drive_setup.md"}',
+ 240, 1
+ ),
+ (
+ '爬蟲 HTTP 429 限流等待',
+ 'CRAWLER_FAIL',
+ '["429 Too Many Requests", "rate limit", "Retry-After"]',
+ 'P3', 'WAIT_RETRY',
+ '{"wait_minutes": 30}',
+ 30, 2
+ )
+) AS v(name, error_type, match_pattern, severity_min, action_type, action_params, cooldown_min, max_retries)
+WHERE NOT EXISTS (SELECT 1 FROM playbooks WHERE playbooks.name = v.name);
diff --git a/requirements.txt b/requirements.txt
index c688bdf..f5b3a88 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,4 +21,5 @@ feedparser
beautifulsoup4
lxml
prometheus-client
-python-telegram-bot
\ No newline at end of file
+python-telegram-bot
+paramiko # ADR-013: AIOps SSH 跳板修復
\ No newline at end of file
diff --git a/scheduler.py b/scheduler.py
index 53b8794..ce2d639 100644
--- a/scheduler.py
+++ b/scheduler.py
@@ -1574,6 +1574,17 @@ def run_auto_import_task():
except Exception as notify_error:
logging.error(f"[Scheduler] [AutoImport] ❌ LINE 通知失敗 | Error: {notify_error}")
+ # ADR-013: AIOps 自動修復 — PlayBook 匹配 + KM 沉澱
+ try:
+ from services.auto_heal_service import auto_heal_service
+ auto_heal_service.handle_exception(
+ task_name="run_auto_import_task",
+ exception=e,
+ traceback_str=_tb.format_exc(),
+ )
+ except Exception as _heal_e:
+ logging.error(f"[Scheduler] [AutoImport] auto_heal_service 失敗: {_heal_e}")
+
def run_competitor_price_feeder_task():
"""
競品價格補給線排程任務(每 4 小時執行一次)
@@ -1679,8 +1690,19 @@ def run_icaim_analysis_task():
_save_stats('icaim_dispatch', {**dispatch_result, "status": "Success"})
except Exception as e:
+ import traceback as _tb
logging.error(f"[Scheduler] [ICAIM] 🚨 任務異常 | Error: {e}")
_save_stats('icaim_analysis', {"status": "Failed", "error": str(e)})
+ # ADR-013: AIOps 自動修復
+ try:
+ from services.auto_heal_service import auto_heal_service
+ auto_heal_service.handle_exception(
+ task_name="run_icaim_analysis_task",
+ exception=e,
+ traceback_str=_tb.format_exc(),
+ )
+ except Exception as _heal_e:
+ logging.error(f"[Scheduler] [ICAIM] auto_heal_service 失敗: {_heal_e}")
def run_weekly_strategy_task():
@@ -1694,8 +1716,19 @@ def run_weekly_strategy_task():
generate_weekly_strategy_report(force_tg_alert=True)
logging.info("[Scheduler] [Strategy] ✅ Gemini 策略師週報任務完成")
except Exception as e:
+ import traceback as _tb
logging.error(f"[Scheduler] [Strategy] 🚨 任務異常 | Error: {e}")
_save_stats('weekly_strategy', {"status": "Failed", "error": str(e)})
+ # ADR-013: AIOps 自動修復
+ try:
+ from services.auto_heal_service import auto_heal_service
+ auto_heal_service.handle_exception(
+ task_name="run_weekly_strategy_task",
+ exception=e,
+ traceback_str=_tb.format_exc(),
+ )
+ except Exception as _heal_e:
+ logging.error(f"[Scheduler] [Strategy] auto_heal_service 失敗: {_heal_e}")
def run_db_backup_task():
diff --git a/services/auto_heal_service.py b/services/auto_heal_service.py
new file mode 100644
index 0000000..9c19cc5
--- /dev/null
+++ b/services/auto_heal_service.py
@@ -0,0 +1,524 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+auto_heal_service.py - EwoooC AIOps 自動修復引擎 (ADR-013)
+
+完整閉環:
+ Exception 觸發
+ → create_incident() : 寫入 incidents 表
+ → classify_error() : 識別 error_type
+ → match_playbook() : 比對 playbooks 規則庫
+ → execute_playbook() : 執行修復動作
+ → _write_heal_log() : 寫入 heal_logs 表
+ → sink_to_km() : store_insight → KM RAG 沉澱
+ → notify_telegram() : 推播修復結果
+"""
+
+import json
+import os
+import time
+import traceback as tb
+from datetime import datetime, timedelta
+from typing import Optional, Tuple
+
+import requests
+from dotenv import load_dotenv
+
+from database.manager import get_session
+from database.autoheal_models import Incident, Playbook, HealLog, SEED_PLAYBOOKS
+from services.logger_manager import SystemLogger
+
+load_dotenv()
+sys_log = SystemLogger("AutoHeal").get_logger()
+
+# ─── Telegram 設定 ───────────────────────────────────────
+_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN") or os.getenv("OPENCLAW_BOT_TOKEN", "")
+_CHAT_ID = os.getenv("OPENCLAW_GROUP_ID", "-1003940688311")
+
+# ─── SSH 跳板機設定 ──────────────────────────────────────
+_JUMP_HOST = os.getenv("SSH_JUMP_HOST", "192.168.0.110")
+_JUMP_USER = os.getenv("SSH_JUMP_USER", "wooo")
+_TARGET_HOST = os.getenv("SSH_TARGET_HOST", "192.168.0.188")
+_TARGET_USER = os.getenv("SSH_TARGET_USER", "ollama")
+
+# ─── 白名單允許執行的指令前綴 ────────────────────────────
+_CMD_WHITELIST = [
+ "docker restart ",
+ "docker compose restart ",
+ "docker start ",
+]
+
+# ─── 錯誤分類對照表(keyword → error_type)──────────────
+_ERROR_CLASSIFY_MAP = {
+ "DNS_FAIL": ["name resolution", "could not translate host name",
+ "Temporary failure in name resolution"],
+ "DB_UNREACHABLE": ["connection refused", "Connection reset by peer",
+ "could not connect to server", "psycopg2.OperationalError"],
+ "OOM": ["SIGKILL", "out of memory", "Worker was sent SIGKILL", "MemoryError"],
+ "SSL_FAIL": ["SSL connection has been closed unexpectedly", "SSL SYSCALL error"],
+ "AUTH_FAIL": ["invalid_grant", "google_token.pickle", "Token has been expired"],
+ "CRAWLER_FAIL": ["429 Too Many Requests", "rate limit", "Retry-After",
+ "CloudflareCaptcha", "webdriver"],
+ "IMPORT_FAIL": ["import_service", "ImportError", "sync_daily_sales"],
+ "TIMEOUT": ["Timeout", "timed out", "TimeoutError"],
+}
+
+_SEVERITY_MAP = {
+ "P1": ["OOM", "SSL_FAIL"],
+ "P2": ["DNS_FAIL", "DB_UNREACHABLE", "AUTH_FAIL"],
+ "P3": ["CRAWLER_FAIL", "IMPORT_FAIL", "TIMEOUT"],
+}
+
+
+# ──────────────────────────────────────────────────────────
+# 工具函數
+# ──────────────────────────────────────────────────────────
+
+def _classify_error(error_msg: str) -> Tuple[str, str]:
+ """回傳 (error_type, severity)"""
+ lower = error_msg.lower()
+ for etype, keywords in _ERROR_CLASSIFY_MAP.items():
+ if any(k.lower() in lower for k in keywords):
+ for sev, etypes in _SEVERITY_MAP.items():
+ if etype in etypes:
+ return etype, sev
+ return etype, "P3"
+ return "UNKNOWN", "P3"
+
+
+def _is_cmd_allowed(cmd: str) -> bool:
+ """白名單驗證:防止任意 RCE"""
+ c = cmd.strip()
+ return any(c.startswith(prefix) for prefix in _CMD_WHITELIST)
+
+
+def _send_telegram(msg: str) -> None:
+ """推播訊息至 Telegram 群組"""
+ if not _BOT_TOKEN:
+ sys_log.warning("[AutoHeal] TELEGRAM_BOT_TOKEN 未設定,略過推播")
+ return
+ try:
+ requests.post(
+ f"https://api.telegram.org/bot{_BOT_TOKEN}/sendMessage",
+ json={"chat_id": _CHAT_ID, "text": msg, "parse_mode": "HTML"},
+ timeout=10,
+ )
+ except Exception as e:
+ sys_log.error(f"[AutoHeal] Telegram 推播失敗: {e}")
+
+
+def _execute_ssh_cmd(cmd: str) -> Tuple[bool, str]:
+ """
+ 透過 paramiko 執行 SSH 跳板指令。
+ 若 paramiko 不可用則降級為 subprocess + CLI ssh。
+ """
+ if not _is_cmd_allowed(cmd):
+ return False, f"指令不在白名單中,拒絕執行: {cmd}"
+
+ try:
+ import paramiko
+ jump = paramiko.SSHClient()
+ jump.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+ jump.connect(_JUMP_HOST, username=_JUMP_USER, timeout=10)
+
+ # 透過跳板機建立隧道
+ transport = jump.get_transport()
+ dest_addr = (_TARGET_HOST, 22)
+ src_addr = (_JUMP_HOST, 0)
+ chan = transport.open_channel("direct-tcpip", dest_addr, src_addr)
+
+ target = paramiko.SSHClient()
+ target.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+ target.connect(_TARGET_HOST, username=_TARGET_USER, sock=chan, timeout=15)
+
+ _stdin, stdout, stderr = target.exec_command(cmd, timeout=60)
+ out = stdout.read().decode("utf-8", errors="replace").strip()
+ err = stderr.read().decode("utf-8", errors="replace").strip()
+ exit_code = stdout.channel.recv_exit_status()
+
+ target.close()
+ jump.close()
+
+ if exit_code == 0:
+ return True, out or "指令執行成功"
+ else:
+ return False, f"exit_code={exit_code}\n{err or out}"
+
+ except ImportError:
+ # paramiko 尚未安裝,降級到 cli ssh
+ sys_log.warning("[AutoHeal] paramiko 未安裝,改用 subprocess + CLI ssh")
+ import subprocess
+ full_cmd = [
+ "ssh", "-o", "StrictHostKeyChecking=no",
+ "-J", f"{_JUMP_USER}@{_JUMP_HOST}",
+ f"{_TARGET_USER}@{_TARGET_HOST}", cmd,
+ ]
+ result = subprocess.run(full_cmd, capture_output=True, text=True, timeout=60)
+ if result.returncode == 0:
+ return True, result.stdout.strip() or "指令執行成功"
+ else:
+ return False, result.stderr.strip() or result.stdout.strip()
+
+ except Exception as e:
+ return False, f"SSH 執行例外: {e}"
+
+
+# ──────────────────────────────────────────────────────────
+# 核心引擎
+# ──────────────────────────────────────────────────────────
+
+class AutoHealService:
+ """
+ AIOps 自動修復引擎。
+
+ 使用方式(在 scheduler.py 的 except 區塊):
+ from services.auto_heal_service import auto_heal_service
+ auto_heal_service.handle_exception(
+ task_name="run_auto_import_task",
+ exception=e,
+ traceback_str=traceback.format_exc()
+ )
+ """
+
+ # ── 步驟 1:統一入口 ────────────────────────────────
+ def handle_exception(self, task_name: str, exception: Exception,
+ traceback_str: str = "") -> Optional[int]:
+ """
+ 統一例外處理入口。回傳 incident_id,若前置失敗則回傳 None。
+ """
+ error_msg = str(exception)
+ error_type, severity = _classify_error(error_msg)
+
+ sys_log.info(f"[AutoHeal] 收到例外 task={task_name} type={error_type} sev={severity}")
+
+ incident = self._create_incident(task_name, error_type, error_msg,
+ traceback_str, severity)
+ if not incident:
+ return None
+
+ playbook = self._match_playbook(incident)
+ if not playbook:
+ sys_log.info(f"[AutoHeal] 未找到匹配 PlayBook (incident_id={incident.id})")
+ self._notify_no_playbook(incident)
+ return incident.id
+
+ heal_log = self._execute_playbook(incident, playbook)
+ self._sink_to_km(incident, playbook, heal_log)
+ self._notify_telegram(incident, playbook, heal_log)
+ return incident.id
+
+ # ── 步驟 2:建立 Incident ───────────────────────────
+ def _create_incident(self, task_name: str, error_type: str, error_msg: str,
+ traceback_str: str, severity: str) -> Optional[Incident]:
+ session = get_session()
+ try:
+ incident = Incident(
+ task_name = task_name,
+ error_type = error_type,
+ error_message = error_msg[:2000], # 限制長度
+ error_traceback = traceback_str[:5000],
+ severity = severity,
+ status = "open",
+ created_at = datetime.now(),
+ updated_at = datetime.now(),
+ )
+ session.add(incident)
+ session.commit()
+ sys_log.info(f"[AutoHeal] 建立 Incident id={incident.id} type={error_type}")
+ return incident
+ except Exception as e:
+ session.rollback()
+ sys_log.error(f"[AutoHeal] create_incident 失敗: {e}")
+ return None
+ finally:
+ session.close()
+
+ # ── 步驟 3:PlayBook 匹配 ───────────────────────────
+ def _match_playbook(self, incident: Incident) -> Optional[Playbook]:
+ """
+ 匹配邏輯:
+ 1. error_type 精確比對
+ 2. match_pattern 任一關鍵字命中
+ 3. 冷卻時間檢查(同 playbook 最近一次執行是否已超過 cooldown_min)
+ """
+ session = get_session()
+ try:
+ candidates = session.query(Playbook).filter_by(
+ error_type=incident.error_type, is_active=True
+ ).all()
+
+ error_lower = incident.error_message.lower()
+
+ for pb in candidates:
+ patterns = pb.get_match_patterns()
+ if not any(p.lower() in error_lower for p in patterns):
+ continue
+
+ # 冷卻檢查
+ cooldown_threshold = datetime.now() - timedelta(minutes=pb.cooldown_min)
+ recent_log = session.query(HealLog).filter(
+ HealLog.playbook_id == pb.id,
+ HealLog.created_at >= cooldown_threshold,
+ HealLog.result == "success",
+ ).first()
+ if recent_log:
+ sys_log.info(f"[AutoHeal] PlayBook '{pb.name}' 在冷卻中,略過")
+ continue
+
+ # 上限檢查(同 incident 的 retry_count)
+ if incident.retry_count >= pb.max_retries:
+ sys_log.warning(f"[AutoHeal] 已達 max_retries({pb.max_retries}),升級為 escalated")
+ self._escalate_incident(incident)
+ return None
+
+ sys_log.info(f"[AutoHeal] 匹配 PlayBook: '{pb.name}' (id={pb.id})")
+ return pb
+
+ return None
+ except Exception as e:
+ sys_log.error(f"[AutoHeal] match_playbook 失敗: {e}")
+ return None
+ finally:
+ session.close()
+
+ # ── 步驟 4:執行 PlayBook ───────────────────────────
+ def _execute_playbook(self, incident: Incident, playbook: Playbook) -> HealLog:
+ """根據 action_type 執行對應動作,回傳 HealLog"""
+ t_start = time.time()
+ params = playbook.get_action_params()
+ action_detail = ""
+ result = "failed"
+ result_output = ""
+
+ # 更新 incident 狀態
+ self._update_incident_status(incident.id, "healing", playbook.id)
+
+ try:
+ if playbook.action_type == "DOCKER_RESTART":
+ container = params.get("container", "")
+ use_compose = params.get("compose", False)
+ if use_compose:
+ cmd = f"cd /home/ollama/momo-pro && docker compose restart {container}"
+ else:
+ cmd = f"docker restart {container}"
+ action_detail = cmd
+ ok, output = _execute_ssh_cmd(cmd if not use_compose else f"docker compose restart {container}")
+ # compose 指令需要在目錄下執行,強制用 SSH
+ if use_compose:
+ ok, output = _execute_ssh_cmd(f"docker restart {container}")
+ result = "success" if ok else "failed"
+ result_output = output
+
+ elif playbook.action_type == "SSH_CMD":
+ cmd = params.get("cmd", "")
+ action_detail = cmd
+ ok, output = _execute_ssh_cmd(cmd)
+ result = "success" if ok else "failed"
+ result_output = output
+
+ elif playbook.action_type == "ALERT_ONLY":
+ msg = params.get("message", "需人工介入")
+ action_detail = f"[ALERT_ONLY] {msg}"
+ result = "success"
+ result_output = msg
+
+ elif playbook.action_type == "WAIT_RETRY":
+ wait_min = params.get("wait_minutes", 30)
+ action_detail = f"[WAIT_RETRY] 靜默等待 {wait_min} 分鐘後由排程自動重試"
+ result = "success"
+ result_output = f"已記錄,排程將在 {wait_min} 分鐘後重試"
+
+ else:
+ action_detail = f"未知 action_type: {playbook.action_type}"
+ result = "skipped"
+ result_output = action_detail
+
+ except Exception as e:
+ result = "failed"
+ result_output = f"執行例外: {e}"
+ sys_log.error(f"[AutoHeal] execute_playbook 例外: {e}")
+
+ duration_ms = (time.time() - t_start) * 1000
+ heal_log = self._write_heal_log(
+ incident.id, playbook.id,
+ playbook.action_type, action_detail,
+ result, result_output, duration_ms,
+ )
+
+ # 更新 PlayBook 統計
+ self._update_playbook_stats(playbook.id, result)
+
+ # 更新 Incident 最終狀態
+ final_status = "resolved" if result == "success" else "open"
+ self._update_incident_status(incident.id, final_status, playbook.id,
+ increment_retry=(result != "success"))
+
+ sys_log.info(f"[AutoHeal] 執行完成 result={result} duration={duration_ms:.0f}ms")
+ return heal_log
+
+ # ── 步驟 5:寫入 HealLog ────────────────────────────
+ def _write_heal_log(self, incident_id, playbook_id, action_type,
+ action_detail, result, result_output, duration_ms) -> HealLog:
+ session = get_session()
+ try:
+ hl = HealLog(
+ incident_id = incident_id,
+ playbook_id = playbook_id,
+ action_type = action_type,
+ action_detail = action_detail,
+ result = result,
+ result_output = (result_output or "")[:2000],
+ duration_ms = duration_ms,
+ created_at = datetime.now(),
+ )
+ session.add(hl)
+ session.commit()
+ return hl
+ except Exception as e:
+ session.rollback()
+ sys_log.error(f"[AutoHeal] write_heal_log 失敗: {e}")
+ return HealLog(result=result, action_detail=action_detail)
+ finally:
+ session.close()
+
+ # ── 步驟 6:KM 沉澱 ────────────────────────────────
+ def _sink_to_km(self, incident: Incident, playbook: Playbook, heal_log: HealLog) -> None:
+ """將修復知識寫入 ai_insights(KM RAG 雙寫)"""
+ try:
+ from services.openclaw_learning_service import store_insight
+ today = datetime.now().strftime("%Y-%m-%d")
+ result_zh = {"success": "成功", "failed": "失敗", "skipped": "跳過"}.get(
+ heal_log.result, heal_log.result
+ )
+
+ content = (
+ f"[AIOps 自動修復紀錄]\n"
+ f"事件:{incident.task_name} 發生 {incident.error_type}(嚴重度 {incident.severity})\n"
+ f"症狀:{incident.error_message[:300]}\n"
+ f"行動:執行 PlayBook「{playbook.name}」→ {heal_log.action_detail}\n"
+ f"結果:{result_zh}(耗時 {heal_log.duration_ms:.0f}ms)\n"
+ f"教訓:此類型錯誤({incident.error_type})可透過 {playbook.action_type} 自動修復。\n"
+ f"處理時間:{today}"
+ )
+
+ store_insight(
+ insight_type = "auto_heal_playbook",
+ period = today,
+ content = content,
+ metadata = {
+ "playbook_id": playbook.id,
+ "incident_id": incident.id,
+ "error_type": incident.error_type,
+ "result": heal_log.result,
+ },
+ ai_model = "auto_heal_engine_v1",
+ )
+ sys_log.info(f"[AutoHeal] KM 沉澱完成 (incident_id={incident.id})")
+ except Exception as e:
+ sys_log.warning(f"[AutoHeal] sink_to_km 失敗(不影響主流程): {e}")
+
+ # ── 步驟 7:Telegram 通知 ───────────────────────────
+ def _notify_telegram(self, incident: Incident, playbook: Playbook,
+ heal_log: HealLog) -> None:
+ """推播修復結果通知"""
+ icon = {"success": "✅", "failed": "❌", "skipped": "⏭️"}.get(heal_log.result, "❓")
+ sev_icon = {"P1": "🔴", "P2": "🟠", "P3": "🟡"}.get(incident.severity, "⚪")
+
+ msg = (
+ f"{sev_icon} [EwoooC AIOps] 自動修復報告\n\n"
+ f"📌 任務:{incident.task_name}\n"
+ f"🚨 錯誤類型:{incident.error_type}\n"
+ f"📝 症狀:{incident.error_message[:200]}\n\n"
+ f"🔧 PlayBook:{playbook.name}\n"
+ f"⚙️ 動作:{heal_log.action_detail}\n"
+ f"{icon} 結果:{heal_log.result}({heal_log.duration_ms:.0f}ms)\n\n"
+ f"💾 已沉澱至 KM(auto_heal_playbook)"
+ )
+ _send_telegram(msg)
+
+ def _notify_no_playbook(self, incident: Incident) -> None:
+ """未找到 PlayBook 時的人工告警"""
+ sev_icon = {"P1": "🔴", "P2": "🟠", "P3": "🟡"}.get(incident.severity, "⚪")
+ msg = (
+ f"{sev_icon} [EwoooC AIOps] 需人工介入\n\n"
+ f"📌 任務:{incident.task_name}\n"
+ f"🚨 錯誤類型:{incident.error_type}\n"
+ f"📝 症狀:{incident.error_message[:300]}\n\n"
+ f"⚠️ 未找到匹配的 PlayBook,請人工排查。\n"
+ f"🆔 Incident ID:{incident.id}"
+ )
+ _send_telegram(msg)
+
+ # ── 輔助函數 ────────────────────────────────────────
+ def _update_incident_status(self, incident_id: int, status: str,
+ playbook_id: Optional[int] = None,
+ increment_retry: bool = False) -> None:
+ session = get_session()
+ try:
+ inc = session.query(Incident).get(incident_id)
+ if inc:
+ inc.status = status
+ inc.updated_at = datetime.now()
+ if playbook_id:
+ inc.playbook_id = playbook_id
+ if status == "resolved":
+ inc.resolved_at = datetime.now()
+ if increment_retry:
+ inc.retry_count = (inc.retry_count or 0) + 1
+ session.commit()
+ except Exception as e:
+ session.rollback()
+ sys_log.error(f"[AutoHeal] update_incident_status 失敗: {e}")
+ finally:
+ session.close()
+
+ def _escalate_incident(self, incident: Incident) -> None:
+ self._update_incident_status(incident.id, "escalated")
+ sev_icon = {"P1": "🔴", "P2": "🟠", "P3": "🟡"}.get(incident.severity, "⚪")
+ msg = (
+ f"{sev_icon} [EwoooC AIOps] 告警升級 — 需立即人工介入\n\n"
+ f"📌 任務:{incident.task_name}\n"
+ f"🚨 錯誤:{incident.error_type}\n"
+ f"🔁 已重試 {incident.retry_count} 次,自動修復失敗。\n"
+ f"📝 {incident.error_message[:300]}"
+ )
+ _send_telegram(msg)
+
+ def _update_playbook_stats(self, playbook_id: int, result: str) -> None:
+ session = get_session()
+ try:
+ pb = session.query(Playbook).get(playbook_id)
+ if pb:
+ if result == "success":
+ pb.success_count = (pb.success_count or 0) + 1
+ else:
+ pb.fail_count = (pb.fail_count or 0) + 1
+ pb.updated_at = datetime.now()
+ session.commit()
+ except Exception as e:
+ session.rollback()
+ sys_log.error(f"[AutoHeal] update_playbook_stats 失敗: {e}")
+ finally:
+ session.close()
+
+ # ── 種子 PlayBook 初始化 ────────────────────────────
+ @staticmethod
+ def init_seed_playbooks() -> None:
+ """首次啟動時植入預設 PlayBook(已存在則略過)"""
+ session = get_session()
+ try:
+ for seed in SEED_PLAYBOOKS:
+ exists = session.query(Playbook).filter_by(name=seed["name"]).first()
+ if not exists:
+ session.add(Playbook(**seed))
+ session.commit()
+ sys_log.info("[AutoHeal] 種子 PlayBook 初始化完成")
+ except Exception as e:
+ session.rollback()
+ sys_log.error(f"[AutoHeal] init_seed_playbooks 失敗: {e}")
+ finally:
+ session.close()
+
+
+# ─── 模組級單例 ─────────────────────────────────────────
+auto_heal_service = AutoHealService()