feat(ai-ops): ADR-013 AIOps 自動修復閉環完整實作
Some checks failed
CD Pipeline / deploy (push) Failing after 3m24s

架構(Exception → Incident → PlayBook → Heal → KM → Telegram):

新增元件:
- database/autoheal_models.py: Incident/Playbook/HealLog 三張表 + 7 條種子 PlayBook
- migrations/013_autoheal.sql: 建表 DDL + 種子資料(冪等 INSERT)
- services/auto_heal_service.py: 核心引擎 7 步閉環
  - _classify_error: 8 類錯誤自動分類 (DNS_FAIL/DB_UNREACHABLE/OOM/...)
  - _match_playbook: error_type + keyword + 冷卻 + max_retries 保護
  - _execute_playbook: DOCKER_RESTART/SSH_CMD/ALERT_ONLY/WAIT_RETRY
  - _sink_to_km: 修復知識寫入 ai_insights (auto_heal_playbook)
  - SSH 白名單:僅允許 docker restart / compose restart / docker start

修改元件:
- database/manager.py: _init_autoheal_tables() 啟動時建表+種子 PlayBook
- scheduler.py: 3 個核心任務植入 handle_exception
  (run_auto_import_task / run_icaim_analysis_task / run_weekly_strategy_task)
- requirements.txt: paramiko(SSH 跳板;不可用時降級 subprocess+CLI ssh)

安全設計: CMD 白名單 + cooldown + max_retries escalation + DB 冪等 migration

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
ogt
2026-04-19 16:03:49 +08:00
parent 7fbeaaf213
commit 77d3a1da48
8 changed files with 1050 additions and 3 deletions

129
migrations/013_autoheal.sql Normal file
View File

@@ -0,0 +1,129 @@
-- Migration 013: the three AIOps auto-heal tables
-- incidents / playbooks / heal_logs
-- Created: 2026-04-19
-- ─────────────────────────────────────────────────
-- Table 1: playbooks (PlayBook rule library)
--
-- Created BEFORE incidents on purpose: incidents.playbook_id carries a
-- foreign key to playbooks(id), and PostgreSQL requires the referenced
-- table to exist when the referencing table is created. With the tables
-- in the opposite order this migration fails on a fresh database.
-- ─────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS playbooks (
    id            SERIAL       PRIMARY KEY,
    name          VARCHAR(200) NOT NULL UNIQUE,
    error_type    VARCHAR(50)  NOT NULL,
    match_pattern TEXT         NOT NULL,            -- JSON array
    severity_min  VARCHAR(5)   DEFAULT 'P3',
    action_type   VARCHAR(30)  NOT NULL,            -- SSH_CMD / DOCKER_RESTART / ALERT_ONLY / WAIT_RETRY
    action_params TEXT,                             -- JSON object
    cooldown_min  INTEGER      DEFAULT 30,
    max_retries   INTEGER      DEFAULT 3,
    is_active     BOOLEAN      DEFAULT TRUE,
    success_count INTEGER      DEFAULT 0,
    fail_count    INTEGER      DEFAULT 0,
    km_synced     BOOLEAN      DEFAULT FALSE,
    created_at    TIMESTAMP    NOT NULL DEFAULT NOW(),
    updated_at    TIMESTAMP    NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_playbook_error_type ON playbooks(error_type, is_active);
-- ─────────────────────────────────────────────────
-- Table 2: incidents (incident master table)
--
-- playbook_id is nullable: an incident row may exist without a linked
-- playbook.
-- ─────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS incidents (
    id              SERIAL       PRIMARY KEY,
    task_name       VARCHAR(100) NOT NULL,
    error_type      VARCHAR(50)  NOT NULL,
    error_message   TEXT         NOT NULL,
    error_traceback TEXT,
    severity        VARCHAR(5)   NOT NULL DEFAULT 'P2',
    status          VARCHAR(20)  NOT NULL DEFAULT 'open',
    playbook_id     INTEGER      REFERENCES playbooks(id),
    retry_count     INTEGER      DEFAULT 0,
    resolved_at     TIMESTAMP,
    created_at      TIMESTAMP    NOT NULL DEFAULT NOW(),
    updated_at      TIMESTAMP    NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_incident_status_created ON incidents(status, created_at);
CREATE INDEX IF NOT EXISTS idx_incident_task_error ON incidents(task_name, error_type);
-- ─────────────────────────────────────────────────
-- Table 3: heal_logs (heal execution log)
--
-- Every row must point at an incident (incident_id is NOT NULL);
-- playbook_id is optional.
-- ─────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS heal_logs (
    id            SERIAL      PRIMARY KEY,
    incident_id   INTEGER     NOT NULL,
    playbook_id   INTEGER,
    action_type   VARCHAR(30),
    action_detail TEXT,
    result        VARCHAR(20) DEFAULT 'pending',   -- success / failed / skipped
    result_output TEXT,
    duration_ms   FLOAT       DEFAULT 0,
    created_at    TIMESTAMP   NOT NULL DEFAULT NOW(),
    FOREIGN KEY (incident_id) REFERENCES incidents (id),
    FOREIGN KEY (playbook_id) REFERENCES playbooks (id)
);
-- Composite indexes on (incident_id, created_at) and (result, created_at).
CREATE INDEX IF NOT EXISTS idx_heal_log_incident
    ON heal_logs (incident_id, created_at);
CREATE INDEX IF NOT EXISTS idx_heal_log_result
    ON heal_logs (result, created_at);
-- ─────────────────────────────────────────────────
-- Seed PlayBook rows (idempotent: existing rows are left untouched)
--
-- Two layers of protection keep re-runs safe:
--   1. WHERE NOT EXISTS skips any seed whose name is already present, so a
--      normal re-run attempts no inserts at all.
--   2. ON CONFLICT DO NOTHING covers the window where a concurrent session
--      inserts the same name between the existence check and the insert;
--      the row is skipped instead of raising a unique violation.
-- match_pattern holds a JSON array and action_params a JSON object, both
-- stored as text.
-- ─────────────────────────────────────────────────
INSERT INTO playbooks (
    name,
    error_type,
    match_pattern,
    severity_min,
    action_type,
    action_params,
    cooldown_min,
    max_retries
)
SELECT
    v.name,
    v.error_type,
    v.match_pattern,
    v.severity_min,
    v.action_type,
    v.action_params,
    v.cooldown_min,
    v.max_retries
FROM (VALUES
    (
        'Docker DNS 解析失敗修復',
        'DNS_FAIL',
        '["name resolution", "could not translate host name", "Temporary failure in name resolution"]',
        'P2', 'DOCKER_RESTART',
        '{"container": "momo-db"}',
        30, 3
    ),
    (
        'DB 連線被拒修復',
        'DB_UNREACHABLE',
        '["connection refused", "Connection reset by peer", "could not connect to server"]',
        'P2', 'DOCKER_RESTART',
        '{"container": "momo-db", "compose": true}',
        30, 3
    ),
    (
        'App OOM 自動重啟',
        'OOM',
        '["SIGKILL", "out of memory", "Worker was sent SIGKILL", "MemoryError"]',
        'P1', 'DOCKER_RESTART',
        '{"container": "momo-pro-system"}',
        60, 2
    ),
    (
        'Scheduler OOM 自動重啟',
        'OOM',
        '["SIGKILL", "Worker was sent SIGKILL"]',
        'P1', 'DOCKER_RESTART',
        '{"container": "momo-scheduler"}',
        60, 2
    ),
    (
        'PostgreSQL SSL 連線中斷',
        'SSL_FAIL',
        '["SSL connection has been closed unexpectedly", "SSL SYSCALL error"]',
        'P2', 'DOCKER_RESTART',
        '{"container": "momo-pro-system"}',
        15, 3
    ),
    (
        'Google Drive 認證失敗告警',
        'AUTH_FAIL',
        '["invalid_grant", "google_token.pickle", "Token has been expired or revoked"]',
        'P2', 'ALERT_ONLY',
        '{"message": "Google Drive OAuth Token 已過期,請人工重新認證。參閱 docs/guides/google_drive_setup.md"}',
        240, 1
    ),
    (
        '爬蟲 HTTP 429 限流等待',
        'CRAWLER_FAIL',
        '["429 Too Many Requests", "rate limit", "Retry-After"]',
        'P3', 'WAIT_RETRY',
        '{"wait_minutes": 30}',
        30, 2
    )
) AS v(name, error_type, match_pattern, severity_min, action_type, action_params, cooldown_min, max_retries)
WHERE NOT EXISTS (
    SELECT 1
    FROM playbooks AS p
    WHERE p.name = v.name
)
ON CONFLICT DO NOTHING;