Files
ewoooc/migrations/013_autoheal.sql
ogt 77d3a1da48
Some checks failed
CD Pipeline / deploy (push) Failing after 3m24s
feat(ai-ops): ADR-013 AIOps 自動修復閉環完整實作
架構(Exception → Incident → PlayBook → Heal → KM → Telegram):

新增元件:
- database/autoheal_models.py: Incident/Playbook/HealLog 三張表 + 7 條種子 PlayBook
- migrations/013_autoheal.sql: 建表 DDL + 種子資料(冪等 INSERT)
- services/auto_heal_service.py: 核心引擎 7 步閉環
  - _classify_error: 8 類錯誤自動分類 (DNS_FAIL/DB_UNREACHABLE/OOM/...)
  - _match_playbook: error_type + keyword + 冷卻 + max_retries 保護
  - _execute_playbook: DOCKER_RESTART/SSH_CMD/ALERT_ONLY/WAIT_RETRY
  - _sink_to_km: 修復知識寫入 ai_insights (auto_heal_playbook)
  - SSH 白名單:僅允許 docker restart / compose restart / docker start

修改元件:
- database/manager.py: _init_autoheal_tables() 啟動時建表+種子 PlayBook
- scheduler.py: 3 個核心任務植入 handle_exception
  (run_auto_import_task / run_icaim_analysis_task / run_weekly_strategy_task)
- requirements.txt: paramiko(SSH 跳板;不可用時降級 subprocess+CLI ssh)

安全設計: CMD 白名單 + cooldown + max_retries escalation + DB 冪等 migration

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-19 16:03:49 +08:00

130 lines
5.8 KiB
SQL
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- Migration 013: AIOps 自動修復三張表
-- incidents / playbooks / heal_logs
-- 建立日期: 2026-04-19
-- ─────────────────────────────────────────────────
-- Table 2: playbooks (PlayBook rule library)
-- Created BEFORE incidents: incidents.playbook_id carries a foreign key to
-- playbooks(id), and PostgreSQL rejects a REFERENCES clause whose target
-- table does not exist yet. Creating incidents first makes this migration
-- fail on a fresh database.
-- ─────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS playbooks (
    id            SERIAL PRIMARY KEY,
    name          VARCHAR(200) NOT NULL UNIQUE,
    error_type    VARCHAR(50)  NOT NULL,
    match_pattern TEXT         NOT NULL,            -- JSON array, stored as text
    severity_min  VARCHAR(5)   DEFAULT 'P3',
    action_type   VARCHAR(30)  NOT NULL,            -- SSH_CMD / DOCKER_RESTART / ALERT_ONLY / WAIT_RETRY
    action_params TEXT,                             -- JSON object, stored as text
    cooldown_min  INTEGER      DEFAULT 30,
    max_retries   INTEGER      DEFAULT 3,
    is_active     BOOLEAN      DEFAULT TRUE,
    success_count INTEGER      DEFAULT 0,
    fail_count    INTEGER      DEFAULT 0,
    km_synced     BOOLEAN      DEFAULT FALSE,
    created_at    TIMESTAMP    NOT NULL DEFAULT NOW(),
    updated_at    TIMESTAMP    NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_playbook_error_type ON playbooks(error_type, is_active);
-- ─────────────────────────────────────────────────
-- Table 1: incidents (incident master table)
-- playbook_id is nullable: an incident row can exist without a linked
-- playbook.
-- ─────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS incidents (
    id              SERIAL PRIMARY KEY,
    task_name       VARCHAR(100) NOT NULL,
    error_type      VARCHAR(50)  NOT NULL,
    error_message   TEXT         NOT NULL,
    error_traceback TEXT,
    severity        VARCHAR(5)   NOT NULL DEFAULT 'P2',
    status          VARCHAR(20)  NOT NULL DEFAULT 'open',
    playbook_id     INTEGER      REFERENCES playbooks(id),
    retry_count     INTEGER      DEFAULT 0,
    resolved_at     TIMESTAMP,
    created_at      TIMESTAMP    NOT NULL DEFAULT NOW(),
    updated_at      TIMESTAMP    NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_incident_status_created ON incidents(status, created_at);
CREATE INDEX IF NOT EXISTS idx_incident_task_error ON incidents(task_name, error_type);
-- ─────────────────────────────────────────────────
-- Table 3: heal_logs (heal execution records)
-- Each row belongs to exactly one incident (incident_id is NOT NULL) and may
-- optionally point at the playbook involved (playbook_id is nullable).
-- ─────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS heal_logs (
    id            SERIAL PRIMARY KEY,
    incident_id   INTEGER     NOT NULL REFERENCES incidents(id),
    playbook_id   INTEGER     REFERENCES playbooks(id),
    action_type   VARCHAR(30),
    action_detail TEXT,
    result        VARCHAR(20) DEFAULT 'pending',    -- success / failed / skipped
    result_output TEXT,
    duration_ms   FLOAT       DEFAULT 0,
    created_at    TIMESTAMP   NOT NULL DEFAULT NOW()
);

-- Lookup by outcome, then by incident; both indexes are ordered by creation time.
CREATE INDEX IF NOT EXISTS idx_heal_log_result
    ON heal_logs (result, created_at);
CREATE INDEX IF NOT EXISTS idx_heal_log_incident
    ON heal_logs (incident_id, created_at);
-- ─────────────────────────────────────────────────
-- Seed PlayBook data (inserted on first initialization; rows whose name
-- already exists are skipped).
--
-- Idempotency is enforced twice on purpose:
--   * WHERE NOT EXISTS filters out names that are already present.
--   * ON CONFLICT DO NOTHING turns a unique violation raised by a concurrent
--     run (one that inserts the same name between the check and the insert)
--     into a no-op instead of aborting the migration.
-- match_pattern holds a JSON array and action_params a JSON object, both
-- stored as text.
-- ─────────────────────────────────────────────────
INSERT INTO playbooks (
    name,
    error_type,
    match_pattern,
    severity_min,
    action_type,
    action_params,
    cooldown_min,
    max_retries
)
SELECT
    v.name,
    v.error_type,
    v.match_pattern,
    v.severity_min,
    v.action_type,
    v.action_params,
    v.cooldown_min,
    v.max_retries
FROM (VALUES
    (
        'Docker DNS 解析失敗修復',
        'DNS_FAIL',
        '["name resolution", "could not translate host name", "Temporary failure in name resolution"]',
        'P2', 'DOCKER_RESTART',
        '{"container": "momo-db"}',
        30, 3
    ),
    (
        'DB 連線被拒修復',
        'DB_UNREACHABLE',
        '["connection refused", "Connection reset by peer", "could not connect to server"]',
        'P2', 'DOCKER_RESTART',
        '{"container": "momo-db", "compose": true}',
        30, 3
    ),
    (
        'App OOM 自動重啟',
        'OOM',
        '["SIGKILL", "out of memory", "Worker was sent SIGKILL", "MemoryError"]',
        'P1', 'DOCKER_RESTART',
        '{"container": "momo-pro-system"}',
        60, 2
    ),
    (
        'Scheduler OOM 自動重啟',
        'OOM',
        '["SIGKILL", "Worker was sent SIGKILL"]',
        'P1', 'DOCKER_RESTART',
        '{"container": "momo-scheduler"}',
        60, 2
    ),
    (
        'PostgreSQL SSL 連線中斷',
        'SSL_FAIL',
        '["SSL connection has been closed unexpectedly", "SSL SYSCALL error"]',
        'P2', 'DOCKER_RESTART',
        '{"container": "momo-pro-system"}',
        15, 3
    ),
    (
        'Google Drive 認證失敗告警',
        'AUTH_FAIL',
        '["invalid_grant", "google_token.pickle", "Token has been expired or revoked"]',
        'P2', 'ALERT_ONLY',
        '{"message": "Google Drive OAuth Token 已過期,請人工重新認證。參閱 docs/guides/google_drive_setup.md"}',
        240, 1
    ),
    (
        '爬蟲 HTTP 429 限流等待',
        'CRAWLER_FAIL',
        '["429 Too Many Requests", "rate limit", "Retry-After"]',
        'P3', 'WAIT_RETRY',
        '{"wait_minutes": 30}',
        30, 2
    )
) AS v (name, error_type, match_pattern, severity_min, action_type, action_params, cooldown_min, max_retries)
WHERE NOT EXISTS (
    SELECT 1
    FROM playbooks AS p
    WHERE p.name = v.name
)
ON CONFLICT DO NOTHING;