Files
ewoooc/migrations/013_autoheal.sql
OoO 0875dd8fda
All checks were successful
CD Pipeline / deploy (push) Successful in 1m11s
補強 5.5 自癒安全回看
2026-04-29 22:48:24 +08:00

130 lines
5.9 KiB
SQL
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- Migration 013: AIOps 自動修復三張表
-- incidents / playbooks / heal_logs
-- 建立日期2026-04-19
-- ─────────────────────────────────────────────────
-- 表 1: playbooks (PlayBook 規則庫)
-- ─────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS playbooks (
id SERIAL PRIMARY KEY,
name VARCHAR(200) NOT NULL UNIQUE,
error_type VARCHAR(50) NOT NULL,
match_pattern TEXT NOT NULL, -- JSON 陣列
severity_min VARCHAR(5) DEFAULT 'P3',
action_type VARCHAR(30) NOT NULL, -- SSH_CMD / DOCKER_RESTART / ALERT_ONLY / WAIT_RETRY
action_params TEXT, -- JSON 物件
cooldown_min INTEGER DEFAULT 30,
max_retries INTEGER DEFAULT 3,
is_active BOOLEAN DEFAULT TRUE,
success_count INTEGER DEFAULT 0,
fail_count INTEGER DEFAULT 0,
km_synced BOOLEAN DEFAULT FALSE,
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
updated_at TIMESTAMP NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_playbook_error_type ON playbooks(error_type, is_active);
-- ─────────────────────────────────────────────────
-- 表 2: incidents (事件主表)
-- ─────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS incidents (
id SERIAL PRIMARY KEY,
task_name VARCHAR(100) NOT NULL,
error_type VARCHAR(50) NOT NULL,
error_message TEXT NOT NULL,
error_traceback TEXT,
severity VARCHAR(5) NOT NULL DEFAULT 'P2',
status VARCHAR(20) NOT NULL DEFAULT 'open',
playbook_id INTEGER REFERENCES playbooks(id),
retry_count INTEGER DEFAULT 0,
resolved_at TIMESTAMP,
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
updated_at TIMESTAMP NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_incident_status_created ON incidents(status, created_at);
CREATE INDEX IF NOT EXISTS idx_incident_task_error ON incidents(task_name, error_type);
-- ─────────────────────────────────────────────────
-- 表 3: heal_logs (修復執行紀錄)
-- ─────────────────────────────────────────────────
CREATE TABLE IF NOT EXISTS heal_logs (
id SERIAL PRIMARY KEY,
incident_id INTEGER NOT NULL REFERENCES incidents(id),
playbook_id INTEGER REFERENCES playbooks(id),
action_type VARCHAR(30),
action_detail TEXT,
result VARCHAR(20) DEFAULT 'pending', -- success / failed / skipped
result_output TEXT,
duration_ms FLOAT DEFAULT 0,
created_at TIMESTAMP NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_heal_log_incident ON heal_logs(incident_id, created_at);
CREATE INDEX IF NOT EXISTS idx_heal_log_result ON heal_logs(result, created_at);
-- ─────────────────────────────────────────────────
-- 種子 PlayBook 資料(首次初始化,已存在則略過)
-- ─────────────────────────────────────────────────
INSERT INTO playbooks (name, error_type, match_pattern, severity_min, action_type, action_params, cooldown_min, max_retries)
SELECT * FROM (VALUES
(
'Docker DNS 解析失敗修復',
'DNS_FAIL',
'["name resolution", "could not translate host name", "Temporary failure in name resolution"]',
'P2', 'ALERT_ONLY',
'{"message": "DB/DNS 異常,依 ADR-011 不重啟 momo-db轉人工檢查。"}',
30, 3
),
(
'DB 連線被拒修復',
'DB_UNREACHABLE',
'["connection refused", "Connection reset by peer", "could not connect to server"]',
'P2', 'ALERT_ONLY',
'{"message": "DB 無法連線,依 ADR-011 不重啟 momo-db轉人工檢查。"}',
30, 3
),
(
'App OOM 自動重啟',
'OOM',
'["SIGKILL", "out of memory", "Worker was sent SIGKILL", "MemoryError"]',
'P1', 'DOCKER_RESTART',
'{"container": "momo-pro-system"}',
60, 2
),
(
'Scheduler OOM 自動重啟',
'OOM',
'["SIGKILL", "Worker was sent SIGKILL"]',
'P1', 'DOCKER_RESTART',
'{"container": "momo-scheduler"}',
60, 2
),
(
'PostgreSQL SSL 連線中斷',
'SSL_FAIL',
'["SSL connection has been closed unexpectedly", "SSL SYSCALL error"]',
'P2', 'DOCKER_RESTART',
'{"container": "momo-pro-system"}',
15, 3
),
(
'Google Drive 認證失敗告警',
'AUTH_FAIL',
'["invalid_grant", "google_token.pickle", "Token has been expired or revoked"]',
'P2', 'ALERT_ONLY',
'{"message": "Google Drive OAuth Token 已過期,請人工重新認證。參閱 docs/guides/google_drive_setup.md"}',
240, 1
),
(
'爬蟲 HTTP 429 限流等待',
'CRAWLER_FAIL',
'["429 Too Many Requests", "rate limit", "Retry-After"]',
'P3', 'WAIT_RETRY',
'{"wait_minutes": 30}',
30, 2
)
) AS v(name, error_type, match_pattern, severity_min, action_type, action_params, cooldown_min, max_retries)
WHERE NOT EXISTS (SELECT 1 FROM playbooks WHERE playbooks.name = v.name);