Files
ewoooc/migrations/013_autoheal.sql
ogt 0099543c05
Some checks failed
CD Pipeline / deploy (push) Failing after 5m18s
fix(security): 全域健檢 — 40 項安全/Bug/品質修復
🔴 Critical
- auto_heal_service: 補 import re + sqlalchemy.text + 修正 orchestrator 變數名
  + autoheal_playbook→playbooks 表名 + _alert_and_store cooldown 修復
- aider_heal_executor: shell injection 改 shell=False + list 參數
- docker-compose: DISABLE_LOGIN 改 env var + 移除密碼 fallback + POSTGRES_HOST 修正
- app.py: /api/backup /api/run_task 等 6 個管理 API 加 @login_required
- config.py + pg_sync + e2e_test: 移除 wooo_pg_2026 hardcoded 密碼 fallback
- pg_backup.sh: 移除 TELEGRAM_TOKEN= 中間變數,直接用 $TELEGRAM_BOT_TOKEN
- migration 014: trigger_pattern→match_pattern + 補 error_type NOT NULL 欄位

🟡 High
- telegram_bot_service: str(e) 改通用訊息 + session try/finally + 移除 pa:/pr: 舊 callback
- run_scheduler: ElephantAlpha thread 死亡監控 + 自動重啟 + Telegram 告警
  + agent_context 03:30 TTL 定時清理任務
- openclaw_learning_service: build_rag_context 兩路徑加 .limit(200)
- hooks: commit-quality + momo-prod-guard 空 catch 改 stderr+exit(1)
- scripts/code_review: auto_yes 預設改 false
- db_backup_service: PGPASSWORD 透過 env dict 傳遞

📦 Migrations
- 013_autoheal: 修正建表順序 playbooks→incidents(外鍵前向引用)
- 018_add_missing_indexes: heal_logs/incidents 外鍵索引 + cleanup_expired_agent_context()

🟢 Infrastructure
- requirements.txt: 加版本下界 Flask>=2.3 SQLAlchemy>=1.4 等
- cd.yaml: 新增 run_scheduler.py + run_telegram_bot.py 監聽路徑
- .gitignore: insert_playbook_local.py 加入忽略

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 01:12:23 +08:00

130 lines
5.8 KiB
SQL
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
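-- Migration 013: three tables for AIOps auto-healing
-- playbooks / incidents / heal_logs (created in this order: incidents references playbooks, heal_logs references both)
-- Created: 2026-04-19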
-- =====================================================================
-- Table 1: playbooks (PlayBook rule library)
-- =====================================================================
-- One row per remediation rule. Created first because later tables in
-- this migration hold foreign keys that point at playbooks(id).
-- Constraints are declared at table level using the names PostgreSQL
-- would otherwise generate, so the resulting schema is unchanged.
CREATE TABLE IF NOT EXISTS playbooks (
    id            SERIAL,
    name          VARCHAR(200) NOT NULL,
    error_type    VARCHAR(50)  NOT NULL,
    match_pattern TEXT         NOT NULL,               -- JSON array, stored as text
    severity_min  VARCHAR(5)   DEFAULT 'P3',
    action_type   VARCHAR(30)  NOT NULL,               -- SSH_CMD / DOCKER_RESTART / ALERT_ONLY / WAIT_RETRY
    action_params TEXT,                                -- JSON object, stored as text
    cooldown_min  INTEGER      DEFAULT 30,
    max_retries   INTEGER      DEFAULT 3,
    is_active     BOOLEAN      DEFAULT TRUE,
    success_count INTEGER      DEFAULT 0,
    fail_count    INTEGER      DEFAULT 0,
    km_synced     BOOLEAN      DEFAULT FALSE,
    created_at    TIMESTAMP    NOT NULL DEFAULT NOW(),
    updated_at    TIMESTAMP    NOT NULL DEFAULT NOW(),
    CONSTRAINT playbooks_pkey     PRIMARY KEY (id),
    CONSTRAINT playbooks_name_key UNIQUE (name)
);

-- Composite index on (error_type, is_active).
CREATE INDEX IF NOT EXISTS idx_playbook_error_type
    ON playbooks (error_type, is_active);
-- =====================================================================
-- Table 2: incidents (incident master table)
-- =====================================================================
-- One row per recorded failure. playbook_id is nullable and references
-- playbooks(id), so the playbooks table must already exist when this
-- statement runs.
-- Constraints are declared at table level using the names PostgreSQL
-- would otherwise generate, so the resulting schema is unchanged.
CREATE TABLE IF NOT EXISTS incidents (
    id              SERIAL,
    task_name       VARCHAR(100) NOT NULL,
    error_type      VARCHAR(50)  NOT NULL,
    error_message   TEXT         NOT NULL,
    error_traceback TEXT,
    severity        VARCHAR(5)   NOT NULL DEFAULT 'P2',
    status          VARCHAR(20)  NOT NULL DEFAULT 'open',
    playbook_id     INTEGER,
    retry_count     INTEGER      DEFAULT 0,
    resolved_at     TIMESTAMP,
    created_at      TIMESTAMP    NOT NULL DEFAULT NOW(),
    updated_at      TIMESTAMP    NOT NULL DEFAULT NOW(),
    CONSTRAINT incidents_pkey PRIMARY KEY (id),
    CONSTRAINT incidents_playbook_id_fkey
        FOREIGN KEY (playbook_id) REFERENCES playbooks (id)
);

-- Composite index on (status, created_at).
CREATE INDEX IF NOT EXISTS idx_incident_status_created
    ON incidents (status, created_at);

-- Composite index on (task_name, error_type).
CREATE INDEX IF NOT EXISTS idx_incident_task_error
    ON incidents (task_name, error_type);
-- =====================================================================
-- Table 3: heal_logs (remediation execution log)
-- =====================================================================
-- One row per remediation attempt. incident_id is mandatory; playbook_id
-- is optional. Both are foreign keys, so incidents and playbooks must
-- already exist when this statement runs.
-- Constraints are declared at table level using the names PostgreSQL
-- would otherwise generate, so the resulting schema is unchanged.
CREATE TABLE IF NOT EXISTS heal_logs (
    id            SERIAL,
    incident_id   INTEGER     NOT NULL,
    playbook_id   INTEGER,
    action_type   VARCHAR(30),
    action_detail TEXT,
    result        VARCHAR(20) DEFAULT 'pending',       -- success / failed / skipped
    result_output TEXT,
    duration_ms   FLOAT       DEFAULT 0,
    created_at    TIMESTAMP   NOT NULL DEFAULT NOW(),
    CONSTRAINT heal_logs_pkey PRIMARY KEY (id),
    CONSTRAINT heal_logs_incident_id_fkey
        FOREIGN KEY (incident_id) REFERENCES incidents (id),
    CONSTRAINT heal_logs_playbook_id_fkey
        FOREIGN KEY (playbook_id) REFERENCES playbooks (id)
);

-- Composite index on (incident_id, created_at).
CREATE INDEX IF NOT EXISTS idx_heal_log_incident
    ON heal_logs (incident_id, created_at);

-- Composite index on (result, created_at).
CREATE INDEX IF NOT EXISTS idx_heal_log_result
    ON heal_logs (result, created_at);
-- =====================================================================
-- Seed PlayBook rows (first-time initialisation; existing rows are kept)
-- =====================================================================
-- Inserts the default rules. A row whose name already exists is skipped
-- and never overwritten, so manual edits to a seeded rule survive a
-- re-run of this migration.
--
-- ON CONFLICT (name) relies on the UNIQUE constraint on playbooks.name.
-- It replaces a "WHERE NOT EXISTS" pre-check, which could let two
-- concurrent runs both pass the check and then fail on the constraint.
-- Skipped rows may still consume values from the id sequence, which
-- only leaves gaps in id.
--
-- Columns not listed here (is_active, success_count, fail_count,
-- km_synced, created_at, updated_at) take their table defaults.
INSERT INTO playbooks (
    name,
    error_type,
    match_pattern,
    severity_min,
    action_type,
    action_params,
    cooldown_min,
    max_retries
)
VALUES
    (
        'Docker DNS 解析失敗修復',
        'DNS_FAIL',
        '["name resolution", "could not translate host name", "Temporary failure in name resolution"]',
        'P2', 'DOCKER_RESTART',
        '{"container": "momo-db"}',
        30, 3
    ),
    (
        'DB 連線被拒修復',
        'DB_UNREACHABLE',
        '["connection refused", "Connection reset by peer", "could not connect to server"]',
        'P2', 'DOCKER_RESTART',
        '{"container": "momo-db", "compose": true}',
        30, 3
    ),
    (
        'App OOM 自動重啟',
        'OOM',
        '["SIGKILL", "out of memory", "Worker was sent SIGKILL", "MemoryError"]',
        'P1', 'DOCKER_RESTART',
        '{"container": "momo-pro-system"}',
        60, 2
    ),
    (
        'Scheduler OOM 自動重啟',
        'OOM',
        '["SIGKILL", "Worker was sent SIGKILL"]',
        'P1', 'DOCKER_RESTART',
        '{"container": "momo-scheduler"}',
        60, 2
    ),
    (
        'PostgreSQL SSL 連線中斷',
        'SSL_FAIL',
        '["SSL connection has been closed unexpectedly", "SSL SYSCALL error"]',
        'P2', 'DOCKER_RESTART',
        '{"container": "momo-pro-system"}',
        15, 3
    ),
    (
        'Google Drive 認證失敗告警',
        'AUTH_FAIL',
        '["invalid_grant", "google_token.pickle", "Token has been expired or revoked"]',
        'P2', 'ALERT_ONLY',
        '{"message": "Google Drive OAuth Token 已過期,請人工重新認證。參閱 docs/guides/google_drive_setup.md"}',
        240, 1
    ),
    (
        '爬蟲 HTTP 429 限流等待',
        'CRAWLER_FAIL',
        '["429 Too Many Requests", "rate limit", "Retry-After"]',
        'P3', 'WAIT_RETRY',
        '{"wait_minutes": 30}',
        30, 2
    )
ON CONFLICT (name) DO NOTHING;