Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 1m29s
統帥指令「所有告警訊息通通寫入資料庫,並記錄相關操作」 變更: - phase11_alert_operation_log.sql: 新表 (Event Sourcing,不可變) - phase11b_backfill_alert_operation_log.sql: 歷史回填 654 筆 - 14 筆 ALERT_RECEIVED (incidents) - 265 筆 TELEGRAM_SENT (approval_records) - 265 筆 USER_ACTION (approval_records) - 110 筆 EXECUTION_COMPLETED (audit_logs) - db/models.py: AlertOperationLog SQLAlchemy model - repositories/alert_operation_log_repository.py: append/list_by_incident/get_stats - webhooks.py: _try_auto_repair_background 寫入 AUTO_REPAIR_TRIGGERED + EXECUTION_COMPLETED + TELEGRAM_RESULT_SENT - webhooks.py: _push_to_telegram_background 寫入 TELEGRAM_SENT - telegram.py: handle_callback 寫入 USER_ACTION (approve/reject) 已執行 migration: awoooi_prod@192.168.0.188 ✅ Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
73 lines
3.3 KiB
SQL
73 lines
3.3 KiB
SQL
-- Phase 11: Alert Operation Log — 告警操作完整溯源表
|
||
-- 建立時間: 2026-04-08 (台北時區)
|
||
-- 建立者: Claude Code — 統帥指令「所有操作都必須被記錄,寫入資料庫」
|
||
--
|
||
-- 設計理念: Event Sourcing
|
||
-- 每個告警的生命週期,每個事件都寫一筆
|
||
-- 不可變 (Immutable) — 只 INSERT,不 UPDATE/DELETE
|
||
--
|
||
-- 事件類型 (event_type):
|
||
-- ALERT_RECEIVED — Alertmanager/外部告警進來
|
||
-- TELEGRAM_SENT — 推送 Telegram 審核卡片
|
||
-- USER_ACTION — 使用者在 Telegram 按按鈕 (approve/reject/silence)
|
||
-- AUTO_REPAIR_TRIGGERED — 自動修復評估通過,準備執行
|
||
-- EXECUTION_STARTED — 開始執行 K8s/SSH 指令
|
||
-- EXECUTION_COMPLETED — 執行完成 (success/failure)
|
||
-- TELEGRAM_RESULT_SENT — 自動修復結果推送到 Telegram
|
||
-- RESOLVED — 告警解除
|
||
-- SILENCED — 靜默中
|
||
-- ESCALATED — 升級 (P3→P2 等)
|
||
|
||
CREATE TYPE alert_event_type AS ENUM (
|
||
'ALERT_RECEIVED',
|
||
'TELEGRAM_SENT',
|
||
'USER_ACTION',
|
||
'AUTO_REPAIR_TRIGGERED',
|
||
'EXECUTION_STARTED',
|
||
'EXECUTION_COMPLETED',
|
||
'TELEGRAM_RESULT_SENT',
|
||
'RESOLVED',
|
||
'SILENCED',
|
||
'ESCALATED'
|
||
);
|
||
|
||
CREATE TABLE IF NOT EXISTS alert_operation_log (
|
||
-- 主鍵 (不可變)
|
||
id VARCHAR(36) PRIMARY KEY DEFAULT gen_random_uuid()::text,
|
||
|
||
-- 關聯 (所有欄位允許 NULL,避免不同事件強制關聯)
|
||
incident_id VARCHAR(30), -- incidents.incident_id
|
||
approval_id VARCHAR(36), -- approval_records.id
|
||
audit_log_id VARCHAR(36), -- audit_logs.id
|
||
auto_repair_id VARCHAR(36), -- auto_repair_executions.id
|
||
|
||
-- 事件核心
|
||
event_type alert_event_type NOT NULL,
|
||
actor VARCHAR(100), -- 誰觸發: 'alertmanager' / 'telegram:user_id' / 'auto_repair' / 'system'
|
||
action_detail VARCHAR(200), -- 具體動作: 'approve' / 'reject' / 'silence' / kubectl 指令摘要
|
||
|
||
-- 執行結果
|
||
success BOOLEAN, -- NULL=不適用 (如 ALERT_RECEIVED), TRUE/FALSE=有執行結果
|
||
error_message TEXT,
|
||
|
||
-- 上下文 (結構化存儲)
|
||
context JSONB NOT NULL DEFAULT '{}',
|
||
-- 範例:
|
||
-- ALERT_RECEIVED: {"alert_name": "KubePodCrashLooping", "severity": "P2", "namespace": "awoooi-prod"}
|
||
-- USER_ACTION: {"button": "approve", "telegram_user_id": "12345", "message_id": "67890"}
|
||
-- EXECUTION: {"playbook": "restart-deployment", "steps": 3, "duration_ms": 2340}
|
||
|
||
-- 時間戳 (台北時區,不可變)
|
||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||
);
|
||
|
||
-- 索引 (查詢模式: 按 incident / 按時間 / 按事件類型)
|
||
CREATE INDEX IF NOT EXISTS ix_aol_incident_id ON alert_operation_log (incident_id);
|
||
CREATE INDEX IF NOT EXISTS ix_aol_approval_id ON alert_operation_log (approval_id);
|
||
CREATE INDEX IF NOT EXISTS ix_aol_event_type ON alert_operation_log (event_type);
|
||
CREATE INDEX IF NOT EXISTS ix_aol_created_at ON alert_operation_log (created_at DESC);
|
||
CREATE INDEX IF NOT EXISTS ix_aol_actor ON alert_operation_log (actor);
|
||
|
||
COMMENT ON TABLE alert_operation_log IS
|
||
'告警操作完整溯源 — Event Sourcing,不可變,每個告警生命週期的每個事件一筆記錄';
|