Files
awoooi/apps/api/src/db/models.py
OG T bf45b80bd2
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
feat(Phase 3.5 + Phase 4): AI 學習成果持久化到 PostgreSQL — 修正「AI 失憶」架構缺陷
ADR-085: AI 學習成果不可存在 Cache

架構鐵律確立:
- PostgreSQL = System of Record(AI 的永久記憶)
- Redis = Warm Cache(加速讀取,TTL 到期從 PG 復原)

核心變更:
1. models.py: 新增 PlaybookRecord / DynamicBaselineRecord / LogClusterRecord ORM
2. base.py: ALTER TABLE playbooks 補加 trust_score / requires_approval_level 等欄位
3. playbook_repository.py: 完整雙寫實作(PG upsert + Redis cache)
4. dynamic_baseline_service.py: Holt-Winters 訓練結果寫入 PG,Redis 只作 24h warm cache
5. log_anomaly_detector.py: Drain3 cluster template 寫入 PG(UPSERT on cluster_id)
6. main.py: 啟動時執行 backfill_redis_to_pg()(Redis → PG 冪等補救)

修正的問題:
- Playbook 7天 Redis TTL 到期 → AI 失去所有修復知識
- trust_score EWMA 隨 Redis TTL 歸零 → AI 重新回到初始信任度 0.3
- Holt-Winters 基線 24h TTL → AI 每天重新學習「正常」的定義
- Drain3 cluster 沒有持久化 → AI 把已知 log pattern 反覆當新 pattern

Phase 4 新服務(requirements.txt 已加入 statsmodels + drain3 + numpy)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-15 15:34:04 +08:00

1113 lines
40 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Database Models
===============
CTO-201: Approval & AuditLog persistence
Schema 設計原則:
- UUID 主鍵 (PostgreSQL 相容)
- JSON 欄位儲存複雜結構
- 完整時間戳記
- 索引優化查詢
"""
from datetime import datetime
from typing import Any
from uuid import uuid4

from sqlalchemy import (
    JSON,
    BigInteger,
    DateTime,
    Index,
    Integer,
    String,
    Text,
)
from sqlalchemy import (
    Enum as SQLEnum,
)
from sqlalchemy.dialects.postgresql import ENUM as PgEnum
from sqlalchemy.orm import Mapped, mapped_column

from src.db.base import Base
from src.models.approval import ApprovalStatus, RiskLevel
from src.models.incident import IncidentStatus, Severity
from src.models.knowledge import EntrySource, EntryStatus, EntryType
# =============================================================================
# Helper Functions
# =============================================================================
def taipei_now() -> datetime:
    """Return the current time in the Taipei timezone (UTC+8).

    HARD RULE (project convention): the whole system uses Taipei time; UTC
    is not allowed.  Introduced by the 2026-04-02 C1 timezone unification
    (chief-architect review).

    The actual clock logic lives in ``src.utils.timezone.now_taipei``; this
    wrapper only exists so ORM columns can use it as a ``default`` /
    ``onupdate`` callable.
    """
    # Imported at call time rather than at module import time, exactly as
    # the original did.
    from src.utils.timezone import now_taipei

    current = now_taipei()
    return current
def generate_uuid() -> str:
    """Return a freshly generated random UUID rendered as a string.

    Used as the ``default`` callable for the string primary keys in this
    module.
    """
    # Formatting a UUID with an empty format spec yields the same text as
    # str(uuid): the canonical 36-character hyphenated form.
    return f"{uuid4()}"
# =============================================================================
# ApprovalRecord - 授權記錄持久化
# =============================================================================
class ApprovalRecord(Base):
    """Persisted approval record, mirroring the Pydantic ``ApprovalRequest``.

    Note: kept in sync with the in-memory TrustEngine ``ApprovalRequest``.
    """

    __tablename__ = "approval_records"

    # Primary key
    id: Mapped[str] = mapped_column(
        String(36),
        primary_key=True,
        default=generate_uuid,
    )

    # Core fields
    action: Mapped[str] = mapped_column(String(500), nullable=False)
    description: Mapped[str] = mapped_column(Text, nullable=False)
    status: Mapped[str] = mapped_column(
        SQLEnum(ApprovalStatus),
        default=ApprovalStatus.PENDING,
        nullable=False,
    )
    risk_level: Mapped[str] = mapped_column(
        SQLEnum(RiskLevel),
        nullable=False,
    )

    # Signature tracking
    required_signatures: Mapped[int] = mapped_column(Integer, default=1)
    current_signatures: Mapped[int] = mapped_column(Integer, default=0)
    # FIX: the column default is ``list``, so the stored value is a JSON
    # array; the annotation previously claimed ``dict[str, Any]``.  The
    # element schema is not visible in this file, hence ``list[Any]``.
    signatures: Mapped[list[Any]] = mapped_column(JSON, default=list)

    # Blast radius (JSON)
    blast_radius: Mapped[dict[str, Any]] = mapped_column(JSON, default=dict)

    # Dry-run checks (JSON)
    dry_run_checks: Mapped[list[dict[str, Any]]] = mapped_column(JSON, default=list)

    # Metadata
    requested_by: Mapped[str] = mapped_column(String(100), nullable=False)
    rejection_reason: Mapped[str | None] = mapped_column(Text, nullable=True)
    extra_metadata: Mapped[dict[str, Any] | None] = mapped_column(JSON, nullable=True)

    # ======================================================================
    # Strategy B: alert storm convergence
    # ======================================================================
    # Alert fingerprint: a unique hash derived from
    # namespace + deployment + alert_name.
    fingerprint: Mapped[str | None] = mapped_column(
        String(64),
        nullable=True,
        index=True,
        comment="SHA256 hash of alert identity (namespace:deployment:alert_name)",
    )
    # Aggregation counter: how many times an alert with this fingerprint fired.
    hit_count: Mapped[int] = mapped_column(
        Integer,
        default=1,
        nullable=False,
        comment="Number of times this alert pattern was triggered",
    )
    # Most recent time an alert with this fingerprint was seen.
    last_seen_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=taipei_now,
        nullable=False,
        comment="Last time this alert pattern was seen",
    )

    # Sprint 5.1 MultiSig dual-approval support (2026-04-08, ADR-062 Q3)
    approval_level: Mapped[str] = mapped_column(
        String(20),
        default="standard",
        nullable=False,
        comment="standard=1票審核, critical=2票MultiSig",
    )
    approval_votes: Mapped[list[dict[str, Any]]] = mapped_column(
        JSON,
        default=list,
        nullable=False,
        comment="[{user_id, voted_at, action}]",
    )
    required_votes: Mapped[int] = mapped_column(
        Integer,
        default=1,
        nullable=False,
        comment="standard=1, critical=2",
    )

    # 2026-04-06 Phase 26: associated Incident ID.  Playbook extraction and
    # KM writes must know the incident_id explicitly instead of parsing text.
    incident_id: Mapped[str | None] = mapped_column(
        String(64),
        nullable=True,
        index=True,
        comment="Associated Incident ID (INC-YYYYMMDD-XXXXXX)",
    )

    # 2026-04-09: Telegram message persistence.  Still queryable after the
    # Redis ``tg_msg:{id}`` key (24h TTL) expires; supports cross-session
    # status updates.
    telegram_message_id: Mapped[int | None] = mapped_column(
        Integer,
        nullable=True,
        comment="Telegram message_id of the approval card sent to operator",
    )
    # FIX: Telegram chat identifiers can have more than 32 significant bits
    # (e.g. supergroup / channel ids), so a 32-bit INTEGER column overflows.
    # NOTE(review): existing databases need an ALTER COLUMN ... TYPE BIGINT
    # migration for this to take effect.
    telegram_chat_id: Mapped[int | None] = mapped_column(
        BigInteger,
        nullable=True,
        comment="Telegram chat_id where the approval card was sent",
    )

    # Timestamps
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=taipei_now,
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=taipei_now,
        onupdate=taipei_now,
    )
    expires_at: Mapped[datetime | None] = mapped_column(
        DateTime(timezone=True),
        nullable=True,
    )
    resolved_at: Mapped[datetime | None] = mapped_column(
        DateTime(timezone=True),
        nullable=True,
    )

    # Indexes
    __table_args__ = (
        Index("ix_approval_status", "status"),
        Index("ix_approval_risk_level", "risk_level"),
        Index("ix_approval_created_at", "created_at"),
        Index("ix_approval_requested_by", "requested_by"),
        # Strategy B: fingerprint lookup.
        # NOTE(review): ``fingerprint`` also sets index=True above, so this
        # is a second index on the same column; kept to avoid a schema change.
        Index("ix_approval_fingerprint", "fingerprint"),
    )
# =============================================================================
# TimelineEvent / AuditLog - Action timeline events and audit log
# =============================================================================
class TimelineEvent(Base):
    """Timeline event row for the Phase 4 Action Timeline.

    Event types (stored as free-form strings in ``event_type``):
      - system:   system alert received
      - agent:    OpenClaw AI analysis
      - security: permission block
      - human:    human authorization
      - exec:     execution finished
    """

    __tablename__ = "timeline_events"

    # --- primary key -------------------------------------------------------
    id: Mapped[str] = mapped_column(String(36), primary_key=True, default=generate_uuid)

    # --- event type and status ---------------------------------------------
    event_type: Mapped[str] = mapped_column(
        String(20), nullable=False, comment="system, agent, security, human, exec"
    )
    status: Mapped[str] = mapped_column(
        String(20), nullable=False, default="info", comment="info, success, warning, error"
    )

    # --- content -----------------------------------------------------------
    title: Mapped[str] = mapped_column(String(500), nullable=False)
    description: Mapped[str | None] = mapped_column(Text, nullable=True)

    # --- actor -------------------------------------------------------------
    actor: Mapped[str | None] = mapped_column(String(100), nullable=True)
    actor_role: Mapped[str | None] = mapped_column(String(50), nullable=True)

    # --- context -----------------------------------------------------------
    risk_level: Mapped[str | None] = mapped_column(String(20), nullable=True)
    # Soft reference to an approval id (no foreign key is declared).
    approval_id: Mapped[str | None] = mapped_column(String(36), nullable=True, index=True)

    # --- timestamp ---------------------------------------------------------
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=taipei_now)

    # --- indexes -----------------------------------------------------------
    __table_args__ = (
        Index("ix_timeline_event_type", "event_type"),
        Index("ix_timeline_created_at", "created_at"),
    )
class AuditLog(Base):
    """Audit log of execution results.

    Intended usage per the original note: one row is written after each
    Kubernetes operation completes.
    """

    __tablename__ = "audit_logs"

    # --- primary key -------------------------------------------------------
    id: Mapped[str] = mapped_column(String(36), primary_key=True, default=generate_uuid)

    # --- reference to the approval (soft reference, no foreign key) --------
    approval_id: Mapped[str] = mapped_column(String(36), nullable=False, index=True)

    # --- operation details -------------------------------------------------
    operation_type: Mapped[str] = mapped_column(
        String(50), nullable=False, comment="e.g., RESTART_DEPLOYMENT, DELETE_POD"
    )
    target_resource: Mapped[str] = mapped_column(
        String(200), nullable=False, comment="e.g., deployment/api-backend, pod/nginx-xxx"
    )
    namespace: Mapped[str] = mapped_column(String(63), default="default", nullable=False)

    # --- execution result --------------------------------------------------
    success: Mapped[bool] = mapped_column(default=False, nullable=False)
    error_message: Mapped[str | None] = mapped_column(Text, nullable=True)

    # --- raw Kubernetes response -------------------------------------------
    k8s_response: Mapped[dict[str, Any] | None] = mapped_column(
        JSON, nullable=True, comment="Raw Kubernetes API response"
    )

    # --- execution context -------------------------------------------------
    executed_by: Mapped[str] = mapped_column(
        String(100), nullable=False, comment="Who triggered the execution"
    )
    execution_duration_ms: Mapped[int | None] = mapped_column(
        Integer, nullable=True, comment="Execution time in milliseconds"
    )

    # --- dry-run result (pre-execution validation) -------------------------
    dry_run_passed: Mapped[bool] = mapped_column(default=True, nullable=False)
    dry_run_message: Mapped[str | None] = mapped_column(Text, nullable=True)

    # ======================================================================
    # Phase 18: failure auto-repair closed-loop fields (2026-03-26)
    # ======================================================================
    # Where the authorization came from.
    authorization_channel: Mapped[str | None] = mapped_column(
        String(20), nullable=True, comment="Authorization source: web, telegram, auto"
    )
    # Retry and repair tracking.
    retry_count: Mapped[int] = mapped_column(
        Integer, default=0, nullable=False, comment="Number of retry attempts"
    )
    failure_classification: Mapped[str | None] = mapped_column(
        String(50),
        nullable=True,
        comment="Failure type: TIMEOUT, K8S_ERROR, NETWORK_ERROR, PERMISSION_DENIED",
    )
    source_approval_id: Mapped[str | None] = mapped_column(
        String(36),
        nullable=True,
        index=True,
        comment="Original approval ID if this is a repair attempt",
    )
    # Auto-repair state.
    auto_repair_attempted: Mapped[bool] = mapped_column(
        default=False, nullable=False, comment="Whether auto-repair was attempted"
    )
    auto_repair_result: Mapped[str | None] = mapped_column(
        Text, nullable=True, comment="Auto-repair result: AI analysis and repair outcome"
    )

    # --- timestamps --------------------------------------------------------
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=taipei_now)

    # --- indexes -----------------------------------------------------------
    __table_args__ = (
        # NOTE(review): ``approval_id`` also sets index=True above, so this is
        # a second index on the same column.
        Index("ix_audit_approval_id", "approval_id"),
        Index("ix_audit_operation_type", "operation_type"),
        Index("ix_audit_success", "success"),
        Index("ix_audit_created_at", "created_at"),
        Index("ix_audit_authorization_channel", "authorization_channel"),  # Phase 18
        Index("ix_audit_failure_classification", "failure_classification"),  # Phase 18
    )
# =============================================================================
# AutoRepairExecution - Phase 10 操作記錄
# 2026-04-08 Claude Code: 統帥指令「所有操作都必須被記錄,寫入資料庫」
# =============================================================================
class AutoRepairExecution(Base):
    """Record of one automatic repair execution.

    Per the original note, a row is written every time
    ``evaluate_auto_repair`` triggers and executes, whether it succeeds or
    fails.  There is no ``approval_id`` column: auto-repair does not require
    human approval.
    """

    __tablename__ = "auto_repair_executions"

    id: Mapped[str] = mapped_column(
        String(36),
        primary_key=True,
        default=generate_uuid,
    )

    # --- associations (soft references) -------------------------------------
    incident_id: Mapped[str] = mapped_column(String(30), nullable=False, index=True)
    playbook_id: Mapped[str] = mapped_column(String(36), nullable=False, index=True)
    playbook_name: Mapped[str] = mapped_column(String(200), nullable=False)

    # --- execution result ---------------------------------------------------
    success: Mapped[bool] = mapped_column(default=False, nullable=False)
    executed_steps: Mapped[list] = mapped_column(JSON, default=list, nullable=False)
    error_message: Mapped[str | None] = mapped_column(Text, nullable=True)

    # --- execution context --------------------------------------------------
    triggered_by: Mapped[str] = mapped_column(
        String(50),
        default="auto_repair",
        nullable=False,
        comment="auto_repair / cold_start_trust",
    )
    similarity_score: Mapped[float | None] = mapped_column(nullable=True)
    risk_level: Mapped[str | None] = mapped_column(String(20), nullable=True)
    execution_time_ms: Mapped[int | None] = mapped_column(Integer, nullable=True)

    # --- timestamp (Taipei timezone) ----------------------------------------
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=taipei_now,
    )

    __table_args__ = (
        Index("ix_are_created_at", "created_at"),
        Index("ix_are_success", "success"),
    )
# =============================================================================
# AlertOperationLog - Phase 11 告警操作溯源 (Event Sourcing)
# 2026-04-08 Claude Code: 統帥指令「所有操作都必須被記錄,寫入資料庫」
# 不可變 — 只 INSERT不 UPDATE/DELETE
# =============================================================================
class AlertOperationLog(Base):
    """Full provenance trail of alert operations (event-sourcing style).

    One row is written per event in an alert's lifecycle.  Rows are meant to
    be immutable: insert only, never update or delete (this is a convention;
    nothing in this model enforces it).

    The accepted ``event_type`` values are the labels of the
    ``alert_event_type`` enum declared on the column below.
    """

    __tablename__ = "alert_operation_log"

    id: Mapped[str] = mapped_column(
        String(36),
        primary_key=True,
        default=generate_uuid,
    )

    # --- associations (nullable: different events relate to different things)
    incident_id: Mapped[str | None] = mapped_column(String(30), nullable=True, index=True)
    approval_id: Mapped[str | None] = mapped_column(String(36), nullable=True, index=True)
    audit_log_id: Mapped[str | None] = mapped_column(String(36), nullable=True)
    auto_repair_id: Mapped[str | None] = mapped_column(String(36), nullable=True)

    # --- event core ---------------------------------------------------------
    # 2026-04-08 Sprint 5.1: switched from String to a PostgreSQL enum to fix
    # a type mismatch.  create_type=False: this model does not issue
    # CREATE TYPE for ``alert_event_type``.
    event_type: Mapped[str] = mapped_column(
        PgEnum(
            "ALERT_RECEIVED",
            "TELEGRAM_SENT",
            "USER_ACTION",
            "AUTO_REPAIR_TRIGGERED",
            "EXECUTION_STARTED",
            "EXECUTION_COMPLETED",
            "TELEGRAM_RESULT_SENT",
            "RESOLVED",
            "SILENCED",
            "ESCALATED",
            "GUARDRAIL_BLOCKED",
            "PRE_FLIGHT_PASSED",
            "PRE_FLIGHT_FAILED",
            "BACKUP_TRIGGERED",
            "BACKUP_COMPLETED",
            "BACKUP_FAILED",
            "APPROVAL_ESCALATED",
            "CHANGE_APPLIED",
            name="alert_event_type",
            create_type=False,
        ),
        nullable=False,
        index=True,
    )
    actor: Mapped[str | None] = mapped_column(String(100), nullable=True, index=True)
    action_detail: Mapped[str | None] = mapped_column(String(200), nullable=True)

    # --- execution result (NULL = not applicable) ---------------------------
    success: Mapped[bool | None] = mapped_column(nullable=True)
    error_message: Mapped[str | None] = mapped_column(Text, nullable=True)

    # --- structured context -------------------------------------------------
    context: Mapped[dict] = mapped_column(JSON, default=dict, nullable=False)

    # --- timestamp (Taipei timezone, immutable by convention) ---------------
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=taipei_now,
    )

    __table_args__ = (Index("ix_aol_created_at", "created_at"),)
# =============================================================================
# IncidentRecord - Phase 6.2 Episodic Memory (PostgreSQL)
# =============================================================================
class IncidentRecord(Base):
    """Incident record, mirroring the Pydantic Incident schema v0.3.

    Phase 6.2 episodic (long-term) memory:
      - migrated over from working memory (Redis)
      - kept permanently for RAG retrieval
      - complex structures are stored in JSON columns

    Three-tier memory architecture described by the original note:
      - Working memory (Redis): 7-day TTL
      - Episodic memory (PostgreSQL): this table, kept permanently
      - Semantic memory (vector DB): Phase 6.3+
    """

    __tablename__ = "incidents"

    # === primary key ===
    incident_id: Mapped[str] = mapped_column(
        String(30), primary_key=True, comment="事件唯一識別碼 (如 INC-20260322-A1B2C3)"
    )

    # === status and severity ===
    status: Mapped[str] = mapped_column(
        SQLEnum(IncidentStatus),
        nullable=False,
        default=IncidentStatus.INVESTIGATING,
        comment="事件狀態 (investigating, mitigating, resolved, closed, escalated)",
    )
    severity: Mapped[str] = mapped_column(
        SQLEnum(Severity), nullable=False, comment="事件嚴重度 (P0, P1, P2, P3)"
    )

    # === perception layer (signals) ===
    # The column comments say JSONB, but the declared type is the generic
    # ``JSON`` type.
    signals: Mapped[list[dict[str, Any]]] = mapped_column(
        JSON, nullable=False, default=list, comment="關聯的告警信號列表 (JSONB)"
    )
    affected_services: Mapped[list[str]] = mapped_column(
        JSON, nullable=False, default=list, comment="受影響的服務列表"
    )

    # === cognition layer (AI decision chain) ===
    decision_chain: Mapped[dict[str, Any] | None] = mapped_column(
        JSON, nullable=True, comment="AI 決策鏈 (完整推論過程)"
    )

    # === decision layer (proposals) ===
    proposal_ids: Mapped[list[str]] = mapped_column(
        JSON, nullable=False, default=list, comment="關聯的 ApprovalRequest ID 列表"
    )

    # === outcome layer ===
    outcome: Mapped[dict[str, Any] | None] = mapped_column(
        JSON, nullable=True, comment="事件結果與人類回饋"
    )

    # === ADR-073 Phase 2 fields (2026-04-12) ===
    alertname: Mapped[str | None] = mapped_column(
        String(100), nullable=True, comment="告警名稱 (從 signals labels 抽取)"
    )
    notification_type: Mapped[str | None] = mapped_column(
        String(10), nullable=True, comment="通知類型 TYPE-1/2/3/4/4D (早期分診)"
    )
    alert_category: Mapped[str | None] = mapped_column(
        String(50),
        nullable=True,
        comment="告警類別 config_drift/info/backup/infrastructure/kubernetes/database/general",
    )

    # === frequency snapshot (Phase 27, 2026-04-10) ===
    # frequency_stats used to live only in memory / Redis (TTL = 35 days) and
    # was lost on pod restart or expiry.  This column stores a snapshot taken
    # when the incident is created so the statistics are kept permanently.
    frequency_snapshot: Mapped[dict[str, Any] | None] = mapped_column(
        JSON, nullable=True, comment="建立時刻的 AnomalyFrequency 快照,永久保存 (Phase 27)"
    )

    # === timeline ===
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), nullable=False, default=taipei_now
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), nullable=False, default=taipei_now, onupdate=taipei_now
    )
    resolved_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
    closed_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)

    # === memory management ===
    ttl_days: Mapped[int] = mapped_column(
        Integer, nullable=False, default=7, comment="Working Memory TTL (天)"
    )
    vectorized: Mapped[bool] = mapped_column(
        nullable=False, default=False, comment="是否已向量化到 Vector DB (Semantic Memory)"
    )

    # === indexes ===
    __table_args__ = (
        Index("ix_incident_status", "status"),
        Index("ix_incident_severity", "severity"),
        Index("ix_incident_created_at", "created_at"),
        Index("ix_incident_resolved_at", "resolved_at"),
    )
# =============================================================================
# KnowledgeEntry - Knowledge Base Phase 1
# =============================================================================
class KnowledgeEntryRecord(Base):
    """Knowledge base entry (Knowledge Base Phase 1).

    Two-layer design described by the original note:
      - KnowledgeEntry: the knowledge entries stored in this table
      - Playbook: stored separately and linked through
        ``related_playbook_id`` (the original note and the column comment
        describe that link as a Redis key)

    Created 2026-04-02 (Taipei time) for Knowledge Base Phase 1.
    """

    __tablename__ = "knowledge_entries"

    # --- primary key -------------------------------------------------------
    id: Mapped[str] = mapped_column(String(36), primary_key=True, default=generate_uuid)

    # --- core fields -------------------------------------------------------
    title: Mapped[str] = mapped_column(String(255), nullable=False)
    content: Mapped[str] = mapped_column(Text, nullable=False)
    entry_type: Mapped[str] = mapped_column(
        SQLEnum(EntryType),
        nullable=False,
        comment="incident_case / runbook / best_practice / postmortem",
    )
    category: Mapped[str] = mapped_column(
        String(100), nullable=False, comment="分類樹節點 (基礎設施/應用層/AI系統/安全合規)"
    )
    tags: Mapped[list[str]] = mapped_column(
        JSON, nullable=False, default=list, comment="標籤列表 (JSONB string array)"
    )

    # --- source and status -------------------------------------------------
    source: Mapped[str] = mapped_column(
        SQLEnum(EntrySource), nullable=False, comment="ai_extracted / human"
    )
    status: Mapped[str] = mapped_column(
        SQLEnum(EntryStatus),
        nullable=False,
        default=EntryStatus.DRAFT,
        comment="draft / review / approved / archived",
    )

    # --- relations (soft references, not foreign keys) ---------------------
    related_incident_id: Mapped[str | None] = mapped_column(
        String(30), nullable=True, comment="關聯 Incident ID"
    )
    related_playbook_id: Mapped[str | None] = mapped_column(
        String(255), nullable=True, comment="關聯 Playbook Redis Key"
    )
    # 2026-04-04 Phase 25 P1: symptom hash used by the anti-pattern
    # closed-loop interception (SymptomPattern.compute_hash()).
    symptoms_hash: Mapped[str | None] = mapped_column(
        String(16),
        nullable=True,
        comment="症狀模式 hash (16字元 SHA256 前綴)Anti-Pattern 閉環攔截使用",
    )

    # --- metrics -----------------------------------------------------------
    view_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)

    # --- metadata ----------------------------------------------------------
    created_by: Mapped[str | None] = mapped_column(String(100), nullable=True)
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=taipei_now)
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), default=taipei_now, onupdate=taipei_now
    )

    # --- indexes -----------------------------------------------------------
    __table_args__ = (
        Index("ix_knowledge_entry_type", "entry_type"),
        Index("ix_knowledge_category", "category"),
        Index("ix_knowledge_status", "status"),
        Index("ix_knowledge_created_at", "created_at"),
        # 2026-04-04 Phase 25 P1: fast anti-pattern lookup.
        Index("ix_knowledge_symptoms_hash", "symptoms_hash"),
    )
# IncidentEvidence — ADR-081 Phase 1 EvidenceSnapshot 持久化
# 2026-04-15 ogt + Claude Sonnet 4.6: AI 自主化飛輪 Phase 1 初始建立
class IncidentEvidence(Base):
    """Immutable evidence snapshot captured for an incident (ADR-081).

    Per the original note, PreDecisionInvestigator takes one
    EvidenceSnapshot before each decision and stores it here for:
      - decision provenance (the full context given to the LLM)
      - learning / training (Phase 3 fine-tune pipeline data)
      - anomaly verification (pre-execution vs post-execution state diff)

    Design principle: append-only, no UPDATE (aligned with event sourcing).
    This is a convention; nothing in this model enforces it.
    """

    __tablename__ = "incident_evidence"

    id: Mapped[str] = mapped_column(String(36), primary_key=True, default=generate_uuid)

    # --- associations -------------------------------------------------------
    # FIX: ``index=True`` was removed here.  Under SQLAlchemy's default
    # naming convention it generates an index called
    # ``ix_incident_evidence_incident_id``, the same name as the explicit
    # Index in ``__table_args__``, so table creation would emit two
    # CREATE INDEX statements with one name.  The explicit Index is kept, so
    # the resulting schema still has exactly that index.
    # NOTE(review): assumes Base.metadata does not override the "ix" naming
    # convention - confirm in src.db.base.
    incident_id: Mapped[str] = mapped_column(String(30), nullable=False)
    # To be populated in Phase 3; per the original note this is currently
    # always NULL.
    matched_playbook_id: Mapped[str | None] = mapped_column(String(36), nullable=True)

    # Schema version, so the fine-tune pipeline can filter compatible rows.
    schema_version: Mapped[str] = mapped_column(String(10), default="v1", nullable=False)

    # --- 8D sensor data (each nullable: may be missing when an MCP call fails)
    k8s_state: Mapped[dict | None] = mapped_column(
        JSON, nullable=True, comment="D1: kubectl describe pod + events"
    )
    recent_logs: Mapped[str | None] = mapped_column(
        Text, nullable=True, comment="D2: container stderr tail-50經 SanitizationService 清洗"
    )
    metrics_snapshot: Mapped[dict | None] = mapped_column(
        JSON, nullable=True, comment="D3: Prometheus 5min vs 1h baseline 對比"
    )
    recent_deployments: Mapped[list | None] = mapped_column(
        JSON, nullable=True, comment="D4: ArgoCD/Gitea 過去 1h 部署 diff"
    )
    business_metrics: Mapped[dict | None] = mapped_column(
        JSON, nullable=True, comment="D5: 訂單量 / 登入成功率 / P0 SLI"
    )
    historical_context: Mapped[str | None] = mapped_column(
        Text, nullable=True, comment="D6: 過去 30 天同 alertname 處置歷史摘要"
    )
    peer_health: Mapped[dict | None] = mapped_column(
        JSON, nullable=True, comment="D7: 同 Deployment 其他 replica 健康度"
    )
    dependency_topology: Mapped[dict | None] = mapped_column(
        JSON, nullable=True, comment="D8: Istio/Service Mesh 上下游 latency/error rate"
    )

    # --- sensor quality indicators ------------------------------------------
    mcp_health: Mapped[dict] = mapped_column(
        JSON, default=dict, nullable=False,
        comment="各 MCP 呼叫成敗 {tool_name: bool},用於 decision_fusion 權重調整"
    )
    collection_duration_ms: Mapped[int | None] = mapped_column(
        Integer, nullable=True, comment="情報蒐集總耗時msP99 目標 < 8000"
    )
    sensors_attempted: Mapped[int] = mapped_column(
        default=0, nullable=False, comment="嘗試啟動的感官數"
    )
    sensors_succeeded: Mapped[int] = mapped_column(
        default=0, nullable=False, comment="成功回傳資料的感官數"
    )

    # --- LLM input summary (kept under 8K tokens, compressed by the Investigator)
    evidence_summary: Mapped[str | None] = mapped_column(
        Text, nullable=True, comment="最終餵給 LLM 的情報摘要UTF-8< 8K tokens"
    )

    # --- pre/post execution state (post_* filled by PostExecutionVerifier) --
    pre_execution_state: Mapped[dict | None] = mapped_column(
        JSON, nullable=True, comment="執行前環境狀態快照PostExecutionVerifier 基準線)"
    )
    post_execution_state: Mapped[dict | None] = mapped_column(
        JSON, nullable=True, comment="執行後環境狀態PostExecutionVerifier 抓取Phase 1 接線)"
    )
    verification_result: Mapped[str | None] = mapped_column(
        String(20), nullable=True, comment="success / degraded / failed / timeoutPostExecutionVerifier 填入)"
    )

    # --- timestamp (Taipei timezone) ----------------------------------------
    collected_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), default=taipei_now, nullable=False
    )

    __table_args__ = (
        Index("ix_incident_evidence_incident_id", "incident_id"),
        Index("ix_incident_evidence_collected_at", "collected_at"),
        Index("ix_incident_evidence_playbook_id", "matched_playbook_id"),
    )
# =============================================================================
# PlaybookRecord — Phase 3.5 Playbook PostgreSQL 持久化 (System of Record)
# ADR-085: AI 學習成果不可存在 Cache — Playbook 是 AI 的肌肉記憶
# 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 3.5 初始建立
#
# 核心鐵律:
# - PostgreSQL = System of Record永久保存AI 的長期記憶)
# - Redis = Warm Cache7天 TTL加速讀取DB 為 source of truth
# - trust_score, EWMA, 統計數據必須持久化 — 不能因 Redis TTL 消失
# =============================================================================
class PlaybookRecord(Base):
    """PostgreSQL ORM model for repair playbooks.

    Corresponds to the Pydantic ``Playbook`` model.  Per ADR-085 as stated in
    the original note, PostgreSQL is the source of truth and Redis is only a
    warm cache (7-day TTL).

    Design principles from the original note (implemented by the repository
    layer, not by this model):
      - AI learning artifacts (trust_score, success_count, failure_count)
        are kept permanently
      - the EWMA trust score must not reset when a Redis TTL expires or a
        pod restarts
      - dual write: create/update writes PG first, then refreshes Redis
      - read: Redis first; on a miss load from PG and back-fill Redis
    """

    __tablename__ = "playbooks"

    # --- primary key -------------------------------------------------------
    playbook_id: Mapped[str] = mapped_column(
        String(36),
        primary_key=True,
        comment="Playbook 唯一識別碼 (PB-YYYYMMDD-XXXXXX)",
    )

    # --- core fields -------------------------------------------------------
    name: Mapped[str] = mapped_column(String(256), nullable=False)
    description: Mapped[str] = mapped_column(Text, nullable=False, default="")
    status: Mapped[str] = mapped_column(String(20), nullable=False, default="draft")
    source: Mapped[str] = mapped_column(String(20), nullable=False, default="extracted")

    # --- complex structures (JSON) -----------------------------------------
    symptom_pattern: Mapped[dict[str, Any]] = mapped_column(
        JSON, nullable=False, default=dict
    )
    repair_steps: Mapped[list[dict[str, Any]]] = mapped_column(
        JSON, nullable=False, default=list
    )

    # --- timing ------------------------------------------------------------
    estimated_duration_minutes: Mapped[int] = mapped_column(
        Integer, nullable=False, default=5
    )

    # --- source tracing ----------------------------------------------------
    source_incident_ids: Mapped[list[str]] = mapped_column(
        JSON, nullable=False, default=list
    )
    ai_confidence: Mapped[float] = mapped_column(nullable=False, default=0.0)

    # --- stats: must live in PG (AI learning artifacts, cannot expire) -----
    success_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    failure_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    last_used_at: Mapped[datetime | None] = mapped_column(
        DateTime(timezone=True), nullable=True
    )

    # EWMA trust score (ADR-083 Phase 3).  It must never be governed by a
    # Redis TTL: the score is the accumulated result of learning, and losing
    # it on expiry would wipe that learning.
    trust_score: Mapped[float] = mapped_column(
        nullable=False,
        default=0.3,
        comment="EWMA 動態信任度 (Phase 3)。成功 α=0.1,失敗 α=0.22x 衰減)。< 0.1 → 封存",
    )

    # --- approval metadata -------------------------------------------------
    approved_by: Mapped[str | None] = mapped_column(String(100), nullable=True)
    approved_at: Mapped[datetime | None] = mapped_column(
        DateTime(timezone=True), nullable=True
    )
    tags: Mapped[list[str]] = mapped_column(JSON, nullable=False, default=list)
    notes: Mapped[str | None] = mapped_column(Text, nullable=True)

    # --- Sprint 5.1 guardrail fields (2026-04-08) --------------------------
    requires_approval_level: Mapped[str] = mapped_column(
        String(20),
        nullable=False,
        default="auto",
        comment="auto=直接執行, standard=1票, critical=2票MultiSig",
    )
    stateful_targets: Mapped[list[str]] = mapped_column(
        JSON, nullable=False, default=list
    )
    requires_pre_backup: Mapped[bool] = mapped_column(nullable=False, default=False)

    # --- timestamps --------------------------------------------------------
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), nullable=False, default=taipei_now
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        default=taipei_now,
        onupdate=taipei_now,
    )

    __table_args__ = (
        Index("ix_playbook_status", "status"),
        Index("ix_playbook_trust_score", "trust_score"),
        Index("ix_playbook_created_at", "created_at"),
    )
# =============================================================================
# DynamicBaselineRecord — Phase 4 Holt-Winters 訓練基線持久化
# ADR-084: 動態基線不能只存 Redis — AI 每天重學「正常」不是在學習
# 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 4 初始建立
#
# 核心鐵律:
# - 訓練好的 Holt-Winters 模型必須在 PG 長期保存
# - Redis 為 24h warm cache加速 is_anomaly() 讀取)
# - 基線消失 = AI 對「正常」的認識消失 = 每天從頭學習 = 不是 AI
# =============================================================================
class DynamicBaselineRecord(Base):
    """PostgreSQL ORM model for trained dynamic (Holt-Winters) baselines.

    Write order after training, per the original note:
      1. write to PG (kept permanently)
      2. write to Redis (24h warm cache for fast reads)

    Redis key: ``baseline:{metric_name}``.

    In this table the primary key is ``id``; ``metric_name`` is an indexed,
    non-unique column.  The original note states that the most recent row
    for a metric is the effective baseline.
    """

    __tablename__ = "dynamic_baselines"

    id: Mapped[str] = mapped_column(
        String(36),
        primary_key=True,
        default=generate_uuid,
    )

    # --- baseline identity --------------------------------------------------
    metric_name: Mapped[str] = mapped_column(
        String(200),
        nullable=False,
        index=True,
        comment="基線識別名 (e.g. cpu_usage_node_mon)",
    )

    # --- training result (Holt-Winters statistics) --------------------------
    mean: Mapped[float] = mapped_column(nullable=False, comment="擬合值均值")
    std: Mapped[float] = mapped_column(nullable=False, comment="殘差標準差")
    # Seasonal factors for the 24h cycle, stored as a JSON array.
    seasonal_factors: Mapped[list[float]] = mapped_column(
        JSON,
        nullable=False,
        default=list,
        comment="24h 週期季節性因子(乘法形式,均值 ≈ 1.0",
    )

    # --- training metadata --------------------------------------------------
    datapoint_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    promql: Mapped[str] = mapped_column(
        Text,
        nullable=False,
        default="",
        comment="訓練使用的 PromQL 查詢",
    )
    lookback_hours: Mapped[int] = mapped_column(Integer, nullable=False, default=336)

    # --- timestamps ---------------------------------------------------------
    trained_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), nullable=False, default=taipei_now
    )
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), nullable=False, default=taipei_now
    )

    __table_args__ = (
        # NOTE(review): ``metric_name`` also sets index=True above, so this
        # is a second index on the same column.
        Index("ix_dynamic_baseline_metric", "metric_name"),
        Index("ix_dynamic_baseline_trained_at", "trained_at"),
    )
# =============================================================================
# LogClusterRecord — Phase 4 Drain3 學習到的 Log Pattern 持久化
# ADR-084: Drain3 模板不能只存 Redis — 每次重啟 AI 把已知 pattern 當新 pattern
# 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 4 初始建立
#
# 核心鐵律:
# - Drain3 學到的 log cluster template 必須在 PG 長期保存
# - 新 cluster 事件列表 (log_anomaly:new) 才存 Redis短期工作記憶
# - 基礎知識庫(已學到的 pattern必須在 PG
# =============================================================================
class LogClusterRecord(Base):
    """Persisted Drain3 log cluster template.

    Flow described by the original note (implemented outside this model):
      1. when a new pattern is first detected, write it to PG (permanent)
      2. push it to the Redis list ``log_anomaly:new`` (short-term memory)

    When the same template is detected again only ``last_seen_at`` and
    ``size`` are updated; no new row is written.
    """

    __tablename__ = "log_clusters"

    id: Mapped[str] = mapped_column(
        String(36),
        primary_key=True,
        default=generate_uuid,
    )

    # --- cluster identity (derived from an MD5 of the template) -------------
    cluster_id: Mapped[str] = mapped_column(
        String(16),
        nullable=False,
        unique=True,
        index=True,
        comment="模板 MD5[:8].upper(),穩定 ID",
    )

    # --- Drain3 template ----------------------------------------------------
    template: Mapped[str] = mapped_column(
        Text,
        nullable=False,
        comment="Drain3 萃取的 log 模板 (e.g. 'ERROR <*> connection failed to <*>')",
    )

    # --- statistics ---------------------------------------------------------
    size: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        default=1,
        comment="命中次數(第一次 = 1",
    )
    source: Mapped[str] = mapped_column(
        String(50),
        nullable=False,
        default="k8s_pod",
        comment="k8s_pod | host_syslog | app_log",
    )

    # --- sample log (the raw line that first triggered the cluster) ---------
    sample_log: Mapped[str | None] = mapped_column(
        Text,
        nullable=True,
        comment="首次觸發的原始 log 行(前 500 字元)",
    )

    # --- timestamps ---------------------------------------------------------
    first_seen_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), nullable=False, default=taipei_now
    )
    last_seen_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        default=taipei_now,
        onupdate=taipei_now,
    )

    __table_args__ = (
        Index("ix_log_cluster_first_seen", "first_seen_at"),
        Index("ix_log_cluster_source", "source"),
    )
# =============================================================================
# AgentSession — Phase 2 多 Agent 辯證 Audit Trail
# =============================================================================
class AgentSession(Base):
    """Immutable event log of multi-agent deliberation (ADR-082 Phase 2).

    One row is written each time an agent "speaks".  ``session_id`` ties
    together every agent turn belonging to the same incident decision.

    Rows are append-only by convention (event sourcing); nothing in this
    model enforces it.  Per the original note, the Phase 3 learning loop
    depends on this table (a successful Critic challenge is used as a
    negative learning signal).

    Created 2026-04-15 as the initial Phase 2 implementation.
    """

    __tablename__ = "agent_sessions"

    # Consistency: uses the module's shared ``generate_uuid`` helper like
    # every other model here, instead of an equivalent inline lambda.
    id: Mapped[str] = mapped_column(
        String(36), primary_key=True, default=generate_uuid,
        comment="行主鍵UUID"
    )
    session_id: Mapped[str] = mapped_column(
        String(36), nullable=False,
        comment="辯證 Session ID一次 Incident 決策的所有 turns 共用同一 session_id"
    )
    incident_id: Mapped[str] = mapped_column(
        String(50), nullable=False,
        comment="關聯 Incident ID"
    )
    agent_role: Mapped[str] = mapped_column(
        String(20), nullable=False,
        comment="Agent 角色diagnostician / solver / reviewer / critic / coordinator"
    )

    # Input fingerprint (sha256[:16]) for de-duplication and cache-hit tracking.
    input_hash: Mapped[str] = mapped_column(
        String(16), nullable=False, default="",
        comment="sha256(input_json)[:16],供查重與快取命中追蹤"
    )

    # Full agent output as JSON, for Phase 3 learning and post-mortems.
    output_json: Mapped[dict] = mapped_column(
        JSON, nullable=False, default=dict,
        comment="Agent 原始輸出DiagnosisReport / ActionPlan / 等序列化 dict"
    )

    # --- quality indicators -------------------------------------------------
    latency_ms: Mapped[int] = mapped_column(
        Integer, nullable=False, default=0,
        comment="此 Agent 的執行耗時ms"
    )
    vote: Mapped[str] = mapped_column(
        String(20), nullable=False, default="abstain",
        comment="Agent 投票approve / reject / request_revision / abstain / degraded"
    )
    degraded: Mapped[bool] = mapped_column(
        nullable=False, default=False,
        comment="True = 此 Agent 因熔斷/超時降級,輸出為 rule-based mock"
    )

    # --- timestamp (Taipei timezone) ----------------------------------------
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), default=taipei_now, nullable=False
    )

    __table_args__ = (
        Index("ix_agent_sessions_session_id", "session_id"),
        Index("ix_agent_sessions_incident_id", "incident_id"),
        Index("ix_agent_sessions_created_at", "created_at"),
        # Look up the turns of a given role within a session (commonly used
        # when the Coordinator aggregates).
        Index("ix_agent_sessions_session_role", "session_id", "agent_role"),
    )