# awoooi/apps/api/src/models/incident.py

"""
Incident Schema v0.3 - 認知覺醒計畫核心資料結構
=================================================

C-Suite 戰略會議決議 (2026-03-22):
- AWOOOI 定位為 AI Ops OS (決策層)
- 三層記憶架構: Working (Redis) + Episodic (PG) + Semantic (Vector)
- 復用現有 approval.py 子模型，避免重複定義

設計原則:
1. 復用現有 approval.py 的子模型 (BlastRadius, DryRunCheck)
2. Severity (P0-P3) 用於事件嚴重度，RiskLevel 用於操作風險
3. proposal_ids 支援多重決策軌跡
4. 完整的 AI 決策鏈可稽核性 (CISO 要求)
5. Feedback Loop 回饋循環 (CPO 要求)

三層記憶對應:
- Working Memory (Redis): 活躍事件，7 天 TTL
- Episodic Memory (PostgreSQL): 歷史事件，永久保留
- Semantic Memory (Vector DB): 向量化後的知識，供 RAG 檢索
"""

from datetime import datetime, timezone
from enum import Enum
from typing import Literal
from uuid import UUID, uuid4

from pydantic import BaseModel, Field, field_validator

# 復用現有模型 (避免重複定義)
from src.models.approval import BlastRadius

# =============================================================================
# Incident 專用 Enums
# =============================================================================


class Severity(str, Enum):
    """
    Incident severity, ranked from P0 (most severe) down to P3.

    How this differs from RiskLevel:
    - Severity: how serious the incident itself is (P0 is the worst).
    - RiskLevel: how risky a remediation action is (CRITICAL is the most
      dangerous).

    Intended uses, per the design notes (the consumers are not in this module):
    - Tiered AI invocation strategy (P0 goes straight to Claude, P2/P3 use
      Ollama).
    - SLA response-time thresholds.
    - Alert notification priority.

    Members also subclass ``str``, so each one compares equal to its value.
    """

    # Critical - service is completely down; 5-minute response target.
    P0 = "P0"
    # High - service is severely degraded; 15-minute response target.
    P1 = "P1"
    # Medium - service is partially affected; 1-hour response target.
    P2 = "P2"
    # Low - minor impact; 4-hour response target.
    P3 = "P3"


class IncidentStatus(str, Enum):
    """
    Incident lifecycle states.

    Intended state machine:

    INVESTIGATING → MITIGATING → RESOLVED → CLOSED
                 ↘ (cannot be resolved) → ESCALATED

    This enum only names the states; no transition check is defined here.
    """

    # Under investigation - the AI is analysing the root cause.
    INVESTIGATING = "investigating"
    # Being mitigated - a Proposal exists and awaits sign-off or is executing.
    MITIGATING = "mitigating"
    # Resolved - the service is back to normal.
    RESOLVED = "resolved"
    # Closed - human feedback is included; eligible for long-term memory.
    CLOSED = "closed"
    # Escalated - human intervention is required.
    ESCALATED = "escalated"


# =============================================================================
# Signal (原始告警)
# =============================================================================


class Signal(BaseModel):
    """
    Raw alert signal received from Prometheus / SigNoz / Alertmanager.

    Signals are the "perception input" of an Incident, and one Incident may
    hold several of them. Example: a CPU spike, a memory OOM and a pod restart
    can be three alerts that all belong to the same Incident.
    """

    # Short identifier: the first 8 hex characters of a random UUID4.
    signal_id: str = Field(
        description="信號唯一識別碼 (8 字元)",
        default_factory=lambda: uuid4().hex[:8],
    )
    # Alert name, e.g. HighCPUUsage.
    alert_name: str = Field(..., description="告警名稱 (如 HighCPUUsage)")
    # Severity reported for this alert.
    severity: Severity = Field(..., description="告警嚴重度")
    # Origin of the alert; restricted to the literal values listed below.
    source: Literal[
        "prometheus",
        "signoz",
        "alertmanager",
        "manual",
        "telegram",
    ] = Field(..., description="告警來源")
    # When the alert fired, and (optionally) when it cleared.
    fired_at: datetime = Field(..., description="告警觸發時間")
    resolved_at: datetime | None = Field(default=None, description="告警解除時間")
    # Prometheus-style labels, e.g. pod / namespace / service.
    labels: dict[str, str] = Field(
        description="Prometheus 標籤 (如 pod, namespace, service)",
        default_factory=dict,
    )
    # Extra alert information, e.g. summary / description.
    annotations: dict[str, str] = Field(
        description="告警附加資訊 (如 summary, description)",
        default_factory=dict,
    )
    # Alert fingerprint hash, meant for de-duplication and aggregation.
    fingerprint: str | None = Field(
        default=None,
        description="告警指紋 Hash，用於去重與聚合",
    )

    # [Chief Architect] json_encoders was removed (deprecated in Pydantic v2);
    # per the original note, native serialization output matches .isoformat().
    # v1.1 2026-04-01 Asia/Taipei


# =============================================================================
# AI Decision Chain (CISO 要求：可稽核性)
# =============================================================================


class AIDecisionChain(BaseModel):
    """
    AI decision chain: a full record of one inference, kept for auditing.

    CISO requirements:
    - The model and prompt version used by the AI must be recorded.
    - The reasoning steps must be recorded (explainability).
    - The inference latency must be recorded (performance monitoring).

    It exists to answer:
    - "Why did the AI make this recommendation?"
    - "What data did the AI consult at the time?"
    - "Can this decision be reproduced?"
    """

    # === Inputs ===
    # IDs of the alerts that triggered this inference.
    input_signal_ids: list[str] = Field(
        description="觸發此推論的告警 ID 列表",
        default_factory=list,
    )
    # Summaries of the context retrieved from memory.
    context_retrieved: list[str] = Field(
        description="從記憶中檢索的上下文摘要",
        default_factory=list,
    )

    # === Model information ===
    # Model identifier, e.g. ollama/llama3.2:latest or gemini/gemini-pro.
    model_used: str = Field(
        ...,
        description="使用的 AI 模型 (如 ollama/llama3.2:latest, gemini/gemini-pro)",
    )
    # Version of the prompt template.
    prompt_template_version: str = Field(
        description="Prompt 模板版本號",
        default="v1.0.0",
    )

    # === Inference result ===
    # The AI's root-cause hypothesis.
    hypothesis: str = Field(..., description="AI 的根因推論")
    # Confidence score, validated to lie within [0.0, 1.0].
    confidence: float = Field(
        ...,
        description="信心指數 (0.0 - 1.0)",
        ge=0.0,
        le=1.0,
    )
    # Step-by-step reasoning, recorded for explainability.
    reasoning_steps: list[str] = Field(
        description="推理步驟 (可解釋性)",
        default_factory=list,
    )

    # === GraphRAG result ===
    # Blast-radius analysis; reuses the model imported from approval.py.
    blast_radius: BlastRadius | None = Field(
        default=None,
        description="爆炸半徑分析結果 (復用現有模型)",
    )
    # Candidate root causes.
    probable_root_causes: list[str] = Field(
        description="可能的根本原因列表",
        default_factory=list,
    )

    # === Performance tracking ===
    # Inference start / end timestamps and the latency in milliseconds.
    # The three values are supplied by the caller; no consistency check
    # between them is defined in this class.
    inference_started_at: datetime = Field(..., description="推論開始時間")
    inference_completed_at: datetime = Field(..., description="推論完成時間")
    latency_ms: int = Field(..., description="推論延遲 (毫秒)")

    # [Chief Architect] json_encoders was removed (deprecated in Pydantic v2);
    # per the original note, native serialization output matches .isoformat().
    # v1.1 2026-04-01 Asia/Taipei


# =============================================================================
# Incident Frequency Stats (ADR-037: 異常頻率統計)
# =============================================================================


class IncidentFrequencyStats(BaseModel):
    """
    Incident frequency statistics (ADR-037 monitoring enhancement).

    2026-03-29 ogt: commander's directive - "A restart only treats the
    symptom; anomalies that happen too often must be fixed for good."

    Purpose:
    - Count how often the same anomaly occurred within several time windows.
    - Drive escalation of the repair strategy (Tier 1→4) from that frequency.
    - Show users how frequent the problem is.

    Escalation thresholds (documented policy; this model stores the resulting
    level but does not compute it):
    - REPEAT: ≥ 3 occurrences / 24h (mark as recurring)
    - ESCALATE: ≥ 5 occurrences / 24h (raise the Tier, notify the Owner)
    - PERMANENT_FIX: ≥ 10 occurrences / 24h (force a root-cause fix)
    """

    # Anomaly signature hash (first 16 characters).
    anomaly_key: str = Field(..., description="異常簽名 Hash (前 16 字元)")

    # Occurrence counters per time window; each is validated to be >= 0.
    count_1h: int = Field(default=0, ge=0, description="1 小時內發生次數")
    count_24h: int = Field(default=0, ge=0, description="24 小時內發生次數")
    count_7d: int = Field(default=0, ge=0, description="7 天內發生次數")
    count_30d: int = Field(default=0, ge=0, description="30 天內發生次數")

    # Escalation recommendation based on the 24h frequency; None when unset.
    escalation_level: Literal["REPEAT", "ESCALATE", "PERMANENT_FIX"] | None = Field(
        default=None,
        description="升級建議 (基於 24h 頻率)",
    )

    # Automatic repair bookkeeping.
    auto_repair_count: int = Field(default=0, ge=0, description="自動修復嘗試次數")
    last_repair_action: str | None = Field(
        default=None,
        description="最後一次修復動作",
    )
    last_repair_success: bool | None = Field(
        default=None,
        description="最後一次修復是否成功",
    )

    # 2026-04-07 Claude Code: Sprint 4 - alert handling statistics (A1)
    # Executions that ran after a human pressed approve.
    human_approved_count: int = Field(
        default=0,
        ge=0,
        description="人工按批准後執行次數",
    )
    # Times the alert resolved with no system repair record.
    manual_resolved_count: int = Field(
        default=0,
        ge=0,
        description="無系統修復紀錄但 resolved 次數",
    )
    # Times a first-time ("cold start") trust auto-approval was granted.
    cold_start_trust_count: int = Field(
        default=0,
        ge=0,
        description="首次信任自動放行次數",
    )
    # Total handled count (auto + human + manual + cold_start); stored as
    # given, not derived from the other counters in this class.
    total_resolution_count: int = Field(
        default=0,
        ge=0,
        description="總處置次數 (auto + human + manual + cold_start)",
    )


# =============================================================================
# Incident Outcome (CPO 要求：回饋循環)
# =============================================================================


class IncidentOutcome(BaseModel):
    """
    Incident outcome: the key feedback the AI learns from.

    CPO requirements:
    - The execution result (success / failure) must be recorded.
    - Human feedback must be collected (was the AI's advice effective?).
    - Whether the incident enters long-term memory must be flagged.

    This is what lets the AI "learn from experience":
    - If the AI's advice worked → reinforce the pattern.
    - If the AI's advice did not work → record it as a negative example.
    """

    # === Execution result ===
    # Whether a remediation proposal was executed.
    proposal_executed: bool = Field(
        description="是否已執行修復提案",
        default=False,
    )
    # Whether the execution succeeded (None = not executed).
    execution_success: bool | None = Field(
        default=None,
        description="執行是否成功 (None = 未執行)",
    )
    # Actual downtime in minutes.
    actual_downtime_minutes: int | None = Field(
        default=None,
        description="實際停機時間 (分鐘)",
    )

    # === Human feedback ===
    # Free-text feedback from a human.
    human_feedback: str | None = Field(
        default=None,
        description="人類的文字回饋 (如 '這個建議很準' 或 '下次應該先檢查 X')",
    )
    # Effectiveness rating; when provided it is validated to lie in 1..5.
    effectiveness_score: int | None = Field(
        default=None,
        description="有效性評分 (1-5 分)",
        ge=1,
        le=5,
    )

    # === Learning flags ===
    # Whether to keep this incident in long-term (Episodic) memory.
    should_remember: bool = Field(
        description="是否納入長期記憶 (Episodic Memory)",
        default=True,
    )
    # Learning notes left for the AI's future use.
    learning_notes: str | None = Field(
        default=None,
        description="給未來 AI 的學習筆記",
    )


# =============================================================================
# Incident (核心模型)
# =============================================================================


class Incident(BaseModel):
    """
    Incident model: the core data structure of the AWOOOI cognitive system.

    It is the foundation of the AWOOOI 2.0 "cognitive awakening" plan and
    carries:
    - Perception (Signals): the raw alerts
    - Cognition (Decision Chain): the AI inference process
    - Decision (Proposals): remediation suggestions
    - Memory (Outcome): result feedback

    Three-layer memory architecture:
    ┌─────────────────┐
    │ Working Memory  │ ← Redis Hash, 7-day TTL
    │ (active)        │
    └────────┬────────┘
             │ periodic migration
             ▼
    ┌─────────────────┐
    │ Episodic Memory │ ← PostgreSQL, kept permanently
    │ (history)       │
    └────────┬────────┘
             │ vectorization
             ▼
    ┌─────────────────┐
    │ Semantic Memory │ ← Vector DB, RAG retrieval
    │ (knowledge base)│
    └─────────────────┘
    """

    # === Identity ===
    # Default ID: "INC-" + current UTC date (YYYYMMDD) + "-" + the first six
    # hex characters of a random UUID4, upper-cased (e.g. INC-20260322-A1B2C3).
    incident_id: str = Field(
        description="事件唯一識別碼 (如 INC-20260322-A1B2C3)",
        default_factory=lambda: (
            f"INC-{datetime.now(timezone.utc).strftime('%Y%m%d')}"
            f"-{uuid4().hex[:6].upper()}"
        ),
    )

    # === State ===
    # New incidents start in INVESTIGATING unless a status is supplied.
    status: IncidentStatus = Field(
        description="事件狀態",
        default=IncidentStatus.INVESTIGATING,
    )
    severity: Severity = Field(..., description="事件嚴重度")

    # === Perception layer (Signals) ===
    # Alert signals associated with this incident.
    signals: list[Signal] = Field(
        description="關聯的告警信號列表",
        default_factory=list,
    )
    # Affected services (GraphRAG blast radius).
    affected_services: list[str] = Field(
        description="受影響的服務列表 (GraphRAG Blast Radius)",
        default_factory=list,
    )

    # === Cognition layer (AI) ===
    # Full AI inference record, when one exists.
    decision_chain: AIDecisionChain | None = Field(
        default=None,
        description="AI 決策鏈 (完整推論過程)",
    )

    # === Decision layer (Proposals) ===
    # Supports multiple decision tracks: Proposal A fails → Proposal B.
    proposal_ids: list[UUID] = Field(
        description="關聯的 ApprovalRequest ID 列表 (支援多重決策軌跡)",
        default_factory=list,
    )

    # === Result layer (Feedback Loop) ===
    # Outcome and human feedback; see the validator at the end of the class
    # for how legacy string values are handled.
    outcome: IncidentOutcome | None = Field(
        default=None,
        description="事件結果與人類回饋",
    )

    # === Frequency statistics (ADR-037) ===
    # 2026-03-29 ogt: commander's directive - "A restart only treats the
    # symptom; anomalies that happen too often must be fixed for good."
    frequency_stats: IncidentFrequencyStats | None = Field(
        default=None,
        description="異常頻率統計 (用於 Tier 分級修復策略)",
    )

    # === Timeline ===
    # created_at / updated_at each default to the current UTC time; the two
    # factories run independently of one another.
    created_at: datetime = Field(
        description="事件建立時間",
        default_factory=lambda: datetime.now(tz=timezone.utc),
    )
    updated_at: datetime = Field(
        description="最後更新時間",
        default_factory=lambda: datetime.now(tz=timezone.utc),
    )
    resolved_at: datetime | None = Field(default=None, description="事件解決時間")
    closed_at: datetime | None = Field(
        default=None,
        description="事件關閉時間 (含回饋)",
    )

    # === Memory management ===
    # Working Memory TTL in days.
    ttl_days: int = Field(description="Working Memory TTL (天)", default=7)
    # Whether the incident has been persisted to PostgreSQL (Episodic Memory).
    persisted_to_pg: bool = Field(
        description="是否已固化到 PostgreSQL (Episodic Memory)",
        default=False,
    )
    # Whether the incident has been vectorized into the Vector DB
    # (Semantic Memory).
    vectorized: bool = Field(
        description="是否已向量化到 Vector DB (Semantic Memory)",
        default=False,
    )

    # ADR-071-A: four alert notification types + full-lifecycle DB record
    # (2026-04-11 Claude Sonnet 4.6)
    # Notification type (TYPE-1/2/3/4/4D) and alert category.
    notification_type: str | None = Field(
        default=None,
        description="通知類型 TYPE-1/2/3/4/4D",
    )
    alert_category: str | None = Field(
        default=None,
        description="告警類別 k8s_workload/database/host_resource/...",
    )
    # MCP intelligence-gathering snapshot taken before execution.
    context_bundle: dict | None = Field(
        default=None,
        description="MCP 情報收集快照（執行前）",
    )
    # Metric snapshots before / after execution (Prometheus MCP).
    metrics_before: dict | None = Field(
        default=None,
        description="指標快照（執行前，Prometheus MCP）",
    )
    metrics_after: dict | None = Field(
        default=None,
        description="指標快照（執行後，Prometheus MCP）",
    )
    # Execution verification result (K8s MCP watch_rollout).
    verification_result: dict | None = Field(
        default=None,
        description="執行驗證結果（K8s MCP watch_rollout）",
    )
    # Manual fix steps (TYPE-4 user input) and who carried them out.
    manual_fix_steps: str | None = Field(
        default=None,
        description="手動修復步驟（TYPE-4 使用者輸入）",
    )
    manual_fix_by: str | None = Field(default=None, description="手動修復執行者")

    # [Chief Architect] json_encoders was removed (deprecated in Pydantic v2);
    # per the original note, native serialization output matches .isoformat().
    # v1.1 2026-04-01 Asia/Taipei

    # 2026-04-01 Claude Code: compatibility with legacy Redis data - outcome
    # may have been stored as a plain string such as "resolved".
    @field_validator("outcome", mode="before")
    @classmethod
    def coerce_outcome_string(cls, v: object) -> object:
        """Replace a string ``outcome`` with None before field validation.

        A legacy string cannot be turned back into an ``IncidentOutcome``, so
        it is discarded. Any non-string value is returned unchanged.
        """
        return None if isinstance(v, str) else v


# =============================================================================
# DTOs (Data Transfer Objects)
# =============================================================================


class IncidentCreate(BaseModel):
    """DTO for creating an incident.

    Only ``severity`` is required; ``signals`` and ``affected_services``
    default to empty lists.
    """

    severity: Severity
    signals: list[Signal] = Field(default_factory=list)
    affected_services: list[str] = Field(default_factory=list)


class IncidentUpdate(BaseModel):
    """DTO for updating an incident.

    Every field is optional and defaults to None.
    NOTE(review): None presumably means "leave this field unchanged" -
    confirm against the code that applies the update.
    """

    status: IncidentStatus | None = None
    severity: Severity | None = None
    affected_services: list[str] | None = None
    decision_chain: AIDecisionChain | None = None
    outcome: IncidentOutcome | None = None


class IncidentResponse(BaseModel):
    """Incident API response.

    Carries a subset of the ``Incident`` fields; ``proposal_ids`` is exposed
    as strings rather than UUID objects.
    """

    incident_id: str
    status: IncidentStatus
    severity: Severity
    signals: list[Signal]
    affected_services: list[str]
    decision_chain: AIDecisionChain | None
    proposal_ids: list[str]  # UUIDs rendered as strings
    outcome: IncidentOutcome | None
    created_at: datetime
    updated_at: datetime
    resolved_at: datetime | None
    closed_at: datetime | None

    @classmethod
    def from_incident(cls, incident: Incident) -> "IncidentResponse":
        """Build a response from an ``Incident``.

        Args:
            incident: The incident to convert.

        Returns:
            A new ``IncidentResponse`` holding the same values, with each
            proposal ID converted via ``str``.
        """
        # The response field is list[str], so stringify the UUIDs up front.
        stringified_ids = list(map(str, incident.proposal_ids))
        return cls(
            incident_id=incident.incident_id,
            status=incident.status,
            severity=incident.severity,
            signals=incident.signals,
            affected_services=incident.affected_services,
            decision_chain=incident.decision_chain,
            proposal_ids=stringified_ids,
            outcome=incident.outcome,
            created_at=incident.created_at,
            updated_at=incident.updated_at,
            resolved_at=incident.resolved_at,
            closed_at=incident.closed_at,
        )

    # [Chief Architect] json_encoders was removed (deprecated in Pydantic v2);
    # per the original note, native serialization output matches .isoformat().
    # v1.1 2026-04-01 Asia/Taipei