Files
awoooi/apps/api/src/models/incident.py
OG T 196d269b92 feat: add all application source code
- apps/api: FastAPI backend with Dockerfile
- apps/web: Next.js frontend with Dockerfile
- apps/sensor: Signal collection agent
- packages: shared packages

Co-Authored-By: Claude <noreply@anthropic.com>
2026-03-22 18:57:44 +08:00

423 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Incident Schema v0.3 - 認知覺醒計畫核心資料結構
=================================================
C-Suite 戰略會議決議 (2026-03-22):
- AWOOOI 定位為 AI Ops OS (決策層)
- 三層記憶架構: Working (Redis) + Episodic (PG) + Semantic (Vector)
- 復用現有 approval.py 子模型,避免重複定義
設計原則:
1. 復用現有 approval.py 的子模型 (BlastRadius, DryRunCheck)
2. Severity (P0-P3) 用於事件嚴重度,RiskLevel 用於操作風險
3. proposal_ids 支援多重決策軌跡
4. 完整的 AI 決策鏈可稽核性 (CISO 要求)
5. Feedback Loop 回饋循環 (CPO 要求)
三層記憶對應:
- Working Memory (Redis): 活躍事件,7 天 TTL
- Episodic Memory (PostgreSQL): 歷史事件,永久保留
- Semantic Memory (Vector DB): 向量化後的知識,供 RAG 檢索
"""
from datetime import datetime, timezone
from enum import Enum
from typing import Literal
from uuid import UUID, uuid4
from pydantic import BaseModel, Field
# 復用現有模型 (避免重複定義)
from src.models.approval import BlastRadius, DryRunCheck
# =============================================================================
# Incident 專用 Enums
# =============================================================================
class Severity(str, Enum):
    """Incident severity level, from P0 (most severe) to P3 (least severe).

    How this differs from RiskLevel:
    - Severity rates how bad the incident itself is (P0 is the worst).
    - RiskLevel rates how risky a remediation action is (CRITICAL is the
      most dangerous).

    According to the design notes, severity is intended to drive:
    - the tiered AI invocation strategy (P0 goes straight to Claude,
      P2/P3 use Ollama),
    - SLA response-time thresholds,
    - alert notification priority.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    # Critical: service completely down; 5-minute response target.
    P0 = "P0"
    # High: service severely degraded; 15-minute response target.
    P1 = "P1"
    # Medium: service partially affected; 1-hour response target.
    P2 = "P2"
    # Low: minor impact; 4-hour response target.
    P3 = "P3"
class IncidentStatus(str, Enum):
    """Lifecycle states of an incident.

    Intended flow (documented only; this enum does not enforce transitions):

        INVESTIGATING -> MITIGATING -> RESOLVED -> CLOSED
                              \\-> (cannot be resolved) -> ESCALATED

    Members are also ``str`` instances, so they compare equal to their
    lowercase string values.
    """

    # Under investigation: the AI is analysing the root cause.
    INVESTIGATING = "investigating"
    # Being handled: a Proposal exists and is awaiting approval or executing.
    MITIGATING = "mitigating"
    # Resolved: the service is back to normal.
    RESOLVED = "resolved"
    # Closed: human feedback recorded; eligible for long-term memory.
    CLOSED = "closed"
    # Escalated: manual intervention is required.
    ESCALATED = "escalated"
# =============================================================================
# Signal (原始告警)
# =============================================================================
class Signal(BaseModel):
    """A raw alert signal received from Prometheus / SigNoz / Alertmanager.

    Signals are the "perception input" of an Incident, and one Incident may
    aggregate several of them. For example, a CPU spike, a memory OOM and a
    pod restart can all belong to the same Incident.
    """

    # Short identifier: the first 8 characters of a freshly generated UUID4.
    signal_id: str = Field(
        description="信號唯一識別碼 (8 字元)",
        default_factory=lambda: str(uuid4())[:8],
    )

    # Required descriptive fields supplied by the alert source.
    alert_name: str = Field(
        ...,
        description="告警名稱 (如 HighCPUUsage)",
    )
    severity: Severity = Field(
        ...,
        description="告警嚴重度",
    )
    # Only these five origin identifiers are accepted.
    source: Literal[
        "prometheus",
        "signoz",
        "alertmanager",
        "manual",
        "telegram",
    ] = Field(..., description="告警來源")

    # Firing time is mandatory; resolution time stays None until it is set.
    fired_at: datetime = Field(
        ...,
        description="告警觸發時間",
    )
    resolved_at: datetime | None = Field(
        default=None,
        description="告警解除時間",
    )

    # Free-form string metadata; each instance gets its own empty dict.
    labels: dict[str, str] = Field(
        description="Prometheus 標籤 (如 pod, namespace, service)",
        default_factory=dict,
    )
    annotations: dict[str, str] = Field(
        description="告警附加資訊 (如 summary, description)",
        default_factory=dict,
    )

    # Optional fingerprint hash, described as being for dedup and aggregation.
    fingerprint: str | None = Field(
        default=None,
        description="告警指紋 Hash用於去重與聚合",
    )

    class Config:
        # Emit datetimes as ISO-8601 strings when encoding to JSON.
        # NOTE(review): class-based Config with json_encoders is the
        # pydantic v1 style - confirm against the installed version.
        json_encoders = {
            datetime: lambda moment: moment.isoformat(),
        }
# =============================================================================
# AI Decision Chain (CISO 要求:可稽核性)
# =============================================================================
class AIDecisionChain(BaseModel):
    """Audit record of a single AI inference (a CISO auditability requirement).

    The stated requirements are to record:
    - which model and prompt version the AI used,
    - the reasoning steps (explainability),
    - the inference latency (performance monitoring).

    It is meant to answer questions such as:
    - "Why did the AI make this recommendation?"
    - "What data did the AI consult at the time?"
    - "Can this decision be reproduced?"
    """

    # --- Inputs ---
    input_signal_ids: list[str] = Field(
        description="觸發此推論的告警 ID 列表",
        default_factory=list,
    )
    context_retrieved: list[str] = Field(
        description="從記憶中檢索的上下文摘要",
        default_factory=list,
    )

    # --- Model information ---
    model_used: str = Field(
        ...,
        description="使用的 AI 模型 (如 ollama/llama3.2:latest, gemini/gemini-pro)",
    )
    prompt_template_version: str = Field(
        description="Prompt 模板版本號",
        default="v1.0.0",
    )

    # --- Inference result ---
    hypothesis: str = Field(
        ...,
        description="AI 的根因推論",
    )
    # Validated to lie within the closed interval [0.0, 1.0].
    confidence: float = Field(
        ...,
        description="信心指數 (0.0 - 1.0)",
        ge=0.0,
        le=1.0,
    )
    reasoning_steps: list[str] = Field(
        description="推理步驟 (可解釋性)",
        default_factory=list,
    )

    # --- GraphRAG result ---
    # BlastRadius is reused from the approval models rather than redefined.
    blast_radius: BlastRadius | None = Field(
        default=None,
        description="爆炸半徑分析結果 (復用現有模型)",
    )
    probable_root_causes: list[str] = Field(
        description="可能的根本原因列表",
        default_factory=list,
    )

    # --- Performance tracking ---
    # All three are required; nothing here cross-checks latency_ms against
    # the two timestamps.
    inference_started_at: datetime = Field(
        ...,
        description="推論開始時間",
    )
    inference_completed_at: datetime = Field(
        ...,
        description="推論完成時間",
    )
    latency_ms: int = Field(
        ...,
        description="推論延遲 (毫秒)",
    )

    class Config:
        # Emit datetimes as ISO-8601 strings when encoding to JSON.
        # NOTE(review): class-based Config with json_encoders is the
        # pydantic v1 style - confirm against the installed version.
        json_encoders = {
            datetime: lambda moment: moment.isoformat(),
        }
# =============================================================================
# Incident Outcome (CPO 要求:回饋循環)
# =============================================================================
class IncidentOutcome(BaseModel):
    """Outcome of an incident, the feedback the AI learns from (CPO requirement).

    The stated requirements are to:
    - record the execution result (success / failure),
    - collect human feedback on whether the AI's suggestion worked,
    - flag whether the incident should enter long-term memory.

    This is described as the key to "learning from experience":
    - an effective suggestion reinforces the pattern,
    - an ineffective suggestion is recorded as a negative example.
    """

    # --- Execution result ---
    proposal_executed: bool = Field(
        description="是否已執行修復提案",
        default=False,
    )
    # Tri-state: None means the proposal was not executed.
    execution_success: bool | None = Field(
        default=None,
        description="執行是否成功 (None = 未執行)",
    )
    actual_downtime_minutes: int | None = Field(
        default=None,
        description="實際停機時間 (分鐘)",
    )

    # --- Human feedback ---
    human_feedback: str | None = Field(
        default=None,
        description="人類的文字回饋 (如 '這個建議很準''下次應該先檢查 X')",
    )
    # When provided, validated to be an integer from 1 to 5 inclusive.
    effectiveness_score: int | None = Field(
        default=None,
        description="有效性評分 (1-5 分)",
        ge=1,
        le=5,
    )

    # --- Learning flags ---
    # Defaults to True: outcomes are remembered unless explicitly opted out.
    should_remember: bool = Field(
        description="是否納入長期記憶 (Episodic Memory)",
        default=True,
    )
    learning_notes: str | None = Field(
        default=None,
        description="給未來 AI 的學習筆記",
    )
# =============================================================================
# Incident (核心模型)
# =============================================================================
class Incident(BaseModel):
    """Incident model, the core data structure of the AWOOOI cognitive system.

    Described as the foundation of the AWOOOI 2.0 "cognitive awakening"
    plan, it carries four layers:
    - Perception (signals): the raw alerts
    - Cognition (decision chain): the AI inference process
    - Decision (proposals): remediation suggestions
    - Memory (outcome): result feedback

    Three-tier memory architecture, per the design notes:

        +-----------------+
        | Working Memory  |  <- Redis Hash, 7-day TTL
        | (active)        |
        +--------+--------+
                 | periodic migration
        +-----------------+
        | Episodic Memory |  <- PostgreSQL, kept permanently
        | (historical)    |
        +--------+--------+
                 | vectorisation
        +-----------------+
        | Semantic Memory |  <- Vector DB, RAG retrieval
        | (knowledge)     |
        +-----------------+
    """

    # --- Identity ---
    # "INC-<UTC date as YYYYMMDD>-<first 6 chars of a UUID4, upper-cased>".
    incident_id: str = Field(
        description="事件唯一識別碼 (如 INC-20260322-A1B2C3)",
        default_factory=lambda: (
            f"INC-{datetime.now(timezone.utc).strftime('%Y%m%d')}"
            f"-{str(uuid4())[:6].upper()}"
        ),
    )

    # --- State ---
    status: IncidentStatus = Field(
        description="事件狀態",
        default=IncidentStatus.INVESTIGATING,
    )
    severity: Severity = Field(
        ...,
        description="事件嚴重度",
    )

    # --- Perception layer (signals) ---
    signals: list[Signal] = Field(
        description="關聯的告警信號列表",
        default_factory=list,
    )
    affected_services: list[str] = Field(
        description="受影響的服務列表 (GraphRAG Blast Radius)",
        default_factory=list,
    )

    # --- Cognition layer (AI) ---
    decision_chain: AIDecisionChain | None = Field(
        default=None,
        description="AI 決策鏈 (完整推論過程)",
    )

    # --- Decision layer (proposals) ---
    # A list so several decision tracks fit: Proposal A fails -> Proposal B.
    proposal_ids: list[UUID] = Field(
        description="關聯的 ApprovalRequest ID 列表 (支援多重決策軌跡)",
        default_factory=list,
    )

    # --- Result layer (feedback loop) ---
    outcome: IncidentOutcome | None = Field(
        default=None,
        description="事件結果與人類回饋",
    )

    # --- Timeline ---
    # Both default to the timezone-aware UTC time at instantiation; nothing
    # in this model refreshes updated_at afterwards.
    created_at: datetime = Field(
        description="事件建立時間",
        default_factory=lambda: datetime.now(timezone.utc),
    )
    updated_at: datetime = Field(
        description="最後更新時間",
        default_factory=lambda: datetime.now(timezone.utc),
    )
    resolved_at: datetime | None = Field(
        default=None,
        description="事件解決時間",
    )
    closed_at: datetime | None = Field(
        default=None,
        description="事件關閉時間 (含回饋)",
    )

    # --- Memory management ---
    ttl_days: int = Field(
        description="Working Memory TTL (天)",
        default=7,
    )
    persisted_to_pg: bool = Field(
        description="是否已固化到 PostgreSQL (Episodic Memory)",
        default=False,
    )
    vectorized: bool = Field(
        description="是否已向量化到 Vector DB (Semantic Memory)",
        default=False,
    )

    class Config:
        # JSON encoding: datetimes as ISO-8601 strings, UUIDs via str().
        # NOTE(review): class-based Config with json_encoders is the
        # pydantic v1 style - confirm against the installed version.
        json_encoders = {
            datetime: lambda moment: moment.isoformat(),
            UUID: str,
        }
# =============================================================================
# DTOs (Data Transfer Objects)
# =============================================================================
class IncidentCreate(BaseModel):
    """DTO carrying the fields a caller supplies to create an incident."""

    # Required: the severity of the new incident.
    severity: Severity

    # Optional collections; each instance gets its own fresh empty list.
    signals: list[Signal] = Field(
        default_factory=list,
    )
    affected_services: list[str] = Field(
        default_factory=list,
    )
class IncidentUpdate(BaseModel):
    """DTO for updating an incident; every field is optional.

    All fields default to None. Presumably None means "leave this field
    unchanged" - confirm against the code that applies the update.
    """

    # State fields.
    status: IncidentStatus | None = None
    severity: Severity | None = None

    # Perception / cognition / result payloads.
    affected_services: list[str] | None = None
    decision_chain: AIDecisionChain | None = None
    outcome: IncidentOutcome | None = None
class IncidentResponse(BaseModel):
    """API response shape for an incident.

    It carries the identity, state, signals, AI decision chain, outcome and
    timeline fields of Incident, with proposal IDs exposed as strings.
    """

    incident_id: str
    status: IncidentStatus
    severity: Severity
    signals: list[Signal]
    affected_services: list[str]
    decision_chain: AIDecisionChain | None
    # UUIDs from the Incident are exposed as plain strings here.
    proposal_ids: list[str]
    outcome: IncidentOutcome | None
    created_at: datetime
    updated_at: datetime
    resolved_at: datetime | None
    closed_at: datetime | None

    @classmethod
    def from_incident(cls, incident: Incident) -> "IncidentResponse":
        """Build a response from an Incident.

        Args:
            incident: The Incident to convert.

        Returns:
            A new IncidentResponse whose fields are copied from ``incident``,
            with each proposal ID converted via ``str()``.
        """
        # Attributes carried over unchanged, by name.
        passthrough = (
            "incident_id",
            "status",
            "severity",
            "signals",
            "affected_services",
            "decision_chain",
            "outcome",
            "created_at",
            "updated_at",
            "resolved_at",
            "closed_at",
        )
        payload = {attr: getattr(incident, attr) for attr in passthrough}
        # The only transformed field: UUID -> str.
        payload["proposal_ids"] = list(map(str, incident.proposal_ids))
        return cls(**payload)

    class Config:
        # Emit datetimes as ISO-8601 strings when encoding to JSON.
        # NOTE(review): class-based Config with json_encoders is the
        # pydantic v1 style - confirm against the installed version.
        json_encoders = {
            datetime: lambda moment: moment.isoformat(),
        }