- apps/api: FastAPI backend with Dockerfile - apps/web: Next.js frontend with Dockerfile - apps/sensor: Signal collection agent - packages: shared packages Co-Authored-By: Claude <noreply@anthropic.com>
423 lines
13 KiB
Python
423 lines
13 KiB
Python
"""
|
||
Incident Schema v0.3 - 認知覺醒計畫核心資料結構
|
||
=================================================
|
||
|
||
C-Suite 戰略會議決議 (2026-03-22):
|
||
- AWOOOI 定位為 AI Ops OS (決策層)
|
||
- 三層記憶架構: Working (Redis) + Episodic (PG) + Semantic (Vector)
|
||
- 復用現有 approval.py 子模型,避免重複定義
|
||
|
||
設計原則:
|
||
1. 復用現有 approval.py 的子模型 (BlastRadius, DryRunCheck)
|
||
2. Severity (P0-P3) 用於事件嚴重度,RiskLevel 用於操作風險
|
||
3. proposal_ids 支援多重決策軌跡
|
||
4. 完整的 AI 決策鏈可稽核性 (CISO 要求)
|
||
5. Feedback Loop 回饋循環 (CPO 要求)
|
||
|
||
三層記憶對應:
|
||
- Working Memory (Redis): 活躍事件,7 天 TTL
|
||
- Episodic Memory (PostgreSQL): 歷史事件,永久保留
|
||
- Semantic Memory (Vector DB): 向量化後的知識,供 RAG 檢索
|
||
"""
|
||
|
||
from datetime import datetime, timezone
|
||
from enum import Enum
|
||
from typing import Literal
|
||
from uuid import UUID, uuid4
|
||
|
||
from pydantic import BaseModel, Field
|
||
|
||
# Reuse existing models (avoid duplicate definitions)
|
||
from src.models.approval import BlastRadius, DryRunCheck
|
||
|
||
|
||
# =============================================================================
# Incident-specific Enums
# =============================================================================
|
||
|
||
|
||
class Severity(str, Enum):
    """
    Incident severity.

    How it differs from RiskLevel:
    - Severity: how severe the incident itself is (P0 is the most severe)
    - RiskLevel: how risky the remediation action is (CRITICAL is the most dangerous)

    Used for:
    - Tiered AI invocation strategy (P0 goes straight to Claude, P2/P3 use Ollama)
    - SLA response-time thresholds
    - Alert notification priority
    """

    P0 = "P0"  # Critical - complete service outage, 5-minute response
    P1 = "P1"  # High - severely degraded service, 15-minute response
    P2 = "P2"  # Medium - partial service impact, 1-hour response
    P3 = "P3"  # Low - minor impact, 4-hour response
|
||
|
||
|
||
class IncidentStatus(str, Enum):
    """
    Incident state machine.

    INVESTIGATING → MITIGATING → RESOLVED → CLOSED
                         ↘ (cannot be resolved) → ESCALATED
    """

    INVESTIGATING = "investigating"  # Investigating - AI is analysing the root cause
    MITIGATING = "mitigating"  # Mitigating - a Proposal exists; awaiting approval or executing
    RESOLVED = "resolved"  # Resolved - service is back to normal
    CLOSED = "closed"  # Closed - includes human feedback; eligible for long-term memory
    ESCALATED = "escalated"  # Escalated - human intervention required
|
||
|
||
|
||
# =============================================================================
# Signal (raw alert)
# =============================================================================
|
||
|
||
|
||
class Signal(BaseModel):
    """
    Raw alert signal - received from Prometheus/SigNoz/Alertmanager.

    This is the "perception input" of an Incident; one Incident may contain
    several Signals. For example, a CPU spike, a memory OOM and a pod restart
    may all belong to the same Incident.
    """

    # Defaults to the first 8 characters of a freshly generated UUID4 string.
    signal_id: str = Field(
        default_factory=lambda: str(uuid4())[:8],
        description="信號唯一識別碼 (8 字元)",
    )
    alert_name: str = Field(..., description="告警名稱 (如 HighCPUUsage)")
    severity: Severity = Field(..., description="告警嚴重度")
    # Closed set of accepted alert origins.
    source: Literal["prometheus", "signoz", "alertmanager", "manual", "telegram"] = (
        Field(..., description="告警來源")
    )
    fired_at: datetime = Field(..., description="告警觸發時間")
    resolved_at: datetime | None = Field(None, description="告警解除時間")
    labels: dict[str, str] = Field(
        default_factory=dict,
        description="Prometheus 標籤 (如 pod, namespace, service)",
    )
    annotations: dict[str, str] = Field(
        default_factory=dict,
        description="告警附加資訊 (如 summary, description)",
    )
    fingerprint: str | None = Field(
        None,
        description="告警指紋 Hash,用於去重與聚合",
    )

    class Config:
        # Encode datetime values through datetime.isoformat() when dumping JSON.
        json_encoders = {
            datetime: lambda v: v.isoformat(),
        }
|
||
|
||
|
||
# =============================================================================
# AI Decision Chain (CISO requirement: auditability)
# =============================================================================
|
||
|
||
|
||
class AIDecisionChain(BaseModel):
    """
    AI decision chain - a full record of the inference process, kept for audit.

    CISO requirements:
    - The AI model and prompt version used must be recorded
    - The reasoning steps must be recorded (explainability)
    - The inference latency must be recorded (performance monitoring)

    Used to answer:
    - "Why did the AI make this recommendation?"
    - "What data did the AI consult at the time?"
    - "Can this decision be reproduced?"
    """

    # === Input ===
    input_signal_ids: list[str] = Field(
        default_factory=list,
        description="觸發此推論的告警 ID 列表",
    )
    context_retrieved: list[str] = Field(
        default_factory=list,
        description="從記憶中檢索的上下文摘要",
    )

    # === Model information ===
    model_used: str = Field(
        ...,
        description="使用的 AI 模型 (如 ollama/llama3.2:latest, gemini/gemini-pro)",
    )
    prompt_template_version: str = Field(
        default="v1.0.0",
        description="Prompt 模板版本號",
    )

    # === Inference result ===
    hypothesis: str = Field(..., description="AI 的根因推論")
    # Validated to lie within the closed interval [0.0, 1.0].
    confidence: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description="信心指數 (0.0 - 1.0)",
    )
    reasoning_steps: list[str] = Field(
        default_factory=list,
        description="推理步驟 (可解釋性)",
    )

    # === GraphRAG result ===
    blast_radius: BlastRadius | None = Field(
        None,
        description="爆炸半徑分析結果 (復用現有模型)",
    )
    probable_root_causes: list[str] = Field(
        default_factory=list,
        description="可能的根本原因列表",
    )

    # === Performance tracking ===
    inference_started_at: datetime = Field(..., description="推論開始時間")
    inference_completed_at: datetime = Field(..., description="推論完成時間")
    # Supplied by the caller; not derived from the two timestamps above.
    latency_ms: int = Field(..., description="推論延遲 (毫秒)")

    class Config:
        # Encode datetime values through datetime.isoformat() when dumping JSON.
        json_encoders = {
            datetime: lambda v: v.isoformat(),
        }
|
||
|
||
|
||
# =============================================================================
# Incident Outcome (CPO requirement: feedback loop)
# =============================================================================
|
||
|
||
|
||
class IncidentOutcome(BaseModel):
    """
    Incident outcome - the key feedback the AI learns from.

    CPO requirements:
    - The execution result must be recorded (success/failure)
    - Human feedback must be collected (was the AI recommendation effective?)
    - Whether to store the incident in long-term memory must be flagged

    This is what lets the AI "learn from experience":
    - If the AI's recommendation worked → reinforce the pattern
    - If the AI's recommendation did not work → record it as a negative case
    """

    # === Execution result ===
    proposal_executed: bool = Field(
        default=False,
        description="是否已執行修復提案",
    )
    # None means the proposal was not executed.
    execution_success: bool | None = Field(
        None,
        description="執行是否成功 (None = 未執行)",
    )
    actual_downtime_minutes: int | None = Field(
        None,
        description="實際停機時間 (分鐘)",
    )

    # === Human feedback ===
    human_feedback: str | None = Field(
        None,
        description="人類的文字回饋 (如 '這個建議很準' 或 '下次應該先檢查 X')",
    )
    # When provided, validated to be an integer from 1 to 5 inclusive.
    effectiveness_score: int | None = Field(
        None,
        ge=1,
        le=5,
        description="有效性評分 (1-5 分)",
    )

    # === Learning flags ===
    should_remember: bool = Field(
        default=True,
        description="是否納入長期記憶 (Episodic Memory)",
    )
    learning_notes: str | None = Field(
        None,
        description="給未來 AI 的學習筆記",
    )
|
||
|
||
|
||
# =============================================================================
# Incident (core model)
# =============================================================================
|
||
|
||
|
||
class Incident(BaseModel):
    """
    Incident model - the core data structure of the AWOOOI cognitive system.

    This is the cornerstone of the AWOOOI 2.0 "Cognitive Awakening Project"
    and carries:
    - Perception (Signals): raw alerts
    - Cognition (Decision Chain): the AI inference process
    - Decision (Proposals): remediation proposals
    - Memory (Outcome): outcome feedback

    Three-tier memory architecture:
    ┌─────────────────┐
    │ Working Memory  │ ← Redis Hash, 7-day TTL
    │ (active events) │
    └────────┬────────┘
             │ periodic migration
             ▼
    ┌─────────────────┐
    │ Episodic Memory │ ← PostgreSQL, retained permanently
    │ (past events)   │
    └────────┬────────┘
             │ vectorization
             ▼
    ┌─────────────────┐
    │ Semantic Memory │ ← Vector DB, RAG retrieval
    │ (knowledge base)│
    └─────────────────┘
    """

    # === Identity ===
    # Default: "INC-" + current UTC date (YYYYMMDD) + "-" + the first 6
    # characters of a UUID4 string, upper-cased.
    incident_id: str = Field(
        default_factory=lambda: f"INC-{datetime.now(timezone.utc).strftime('%Y%m%d')}-{str(uuid4())[:6].upper()}",
        description="事件唯一識別碼 (如 INC-20260322-A1B2C3)",
    )

    # === State ===
    status: IncidentStatus = Field(
        default=IncidentStatus.INVESTIGATING,
        description="事件狀態",
    )
    severity: Severity = Field(..., description="事件嚴重度")

    # === Perception layer (Signals) ===
    signals: list[Signal] = Field(
        default_factory=list,
        description="關聯的告警信號列表",
    )
    affected_services: list[str] = Field(
        default_factory=list,
        description="受影響的服務列表 (GraphRAG Blast Radius)",
    )

    # === Cognition layer (AI) ===
    decision_chain: AIDecisionChain | None = Field(
        None,
        description="AI 決策鏈 (完整推論過程)",
    )

    # === Decision layer (Proposals) ===
    # Supports multiple decision trajectories: Proposal A fails → Proposal B
    proposal_ids: list[UUID] = Field(
        default_factory=list,
        description="關聯的 ApprovalRequest ID 列表 (支援多重決策軌跡)",
    )

    # === Outcome layer (Feedback Loop) ===
    outcome: IncidentOutcome | None = Field(
        None,
        description="事件結果與人類回饋",
    )

    # === Timeline ===
    # created_at and updated_at each call datetime.now(timezone.utc)
    # independently, so their default values can differ slightly.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        description="事件建立時間",
    )
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        description="最後更新時間",
    )
    resolved_at: datetime | None = Field(
        None,
        description="事件解決時間",
    )
    closed_at: datetime | None = Field(
        None,
        description="事件關閉時間 (含回饋)",
    )

    # === Memory management ===
    ttl_days: int = Field(
        default=7,
        description="Working Memory TTL (天)",
    )
    persisted_to_pg: bool = Field(
        default=False,
        description="是否已固化到 PostgreSQL (Episodic Memory)",
    )
    vectorized: bool = Field(
        default=False,
        description="是否已向量化到 Vector DB (Semantic Memory)",
    )

    class Config:
        # Encode datetimes via isoformat() and UUIDs via str() when dumping JSON.
        json_encoders = {
            datetime: lambda v: v.isoformat(),
            UUID: lambda v: str(v),
        }
|
||
|
||
|
||
# =============================================================================
# DTOs (Data Transfer Objects)
# =============================================================================
|
||
|
||
|
||
class IncidentCreate(BaseModel):
    """DTO for creating an incident.

    Only ``severity`` is required; ``signals`` and ``affected_services``
    default to empty lists.
    """

    severity: Severity
    signals: list[Signal] = Field(default_factory=list)
    affected_services: list[str] = Field(default_factory=list)
|
||
|
||
|
||
class IncidentUpdate(BaseModel):
    """DTO for updating an incident.

    Every field is optional and defaults to ``None``.
    """

    status: IncidentStatus | None = None
    severity: Severity | None = None
    affected_services: list[str] | None = None
    decision_chain: AIDecisionChain | None = None
    outcome: IncidentOutcome | None = None
|
||
|
||
|
||
class IncidentResponse(BaseModel):
    """Incident API response.

    Carries the identity, state, signals, decision chain, proposals, outcome
    and timeline fields of ``Incident``; the memory-management fields
    (``ttl_days``, ``persisted_to_pg``, ``vectorized``) are not included.
    """

    incident_id: str
    status: IncidentStatus
    severity: Severity
    signals: list[Signal]
    affected_services: list[str]
    decision_chain: AIDecisionChain | None
    proposal_ids: list[str]  # converted to strings
    outcome: IncidentOutcome | None
    created_at: datetime
    updated_at: datetime
    resolved_at: datetime | None
    closed_at: datetime | None

    @classmethod
    def from_incident(cls, incident: Incident) -> "IncidentResponse":
        """Build a response from an ``Incident``.

        Fields are copied one-to-one, except ``proposal_ids``, whose UUID
        entries are converted with ``str()``.
        """
        return cls(
            incident_id=incident.incident_id,
            status=incident.status,
            severity=incident.severity,
            signals=incident.signals,
            affected_services=incident.affected_services,
            decision_chain=incident.decision_chain,
            proposal_ids=[str(pid) for pid in incident.proposal_ids],
            outcome=incident.outcome,
            created_at=incident.created_at,
            updated_at=incident.updated_at,
            resolved_at=incident.resolved_at,
            closed_at=incident.closed_at,
        )

    class Config:
        # Encode datetime values through datetime.isoformat() when dumping JSON.
        json_encoders = {
            datetime: lambda v: v.isoformat(),
        }
|