Files
awoooi/apps/api/src/models/incident.py
Your Name 14fb08bcfe revert(models): restore src.* imports in __init__.py + incident.py
Task A3 implementer 誤把既有 `from src.models.*` 改成 `from apps.api.src.models.*`
導致 tests/test_action_parsing.py 等既有測試 collect 失敗
(ModuleNotFoundError: No module named 'apps.api.src.models').

pytest rootdir=apps/api(由 pyproject.toml testpaths=["tests"]),
所以 awoooi 慣例為 `from src.*` 絕對路徑,切勿改。

A3 test file (test_aider_event_models.py) 已用正確 src.models.aider,
無需動。

15 tests (A2+A3) 過,existing tests 恢復(test_action_parsing: 24 collected)。

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 04:11:59 +08:00

521 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
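Incident Schema v0.3 - core data structures of the Cognitive Awakening project
==============================================================================
C-Suite strategy meeting resolutions (2026-03-22):
- AWOOOI is positioned as an AI Ops OS (the decision layer)
- Three-tier memory architecture: Working (Redis) + Episodic (PG) + Semantic (Vector)
- Reuse the existing approval.py sub-models to avoid duplicate definitions

Design principles:
1. Reuse the sub-models already defined in approval.py (BlastRadius, DryRunCheck)
2. Severity (P0-P3) describes incident severity; RiskLevel describes operation risk
3. proposal_ids supports multiple decision trajectories
4. Full auditability of the AI decision chain (CISO requirement)
5. Feedback loop (CPO requirement)

Mapping to the three memory tiers:
- Working Memory (Redis): active incidents, 7-day TTL
- Episodic Memory (PostgreSQL): historical incidents, retained permanently
- Semantic Memory (Vector DB): vectorized knowledge used for RAG retrieval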
"""
from datetime import datetime, timezone
from enum import Enum
from typing import Literal
from uuid import UUID, uuid4
from pydantic import BaseModel, Field, field_validator
# 復用現有模型 (避免重複定義)
from src.models.approval import BlastRadius
# =============================================================================
# Incident 專用 Enums
# =============================================================================
class Severity(str, Enum):
    """Incident severity, ranked from P0 (most severe) down to P3.

    How this differs from RiskLevel:
    - Severity: how serious the incident itself is (P0 is the worst).
    - RiskLevel: how risky a remediation action is (CRITICAL is the most
      dangerous).

    Used for:
    - The tiered AI invocation strategy (P0 goes straight to Claude; P2/P3
      use Ollama).
    - SLA response-time thresholds.
    - Alert notification priority.
    """

    # Critical - complete service outage, 5-minute response
    P0 = "P0"
    # High - severe service degradation, 15-minute response
    P1 = "P1"
    # Medium - partial service impact, 1-hour response
    P2 = "P2"
    # Low - minor impact, 4-hour response
    P3 = "P3"
class IncidentStatus(str, Enum):
    """Incident lifecycle state machine.

    INVESTIGATING -> MITIGATING -> RESOLVED -> CLOSED
                  \\-> (cannot be resolved) -> ESCALATED
    """

    # Investigating - the AI is analysing the root cause
    INVESTIGATING = "investigating"
    # Mitigating - a Proposal exists; awaiting approval or being executed
    MITIGATING = "mitigating"
    # Resolved - the service is back to normal
    RESOLVED = "resolved"
    # Closed - includes human feedback; eligible for long-term memory
    CLOSED = "closed"
    # Escalated - human intervention is required
    ESCALATED = "escalated"
# =============================================================================
# Signal (原始告警)
# =============================================================================
class Signal(BaseModel):
    """Raw alert signal received from Prometheus / SignOz / Alertmanager.

    A Signal is the "perception input" of an Incident, and one Incident may
    contain several Signals. For example, a CPU spike, a memory OOM and a pod
    restart may all belong to the same Incident.
    """

    # Defaults to the first 8 characters of a freshly generated UUID4 string.
    signal_id: str = Field(
        description="信號唯一識別碼 (8 字元)",
        default_factory=lambda: str(uuid4())[:8],
    )
    alert_name: str = Field(..., description="告警名稱 (如 HighCPUUsage)")
    severity: Severity = Field(..., description="告警嚴重度")
    # Closed set of accepted alert origins.
    source: Literal[
        "prometheus",
        "signoz",
        "alertmanager",
        "manual",
        "telegram",
    ] = Field(..., description="告警來源")
    fired_at: datetime = Field(..., description="告警觸發時間")
    # None while the alert has not been resolved.
    resolved_at: datetime | None = Field(default=None, description="告警解除時間")
    labels: dict[str, str] = Field(
        description="Prometheus 標籤 (如 pod, namespace, service)",
        default_factory=dict,
    )
    annotations: dict[str, str] = Field(
        description="告警附加資訊 (如 summary, description)",
        default_factory=dict,
    )
    # Optional hash of the alert, intended for de-duplication and aggregation.
    fingerprint: str | None = Field(
        default=None,
        description="告警指紋 Hash用於去重與聚合",
    )
    # [Chief Architect] json_encoders removed (deprecated in Pydantic v2); the
    # native serialization output matches .isoformat(). v1.1 2026-04-01 Asia/Taipei
# =============================================================================
# AI Decision Chain (CISO 要求:可稽核性)
# =============================================================================
class AIDecisionChain(BaseModel):
    """Complete record of one AI inference, kept for audit purposes.

    CISO requirements:
    - The model used and the prompt version must be recorded.
    - The reasoning steps must be recorded (explainability).
    - The inference latency must be recorded (performance monitoring).

    It exists to answer:
    - "Why did the AI make this recommendation?"
    - "Which data did the AI consult at the time?"
    - "Can this decision be reproduced?"
    """

    # === Inputs ===
    input_signal_ids: list[str] = Field(
        description="觸發此推論的告警 ID 列表",
        default_factory=list,
    )
    context_retrieved: list[str] = Field(
        description="從記憶中檢索的上下文摘要",
        default_factory=list,
    )

    # === Model information ===
    model_used: str = Field(
        ...,
        description="使用的 AI 模型 (如 ollama/llama3.2:latest, gemini/gemini-pro)",
    )
    prompt_template_version: str = Field(
        default="v1.0.0",
        description="Prompt 模板版本號",
    )

    # === Inference result ===
    hypothesis: str = Field(..., description="AI 的根因推論")
    # Constrained to the closed interval [0.0, 1.0].
    confidence: float = Field(..., ge=0.0, le=1.0, description="信心指數 (0.0 - 1.0)")
    reasoning_steps: list[str] = Field(
        description="推理步驟 (可解釋性)",
        default_factory=list,
    )

    # === GraphRAG result ===
    # Reuses the BlastRadius model imported from src.models.approval.
    blast_radius: BlastRadius | None = Field(
        default=None,
        description="爆炸半徑分析結果 (復用現有模型)",
    )
    probable_root_causes: list[str] = Field(
        description="可能的根本原因列表",
        default_factory=list,
    )

    # === Performance tracking ===
    inference_started_at: datetime = Field(..., description="推論開始時間")
    inference_completed_at: datetime = Field(..., description="推論完成時間")
    latency_ms: int = Field(..., description="推論延遲 (毫秒)")
    # [Chief Architect] json_encoders removed (deprecated in Pydantic v2); the
    # native serialization output matches .isoformat(). v1.1 2026-04-01 Asia/Taipei
# =============================================================================
# Incident Frequency Stats (ADR-037: 異常頻率統計)
# =============================================================================
class IncidentFrequencyStats(BaseModel):
    """Incident frequency statistics (ADR-037 monitoring enhancement).

    2026-03-29 ogt: directive from the commander - "a restart only treats the
    symptom; anomalies that occur too often must be fixed for good".

    Purpose:
    - Count how often the same anomaly occurs within several time windows.
    - Drive escalation of the repair strategy (Tier 1 -> 4) by frequency.
    - Show users how frequent the problem is.

    Escalation thresholds:
    - REPEAT: >= 3 times / 24h (mark as recurring)
    - ESCALATE: >= 5 times / 24h (raise the tier, notify the owner)
    - PERMANENT_FIX: >= 10 times / 24h (force a root-cause fix)
    """

    anomaly_key: str = Field(..., description="異常簽名 Hash (前 16 字元)")

    # Occurrence counters per time window; all must be non-negative.
    count_1h: int = Field(default=0, ge=0, description="1 小時內發生次數")
    count_24h: int = Field(default=0, ge=0, description="24 小時內發生次數")
    count_7d: int = Field(default=0, ge=0, description="7 天內發生次數")
    count_30d: int = Field(default=0, ge=0, description="30 天內發生次數")

    # None when no escalation is recommended.
    escalation_level: Literal["REPEAT", "ESCALATE", "PERMANENT_FIX"] | None = Field(
        default=None,
        description="升級建議 (基於 24h 頻率)",
    )

    auto_repair_count: int = Field(default=0, ge=0, description="自動修復嘗試次數")
    last_repair_action: str | None = Field(
        default=None,
        description="最後一次修復動作",
    )
    last_repair_success: bool | None = Field(
        default=None,
        description="最後一次修復是否成功",
    )

    # 2026-04-07 Claude Code: Sprint 4 - alert resolution statistics (A1)
    human_approved_count: int = Field(
        default=0,
        ge=0,
        description="人工按批准後執行次數",
    )
    manual_resolved_count: int = Field(
        default=0,
        ge=0,
        description="無系統修復紀錄但 resolved 次數",
    )
    cold_start_trust_count: int = Field(
        default=0,
        ge=0,
        description="首次信任自動放行次數",
    )
    total_resolution_count: int = Field(
        default=0,
        ge=0,
        description="總處置次數 (auto + human + manual + cold_start)",
    )
# =============================================================================
# Incident Outcome (CPO 要求:回饋循環)
# =============================================================================
class IncidentOutcome(BaseModel):
    """Outcome of an incident - the key feedback the AI learns from.

    CPO requirements:
    - The execution result (success / failure) must be recorded.
    - Human feedback must be collected (was the AI's suggestion effective?).
    - It must be flagged whether the incident enters long-term memory.

    This is what lets the AI "learn from experience":
    - If the AI's suggestion worked -> reinforce the pattern.
    - If the AI's suggestion did not work -> record it as a negative case.
    """

    # === Execution result ===
    proposal_executed: bool = Field(default=False, description="是否已執行修復提案")
    # None means the proposal was not executed.
    execution_success: bool | None = Field(
        default=None,
        description="執行是否成功 (None = 未執行)",
    )
    actual_downtime_minutes: int | None = Field(
        default=None,
        description="實際停機時間 (分鐘)",
    )

    # === Human feedback ===
    human_feedback: str | None = Field(
        default=None,
        description="人類的文字回饋 (如 '這個建議很準''下次應該先檢查 X')",
    )
    # When provided, must be an integer between 1 and 5 inclusive.
    effectiveness_score: int | None = Field(
        default=None,
        ge=1,
        le=5,
        description="有效性評分 (1-5 分)",
    )

    # === Learning flags ===
    should_remember: bool = Field(
        default=True,
        description="是否納入長期記憶 (Episodic Memory)",
    )
    learning_notes: str | None = Field(
        default=None,
        description="給未來 AI 的學習筆記",
    )
# =============================================================================
# Incident (核心模型)
# =============================================================================
def _new_incident_id() -> str:
    """Build a default incident ID of the form ``INC-YYYYMMDD-XXXXXX``.

    The date part is the current UTC date; the suffix is the first six
    characters of a new UUID4 string, upper-cased.
    """
    date_part = datetime.now(timezone.utc).strftime("%Y%m%d")
    suffix = str(uuid4())[:6].upper()
    return f"INC-{date_part}-{suffix}"


class Incident(BaseModel):
    """Incident model - the core data structure of the AWOOOI cognitive system.

    This is the cornerstone of the AWOOOI 2.0 "Cognitive Awakening" project.
    It carries:
    - Perception (Signals): the raw alerts
    - Cognition (Decision Chain): the AI inference process
    - Decision (Proposals): remediation proposals
    - Memory (Outcome): result feedback

    Three-tier memory architecture:

        +-----------------+
        | Working Memory  |  <- Redis Hash, 7-day TTL
        | (active)        |
        +--------+--------+
                 | periodic migration
        +-----------------+
        | Episodic Memory |  <- PostgreSQL, retained permanently
        | (historical)    |
        +--------+--------+
                 | vectorization
        +-----------------+
        | Semantic Memory |  <- Vector DB, RAG retrieval
        | (knowledge base)|
        +-----------------+
    """

    # === Identity ===
    incident_id: str = Field(
        description="事件唯一識別碼 (如 INC-20260322-A1B2C3)",
        default_factory=_new_incident_id,
    )

    # === State ===
    status: IncidentStatus = Field(
        default=IncidentStatus.INVESTIGATING,
        description="事件狀態",
    )
    severity: Severity = Field(..., description="事件嚴重度")

    # === Perception layer (Signals) ===
    signals: list[Signal] = Field(
        description="關聯的告警信號列表",
        default_factory=list,
    )
    affected_services: list[str] = Field(
        description="受影響的服務列表 (GraphRAG Blast Radius)",
        default_factory=list,
    )

    # === Cognition layer (AI) ===
    decision_chain: AIDecisionChain | None = Field(
        default=None,
        description="AI 決策鏈 (完整推論過程)",
    )

    # === Decision layer (Proposals) ===
    # Supports multiple decision trajectories: Proposal A fails -> Proposal B
    proposal_ids: list[UUID] = Field(
        description="關聯的 ApprovalRequest ID 列表 (支援多重決策軌跡)",
        default_factory=list,
    )

    # === Result layer (feedback loop) ===
    outcome: IncidentOutcome | None = Field(
        default=None,
        description="事件結果與人類回饋",
    )

    # === Frequency statistics (ADR-037) ===
    # 2026-03-29 ogt: directive from the commander - "a restart only treats
    # the symptom; anomalies that occur too often must be fixed for good".
    frequency_stats: IncidentFrequencyStats | None = Field(
        default=None,
        description="異常頻率統計 (用於 Tier 分級修復策略)",
    )

    # === Timeline ===
    # Both timestamps default to the current time in UTC.
    created_at: datetime = Field(
        description="事件建立時間",
        default_factory=lambda: datetime.now(timezone.utc),
    )
    updated_at: datetime = Field(
        description="最後更新時間",
        default_factory=lambda: datetime.now(timezone.utc),
    )
    resolved_at: datetime | None = Field(default=None, description="事件解決時間")
    closed_at: datetime | None = Field(
        default=None,
        description="事件關閉時間 (含回饋)",
    )

    # === Memory management ===
    ttl_days: int = Field(default=7, description="Working Memory TTL (天)")
    persisted_to_pg: bool = Field(
        default=False,
        description="是否已固化到 PostgreSQL (Episodic Memory)",
    )
    vectorized: bool = Field(
        default=False,
        description="是否已向量化到 Vector DB (Semantic Memory)",
    )

    # ADR-071-A: four alert notification types + full-lifecycle DB record
    # (2026-04-11 Claude Sonnet 4.6)
    notification_type: str | None = Field(
        default=None,
        description="通知類型 TYPE-1/2/3/4/4D",
    )
    alert_category: str | None = Field(
        default=None,
        description="告警類別 k8s_workload/database/host_resource/...",
    )
    context_bundle: dict | None = Field(
        default=None,
        description="MCP 情報收集快照(執行前)",
    )
    metrics_before: dict | None = Field(
        default=None,
        description="指標快照執行前Prometheus MCP",
    )
    metrics_after: dict | None = Field(
        default=None,
        description="指標快照執行後Prometheus MCP",
    )
    verification_result: dict | None = Field(
        default=None,
        description="執行驗證結果K8s MCP watch_rollout",
    )
    manual_fix_steps: str | None = Field(
        default=None,
        description="手動修復步驟TYPE-4 使用者輸入)",
    )
    manual_fix_by: str | None = Field(default=None, description="手動修復執行者")

    # [Chief Architect] json_encoders removed (deprecated in Pydantic v2); the
    # native serialization output matches .isoformat(). v1.1 2026-04-01 Asia/Taipei

    # 2026-04-01 Claude Code: compatibility with legacy Redis data, where
    # outcome may have been stored as a plain string such as "resolved".
    @field_validator("outcome", mode="before")
    @classmethod
    def coerce_outcome_string(cls, v: object) -> object:
        """Replace a string ``outcome`` with None before validation.

        A legacy string value cannot be turned back into an IncidentOutcome,
        so it is discarded. Any non-string value is returned unchanged.
        """
        return None if isinstance(v, str) else v
# =============================================================================
# DTOs (Data Transfer Objects)
# =============================================================================
class IncidentCreate(BaseModel):
    """DTO carrying the caller-supplied fields for creating an incident."""

    # Required: severity of the new incident.
    severity: Severity
    # Optional: both lists default to a fresh empty list.
    signals: list[Signal] = Field(default_factory=list)
    affected_services: list[str] = Field(default_factory=list)
class IncidentUpdate(BaseModel):
    """DTO for updating an incident; every field is optional.

    All fields default to None.
    """

    status: IncidentStatus | None = None
    severity: Severity | None = None
    affected_services: list[str] | None = None
    decision_chain: AIDecisionChain | None = None
    outcome: IncidentOutcome | None = None
class IncidentResponse(BaseModel):
    """API response representation of an incident."""

    incident_id: str
    status: IncidentStatus
    severity: Severity
    signals: list[Signal]
    affected_services: list[str]
    decision_chain: AIDecisionChain | None
    # Proposal IDs are exposed as strings rather than UUID objects.
    proposal_ids: list[str]
    outcome: IncidentOutcome | None
    created_at: datetime
    updated_at: datetime
    resolved_at: datetime | None
    closed_at: datetime | None

    @classmethod
    def from_incident(cls, incident: Incident) -> "IncidentResponse":
        """Build a response from an Incident.

        Fields are copied across one-to-one, except that each entry of
        ``proposal_ids`` is converted with ``str()``.
        """
        proposal_id_strings = list(map(str, incident.proposal_ids))
        return cls(
            incident_id=incident.incident_id,
            status=incident.status,
            severity=incident.severity,
            signals=incident.signals,
            affected_services=incident.affected_services,
            decision_chain=incident.decision_chain,
            proposal_ids=proposal_id_strings,
            outcome=incident.outcome,
            created_at=incident.created_at,
            updated_at=incident.updated_at,
            resolved_at=incident.resolved_at,
            closed_at=incident.closed_at,
        )

    # [Chief Architect] json_encoders removed (deprecated in Pydantic v2); the
    # native serialization output matches .isoformat(). v1.1 2026-04-01 Asia/Taipei