Task A3 implementer 誤把既有 `from src.models.*` 改成 `from apps.api.src.models.*` 導致 tests/test_action_parsing.py 等既有測試 collect 失敗 (ModuleNotFoundError: No module named 'apps.api.src.models'). pytest rootdir=apps/api(由 pyproject.toml testpaths=["tests"]), 所以 awoooi 慣例為 `from src.*` 絕對路徑,切勿改。 A3 test file (test_aider_event_models.py) 已用正確 src.models.aider, 無需動。 15 tests (A2+A3) 過,existing tests 恢復(test_action_parsing: 24 collected)。 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
521 lines
17 KiB
Python
521 lines
17 KiB
Python
"""
Incident Schema v0.3 - 認知覺醒計畫核心資料結構
=================================================

C-Suite 戰略會議決議 (2026-03-22):
- AWOOOI 定位為 AI Ops OS (決策層)
- 三層記憶架構: Working (Redis) + Episodic (PG) + Semantic (Vector)
- 復用現有 approval.py 子模型,避免重複定義

設計原則:
1. 復用現有 approval.py 的子模型 (BlastRadius, DryRunCheck)
2. Severity (P0-P3) 用於事件嚴重度,RiskLevel 用於操作風險
3. proposal_ids 支援多重決策軌跡
4. 完整的 AI 決策鏈可稽核性 (CISO 要求)
5. Feedback Loop 回饋循環 (CPO 要求)

三層記憶對應:
- Working Memory (Redis): 活躍事件,7 天 TTL
- Episodic Memory (PostgreSQL): 歷史事件,永久保留
- Semantic Memory (Vector DB): 向量化後的知識,供 RAG 檢索
"""
|
||
|
||
from datetime import datetime, timezone
|
||
from enum import Enum
|
||
from typing import Literal
|
||
from uuid import UUID, uuid4
|
||
|
||
from pydantic import BaseModel, Field, field_validator
|
||
|
||
# 復用現有模型 (避免重複定義)
|
||
from src.models.approval import BlastRadius
|
||
|
||
# =============================================================================
|
||
# Incident 專用 Enums
|
||
# =============================================================================
|
||
|
||
|
||
class Severity(str, Enum):
    """
    Incident severity (P0 is the most severe).

    How this differs from RiskLevel:
    - Severity: how serious the incident itself is (P0 is the most severe)
    - RiskLevel: how risky the remediation action is (CRITICAL is the most dangerous)

    Used for:
    - Tiered AI invocation strategy (P0 goes straight to Claude, P2/P3 use Ollama)
    - SLA response-time thresholds
    - Alert notification priority
    """

    P0 = "P0"  # Critical - service completely down, respond within 5 minutes
    P1 = "P1"  # High - service severely degraded, respond within 15 minutes
    P2 = "P2"  # Medium - service partially affected, respond within 1 hour
    P3 = "P3"  # Low - minor impact, respond within 4 hours
|
||
|
||
|
||
class IncidentStatus(str, Enum):
    """
    Incident state machine.

    INVESTIGATING → MITIGATING → RESOLVED → CLOSED
                        ↘ (cannot be resolved) → ESCALATED

    NOTE: this enum only names the states; the transitions drawn above are
    not enforced by this class.
    """

    INVESTIGATING = "investigating"  # Investigating - AI is analysing the root cause
    MITIGATING = "mitigating"  # Mitigating - a Proposal exists; awaiting approval or executing
    RESOLVED = "resolved"  # Resolved - service is back to normal
    CLOSED = "closed"  # Closed - includes human feedback; eligible for long-term memory
    ESCALATED = "escalated"  # Escalated - human intervention required
|
||
|
||
|
||
# =============================================================================
|
||
# Signal (原始告警)
|
||
# =============================================================================
|
||
|
||
|
||
class Signal(BaseModel):
    """
    Raw alert signal - received from Prometheus / SigNoz / Alertmanager.

    This is the "perception input" of an Incident; one Incident may contain
    several Signals. For example, CPU Spike + Memory OOM + Pod Restart may be
    three alerts that all belong to the same Incident.
    """

    # Short random id: the first 8 characters of a UUID4 string.
    signal_id: str = Field(
        default_factory=lambda: str(uuid4())[:8],
        description="信號唯一識別碼 (8 字元)",
    )
    alert_name: str = Field(..., description="告警名稱 (如 HighCPUUsage)")
    severity: Severity = Field(..., description="告警嚴重度")
    # Closed set of accepted alert origins; any other value fails validation.
    source: Literal["prometheus", "signoz", "alertmanager", "manual", "telegram"] = (
        Field(..., description="告警來源")
    )
    fired_at: datetime = Field(..., description="告警觸發時間")
    resolved_at: datetime | None = Field(None, description="告警解除時間")
    labels: dict[str, str] = Field(
        default_factory=dict,
        description="Prometheus 標籤 (如 pod, namespace, service)",
    )
    annotations: dict[str, str] = Field(
        default_factory=dict,
        description="告警附加資訊 (如 summary, description)",
    )
    fingerprint: str | None = Field(
        None,
        description="告警指紋 Hash,用於去重與聚合",
    )

    # [Chief Architect] json_encoders removed (deprecated in Pydantic v2); native
    # serialization output matches .isoformat(). v1.1 2026-04-01 Asia/Taipei
|
||
|
||
|
||
# =============================================================================
|
||
# AI Decision Chain (CISO 要求:可稽核性)
|
||
# =============================================================================
|
||
|
||
|
||
class AIDecisionChain(BaseModel):
    """
    AI decision chain - a full record of the inference process, kept for audit.

    CISO requirements:
    - The AI model and prompt version used must be recorded
    - The reasoning steps must be recorded (explainability)
    - The inference latency must be recorded (performance monitoring)

    It exists to answer:
    - "Why did the AI make this recommendation?"
    - "What data did the AI consult at the time?"
    - "Can this decision be reproduced?"
    """

    # === Input ===
    input_signal_ids: list[str] = Field(
        default_factory=list,
        description="觸發此推論的告警 ID 列表",
    )
    context_retrieved: list[str] = Field(
        default_factory=list,
        description="從記憶中檢索的上下文摘要",
    )

    # === Model information ===
    model_used: str = Field(
        ...,
        description="使用的 AI 模型 (如 ollama/llama3.2:latest, gemini/gemini-pro)",
    )
    prompt_template_version: str = Field(
        default="v1.0.0",
        description="Prompt 模板版本號",
    )

    # === Inference result ===
    hypothesis: str = Field(..., description="AI 的根因推論")
    # Bounded to the closed interval [0.0, 1.0] by the ge/le constraints.
    confidence: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description="信心指數 (0.0 - 1.0)",
    )
    reasoning_steps: list[str] = Field(
        default_factory=list,
        description="推理步驟 (可解釋性)",
    )

    # === GraphRAG result ===
    blast_radius: BlastRadius | None = Field(
        None,
        description="爆炸半徑分析結果 (復用現有模型)",
    )
    probable_root_causes: list[str] = Field(
        default_factory=list,
        description="可能的根本原因列表",
    )

    # === Performance tracking ===
    # NOTE: the three values below are supplied by the caller; this model does
    # not check that latency_ms agrees with the two timestamps.
    inference_started_at: datetime = Field(..., description="推論開始時間")
    inference_completed_at: datetime = Field(..., description="推論完成時間")
    latency_ms: int = Field(..., description="推論延遲 (毫秒)")

    # [Chief Architect] json_encoders removed (deprecated in Pydantic v2); native
    # serialization output matches .isoformat(). v1.1 2026-04-01 Asia/Taipei
|
||
|
||
|
||
# =============================================================================
|
||
# Incident Frequency Stats (ADR-037: 異常頻率統計)
|
||
# =============================================================================
|
||
|
||
|
||
class IncidentFrequencyStats(BaseModel):
    """
    Incident frequency statistics - ADR-037 monitoring enhancement architecture.

    2026-03-29 ogt: commander's directive - "A restart only treats the symptom;
    anomalies that happen too often must be fixed for good."

    Purpose:
    - Count how often the same anomaly occurred within different time windows
    - Decide, based on frequency, when to escalate the repair strategy (Tier 1→4)
    - Let users see how frequent the problem is

    Escalation thresholds:
    - REPEAT: ≥ 3 times/24h (mark as repeating)
    - ESCALATE: ≥ 5 times/24h (raise the Tier, notify the Owner)
    - PERMANENT_FIX: ≥ 10 times/24h (force a root-cause fix)

    NOTE: this model only stores the counters and the resulting level; the
    thresholds above are not computed or enforced by this class.
    """

    anomaly_key: str = Field(
        ...,
        description="異常簽名 Hash (前 16 字元)",
    )
    # All counters below are constrained to be non-negative (ge=0).
    count_1h: int = Field(
        default=0,
        ge=0,
        description="1 小時內發生次數",
    )
    count_24h: int = Field(
        default=0,
        ge=0,
        description="24 小時內發生次數",
    )
    count_7d: int = Field(
        default=0,
        ge=0,
        description="7 天內發生次數",
    )
    count_30d: int = Field(
        default=0,
        ge=0,
        description="30 天內發生次數",
    )
    escalation_level: Literal["REPEAT", "ESCALATE", "PERMANENT_FIX"] | None = Field(
        None,
        description="升級建議 (基於 24h 頻率)",
    )
    auto_repair_count: int = Field(
        default=0,
        ge=0,
        description="自動修復嘗試次數",
    )
    last_repair_action: str | None = Field(
        None,
        description="最後一次修復動作",
    )
    last_repair_success: bool | None = Field(
        None,
        description="最後一次修復是否成功",
    )

    # 2026-04-07 Claude Code: Sprint 4 - alert resolution statistics (A1)
    human_approved_count: int = Field(
        default=0,
        ge=0,
        description="人工按批准後執行次數",
    )
    manual_resolved_count: int = Field(
        default=0,
        ge=0,
        description="無系統修復紀錄但 resolved 次數",
    )
    cold_start_trust_count: int = Field(
        default=0,
        ge=0,
        description="首次信任自動放行次數",
    )
    # Stored as provided; this class does not derive it from the counters above.
    total_resolution_count: int = Field(
        default=0,
        ge=0,
        description="總處置次數 (auto + human + manual + cold_start)",
    )
|
||
|
||
|
||
# =============================================================================
|
||
# Incident Outcome (CPO 要求:回饋循環)
|
||
# =============================================================================
|
||
|
||
|
||
class IncidentOutcome(BaseModel):
    """
    Incident outcome - the key feedback the AI learns from.

    CPO requirements:
    - The execution result must be recorded (success / failure)
    - Human feedback must be collected (was the AI's suggestion effective?)
    - Whether to keep the case in long-term memory must be flagged

    This is what lets the AI "learn from experience":
    - If the AI's suggestion worked → reinforce the pattern
    - If the AI's suggestion did not work → record it as a negative case
    """

    # === Execution result ===
    proposal_executed: bool = Field(
        default=False,
        description="是否已執行修復提案",
    )
    # Tri-state: None means "not executed", as stated in the description.
    execution_success: bool | None = Field(
        None,
        description="執行是否成功 (None = 未執行)",
    )
    actual_downtime_minutes: int | None = Field(
        None,
        description="實際停機時間 (分鐘)",
    )

    # === Human feedback ===
    human_feedback: str | None = Field(
        None,
        description="人類的文字回饋 (如 '這個建議很準' 或 '下次應該先檢查 X')",
    )
    # Optional rating; when present it must fall within 1..5 inclusive.
    effectiveness_score: int | None = Field(
        None,
        ge=1,
        le=5,
        description="有效性評分 (1-5 分)",
    )

    # === Learning flags ===
    should_remember: bool = Field(
        default=True,
        description="是否納入長期記憶 (Episodic Memory)",
    )
    learning_notes: str | None = Field(
        None,
        description="給未來 AI 的學習筆記",
    )
|
||
|
||
|
||
# =============================================================================
|
||
# Incident (核心模型)
|
||
# =============================================================================
|
||
|
||
|
||
class Incident(BaseModel):
    """
    Incident model - the core data structure of the AWOOOI cognitive system.

    This is the cornerstone of the AWOOOI 2.0 "Cognitive Awakening" plan and carries:
    - Perception (Signals): raw alerts
    - Cognition (Decision Chain): the AI inference process
    - Decision (Proposals): remediation suggestions
    - Memory (Outcome): result feedback

    Three-layer memory architecture:
    ┌─────────────────┐
    │ Working Memory  │ ← Redis Hash, 7-day TTL
    │ (active)        │
    └────────┬────────┘
             │ periodic migration
             ▼
    ┌─────────────────┐
    │ Episodic Memory │ ← PostgreSQL, kept permanently
    │ (history)       │
    └────────┬────────┘
             │ vectorization
             ▼
    ┌─────────────────┐
    │ Semantic Memory │ ← Vector DB, RAG retrieval
    │ (knowledge)     │
    └─────────────────┘
    """

    # === Identity ===
    # "INC-<UTC date as YYYYMMDD>-<first 6 chars of a UUID4, upper-cased>"
    incident_id: str = Field(
        default_factory=lambda: f"INC-{datetime.now(timezone.utc).strftime('%Y%m%d')}-{str(uuid4())[:6].upper()}",
        description="事件唯一識別碼 (如 INC-20260322-A1B2C3)",
    )

    # === Status ===
    status: IncidentStatus = Field(
        default=IncidentStatus.INVESTIGATING,
        description="事件狀態",
    )
    severity: Severity = Field(..., description="事件嚴重度")

    # === Perception layer (Signals) ===
    signals: list[Signal] = Field(
        default_factory=list,
        description="關聯的告警信號列表",
    )
    affected_services: list[str] = Field(
        default_factory=list,
        description="受影響的服務列表 (GraphRAG Blast Radius)",
    )

    # === Cognition layer (AI) ===
    decision_chain: AIDecisionChain | None = Field(
        None,
        description="AI 決策鏈 (完整推論過程)",
    )

    # === Decision layer (Proposals) ===
    # Supports multiple decision tracks: Proposal A fails → Proposal B
    proposal_ids: list[UUID] = Field(
        default_factory=list,
        description="關聯的 ApprovalRequest ID 列表 (支援多重決策軌跡)",
    )

    # === Outcome layer (Feedback Loop) ===
    outcome: IncidentOutcome | None = Field(
        None,
        description="事件結果與人類回饋",
    )

    # === Frequency statistics (ADR-037) ===
    # 2026-03-29 ogt: commander's directive - "A restart only treats the symptom;
    # anomalies that happen too often must be fixed for good."
    frequency_stats: IncidentFrequencyStats | None = Field(
        None,
        description="異常頻率統計 (用於 Tier 分級修復策略)",
    )

    # === Timeline ===
    # Both defaults are timezone-aware UTC timestamps taken at construction time.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        description="事件建立時間",
    )
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        description="最後更新時間",
    )
    resolved_at: datetime | None = Field(
        None,
        description="事件解決時間",
    )
    closed_at: datetime | None = Field(
        None,
        description="事件關閉時間 (含回饋)",
    )

    # === Memory management ===
    ttl_days: int = Field(
        default=7,
        description="Working Memory TTL (天)",
    )
    persisted_to_pg: bool = Field(
        default=False,
        description="是否已固化到 PostgreSQL (Episodic Memory)",
    )
    vectorized: bool = Field(
        default=False,
        description="是否已向量化到 Vector DB (Semantic Memory)",
    )

    # ADR-071-A: four alert-notification types + full-lifecycle DB records
    # (2026-04-11 Claude Sonnet 4.6)
    notification_type: str | None = Field(None, description="通知類型 TYPE-1/2/3/4/4D")
    alert_category: str | None = Field(None, description="告警類別 k8s_workload/database/host_resource/...")
    context_bundle: dict | None = Field(None, description="MCP 情報收集快照(執行前)")
    metrics_before: dict | None = Field(None, description="指標快照(執行前,Prometheus MCP)")
    metrics_after: dict | None = Field(None, description="指標快照(執行後,Prometheus MCP)")
    verification_result: dict | None = Field(None, description="執行驗證結果(K8s MCP watch_rollout)")
    manual_fix_steps: str | None = Field(None, description="手動修復步驟(TYPE-4 使用者輸入)")
    manual_fix_by: str | None = Field(None, description="手動修復執行者")

    # [Chief Architect] json_encoders removed (deprecated in Pydantic v2); native
    # serialization output matches .isoformat(). v1.1 2026-04-01 Asia/Taipei

    # 2026-04-01 Claude Code: backward compatibility with old Redis data -
    # outcome may have been stored as the plain string "resolved".
    @field_validator("outcome", mode="before")
    @classmethod
    def coerce_outcome_string(cls, v: object) -> object:
        """Drop a legacy string ``outcome`` before normal validation runs.

        Args:
            v: The raw, not-yet-validated value supplied for ``outcome``.

        Returns:
            ``None`` when ``v`` is a ``str`` (a legacy string cannot be turned
            back into an ``IncidentOutcome``, so it is discarded); otherwise
            ``v`` unchanged, for regular validation.
        """
        if isinstance(v, str):
            return None  # Legacy string form cannot be restored; discard it.
        return v
|
||
|
||
|
||
# =============================================================================
|
||
# DTOs (Data Transfer Objects)
|
||
# =============================================================================
|
||
|
||
|
||
class IncidentCreate(BaseModel):
    """DTO for creating an incident.

    Only ``severity`` is required; ``signals`` and ``affected_services``
    default to fresh empty lists.
    """

    severity: Severity
    signals: list[Signal] = Field(default_factory=list)
    affected_services: list[str] = Field(default_factory=list)
|
||
|
||
|
||
class IncidentUpdate(BaseModel):
    """DTO for updating an incident.

    Every field is optional and defaults to ``None``.
    """

    status: IncidentStatus | None = None
    severity: Severity | None = None
    affected_services: list[str] | None = None
    decision_chain: AIDecisionChain | None = None
    outcome: IncidentOutcome | None = None
|
||
|
||
|
||
class IncidentResponse(BaseModel):
    """Incident API response.

    Carries a subset of ``Incident``'s fields; ``proposal_ids`` is exposed as
    strings rather than ``UUID`` objects.
    """

    incident_id: str
    status: IncidentStatus
    severity: Severity
    signals: list[Signal]
    affected_services: list[str]
    decision_chain: AIDecisionChain | None
    proposal_ids: list[str]  # UUIDs converted to strings
    outcome: IncidentOutcome | None
    created_at: datetime
    updated_at: datetime
    resolved_at: datetime | None
    closed_at: datetime | None

    @classmethod
    def from_incident(cls, incident: Incident) -> "IncidentResponse":
        """Build a response from an ``Incident``.

        Args:
            incident: The incident to expose through the API.

        Returns:
            A new ``IncidentResponse`` whose fields are copied from
            ``incident``, with each proposal id converted via ``str()``.
        """
        return cls(
            incident_id=incident.incident_id,
            status=incident.status,
            severity=incident.severity,
            signals=incident.signals,
            affected_services=incident.affected_services,
            decision_chain=incident.decision_chain,
            proposal_ids=[str(pid) for pid in incident.proposal_ids],
            outcome=incident.outcome,
            created_at=incident.created_at,
            updated_at=incident.updated_at,
            resolved_at=incident.resolved_at,
            closed_at=incident.closed_at,
        )

    # [Chief Architect] json_encoders removed (deprecated in Pydantic v2); native
    # serialization output matches .isoformat(). v1.1 2026-04-01 Asia/Taipei
|