702 lines
23 KiB
Python
702 lines
23 KiB
Python
"""
|
||
Incident Memory Provider - 事件記憶體提供者
|
||
============================================
|
||
Phase 6.4e: DualIncidentMemory 整合
|
||
Phase 16 R1.2: 絞殺者模式 (Strangler Fig Pattern) 2026-03-26
|
||
|
||
設計:
|
||
- 實作 IIncidentMemory 協定 (Protocol)
|
||
- 雙層記憶體: Working (Redis) + Episodic (PostgreSQL)
|
||
- 反向索引: namespace:target -> incident_id
|
||
|
||
統帥鐵律:
|
||
- Working Memory (Redis): 7 天 TTL
|
||
- Episodic Memory (PostgreSQL): 永久
|
||
- 反向索引: 30 分鐘 TTL (聚合窗口)
|
||
|
||
Phase 16 絞殺者模式:
|
||
- USE_NEW_ENGINE=False: 使用此內嵌版本 (當前預設)
|
||
- USE_NEW_ENGINE=True: 使用 lewooogo-brain 套件版本
|
||
- 回滾指令: kubectl set env deployment/awoooi-api USE_NEW_ENGINE=false
|
||
- 監控週期: 48 小時驗證期
|
||
|
||
NOTE: 此模組為 lewooogo-brain/adapters/incident_memory.py 的 apps/api 內嵌版本
|
||
Phase 16 R2 階段會在 48 小時驗證通過後移除此檔案
|
||
"""
|
||
|
||
from datetime import UTC, datetime, timedelta
|
||
from typing import Any, Protocol
|
||
|
||
import structlog
|
||
|
||
from src.core.redis_client import get_redis
|
||
from src.db.base import get_db_context
|
||
from src.db.models import IncidentRecord
|
||
from src.models.incident import Incident
|
||
|
||
logger = structlog.get_logger(__name__)
|
||
|
||
|
||
# =============================================================================
|
||
# Constants
|
||
# =============================================================================
|
||
|
||
WORKING_MEMORY_TTL = 604800 # 7 天
|
||
AGGREGATION_WINDOW_MINUTES = 30
|
||
INDEX_TTL = 1800 # 索引 30 分鐘 TTL
|
||
|
||
# Redis Key Patterns
|
||
INCIDENT_KEY_PREFIX = "awoooi:incidents:"
|
||
INDEX_PREFIX = "awoooi:incidents:index:"
|
||
|
||
|
||
# =============================================================================
|
||
# Protocol Definition (與 lewooogo-brain 保持一致)
|
||
# =============================================================================
|
||
|
||
class IIncidentMemory(Protocol):
|
||
"""Incident 專用記憶體提供者協定"""
|
||
|
||
async def load_incident(self, incident_id: str) -> Incident | None:
|
||
"""從 Working Memory 載入 Incident"""
|
||
...
|
||
|
||
async def save_incident(self, incident: Incident, ttl_seconds: int = WORKING_MEMORY_TTL) -> bool:
|
||
"""儲存 Incident 到 Working Memory (預設 7 天 TTL)"""
|
||
...
|
||
|
||
async def persist_incident(self, incident: Incident) -> bool:
|
||
"""持久化到 Episodic Memory (PostgreSQL)"""
|
||
...
|
||
|
||
async def find_related_incident(
|
||
self,
|
||
namespace: str,
|
||
target: str,
|
||
window_minutes: int = AGGREGATION_WINDOW_MINUTES,
|
||
) -> Incident | None:
|
||
"""尋找相關的活躍 Incident (用於聚合)"""
|
||
...
|
||
|
||
async def update_index(
|
||
self,
|
||
incident_id: str,
|
||
namespace: str,
|
||
target: str,
|
||
) -> bool:
|
||
"""更新反向索引 (namespace/target -> incident_id)"""
|
||
...
|
||
|
||
|
||
# =============================================================================
|
||
# DualIncidentMemory Implementation
|
||
# =============================================================================
|
||
|
||
class DualIncidentMemory:
|
||
"""
|
||
Incident 專用雙層記憶體適配器
|
||
|
||
實作 IIncidentMemory 協定:
|
||
- load_incident: 從 Working/Episodic 載入
|
||
- save_incident: 儲存到 Working
|
||
- persist_incident: 持久化到 Episodic
|
||
- find_related_incident: 透過反向索引尋找相關 Incident
|
||
- update_index: 更新反向索引
|
||
|
||
反向索引結構:
|
||
Key: awoooi:incidents:index:{namespace}:{target}
|
||
Value: incident_id
|
||
TTL: 30 分鐘 (聚合窗口)
|
||
"""
|
||
|
||
def __init__(self, redis_client: Any = None, key_prefix: str = INCIDENT_KEY_PREFIX):
|
||
"""
|
||
初始化適配器
|
||
|
||
Args:
|
||
redis_client: Redis 連線客戶端 (可選,預設使用 get_redis())
|
||
key_prefix: Redis Key 前綴
|
||
"""
|
||
self._redis = redis_client
|
||
self._key_prefix = key_prefix
|
||
self._index_prefix = INDEX_PREFIX
|
||
|
||
def _get_redis(self) -> Any:
|
||
"""取得 Redis 客戶端 (延遲初始化)"""
|
||
if self._redis is None:
|
||
self._redis = get_redis()
|
||
return self._redis
|
||
|
||
def _make_key(self, incident_id: str) -> str:
|
||
"""生成 Incident Key"""
|
||
return f"{self._key_prefix}{incident_id}"
|
||
|
||
def _make_index_key(self, namespace: str, target: str) -> str:
|
||
"""生成索引 Key"""
|
||
return f"{self._index_prefix}{namespace}:{target}"
|
||
|
||
async def load_incident(self, incident_id: str) -> Incident | None:
|
||
"""
|
||
載入 Incident
|
||
|
||
策略:
|
||
1. 從 Redis (Working Memory) 讀取
|
||
2. 若 miss,從 PostgreSQL (Episodic) 讀取
|
||
|
||
Args:
|
||
incident_id: Incident ID
|
||
|
||
Returns:
|
||
Incident 或 None
|
||
"""
|
||
try:
|
||
redis_client = self._get_redis()
|
||
key = self._make_key(incident_id)
|
||
data = await redis_client.get(key)
|
||
|
||
if data is not None:
|
||
# JSON -> Incident
|
||
return Incident.model_validate_json(data)
|
||
|
||
# Working Memory miss, 嘗試從 Episodic Memory 載入
|
||
logger.debug("incident_not_found_in_working", incident_id=incident_id)
|
||
|
||
async with get_db_context() as db:
|
||
from sqlalchemy import select
|
||
stmt = select(IncidentRecord).where(
|
||
IncidentRecord.incident_id == incident_id
|
||
)
|
||
result = await db.execute(stmt)
|
||
record = result.scalar_one_or_none()
|
||
|
||
if record:
|
||
# 從 DB 重建 Incident
|
||
incident = self._record_to_incident(record)
|
||
# 寫回 Working Memory (快取)
|
||
await self.save_incident(incident)
|
||
return incident
|
||
|
||
return None
|
||
|
||
except Exception as e:
|
||
logger.error("load_incident_failed", incident_id=incident_id, error=str(e))
|
||
return None
|
||
|
||
async def save_incident(
|
||
self,
|
||
incident: Incident,
|
||
ttl_seconds: int = WORKING_MEMORY_TTL,
|
||
) -> bool:
|
||
"""
|
||
儲存 Incident 到 Working Memory (Redis)
|
||
|
||
Args:
|
||
incident: Incident 物件
|
||
ttl_seconds: TTL (預設 7 天)
|
||
|
||
Returns:
|
||
是否成功
|
||
"""
|
||
try:
|
||
redis_client = self._get_redis()
|
||
key = self._make_key(incident.incident_id)
|
||
json_data = incident.model_dump_json()
|
||
|
||
await redis_client.setex(key, ttl_seconds, json_data)
|
||
|
||
logger.debug(
|
||
"incident_saved_to_working",
|
||
incident_id=incident.incident_id,
|
||
ttl=ttl_seconds,
|
||
)
|
||
return True
|
||
|
||
except Exception as e:
|
||
logger.error(
|
||
"save_incident_failed",
|
||
incident_id=incident.incident_id,
|
||
error=str(e),
|
||
)
|
||
return False
|
||
|
||
async def persist_incident(self, incident: Incident) -> bool:
|
||
"""
|
||
持久化到 Episodic Memory (PostgreSQL)
|
||
|
||
Args:
|
||
incident: Incident 物件
|
||
|
||
Returns:
|
||
是否成功
|
||
"""
|
||
try:
|
||
async with get_db_context() as db:
|
||
from sqlalchemy import select
|
||
|
||
# 檢查是否已存在
|
||
stmt = select(IncidentRecord).where(
|
||
IncidentRecord.incident_id == incident.incident_id
|
||
)
|
||
result = await db.execute(stmt)
|
||
existing = result.scalar_one_or_none()
|
||
|
||
if existing:
|
||
# 更新現有記錄
|
||
existing.status = incident.status.value
|
||
existing.severity = incident.severity.value
|
||
existing.signals = [
|
||
s.model_dump(mode="json") for s in incident.signals
|
||
]
|
||
existing.affected_services = incident.affected_services
|
||
existing.updated_at = incident.updated_at
|
||
if incident.resolved_at:
|
||
existing.resolved_at = incident.resolved_at
|
||
if incident.closed_at:
|
||
existing.closed_at = incident.closed_at
|
||
else:
|
||
# 建立新記錄
|
||
record = IncidentRecord(
|
||
incident_id=incident.incident_id,
|
||
status=incident.status.value,
|
||
severity=incident.severity.value,
|
||
signals=[
|
||
s.model_dump(mode="json") for s in incident.signals
|
||
],
|
||
affected_services=incident.affected_services,
|
||
decision_chain=(
|
||
incident.decision_chain.model_dump(mode="json")
|
||
if incident.decision_chain
|
||
else None
|
||
),
|
||
proposal_ids=[str(pid) for pid in incident.proposal_ids],
|
||
outcome=(
|
||
incident.outcome.model_dump(mode="json")
|
||
if incident.outcome
|
||
else None
|
||
),
|
||
created_at=incident.created_at,
|
||
updated_at=incident.updated_at,
|
||
resolved_at=incident.resolved_at,
|
||
closed_at=incident.closed_at,
|
||
ttl_days=incident.ttl_days,
|
||
vectorized=incident.vectorized,
|
||
)
|
||
db.add(record)
|
||
|
||
logger.debug(
|
||
"incident_persisted_to_episodic",
|
||
incident_id=incident.incident_id,
|
||
)
|
||
return True
|
||
|
||
except Exception as e:
|
||
logger.error(
|
||
"persist_incident_failed",
|
||
incident_id=incident.incident_id,
|
||
error=str(e),
|
||
)
|
||
return False
|
||
|
||
async def find_related_incident(
|
||
self,
|
||
namespace: str,
|
||
target: str,
|
||
window_minutes: int = AGGREGATION_WINDOW_MINUTES,
|
||
) -> Incident | None:
|
||
"""
|
||
尋找相關的活躍 Incident (用於聚合)
|
||
|
||
透過反向索引快速查找:
|
||
1. 查詢索引 Key: namespace:target -> incident_id
|
||
2. 載入 Incident
|
||
3. 檢查是否仍在聚合窗口內
|
||
|
||
Args:
|
||
namespace: 命名空間
|
||
target: 目標服務
|
||
window_minutes: 聚合窗口 (分鐘)
|
||
|
||
Returns:
|
||
相關 Incident 或 None
|
||
"""
|
||
try:
|
||
redis_client = self._get_redis()
|
||
|
||
# Step 1: 查詢索引
|
||
index_key = self._make_index_key(namespace, target)
|
||
incident_id = await redis_client.get(index_key)
|
||
|
||
if incident_id is None:
|
||
return None
|
||
|
||
# 解碼 bytes
|
||
if isinstance(incident_id, bytes):
|
||
incident_id = incident_id.decode()
|
||
|
||
# Step 2: 載入 Incident
|
||
incident = await self.load_incident(incident_id)
|
||
if incident is None:
|
||
# 索引存在但 Incident 不存在,清除索引
|
||
await redis_client.delete(index_key)
|
||
return None
|
||
|
||
# Step 3: 檢查聚合窗口
|
||
window_start = datetime.now(UTC) - timedelta(minutes=window_minutes)
|
||
if incident.updated_at < window_start:
|
||
# 超出聚合窗口,不聚合
|
||
logger.debug(
|
||
"incident_outside_window",
|
||
incident_id=incident_id,
|
||
updated_at=incident.updated_at.isoformat(),
|
||
)
|
||
return None
|
||
|
||
logger.debug(
|
||
"found_related_incident",
|
||
incident_id=incident_id,
|
||
namespace=namespace,
|
||
target=target,
|
||
)
|
||
return incident
|
||
|
||
except Exception as e:
|
||
logger.error(
|
||
"find_related_incident_failed",
|
||
namespace=namespace,
|
||
target=target,
|
||
error=str(e),
|
||
)
|
||
return None
|
||
|
||
async def update_index(
|
||
self,
|
||
incident_id: str,
|
||
namespace: str,
|
||
target: str,
|
||
) -> bool:
|
||
"""
|
||
更新反向索引
|
||
|
||
索引結構:
|
||
Key: awoooi:incidents:index:{namespace}:{target}
|
||
Value: incident_id
|
||
TTL: 30 分鐘
|
||
|
||
Args:
|
||
incident_id: Incident ID
|
||
namespace: 命名空間
|
||
target: 目標服務
|
||
|
||
Returns:
|
||
是否成功
|
||
"""
|
||
try:
|
||
redis_client = self._get_redis()
|
||
index_key = self._make_index_key(namespace, target)
|
||
await redis_client.setex(index_key, INDEX_TTL, incident_id)
|
||
|
||
logger.debug(
|
||
"index_updated",
|
||
incident_id=incident_id,
|
||
namespace=namespace,
|
||
target=target,
|
||
ttl=INDEX_TTL,
|
||
)
|
||
return True
|
||
|
||
except Exception as e:
|
||
logger.error(
|
||
"update_index_failed",
|
||
incident_id=incident_id,
|
||
namespace=namespace,
|
||
target=target,
|
||
error=str(e),
|
||
)
|
||
return False
|
||
|
||
async def delete_incident(self, incident_id: str) -> bool:
|
||
"""
|
||
刪除 Incident
|
||
|
||
Args:
|
||
incident_id: Incident ID
|
||
|
||
Returns:
|
||
是否成功
|
||
"""
|
||
try:
|
||
redis_client = self._get_redis()
|
||
key = self._make_key(incident_id)
|
||
result = await redis_client.delete(key)
|
||
return result > 0
|
||
|
||
except Exception as e:
|
||
logger.error(
|
||
"delete_incident_failed",
|
||
incident_id=incident_id,
|
||
error=str(e),
|
||
)
|
||
return False
|
||
|
||
def _record_to_incident(self, record: IncidentRecord) -> Incident:
|
||
"""
|
||
將 DB Record 轉換為 Incident 物件
|
||
|
||
Args:
|
||
record: IncidentRecord
|
||
|
||
Returns:
|
||
Incident
|
||
"""
|
||
from src.models.incident import (
|
||
IncidentStatus,
|
||
Severity,
|
||
Signal,
|
||
)
|
||
|
||
# 重建 Signals
|
||
signals = []
|
||
for s in record.signals or []:
|
||
signals.append(Signal.model_validate(s))
|
||
|
||
return Incident(
|
||
incident_id=record.incident_id,
|
||
status=IncidentStatus(record.status),
|
||
severity=Severity(record.severity),
|
||
signals=signals,
|
||
affected_services=record.affected_services or [],
|
||
proposal_ids=record.proposal_ids or [],
|
||
created_at=record.created_at,
|
||
updated_at=record.updated_at,
|
||
resolved_at=record.resolved_at,
|
||
closed_at=record.closed_at,
|
||
ttl_days=record.ttl_days or 30,
|
||
vectorized=record.vectorized or False,
|
||
)
|
||
|
||
|
||
# =============================================================================
|
||
# Phase 16: IncidentDbAdapter (DI 注入實現)
|
||
# =============================================================================
|
||
|
||
class IncidentDbAdapter:
|
||
"""
|
||
Incident DB Adapter - 實現 lewooogo-brain 的 IIncidentDbAdapter
|
||
|
||
Phase 16: 將 apps/api 的 SQLAlchemy Model 操作封裝為 adapter
|
||
注入到 lewooogo-brain 的 DualIncidentMemory
|
||
"""
|
||
|
||
async def load(self, incident_id: str) -> Incident | None:
|
||
"""從 PostgreSQL 載入 Incident"""
|
||
try:
|
||
async with get_db_context() as db:
|
||
from sqlalchemy import select
|
||
stmt = select(IncidentRecord).where(
|
||
IncidentRecord.incident_id == incident_id
|
||
)
|
||
result = await db.execute(stmt)
|
||
record = result.scalar_one_or_none()
|
||
|
||
if record:
|
||
return self._record_to_incident(record)
|
||
return None
|
||
|
||
except Exception as e:
|
||
logger.error("db_adapter_load_failed", incident_id=incident_id, error=str(e))
|
||
return None
|
||
|
||
async def save(self, incident: Incident) -> bool:
|
||
"""儲存 Incident 到 PostgreSQL (upsert)"""
|
||
try:
|
||
async with get_db_context() as db:
|
||
from sqlalchemy import select
|
||
|
||
# 檢查是否已存在
|
||
stmt = select(IncidentRecord).where(
|
||
IncidentRecord.incident_id == incident.incident_id
|
||
)
|
||
result = await db.execute(stmt)
|
||
existing = result.scalar_one_or_none()
|
||
|
||
if existing:
|
||
# 更新現有記錄
|
||
existing.status = incident.status.value
|
||
existing.severity = incident.severity.value
|
||
existing.signals = [
|
||
s.model_dump(mode="json") for s in incident.signals
|
||
]
|
||
existing.affected_services = incident.affected_services
|
||
existing.updated_at = incident.updated_at
|
||
if incident.resolved_at:
|
||
existing.resolved_at = incident.resolved_at
|
||
if incident.closed_at:
|
||
existing.closed_at = incident.closed_at
|
||
else:
|
||
# 建立新記錄
|
||
record = IncidentRecord(
|
||
incident_id=incident.incident_id,
|
||
status=incident.status.value,
|
||
severity=incident.severity.value,
|
||
signals=[
|
||
s.model_dump(mode="json") for s in incident.signals
|
||
],
|
||
affected_services=incident.affected_services,
|
||
decision_chain=(
|
||
incident.decision_chain.model_dump(mode="json")
|
||
if hasattr(incident, 'decision_chain') and incident.decision_chain
|
||
else None
|
||
),
|
||
proposal_ids=[str(pid) for pid in incident.proposal_ids],
|
||
outcome=(
|
||
incident.outcome.model_dump(mode="json")
|
||
if hasattr(incident, 'outcome') and incident.outcome
|
||
else None
|
||
),
|
||
created_at=incident.created_at,
|
||
updated_at=incident.updated_at,
|
||
resolved_at=incident.resolved_at,
|
||
closed_at=incident.closed_at,
|
||
ttl_days=getattr(incident, 'ttl_days', 30),
|
||
vectorized=getattr(incident, 'vectorized', False),
|
||
)
|
||
db.add(record)
|
||
|
||
logger.debug("db_adapter_save_success", incident_id=incident.incident_id)
|
||
return True
|
||
|
||
except Exception as e:
|
||
logger.error("db_adapter_save_failed", incident_id=incident.incident_id, error=str(e))
|
||
return False
|
||
|
||
def _record_to_incident(self, record: IncidentRecord) -> Incident:
|
||
"""將 DB Record 轉換為 Incident (lewooogo-brain 版本)"""
|
||
# 延遲導入 lewooogo-brain 的 Incident
|
||
from lewooogo_brain.interfaces.incident_processor import (
|
||
Incident as BrainIncident,
|
||
)
|
||
from lewooogo_brain.interfaces.incident_processor import (
|
||
IncidentStatus as BrainIncidentStatus,
|
||
)
|
||
from lewooogo_brain.interfaces.incident_processor import (
|
||
Severity as BrainSeverity,
|
||
)
|
||
from lewooogo_brain.interfaces.incident_processor import (
|
||
Signal as BrainSignal,
|
||
)
|
||
|
||
# 重建 Signals
|
||
signals = []
|
||
for s in record.signals or []:
|
||
signals.append(BrainSignal.model_validate(s))
|
||
|
||
return BrainIncident(
|
||
incident_id=record.incident_id,
|
||
status=BrainIncidentStatus(record.status),
|
||
severity=BrainSeverity(record.severity),
|
||
signals=signals,
|
||
affected_services=record.affected_services or [],
|
||
proposal_ids=record.proposal_ids or [],
|
||
created_at=record.created_at,
|
||
updated_at=record.updated_at,
|
||
resolved_at=record.resolved_at,
|
||
closed_at=record.closed_at,
|
||
)
|
||
|
||
|
||
# =============================================================================
|
||
# Singleton + 絞殺者模式 (Phase 16 R1.2)
|
||
# =============================================================================
|
||
|
||
_dual_memory: DualIncidentMemory | None = None
|
||
_new_engine_memory: Any | None = None # lewooogo-brain 版本
|
||
_db_adapter: IncidentDbAdapter | None = None # DB Adapter singleton
|
||
|
||
|
||
def get_incident_memory() -> DualIncidentMemory:
|
||
"""
|
||
取得 DualIncidentMemory 實例 (Singleton)
|
||
|
||
Phase 16 絞殺者模式:
|
||
- USE_NEW_ENGINE=False (預設): 返回內嵌版本
|
||
- USE_NEW_ENGINE=True: 返回 lewooogo-brain 套件版本
|
||
|
||
回滾指令: kubectl set env deployment/awoooi-api USE_NEW_ENGINE=false
|
||
"""
|
||
from src.core.config import settings
|
||
|
||
if settings.USE_NEW_ENGINE:
|
||
return _get_new_engine_memory()
|
||
else:
|
||
return _get_legacy_memory()
|
||
|
||
|
||
def _get_legacy_memory() -> DualIncidentMemory:
|
||
"""取得內嵌版本 (Phase 16 舊引擎)"""
|
||
global _dual_memory
|
||
if _dual_memory is None:
|
||
_dual_memory = DualIncidentMemory()
|
||
logger.info("incident_memory_initialized", engine="legacy_embedded")
|
||
return _dual_memory
|
||
|
||
|
||
def _get_new_engine_memory() -> Any:
|
||
"""
|
||
取得 lewooogo-brain 套件版本 (Phase 16 新引擎)
|
||
|
||
注意事項:
|
||
- 需要 lewooogo-brain 已安裝 (Dockerfile 已配置)
|
||
- PostgreSQL 透過 IncidentDbAdapter 注入 (Phase 16 DI 模式)
|
||
- 初次啟用建議 48 小時監控
|
||
|
||
回滾: 設定 USE_NEW_ENGINE=false 即可瞬間切回
|
||
"""
|
||
global _new_engine_memory, _db_adapter
|
||
|
||
if _new_engine_memory is None:
|
||
try:
|
||
# 延遲導入: 避免 lewooogo-brain 未安裝時啟動失敗
|
||
from lewooogo_brain.adapters.incident_memory import (
|
||
DualIncidentMemory as NewDualIncidentMemory,
|
||
)
|
||
|
||
from src.core.redis_client import get_redis
|
||
|
||
redis_client = get_redis()
|
||
|
||
# 初始化 DB Adapter (Phase 16 DI 模式)
|
||
if _db_adapter is None:
|
||
_db_adapter = IncidentDbAdapter()
|
||
|
||
# 初始化 lewooogo-brain 版本 (含 DB Adapter)
|
||
_new_engine_memory = NewDualIncidentMemory(
|
||
redis_client=redis_client,
|
||
db_adapter=_db_adapter,
|
||
key_prefix="awoooi:incidents",
|
||
)
|
||
|
||
logger.info(
|
||
"incident_memory_initialized",
|
||
engine="lewooogo_brain_package",
|
||
db_adapter="IncidentDbAdapter",
|
||
redis_connected=True,
|
||
)
|
||
|
||
except ImportError as e:
|
||
# lewooogo-brain 未安裝,降級到內嵌版本
|
||
logger.warning(
|
||
"lewooogo_brain_not_available_fallback_to_legacy",
|
||
error=str(e),
|
||
)
|
||
return _get_legacy_memory()
|
||
|
||
except Exception as e:
|
||
# 其他錯誤,降級到內嵌版本
|
||
logger.error(
|
||
"new_engine_init_failed_fallback_to_legacy",
|
||
error=str(e),
|
||
)
|
||
return _get_legacy_memory()
|
||
|
||
return _new_engine_memory
|