Files
awoooi/apps/api/src/services/incident_memory.py
OG T 6f049877fc fix(lint): ruff auto-fix + lewooogo-core src 加入 git
- Python: ruff --fix 修復 280 個 lint 錯誤
- lewooogo-core: src/ 目錄未追蹤,導致 CI eslint 失敗

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-23 23:51:37 +08:00

484 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Incident Memory Provider - 事件記憶體提供者
============================================
Phase 6.4e: DualIncidentMemory 整合
設計:
- 實作 IIncidentMemory 協定 (Protocol)
- 雙層記憶體: Working (Redis) + Episodic (PostgreSQL)
- 反向索引: namespace:target -> incident_id
統帥鐵律:
- Working Memory (Redis): 7 天 TTL
- Episodic Memory (PostgreSQL): 永久
- 反向索引: 30 分鐘 TTL (聚合窗口)
NOTE: 此模組為 lewooogo-brain/adapters/incident_memory.py 的 apps/api 內嵌版本
待 Phase 6.4i 完成 monorepo Docker 解法後,將直接引用 lewooogo-brain 套件
"""
from datetime import UTC, datetime, timedelta
from typing import Any, Protocol
import structlog
from src.core.redis_client import get_redis
from src.db.base import get_db_context
from src.db.models import IncidentRecord
from src.models.incident import Incident
logger = structlog.get_logger(__name__)
# =============================================================================
# Constants
# =============================================================================
WORKING_MEMORY_TTL = 604800 # 7 天
AGGREGATION_WINDOW_MINUTES = 30
INDEX_TTL = 1800 # 索引 30 分鐘 TTL
# Redis Key Patterns
INCIDENT_KEY_PREFIX = "awoooi:incidents:"
INDEX_PREFIX = "awoooi:incidents:index:"
# =============================================================================
# Protocol Definition (與 lewooogo-brain 保持一致)
# =============================================================================
class IIncidentMemory(Protocol):
"""Incident 專用記憶體提供者協定"""
async def load_incident(self, incident_id: str) -> Incident | None:
"""從 Working Memory 載入 Incident"""
...
async def save_incident(self, incident: Incident, ttl_seconds: int = WORKING_MEMORY_TTL) -> bool:
"""儲存 Incident 到 Working Memory (預設 7 天 TTL)"""
...
async def persist_incident(self, incident: Incident) -> bool:
"""持久化到 Episodic Memory (PostgreSQL)"""
...
async def find_related_incident(
self,
namespace: str,
target: str,
window_minutes: int = AGGREGATION_WINDOW_MINUTES,
) -> Incident | None:
"""尋找相關的活躍 Incident (用於聚合)"""
...
async def update_index(
self,
incident_id: str,
namespace: str,
target: str,
) -> bool:
"""更新反向索引 (namespace/target -> incident_id)"""
...
# =============================================================================
# DualIncidentMemory Implementation
# =============================================================================
class DualIncidentMemory:
"""
Incident 專用雙層記憶體適配器
實作 IIncidentMemory 協定:
- load_incident: 從 Working/Episodic 載入
- save_incident: 儲存到 Working
- persist_incident: 持久化到 Episodic
- find_related_incident: 透過反向索引尋找相關 Incident
- update_index: 更新反向索引
反向索引結構:
Key: awoooi:incidents:index:{namespace}:{target}
Value: incident_id
TTL: 30 分鐘 (聚合窗口)
"""
def __init__(self, redis_client: Any = None, key_prefix: str = INCIDENT_KEY_PREFIX):
"""
初始化適配器
Args:
redis_client: Redis 連線客戶端 (可選,預設使用 get_redis())
key_prefix: Redis Key 前綴
"""
self._redis = redis_client
self._key_prefix = key_prefix
self._index_prefix = INDEX_PREFIX
def _get_redis(self) -> Any:
"""取得 Redis 客戶端 (延遲初始化)"""
if self._redis is None:
self._redis = get_redis()
return self._redis
def _make_key(self, incident_id: str) -> str:
"""生成 Incident Key"""
return f"{self._key_prefix}{incident_id}"
def _make_index_key(self, namespace: str, target: str) -> str:
"""生成索引 Key"""
return f"{self._index_prefix}{namespace}:{target}"
async def load_incident(self, incident_id: str) -> Incident | None:
"""
載入 Incident
策略:
1. 從 Redis (Working Memory) 讀取
2. 若 miss從 PostgreSQL (Episodic) 讀取
Args:
incident_id: Incident ID
Returns:
Incident 或 None
"""
try:
redis_client = self._get_redis()
key = self._make_key(incident_id)
data = await redis_client.get(key)
if data is not None:
# JSON -> Incident
return Incident.model_validate_json(data)
# Working Memory miss, 嘗試從 Episodic Memory 載入
logger.debug("incident_not_found_in_working", incident_id=incident_id)
async with get_db_context() as db:
from sqlalchemy import select
stmt = select(IncidentRecord).where(
IncidentRecord.incident_id == incident_id
)
result = await db.execute(stmt)
record = result.scalar_one_or_none()
if record:
# 從 DB 重建 Incident
incident = self._record_to_incident(record)
# 寫回 Working Memory (快取)
await self.save_incident(incident)
return incident
return None
except Exception as e:
logger.error("load_incident_failed", incident_id=incident_id, error=str(e))
return None
async def save_incident(
self,
incident: Incident,
ttl_seconds: int = WORKING_MEMORY_TTL,
) -> bool:
"""
儲存 Incident 到 Working Memory (Redis)
Args:
incident: Incident 物件
ttl_seconds: TTL (預設 7 天)
Returns:
是否成功
"""
try:
redis_client = self._get_redis()
key = self._make_key(incident.incident_id)
json_data = incident.model_dump_json()
await redis_client.setex(key, ttl_seconds, json_data)
logger.debug(
"incident_saved_to_working",
incident_id=incident.incident_id,
ttl=ttl_seconds,
)
return True
except Exception as e:
logger.error(
"save_incident_failed",
incident_id=incident.incident_id,
error=str(e),
)
return False
async def persist_incident(self, incident: Incident) -> bool:
"""
持久化到 Episodic Memory (PostgreSQL)
Args:
incident: Incident 物件
Returns:
是否成功
"""
try:
async with get_db_context() as db:
from sqlalchemy import select
# 檢查是否已存在
stmt = select(IncidentRecord).where(
IncidentRecord.incident_id == incident.incident_id
)
result = await db.execute(stmt)
existing = result.scalar_one_or_none()
if existing:
# 更新現有記錄
existing.status = incident.status.value
existing.severity = incident.severity.value
existing.signals = [
s.model_dump(mode="json") for s in incident.signals
]
existing.affected_services = incident.affected_services
existing.updated_at = incident.updated_at
if incident.resolved_at:
existing.resolved_at = incident.resolved_at
if incident.closed_at:
existing.closed_at = incident.closed_at
else:
# 建立新記錄
record = IncidentRecord(
incident_id=incident.incident_id,
status=incident.status.value,
severity=incident.severity.value,
signals=[
s.model_dump(mode="json") for s in incident.signals
],
affected_services=incident.affected_services,
decision_chain=(
incident.decision_chain.model_dump(mode="json")
if incident.decision_chain
else None
),
proposal_ids=[str(pid) for pid in incident.proposal_ids],
outcome=(
incident.outcome.model_dump(mode="json")
if incident.outcome
else None
),
created_at=incident.created_at,
updated_at=incident.updated_at,
resolved_at=incident.resolved_at,
closed_at=incident.closed_at,
ttl_days=incident.ttl_days,
vectorized=incident.vectorized,
)
db.add(record)
logger.debug(
"incident_persisted_to_episodic",
incident_id=incident.incident_id,
)
return True
except Exception as e:
logger.error(
"persist_incident_failed",
incident_id=incident.incident_id,
error=str(e),
)
return False
async def find_related_incident(
self,
namespace: str,
target: str,
window_minutes: int = AGGREGATION_WINDOW_MINUTES,
) -> Incident | None:
"""
尋找相關的活躍 Incident (用於聚合)
透過反向索引快速查找:
1. 查詢索引 Key: namespace:target -> incident_id
2. 載入 Incident
3. 檢查是否仍在聚合窗口內
Args:
namespace: 命名空間
target: 目標服務
window_minutes: 聚合窗口 (分鐘)
Returns:
相關 Incident 或 None
"""
try:
redis_client = self._get_redis()
# Step 1: 查詢索引
index_key = self._make_index_key(namespace, target)
incident_id = await redis_client.get(index_key)
if incident_id is None:
return None
# 解碼 bytes
if isinstance(incident_id, bytes):
incident_id = incident_id.decode()
# Step 2: 載入 Incident
incident = await self.load_incident(incident_id)
if incident is None:
# 索引存在但 Incident 不存在,清除索引
await redis_client.delete(index_key)
return None
# Step 3: 檢查聚合窗口
window_start = datetime.now(UTC) - timedelta(minutes=window_minutes)
if incident.updated_at < window_start:
# 超出聚合窗口,不聚合
logger.debug(
"incident_outside_window",
incident_id=incident_id,
updated_at=incident.updated_at.isoformat(),
)
return None
logger.debug(
"found_related_incident",
incident_id=incident_id,
namespace=namespace,
target=target,
)
return incident
except Exception as e:
logger.error(
"find_related_incident_failed",
namespace=namespace,
target=target,
error=str(e),
)
return None
async def update_index(
self,
incident_id: str,
namespace: str,
target: str,
) -> bool:
"""
更新反向索引
索引結構:
Key: awoooi:incidents:index:{namespace}:{target}
Value: incident_id
TTL: 30 分鐘
Args:
incident_id: Incident ID
namespace: 命名空間
target: 目標服務
Returns:
是否成功
"""
try:
redis_client = self._get_redis()
index_key = self._make_index_key(namespace, target)
await redis_client.setex(index_key, INDEX_TTL, incident_id)
logger.debug(
"index_updated",
incident_id=incident_id,
namespace=namespace,
target=target,
ttl=INDEX_TTL,
)
return True
except Exception as e:
logger.error(
"update_index_failed",
incident_id=incident_id,
namespace=namespace,
target=target,
error=str(e),
)
return False
async def delete_incident(self, incident_id: str) -> bool:
"""
刪除 Incident
Args:
incident_id: Incident ID
Returns:
是否成功
"""
try:
redis_client = self._get_redis()
key = self._make_key(incident_id)
result = await redis_client.delete(key)
return result > 0
except Exception as e:
logger.error(
"delete_incident_failed",
incident_id=incident_id,
error=str(e),
)
return False
def _record_to_incident(self, record: IncidentRecord) -> Incident:
"""
將 DB Record 轉換為 Incident 物件
Args:
record: IncidentRecord
Returns:
Incident
"""
from src.models.incident import (
IncidentStatus,
Severity,
Signal,
)
# 重建 Signals
signals = []
for s in record.signals or []:
signals.append(Signal.model_validate(s))
return Incident(
incident_id=record.incident_id,
status=IncidentStatus(record.status),
severity=Severity(record.severity),
signals=signals,
affected_services=record.affected_services or [],
proposal_ids=record.proposal_ids or [],
created_at=record.created_at,
updated_at=record.updated_at,
resolved_at=record.resolved_at,
closed_at=record.closed_at,
ttl_days=record.ttl_days or 30,
vectorized=record.vectorized or False,
)
# =============================================================================
# Singleton
# =============================================================================
_dual_memory: DualIncidentMemory | None = None
def get_incident_memory() -> DualIncidentMemory:
"""取得 DualIncidentMemory 實例 (Singleton)"""
global _dual_memory
if _dual_memory is None:
_dual_memory = DualIncidentMemory()
return _dual_memory