Files
awoooi/apps/api/src/services/incident_memory.py
Your Name 1a2b04f5cf
Some checks failed
Code Review / ai-code-review (push) Successful in 11s
CD Pipeline / tests (push) Successful in 1m14s
CD Pipeline / build-and-deploy (push) Failing after 3m33s
CD Pipeline / post-deploy-checks (push) Has been skipped
fix(awooop): persist signal metadata and auto-repair prestate
2026-05-18 10:59:54 +08:00

339 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Incident Memory Provider - 事件記憶體提供者
============================================
Phase 6.4e: DualIncidentMemory 整合
Phase 16 R1.2: 絞殺者模式 (Strangler Fig Pattern) 2026-03-26
Phase R-R2 (2026-04-01 ogt): 移除內嵌 DualIncidentMemory 重複邏輯,
全面切換至 lewooogo-brain。回滾方式: git revert + redeploy。
設計:
- IncidentDbAdapter: SQLAlchemy Bridge, 注入 lewooogo-brain DualIncidentMemory
- 雙層記憶體: Working (Redis) + Episodic (PostgreSQL)
- 反向索引: namespace:target -> incident_id
統帥鐵律:
- Working Memory (Redis): 7 天 TTL
- Episodic Memory (PostgreSQL): 永久
- 反向索引: 30 分鐘 TTL (聚合窗口)
"""
from typing import Any
import structlog
from src.core.redis_client import get_redis
from src.db.base import get_db_context
from src.db.models import IncidentRecord
from src.models.incident import Incident
logger = structlog.get_logger(__name__)
def _signal_to_dict(signal: Any) -> dict[str, Any]:
    """Return *signal* as a plain dict, whatever form it arrived in.

    Three input shapes are handled, in this order:

    * a ``dict`` is returned unchanged (the very same object, not a copy);
    * an object exposing ``model_dump`` is serialised via
      ``model_dump(mode="json")``;
    * anything else is read attribute by attribute, with missing attributes
      becoming ``None`` (or ``{}`` for ``labels`` / ``annotations``).
    """
    if isinstance(signal, dict):
        return signal
    if hasattr(signal, "model_dump"):
        return signal.model_dump(mode="json")

    # Attribute-by-attribute fallback; key order matches the signal shape
    # used elsewhere in this module.
    flattened: dict[str, Any] = {}
    for field in ("alert_name", "severity", "source"):
        flattened[field] = getattr(signal, field, None)
    for field in ("labels", "annotations"):
        # Mapping-like fields must never be None for downstream `.get` calls.
        flattened[field] = getattr(signal, field, None) or {}
    flattened["fingerprint"] = getattr(signal, "fingerprint", None)
    return flattened
def _derive_incident_alert_metadata(incident: Incident) -> dict[str, Any]:
    """Build alert metadata for an incident from its first signal.

    Only ``incident.signals[0]`` is inspected; an incident without signals
    yields mostly-empty metadata. The returned dict always carries the keys
    ``alertname``, ``severity``, ``alert_category``, ``notification_type``,
    ``description`` and ``actor``. Empty text values are reported as ``None``.
    """

    def _text_or_none(value: Any) -> str | None:
        # Falsy values (empty string, None) are stored as None, not "".
        return str(value) if value else None

    signals = incident.signals
    head = signals[0] if signals else None
    payload = _signal_to_dict(head) if head else {}

    labels = payload.get("labels") or {}
    annotations = payload.get("annotations") or {}

    # Alert name: the Alertmanager-style label wins over signal-level fields.
    alertname = (
        labels.get("alertname")
        or payload.get("alert_name")
        or payload.get("alertname")
        or ""
    )

    # Severity precedence: signal -> incident -> label -> "warning".
    severity = payload.get("severity")
    if not severity:
        incident_severity = incident.severity
        severity = getattr(incident_severity, "value", incident_severity)
    if not severity:
        severity = labels.get("severity")
    if not severity:
        severity = "warning"
    # The signal-level value may itself be an enum member; unwrap it.
    severity = getattr(severity, "value", severity)

    alert_category = notification_type = None
    if alertname:
        # Imported lazily, as in the rest of this module's service lookups.
        from src.services.incident_service import classify_alert_early

        alert_category, notification_type = classify_alert_early(
            str(alertname),
            str(severity),
            labels,
        )

    # First non-empty annotation among message / description / summary.
    description = ""
    for key in ("message", "description", "summary"):
        candidate = annotations.get(key)
        if candidate:
            description = candidate
            break

    actor = payload.get("source") or labels.get("source") or "signal_worker"

    return {
        "alertname": _text_or_none(alertname),
        "severity": _text_or_none(severity),
        "alert_category": alert_category,
        "notification_type": notification_type,
        "description": _text_or_none(description),
        "actor": actor,
    }
async def _add_signal_timeline_event(
    incident: Incident,
    metadata: dict[str, Any],
) -> None:
    """Record a "Signal received" timeline event for *incident*, best-effort.

    Does nothing when ``metadata`` carries no ``alertname``. Any failure
    while resolving the timeline service or adding the event is logged as a
    warning and swallowed, so callers are never interrupted by it.
    """
    alertname = metadata.get("alertname")
    if not alertname:
        return

    try:
        from src.services.approval_db import get_timeline_service

        timeline = get_timeline_service()
        event = {
            "event_type": "webhook",
            "status": "success",
            "title": f"Signal received: {alertname}",
            "description": metadata.get("description"),
            "actor": metadata.get("actor"),
            "actor_role": "signal_worker",
            # Severity may be an enum or a plain value; pass the raw value.
            "risk_level": getattr(incident.severity, "value", incident.severity),
            "incident_id": incident.incident_id,
        }
        await timeline.add_event(**event)
    except Exception as exc:
        # Deliberately non-fatal: seeding the timeline is a nicety.
        logger.warning(
            "incident_signal_timeline_seed_failed",
            incident_id=incident.incident_id,
            alertname=alertname,
            error=str(exc),
        )
# =============================================================================
# Phase 16: IncidentDbAdapter (DI 注入實現)
# =============================================================================
class IncidentDbAdapter:
    """
    Incident DB adapter - implements lewooogo-brain's IIncidentDbAdapter.

    Phase 16: wraps the apps/api SQLAlchemy model operations as an adapter
    that is injected into lewooogo-brain's DualIncidentMemory.
    """

    async def load(self, incident_id: str) -> Incident | None:
        """Load an incident from PostgreSQL by ``incident_id``.

        Returns the converted incident, or ``None`` when no row matches.
        Any exception is logged and also reported as ``None``.

        NOTE(review): the annotation names the local ``Incident``, but
        ``_record_to_incident`` builds a lewooogo-brain ``Incident``.
        """
        try:
            async with get_db_context() as db:
                from sqlalchemy import select

                stmt = select(IncidentRecord).where(
                    IncidentRecord.incident_id == incident_id
                )
                result = await db.execute(stmt)
                record = result.scalar_one_or_none()
                if record:
                    return self._record_to_incident(record)
                return None
        except Exception as e:
            # Failures are not propagated; callers cannot tell "not found"
            # apart from "lookup failed" by the return value alone.
            logger.error("db_adapter_load_failed", incident_id=incident_id, error=str(e))
            return None

    async def save(self, incident: Incident) -> bool:
        """Save an incident to PostgreSQL (upsert).

        An existing row (matched on ``incident_id``) is updated in place;
        otherwise a new row is inserted and a timeline event is seeded.

        Returns ``True`` on success and ``False`` when any exception occurs
        (the error is logged, not raised).
        """
        # Derived once up front; used by both branches and the timeline seed.
        metadata = _derive_incident_alert_metadata(incident)
        created = False
        try:
            async with get_db_context() as db:
                from sqlalchemy import select

                stmt = select(IncidentRecord).where(
                    IncidentRecord.incident_id == incident.incident_id
                )
                result = await db.execute(stmt)
                existing = result.scalar_one_or_none()
                if existing:
                    # Update path: only the fields below are refreshed.
                    # decision_chain, proposal_ids, outcome, created_at,
                    # ttl_days and vectorized are written on insert only.
                    existing.status = incident.status.value
                    existing.severity = incident.severity.value
                    existing.signals = [
                        s.model_dump(mode="json") for s in incident.signals
                    ]
                    existing.affected_services = incident.affected_services
                    existing.updated_at = incident.updated_at
                    # Timestamps are only overwritten by a truthy value, so
                    # a stored resolved_at/closed_at is never cleared here.
                    if incident.resolved_at:
                        existing.resolved_at = incident.resolved_at
                    if incident.closed_at:
                        existing.closed_at = incident.closed_at
                    # Alert metadata is backfilled only where the stored
                    # column is currently empty; existing values win.
                    if metadata.get("alertname") and not existing.alertname:
                        existing.alertname = metadata["alertname"]
                    if metadata.get("notification_type") and not existing.notification_type:
                        existing.notification_type = metadata["notification_type"]
                    if metadata.get("alert_category") and not existing.alert_category:
                        existing.alert_category = metadata["alert_category"]
                else:
                    # Insert path: optional attributes are read defensively
                    # because the incident object may not define them.
                    record = IncidentRecord(
                        incident_id=incident.incident_id,
                        status=incident.status.value,
                        severity=incident.severity.value,
                        signals=[
                            s.model_dump(mode="json") for s in incident.signals
                        ],
                        affected_services=incident.affected_services,
                        decision_chain=(
                            incident.decision_chain.model_dump(mode="json")
                            if hasattr(incident, 'decision_chain') and incident.decision_chain
                            else None
                        ),
                        proposal_ids=[str(pid) for pid in incident.proposal_ids],
                        outcome=(
                            incident.outcome.model_dump(mode="json")
                            if hasattr(incident, 'outcome') and incident.outcome
                            else None
                        ),
                        created_at=incident.created_at,
                        updated_at=incident.updated_at,
                        resolved_at=incident.resolved_at,
                        closed_at=incident.closed_at,
                        ttl_days=getattr(incident, 'ttl_days', 30),
                        vectorized=getattr(incident, 'vectorized', False),
                        alertname=metadata.get("alertname"),
                        notification_type=metadata.get("notification_type"),
                        alert_category=metadata.get("alert_category"),
                    )
                    db.add(record)
                    created = True

            # Seed the timeline for newly inserted incidents only.
            # NOTE(review): presumably get_db_context has committed the row
            # by this point - confirm against its implementation.
            if created:
                await _add_signal_timeline_event(incident, metadata)

            logger.debug("db_adapter_save_success", incident_id=incident.incident_id)
            return True
        except Exception as e:
            logger.error("db_adapter_save_failed", incident_id=incident.incident_id, error=str(e))
            return False

    def _record_to_incident(self, record: IncidentRecord) -> Any:
        """
        Convert a DB record into a BrainIncident (the lewooogo-brain version).

        Note: a BrainIncident is returned for internal use by lewooogo-brain's
        DualIncidentMemory. Local services consume it after converting with
        IncidentConverter.brain_to_local().
        (ADR-046 - 2026-04-01 ogt)

        Only the columns passed below are carried over; decision_chain,
        outcome, ttl_days, vectorized and the alert metadata columns are not.
        """
        from lewooogo_brain.interfaces.incident_processor import (
            Incident as BrainIncident,
        )
        from lewooogo_brain.interfaces.incident_processor import (
            IncidentStatus as BrainIncidentStatus,
        )
        from lewooogo_brain.interfaces.incident_processor import (
            Severity as BrainSeverity,
        )
        from lewooogo_brain.interfaces.incident_processor import (
            Signal as BrainSignal,
        )

        # A NULL signals column is treated as "no signals".
        signals = []
        for s in record.signals or []:
            signals.append(BrainSignal.model_validate(s))
        return BrainIncident(
            incident_id=record.incident_id,
            status=BrainIncidentStatus(record.status),
            severity=BrainSeverity(record.severity),
            signals=signals,
            affected_services=record.affected_services or [],
            proposal_ids=record.proposal_ids or [],
            created_at=record.created_at,
            updated_at=record.updated_at,
            resolved_at=record.resolved_at,
            closed_at=record.closed_at,
        )
# =============================================================================
# Singleton (Phase R-R2: 僅保留 lewooogo-brain 版本)
# =============================================================================
# Lazily created DualIncidentMemory instance; set by _get_new_engine_memory().
_new_engine_memory: Any | None = None
# Adapter handed to DualIncidentMemory; created on first initialisation.
_db_adapter: IncidentDbAdapter | None = None
def get_incident_memory() -> Any:
    """
    Return the DualIncidentMemory instance (singleton).

    Phase R-R2: always uses the lewooogo-brain package version.
    Rollback: git revert the Phase R-R2 commit and redeploy.
    """
    # Thin public entry point; creation and caching live in the helper.
    return _get_new_engine_memory()
def _get_new_engine_memory() -> Any:
    """
    Return the lewooogo-brain DualIncidentMemory, building it on first use.

    Notes:
    - lewooogo-brain must be importable (the original note says the
      Dockerfile installs it).
    - PostgreSQL access is supplied through IncidentDbAdapter
      (Phase 16 dependency-injection pattern).

    Raises:
        ImportError: lewooogo_brain could not be imported (logged, re-raised).
        Exception: any other initialisation failure (logged, re-raised).
    """
    global _new_engine_memory, _db_adapter

    # Fast path: the instance has already been built.
    if _new_engine_memory is not None:
        return _new_engine_memory

    try:
        from lewooogo_brain.adapters.incident_memory import (
            DualIncidentMemory as NewDualIncidentMemory,
        )

        redis = get_redis()
        # An adapter may survive from an earlier attempt that failed later on.
        if _db_adapter is None:
            _db_adapter = IncidentDbAdapter()
        _new_engine_memory = NewDualIncidentMemory(
            redis_client=redis,
            db_adapter=_db_adapter,
            key_prefix="awoooi:incidents",
        )
        logger.info(
            "incident_memory_initialized",
            engine="lewooogo_brain_package",
            db_adapter="IncidentDbAdapter",
            redis_connected=True,
        )
    except Exception as exc:
        # A missing package gets its own event name; everything else is
        # reported as a generic initialisation failure. Both are re-raised.
        failure_event = (
            "lewooogo_brain_not_available"
            if isinstance(exc, ImportError)
            else "new_engine_init_failed"
        )
        logger.error(
            failure_event,
            error=str(exc),
        )
        raise

    return _new_engine_memory