""" Signal Worker - Redis Streams Consumer ======================================= Phase 6.1: Event Bus Implementation 功能: - XREADGROUP 消費 stream:awoooi_signals - Signal → Incident 聚合邏輯 (Phase 6.3 實作) - 失敗重試 + ACK 機制 - Graceful Shutdown Redis Streams 概念: - Stream: stream:awoooi_signals (訊息佇列) - Consumer Group: awoooi_workers (消費者群組) - Consumer: worker_{hostname} (單一消費者) 統帥鐵律: - 使用 XREADGROUP 確保訊息只被處理一次 - 處理完成後必須 XACK - 失敗訊息進入 Pending List,需定期清理 """ import asyncio import socket from typing import Any import structlog from src.core.redis_client import get_redis from src.services.incident_engine import get_incident_engine logger = structlog.get_logger(__name__) # ============================================================================= # Constants # ============================================================================= STREAM_KEY = "stream:awoooi_signals" CONSUMER_GROUP = "awoooi_workers" CONSUMER_NAME = f"worker_{socket.gethostname()}" # 每次讀取的訊息數量 BATCH_SIZE = 10 # 讀取超時 (毫秒) - 0 表示阻塞等待 BLOCK_MS = 5000 # 失敗重試上限 MAX_RETRIES = 3 # ============================================================================= # Signal Worker # ============================================================================= class SignalWorker: """ Redis Streams 訊號消費者 職責: 1. 從 stream:awoooi_signals 讀取訊號 2. 將訊號聚合為 Incident (Phase 6.3) 3. 更新 Working Memory (Redis) 4. 觸發決策引擎 (Phase 6.4) 使用方式: worker = SignalWorker() await worker.start() # 啟動消費循環 await worker.stop() # 優雅關閉 """ def __init__(self) -> None: self._running = False self._task: asyncio.Task | None = None async def _ensure_consumer_group(self) -> None: """ 確保 Consumer Group 存在 XGROUP CREATE 如果 Group 已存在會報錯, 因此使用 MKSTREAM 選項並忽略 BUSYGROUP 錯誤。 """ redis_client = get_redis() try: # MKSTREAM: 如果 Stream 不存在則建立 await redis_client.xgroup_create( STREAM_KEY, CONSUMER_GROUP, id="0", # 從頭開始消費 mkstream=True, ) logger.info( "consumer_group_created", stream=STREAM_KEY, group=CONSUMER_GROUP, ) except Exception as e: # BUSYGROUP: Group 已存在,忽略 if "BUSYGROUP" in str(e): logger.debug("consumer_group_exists", group=CONSUMER_GROUP) else: raise async def start(self) -> None: """ 啟動消費循環 在背景執行,不阻塞主執行緒。 """ if self._running: logger.warning("signal_worker_already_running") return await self._ensure_consumer_group() self._running = True self._task = asyncio.create_task(self._consume_loop()) logger.info( "signal_worker_started", stream=STREAM_KEY, group=CONSUMER_GROUP, consumer=CONSUMER_NAME, ) async def stop(self) -> None: """ 優雅關閉 等待當前處理完成後停止。 """ if not self._running: return self._running = False if self._task: try: # 給予 5 秒完成當前處理 await asyncio.wait_for(self._task, timeout=5.0) except asyncio.TimeoutError: logger.warning("signal_worker_stop_timeout") self._task.cancel() except asyncio.CancelledError: pass logger.info("signal_worker_stopped") async def _consume_loop(self) -> None: """ 主消費循環 XREADGROUP 阻塞等待新訊息,處理後 XACK。 """ redis_client = get_redis() while self._running: try: # XREADGROUP: 從 Consumer Group 讀取訊息 # >: 只讀取新訊息 (不包含 Pending List) messages = await redis_client.xreadgroup( groupname=CONSUMER_GROUP, consumername=CONSUMER_NAME, streams={STREAM_KEY: ">"}, count=BATCH_SIZE, block=BLOCK_MS, ) if not messages: # 超時,沒有新訊息 continue # messages 格式: [[stream_name, [(id, data), ...]]] for stream_name, entries in messages: for message_id, data in entries: await self._process_signal(message_id, data) except asyncio.CancelledError: logger.info("signal_worker_cancelled") break except Exception as e: logger.exception("signal_worker_error", error=str(e)) # 避免無限快速重試 await asyncio.sleep(1.0) async def _process_signal(self, message_id: str, data: dict[str, Any]) -> None: """ 處理單一訊號 Phase 6.3 核心邏輯: 1. 訊號去重 (fingerprint) 2. 訊號聚合 (30分鐘時間窗口 + 服務關聯) 3. Incident 建立/更新 (聚合到同一 Incident) 4. GraphRAG 爆炸半徑分析 5. 雙層持久化 (Redis + PostgreSQL) """ redis_client = get_redis() try: logger.info( "signal_received", message_id=message_id, source=data.get("source", "unknown"), alert_name=data.get("alert_name", "unknown"), severity=data.get("severity", "unknown"), namespace=data.get("namespace", "default"), target=data.get("target", "unknown"), ) # Phase 6.3: 使用 IncidentEngine 處理訊號 # - 自動聚合相關告警到同一 Incident # - GraphRAG 分析爆炸半徑 # - 雙層持久化 engine = get_incident_engine() incident = await engine.process_signal(data) if incident: logger.info( "signal_processed_by_engine", message_id=message_id, incident_id=incident.incident_id, severity=incident.severity.value, signal_count=len(incident.signals), affected_services=incident.affected_services, persisted_to_pg=incident.persisted_to_pg, ) else: logger.warning( "signal_processing_failed", message_id=message_id, signal_data=data, ) # ACK: 確認訊息已處理 await redis_client.xack(STREAM_KEY, CONSUMER_GROUP, message_id) logger.debug("signal_acked", message_id=message_id) except Exception as e: logger.exception( "signal_process_error", message_id=message_id, error=str(e), ) # 不 ACK,訊息會留在 Pending List # Phase 6.3 將實作 Pending List 清理機制 # ============================================================================= # Singleton # ============================================================================= _signal_worker: SignalWorker | None = None async def init_signal_worker() -> SignalWorker: """ 初始化並啟動 Signal Worker 統帥鐵律: 在 Lifespan 啟動時調用 """ global _signal_worker if _signal_worker is not None: return _signal_worker _signal_worker = SignalWorker() await _signal_worker.start() return _signal_worker async def close_signal_worker() -> None: """ 關閉 Signal Worker 統帥鐵律: 在 Lifespan 關閉時調用 """ global _signal_worker if _signal_worker is not None: await _signal_worker.stop() _signal_worker = None def get_signal_worker() -> SignalWorker: """ 取得 Signal Worker 實例 Raises: RuntimeError: 若 Worker 未初始化 """ if _signal_worker is None: raise RuntimeError( "Signal worker not initialized. Call init_signal_worker() first." ) return _signal_worker