- apps/api: FastAPI backend with Dockerfile - apps/web: Next.js frontend with Dockerfile - apps/sensor: Signal collection agent - packages: shared packages Co-Authored-By: Claude <noreply@anthropic.com>
295 lines
8.6 KiB
Python
295 lines
8.6 KiB
Python
"""
|
||
Signal Worker - Redis Streams Consumer
|
||
=======================================
|
||
Phase 6.1: Event Bus Implementation
|
||
|
||
功能:
|
||
- XREADGROUP 消費 stream:awoooi_signals
|
||
- Signal → Incident 聚合邏輯 (Phase 6.3 實作)
|
||
- 失敗重試 + ACK 機制
|
||
- Graceful Shutdown
|
||
|
||
Redis Streams 概念:
|
||
- Stream: stream:awoooi_signals (訊息佇列)
|
||
- Consumer Group: awoooi_workers (消費者群組)
|
||
- Consumer: worker_{hostname} (單一消費者)
|
||
|
||
統帥鐵律:
|
||
- 使用 XREADGROUP 確保訊息只被處理一次
|
||
- 處理完成後必須 XACK
|
||
- 失敗訊息進入 Pending List,需定期清理
|
||
"""
|
||
|
||
import asyncio
|
||
import socket
|
||
from typing import Any
|
||
|
||
import structlog
|
||
|
||
from src.core.redis_client import get_redis
|
||
from src.services.incident_engine import get_incident_engine
|
||
|
||
logger = structlog.get_logger(__name__)
|
||
|
||
|
||
# =============================================================================
|
||
# Constants
|
||
# =============================================================================
|
||
|
||
STREAM_KEY = "stream:awoooi_signals"
|
||
CONSUMER_GROUP = "awoooi_workers"
|
||
CONSUMER_NAME = f"worker_{socket.gethostname()}"
|
||
|
||
# 每次讀取的訊息數量
|
||
BATCH_SIZE = 10
|
||
# 讀取超時 (毫秒) - 0 表示阻塞等待
|
||
BLOCK_MS = 5000
|
||
# 失敗重試上限
|
||
MAX_RETRIES = 3
|
||
|
||
|
||
# =============================================================================
|
||
# Signal Worker
|
||
# =============================================================================
|
||
|
||
class SignalWorker:
|
||
"""
|
||
Redis Streams 訊號消費者
|
||
|
||
職責:
|
||
1. 從 stream:awoooi_signals 讀取訊號
|
||
2. 將訊號聚合為 Incident (Phase 6.3)
|
||
3. 更新 Working Memory (Redis)
|
||
4. 觸發決策引擎 (Phase 6.4)
|
||
|
||
使用方式:
|
||
worker = SignalWorker()
|
||
await worker.start() # 啟動消費循環
|
||
await worker.stop() # 優雅關閉
|
||
"""
|
||
|
||
def __init__(self) -> None:
|
||
self._running = False
|
||
self._task: asyncio.Task | None = None
|
||
|
||
async def _ensure_consumer_group(self) -> None:
|
||
"""
|
||
確保 Consumer Group 存在
|
||
|
||
XGROUP CREATE 如果 Group 已存在會報錯,
|
||
因此使用 MKSTREAM 選項並忽略 BUSYGROUP 錯誤。
|
||
"""
|
||
redis_client = get_redis()
|
||
try:
|
||
# MKSTREAM: 如果 Stream 不存在則建立
|
||
await redis_client.xgroup_create(
|
||
STREAM_KEY,
|
||
CONSUMER_GROUP,
|
||
id="0", # 從頭開始消費
|
||
mkstream=True,
|
||
)
|
||
logger.info(
|
||
"consumer_group_created",
|
||
stream=STREAM_KEY,
|
||
group=CONSUMER_GROUP,
|
||
)
|
||
except Exception as e:
|
||
# BUSYGROUP: Group 已存在,忽略
|
||
if "BUSYGROUP" in str(e):
|
||
logger.debug("consumer_group_exists", group=CONSUMER_GROUP)
|
||
else:
|
||
raise
|
||
|
||
async def start(self) -> None:
|
||
"""
|
||
啟動消費循環
|
||
|
||
在背景執行,不阻塞主執行緒。
|
||
"""
|
||
if self._running:
|
||
logger.warning("signal_worker_already_running")
|
||
return
|
||
|
||
await self._ensure_consumer_group()
|
||
|
||
self._running = True
|
||
self._task = asyncio.create_task(self._consume_loop())
|
||
logger.info(
|
||
"signal_worker_started",
|
||
stream=STREAM_KEY,
|
||
group=CONSUMER_GROUP,
|
||
consumer=CONSUMER_NAME,
|
||
)
|
||
|
||
async def stop(self) -> None:
|
||
"""
|
||
優雅關閉
|
||
|
||
等待當前處理完成後停止。
|
||
"""
|
||
if not self._running:
|
||
return
|
||
|
||
self._running = False
|
||
|
||
if self._task:
|
||
try:
|
||
# 給予 5 秒完成當前處理
|
||
await asyncio.wait_for(self._task, timeout=5.0)
|
||
except asyncio.TimeoutError:
|
||
logger.warning("signal_worker_stop_timeout")
|
||
self._task.cancel()
|
||
except asyncio.CancelledError:
|
||
pass
|
||
|
||
logger.info("signal_worker_stopped")
|
||
|
||
async def _consume_loop(self) -> None:
|
||
"""
|
||
主消費循環
|
||
|
||
XREADGROUP 阻塞等待新訊息,處理後 XACK。
|
||
"""
|
||
redis_client = get_redis()
|
||
|
||
while self._running:
|
||
try:
|
||
# XREADGROUP: 從 Consumer Group 讀取訊息
|
||
# >: 只讀取新訊息 (不包含 Pending List)
|
||
messages = await redis_client.xreadgroup(
|
||
groupname=CONSUMER_GROUP,
|
||
consumername=CONSUMER_NAME,
|
||
streams={STREAM_KEY: ">"},
|
||
count=BATCH_SIZE,
|
||
block=BLOCK_MS,
|
||
)
|
||
|
||
if not messages:
|
||
# 超時,沒有新訊息
|
||
continue
|
||
|
||
# messages 格式: [[stream_name, [(id, data), ...]]]
|
||
for stream_name, entries in messages:
|
||
for message_id, data in entries:
|
||
await self._process_signal(message_id, data)
|
||
|
||
except asyncio.CancelledError:
|
||
logger.info("signal_worker_cancelled")
|
||
break
|
||
except Exception as e:
|
||
logger.exception("signal_worker_error", error=str(e))
|
||
# 避免無限快速重試
|
||
await asyncio.sleep(1.0)
|
||
|
||
async def _process_signal(self, message_id: str, data: dict[str, Any]) -> None:
|
||
"""
|
||
處理單一訊號
|
||
|
||
Phase 6.3 核心邏輯:
|
||
1. 訊號去重 (fingerprint)
|
||
2. 訊號聚合 (30分鐘時間窗口 + 服務關聯)
|
||
3. Incident 建立/更新 (聚合到同一 Incident)
|
||
4. GraphRAG 爆炸半徑分析
|
||
5. 雙層持久化 (Redis + PostgreSQL)
|
||
"""
|
||
redis_client = get_redis()
|
||
|
||
try:
|
||
logger.info(
|
||
"signal_received",
|
||
message_id=message_id,
|
||
source=data.get("source", "unknown"),
|
||
alert_name=data.get("alert_name", "unknown"),
|
||
severity=data.get("severity", "unknown"),
|
||
namespace=data.get("namespace", "default"),
|
||
target=data.get("target", "unknown"),
|
||
)
|
||
|
||
# Phase 6.3: 使用 IncidentEngine 處理訊號
|
||
# - 自動聚合相關告警到同一 Incident
|
||
# - GraphRAG 分析爆炸半徑
|
||
# - 雙層持久化
|
||
engine = get_incident_engine()
|
||
incident = await engine.process_signal(data)
|
||
|
||
if incident:
|
||
logger.info(
|
||
"signal_processed_by_engine",
|
||
message_id=message_id,
|
||
incident_id=incident.incident_id,
|
||
severity=incident.severity.value,
|
||
signal_count=len(incident.signals),
|
||
affected_services=incident.affected_services,
|
||
persisted_to_pg=incident.persisted_to_pg,
|
||
)
|
||
else:
|
||
logger.warning(
|
||
"signal_processing_failed",
|
||
message_id=message_id,
|
||
signal_data=data,
|
||
)
|
||
|
||
# ACK: 確認訊息已處理
|
||
await redis_client.xack(STREAM_KEY, CONSUMER_GROUP, message_id)
|
||
|
||
logger.debug("signal_acked", message_id=message_id)
|
||
|
||
except Exception as e:
|
||
logger.exception(
|
||
"signal_process_error",
|
||
message_id=message_id,
|
||
error=str(e),
|
||
)
|
||
# 不 ACK,訊息會留在 Pending List
|
||
# Phase 6.3 將實作 Pending List 清理機制
|
||
|
||
|
||
# =============================================================================
|
||
# Singleton
|
||
# =============================================================================
|
||
|
||
_signal_worker: SignalWorker | None = None
|
||
|
||
|
||
async def init_signal_worker() -> SignalWorker:
|
||
"""
|
||
初始化並啟動 Signal Worker
|
||
|
||
統帥鐵律: 在 Lifespan 啟動時調用
|
||
"""
|
||
global _signal_worker
|
||
|
||
if _signal_worker is not None:
|
||
return _signal_worker
|
||
|
||
_signal_worker = SignalWorker()
|
||
await _signal_worker.start()
|
||
return _signal_worker
|
||
|
||
|
||
async def close_signal_worker() -> None:
|
||
"""
|
||
關閉 Signal Worker
|
||
|
||
統帥鐵律: 在 Lifespan 關閉時調用
|
||
"""
|
||
global _signal_worker
|
||
|
||
if _signal_worker is not None:
|
||
await _signal_worker.stop()
|
||
_signal_worker = None
|
||
|
||
|
||
def get_signal_worker() -> SignalWorker:
|
||
"""
|
||
取得 Signal Worker 實例
|
||
|
||
Raises:
|
||
RuntimeError: 若 Worker 未初始化
|
||
"""
|
||
if _signal_worker is None:
|
||
raise RuntimeError(
|
||
"Signal worker not initialized. Call init_signal_worker() first."
|
||
)
|
||
return _signal_worker
|