Files
awoooi/apps/api/src/workers/signal_worker.py
OG T 196d269b92 feat: add all application source code
- apps/api: FastAPI backend with Dockerfile
- apps/web: Next.js frontend with Dockerfile
- apps/sensor: Signal collection agent
- packages: shared packages

Co-Authored-By: Claude <noreply@anthropic.com>
2026-03-22 18:57:44 +08:00

295 lines
8.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Signal Worker - Redis Streams Consumer
=======================================
Phase 6.1: Event Bus Implementation
功能:
- XREADGROUP 消費 stream:awoooi_signals
- Signal → Incident 聚合邏輯 (Phase 6.3 實作)
- 失敗重試 + ACK 機制
- Graceful Shutdown
Redis Streams 概念:
- Stream: stream:awoooi_signals (訊息佇列)
- Consumer Group: awoooi_workers (消費者群組)
- Consumer: worker_{hostname} (單一消費者)
統帥鐵律:
- 使用 XREADGROUP 確保訊息只被處理一次
- 處理完成後必須 XACK
- 失敗訊息進入 Pending List需定期清理
"""
import asyncio
import socket
from typing import Any
import structlog
from src.core.redis_client import get_redis
from src.services.incident_engine import get_incident_engine
logger = structlog.get_logger(__name__)
# =============================================================================
# Constants
# =============================================================================
STREAM_KEY = "stream:awoooi_signals"
CONSUMER_GROUP = "awoooi_workers"
CONSUMER_NAME = f"worker_{socket.gethostname()}"
# 每次讀取的訊息數量
BATCH_SIZE = 10
# 讀取超時 (毫秒) - 0 表示阻塞等待
BLOCK_MS = 5000
# 失敗重試上限
MAX_RETRIES = 3
# =============================================================================
# Signal Worker
# =============================================================================
class SignalWorker:
"""
Redis Streams 訊號消費者
職責:
1. 從 stream:awoooi_signals 讀取訊號
2. 將訊號聚合為 Incident (Phase 6.3)
3. 更新 Working Memory (Redis)
4. 觸發決策引擎 (Phase 6.4)
使用方式:
worker = SignalWorker()
await worker.start() # 啟動消費循環
await worker.stop() # 優雅關閉
"""
def __init__(self) -> None:
self._running = False
self._task: asyncio.Task | None = None
async def _ensure_consumer_group(self) -> None:
"""
確保 Consumer Group 存在
XGROUP CREATE 如果 Group 已存在會報錯,
因此使用 MKSTREAM 選項並忽略 BUSYGROUP 錯誤。
"""
redis_client = get_redis()
try:
# MKSTREAM: 如果 Stream 不存在則建立
await redis_client.xgroup_create(
STREAM_KEY,
CONSUMER_GROUP,
id="0", # 從頭開始消費
mkstream=True,
)
logger.info(
"consumer_group_created",
stream=STREAM_KEY,
group=CONSUMER_GROUP,
)
except Exception as e:
# BUSYGROUP: Group 已存在,忽略
if "BUSYGROUP" in str(e):
logger.debug("consumer_group_exists", group=CONSUMER_GROUP)
else:
raise
async def start(self) -> None:
"""
啟動消費循環
在背景執行,不阻塞主執行緒。
"""
if self._running:
logger.warning("signal_worker_already_running")
return
await self._ensure_consumer_group()
self._running = True
self._task = asyncio.create_task(self._consume_loop())
logger.info(
"signal_worker_started",
stream=STREAM_KEY,
group=CONSUMER_GROUP,
consumer=CONSUMER_NAME,
)
async def stop(self) -> None:
"""
優雅關閉
等待當前處理完成後停止。
"""
if not self._running:
return
self._running = False
if self._task:
try:
# 給予 5 秒完成當前處理
await asyncio.wait_for(self._task, timeout=5.0)
except asyncio.TimeoutError:
logger.warning("signal_worker_stop_timeout")
self._task.cancel()
except asyncio.CancelledError:
pass
logger.info("signal_worker_stopped")
async def _consume_loop(self) -> None:
"""
主消費循環
XREADGROUP 阻塞等待新訊息,處理後 XACK。
"""
redis_client = get_redis()
while self._running:
try:
# XREADGROUP: 從 Consumer Group 讀取訊息
# >: 只讀取新訊息 (不包含 Pending List)
messages = await redis_client.xreadgroup(
groupname=CONSUMER_GROUP,
consumername=CONSUMER_NAME,
streams={STREAM_KEY: ">"},
count=BATCH_SIZE,
block=BLOCK_MS,
)
if not messages:
# 超時,沒有新訊息
continue
# messages 格式: [[stream_name, [(id, data), ...]]]
for stream_name, entries in messages:
for message_id, data in entries:
await self._process_signal(message_id, data)
except asyncio.CancelledError:
logger.info("signal_worker_cancelled")
break
except Exception as e:
logger.exception("signal_worker_error", error=str(e))
# 避免無限快速重試
await asyncio.sleep(1.0)
async def _process_signal(self, message_id: str, data: dict[str, Any]) -> None:
"""
處理單一訊號
Phase 6.3 核心邏輯:
1. 訊號去重 (fingerprint)
2. 訊號聚合 (30分鐘時間窗口 + 服務關聯)
3. Incident 建立/更新 (聚合到同一 Incident)
4. GraphRAG 爆炸半徑分析
5. 雙層持久化 (Redis + PostgreSQL)
"""
redis_client = get_redis()
try:
logger.info(
"signal_received",
message_id=message_id,
source=data.get("source", "unknown"),
alert_name=data.get("alert_name", "unknown"),
severity=data.get("severity", "unknown"),
namespace=data.get("namespace", "default"),
target=data.get("target", "unknown"),
)
# Phase 6.3: 使用 IncidentEngine 處理訊號
# - 自動聚合相關告警到同一 Incident
# - GraphRAG 分析爆炸半徑
# - 雙層持久化
engine = get_incident_engine()
incident = await engine.process_signal(data)
if incident:
logger.info(
"signal_processed_by_engine",
message_id=message_id,
incident_id=incident.incident_id,
severity=incident.severity.value,
signal_count=len(incident.signals),
affected_services=incident.affected_services,
persisted_to_pg=incident.persisted_to_pg,
)
else:
logger.warning(
"signal_processing_failed",
message_id=message_id,
signal_data=data,
)
# ACK: 確認訊息已處理
await redis_client.xack(STREAM_KEY, CONSUMER_GROUP, message_id)
logger.debug("signal_acked", message_id=message_id)
except Exception as e:
logger.exception(
"signal_process_error",
message_id=message_id,
error=str(e),
)
# 不 ACK訊息會留在 Pending List
# Phase 6.3 將實作 Pending List 清理機制
# =============================================================================
# Singleton
# =============================================================================
_signal_worker: SignalWorker | None = None
async def init_signal_worker() -> SignalWorker:
"""
初始化並啟動 Signal Worker
統帥鐵律: 在 Lifespan 啟動時調用
"""
global _signal_worker
if _signal_worker is not None:
return _signal_worker
_signal_worker = SignalWorker()
await _signal_worker.start()
return _signal_worker
async def close_signal_worker() -> None:
"""
關閉 Signal Worker
統帥鐵律: 在 Lifespan 關閉時調用
"""
global _signal_worker
if _signal_worker is not None:
await _signal_worker.stop()
_signal_worker = None
def get_signal_worker() -> SignalWorker:
"""
取得 Signal Worker 實例
Raises:
RuntimeError: 若 Worker 未初始化
"""
if _signal_worker is None:
raise RuntimeError(
"Signal worker not initialized. Call init_signal_worker() first."
)
return _signal_worker