feat(api): Phase 15.2 Redis Trace Context 傳遞
實現 Redis Streams 跨服務追蹤零斷鏈:
- telemetry.py: 新增 get_trace_context() + restore_trace_context()
- webhooks.py: Producer 注入 _trace_id, _span_id 到 Redis
- signal_worker.py: Consumer 還原 Trace Context 建立子 Span

架構: API → Redis Streams → Worker 完整追蹤鏈
格式: W3C Trace Context (traceparent)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -2,18 +2,25 @@
|
||||
Signal Worker - Redis Streams Consumer
|
||||
=======================================
|
||||
Phase 6.1: Event Bus Implementation
|
||||
Phase 15.2: Redis Trace Context Propagation (2026-03-26)
|
||||
|
||||
功能:
|
||||
- XREADGROUP 消費 stream:awoooi_signals
|
||||
- Signal → Incident 聚合邏輯 (Phase 6.3 實作)
|
||||
- 失敗重試 + ACK 機制
|
||||
- Graceful Shutdown
|
||||
- **Phase 15.2**: Trace Context 還原 (零斷鏈觀測)
|
||||
|
||||
Redis Streams 概念:
|
||||
- Stream: stream:awoooi_signals (訊息佇列)
|
||||
- Consumer Group: awoooi_workers (消費者群組)
|
||||
- Consumer: worker_{hostname} (單一消費者)
|
||||
|
||||
Trace Context 傳遞 (Phase 15.2):
|
||||
- Producer (webhooks.py) 寫入 _trace_id, _span_id 到 Redis
|
||||
- Consumer (此檔案) 還原 Context,建立子 Span
|
||||
- 實現 API → Redis → Worker 完整追蹤鏈
|
||||
|
||||
統帥鐵律:
|
||||
- 使用 XREADGROUP 確保每則訊息只派送給群組內的一個 Consumer (at-least-once:未 ACK 的訊息會留在 Pending List,可能被重新處理)
|
||||
- 處理完成後必須 XACK
|
||||
@@ -27,6 +34,7 @@ from typing import Any
|
||||
import structlog
|
||||
|
||||
from src.core.redis_client import get_redis, get_worker_redis
|
||||
from src.core.telemetry import restore_trace_context
|
||||
from src.services.incident_engine import get_incident_engine
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
@@ -195,57 +203,96 @@ class SignalWorker:
|
||||
3. Incident 建立/更新 (聚合到同一 Incident)
|
||||
4. GraphRAG 爆炸半徑分析
|
||||
5. 雙層持久化 (Redis + PostgreSQL)
|
||||
|
||||
Phase 15.2: Trace Context 還原
|
||||
- 從 Redis 訊息提取 _trace_id, _span_id
|
||||
- 建立子 Span 繼承原始 Trace,實現零斷鏈觀測
|
||||
"""
|
||||
redis_client = get_redis()
|
||||
|
||||
try:
|
||||
logger.info(
|
||||
"signal_received",
|
||||
message_id=message_id,
|
||||
source=data.get("source", "unknown"),
|
||||
alert_name=data.get("alert_name", "unknown"),
|
||||
severity=data.get("severity", "unknown"),
|
||||
namespace=data.get("namespace", "default"),
|
||||
target=data.get("target", "unknown"),
|
||||
)
|
||||
# =================================================================
|
||||
# Phase 15.2: 提取 Trace Context (從 Producer 注入的欄位)
|
||||
# =================================================================
|
||||
trace_context = None
|
||||
trace_id = data.pop("_trace_id", None) # pop 避免污染 signal data
|
||||
span_id = data.pop("_span_id", None)
|
||||
|
||||
# Phase 6.3: 使用 IncidentEngine 處理訊號
|
||||
# - 自動聚合相關告警到同一 Incident
|
||||
# - GraphRAG 分析爆炸半徑
|
||||
# - 雙層持久化
|
||||
engine = get_incident_engine()
|
||||
incident = await engine.process_signal(data)
|
||||
if trace_id:
|
||||
trace_context = {
|
||||
"trace_id": trace_id,
|
||||
"span_id": span_id or "",
|
||||
}
|
||||
|
||||
# =================================================================
|
||||
# 在還原的 Trace Context 中處理訊號
|
||||
# =================================================================
|
||||
with restore_trace_context(trace_context) as span:
|
||||
try:
|
||||
# 設置 Span 屬性 (用於 SigNoz 搜尋)
|
||||
span.set_attribute("messaging.system", "redis_streams")
|
||||
span.set_attribute("messaging.destination", STREAM_KEY)
|
||||
span.set_attribute("messaging.message_id", message_id)
|
||||
span.set_attribute("signal.source", data.get("source", "unknown"))
|
||||
span.set_attribute("signal.alert_name", data.get("alert_name", "unknown"))
|
||||
span.set_attribute("signal.severity", data.get("severity", "unknown"))
|
||||
|
||||
if incident:
|
||||
logger.info(
|
||||
"signal_processed_by_engine",
|
||||
"signal_received",
|
||||
message_id=message_id,
|
||||
incident_id=incident.incident_id,
|
||||
severity=incident.severity.value,
|
||||
signal_count=len(incident.signals),
|
||||
affected_services=incident.affected_services,
|
||||
persisted_to_pg=incident.persisted_to_pg,
|
||||
)
|
||||
else:
|
||||
logger.warning(
|
||||
"signal_processing_failed",
|
||||
message_id=message_id,
|
||||
signal_data=data,
|
||||
source=data.get("source", "unknown"),
|
||||
alert_name=data.get("alert_name", "unknown"),
|
||||
severity=data.get("severity", "unknown"),
|
||||
namespace=data.get("namespace", "default"),
|
||||
target=data.get("target", "unknown"),
|
||||
trace_restored=trace_context is not None,
|
||||
)
|
||||
|
||||
# ACK: 確認訊息已處理
|
||||
await redis_client.xack(STREAM_KEY, CONSUMER_GROUP, message_id)
|
||||
# Phase 6.3: 使用 IncidentEngine 處理訊號
|
||||
# - 自動聚合相關告警到同一 Incident
|
||||
# - GraphRAG 分析爆炸半徑
|
||||
# - 雙層持久化
|
||||
engine = get_incident_engine()
|
||||
incident = await engine.process_signal(data)
|
||||
|
||||
logger.debug("signal_acked", message_id=message_id)
|
||||
if incident:
|
||||
# 記錄 Incident 到 Span
|
||||
span.set_attribute("incident.id", incident.incident_id)
|
||||
span.set_attribute("incident.severity", incident.severity.value)
|
||||
span.set_attribute("incident.signal_count", len(incident.signals))
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(
|
||||
"signal_process_error",
|
||||
message_id=message_id,
|
||||
error=str(e),
|
||||
)
|
||||
# 不 ACK,訊息會留在 Pending List
|
||||
# Phase 6.3 將實作 Pending List 清理機制
|
||||
logger.info(
|
||||
"signal_processed_by_engine",
|
||||
message_id=message_id,
|
||||
incident_id=incident.incident_id,
|
||||
severity=incident.severity.value,
|
||||
signal_count=len(incident.signals),
|
||||
affected_services=incident.affected_services,
|
||||
persisted_to_pg=incident.persisted_to_pg,
|
||||
)
|
||||
else:
|
||||
span.set_attribute("signal.processing_failed", True)
|
||||
logger.warning(
|
||||
"signal_processing_failed",
|
||||
message_id=message_id,
|
||||
signal_data=data,
|
||||
)
|
||||
|
||||
# ACK: 確認訊息已處理
|
||||
await redis_client.xack(STREAM_KEY, CONSUMER_GROUP, message_id)
|
||||
span.set_attribute("messaging.acked", True)
|
||||
|
||||
logger.debug("signal_acked", message_id=message_id)
|
||||
|
||||
except Exception as e:
|
||||
span.set_attribute("error", True)
|
||||
span.set_attribute("error.message", str(e))
|
||||
logger.exception(
|
||||
"signal_process_error",
|
||||
message_id=message_id,
|
||||
error=str(e),
|
||||
)
|
||||
# 不 ACK,訊息會留在 Pending List
|
||||
# Phase 6.3 將實作 Pending List 清理機制
|
||||
|
||||
|
||||
# =============================================================================
|
||||
|
||||
Reference in New Issue
Block a user