Files
awoooi/apps/api/src/services/alert_grouping_service.py
Your Name 6ac61ab6d7
All checks were successful
Code Review / ai-code-review (push) Successful in 11s
CD Pipeline / tests (push) Successful in 1m2s
CD Pipeline / build-and-deploy (push) Successful in 3m39s
CD Pipeline / post-deploy-checks (push) Successful in 1m18s
fix(telegram): digest grouped alert storms
2026-05-07 01:51:31 +08:00

287 lines
8.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
告警聚合引擎 (Alert Grouping Engine)
=====================================
ADR-076: 告警風暴防禦 — 滑動視窗聚合
建立: 2026-04-14 (台北時區) Claude Haiku 4.5
目標:
- 防止告警風暴:同一 namespace/alertname 在 5 分鐘內爆出多個告警 → 聚合為 Parent Alert
- 節省 LLM token 費用
- 避免 Telegram 被洗版
設計原則:
- Redis Sorted Set 滑動視窗(同 anomaly_counter.py ADR-037 模式)
- 遵循 leWOOOgo 積木化鐵律
- 只用 Redis,不直接存取 DB
- Graceful Degradation:Redis 失敗不阻斷主流程
- 統帥設定 THRESHOLD=3(5 分鐘內 3 個以上才聚合);2026-05-07 起 GROUP_THRESHOLD 已調整為 2
Redis Key 設計:
- alert_group:window:{group_key} — Sorted Set (member=fingerprint, score=timestamp)
- alert_group:meta:{group_key} — Hash (parent_fingerprint, first_seen, count);本檔案僅定義前綴 PREFIX_META,未在此讀寫
TTL: 10 分鐘(略長於 5 分鐘視窗)
"""
from __future__ import annotations
import time
from dataclasses import dataclass
from typing import TYPE_CHECKING
import structlog
if TYPE_CHECKING:
import redis.asyncio as redis
logger = structlog.get_logger(__name__)
def _decode_redis_member(value: object, fallback: str) -> str:
    """Normalise a Redis reply member to ``str`` for DB / log use.

    Args:
        value: Member as returned by the Redis client (may be ``bytes``,
            ``str``, ``None`` or another object).
        fallback: Text returned when ``value`` is ``None``.

    Returns:
        ``fallback`` for ``None``; UTF-8 decoded text for ``bytes`` (invalid
        sequences are replaced, never raised); the value itself for ``str``;
        ``str(value)`` for anything else.
    """
    if value is None:
        return fallback
    if isinstance(value, bytes):
        # errors="replace" keeps undecodable bytes from raising.
        return value.decode("utf-8", errors="replace")
    return value if isinstance(value, str) else str(value)
# =============================================================================
# Data Types
# =============================================================================
@dataclass
class GroupingResult:
    """Outcome of evaluating one alert against its grouping window."""

    # True when the alert was folded into an existing group, i.e. it is a
    # child alert and the LLM step should be skipped.
    is_grouped: bool
    # Grouping key the alert was evaluated under.
    group_key: str
    # Number of alerts currently inside the window.
    count: int
    # Fingerprint of the parent alert (the first alert that arrived).
    parent_fingerprint: str | None
    # True when this alert is the parent (the first one, which triggers
    # the grouping).
    is_parent: bool
# =============================================================================
# AlertGroupingService
# =============================================================================
class AlertGroupingService:
    """Sliding-window alert grouping engine backed by Redis.

    Purpose (operator directive, 2026-04-14): defend against alert storms by
    collapsing alerts of the same group that arrive within a 5-minute window
    under a single parent alert, which saves LLM token cost and keeps
    Telegram from being flooded.

    Sliding-window commands issued per evaluation (same pattern as
    anomaly_counter.py, ADR-037). The sorted-set member is the alert
    fingerprint and the score is its arrival timestamp:
    - ZREMRANGEBYSCORE alert_group:window:{key} -inf {cutoff}
    - ZADD             alert_group:window:{key} {ts} {fingerprint}
    - ZCOUNT           alert_group:window:{key} {cutoff} +inf
    - ZRANGE           alert_group:window:{key} 0 0
    - EXPIRE           alert_group:window:{key} {TTL_SECONDS}
    """

    # Sliding-window length: 5 minutes.
    WINDOW_SECONDS: int = 300
    # Grouping threshold: the first alert keeps its own card; from the second
    # alert of the same group onward, alerts are folded into it.
    # 2026-05-07 Codex — Telegram group noise control: with the old value of 3
    # the first two alerts of a kind still reached the AI/Telegram path.
    GROUP_THRESHOLD: int = 2
    # Redis key prefixes.
    PREFIX_WINDOW = "alert_group:window:"
    # NOTE(review): PREFIX_META is not referenced by any method of this class.
    PREFIX_META = "alert_group:meta:"
    # Key TTL: window plus a 5-minute buffer.
    TTL_SECONDS: int = 600

    def __init__(self, redis_client: redis.Redis) -> None:
        """Store the Redis client used for every window operation.

        Args:
            redis_client: Async Redis client; it must provide ``pipeline()``
                and ``zcount()``, which are the only calls made on it here.
        """
        self.redis = redis_client

    @staticmethod
    def build_group_key(alertname: str, namespace: str) -> str:
        """Build the grouping key from an alert name and a namespace.

        One trailing suffix is removed from ``alertname`` when it is a ``-``
        or ``_`` followed, up to the end of the string, by either digits only
        or at least eight lowercase hex characters. Only that last segment is
        stripped; earlier segments are kept.

        Examples:
            PodCrashLoopBackOff-1 + awoooi-prod
                -> PodCrashLoopBackOff:awoooi-prod
            PodCrashLoopBackOff-pod-1 + awoooi-prod
                -> PodCrashLoopBackOff-pod:awoooi-prod
            HighCPU_deadbeef12 + ns
                -> HighCPU:ns

        Args:
            alertname: Alert name.
            namespace: K8s namespace.

        Returns:
            The grouping key, formatted as ``{prefix}:{namespace}``.
        """
        import re

        # The pattern is anchored with `$`, so it can match at most once;
        # element 0 of the split is everything before the stripped suffix
        # (or the whole name when nothing matches).
        prefix = re.split(r"[-_]\d+$|[-_][0-9a-f]{8,}$", alertname, maxsplit=1)[0]
        return f"{prefix}:{namespace}"

    async def evaluate(
        self,
        alertname: str,
        namespace: str,
        fingerprint: str,
    ) -> GroupingResult:
        """Decide whether an alert should be folded into an existing group.

        Steps:
        1. Compute the group key.
        2. Add this alert to the group's sliding window.
        3. Count the alerts inside the window.
        4. If the count reaches GROUP_THRESHOLD and this alert is not the
           parent, mark it as a child (``is_grouped=True``).
        5. The oldest alert in the window (or the only one) is the parent.

        Graceful degradation: any exception raised while evaluating is logged
        and swallowed, and an ungrouped result is returned so the main alert
        flow is never blocked.

        Args:
            alertname: Alert name.
            namespace: K8s namespace.
            fingerprint: Fingerprint of this alert.

        Returns:
            The GroupingResult for this alert. On failure:
            ``is_grouped=False``, ``count=0``, ``parent_fingerprint=None``,
            ``is_parent=True``.
        """
        group_key = self.build_group_key(alertname, namespace)
        try:
            return await self._do_evaluate(group_key, fingerprint)
        except Exception as exc:
            # Best-effort boundary: keep the failure reason in the log so a
            # Redis outage can be told apart from a programming error.
            logger.warning(
                "alert_grouping_redis_error",
                group_key=group_key,
                alertname=alertname,
                namespace=namespace,
                error=str(exc),
                error_type=type(exc).__name__,
            )
            # Graceful degradation: a Redis failure must not block the flow.
            return GroupingResult(
                is_grouped=False,
                group_key=group_key,
                count=0,
                parent_fingerprint=None,
                is_parent=True,
            )

    async def _do_evaluate(self, group_key: str, fingerprint: str) -> GroupingResult:
        """Run the window update and classify the alert (internal).

        All five Redis commands are queued on one pipeline opened with
        ``transaction=True`` and sent with a single ``execute()``.

        NOTE(review): the sorted-set member is the fingerprint, so ``count``
        is the number of distinct fingerprints in the window, and ZADD on an
        existing member moves it to the current timestamp — a re-firing
        parent therefore stops being the oldest member. Confirm this is the
        intended parent-selection behaviour.

        Args:
            group_key: Key produced by ``build_group_key``.
            fingerprint: Fingerprint of the alert being evaluated.

        Returns:
            The GroupingResult describing this alert's role in its group.

        Raises:
            Exception: Whatever the Redis client raises; ``evaluate`` is the
                caller that handles it.
        """
        now_ts = time.time()
        cutoff_ts = now_ts - self.WINDOW_SECONDS
        window_key = f"{self.PREFIX_WINDOW}{group_key}"

        async with self.redis.pipeline(transaction=True) as pipe:
            # 1. Drop entries that have fallen out of the window.
            pipe.zremrangebyscore(window_key, "-inf", cutoff_ts)
            # 2. Record this alert (score=timestamp, member=fingerprint).
            pipe.zadd(window_key, {fingerprint: now_ts})
            # 3. Count the alerts inside the window.
            pipe.zcount(window_key, cutoff_ts, "+inf")
            # 4. Fetch the oldest member, which is treated as the parent.
            pipe.zrange(window_key, 0, 0)
            # 5. Refresh the key TTL.
            pipe.expire(window_key, self.TTL_SECONDS)
            results = await pipe.execute()

        # Indexes follow the queue order above: [2] is ZCOUNT, [3] is ZRANGE.
        count = int(results[2])
        first_members = results[3]
        parent_fingerprint = _decode_redis_member(
            first_members[0] if first_members else None,
            fallback=fingerprint,
        )

        # Parent = this alert is the oldest member, or the only one.
        is_parent = parent_fingerprint == fingerprint or count == 1
        # Grouped = threshold reached and this alert is not the parent.
        is_grouped = count >= self.GROUP_THRESHOLD and not is_parent

        if is_grouped:
            logger.info(
                "alert_grouped_as_child",
                group_key=group_key,
                fingerprint=fingerprint,
                parent_fingerprint=parent_fingerprint,
                count=count,
                threshold=self.GROUP_THRESHOLD,
            )
        elif count >= self.GROUP_THRESHOLD and is_parent:
            # Parent while the threshold is already reached: a new parent is
            # taking over the group.
            logger.info(
                "alert_grouping_parent_promoted",
                group_key=group_key,
                fingerprint=fingerprint,
                count=count,
            )

        return GroupingResult(
            is_grouped=is_grouped,
            group_key=group_key,
            count=count,
            parent_fingerprint=parent_fingerprint,
            is_parent=is_parent,
        )

    async def get_group_count(self, alertname: str, namespace: str) -> int:
        """Return how many alerts the group currently has inside the window.

        This is a read-only ZCOUNT; expired entries are not removed here.

        Args:
            alertname: Alert name.
            namespace: K8s namespace.

        Returns:
            The number of alerts inside the window, or 0 when the Redis call
            fails (the failure is logged, not raised).
        """
        group_key = self.build_group_key(alertname, namespace)
        window_key = f"{self.PREFIX_WINDOW}{group_key}"
        try:
            now_ts = time.time()
            cutoff_ts = now_ts - self.WINDOW_SECONDS
            count = await self.redis.zcount(window_key, cutoff_ts, "+inf")
            return int(count)
        except Exception as exc:
            logger.warning(
                "alert_grouping_count_error",
                group_key=group_key,
                error=str(exc),
                error_type=type(exc).__name__,
            )
            return 0
# =============================================================================
# Factory Function
# =============================================================================
_instance: AlertGroupingService | None = None


def get_alert_grouping_service() -> AlertGroupingService:
    """Return the process-wide AlertGroupingService, creating it on first use.

    Dependency note: call this only after Redis has been initialised, because
    the first call obtains the client through ``get_redis()``.

    Returns:
        The shared AlertGroupingService instance.
    """
    global _instance
    if _instance is not None:
        return _instance

    # Imported here rather than at module top, as in the original code.
    from src.core.redis_client import get_redis

    _instance = AlertGroupingService(get_redis())
    return _instance