287 lines
8.9 KiB
Python
287 lines
8.9 KiB
Python
"""
|
||
告警聚合引擎 (Alert Grouping Engine)
|
||
=====================================
|
||
ADR-076: 告警風暴防禦 — 滑動視窗聚合
|
||
建立: 2026-04-14 (台北時區) Claude Haiku 4.5
|
||
|
||
目標:
|
||
- 防止告警風暴:同一 namespace/alertname 在 5 分鐘內爆出多個告警 → 聚合為 Parent Alert
|
||
- 節省 LLM token 費用
|
||
- 避免 Telegram 被洗版
|
||
|
||
設計原則:
|
||
- Redis Sorted Set 滑動視窗(同 anomaly_counter.py ADR-037 模式)
|
||
- 遵循 leWOOOgo 積木化鐵律
|
||
- 只用 Redis,不直接存取 DB
|
||
- Graceful Degradation:Redis 失敗不阻斷主流程
|
||
- 統帥設定 THRESHOLD=3(5 分鐘內 3 個以上才聚合)
|
||
|
||
Redis Key 設計:
|
||
- alert_group:{group_key}:count — Sorted Set (timestamp → timestamp)
|
||
- alert_group:{group_key}:meta — Hash (parent_fingerprint, first_seen, count)
|
||
TTL: 10 分鐘(略長於 5 分鐘視窗)
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import time
|
||
from dataclasses import dataclass
|
||
from typing import TYPE_CHECKING
|
||
|
||
import structlog
|
||
|
||
if TYPE_CHECKING:
|
||
import redis.asyncio as redis
|
||
|
||
# Module-level structlog logger, named after this module.
logger = structlog.get_logger(__name__)
|
||
|
||
|
||
def _decode_redis_member(value: object, fallback: str) -> str:
|
||
"""Redis client 可能回 bytes 或 str;統一成 str 供 DB / log 使用。"""
|
||
if isinstance(value, bytes):
|
||
return value.decode("utf-8", errors="replace")
|
||
if isinstance(value, str):
|
||
return value
|
||
if value is None:
|
||
return fallback
|
||
return str(value)
|
||
|
||
|
||
# =============================================================================
|
||
# Data Types
|
||
# =============================================================================
|
||
|
||
|
||
@dataclass
class GroupingResult:
    """Outcome of evaluating one alert against its grouping window."""

    # True when the alert was grouped, i.e. it is a child alert and the
    # caller should skip the LLM for it.
    is_grouped: bool

    # Key identifying the group the alert belongs to.
    group_key: str

    # Number of alerts currently inside the window.
    count: int

    # Fingerprint of the parent alert (the first one that arrived).
    parent_fingerprint: str | None

    # True when this alert is the parent (the first one in, which anchors
    # the group).
    is_parent: bool
|
||
|
||
|
||
# =============================================================================
|
||
# AlertGroupingService
|
||
# =============================================================================
|
||
|
||
|
||
class AlertGroupingService:
    """Sliding-window alert grouping engine backed by Redis.

    Purpose (from the 2026-04-14 directive): defend against alert storms by
    collapsing a burst of same-group alerts within 5 minutes into a single
    Parent Alert, which saves LLM token cost and keeps Telegram from being
    flooded.

    Each group key owns one sorted set, ``PREFIX_WINDOW + group_key``, whose
    members are alert fingerprints scored by arrival time. One evaluation
    queues these commands in a single pipeline:

    - ZREMRANGEBYSCORE <window_key> -inf <cutoff>
    - ZADD <window_key> <now> <fingerprint>
    - ZCOUNT <window_key> <cutoff> +inf
    - ZRANGE <window_key> 0 0
    - EXPIRE <window_key> TTL_SECONDS
    """

    # Length of the sliding window: 5 minutes.
    WINDOW_SECONDS: int = 300

    # Grouping threshold: keep the first alert as the main card and start
    # collapsing from the second alert of the same group onward.
    # 2026-05-07 Codex — Telegram group noise control: the old value 3 let the
    # first two alerts of a kind still reach AI/Telegram.
    GROUP_THRESHOLD: int = 2

    # Redis key prefixes.
    PREFIX_WINDOW = "alert_group:window:"
    PREFIX_META = "alert_group:meta:"

    # Key TTL: the window plus a 5-minute buffer.
    TTL_SECONDS: int = 600

    def __init__(self, redis_client: redis.Redis) -> None:
        """Store the Redis client used for every window operation.

        Args:
            redis_client: Async Redis client (``redis.asyncio``).
        """
        self.redis = redis_client

    @staticmethod
    def build_group_key(alertname: str, namespace: str) -> str:
        """Build the grouping key from an alert name and a namespace.

        A single trailing ``-``/``_`` separated suffix is stripped from
        ``alertname`` when it is either all digits or a run of 8 or more
        lowercase hex characters (UUID-like); the remainder is joined to
        ``namespace`` with ``:``. Only the last such suffix is removed.

        Examples:
            ``PodCrashLoopBackOff-pod-1`` + ``awoooi-prod``
            -> ``PodCrashLoopBackOff-pod:awoooi-prod``
            ``HighCPU_42`` + ``ns`` -> ``HighCPU:ns``

        Args:
            alertname: Alert name.
            namespace: K8s namespace.

        Returns:
            The group key string ``"<prefix>:<namespace>"``.
        """
        import re

        # Both alternatives are anchored at the end of the string, so at most
        # one suffix is cut off and index 0 is everything before it.
        prefix = re.split(r"[-_]\d+$|[-_][0-9a-f]{8,}$", alertname, maxsplit=1)[0]
        return f"{prefix}:{namespace}"

    async def evaluate(
        self,
        alertname: str,
        namespace: str,
        fingerprint: str,
    ) -> GroupingResult:
        """Decide whether an incoming alert should be grouped.

        Steps:
            1. Derive the group key.
            2. Record the alert in the sliding window.
            3. Count the alerts inside the window.
            4. If the count is >= ``GROUP_THRESHOLD`` and the alert is not the
               parent, report it as a child (``is_grouped=True``).
            5. The oldest alert in the window (or the only one) is the parent.

        Graceful degradation: any exception raised while talking to Redis is
        logged (with traceback) and turned into an ungrouped result, so the
        main flow is never blocked.

        Args:
            alertname: Alert name.
            namespace: K8s namespace.
            fingerprint: Fingerprint of this alert.

        Returns:
            The ``GroupingResult`` for this alert. On failure it has
            ``is_grouped=False``, ``count=0``, ``parent_fingerprint=None`` and
            ``is_parent=True``.
        """
        group_key = self.build_group_key(alertname, namespace)

        try:
            return await self._do_evaluate(group_key, fingerprint)
        except Exception:
            # exc_info=True keeps the underlying error visible; without it the
            # degradation path hides why Redis failed.
            logger.warning(
                "alert_grouping_redis_error",
                group_key=group_key,
                alertname=alertname,
                namespace=namespace,
                exc_info=True,
            )
            # Graceful degradation: treat the alert as a normal, ungrouped one.
            return GroupingResult(
                is_grouped=False,
                group_key=group_key,
                count=0,
                parent_fingerprint=None,
                is_parent=True,
            )

    async def _do_evaluate(self, group_key: str, fingerprint: str) -> GroupingResult:
        """Run the core grouping logic for one alert (internal).

        All Redis commands are queued on one pipeline opened with
        ``transaction=True`` and sent with a single ``execute()``.

        Args:
            group_key: Key produced by ``build_group_key``.
            fingerprint: Fingerprint of the alert being evaluated.

        Returns:
            The ``GroupingResult`` computed from the window contents.

        Raises:
            Exception: Whatever the Redis client raises; ``evaluate`` is the
                caller that handles it.
        """
        now_ts = time.time()
        cutoff_ts = now_ts - self.WINDOW_SECONDS

        window_key = f"{self.PREFIX_WINDOW}{group_key}"

        async with self.redis.pipeline(transaction=True) as pipe:
            # 1. Drop entries that fell out of the window.
            pipe.zremrangebyscore(window_key, "-inf", cutoff_ts)
            # 2. Record this alert (score=timestamp, member=fingerprint).
            # NOTE(review): ZADD overwrites the score of an existing member,
            # so the window counts distinct fingerprints, and a fingerprint
            # that fires again moves to the newest position. If that was the
            # parent, the parent role shifts to the next-oldest member —
            # confirm this is intended.
            pipe.zadd(window_key, {fingerprint: now_ts})
            # 3. Count the alerts inside the window.
            pipe.zcount(window_key, cutoff_ts, "+inf")
            # 4. Fetch the oldest member (the parent alert).
            pipe.zrange(window_key, 0, 0)
            # 5. Refresh the TTL.
            pipe.expire(window_key, self.TTL_SECONDS)
            results = await pipe.execute()

        # results is positional: index 2 is the ZCOUNT reply, index 3 the
        # ZRANGE reply. int() mirrors get_group_count.
        count = int(results[2])
        first_members = results[3]
        parent_fingerprint = _decode_redis_member(
            first_members[0] if first_members else None,
            fallback=fingerprint,
        )

        # Parent = the oldest member of the window, or the only one.
        is_parent = parent_fingerprint == fingerprint or count == 1

        # Grouped = threshold reached and this alert is not the parent.
        is_grouped = count >= self.GROUP_THRESHOLD and not is_parent

        if is_grouped:
            logger.info(
                "alert_grouped_as_child",
                group_key=group_key,
                fingerprint=fingerprint,
                parent_fingerprint=parent_fingerprint,
                count=count,
                threshold=self.GROUP_THRESHOLD,
            )
        elif count >= self.GROUP_THRESHOLD:
            # Threshold reached but not grouped, which can only mean this
            # alert is the parent of the group.
            logger.info(
                "alert_grouping_parent_promoted",
                group_key=group_key,
                fingerprint=fingerprint,
                count=count,
            )

        return GroupingResult(
            is_grouped=is_grouped,
            group_key=group_key,
            count=count,
            parent_fingerprint=parent_fingerprint,
            is_parent=is_parent,
        )

    async def get_group_count(self, alertname: str, namespace: str) -> int:
        """Return how many alerts the group currently has inside the window.

        Read-only: issues a single ZCOUNT and does not prune or extend the
        window.

        Args:
            alertname: Alert name.
            namespace: K8s namespace.

        Returns:
            Number of alerts in the window; 0 when the Redis call fails.
        """
        group_key = self.build_group_key(alertname, namespace)
        window_key = f"{self.PREFIX_WINDOW}{group_key}"

        try:
            now_ts = time.time()
            cutoff_ts = now_ts - self.WINDOW_SECONDS
            count = await self.redis.zcount(window_key, cutoff_ts, "+inf")
            return int(count)
        except Exception:
            # Best-effort query: report the failure with traceback, answer 0.
            logger.warning(
                "alert_grouping_count_error",
                group_key=group_key,
                exc_info=True,
            )
            return 0
|
||
|
||
|
||
# =============================================================================
|
||
# Factory Function
|
||
# =============================================================================
|
||
|
||
|
||
# Lazily created, module-wide AlertGroupingService; None until first requested.
_instance: AlertGroupingService | None = None


def get_alert_grouping_service() -> AlertGroupingService:
    """Return the shared AlertGroupingService, creating it on first use.

    The Redis client is fetched through ``get_redis()`` on the first call, so
    this must only be called after Redis has been initialised.

    Returns:
        The single AlertGroupingService instance of this module.
    """
    global _instance
    if _instance is not None:
        return _instance

    # Imported at call time rather than at module top; presumably to defer
    # the Redis dependency until it is ready — TODO confirm.
    from src.core.redis_client import get_redis

    _instance = AlertGroupingService(get_redis())
    return _instance
|