diff --git a/apps/api/src/services/alert_grouping_service.py b/apps/api/src/services/alert_grouping_service.py index cd167ce6..fc9beac3 100644 --- a/apps/api/src/services/alert_grouping_service.py +++ b/apps/api/src/services/alert_grouping_service.py @@ -36,6 +36,17 @@ if TYPE_CHECKING: logger = structlog.get_logger(__name__) +def _decode_redis_member(value: object, fallback: str) -> str: + """Redis client 可能回 bytes 或 str;統一成 str 供 DB / log 使用。""" + if isinstance(value, bytes): + return value.decode("utf-8", errors="replace") + if isinstance(value, str): + return value + if value is None: + return fallback + return str(value) + + # ============================================================================= # Data Types # ============================================================================= @@ -189,7 +200,10 @@ class AlertGroupingService: count = results[2] first_members = results[3] - parent_fingerprint = first_members[0] if first_members else fingerprint + parent_fingerprint = _decode_redis_member( + first_members[0] if first_members else None, + fallback=fingerprint, + ) # 是否為父告警(第一個) is_parent = parent_fingerprint == fingerprint or count == 1 diff --git a/apps/api/src/services/channel_hub.py b/apps/api/src/services/channel_hub.py index 07f12122..93ca04db 100644 --- a/apps/api/src/services/channel_hub.py +++ b/apps/api/src/services/channel_hub.py @@ -28,6 +28,7 @@ from __future__ import annotations import asyncio import hashlib +import html import json from datetime import datetime, timezone from typing import Any @@ -167,6 +168,111 @@ def format_grouped_alert_event_content( ) +def format_grouped_alert_digest_text( + *, + alertname: str, + severity: str, + namespace: str, + target_resource: str, + group_key: str, + count: int, +) -> str: + """格式化要回覆到父告警卡的短 digest。""" + safe_alert = html.escape(alertname or "unknown") + safe_severity = html.escape(severity or "unknown") + safe_namespace = html.escape(namespace or "default") + safe_target = 
html.escape(target_resource or "unknown") + safe_group = html.escape(group_key or "unknown") + + return "\n".join( + [ + "🧩 告警已收斂到父卡", + f"├ 類型:{safe_alert}", + f"├ 等級:{safe_severity}", + f"├ 範圍:{safe_namespace}", + f"├ 最新目標:{safe_target}", + f"├ 群組:{safe_group}", + f"└ 目前視窗:{count} 筆同組告警", + "", + "完整子告警請看 AwoooP Run 監控,不再逐筆發 Telegram。", + ] + ) + + +async def maybe_send_grouped_alert_digest( + *, + project_id: str, + alertname: str, + severity: str, + namespace: str, + target_resource: str, + group_key: str, + count: int, + parent_fingerprint: str | None, +) -> bool: + """若父告警卡已存在,回覆一則低頻 digest;找不到父卡則安靜降級。""" + if not parent_fingerprint: + return False + + try: + from sqlalchemy import select + + from src.db.base import get_db_context + from src.db.models import ApprovalRecord + from src.services.telegram_gateway import get_telegram_gateway + + async with get_db_context(project_id) as db: + result = await db.execute( + select(ApprovalRecord.incident_id) + .where(ApprovalRecord.fingerprint == parent_fingerprint) + .where(ApprovalRecord.incident_id.is_not(None)) + .order_by(ApprovalRecord.created_at.desc()) + .limit(1) + ) + incident_id = result.scalar_one_or_none() + + if not incident_id: + logger.info( + "grouped_alert_digest_parent_not_ready", + project_id=project_id, + group_key=group_key, + parent_fingerprint=parent_fingerprint, + ) + return False + + digest_text = format_grouped_alert_digest_text( + alertname=alertname, + severity=severity, + namespace=namespace, + target_resource=target_resource, + group_key=group_key, + count=count, + ) + sent = await get_telegram_gateway().append_grouped_alert_digest( + incident_id=str(incident_id), + group_key=group_key, + digest_text=digest_text, + ) + logger.info( + "grouped_alert_digest_result", + project_id=project_id, + incident_id=str(incident_id), + group_key=group_key, + count=count, + sent=sent, + ) + return sent + except Exception as exc: + logger.warning( + "grouped_alert_digest_failed", + 
project_id=project_id, + group_key=group_key, + parent_fingerprint=parent_fingerprint, + error=str(exc), + ) + return False + + async def record_grouped_alert_event( *, project_id: str, @@ -226,6 +332,16 @@ async def record_grouped_alert_event( group_key=group_key, count=count, ) + await maybe_send_grouped_alert_digest( + project_id=project_id, + alertname=alertname, + severity=severity, + namespace=namespace, + target_resource=target_resource, + group_key=group_key, + count=count, + parent_fingerprint=parent_fingerprint, + ) return event_id except Exception as exc: logger.warning( diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index e6153882..0a1e619f 100644 --- a/apps/api/src/services/telegram_gateway.py +++ b/apps/api/src/services/telegram_gateway.py @@ -56,6 +56,8 @@ INCIDENT_UPDATE_DEDUP_PREFIX = "awoooi:tg_update_dedup:" # {incident_id}:{statu INCIDENT_UPDATE_DEDUP_TTL_SECONDS = 5 * 60 # 5 分鐘內相同狀態不重複洗版 INCIDENT_UPDATE_GLOBAL_FAILURE_DEDUP_PREFIX = "awoooi:tg_update_global_failure_dedup:" INCIDENT_UPDATE_GLOBAL_FAILURE_DEDUP_TTL_SECONDS = 10 * 60 # 相同失敗摘要跨 incident 10 分鐘只推一次 +GROUPED_ALERT_DIGEST_DEDUP_PREFIX = "awoooi:tg_group_digest:" # {group_key} +GROUPED_ALERT_DIGEST_DEDUP_TTL_SECONDS = 5 * 60 # 同一告警群組 5 分鐘只推一則 digest # 2026-04-01 Claude Code: Long Polling 分散式 Leader Election # 防止多 Pod 同時 getUpdates → 409 Conflict 互搶問題 @@ -4765,6 +4767,91 @@ class TelegramGateway: ) return True + async def append_grouped_alert_digest( + self, + *, + incident_id: str, + group_key: str, + digest_text: str, + ) -> bool: + """ + 將同組告警收斂摘要回覆到父告警卡,不移除原卡按鈕。 + + 與 append_incident_update 不同:digest 是觀測訊息,不代表執行狀態改變, + 因此不能動 approve/reject/silence 按鈕。 + """ + redis = get_redis() + stored = await redis.get(f"tg_msg:{incident_id}") + if not stored: + logger.info( + "grouped_alert_digest_no_parent_message", + incident_id=incident_id, + group_key=group_key, + ) + return False + + try: + message_id = int(stored) + except (ValueError, 
TypeError): + logger.warning( + "grouped_alert_digest_invalid_parent_message", + incident_id=incident_id, + stored=stored, + ) + return False + + dedup_key = f"{GROUPED_ALERT_DIGEST_DEDUP_PREFIX}{group_key}" + try: + was_set = await redis.set( + dedup_key, + incident_id, + ex=GROUPED_ALERT_DIGEST_DEDUP_TTL_SECONDS, + nx=True, + ) + if not was_set: + logger.info( + "grouped_alert_digest_dedup_suppressed", + incident_id=incident_id, + group_key=group_key, + ) + return True + except Exception as exc: + logger.warning( + "grouped_alert_digest_dedup_failed", + incident_id=incident_id, + group_key=group_key, + error=str(exc), + ) + + try: + await self._send_request("sendMessage", { + "chat_id": self.alert_chat_id, + "text": digest_text[:1400], + "parse_mode": "HTML", + "reply_parameters": { + "message_id": message_id, + "allow_sending_without_reply": True, + }, + "disable_web_page_preview": True, + }) + except TelegramGatewayError as exc: + logger.warning( + "grouped_alert_digest_reply_failed", + incident_id=incident_id, + group_key=group_key, + message_id=message_id, + error=str(exc), + ) + return False + + logger.info( + "grouped_alert_digest_reply_sent", + incident_id=incident_id, + group_key=group_key, + message_id=message_id, + ) + return True + async def _dispatch_category_action( self, callback_query_id: str, diff --git a/apps/api/tests/test_alert_grouping_service.py b/apps/api/tests/test_alert_grouping_service.py index 70151a3e..6c0a343f 100644 --- a/apps/api/tests/test_alert_grouping_service.py +++ b/apps/api/tests/test_alert_grouping_service.py @@ -12,7 +12,11 @@ ADR-076: 告警聚合引擎 — 告警風暴防禦 import pytest -from src.services.alert_grouping_service import AlertGroupingService, GroupingResult +from src.services.alert_grouping_service import ( + AlertGroupingService, + GroupingResult, + _decode_redis_member, +) class TestBuildGroupKey: @@ -116,6 +120,16 @@ class TestGroupingResultDataclass: assert len(parts) == 2 +class TestRedisMemberDecode: + """測試 Redis zrange 
member 正規化。""" + + def test_decode_bytes_member(self): + assert _decode_redis_member(b"fp-parent", "fallback") == "fp-parent" + + def test_decode_none_uses_fallback(self): + assert _decode_redis_member(None, "fallback") == "fallback" + + class TestAlertGroupingServiceConstants: """測試服務常量設定""" diff --git a/apps/api/tests/test_channel_hub_grouped_alert_events.py b/apps/api/tests/test_channel_hub_grouped_alert_events.py index c16020d6..1cae72ea 100644 --- a/apps/api/tests/test_channel_hub_grouped_alert_events.py +++ b/apps/api/tests/test_channel_hub_grouped_alert_events.py @@ -2,6 +2,7 @@ from __future__ import annotations from src.services.channel_hub import ( build_grouped_alert_provider_event_id, + format_grouped_alert_digest_text, format_grouped_alert_event_content, ) @@ -34,3 +35,20 @@ def test_format_grouped_alert_event_content_keeps_operator_context() -> None: assert "Target: sentry-self-hosted-events-consumer-1" in content assert "Group Count: 4" in content assert "Parent Fingerprint: parent-fp" in content + + +def test_format_grouped_alert_digest_text_is_html_safe() -> None: + content = format_grouped_alert_digest_text( + alertname="Docker<Restart>", + severity="critical", + namespace="default", + target_resource="sentry&snuba", + group_key="Docker:default", + count=7, + ) + + assert "告警已收斂到父卡" in content + assert "Docker&lt;Restart&gt;" in content + assert "sentry&amp;snuba" in content + assert "7 筆同組告警" in content + assert "AwoooP Run 監控" in content