fix(api): restore converged alert recurrence notifications
All checks were successful
CD Pipeline / tests (push) Successful in 1m26s
Code Review / ai-code-review (push) Successful in 15s
CD Pipeline / build-and-deploy (push) Successful in 4m18s
CD Pipeline / post-deploy-checks (push) Successful in 1m50s

This commit is contained in:
Your Name
2026-06-11 12:24:10 +08:00
parent 0f9f341afc
commit dfca4dd67e
3 changed files with 367 additions and 6 deletions

View File

@@ -59,6 +59,9 @@ from src.services.channel_hub import (
record_alertmanager_event,
record_grouped_alert_event,
)
from src.services.converged_alert_recurrence_notifier import (
notify_converged_alert_recurrence,
)
# Phase 15.2: Trace Context (moved to SignalProducerService)
# get_trace_context 已移至 Service 層
@@ -1148,15 +1151,29 @@ async def receive_alert(
# 避免 Telegram 洗版,用戶可在 UI 查看聚合次數
# =================================================================
logger.info(
"alert_converged_telegram_skipped",
"alert_converged_telegram_recurrence_scheduled",
approval_id=str(updated_approval.id),
hit_count=updated_approval.hit_count,
reason="Converged alert - Telegram already sent for this fingerprint",
reason="Converged alert - scheduling throttled recurrence notice",
)
background_tasks.add_task(
notify_converged_alert_recurrence,
source=alert.source,
fingerprint=fingerprint,
alertname=alert.alert_type,
severity=alert.severity,
namespace=alert.namespace,
target_resource=alert.target_resource,
hit_count=updated_approval.hit_count,
incident_id=getattr(updated_approval, "incident_id", None),
approval_id=str(updated_approval.id),
alert_category=alert.alert_type,
notification_type="generic",
)
return AlertResponse(
success=True,
message=f"🛡️ 告警收斂 (x{updated_approval.hit_count}) - Telegram 已發送,跳過重複通知",
message=f"🛡️ 告警收斂 (x{updated_approval.hit_count}) - 已排程節流再通知",
alert_id=alert_id,
approval_created=False, # 未建立新卡片
approval_id=str(updated_approval.id),
@@ -2693,10 +2710,10 @@ async def alertmanager_webhook(
# 2026-03-27 ogt: 收斂告警不重複發送 Telegram只更新 hit_count
# 用戶可在 UI 查看聚合次數,避免 Telegram 洗版
logger.info(
"alertmanager_converged_telegram_skipped",
"alertmanager_converged_telegram_recurrence_scheduled",
approval_id=str(updated_approval.id),
hit_count=updated_approval.hit_count,
reason="Converged alert - Telegram already sent for this fingerprint",
reason="Converged alert - scheduling throttled recurrence notice",
)
background_tasks.add_task(
record_alertmanager_event,
@@ -2718,10 +2735,24 @@ async def alertmanager_webhook(
labels=dict(alert.labels) if alert.labels else {},
annotations=dict(alert.annotations) if alert.annotations else {},
)
background_tasks.add_task(
notify_converged_alert_recurrence,
source="alertmanager",
fingerprint=fingerprint,
alertname=alertname,
severity=severity,
namespace=namespace,
target_resource=target_resource,
hit_count=updated_approval.hit_count,
incident_id=getattr(updated_approval, "incident_id", None),
approval_id=str(updated_approval.id),
alert_category=alert_category,
notification_type=notification_type,
)
return AlertResponse(
success=True,
message=f"🛡️ 告警收斂 (x{updated_approval.hit_count}) - Telegram 已發送,跳過重複通知",
message=f"🛡️ 告警收斂 (x{updated_approval.hit_count}) - 已排程節流再通知",
alert_id=alert_id,
approval_created=False,
approval_id=str(updated_approval.id),

View File

@@ -0,0 +1,194 @@
"""Throttled Telegram notices for converged alert fingerprints."""
import hashlib
import html
from src.core.config import settings
from src.core.logging import get_logger
from src.core.redis_client import get_redis
from src.services.telegram_gateway import get_telegram_gateway
logger = get_logger("awoooi.converged_alert_recurrence")
CONVERGED_ALERT_RECURRENCE_REDIS_PREFIX = "awoooi:tg:converged_alert_recurrence:"
CONVERGED_ALERT_RECURRENCE_TTL_SECONDS = 30 * 60
CONVERGED_ALERT_RECURRENCE_FALLBACK_MILESTONES = frozenset({2, 3, 5, 10, 20, 50, 100})
def shorten_alert_text(value: str | None, *, limit: int = 80) -> str:
"""Keep Telegram recurrence notices readable and HTML-safe."""
text = " ".join(str(value or "-").split())
if len(text) <= limit:
return html.escape(text)
return html.escape(f"{text[: limit - 3]}...")
def is_converged_alert_recurrence_milestone(hit_count: int) -> bool:
"""Fallback dedupe when Redis is unavailable."""
if hit_count <= 1:
return False
return hit_count in CONVERGED_ALERT_RECURRENCE_FALLBACK_MILESTONES or hit_count % 100 == 0
def converged_alert_recurrence_key(fingerprint: str) -> str:
digest = hashlib.sha256(str(fingerprint or "missing").encode()).hexdigest()[:32]
return f"{CONVERGED_ALERT_RECURRENCE_REDIS_PREFIX}{digest}"
async def should_notify_converged_alert_recurrence(
*,
fingerprint: str,
hit_count: int,
) -> bool:
"""Throttle converged alert notices without making active incidents disappear."""
if hit_count <= 1:
return False
key = converged_alert_recurrence_key(fingerprint)
try:
redis = get_redis()
acquired = await redis.set(
key,
str(hit_count),
ex=CONVERGED_ALERT_RECURRENCE_TTL_SECONDS,
nx=True,
)
return bool(acquired)
except Exception as exc:
logger.warning(
"converged_alert_recurrence_dedup_unavailable",
fingerprint_hash=key.rsplit(":", 1)[-1],
hit_count=hit_count,
fallback="milestone",
error=str(exc),
)
return is_converged_alert_recurrence_milestone(hit_count)
def format_converged_alert_recurrence_message(
*,
source: str,
alertname: str,
severity: str,
namespace: str,
target_resource: str,
hit_count: int,
incident_id: str | None,
approval_id: str | None,
alert_category: str = "",
notification_type: str = "",
) -> str:
"""Build a concise recurrence notice for an already-open alert fingerprint."""
return "\n".join(
[
"<b>告警仍在發生</b>",
"同一指紋已收斂,系統保留去重,但不再完全靜音。",
"",
f"來源:<code>{shorten_alert_text(source, limit=40)}</code>",
f"告警:<code>{shorten_alert_text(alertname, limit=80)}</code>",
f"嚴重度:<code>{shorten_alert_text(severity, limit=24)}</code>",
f"目標:<code>{shorten_alert_text(target_resource, limit=80)}</code>",
f"命名空間:<code>{shorten_alert_text(namespace, limit=48)}</code>",
f"累計次數:<b>{hit_count}</b>",
f"事件:<code>{shorten_alert_text(incident_id, limit=48)}</code>",
f"簽核:<code>{shorten_alert_text(approval_id, limit=48)}</code>",
f"分類:<code>{shorten_alert_text(alert_category or '-', limit=48)}</code>",
f"通知型別:<code>{shorten_alert_text(notification_type or '-', limit=48)}</code>",
"",
"下一步:請查看 AwoooP 事件時間線;這不是新的自動修復授權。",
]
)
async def notify_converged_alert_recurrence(
*,
source: str,
fingerprint: str,
alertname: str,
severity: str,
namespace: str,
target_resource: str,
hit_count: int,
incident_id: str | None,
approval_id: str | None,
alert_category: str = "",
notification_type: str = "",
) -> None:
"""Send a throttled recurrence notice for an already-open alert fingerprint."""
if not await should_notify_converged_alert_recurrence(
fingerprint=fingerprint,
hit_count=hit_count,
):
logger.info(
"converged_alert_recurrence_throttled",
source=source,
hit_count=hit_count,
approval_id=approval_id,
)
return
text = format_converged_alert_recurrence_message(
source=source,
alertname=alertname,
severity=severity,
namespace=namespace,
target_resource=target_resource,
hit_count=hit_count,
incident_id=incident_id,
approval_id=approval_id,
alert_category=alert_category,
notification_type=notification_type,
)
gateway = get_telegram_gateway()
sent_count = 0
failures: list[str] = []
try:
await gateway.send_alert_notification(text)
sent_count += 1
except Exception as exc:
failures.append(f"primary:{type(exc).__name__}")
logger.warning(
"converged_alert_recurrence_primary_failed",
source=source,
approval_id=approval_id,
error=str(exc),
)
private_chat_id = settings.OPENCLAW_TG_CHAT_ID
if private_chat_id and private_chat_id != gateway.alert_chat_id:
try:
await gateway.send_notification(text, chat_id=private_chat_id)
sent_count += 1
except Exception as exc:
failures.append(f"private:{type(exc).__name__}")
logger.warning(
"converged_alert_recurrence_private_mirror_failed",
source=source,
approval_id=approval_id,
error=str(exc),
)
if sent_count:
logger.info(
"converged_alert_recurrence_sent",
source=source,
hit_count=hit_count,
approval_id=approval_id,
mirrored_to_private=bool(private_chat_id and private_chat_id != gateway.alert_chat_id),
sent_count=sent_count,
)
else:
logger.error(
"converged_alert_recurrence_failed",
source=source,
hit_count=hit_count,
approval_id=approval_id,
failures=failures,
)