fix(api): restore converged alert recurrence notifications
This commit is contained in:
194
apps/api/src/services/converged_alert_recurrence_notifier.py
Normal file
194
apps/api/src/services/converged_alert_recurrence_notifier.py
Normal file
@@ -0,0 +1,194 @@
|
||||
"""Throttled Telegram notices for converged alert fingerprints."""
|
||||
|
||||
import hashlib
|
||||
import html
|
||||
|
||||
from src.core.config import settings
|
||||
from src.core.logging import get_logger
|
||||
from src.core.redis_client import get_redis
|
||||
from src.services.telegram_gateway import get_telegram_gateway
|
||||
|
||||
# Module-level logger; every call in this module passes an event name plus
# keyword fields (source, hit_count, approval_id, ...).
logger = get_logger("awoooi.converged_alert_recurrence")
|
||||
|
||||
# Namespace prepended to the hashed fingerprint to form the Redis dedupe key.
CONVERGED_ALERT_RECURRENCE_REDIS_PREFIX = "awoooi:tg:converged_alert_recurrence:"

# Expiry of a dedupe key: 30 minutes, expressed in seconds.
CONVERGED_ALERT_RECURRENCE_TTL_SECONDS = 30 * 60

# Hit counts that still trigger a notice when the Redis dedupe is unavailable.
CONVERGED_ALERT_RECURRENCE_FALLBACK_MILESTONES = frozenset(
    {
        2,
        3,
        5,
        10,
        20,
        50,
        100,
    }
)
|
||||
|
||||
|
||||
def shorten_alert_text(value: str | None, *, limit: int = 80) -> str:
|
||||
"""Keep Telegram recurrence notices readable and HTML-safe."""
|
||||
|
||||
text = " ".join(str(value or "-").split())
|
||||
if len(text) <= limit:
|
||||
return html.escape(text)
|
||||
return html.escape(f"{text[: limit - 3]}...")
|
||||
|
||||
|
||||
def is_converged_alert_recurrence_milestone(hit_count: int) -> bool:
    """Tell whether ``hit_count`` is a milestone worth a fallback notice.

    Used as the dedupe strategy when Redis is unavailable: a count qualifies
    when it is greater than 1 and is either listed in
    ``CONVERGED_ALERT_RECURRENCE_FALLBACK_MILESTONES`` or a multiple of 100.
    """
    if hit_count > 1:
        is_listed = hit_count in CONVERGED_ALERT_RECURRENCE_FALLBACK_MILESTONES
        return is_listed or hit_count % 100 == 0
    # The first occurrence (or a non-positive count) is never a recurrence.
    return False
|
||||
|
||||
|
||||
def converged_alert_recurrence_key(fingerprint: str) -> str:
    """Return the Redis dedupe key for an alert fingerprint.

    The key is the module prefix followed by the first 32 hex characters of
    the SHA-256 digest of the fingerprint; a falsy fingerprint is hashed as
    the literal ``"missing"``.
    """
    raw = str(fingerprint or "missing")
    short_hash = hashlib.sha256(raw.encode()).hexdigest()[:32]
    return CONVERGED_ALERT_RECURRENCE_REDIS_PREFIX + short_hash
|
||||
|
||||
|
||||
async def should_notify_converged_alert_recurrence(
    *,
    fingerprint: str,
    hit_count: int,
) -> bool:
    """Decide whether a recurrence notice should be sent for this hit.

    The first occurrence (``hit_count <= 1``) never notifies. Otherwise the
    function tries to create the fingerprint's dedupe key in Redis with
    ``nx=True`` and an expiry of ``CONVERGED_ALERT_RECURRENCE_TTL_SECONDS``;
    the truthiness of that call's result is returned. If anything in the
    Redis path raises, a warning is logged and the decision falls back to
    ``is_converged_alert_recurrence_milestone``.

    Args:
        fingerprint: Alert fingerprint used to derive the dedupe key.
        hit_count: Number of times the fingerprint has been seen so far.

    Returns:
        ``True`` when the caller should send a notice.
    """
    if hit_count <= 1:
        return False

    dedupe_key = converged_alert_recurrence_key(fingerprint)
    try:
        client = get_redis()
        was_set = await client.set(
            dedupe_key,
            str(hit_count),
            ex=CONVERGED_ALERT_RECURRENCE_TTL_SECONDS,
            nx=True,
        )
    except Exception as exc:
        # Redis is best-effort here: degrade to milestone-based throttling
        # rather than silencing or flooding the channel.
        logger.warning(
            "converged_alert_recurrence_dedup_unavailable",
            # Log only the hashed tail of the key, not the raw fingerprint.
            fingerprint_hash=dedupe_key.rsplit(":", 1)[-1],
            hit_count=hit_count,
            fallback="milestone",
            error=str(exc),
        )
        return is_converged_alert_recurrence_milestone(hit_count)
    return bool(was_set)
|
||||
|
||||
|
||||
def format_converged_alert_recurrence_message(
    *,
    source: str,
    alertname: str,
    severity: str,
    namespace: str,
    target_resource: str,
    hit_count: int,
    incident_id: str | None,
    approval_id: str | None,
    alert_category: str = "",
    notification_type: str = "",
) -> str:
    """Render the HTML recurrence notice for an already-open alert fingerprint.

    Every textual field is passed through ``shorten_alert_text`` with a
    per-field length limit before being wrapped in ``<code>``; ``hit_count``
    is interpolated directly inside ``<b>``.

    Returns:
        The message as newline-joined lines: a two-line header, a blank
        line, one line per field, a blank line and a closing next-step hint.
    """
    header = [
        "<b>告警仍在發生</b>",
        "同一指紋已收斂,系統保留去重,但不再完全靜音。",
        "",
    ]
    details = [
        f"來源:<code>{shorten_alert_text(source, limit=40)}</code>",
        f"告警:<code>{shorten_alert_text(alertname, limit=80)}</code>",
        f"嚴重度:<code>{shorten_alert_text(severity, limit=24)}</code>",
        f"目標:<code>{shorten_alert_text(target_resource, limit=80)}</code>",
        f"命名空間:<code>{shorten_alert_text(namespace, limit=48)}</code>",
        f"累計次數:<b>{hit_count}</b>",
        f"事件:<code>{shorten_alert_text(incident_id, limit=48)}</code>",
        f"簽核:<code>{shorten_alert_text(approval_id, limit=48)}</code>",
        f"分類:<code>{shorten_alert_text(alert_category or '-', limit=48)}</code>",
        f"通知型別:<code>{shorten_alert_text(notification_type or '-', limit=48)}</code>",
    ]
    footer = [
        "",
        "下一步:請查看 AwoooP 事件時間線;這不是新的自動修復授權。",
    ]
    return "\n".join(header + details + footer)
|
||||
|
||||
|
||||
async def notify_converged_alert_recurrence(
    *,
    source: str,
    fingerprint: str,
    alertname: str,
    severity: str,
    namespace: str,
    target_resource: str,
    hit_count: int,
    incident_id: str | None,
    approval_id: str | None,
    alert_category: str = "",
    notification_type: str = "",
) -> None:
    """Send a throttled recurrence notice for an already-open alert fingerprint.

    The notice is skipped (and an info event logged) when
    ``should_notify_converged_alert_recurrence`` returns ``False``.
    Otherwise the formatted message is sent through the gateway's alert
    channel and, when ``settings.OPENCLAW_TG_CHAT_ID`` is set and differs
    from ``gateway.alert_chat_id``, mirrored to that private chat.

    Delivery is best-effort: exceptions raised by either send are logged and
    swallowed. An error event is logged only when no send succeeded.

    NOTE: the throttle decision is taken before sending, so a notice whose
    delivery fails is not retried by this function.
    """
    if not await should_notify_converged_alert_recurrence(
        fingerprint=fingerprint,
        hit_count=hit_count,
    ):
        logger.info(
            "converged_alert_recurrence_throttled",
            source=source,
            hit_count=hit_count,
            approval_id=approval_id,
        )
        return

    text = format_converged_alert_recurrence_message(
        source=source,
        alertname=alertname,
        severity=severity,
        namespace=namespace,
        target_resource=target_resource,
        hit_count=hit_count,
        incident_id=incident_id,
        approval_id=approval_id,
        alert_category=alert_category,
        notification_type=notification_type,
    )

    gateway = get_telegram_gateway()
    sent_count = 0
    failures: list[str] = []

    try:
        await gateway.send_alert_notification(text)
    except Exception as exc:
        failures.append(f"primary:{type(exc).__name__}")
        logger.warning(
            "converged_alert_recurrence_primary_failed",
            source=source,
            approval_id=approval_id,
            error=str(exc),
        )
    else:
        sent_count += 1

    private_chat_id = settings.OPENCLAW_TG_CHAT_ID
    # Mirror only to a distinct chat, so the same chat never gets the notice twice.
    should_mirror = bool(private_chat_id and private_chat_id != gateway.alert_chat_id)
    # Tracks actual delivery of the mirror, not merely that one was configured.
    mirrored_to_private = False
    if should_mirror:
        try:
            await gateway.send_notification(text, chat_id=private_chat_id)
        except Exception as exc:
            failures.append(f"private:{type(exc).__name__}")
            logger.warning(
                "converged_alert_recurrence_private_mirror_failed",
                source=source,
                approval_id=approval_id,
                error=str(exc),
            )
        else:
            sent_count += 1
            mirrored_to_private = True

    if sent_count:
        logger.info(
            "converged_alert_recurrence_sent",
            source=source,
            hit_count=hit_count,
            approval_id=approval_id,
            mirrored_to_private=mirrored_to_private,
            sent_count=sent_count,
        )
    else:
        logger.error(
            "converged_alert_recurrence_failed",
            source=source,
            hit_count=hit_count,
            approval_id=approval_id,
            failures=failures,
        )
|
||||
Reference in New Issue
Block a user