fix(api): restore converged alert recurrence notifications
All checks were successful
CD Pipeline / tests (push) Successful in 1m26s
Code Review / ai-code-review (push) Successful in 15s
CD Pipeline / build-and-deploy (push) Successful in 4m18s
CD Pipeline / post-deploy-checks (push) Successful in 1m50s

This commit is contained in:
Your Name
2026-06-11 12:24:10 +08:00
parent 0f9f341afc
commit dfca4dd67e
3 changed files with 367 additions and 6 deletions

View File

@@ -0,0 +1,194 @@
"""Throttled Telegram notices for converged alert fingerprints."""
import hashlib
import html
from src.core.config import settings
from src.core.logging import get_logger
from src.core.redis_client import get_redis
from src.services.telegram_gateway import get_telegram_gateway
# Module-level logger used for the throttle and delivery events below.
logger = get_logger("awoooi.converged_alert_recurrence")
# Prefix prepended to the hashed fingerprint to build the Redis throttle key.
CONVERGED_ALERT_RECURRENCE_REDIS_PREFIX = "awoooi:tg:converged_alert_recurrence:"
# Expiry, in seconds (30 minutes), passed as ``ex=`` when the throttle key is set.
CONVERGED_ALERT_RECURRENCE_TTL_SECONDS = 30 * 60
# Hit counts that still produce a notice when the Redis throttle raises and the
# milestone fallback is used instead.
CONVERGED_ALERT_RECURRENCE_FALLBACK_MILESTONES = frozenset({2, 3, 5, 10, 20, 50, 100})
def shorten_alert_text(value: str | None, *, limit: int = 80) -> str:
"""Keep Telegram recurrence notices readable and HTML-safe."""
text = " ".join(str(value or "-").split())
if len(text) <= limit:
return html.escape(text)
return html.escape(f"{text[: limit - 3]}...")
def is_converged_alert_recurrence_milestone(hit_count: int) -> bool:
    """Tell whether ``hit_count`` warrants a notice in milestone fallback mode.

    The first hit (or anything lower) never notifies. After that, only the
    counts listed in ``CONVERGED_ALERT_RECURRENCE_FALLBACK_MILESTONES`` and
    every multiple of 100 do.
    """
    return hit_count > 1 and (
        hit_count in CONVERGED_ALERT_RECURRENCE_FALLBACK_MILESTONES
        or hit_count % 100 == 0
    )
def converged_alert_recurrence_key(fingerprint: str) -> str:
    """Return the Redis throttle key for an alert fingerprint.

    The key is the module prefix followed by the first 32 hex characters of
    the fingerprint's SHA-256 digest. A falsy fingerprint is hashed as the
    literal string ``"missing"``.
    """
    raw = str(fingerprint or "missing")
    hex_digest = hashlib.sha256(raw.encode()).hexdigest()
    return CONVERGED_ALERT_RECURRENCE_REDIS_PREFIX + hex_digest[:32]
async def should_notify_converged_alert_recurrence(
    *,
    fingerprint: str,
    hit_count: int,
) -> bool:
    """Decide whether a recurrence of ``fingerprint`` should be announced.

    The first hit is never announced. For later hits the function tries to
    create a per-fingerprint Redis key with ``nx=True`` and a TTL; the notice
    is allowed only when that ``set`` call reports success. If the Redis call
    raises, a warning is logged and the decision falls back to the milestone
    rule so notices are reduced rather than lost entirely.
    """
    if hit_count <= 1:
        return False

    throttle_key = converged_alert_recurrence_key(fingerprint)
    try:
        client = get_redis()
        was_set = await client.set(
            throttle_key,
            str(hit_count),
            nx=True,
            ex=CONVERGED_ALERT_RECURRENCE_TTL_SECONDS,
        )
    except Exception as exc:
        # Log only the hashed suffix of the key, not the raw fingerprint.
        logger.warning(
            "converged_alert_recurrence_dedup_unavailable",
            fingerprint_hash=throttle_key.rpartition(":")[-1],
            hit_count=hit_count,
            fallback="milestone",
            error=str(exc),
        )
        return is_converged_alert_recurrence_milestone(hit_count)
    return bool(was_set)
def format_converged_alert_recurrence_message(
    *,
    source: str,
    alertname: str,
    severity: str,
    namespace: str,
    target_resource: str,
    hit_count: int,
    incident_id: str | None,
    approval_id: str | None,
    alert_category: str = "",
    notification_type: str = "",
) -> str:
    """Render the HTML body of a recurrence notice.

    Every textual field is passed through ``shorten_alert_text`` with its own
    length limit and wrapped in ``<code>``; the hit count is shown in bold.
    Lines are joined with newlines and returned as a single string.
    """

    def code(value: str | None, limit: int) -> str:
        # Shortened, escaped value wrapped in a <code> element.
        return f"<code>{shorten_alert_text(value, limit=limit)}</code>"

    lines = [
        "<b>告警仍在發生</b>",
        "同一指紋已收斂,系統保留去重,但不再完全靜音。",
        "",
        "來源:" + code(source, 40),
        "告警:" + code(alertname, 80),
        "嚴重度:" + code(severity, 24),
        "目標:" + code(target_resource, 80),
        "命名空間:" + code(namespace, 48),
        f"累計次數:<b>{hit_count}</b>",
        "事件:" + code(incident_id, 48),
        "簽核:" + code(approval_id, 48),
        "分類:" + code(alert_category or "-", 48),
        "通知型別:" + code(notification_type or "-", 48),
        "",
        "下一步:請查看 AwoooP 事件時間線;這不是新的自動修復授權。",
    ]
    return "\n".join(lines)
async def notify_converged_alert_recurrence(
    *,
    source: str,
    fingerprint: str,
    alertname: str,
    severity: str,
    namespace: str,
    target_resource: str,
    hit_count: int,
    incident_id: str | None,
    approval_id: str | None,
    alert_category: str = "",
    notification_type: str = "",
) -> None:
    """Send a throttled recurrence notice for an already-open alert fingerprint.

    The throttle decision is made first; a throttled call only logs and
    returns. Otherwise the formatted notice is sent to the gateway's alert
    channel and, when ``settings.OPENCLAW_TG_CHAT_ID`` is set and differs
    from ``gateway.alert_chat_id``, mirrored to that private chat. Each send
    is attempted independently and its failure is logged rather than raised.

    Args:
        source: Origin of the alert; shown in the notice and in logs.
        fingerprint: Alert fingerprint; used only for the throttle decision.
        alertname: Alert name shown in the notice.
        severity: Severity shown in the notice.
        namespace: Namespace shown in the notice.
        target_resource: Affected resource shown in the notice.
        hit_count: Number of times the fingerprint has been seen so far.
        incident_id: Related incident identifier, if any.
        approval_id: Related approval identifier, if any.
        alert_category: Optional category label.
        notification_type: Optional notification type label.
    """
    if not await should_notify_converged_alert_recurrence(
        fingerprint=fingerprint,
        hit_count=hit_count,
    ):
        logger.info(
            "converged_alert_recurrence_throttled",
            source=source,
            hit_count=hit_count,
            approval_id=approval_id,
        )
        return

    text = format_converged_alert_recurrence_message(
        source=source,
        alertname=alertname,
        severity=severity,
        namespace=namespace,
        target_resource=target_resource,
        hit_count=hit_count,
        incident_id=incident_id,
        approval_id=approval_id,
        alert_category=alert_category,
        notification_type=notification_type,
    )
    gateway = get_telegram_gateway()
    sent_count = 0
    failures: list[str] = []

    try:
        await gateway.send_alert_notification(text)
    except Exception as exc:
        failures.append(f"primary:{type(exc).__name__}")
        logger.warning(
            "converged_alert_recurrence_primary_failed",
            source=source,
            approval_id=approval_id,
            error=str(exc),
        )
    else:
        sent_count += 1

    private_chat_id = settings.OPENCLAW_TG_CHAT_ID
    # Tracks actual delivery of the mirror, not merely that one was attempted,
    # so the success log cannot claim a mirror that failed.
    mirrored_to_private = False
    if private_chat_id and private_chat_id != gateway.alert_chat_id:
        try:
            await gateway.send_notification(text, chat_id=private_chat_id)
        except Exception as exc:
            failures.append(f"private:{type(exc).__name__}")
            logger.warning(
                "converged_alert_recurrence_private_mirror_failed",
                source=source,
                approval_id=approval_id,
                error=str(exc),
            )
        else:
            sent_count += 1
            mirrored_to_private = True

    if sent_count:
        logger.info(
            "converged_alert_recurrence_sent",
            source=source,
            hit_count=hit_count,
            approval_id=approval_id,
            mirrored_to_private=mirrored_to_private,
            sent_count=sent_count,
            # Non-empty when only one of the two channels succeeded.
            failures=failures,
        )
    else:
        # NOTE(review): the throttle decision was already taken above, so a
        # fully failed delivery appears to leave the fingerprint throttled
        # until the throttle key expires - confirm whether it should be
        # released here so the next recurrence can retry.
        logger.error(
            "converged_alert_recurrence_failed",
            source=source,
            hit_count=hit_count,
            approval_id=approval_id,
            failures=failures,
        )