fix(aiops): suppress repeated LLM alert loops
This commit is contained in:
@@ -53,6 +53,10 @@ from src.models.approval import (
|
||||
# [首席架構師] 移除 generate_alert_fingerprint 直接 import,改用 AlertAnalyzer.generate_fingerprint v1.2 2026-04-01 Asia/Taipei
|
||||
from src.models.webhook import AlertPayload, AlertResponse
|
||||
from src.services.alert_analyzer_service import AlertAnalyzer
|
||||
from src.services.alertmanager_llm_guard import (
|
||||
ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS,
|
||||
try_acquire_alertmanager_llm_lock,
|
||||
)
|
||||
from src.services.approval_db import get_approval_service
|
||||
|
||||
# Phase 17 P0: Service 層 (消除 Router 直接存取 Redis)
|
||||
@@ -2150,12 +2154,22 @@ async def alertmanager_webhook(
|
||||
# 2026-04-14 Claude Haiku 4.5 Asia/Taipei
|
||||
# 位置:指紋生成後、LLM 分析前(短路子告警)
|
||||
# ==========================================================================
|
||||
grouping_result = await get_alert_grouping_service().evaluate(
|
||||
alertname=alertname,
|
||||
namespace=namespace,
|
||||
fingerprint=fingerprint,
|
||||
)
|
||||
if grouping_result.is_grouped:
|
||||
try:
|
||||
grouping_result = await get_alert_grouping_service().evaluate(
|
||||
alertname=alertname,
|
||||
namespace=namespace,
|
||||
fingerprint=fingerprint,
|
||||
)
|
||||
except Exception as e:
|
||||
grouping_result = None
|
||||
logger.warning(
|
||||
"alertmanager_grouping_failed_fail_open",
|
||||
alert_id=alert_id,
|
||||
fingerprint=fingerprint,
|
||||
error=str(e),
|
||||
)
|
||||
|
||||
if grouping_result and grouping_result.is_grouped:
|
||||
logger.info(
|
||||
"alertmanager_grouped_skip",
|
||||
alert_id=alert_id,
|
||||
@@ -2258,6 +2272,21 @@ async def alertmanager_webhook(
|
||||
approval_created=False,
|
||||
)
|
||||
|
||||
if not await try_acquire_alertmanager_llm_lock(fingerprint, alert_id):
|
||||
logger.info(
|
||||
"alertmanager_llm_inflight_suppressed",
|
||||
alert_id=alert_id,
|
||||
fingerprint=fingerprint,
|
||||
ttl_seconds=ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS,
|
||||
)
|
||||
return AlertResponse(
|
||||
success=True,
|
||||
message="🛡️ 告警已由同指紋背景 AI 分析處理中,跳過重複 LLM 呼叫",
|
||||
alert_id=alert_id,
|
||||
approval_created=False,
|
||||
converged=True,
|
||||
)
|
||||
|
||||
# ==========================================================================
|
||||
# ADR-089 (2026-04-17 ogt + Claude Sonnet 4.6): 新告警 — 背景 LLM 分析
|
||||
# 立即回傳 202,AI 辯證在背景非同步執行
|
||||
@@ -2271,6 +2300,7 @@ async def alertmanager_webhook(
|
||||
"source": "alertmanager",
|
||||
"target_resource": target_resource,
|
||||
"namespace": namespace,
|
||||
"fingerprint": fingerprint,
|
||||
"message": message,
|
||||
"annotations": dict(alert.annotations) if alert.annotations else {},
|
||||
"metrics": {},
|
||||
@@ -2303,11 +2333,18 @@ async def alertmanager_webhook(
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("alertmanager_error", error=str(e))
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=f"Failed to process alert: {str(e)}",
|
||||
) from e
|
||||
logger.error(
|
||||
"alertmanager_degraded_accepted_no_retry",
|
||||
alert_id=alert_id,
|
||||
fingerprint=fingerprint,
|
||||
error=str(e),
|
||||
)
|
||||
return AlertResponse(
|
||||
success=False,
|
||||
message="⚠️ 告警已接收但處理降級,避免 Alertmanager retry storm;已交由背景治理/人工介入追蹤",
|
||||
alert_id=alert_id,
|
||||
approval_created=False,
|
||||
)
|
||||
|
||||
|
||||
@router.get(
|
||||
|
||||
Reference in New Issue
Block a user