fix(aiops): suppress repeated llm alert loops
Some checks failed
CD Pipeline / tests (push) Successful in 1m37s
Code Review / ai-code-review (push) Successful in 28s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled

This commit is contained in:
Your Name
2026-05-01 13:02:07 +08:00
parent 3691402561
commit 9db87f177e
9 changed files with 244 additions and 24 deletions

View File

@@ -53,6 +53,10 @@ from src.models.approval import (
# [首席架構師] 移除 generate_alert_fingerprint 直接 import,改用 AlertAnalyzer.generate_fingerprint v1.2 2026-04-01 Asia/Taipei
from src.models.webhook import AlertPayload, AlertResponse
from src.services.alert_analyzer_service import AlertAnalyzer
from src.services.alertmanager_llm_guard import (
ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS,
try_acquire_alertmanager_llm_lock,
)
from src.services.approval_db import get_approval_service
# Phase 17 P0: Service 層 (消除 Router 直接存取 Redis)
@@ -2150,12 +2154,22 @@ async def alertmanager_webhook(
# 2026-04-14 Claude Haiku 4.5 Asia/Taipei
# 位置:指紋生成後、LLM 分析前(短路子告警)
# ==========================================================================
grouping_result = await get_alert_grouping_service().evaluate(
alertname=alertname,
namespace=namespace,
fingerprint=fingerprint,
)
if grouping_result.is_grouped:
try:
grouping_result = await get_alert_grouping_service().evaluate(
alertname=alertname,
namespace=namespace,
fingerprint=fingerprint,
)
except Exception as e:
grouping_result = None
logger.warning(
"alertmanager_grouping_failed_fail_open",
alert_id=alert_id,
fingerprint=fingerprint,
error=str(e),
)
if grouping_result and grouping_result.is_grouped:
logger.info(
"alertmanager_grouped_skip",
alert_id=alert_id,
@@ -2258,6 +2272,21 @@ async def alertmanager_webhook(
approval_created=False,
)
if not await try_acquire_alertmanager_llm_lock(fingerprint, alert_id):
logger.info(
"alertmanager_llm_inflight_suppressed",
alert_id=alert_id,
fingerprint=fingerprint,
ttl_seconds=ALERTMANAGER_LLM_INFLIGHT_LOCK_TTL_SECONDS,
)
return AlertResponse(
success=True,
message="🛡️ 告警已由同指紋背景 AI 分析處理中,跳過重複 LLM 呼叫",
alert_id=alert_id,
approval_created=False,
converged=True,
)
# ==========================================================================
# ADR-089 (2026-04-17 ogt + Claude Sonnet 4.6): 新告警 — 背景 LLM 分析
# 立即回傳 202,AI 辯證在背景非同步執行
@@ -2271,6 +2300,7 @@ async def alertmanager_webhook(
"source": "alertmanager",
"target_resource": target_resource,
"namespace": namespace,
"fingerprint": fingerprint,
"message": message,
"annotations": dict(alert.annotations) if alert.annotations else {},
"metrics": {},
@@ -2303,11 +2333,18 @@ async def alertmanager_webhook(
)
except Exception as e:
logger.error("alertmanager_error", error=str(e))
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Failed to process alert: {str(e)}",
) from e
logger.error(
"alertmanager_degraded_accepted_no_retry",
alert_id=alert_id,
fingerprint=fingerprint,
error=str(e),
)
return AlertResponse(
success=False,
message="⚠️ 告警已接收但處理降級,避免 Alertmanager retry storm已交由背景治理/人工介入追蹤",
alert_id=alert_id,
approval_created=False,
)
@router.get(