fix(rules): ADR-064 L1 Redis 分散式鎖防止多 Pod 重複生成規則
Some checks are pending
CD Pipeline / build-and-deploy (push) Has started running
Some checks are pending
CD Pipeline / build-and-deploy (push) Has started running
問題: _generating set 是進程級,多 Pod 各自獨立,同一 alertname 可能被多個 Pod 同時送給 Ollama/Gemini 生成規則
修復: SET NX EX lock_key — 只有第一個 Pod 能取鎖,其他 Pod 直接跳過
降級: Redis 不可用時 fallback 進程級 set(保持原有行為)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -35,9 +35,12 @@ logger = structlog.get_logger(__name__)
|
||||
|
||||
RULES_FILE = Path(__file__).parent.parent.parent / "alert_rules.yaml"
|
||||
|
||||
# 防止同一 alertname 重複生成(進程記憶體內去重)
|
||||
# 進程級去重(保留作為 Redis 不可用時的 fallback)
|
||||
_generating: set[str] = set()
|
||||
|
||||
# Redis 分散式鎖 TTL (秒),覆蓋 Ollama + Gemini 最長生成時間
|
||||
_RULE_GEN_LOCK_TTL = 120
|
||||
|
||||
# ── 變數提取 ────────────────────────────────────────────────
|
||||
|
||||
|
||||
@@ -375,12 +378,30 @@ async def auto_generate_rule(
|
||||
alertname_safe = re.sub(r"[{}]", "", alertname)
|
||||
|
||||
# 去重:同一 alertname 同時只跑一次
|
||||
# ADR-064 L1: 優先用 Redis 分散式鎖,Redis 不可用時 fallback 進程級 set
|
||||
if alertname_safe in _generating:
|
||||
return
|
||||
if _rule_id_exists(alertname_safe):
|
||||
logger.debug("auto_rule_skip_exists", alertname=alertname_safe)
|
||||
return
|
||||
|
||||
# 嘗試取得 Redis 分散式鎖
|
||||
lock_key = f"rule_generating:{alertname_safe}"
|
||||
redis_lock_acquired = False
|
||||
try:
|
||||
from src.core.redis_client import get_redis
|
||||
redis = get_redis()
|
||||
# SET NX EX — 只有第一個 Pod 能 SET 成功
|
||||
redis_lock_acquired = bool(await redis.set(lock_key, "1", nx=True, ex=_RULE_GEN_LOCK_TTL))
|
||||
if not redis_lock_acquired:
|
||||
logger.info("auto_rule_skip_redis_lock", alertname=alertname_safe)
|
||||
return
|
||||
except Exception as redis_err:
|
||||
# Redis 不可用時 fallback 進程級去重(降級不中斷)
|
||||
logger.warning("auto_rule_redis_lock_unavailable", error=str(redis_err))
|
||||
if alertname_safe in _generating:
|
||||
return
|
||||
|
||||
_generating.add(alertname_safe)
|
||||
try:
|
||||
rule_id = re.sub(r"[^a-z0-9_]", "_", alertname_safe.lower()).strip("_")
|
||||
@@ -427,3 +448,10 @@ async def auto_generate_rule(
|
||||
logger.error("auto_rule_exception", alertname=alertname_safe, error=str(e))
|
||||
finally:
|
||||
_generating.discard(alertname_safe)
|
||||
# 釋放 Redis 分散式鎖(生成完成,讓其他 Pod 可以讀到新規則)
|
||||
if redis_lock_acquired:
|
||||
try:
|
||||
from src.core.redis_client import get_redis
|
||||
await get_redis().delete(lock_key)
|
||||
except Exception:
|
||||
pass # TTL 到期自動釋放,不影響正確性
|
||||
|
||||
Reference in New Issue
Block a user