diff --git a/apps/api/src/jobs/ai_slo_watchdog_job.py b/apps/api/src/jobs/ai_slo_watchdog_job.py index d0c382c1..7c7fbd08 100644 --- a/apps/api/src/jobs/ai_slo_watchdog_job.py +++ b/apps/api/src/jobs/ai_slo_watchdog_job.py @@ -183,13 +183,18 @@ async def _check_once() -> None: return # 去重:violations 相同內容 1 小時內不重複發 - dedup_hash = f"{hash(tuple(sorted(violations))) & 0xFFFFFF:06x}" + # 2026-05-04 ogt: 修正 hash() 非確定性 bug — Python hash() 每次 process 重啟值不同 + # (PYTHONHASHSEED 隨機化),導致每次 rollout 都繞過 dedup。改用 hashlib.sha256 確保跨 pod/重啟一致。 + import hashlib + _content = "|".join(sorted(violations)) + dedup_hash = hashlib.sha256(_content.encode()).hexdigest()[:12] dedup_key = f"watchdog:alert:{dedup_hash}" redis = get_redis() - if await redis.exists(dedup_key): + # setnx atomic — 同時多個 pod 只有第一個能 set,避免並發多發 + set_ok = await redis.set(dedup_key, "1", ex=_DEDUP_TTL_SEC, nx=True) + if not set_ok: logger.debug("ai_slo_watchdog_deduped", key=dedup_key) return - await redis.setex(dedup_key, _DEDUP_TTL_SEC, "1") violation_lines = [ f"{idx + 1}. {item}" for idx, item in enumerate(violations)