From 45f6f17558155b194f6c21318b837fdccadc0f5d Mon Sep 17 00:00:00 2001
From: Your Name
Date: Mon, 4 May 2026 19:47:42 +0800
Subject: [PATCH] =?UTF-8?q?fix(watchdog):=20dedup=20hash=20=E9=9D=9E?=
 =?UTF-8?q?=E7=A2=BA=E5=AE=9A=E6=80=A7=20bug=20=E2=80=94=20=E6=94=B9?=
 =?UTF-8?q?=E7=94=A8=20hashlib.sha256=20+=20setnx=20atomic?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

根因:Python 內建 hash() 受 PYTHONHASHSEED 影響,每次 process 重啟值不同。
每次 kubectl rollout restart → 新 pod 算出不同 dedup_hash → 繞過 1h TTL → 洗版。

症狀:連續 rollout 4-5 次後,META SYSTEM 每分鐘一條狂發(19:39/40/41/42 截圖)。

修法:
1. hash() → hashlib.sha256(content.encode()).hexdigest()[:12](跨 pod/重啟確定性)
2. redis.exists+setex → redis.set(nx=True) atomic setnx(防多 replica 並發多發)

2026-05-04 ogt

Co-Authored-By: Claude Sonnet 4.6
---
 apps/api/src/jobs/ai_slo_watchdog_job.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/apps/api/src/jobs/ai_slo_watchdog_job.py b/apps/api/src/jobs/ai_slo_watchdog_job.py
index d0c382c1..7c7fbd08 100644
--- a/apps/api/src/jobs/ai_slo_watchdog_job.py
+++ b/apps/api/src/jobs/ai_slo_watchdog_job.py
@@ -183,13 +183,18 @@ async def _check_once() -> None:
         return
 
     # 去重:violations 相同內容 1 小時內不重複發
-    dedup_hash = f"{hash(tuple(sorted(violations))) & 0xFFFFFF:06x}"
+    # 2026-05-04 ogt: 修正 hash() 非確定性 bug — Python hash() 每次 process 重啟值不同
+    # (PYTHONHASHSEED 隨機化),導致每次 rollout 都繞過 dedup。改用 hashlib.sha256 確保跨 pod/重啟一致。
+    import hashlib
+    _content = "|".join(sorted(violations))
+    dedup_hash = hashlib.sha256(_content.encode()).hexdigest()[:12]
     dedup_key = f"watchdog:alert:{dedup_hash}"
     redis = get_redis()
-    if await redis.exists(dedup_key):
+    # setnx atomic — 同時多個 pod 只有第一個能 set,避免並發多發
+    set_ok = await redis.set(dedup_key, "1", ex=_DEDUP_TTL_SEC, nx=True)
+    if not set_ok:
         logger.debug("ai_slo_watchdog_deduped", key=dedup_key)
         return
-    await redis.setex(dedup_key, _DEDUP_TTL_SEC, "1")
     violation_lines = [
         f"{idx + 1}. {item}"
         for idx, item in enumerate(violations)