From b3a0f0d76635d8d200fdf7057e1ffdc8e20f6940 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 2 May 2026 16:25:48 +0800 Subject: [PATCH] fix(telegram): dedup by fingerprint + 24h TTL to stop repeat alerts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Telegram 重複發告警鐵證(4 個 agent 真實數據): - INC-6FE3BD (HostBackupFailed) 24h 內被推 15 次 - INC-FD6E21 (HostHighCpuLoad) 24h 內被推 6 次 - 06:44:18 同秒兩送 = pod 並發 race 根因: 1. `telegram_sent:{incident_id}` dedup key 綁 uuid4 隨機 INC ID, 同 fingerprint 換新 INC 完全不去重 2. dedup TTL=600s 比 incident_analysis_sweeper 重觸週期 1h、 alertmanager repeat_interval 4h 都短 → 每輪都過期通過 3. pod restart 走 _resend_unconfirmed_ready_tokens 用同一 incident_id key → 重啟必炸一波 修法(不消音、是「AI 認得這是同一事故」): - decision_manager.py:207-225 dedup key 改 alertname+target fingerprint - decision_manager.py:573-578 TTL 600s → 86400s (蓋住 sweeper 1h × alertmanager 4h) - decision_manager.py:3189-3208 pod restart resend 路徑同步改 fingerprint - incident_analysis_sweeper.py:37-42 sweeper_done TTL 3600s → 86400s 預期:同症狀 24h 內最多發 1 張 decision card;resolved 後 line 220-226 status check 會 early return,不影響復發偵測。 Tests: 35 passed (test_telegram_adr050 + test_decision_manager_docker_prune_routing) Co-Authored-By: Claude Opus 4.7 (1M context) --- .../api/src/jobs/incident_analysis_sweeper.py | 5 ++- apps/api/src/services/decision_manager.py | 37 +++++++++++++++---- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/apps/api/src/jobs/incident_analysis_sweeper.py b/apps/api/src/jobs/incident_analysis_sweeper.py index ae442b74..fcb994bf 100644 --- a/apps/api/src/jobs/incident_analysis_sweeper.py +++ b/apps/api/src/jobs/incident_analysis_sweeper.py @@ -35,7 +35,10 @@ _SWEEP_INTERVAL_SEC = 90 # 每 90 秒掃一次 _MAX_BATCH = 5 # 每批最多 5 個 _SEMAPHORE_LIMIT = 3 # 最多 3 個並發 AI 分析 _DONE_MARKER_PREFIX = "sweeper_done:" # 輕量標記:已觸發過分析 -_DONE_MARKER_TTL = 3600 # 1 小時 TTL,後續由 get_or_create 去重 +# 2026-05-02 Claude Opus 4.7 + 統帥 ogt:TTL 從 3600s 拉到 86400s。 +# 原因:sweeper_done 過期 → 同 incident 被重新掃描觸發 decision → 通過 telegram dedup 重發。 +# 與 decision_manager.py:574 telegram dedup 24h 對齊,徹底治住「INVESTIGATING 中 INC 每小時被推一次」。 +_DONE_MARKER_TTL = 86400 # 24 小時 TTL,後續由 get_or_create 去重 # 2026-04-16 ogt: 只處理 48h 內的 incident,避免首次啟動把所有歷史舊案洗版到 Telegram _MAX_INCIDENT_AGE_HOURS = 48 diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index 39dcfc48..542db95a 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -204,14 +204,24 @@ async def _push_decision_to_telegram( _smart_truncate as _smt, ) - # 🔴 去重檢查:同一個 incident 10 分鐘內只發一次 + # 🔴 去重檢查:同一 fingerprint (alertname+target) 24h 內只發一次 decision card + # 2026-05-02 Claude Opus 4.7 + 統帥 ogt:原本 dedup_key 用 incident.incident_id, + # 但 incident_id = uuid4()[:6] 隨機,且 incident_analysis_sweeper 每 1h 重觸決策、 + # alertmanager 預設 repeat_interval=4h,導致同症狀 24h 內推 15 次(INC-6FE3BD 鐵證)。 + # 改成 alertname+target 構造的 fingerprint key + TTL 86400s,同症狀共用 dedup。 + # Incident 真正 RESOLVED/CLOSED 時走 line 220-226 的 status check 提早 return,不影響復發偵測。 redis = get_redis() - dedup_key = f"telegram_sent:{incident.incident_id}" + _alertname_fp = (incident.title or "unknown").strip().lower().replace(" ", "_")[:60] + _target_fp = ( + incident.affected_services[0] if incident.affected_services else "unknown" + ).lower()[:40] + dedup_key = f"telegram_sent:fp:{_alertname_fp}:{_target_fp}" if await redis.exists(dedup_key): logger.debug( "telegram_push_skipped", - reason="Already sent within 10 minutes", + reason="Already sent within 24h (fingerprint dedup)", incident_id=incident.incident_id, + fingerprint=f"{_alertname_fp}:{_target_fp}", ) return @@ -570,8 +580,11 @@ async def _push_decision_to_telegram( ) ) - # 🔴 發送成功後設置去重 key (TTL 10 分鐘) - await redis.setex(dedup_key, 600, "1") + # 🔴 發送成功後設置去重 key (TTL 24 小時) + # 2026-05-02 Claude Opus 4.7 + 統帥 ogt:TTL 從 600s 拉到 86400s。 + # 原因:incident_analysis_sweeper 每 1h 重觸 decision、alertmanager 預設 4h repeat, + # 600s TTL 早就過期 → 重發;24h TTL 蓋住兩個週期,徹底治住「同症狀 24h 推 15 次」。 + await redis.setex(dedup_key, 86400, "1") logger.info( "telegram_decision_pushed", @@ -3174,9 +3187,6 @@ class DecisionManager: continue incident_id = data.get("incident_id", "") - dedup_key = f"telegram_sent:{incident_id}" - if await redis.exists(dedup_key): - continue # dedup 還在,跳過 # 取 Incident 資料(確認未 resolved) async with get_db_context() as _db: @@ -3186,6 +3196,17 @@ class DecisionManager: if str(getattr(incident, "status", "")).lower() in ("resolved", "closed"): continue + # 2026-05-02 Claude Opus 4.7 + 統帥 ogt:dedup key 改 fingerprint, + # 與 line 217-218 同邏輯,避免 pod restart resend 路徑繞過 fingerprint dedup。 + # 原本 telegram_sent:{incident_id} TTL 600s 早就過期 → 重啟必重發; + # 改 fingerprint + 24h TTL → 同症狀 24h 內任何 INC ID 都不會重推。 + _alertname_fp = (getattr(incident, "title", None) or "unknown").strip().lower().replace(" ", "_")[:60] + _affected = getattr(incident, "affected_services", None) or [] + _target_fp = (_affected[0] if _affected else "unknown").lower()[:40] + dedup_key = f"telegram_sent:fp:{_alertname_fp}:{_target_fp}" + if await redis.exists(dedup_key): + continue # dedup 還在,跳過 + # 2026-04-15 Claude Sonnet 4.6 (節點 3 修復): # 跳過 > 3 天的 stale incident — labels 已過時,重 process 無意義 # 只會壓爆 Ollama + 污染 Telegram 卡片(截圖:4/11 的卡片今天還在彈)