From 47342dfb34c903e80e5d9156fb731ebe8ae2f134 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Sat, 2 May 2026 17:38:48 +0800
Subject: [PATCH] fix(escalation): dedup escalation card by fingerprint + 24h TTL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

接續 b3a0f0d7(decision card dedup)—— 統帥 17:35 鐵證:4 條 ESCALATION P0 連發(HostOutOfDiskSpace + 3×HostDiskUsageHigh,全 target=node-exporter-110,
全不同 INC ID C9CD6E/FB7944/559B54/C1BBF3)。

decision card 修了但 escalation card 走另一條路徑,根因相同:
- emergency_escalation_service.py:31 dedup key 綁 incident_id (uuid4 隨機)
- TTL 900s 比 sweeper 重觸週期 1h 短

修法:
- escalate_auto_repair_unavailable() 改用 alertname+target fingerprint dedup
- TTL 900s → 86400s,與 decision_manager.py:574 對齊

drift_auto_adopt 路徑暫不動(TTL 已 3600s + report_id 非隨機,非當前問題)。

Tests: 7 passed (escalation/emergency 相關用例)

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 apps/api/src/services/emergency_escalation_service.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/apps/api/src/services/emergency_escalation_service.py b/apps/api/src/services/emergency_escalation_service.py
index dd38053d..24e26509 100644
--- a/apps/api/src/services/emergency_escalation_service.py
+++ b/apps/api/src/services/emergency_escalation_service.py
@@ -28,12 +28,19 @@ async def escalate_auto_repair_unavailable(
 ) -> None:
     """Open an emergency channel when auto repair cannot safely continue."""
 
-    dedup_key = f"auto_repair:emergency_escalated:{incident_id}"
-    if not await _dedup_first_send(dedup_key, ttl=900, event="auto_repair"):
+    # 2026-05-02 Claude Opus 4.7 + 統帥 ogt:dedup key 從 incident_id → fingerprint(alertname+target)
+    # 鐵證:4 條 ESCALATION 卡 17:35-17:36 連發(HostOutOfDiskSpace + 3×HostDiskUsageHigh,全 target=node-exporter-110)
+    # 原本 incident_id 是 uuid4 隨機,TTL 900s 太短 → 同症狀換 INC ID 完全不去重
+    # 改成 alertname+target fingerprint + TTL 86400s,與 decision_manager.py:218 對齊。
+    _alertname_fp = (alert_type or "AutoRepairBlocked").strip().lower().replace(" ", "_")[:60]
+    _target_fp = (target_resource or "unknown").lower()[:40]
+    dedup_key = f"auto_repair:emergency_escalated:fp:{_alertname_fp}:{_target_fp}"
+    if not await _dedup_first_send(dedup_key, ttl=86400, event="auto_repair"):
         logger.info(
             "auto_repair_escalation_dedup_skipped",
             incident_id=incident_id,
             approval_id=approval_id,
+            fingerprint=f"{_alertname_fp}:{_target_fp}",
         )
         return
 