From a84a5a0bc4a672ac6feb95a85ac590aa2dd4bb71 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 24 Jun 2026 02:00:25 +0800 Subject: [PATCH] fix(api): suppress healthy Telegram heartbeat noise --- apps/api/src/services/telegram_gateway.py | 34 ++++++++++++--------- apps/api/tests/test_heartbeat_dedup_p0_4.py | 32 +++++++++---------- 2 files changed, 35 insertions(+), 31 deletions(-) diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index 33b9d692..897f89bd 100644 --- a/apps/api/src/services/telegram_gateway.py +++ b/apps/api/src/services/telegram_gateway.py @@ -10109,33 +10109,38 @@ class TelegramGateway: report = await HeartbeatReportService().collect() - # 2026-05-03 Claude Opus 4.7 + 統帥 ogt:P0 #4 heartbeat 噪音降頻 - # 鐵證:原本 30min/次 = 一天 48 條,統帥每天看相同內容 = 變相重複告警 + # 2026-06-24 Codex + ogt:心跳成功訊息不再即時推 Telegram。 + # 鐵證:每 30 分鐘「AWOOOI 心跳 / 告警鏈路正常」會洗版; + # 正常狀態應只留在 metrics/log/每日摘要,Telegram 只承載異常、 + # warnings 變化與恢復通知。 # 修法(不違反「監控工具必須被監控」鐵律): - # 健康(無 warnings)→ 6h 內最多 1 次「我活著」訊號 + # 健康(無 warnings、且沒有上一輪 warning)→ 不推 Telegram # 有 warnings 跟上次相同 → 跳過(hash 對比) # 有 warnings 跟上次不同 → 立即推送(新狀況不漏) + # warnings 消失 → 只推一次恢復通知,之後回到安靜 import hashlib - SILENT_REPORT_INTERVAL_HOURS = 6 WARNINGS_HASH_TTL = 24 * 3600 - silent_key = "heartbeat:silent_last_sent" + healthy_suppressed_key = "heartbeat:healthy_suppressed_last_seen" warnings_hash_key = "heartbeat:warnings_hash" warnings_str = "|".join(sorted(report.warnings)) warnings_hash = hashlib.md5(warnings_str.encode()).hexdigest()[:12] if not report.warnings: - # 健康狀態:6h 1 次「我活著」訊號 - if await redis_client.exists(silent_key): - logger.debug( - "telegram_heartbeat_skipped_silent_recent", + # 健康狀態:沒有上一輪 warning 時不送 Telegram,避免成功心跳洗版。 + last_hash_raw = await redis_client.get(warnings_hash_key) + if not last_hash_raw: + await redis_client.setex( + healthy_suppressed_key, + WARNINGS_HASH_TTL, + "1", + ) + logger.info( + "telegram_heartbeat_healthy_suppressed", slot_id=slot_id, ) return True - await redis_client.setex( - silent_key, SILENT_REPORT_INTERVAL_HOURS * 3600, "1", - ) - # 清掉舊的 warnings hash(從有事 → 健康,下次有事要立即推) + # 從有事 → 健康:清掉舊 hash,並允許下方送一次恢復通知。 await redis_client.delete(warnings_hash_key) else: # 有事:跟上次同 hash 跳過 @@ -10154,8 +10159,7 @@ class TelegramGateway: await redis_client.setex( warnings_hash_key, WARNINGS_HASH_TTL, warnings_hash, ) - # 清掉 silent marker(從健康 → 有事,下次健康要過 6h 才再推) - await redis_client.delete(silent_key) + await redis_client.delete(healthy_suppressed_key) text = report_to_telegram_html(report) diff --git a/apps/api/tests/test_heartbeat_dedup_p0_4.py b/apps/api/tests/test_heartbeat_dedup_p0_4.py index e2b223c9..1d8733e6 100644 --- a/apps/api/tests/test_heartbeat_dedup_p0_4.py +++ b/apps/api/tests/test_heartbeat_dedup_p0_4.py @@ -2,11 +2,11 @@ P0 #4 heartbeat 噪音降頻測試 2026-05-03 Claude Opus 4.7 + 統帥 ogt -驗證 send_heartbeat() 的 dedup 邏輯: - 健康(無 warnings)→ 6h 內最多 1 次 +驗證 send_heartbeat() 的低噪音邏輯: + 健康(無 warnings)→ 不即時推 Telegram,只留 metrics/log/每日摘要 有 warnings 且跟上次同 hash → 跳過 有 warnings 且跟上次不同 → 立即推送 - 健康 ↔ 有事 切換時清掉相反 marker + warnings 消失 → 只推一次恢復通知 直接測 telegram_gateway.send_heartbeat(),mock 掉 redis + report + send_to_group。 """ @@ -87,12 +87,12 @@ class TestHeartbeatDedup: """P0 #4 heartbeat 降頻邏輯""" @pytest.mark.asyncio - async def test_healthy_first_send_goes_through( + async def test_healthy_first_send_is_suppressed( self, gateway_with_fake_redis, sre_group_configured, ): - """健康狀態第一次推送(無 silent marker)→ 推送""" + """健康狀態第一次檢查 → 不推 Telegram,只記錄 suppressed marker""" gw, fake_redis = gateway_with_fake_redis with patch("src.core.redis_client.get_redis", return_value=fake_redis), \ @@ -104,19 +104,19 @@ class TestHeartbeatDedup: result = await gw.send_heartbeat() assert result is True - assert "heartbeat:silent_last_sent" in fake_redis._store - gw.send_to_group.assert_called_once() + assert "heartbeat:healthy_suppressed_last_seen" in fake_redis._store + gw.send_to_group.assert_not_called() gw.send_notification.assert_not_called() @pytest.mark.asyncio - async def test_healthy_second_send_within_6h_skipped( + async def test_healthy_second_send_stays_suppressed( self, gateway_with_fake_redis, sre_group_configured, ): - """健康狀態 6h 內第二次推送 → 跳過""" + """健康狀態第二次檢查 → 仍不推 Telegram""" gw, fake_redis = gateway_with_fake_redis - fake_redis.preset("heartbeat:silent_last_sent") # 模擬已有 silent marker + fake_redis.preset("heartbeat:healthy_suppressed_last_seen") with patch("src.core.redis_client.get_redis", return_value=fake_redis), \ patch("src.services.heartbeat_report_service.HeartbeatReportService") as MockSvc, \ @@ -127,7 +127,6 @@ class TestHeartbeatDedup: result = await gw.send_heartbeat() assert result is True - # 不該呼叫 send(被跳過) gw.send_to_group.assert_not_called() gw.send_notification.assert_not_called() @@ -202,17 +201,18 @@ class TestHeartbeatDedup: await gw.send_heartbeat() assert "heartbeat:warnings_hash" in fake_redis.delete_calls - assert "heartbeat:silent_last_sent" in fake_redis._store + gw.send_to_group.assert_called_once() + gw.send_notification.assert_not_called() @pytest.mark.asyncio - async def test_healthy_to_warnings_clears_silent_marker( + async def test_healthy_to_warnings_clears_suppressed_marker( self, gateway_with_fake_redis, sre_group_configured, ): - """從健康 → 有事:清掉 silent marker,下次靜默過 6h 才再推""" + """從健康 → 有事:清掉 healthy suppressed marker,並推送新 warning""" gw, fake_redis = gateway_with_fake_redis - fake_redis.preset("heartbeat:silent_last_sent") + fake_redis.preset("heartbeat:healthy_suppressed_last_seen") with patch("src.core.redis_client.get_redis", return_value=fake_redis), \ patch("src.services.heartbeat_report_service.HeartbeatReportService") as MockSvc, \ @@ -224,5 +224,5 @@ class TestHeartbeatDedup: await gw.send_heartbeat() - assert "heartbeat:silent_last_sent" in fake_redis.delete_calls + assert "heartbeat:healthy_suppressed_last_seen" in fake_redis.delete_calls assert "heartbeat:warnings_hash" in fake_redis._store