fix(api): suppress healthy Telegram heartbeat noise
This commit is contained in:
@@ -10109,33 +10109,38 @@ class TelegramGateway:
|
||||
|
||||
report = await HeartbeatReportService().collect()
|
||||
|
||||
# 2026-05-03 Claude Opus 4.7 + 統帥 ogt:P0 #4 heartbeat 噪音降頻
|
||||
# 鐵證:原本 30min/次 = 一天 48 條,統帥每天看相同內容 = 變相重複告警
|
||||
# 2026-06-24 Codex + ogt:心跳成功訊息不再即時推 Telegram。
|
||||
# 鐵證:每 30 分鐘「AWOOOI 心跳 / 告警鏈路正常」會洗版;
|
||||
# 正常狀態應只留在 metrics/log/每日摘要,Telegram 只承載異常、
|
||||
# warnings 變化與恢復通知。
|
||||
# 修法(不違反「監控工具必須被監控」鐵律):
|
||||
# 健康(無 warnings)→ 6h 內最多 1 次「我活著」訊號
|
||||
# 健康(無 warnings、且沒有上一輪 warning)→ 不推 Telegram
|
||||
# 有 warnings 跟上次相同 → 跳過(hash 對比)
|
||||
# 有 warnings 跟上次不同 → 立即推送(新狀況不漏)
|
||||
# warnings 消失 → 只推一次恢復通知,之後回到安靜
|
||||
import hashlib
|
||||
SILENT_REPORT_INTERVAL_HOURS = 6
|
||||
WARNINGS_HASH_TTL = 24 * 3600
|
||||
silent_key = "heartbeat:silent_last_sent"
|
||||
healthy_suppressed_key = "heartbeat:healthy_suppressed_last_seen"
|
||||
warnings_hash_key = "heartbeat:warnings_hash"
|
||||
|
||||
warnings_str = "|".join(sorted(report.warnings))
|
||||
warnings_hash = hashlib.md5(warnings_str.encode()).hexdigest()[:12]
|
||||
|
||||
if not report.warnings:
|
||||
# 健康狀態:6h 1 次「我活著」訊號
|
||||
if await redis_client.exists(silent_key):
|
||||
logger.debug(
|
||||
"telegram_heartbeat_skipped_silent_recent",
|
||||
# 健康狀態:沒有上一輪 warning 時不送 Telegram,避免成功心跳洗版。
|
||||
last_hash_raw = await redis_client.get(warnings_hash_key)
|
||||
if not last_hash_raw:
|
||||
await redis_client.setex(
|
||||
healthy_suppressed_key,
|
||||
WARNINGS_HASH_TTL,
|
||||
"1",
|
||||
)
|
||||
logger.info(
|
||||
"telegram_heartbeat_healthy_suppressed",
|
||||
slot_id=slot_id,
|
||||
)
|
||||
return True
|
||||
await redis_client.setex(
|
||||
silent_key, SILENT_REPORT_INTERVAL_HOURS * 3600, "1",
|
||||
)
|
||||
# 清掉舊的 warnings hash(從有事 → 健康,下次有事要立即推)
|
||||
# 從有事 → 健康:清掉舊 hash,並允許下方送一次恢復通知。
|
||||
await redis_client.delete(warnings_hash_key)
|
||||
else:
|
||||
# 有事:跟上次同 hash 跳過
|
||||
@@ -10154,8 +10159,7 @@ class TelegramGateway:
|
||||
await redis_client.setex(
|
||||
warnings_hash_key, WARNINGS_HASH_TTL, warnings_hash,
|
||||
)
|
||||
# 清掉 silent marker(從健康 → 有事,下次健康要過 6h 才再推)
|
||||
await redis_client.delete(silent_key)
|
||||
await redis_client.delete(healthy_suppressed_key)
|
||||
|
||||
text = report_to_telegram_html(report)
|
||||
|
||||
|
||||
@@ -2,11 +2,11 @@
|
||||
P0 #4 heartbeat 噪音降頻測試
|
||||
2026-05-03 Claude Opus 4.7 + 統帥 ogt
|
||||
|
||||
驗證 send_heartbeat() 的 dedup 邏輯:
|
||||
健康(無 warnings)→ 6h 內最多 1 次
|
||||
驗證 send_heartbeat() 的低噪音邏輯:
|
||||
健康(無 warnings)→ 不即時推 Telegram,只留 metrics/log/每日摘要
|
||||
有 warnings 且跟上次同 hash → 跳過
|
||||
有 warnings 且跟上次不同 → 立即推送
|
||||
健康 ↔ 有事 切換時清掉相反 marker
|
||||
warnings 消失 → 只推一次恢復通知
|
||||
|
||||
直接測 telegram_gateway.send_heartbeat(),mock 掉 redis + report + send_to_group。
|
||||
"""
|
||||
@@ -87,12 +87,12 @@ class TestHeartbeatDedup:
|
||||
"""P0 #4 heartbeat 降頻邏輯"""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_healthy_first_send_goes_through(
|
||||
async def test_healthy_first_send_is_suppressed(
|
||||
self,
|
||||
gateway_with_fake_redis,
|
||||
sre_group_configured,
|
||||
):
|
||||
"""健康狀態第一次推送(無 silent marker)→ 推送"""
|
||||
"""健康狀態第一次檢查 → 不推 Telegram,只記錄 suppressed marker"""
|
||||
gw, fake_redis = gateway_with_fake_redis
|
||||
|
||||
with patch("src.core.redis_client.get_redis", return_value=fake_redis), \
|
||||
@@ -104,19 +104,19 @@ class TestHeartbeatDedup:
|
||||
result = await gw.send_heartbeat()
|
||||
|
||||
assert result is True
|
||||
assert "heartbeat:silent_last_sent" in fake_redis._store
|
||||
gw.send_to_group.assert_called_once()
|
||||
assert "heartbeat:healthy_suppressed_last_seen" in fake_redis._store
|
||||
gw.send_to_group.assert_not_called()
|
||||
gw.send_notification.assert_not_called()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_healthy_second_send_within_6h_skipped(
|
||||
async def test_healthy_second_send_stays_suppressed(
|
||||
self,
|
||||
gateway_with_fake_redis,
|
||||
sre_group_configured,
|
||||
):
|
||||
"""健康狀態 6h 內第二次推送 → 跳過"""
|
||||
"""健康狀態第二次檢查 → 仍不推 Telegram"""
|
||||
gw, fake_redis = gateway_with_fake_redis
|
||||
fake_redis.preset("heartbeat:silent_last_sent") # 模擬已有 silent marker
|
||||
fake_redis.preset("heartbeat:healthy_suppressed_last_seen")
|
||||
|
||||
with patch("src.core.redis_client.get_redis", return_value=fake_redis), \
|
||||
patch("src.services.heartbeat_report_service.HeartbeatReportService") as MockSvc, \
|
||||
@@ -127,7 +127,6 @@ class TestHeartbeatDedup:
|
||||
result = await gw.send_heartbeat()
|
||||
|
||||
assert result is True
|
||||
# 不該呼叫 send(被跳過)
|
||||
gw.send_to_group.assert_not_called()
|
||||
gw.send_notification.assert_not_called()
|
||||
|
||||
@@ -202,17 +201,18 @@ class TestHeartbeatDedup:
|
||||
await gw.send_heartbeat()
|
||||
|
||||
assert "heartbeat:warnings_hash" in fake_redis.delete_calls
|
||||
assert "heartbeat:silent_last_sent" in fake_redis._store
|
||||
gw.send_to_group.assert_called_once()
|
||||
gw.send_notification.assert_not_called()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_healthy_to_warnings_clears_silent_marker(
|
||||
async def test_healthy_to_warnings_clears_suppressed_marker(
|
||||
self,
|
||||
gateway_with_fake_redis,
|
||||
sre_group_configured,
|
||||
):
|
||||
"""從健康 → 有事:清掉 silent marker,下次靜默過 6h 才再推"""
|
||||
"""從健康 → 有事:清掉 healthy suppressed marker,並推送新 warning"""
|
||||
gw, fake_redis = gateway_with_fake_redis
|
||||
fake_redis.preset("heartbeat:silent_last_sent")
|
||||
fake_redis.preset("heartbeat:healthy_suppressed_last_seen")
|
||||
|
||||
with patch("src.core.redis_client.get_redis", return_value=fake_redis), \
|
||||
patch("src.services.heartbeat_report_service.HeartbeatReportService") as MockSvc, \
|
||||
@@ -224,5 +224,5 @@ class TestHeartbeatDedup:
|
||||
|
||||
await gw.send_heartbeat()
|
||||
|
||||
assert "heartbeat:silent_last_sent" in fake_redis.delete_calls
|
||||
assert "heartbeat:healthy_suppressed_last_seen" in fake_redis.delete_calls
|
||||
assert "heartbeat:warnings_hash" in fake_redis._store
|
||||
|
||||
Reference in New Issue
Block a user