fix(telegram): deduplicate repeated failure updates
This commit is contained in:
@@ -53,6 +53,8 @@ SNOOZE_TTL_SECONDS = 30 * 60 # 30 分鐘
|
||||
SILENCE_TTL_SECONDS = 60 * 60 # 1 hour
|
||||
INCIDENT_UPDATE_DEDUP_PREFIX = "awoooi:tg_update_dedup:" # key suffix: {incident_id}:{status_hash}
|
||||
INCIDENT_UPDATE_DEDUP_TTL_SECONDS = 5 * 60 # an identical status is not re-posted within 5 minutes (avoids flooding the chat)
|
||||
INCIDENT_UPDATE_GLOBAL_FAILURE_DEDUP_PREFIX = "awoooi:tg_update_global_failure_dedup:" # key suffix: first 16 hex chars of the SHA-1 of the status line
|
||||
INCIDENT_UPDATE_GLOBAL_FAILURE_DEDUP_TTL_SECONDS = 10 * 60 # an identical failure summary is pushed only once per 10 minutes across incidents
|
||||
|
||||
# 2026-04-01 Claude Code: Long Polling 分散式 Leader Election
|
||||
# 防止多 Pod 同時 getUpdates → 409 Conflict 互搶問題
|
||||
@@ -69,6 +71,14 @@ def _sanitize_telegram_error(text: str) -> str:
|
||||
"""遮蔽 Telegram Bot URL 中的 token,避免例外字串污染 log / trace。"""
|
||||
return _TELEGRAM_BOT_URL_RE.sub(r"\1<redacted>", text)
|
||||
|
||||
|
||||
def _is_noisy_failure_update(status_line: str) -> bool:
|
||||
"""判斷是否屬於容易跨 incident 洗版的失敗摘要。"""
|
||||
return (
|
||||
"AI 自動修復失敗" in status_line
|
||||
or "AI 診斷工具失敗" in status_line
|
||||
)
|
||||
|
||||
# 2026-04-27 Claude Sonnet 4.6: B3 — LLM 動態 Telegram 按鈕 Feature Flag
|
||||
# true → 優先使用 ActionPlan.recommended_actions 動態生成按鈕
|
||||
# false → 維持現有 callback_action_spec.yaml 路徑(預設,向下相容)
|
||||
@@ -4539,6 +4549,33 @@ class TelegramGateway:
|
||||
error=str(exc),
|
||||
)
|
||||
|
||||
suppress_reply = False
|
||||
if _is_noisy_failure_update(status_line):
|
||||
# 不同 incident 若卡在同一個自動修復/診斷失敗摘要,Telegram 只推第一則;
|
||||
# 每個 incident 仍會繼續移除原卡危險按鈕,完整細節交給 timeline / AwoooP。
|
||||
global_hash = hashlib.sha1(status_line.encode("utf-8")).hexdigest()[:16]
|
||||
global_dedup_key = f"{INCIDENT_UPDATE_GLOBAL_FAILURE_DEDUP_PREFIX}{global_hash}"
|
||||
try:
|
||||
was_global_set = await redis.set(
|
||||
global_dedup_key,
|
||||
incident_id,
|
||||
ex=INCIDENT_UPDATE_GLOBAL_FAILURE_DEDUP_TTL_SECONDS,
|
||||
nx=True,
|
||||
)
|
||||
suppress_reply = not bool(was_global_set)
|
||||
if suppress_reply:
|
||||
logger.info(
|
||||
"append_incident_update_global_failure_dedup_suppressed",
|
||||
incident_id=incident_id,
|
||||
dedup_key=global_dedup_key,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"append_incident_update_global_failure_dedup_failed",
|
||||
incident_id=incident_id,
|
||||
error=str(exc),
|
||||
)
|
||||
|
||||
# Step 1: 取得原始訊息文字(Telegram Bot API 不提供讀取原文,只能在 editMessageText 裡重建)
|
||||
# 策略: 只追加 status_line,不讀取原文(Telegram edit 要傳完整新文字)
|
||||
# 所以先用 editMessageReplyMarkup 換按鈕,再 sendMessage 同 chat 以 reply 方式追加狀態
|
||||
@@ -4565,6 +4602,9 @@ class TelegramGateway:
|
||||
except TelegramGatewayError as e:
|
||||
logger.warning("append_incident_update_edit_buttons_failed", message_id=message_id, error=str(e))
|
||||
|
||||
if suppress_reply:
|
||||
return True
|
||||
|
||||
# Step 2: Reply 原訊息追加狀態(保留原文不動,以 reply 方式延續)
|
||||
try:
|
||||
await self._send_request("sendMessage", {
|
||||
|
||||
@@ -145,17 +145,19 @@ async def test_append_incident_update_deduplicates_same_status(monkeypatch):
|
||||
|
||||
class FakeRedis:
|
||||
def __init__(self):
|
||||
self.set_calls = 0
|
||||
self.values = {}
|
||||
|
||||
async def get(self, key):
|
||||
assert key == "tg_msg:INC-DEDUP"
|
||||
return "12345"
|
||||
|
||||
async def set(self, *args, **kwargs):
|
||||
self.set_calls += 1
|
||||
async def set(self, key, value, **kwargs):
|
||||
assert kwargs["nx"] is True
|
||||
assert kwargs["ex"] > 0
|
||||
return self.set_calls == 1
|
||||
if key in self.values:
|
||||
return False
|
||||
self.values[key] = value
|
||||
return True
|
||||
|
||||
fake_redis = FakeRedis()
|
||||
sent_requests = []
|
||||
@@ -179,6 +181,56 @@ async def test_append_incident_update_deduplicates_same_status(monkeypatch):
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_append_incident_update_suppresses_duplicate_failure_across_incidents(monkeypatch):
    """Different incidents stuck on the same failure summary: only the first one is replied to."""

    class FakeRedis:
        """In-memory stand-in: fixed message ids for ``get``, first-writer-wins for ``set``."""

        # Telegram message ids returned for the two incidents under test.
        _MESSAGE_IDS = {"tg_msg:INC-A": "111", "tg_msg:INC-B": "222"}

        def __init__(self):
            self.values = {}

        async def get(self, key):
            # Unknown keys yield None, like a Redis miss.
            return self._MESSAGE_IDS.get(key)

        async def set(self, key, value, **kwargs):
            # Every dedup write must be a conditional set with a positive expiry.
            assert kwargs["nx"] is True
            assert kwargs["ex"] > 0
            is_new_key = key not in self.values
            if is_new_key:
                self.values[key] = value
            return is_new_key

    fake_redis = FakeRedis()
    sent_requests = []
    gateway = TelegramGateway()

    async def fake_send_request(method, payload):
        # Record each outbound Telegram API call instead of performing it.
        sent_requests.append((method, payload))
        return {"ok": True}

    monkeypatch.setattr(telegram_gateway_module, "get_redis", lambda: fake_redis)
    monkeypatch.setattr(gateway, "_send_request", fake_send_request)

    status_line = (
        "🤖❌ <b>[AUTO] AI 自動修復失敗,已升級人工介入</b>\n"
        "├ 動作: <code>ssh 192.168.0.110 uptime</code>\n"
        "└ 錯誤: unsupported action"
    )

    # Both incidents report the identical failure summary, one after the other.
    for incident_id in ("INC-A", "INC-B"):
        assert await gateway.append_incident_update(incident_id, status_line) is True

    # First incident: buttons edited and a reply sent; second: buttons edited only.
    called_methods = [call[0] for call in sent_requests]
    assert called_methods == [
        "editMessageReplyMarkup",
        "sendMessage",
        "editMessageReplyMarkup",
    ]
|
||||
|
||||
|
||||
class TestSentryErrorMessage:
|
||||
"""測試 Sentry 錯誤訊息"""
|
||||
|
||||
|
||||
@@ -4328,3 +4328,39 @@ Playwright live:
|
||||
```
|
||||
|
||||
判讀:AwoooP Operator Console 已具備第一版可用的處置語義面;下一步應把 Telegram 訊息彙整策略與 `outbound_message` / `conversation_event` 接到同一套視圖,避免群組持續被單筆訊息洗版。
|
||||
|
||||
## 2026-05-06(台北)— Telegram 自動化失敗摘要跨 Incident 去重
|
||||
|
||||
**觸發**:統帥指出 Telegram 群組中 `[AUTO] AI 自動修復失敗` 類訊息仍會連續洗版,且使用者難以區分 AI 已嘗試、AI 不能修、需人工接手。
|
||||
|
||||
### 改動
|
||||
|
||||
- `append_incident_update()` 保留既有「同一 incident 相同狀態 5 分鐘去重」。
|
||||
- 新增「相同失敗摘要跨 incident 10 分鐘去重」:
|
||||
- `AI 自動修復失敗`
|
||||
- `AI 診斷工具失敗`
|
||||
- 第二個以上相同失敗摘要會繼續 `editMessageReplyMarkup`,移除原卡危險按鈕,但不再 `sendMessage` reply 洗群組。
|
||||
- 更新 `TELEGRAM-INCIDENT-NOTIFICATION-MODEL.md`,記錄目前落地行為。
|
||||
|
||||
### 驗證
|
||||
|
||||
```text
|
||||
/Users/ogt/awoooi/apps/api/.venv/bin/python -m py_compile \
|
||||
apps/api/src/services/telegram_gateway.py \
|
||||
apps/api/tests/test_telegram_message_templates.py
|
||||
# passed
|
||||
|
||||
DATABASE_URL='postgresql+asyncpg://test:test@localhost:5432/test' \
|
||||
/Users/ogt/awoooi/apps/api/.venv/bin/python -m pytest \
|
||||
apps/api/tests/test_telegram_message_templates.py -q
|
||||
# 20 passed
|
||||
|
||||
/Users/ogt/awoooi/apps/api/.venv/bin/python -m ruff check \
|
||||
apps/api/tests/test_telegram_message_templates.py
|
||||
# All checks passed
|
||||
```
|
||||
|
||||
### 判讀
|
||||
|
||||
- 這不是靜默告警;第一個失敗摘要仍會送到戰情室。
|
||||
- 後續同樣失敗摘要收斂到 AwoooP Run / Timeline / Audit,Telegram 只保留人類注意力入口。
|
||||
|
||||
@@ -37,6 +37,7 @@ Telegram 不應是完整執行日誌,也不應承載所有 AI 推理細節。T
|
||||
|
||||
- `TelegramMessage` 主卡新增「處置狀態」。
|
||||
- `append_incident_update()` 對同一 incident 的相同狀態做 5 分鐘 Redis 去重。
|
||||
- `append_incident_update()` 對相同的「AI 自動修復失敗 / AI 診斷工具失敗」摘要增加 10 分鐘跨 incident 去重;每個 incident 仍會移除原卡危險按鈕,但 Telegram 不再重複 reply 同一個失敗摘要。
|
||||
- 既有 `詳情 / 重診 / 歷史` 按鈕保留,讓 Telegram 保持輕量,細節回到控制台。
|
||||
|
||||
## 後續建議
|
||||
|
||||
Reference in New Issue
Block a user