diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index a1101a5d..eed64baa 100644 --- a/apps/api/src/services/telegram_gateway.py +++ b/apps/api/src/services/telegram_gateway.py @@ -53,6 +53,8 @@ SNOOZE_TTL_SECONDS = 30 * 60 # 30 分鐘 SILENCE_TTL_SECONDS = 60 * 60 # 1 小時 INCIDENT_UPDATE_DEDUP_PREFIX = "awoooi:tg_update_dedup:" # {incident_id}:{status_hash} INCIDENT_UPDATE_DEDUP_TTL_SECONDS = 5 * 60 # 5 分鐘內相同狀態不重複洗版 +INCIDENT_UPDATE_GLOBAL_FAILURE_DEDUP_PREFIX = "awoooi:tg_update_global_failure_dedup:" +INCIDENT_UPDATE_GLOBAL_FAILURE_DEDUP_TTL_SECONDS = 10 * 60 # 相同失敗摘要跨 incident 10 分鐘只推一次 # 2026-04-01 Claude Code: Long Polling 分散式 Leader Election # 防止多 Pod 同時 getUpdates → 409 Conflict 互搶問題 @@ -69,6 +71,14 @@ def _sanitize_telegram_error(text: str) -> str: """遮蔽 Telegram Bot URL 中的 token,避免例外字串污染 log / trace。""" return _TELEGRAM_BOT_URL_RE.sub(r"\1", text) + +def _is_noisy_failure_update(status_line: str) -> bool: + """判斷是否屬於容易跨 incident 洗版的失敗摘要。""" + return ( + "AI 自動修復失敗" in status_line + or "AI 診斷工具失敗" in status_line + ) + # 2026-04-27 Claude Sonnet 4.6: B3 — LLM 動態 Telegram 按鈕 Feature Flag # true → 優先使用 ActionPlan.recommended_actions 動態生成按鈕 # false → 維持現有 callback_action_spec.yaml 路徑(預設,向下相容) @@ -4539,6 +4549,33 @@ class TelegramGateway: error=str(exc), ) + suppress_reply = False + if _is_noisy_failure_update(status_line): + # 不同 incident 若卡在同一個自動修復/診斷失敗摘要,Telegram 只推第一則; + # 每個 incident 仍會繼續移除原卡危險按鈕,完整細節交給 timeline / AwoooP。 + global_hash = hashlib.sha1(status_line.encode("utf-8")).hexdigest()[:16] + global_dedup_key = f"{INCIDENT_UPDATE_GLOBAL_FAILURE_DEDUP_PREFIX}{global_hash}" + try: + was_global_set = await redis.set( + global_dedup_key, + incident_id, + ex=INCIDENT_UPDATE_GLOBAL_FAILURE_DEDUP_TTL_SECONDS, + nx=True, + ) + suppress_reply = not bool(was_global_set) + if suppress_reply: + logger.info( + "append_incident_update_global_failure_dedup_suppressed", + incident_id=incident_id, + dedup_key=global_dedup_key, + ) + except Exception as exc: + logger.warning( + "append_incident_update_global_failure_dedup_failed", + incident_id=incident_id, + error=str(exc), + ) + # Step 1: 取得原始訊息文字(Telegram Bot API 不提供讀取原文,只能在 editMessageText 裡重建) # 策略: 只追加 status_line,不讀取原文(Telegram edit 要傳完整新文字) # 所以先用 editMessageReplyMarkup 換按鈕,再 sendMessage 同 chat 以 reply 方式追加狀態 @@ -4565,6 +4602,9 @@ class TelegramGateway: except TelegramGatewayError as e: logger.warning("append_incident_update_edit_buttons_failed", message_id=message_id, error=str(e)) + if suppress_reply: + return True + # Step 2: Reply 原訊息追加狀態(保留原文不動,以 reply 方式延續) try: await self._send_request("sendMessage", { diff --git a/apps/api/tests/test_telegram_message_templates.py b/apps/api/tests/test_telegram_message_templates.py index 9d3104d1..d0119031 100644 --- a/apps/api/tests/test_telegram_message_templates.py +++ b/apps/api/tests/test_telegram_message_templates.py @@ -145,17 +145,19 @@ async def test_append_incident_update_deduplicates_same_status(monkeypatch): class FakeRedis: def __init__(self): - self.set_calls = 0 + self.values = {} async def get(self, key): assert key == "tg_msg:INC-DEDUP" return "12345" - async def set(self, *args, **kwargs): - self.set_calls += 1 + async def set(self, key, value, **kwargs): assert kwargs["nx"] is True assert kwargs["ex"] > 0 - return self.set_calls == 1 + if key in self.values: + return False + self.values[key] = value + return True fake_redis = FakeRedis() sent_requests = [] @@ -179,6 +181,56 @@ async def test_append_incident_update_deduplicates_same_status(monkeypatch): ] +@pytest.mark.asyncio +async def test_append_incident_update_suppresses_duplicate_failure_across_incidents(monkeypatch): + """不同 Incident 卡在相同失敗摘要時,只回覆第一則,避免 Telegram 洗版。""" + + class FakeRedis: + def __init__(self): + self.values = {} + + async def get(self, key): + if key == "tg_msg:INC-A": + return "111" + if key == "tg_msg:INC-B": + return "222" + return None + + async def set(self, key, value, **kwargs): + assert kwargs["nx"] is True + assert kwargs["ex"] > 0 + if key in self.values: + return False + self.values[key] = value + return True + + fake_redis = FakeRedis() + sent_requests = [] + gateway = TelegramGateway() + + async def fake_send_request(method, payload): + sent_requests.append((method, payload)) + return {"ok": True} + + monkeypatch.setattr(telegram_gateway_module, "get_redis", lambda: fake_redis) + monkeypatch.setattr(gateway, "_send_request", fake_send_request) + + status_line = ( + "🤖❌ [AUTO] AI 自動修復失敗,已升級人工介入\n" + "├ 動作: ssh 192.168.0.110 uptime\n" + "└ 錯誤: unsupported action" + ) + + assert await gateway.append_incident_update("INC-A", status_line) is True + assert await gateway.append_incident_update("INC-B", status_line) is True + + assert [method for method, _ in sent_requests] == [ + "editMessageReplyMarkup", + "sendMessage", + "editMessageReplyMarkup", + ] + + class TestSentryErrorMessage: """測試 Sentry 錯誤訊息""" diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 72e8c296..5cdda0d4 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -4328,3 +4328,39 @@ Playwright live: ``` 判讀:AwoooP Operator Console 已具備第一版可用的處置語義面;下一步應把 Telegram 訊息彙整策略與 `outbound_message` / `conversation_event` 接到同一套視圖,避免群組持續被單筆訊息洗版。 + +## 2026-05-06(台北)— Telegram 自動化失敗摘要跨 Incident 去重 + +**觸發**:統帥指出 Telegram 群組中 `[AUTO] AI 自動修復失敗` 類訊息仍會連續洗版,且使用者難以區分 AI 已嘗試、AI 不能修、需人工接手。 + +### 改動 + +- `append_incident_update()` 保留既有「同一 incident 相同狀態 5 分鐘去重」。 +- 新增「相同失敗摘要跨 incident 10 分鐘去重」: + - `AI 自動修復失敗` + - `AI 診斷工具失敗` +- 第二個以上相同失敗摘要會繼續 `editMessageReplyMarkup`,移除原卡危險按鈕,但不再 `sendMessage` reply 洗群組。 +- 更新 `TELEGRAM-INCIDENT-NOTIFICATION-MODEL.md`,記錄目前落地行為。 + +### 驗證 + +```text +/Users/ogt/awoooi/apps/api/.venv/bin/python -m py_compile \ + apps/api/src/services/telegram_gateway.py \ + apps/api/tests/test_telegram_message_templates.py +# passed + +DATABASE_URL='postgresql+asyncpg://test:test@localhost:5432/test' \ + /Users/ogt/awoooi/apps/api/.venv/bin/python -m pytest \ + apps/api/tests/test_telegram_message_templates.py -q +# 20 passed + +/Users/ogt/awoooi/apps/api/.venv/bin/python -m ruff check \ + apps/api/tests/test_telegram_message_templates.py +# All checks passed +``` + +### 判讀 + +- 這不是靜默告警;第一個失敗摘要仍會送到戰情室。 +- 後續同樣失敗摘要收斂到 AwoooP Run / Timeline / Audit,Telegram 只保留人類注意力入口。 diff --git a/docs/awooop/TELEGRAM-INCIDENT-NOTIFICATION-MODEL.md b/docs/awooop/TELEGRAM-INCIDENT-NOTIFICATION-MODEL.md index 52b4e16e..34d64ce5 100644 --- a/docs/awooop/TELEGRAM-INCIDENT-NOTIFICATION-MODEL.md +++ b/docs/awooop/TELEGRAM-INCIDENT-NOTIFICATION-MODEL.md @@ -37,6 +37,7 @@ Telegram 不應是完整執行日誌,也不應承載所有 AI 推理細節。T - `TelegramMessage` 主卡新增「處置狀態」。 - `append_incident_update()` 對同一 incident 的相同狀態做 5 分鐘 Redis 去重。 +- `append_incident_update()` 對相同的「AI 自動修復失敗 / AI 診斷工具失敗」摘要增加 10 分鐘跨 incident 去重;每個 incident 仍會移除原卡危險按鈕,但 Telegram 不再重複 reply 同一個失敗摘要。 - 既有 `詳情 / 重診 / 歷史` 按鈕保留,讓 Telegram 保持輕量,細節回到控制台。 ## 後續建議