fix(telegram): deduplicate repeated failure updates
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
CD Pipeline / tests (push) Successful in 1m4s
CD Pipeline / build-and-deploy (push) Successful in 3m47s
CD Pipeline / post-deploy-checks (push) Successful in 1m20s

This commit is contained in:
Your Name
2026-05-06 22:36:44 +08:00
parent 886657473e
commit c5964fbcd3
4 changed files with 133 additions and 4 deletions

View File

@@ -53,6 +53,8 @@ SNOOZE_TTL_SECONDS = 30 * 60 # 30 分鐘
SILENCE_TTL_SECONDS = 60 * 60 # 1 hour
INCIDENT_UPDATE_DEDUP_PREFIX = "awoooi:tg_update_dedup:" # {incident_id}:{status_hash}
INCIDENT_UPDATE_DEDUP_TTL_SECONDS = 5 * 60 # the same status is not re-posted within 5 minutes
# Key suffix is a truncated hash of the status line (no incident id), so the key is shared across incidents.
INCIDENT_UPDATE_GLOBAL_FAILURE_DEDUP_PREFIX = "awoooi:tg_update_global_failure_dedup:"
INCIDENT_UPDATE_GLOBAL_FAILURE_DEDUP_TTL_SECONDS = 10 * 60 # an identical failure summary is pushed only once per 10 minutes across incidents
# 2026-04-01 Claude Code: Long Polling 分散式 Leader Election
# 防止多 Pod 同時 getUpdates → 409 Conflict 互搶問題
@@ -69,6 +71,14 @@ def _sanitize_telegram_error(text: str) -> str:
"""遮蔽 Telegram Bot URL 中的 token避免例外字串污染 log / trace。"""
return _TELEGRAM_BOT_URL_RE.sub(r"\1<redacted>", text)
def _is_noisy_failure_update(status_line: str) -> bool:
"""判斷是否屬於容易跨 incident 洗版的失敗摘要。"""
return (
"AI 自動修復失敗" in status_line
or "AI 診斷工具失敗" in status_line
)
# 2026-04-27 Claude Sonnet 4.6: B3 — LLM 動態 Telegram 按鈕 Feature Flag
# true → 優先使用 ActionPlan.recommended_actions 動態生成按鈕
# false → 維持現有 callback_action_spec.yaml 路徑(預設,向下相容)
@@ -4539,6 +4549,33 @@ class TelegramGateway:
error=str(exc),
)
suppress_reply = False
if _is_noisy_failure_update(status_line):
# 不同 incident 若卡在同一個自動修復/診斷失敗摘要Telegram 只推第一則;
# 每個 incident 仍會繼續移除原卡危險按鈕,完整細節交給 timeline / AwoooP。
global_hash = hashlib.sha1(status_line.encode("utf-8")).hexdigest()[:16]
global_dedup_key = f"{INCIDENT_UPDATE_GLOBAL_FAILURE_DEDUP_PREFIX}{global_hash}"
try:
was_global_set = await redis.set(
global_dedup_key,
incident_id,
ex=INCIDENT_UPDATE_GLOBAL_FAILURE_DEDUP_TTL_SECONDS,
nx=True,
)
suppress_reply = not bool(was_global_set)
if suppress_reply:
logger.info(
"append_incident_update_global_failure_dedup_suppressed",
incident_id=incident_id,
dedup_key=global_dedup_key,
)
except Exception as exc:
logger.warning(
"append_incident_update_global_failure_dedup_failed",
incident_id=incident_id,
error=str(exc),
)
# Step 1: 取得原始訊息文字Telegram Bot API 不提供讀取原文,只能在 editMessageText 裡重建)
# 策略: 只追加 status_line不讀取原文Telegram edit 要傳完整新文字)
# 所以先用 editMessageReplyMarkup 換按鈕,再 sendMessage 同 chat 以 reply 方式追加狀態
@@ -4565,6 +4602,9 @@ class TelegramGateway:
except TelegramGatewayError as e:
logger.warning("append_incident_update_edit_buttons_failed", message_id=message_id, error=str(e))
if suppress_reply:
return True
# Step 2: Reply 原訊息追加狀態(保留原文不動,以 reply 方式延續)
try:
await self._send_request("sendMessage", {

View File

@@ -145,17 +145,19 @@ async def test_append_incident_update_deduplicates_same_status(monkeypatch):
class FakeRedis:
def __init__(self):
self.set_calls = 0
self.values = {}
async def get(self, key):
assert key == "tg_msg:INC-DEDUP"
return "12345"
async def set(self, *args, **kwargs):
self.set_calls += 1
async def set(self, key, value, **kwargs):
assert kwargs["nx"] is True
assert kwargs["ex"] > 0
return self.set_calls == 1
if key in self.values:
return False
self.values[key] = value
return True
fake_redis = FakeRedis()
sent_requests = []
@@ -179,6 +181,56 @@ async def test_append_incident_update_deduplicates_same_status(monkeypatch):
]
@pytest.mark.asyncio
async def test_append_incident_update_suppresses_duplicate_failure_across_incidents(monkeypatch):
    """Only the first incident gets a reply when incidents share one failure summary.

    Two different incidents report the exact same failure status line. Both
    calls must succeed and both must still edit the reply markup of their own
    card, but only the first one may send a follow-up ``sendMessage``.
    """

    class FakeRedis:
        """In-memory stand-in providing only ``get`` and ``set(..., nx=, ex=)``."""

        def __init__(self):
            self.values = {}

        async def get(self, key):
            # Presumably the Telegram message id stored per incident — confirm
            # against the gateway's "tg_msg:" key usage.
            if key == "tg_msg:INC-A":
                return "111"
            if key == "tg_msg:INC-B":
                return "222"
            return None

        async def set(self, key, value, **kwargs):
            # Every dedup write is expected to be SET NX with a positive TTL.
            assert kwargs["nx"] is True
            assert kwargs["ex"] > 0
            if key in self.values:
                return False
            self.values[key] = value
            return True

    fake_redis = FakeRedis()
    sent_requests = []
    gateway = TelegramGateway()

    async def fake_send_request(method, payload):
        # Record outgoing Telegram calls instead of hitting the network.
        sent_requests.append((method, payload))
        return {"ok": True}

    monkeypatch.setattr(telegram_gateway_module, "get_redis", lambda: fake_redis)
    monkeypatch.setattr(gateway, "_send_request", fake_send_request)

    status_line = (
        "🤖❌ <b>[AUTO] AI 自動修復失敗,已升級人工介入</b>\n"
        "├ 動作: <code>ssh 192.168.0.110 uptime</code>\n"
        "└ 錯誤: unsupported action"
    )

    assert await gateway.append_incident_update("INC-A", status_line) is True
    assert await gateway.append_incident_update("INC-B", status_line) is True

    # INC-A: edit buttons + reply; INC-B: edit buttons only (reply suppressed).
    assert [method for method, _ in sent_requests] == [
        "editMessageReplyMarkup",
        "sendMessage",
        "editMessageReplyMarkup",
    ]

    # The suppression must come from the cross-incident key: exactly one such
    # key exists and it was claimed by the first incident.
    global_prefix = telegram_gateway_module.INCIDENT_UPDATE_GLOBAL_FAILURE_DEDUP_PREFIX
    global_keys = [key for key in fake_redis.values if key.startswith(global_prefix)]
    assert len(global_keys) == 1
    assert fake_redis.values[global_keys[0]] == "INC-A"
class TestSentryErrorMessage:
"""測試 Sentry 錯誤訊息"""

View File

@@ -4328,3 +4328,39 @@ Playwright live:
```
判讀:AwoooP Operator Console 已具備第一版可用的處置語義面;下一步應把 Telegram 訊息彙整策略與 `outbound_message` / `conversation_event` 接到同一套視圖,避免群組持續被單筆訊息洗版。
## 2026-05-06(台北)— Telegram 自動化失敗摘要跨 Incident 去重
**觸發**:統帥指出 Telegram 群組中 `[AUTO] AI 自動修復失敗` 類訊息仍會連續洗版,且使用者難以區分 AI 已嘗試、AI 不能修、需人工接手。
### 改動
- `append_incident_update()` 保留既有「同一 incident 相同狀態 5 分鐘去重」。
- 新增「相同失敗摘要跨 incident 10 分鐘去重」:
- `AI 自動修復失敗`
- `AI 診斷工具失敗`
- 第二個以上相同失敗摘要會繼續 `editMessageReplyMarkup`,移除原卡危險按鈕,但不再 `sendMessage` reply 洗群組。
- 更新 `TELEGRAM-INCIDENT-NOTIFICATION-MODEL.md`,記錄目前落地行為。
### 驗證
```text
/Users/ogt/awoooi/apps/api/.venv/bin/python -m py_compile \
apps/api/src/services/telegram_gateway.py \
apps/api/tests/test_telegram_message_templates.py
# passed
DATABASE_URL='postgresql+asyncpg://test:test@localhost:5432/test' \
/Users/ogt/awoooi/apps/api/.venv/bin/python -m pytest \
apps/api/tests/test_telegram_message_templates.py -q
# 20 passed
/Users/ogt/awoooi/apps/api/.venv/bin/python -m ruff check \
apps/api/tests/test_telegram_message_templates.py
# All checks passed
```
### 判讀
- 這不是靜默告警;第一個失敗摘要仍會送到戰情室。
- 後續同樣失敗摘要收斂到 AwoooP Run / Timeline / Audit,Telegram 只保留人類注意力入口。

View File

@@ -37,6 +37,7 @@ Telegram 不應是完整執行日誌,也不應承載所有 AI 推理細節。T
- `TelegramMessage` 主卡新增「處置狀態」。
- `append_incident_update()` 對同一 incident 的相同狀態做 5 分鐘 Redis 去重。
- `append_incident_update()` 對相同的「AI 自動修復失敗 / AI 診斷工具失敗」摘要增加 10 分鐘跨 incident 去重;每個 incident 仍會移除原卡危險按鈕,但 Telegram 不再重複 reply 同一個失敗摘要。
- 既有 `詳情 / 重診 / 歷史` 按鈕保留,讓 Telegram 保持輕量,細節回到控制台。
## 後續建議