From 8f715fd3f2505d7fa0c0f74adce35e5f369fd010 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 6 May 2026 16:45:29 +0800 Subject: [PATCH] fix(telegram): sanitize failover alert errors --- apps/api/src/services/failover_alerter.py | 25 +++++++++++---- apps/api/src/services/telegram_gateway.py | 25 ++++++++++++--- apps/api/tests/test_failover_alerter.py | 18 +++++++++++ .../test_telegram_gateway_error_sanitizer.py | 12 +++++++ docs/LOGBOOK.md | 32 +++++++++++++++++++ 5 files changed, 101 insertions(+), 11 deletions(-) create mode 100644 apps/api/tests/test_telegram_gateway_error_sanitizer.py diff --git a/apps/api/src/services/failover_alerter.py b/apps/api/src/services/failover_alerter.py index 7e184fd7..8e033de6 100644 --- a/apps/api/src/services/failover_alerter.py +++ b/apps/api/src/services/failover_alerter.py @@ -12,7 +12,8 @@ from __future__ import annotations import hashlib import json -from datetime import datetime, timezone, timedelta +import re +from datetime import datetime, timedelta, timezone from typing import Any import structlog @@ -23,6 +24,7 @@ RECOVERY_DEDUP_TTL_SEC = 3600 # 1h — GCP 健康閃爍時 1 小時內不重複 QUOTA_DEDUP_TTL_SEC = 86400 # 24h(每日 quota 告警只發一次) logger = structlog.get_logger(__name__) +_TELEGRAM_BOT_URL_RE = re.compile(r"(api\.telegram\.org/bot)[^/\s]+") class FailoverAlerter: @@ -157,7 +159,7 @@ class FailoverAlerter: if impact_lines: sections.append(f"\n*影響*\n{impact_lines}") if remediation_lines or next_action_line: - sections.append(f"\n*修復方向*") + sections.append("\n*修復方向*") if remediation_lines: sections.append(remediation_lines) if next_action_line: @@ -285,8 +287,8 @@ class FailoverAlerter: 2026-04-25 P1.5 by Claude Engineer-D — 告警失敗不能阻斷主流程 """ try: - from src.services.telegram_gateway import get_telegram_gateway from src.core.config import get_settings + from src.services.telegram_gateway import get_telegram_gateway settings = get_settings() chat_id = getattr(settings, "SRE_GROUP_CHAT_ID", None) or getattr(settings, "OPENCLAW_TG_CHAT_ID", None) @@ -299,7 +301,13 @@ class FailoverAlerter: logger.info("telegram_failover_alert_sent", message_len=len(message)) except Exception as e: # 不 raise — 告警失敗不該阻斷主流程(鐵律) - logger.exception("telegram_failover_send_failed", error=str(e)) + # 2026-05-06 Codex: Telegram/httpx exception 字串可能包含 bot token URL, + # 禁止用 logger.exception 輸出 chained traceback。 + logger.warning( + "telegram_failover_send_failed", + error=_sanitize_telegram_error(str(e)), + error_type=type(e).__name__, + ) # ------------------------------------------------------------------------- @@ -319,6 +327,11 @@ def _escape_md(text: str) -> str: return text +def _sanitize_telegram_error(text: str) -> str: + """遮蔽 Telegram Bot URL 中的 token,避免例外訊息寫入 log。""" + return _TELEGRAM_BOT_URL_RE.sub(r"\1", text) + + def _as_dict(value: Any) -> dict[str, Any]: return value if isinstance(value, dict) else {} @@ -334,7 +347,7 @@ def _lines_from_dict(data: dict[str, Any], max_items: int = 20, compact: bool = rows.append(f"{_escape_md(str(k))}:{_escape_md(str(data.get(k)))}") idx += 1 if compact and len(rows) >= max_items: - rows.append("...(更多欄位略)") + rows.append(_escape_md("...(更多欄位略)")) return "\n".join(f" {line}" for line in rows) @@ -342,7 +355,7 @@ def _lines_from_list(value: Any) -> str: if not isinstance(value, list): return "" return "\n".join( - f" {idx + 1}. {_escape_md(str(item))}" + f" {_escape_md(str(idx + 1))}\\. {_escape_md(str(item))}" for idx, item in enumerate(value) ) diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index 2510edbb..cc5d0e25 100644 --- a/apps/api/src/services/telegram_gateway.py +++ b/apps/api/src/services/telegram_gateway.py @@ -25,6 +25,7 @@ SOUL.md 鐵律 (4.1 Telegram 訊息壓縮原則): import asyncio import html import os +import re from dataclasses import dataclass from datetime import UTC, datetime @@ -58,6 +59,12 @@ POLLING_LEADER_RENEW = 20 # seconds - 每 20s 續約 POLLING_LEADER_WATCH = 30 # seconds - 非 Leader Pod 每 30s 嘗試接管 logger = structlog.get_logger(__name__) +_TELEGRAM_BOT_URL_RE = re.compile(r"(api\.telegram\.org/bot)[^/\s]+") + + +def _sanitize_telegram_error(text: str) -> str: + """遮蔽 Telegram Bot URL 中的 token,避免例外字串污染 log / trace。""" + return _TELEGRAM_BOT_URL_RE.sub(r"\1", text) # 2026-04-27 Claude Sonnet 4.6: B3 — LLM 動態 Telegram 按鈕 Feature Flag # true → 優先使用 ActionPlan.recommended_actions 動態生成按鈕 @@ -1468,20 +1475,28 @@ class TelegramGateway: except httpx.HTTPStatusError as e: span.set_attribute("telegram.http_status", e.response.status_code) span.set_status(trace.Status(trace.StatusCode.ERROR)) - span.record_exception(e) + span.record_exception( + TelegramGatewayError(f"HTTP error: {e.response.status_code}") + ) logger.error("telegram_api_error", method=method, status=e.response.status_code, response_body=e.response.text[:500]) - raise TelegramGatewayError(f"HTTP error: {e.response.status_code}") from e + raise TelegramGatewayError(f"HTTP error: {e.response.status_code}") from None except TelegramGatewayError: # 已處理的錯誤,直接拋出 raise except Exception as e: + safe_error = _sanitize_telegram_error(str(e)) span.set_status(trace.Status(trace.StatusCode.ERROR)) - span.record_exception(e) - logger.error("telegram_request_failed", method=method, error=str(e)) - raise TelegramGatewayError(str(e)) from e + span.record_exception(TelegramGatewayError(safe_error)) + logger.error( + "telegram_request_failed", + method=method, + error=safe_error, + error_type=type(e).__name__, + ) + raise TelegramGatewayError(safe_error) from None async def _build_inline_keyboard( self, diff --git a/apps/api/tests/test_failover_alerter.py b/apps/api/tests/test_failover_alerter.py index db193d7b..51bbe407 100644 --- a/apps/api/tests/test_failover_alerter.py +++ b/apps/api/tests/test_failover_alerter.py @@ -19,6 +19,8 @@ from src.services.failover_alerter import ( DEDUP_TTL_SEC, QUOTA_DEDUP_TTL_SEC, FailoverAlerter, + _lines_from_list, + _sanitize_telegram_error, configure_alerter, get_failover_alerter, reset_failover_alerter, @@ -231,3 +233,19 @@ async def test_memory_dedup_max_size_gc(): # GC 後:999 個 stale entry 被清除,只剩 fresh:key + trigger:gc:key assert len(alerter._memory_dedup) <= 3 # fresh + trigger + 可能有邊界差1 + + +def test_lines_from_list_escapes_markdown_v2_numbered_periods() -> None: + rendered = _lines_from_list(["修復 node-exporter-110"]) + + assert "1\\." in rendered + assert "node\\-exporter\\-110" in rendered + + +def test_sanitize_telegram_error_redacts_bot_token_url() -> None: + raw = "HTTP error for https://api.telegram.org/bot123456:SECRET/sendMessage" + + sanitized = _sanitize_telegram_error(raw) + + assert "SECRET" not in sanitized + assert "bot" in sanitized diff --git a/apps/api/tests/test_telegram_gateway_error_sanitizer.py b/apps/api/tests/test_telegram_gateway_error_sanitizer.py new file mode 100644 index 00000000..984dac7d --- /dev/null +++ b/apps/api/tests/test_telegram_gateway_error_sanitizer.py @@ -0,0 +1,12 @@ +from __future__ import annotations + +from src.services.telegram_gateway import _sanitize_telegram_error + + +def test_telegram_gateway_sanitizes_bot_token_url() -> None: + raw = "Client error for https://api.telegram.org/bot123456:SECRET/sendMessage" + + sanitized = _sanitize_telegram_error(raw) + + assert "SECRET" not in sanitized + assert "bot" in sanitized diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 5d163f7e..537fa8c6 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -3825,3 +3825,35 @@ ruff check apps/api/tests/test_approval_execution_mcp_audit.py - 本次是「先補 durable audit + legacy 標記」,不是直接硬切 MCP Gateway enforcement;原因是 AwoooP project / agent / grant contract 尚未覆蓋所有 legacy 修復路徑,硬切會中斷現有 approved execution。 - 下一步應將 `decision_manager.py`、`pre_decision_investigator.py`、`post_execution_verifier.py`、`callback_dispatcher.py` 的 direct MCP caller 逐步套同一種可追蹤 wrapper,最後再切到 `McpGateway.call()` enforcement。 + +--- + +## 2026-05-06(台北)— Telegram failover 告警 400 與 token log 外洩修補 + +**觸發**:production API log 顯示 `telegram_failover_send_failed`,Telegram `sendMessage` 回 400;同時 chained traceback 內含 Telegram Bot URL,會把 token 形式的敏感資訊寫入 log / trace。 + +### 已修正 + +| 範圍 | 結果 | +|------|------| +| `failover_alerter.py` | 失敗時不再使用 `logger.exception()` 輸出 chained traceback,改記錄已遮蔽的錯誤文字與錯誤類型 | +| MarkdownV2 | `_lines_from_list()` 將編號句點改為 `1\\.`,並補上 compact 省略文字的 MarkdownV2 escape,避免治理告警清單觸發 Telegram parse 400 | +| `telegram_gateway.py` | HTTPStatusError 不再 `raise ... from e`,OTel span 也只記 sanitized gateway error,避免 httpx exception 字串帶出 Bot URL | +| 測試 | 新增 Telegram error sanitizer 與 MarkdownV2 編號 escape 回歸測試 | + +### 驗證 + +```text +pytest apps/api/tests/test_failover_alerter.py apps/api/tests/test_telegram_gateway_error_sanitizer.py apps/api/tests/test_heartbeat_dedup_p0_4.py -q +# 17 passed + +py_compile apps/api/src/services/failover_alerter.py apps/api/src/services/telegram_gateway.py apps/api/tests/test_failover_alerter.py apps/api/tests/test_telegram_gateway_error_sanitizer.py +# 通過 + +ruff check apps/api/src/services/failover_alerter.py apps/api/tests/test_failover_alerter.py apps/api/tests/test_telegram_gateway_error_sanitizer.py +# All checks passed +``` + +### 注意 + +- `telegram_gateway.py` 全檔仍有大量既有 ruff 債,本次只針對 token 外洩與 MarkdownV2 400 風險做最小安全修補,避免在 6000+ 行 gateway 巨檔混入無關機械改動。