fix(telegram): sanitize failover alert errors
All checks were successful
Code Review / ai-code-review (push) Successful in 10s
CD Pipeline / tests (push) Successful in 1m5s
CD Pipeline / build-and-deploy (push) Successful in 3m25s
CD Pipeline / post-deploy-checks (push) Successful in 1m16s

This commit is contained in:
Your Name
2026-05-06 16:45:29 +08:00
parent a94435f143
commit 8f715fd3f2
5 changed files with 101 additions and 11 deletions

View File

@@ -12,7 +12,8 @@ from __future__ import annotations
import hashlib
import json
from datetime import datetime, timezone, timedelta
import re
from datetime import datetime, timedelta, timezone
from typing import Any
import structlog
@@ -23,6 +24,7 @@ RECOVERY_DEDUP_TTL_SEC = 3600 # 1h — GCP 健康閃爍時 1 小時內不重複
QUOTA_DEDUP_TTL_SEC = 86400 # 24h每日 quota 告警只發一次)
logger = structlog.get_logger(__name__)
_TELEGRAM_BOT_URL_RE = re.compile(r"(api\.telegram\.org/bot)[^/\s]+")
class FailoverAlerter:
@@ -157,7 +159,7 @@ class FailoverAlerter:
if impact_lines:
sections.append(f"\n*影響*\n{impact_lines}")
if remediation_lines or next_action_line:
sections.append(f"\n*修復方向*")
sections.append("\n*修復方向*")
if remediation_lines:
sections.append(remediation_lines)
if next_action_line:
@@ -285,8 +287,8 @@ class FailoverAlerter:
2026-04-25 P1.5 by Claude Engineer-D — 告警失敗不能阻斷主流程
"""
try:
from src.services.telegram_gateway import get_telegram_gateway
from src.core.config import get_settings
from src.services.telegram_gateway import get_telegram_gateway
settings = get_settings()
chat_id = getattr(settings, "SRE_GROUP_CHAT_ID", None) or getattr(settings, "OPENCLAW_TG_CHAT_ID", None)
@@ -299,7 +301,13 @@ class FailoverAlerter:
logger.info("telegram_failover_alert_sent", message_len=len(message))
except Exception as e:
# 不 raise — 告警失敗不該阻斷主流程(鐵律)
logger.exception("telegram_failover_send_failed", error=str(e))
# 2026-05-06 Codex: Telegram/httpx exception 字串可能包含 bot token URL
# 禁止用 logger.exception 輸出 chained traceback。
logger.warning(
"telegram_failover_send_failed",
error=_sanitize_telegram_error(str(e)),
error_type=type(e).__name__,
)
# -------------------------------------------------------------------------
@@ -319,6 +327,11 @@ def _escape_md(text: str) -> str:
return text
def _sanitize_telegram_error(text: str) -> str:
    """Mask the token part of any Telegram Bot URL found in ``text``.

    Lets exception messages be written to the log without exposing the
    bot token. Text that contains no matching URL is returned unchanged.
    """
    redacted = _TELEGRAM_BOT_URL_RE.sub(r"\1<redacted>", text)
    return redacted
def _as_dict(value: Any) -> dict[str, Any]:
return value if isinstance(value, dict) else {}
@@ -334,7 +347,7 @@ def _lines_from_dict(data: dict[str, Any], max_items: int = 20, compact: bool =
rows.append(f"{_escape_md(str(k))}{_escape_md(str(data.get(k)))}")
idx += 1
if compact and len(rows) >= max_items:
rows.append("...(更多欄位略)")
rows.append(_escape_md("...(更多欄位略)"))
return "\n".join(f" {line}" for line in rows)
@@ -342,7 +355,7 @@ def _lines_from_list(value: Any) -> str:
if not isinstance(value, list):
return ""
return "\n".join(
f" {idx + 1}. {_escape_md(str(item))}"
f" {_escape_md(str(idx + 1))}\\. {_escape_md(str(item))}"
for idx, item in enumerate(value)
)

View File

@@ -25,6 +25,7 @@ SOUL.md 鐵律 (4.1 Telegram 訊息壓縮原則):
import asyncio
import html
import os
import re
from dataclasses import dataclass
from datetime import UTC, datetime
@@ -58,6 +59,12 @@ POLLING_LEADER_RENEW = 20 # seconds - 每 20s 續約
POLLING_LEADER_WATCH = 30 # seconds - 非 Leader Pod 每 30s 嘗試接管
logger = structlog.get_logger(__name__)
_TELEGRAM_BOT_URL_RE = re.compile(r"(api\.telegram\.org/bot)[^/\s]+")
def _sanitize_telegram_error(text: str) -> str:
"""遮蔽 Telegram Bot URL 中的 token避免例外字串污染 log / trace。"""
return _TELEGRAM_BOT_URL_RE.sub(r"\1<redacted>", text)
# 2026-04-27 Claude Sonnet 4.6: B3 — LLM 動態 Telegram 按鈕 Feature Flag
# true → 優先使用 ActionPlan.recommended_actions 動態生成按鈕
@@ -1468,20 +1475,28 @@ class TelegramGateway:
except httpx.HTTPStatusError as e:
span.set_attribute("telegram.http_status", e.response.status_code)
span.set_status(trace.Status(trace.StatusCode.ERROR))
span.record_exception(e)
span.record_exception(
TelegramGatewayError(f"HTTP error: {e.response.status_code}")
)
logger.error("telegram_api_error", method=method, status=e.response.status_code,
response_body=e.response.text[:500])
raise TelegramGatewayError(f"HTTP error: {e.response.status_code}") from e
raise TelegramGatewayError(f"HTTP error: {e.response.status_code}") from None
except TelegramGatewayError:
# 已處理的錯誤,直接拋出
raise
except Exception as e:
safe_error = _sanitize_telegram_error(str(e))
span.set_status(trace.Status(trace.StatusCode.ERROR))
span.record_exception(e)
logger.error("telegram_request_failed", method=method, error=str(e))
raise TelegramGatewayError(str(e)) from e
span.record_exception(TelegramGatewayError(safe_error))
logger.error(
"telegram_request_failed",
method=method,
error=safe_error,
error_type=type(e).__name__,
)
raise TelegramGatewayError(safe_error) from None
async def _build_inline_keyboard(
self,

View File

@@ -19,6 +19,8 @@ from src.services.failover_alerter import (
DEDUP_TTL_SEC,
QUOTA_DEDUP_TTL_SEC,
FailoverAlerter,
_lines_from_list,
_sanitize_telegram_error,
configure_alerter,
get_failover_alerter,
reset_failover_alerter,
@@ -231,3 +233,19 @@ async def test_memory_dedup_max_size_gc():
# GC 後999 個 stale entry 被清除,只剩 fresh:key + trigger:gc:key
assert len(alerter._memory_dedup) <= 3 # fresh + trigger + 可能有邊界差1
def test_lines_from_list_escapes_markdown_v2_numbered_periods() -> None:
    """Numbering periods and hyphens in list items must come out backslash-escaped."""
    output = _lines_from_list(["修復 node-exporter-110"])

    for expected_fragment in ("1\\.", "node\\-exporter\\-110"):
        assert expected_fragment in output
def test_sanitize_telegram_error_redacts_bot_token_url() -> None:
    """A bot token embedded in an API URL is replaced by the redaction marker."""
    message = "HTTP error for https://api.telegram.org/bot123456:SECRET/sendMessage"

    result = _sanitize_telegram_error(message)

    assert "bot<redacted>" in result
    assert "SECRET" not in result

View File

@@ -0,0 +1,12 @@
from __future__ import annotations
from src.services.telegram_gateway import _sanitize_telegram_error
def test_telegram_gateway_sanitizes_bot_token_url() -> None:
    """The gateway sanitizer strips the bot token from a Telegram API URL."""
    message = "Client error for https://api.telegram.org/bot123456:SECRET/sendMessage"

    result = _sanitize_telegram_error(message)

    assert "bot<redacted>" in result
    assert "SECRET" not in result

View File

@@ -3825,3 +3825,35 @@ ruff check apps/api/tests/test_approval_execution_mcp_audit.py
- 本次是「先補 durable audit + legacy 標記」,不是直接硬切 MCP Gateway enforcement;原因是 AwoooP project / agent / grant contract 尚未覆蓋所有 legacy 修復路徑,硬切會中斷現有 approved execution。
- 下一步應將 `decision_manager.py`、`pre_decision_investigator.py`、`post_execution_verifier.py`、`callback_dispatcher.py` 的 direct MCP caller 逐步套同一種可追蹤 wrapper,最後再切到 `McpGateway.call()` enforcement。
---
## 2026-05-06(台北)— Telegram failover 告警 400 與 token log 外洩修補
**觸發**:production API log 顯示 `telegram_failover_send_failed`,Telegram `sendMessage` 回 400;同時 chained traceback 內含 Telegram Bot URL,會把 token 形式的敏感資訊寫入 log / trace。
### 已修正
| 範圍 | 結果 |
|------|------|
| `failover_alerter.py` | 失敗時不再使用 `logger.exception()` 輸出 chained traceback,改記錄已遮蔽的錯誤文字與錯誤類型 |
| MarkdownV2 | `_lines_from_list()` 將編號句點改為 `1\\.`,並補上 compact 省略文字的 MarkdownV2 escape,避免治理告警清單觸發 Telegram parse 400 |
| `telegram_gateway.py` | HTTPStatusError 不再 `raise ... from e`,OTel span 也只記 sanitized gateway error,避免 httpx exception 字串帶出 Bot URL |
| 測試 | 新增 Telegram error sanitizer 與 MarkdownV2 編號 escape 回歸測試 |
### 驗證
```text
pytest apps/api/tests/test_failover_alerter.py apps/api/tests/test_telegram_gateway_error_sanitizer.py apps/api/tests/test_heartbeat_dedup_p0_4.py -q
# 17 passed
py_compile apps/api/src/services/failover_alerter.py apps/api/src/services/telegram_gateway.py apps/api/tests/test_failover_alerter.py apps/api/tests/test_telegram_gateway_error_sanitizer.py
# 通過
ruff check apps/api/src/services/failover_alerter.py apps/api/tests/test_failover_alerter.py apps/api/tests/test_telegram_gateway_error_sanitizer.py
# All checks passed
```
### 注意
- `telegram_gateway.py` 全檔仍有大量既有 ruff 債,本次只針對 token 外洩與 MarkdownV2 400 風險做最小安全修補,避免在 6000+ 行 gateway 巨檔混入無關機械改動。