diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py
index af73c28e..2da24794 100644
--- a/apps/api/src/services/telegram_gateway.py
+++ b/apps/api/src/services/telegram_gateway.py
@@ -990,6 +990,38 @@ class WeeklyReportMessage:
return message[:900]
+@dataclass
+class InfraAlertMessage:
+    """
+    Infrastructure failure alert message (INFRA_ALERT).
+
+    2026-04-03 ogt: added — standard alert format for Nemotron/NIM and similar infrastructure failures.
+    Use: non-incident component failures (AI provider, DB, external API, ...). Buttons: none (informational).
+    format() returns Telegram HTML capped at 900 chars and never ends in a half-cut entity.
+    """
+    component: str  # component name (e.g., "Nemotron NIM")
+    status: str  # status description (e.g., "⚠️ 超時 (>10s)")
+    impact: str  # impact description; only the first 150 chars are rendered
+    auto_fixed: bool = False  # whether the problem was already fixed automatically
+    fix_action: str = ""  # fix action that was executed; only the first 100 chars are rendered
+
+    def format(self) -> str:
+        """Return the alert as HTML-escaped Telegram text of at most 900 characters."""
+        divider = "━━━━━━━━━━━━━━━━━━━\n"
+        if self.auto_fixed:
+            fix_block = f"{divider}✅ 已自動修復\n└ {html.escape(self.fix_action[:100])}\n"
+        else:
+            fix_block = f"{divider}❌ 自動修復失敗\n└ {html.escape(self.fix_action[:100] or '無可用修復方案')}\n"
+        text = (
+            f"🚨 基礎設施異常\n{divider}"
+            f"⚙️ {html.escape(self.component)}: {html.escape(self.status)}\n"
+            f"📛 影響: {html.escape(self.impact[:150])}\n{fix_block}"
+        )[:900]
+        # Every '&' in text starts an escaped entity; drop one the slice cut in half (invalid Telegram HTML).
+        amp = text.rfind("&")
+        return text[:amp] if amp != -1 and ";" not in text[amp:] else text
+
+
# =============================================================================
# Risk Level Emoji Mapping
# =============================================================================
@@ -2992,15 +3024,37 @@ class TelegramGateway:
await self.send_notification(text)
self._last_message_time = datetime.now(UTC)
- # Nemotron 異常時額外發告警
+ # Nemotron 異常時:自動修復 + 標準格式告警
if not nemo_ok:
- await self.send_notification(
- f"🚨 Nemotron 異常告警\n\n"
- f"NVIDIA NIM API 不可用: {nemo_status}\n"
- f"影響: 所有 incident 的 Nemotron Tool Calling 將 100% 超時\n"
- f"緩解: kubectl set env deployment/awoooi-api ENABLE_NEMOTRON_COLLABORATION=false -n awoooi-prod"
+ fix_action = "kubectl set env deployment/awoooi-api ENABLE_NEMOTRON_COLLABORATION=false -n awoooi-prod"
+ auto_fixed = False
+
+ # 自動修復: 關閉 Nemotron 協作避免每個 incident 白等 30s
+ try:
+ import asyncio
+ proc = await asyncio.create_subprocess_exec(
+ "kubectl", "set", "env", "deployment/awoooi-api",
+ "ENABLE_NEMOTRON_COLLABORATION=false",
+ "-n", "awoooi-prod",
+ stdout=asyncio.subprocess.PIPE,
+ stderr=asyncio.subprocess.PIPE,
+ )
+ _, stderr = await asyncio.wait_for(proc.communicate(), timeout=15.0)
+ auto_fixed = proc.returncode == 0
+ if not auto_fixed:
+ logger.error("nemotron_auto_fix_failed", stderr=stderr.decode()[:100])
+ except Exception as fix_err:
+ logger.error("nemotron_auto_fix_error", error=str(fix_err))
+
+ alert = InfraAlertMessage(
+ component="Nemotron NIM (NVIDIA API)",
+ status=nemo_status,
+ impact="所有 incident Nemotron Tool Calling 將 100% 超時",
+ auto_fixed=auto_fixed,
+ fix_action=fix_action,
)
- logger.error("nemotron_health_alert_sent", status=nemo_status)
+ await self.send_notification(alert.format(), parse_mode="HTML")
+ logger.error("nemotron_health_alert_sent", status=nemo_status, auto_fixed=auto_fixed)
logger.info("telegram_heartbeat_sent", nemotron_ok=nemo_ok)
return True