@dataclass
class InfraAlertMessage:
    """
    Infrastructure anomaly alert message (INFRA_ALERT).

    2026-04-03 ogt: added as the standard alert format for infrastructure
    anomalies such as Nemotron/NIM.
    Purpose: notifications for system-component anomalies that are not
    incidents (AI provider, DB, external API, etc.).
    Buttons: none (informational alert).
    """
    component: str            # component name (e.g., "Nemotron NIM")
    status: str               # status description (e.g., a timeout notice)
    impact: str               # description of the impact
    auto_fixed: bool = False  # whether the automatic fix succeeded
    fix_action: str = ""      # the fix action that was executed

    # Not annotated on purpose, so the dataclass does not treat it as a field.
    _MAX_LENGTH = 900

    def format(self) -> str:
        """Render the alert as Telegram HTML, capped at 900 characters.

        Every field value is HTML-escaped. ``impact`` is limited to its first
        150 characters and ``fix_action`` to its first 100 characters before
        escaping. When ``auto_fixed`` is false and ``fix_action`` is empty, a
        fallback text is shown in its place.

        Returns:
            The message text, at most 900 characters long. If the cap would
            fall inside an escaped entity (for example ``&amp;``), the partial
            entity is dropped so the result never contains a bare ``&``.
        """
        if self.auto_fixed:
            outcome = "✅ 已自動修復"
            detail = self.fix_action[:100]
        else:
            outcome = "❌ 自動修復失敗"
            detail = self.fix_action[:100] or "無可用修復方案"

        fix_block = f"━━━━━━━━━━━━━━━━━━━\n{outcome}\n└ {html.escape(detail)}\n"
        message = (
            f"🚨 基礎設施異常\n"
            f"━━━━━━━━━━━━━━━━━━━\n"
            f"⚙️ {html.escape(self.component)}: {html.escape(self.status)}\n"
            f"📛 影響: {html.escape(self.impact[:150])}\n"
            f"{fix_block}"
        )
        return self._truncate_html(message, self._MAX_LENGTH)

    @staticmethod
    def _truncate_html(text: str, limit: int) -> str:
        """Cut escaped text to ``limit`` characters without splitting an entity.

        Relies on every ``&`` in ``text`` being the start of an entity, which
        holds here because all dynamic values pass through ``html.escape`` and
        the fixed template contains no ``&``.
        """
        if len(text) <= limit:
            return text
        clipped = text[:limit]
        last_amp = clipped.rfind("&")
        # An "&" with no ";" after it is an entity severed by the slice.
        if last_amp != -1 and ";" not in clipped[last_amp:]:
            clipped = clipped[:last_amp]
        return clipped
import asyncio + proc = await asyncio.create_subprocess_exec( + "kubectl", "set", "env", "deployment/awoooi-api", + "ENABLE_NEMOTRON_COLLABORATION=false", + "-n", "awoooi-prod", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + _, stderr = await asyncio.wait_for(proc.communicate(), timeout=15.0) + auto_fixed = proc.returncode == 0 + if not auto_fixed: + logger.error("nemotron_auto_fix_failed", stderr=stderr.decode()[:100]) + except Exception as fix_err: + logger.error("nemotron_auto_fix_error", error=str(fix_err)) + + alert = InfraAlertMessage( + component="Nemotron NIM (NVIDIA API)", + status=nemo_status, + impact="所有 incident Nemotron Tool Calling 將 100% 超時", + auto_fixed=auto_fixed, + fix_action=fix_action, ) - logger.error("nemotron_health_alert_sent", status=nemo_status) + await self.send_notification(alert.format(), parse_mode="HTML") + logger.error("nemotron_health_alert_sent", status=nemo_status, auto_fixed=auto_fixed) logger.info("telegram_heartbeat_sent", nemotron_ok=nemo_ok) return True