# Provenance (from the enclosing git patch: commit d522c51, 2026-04-03,
# subject "fix(infra-alert)", target apps/api/src/services/telegram_gateway.py):
#   1. Adds the InfraAlertMessage dataclass as the standard alert format for
#      infrastructure anomalies; the previous Nemotron alert was hard-coded
#      text that did not go through any template.
#   2. When a Nemotron anomaly is detected the gateway now actually runs the
#      fix (kubectl set env ... ENABLE_NEMOTRON_COLLABORATION=false) instead
#      of only printing the command in the message.
#   3. The alert shows the auto-fix outcome (fixed / failed).
@dataclass
class InfraAlertMessage:
    """Infrastructure anomaly alert message (INFRA_ALERT).

    2026-04-03 ogt: added to give Nemotron/NIM and similar infrastructure
    anomalies a standard alert format.

    Purpose: notifications about system components misbehaving that are not
    incidents (AI provider, DB, external API, ...).
    Buttons: none (informational alert).
    """

    component: str  # Component name, e.g. "Nemotron NIM"
    status: str  # Status description, e.g. "⚠️ timeout (>10s)"
    impact: str  # Description of the impact
    auto_fixed: bool = False  # Whether the automatic fix succeeded
    fix_action: str = ""  # The remediation action that was executed

    def format(self) -> str:
        """Render the alert as Telegram HTML text of at most 900 characters.

        Every dynamic field is passed through ``html.escape``. ``impact`` is
        cut to 150 raw characters and ``fix_action`` to 100 before escaping;
        ``component`` and ``status`` are escaped in full.

        Returns:
            The formatted message. If it would exceed 900 characters it is
            clipped, and a character entity split by the clip (for example a
            dangling ``&am``) is removed so that no bare ``&`` is emitted.
        """
        limit = 900
        separator = "━━━━━━━━━━━━━━━━━━━"

        if self.auto_fixed:
            outcome = "✅ 已自動修復"
            detail = self.fix_action[:100]
        else:
            outcome = "❌ 自動修復失敗"
            # Fall back to a placeholder when no fix action was supplied.
            detail = self.fix_action[:100] or "無可用修復方案"

        text = (
            f"🚨 基礎設施異常\n"
            f"{separator}\n"
            f"⚙️ {html.escape(self.component)}: {html.escape(self.status)}\n"
            f"📛 影響: {html.escape(self.impact[:150])}\n"
            f"{separator}\n"
            f"{outcome}\n"
            f"└ {html.escape(detail)}\n"
        )
        if len(text) <= limit:
            return text

        clipped = text[:limit]
        # Every "&" in the text was produced by html.escape and therefore
        # starts an entity that ends with ";". If the last "&" has no ";"
        # after it, the clip landed inside that entity: drop the fragment.
        last_amp = clipped.rfind("&")
        if last_amp != -1 and ";" not in clipped[last_amp:]:
            clipped = clipped[:last_amp]
        return clipped
+ + # ============================================================================= # Risk Level Emoji Mapping # ============================================================================= @@ -2992,15 +3024,37 @@ class TelegramGateway: await self.send_notification(text) self._last_message_time = datetime.now(UTC) - # Nemotron 異常時額外發告警 + # Nemotron 異常時:自動修復 + 標準格式告警 if not nemo_ok: - await self.send_notification( - f"🚨 Nemotron 異常告警\n\n" - f"NVIDIA NIM API 不可用: {nemo_status}\n" - f"影響: 所有 incident 的 Nemotron Tool Calling 將 100% 超時\n" - f"緩解: kubectl set env deployment/awoooi-api ENABLE_NEMOTRON_COLLABORATION=false -n awoooi-prod" + fix_action = "kubectl set env deployment/awoooi-api ENABLE_NEMOTRON_COLLABORATION=false -n awoooi-prod" + auto_fixed = False + + # 自動修復: 關閉 Nemotron 協作避免每個 incident 白等 30s + try: + import asyncio + proc = await asyncio.create_subprocess_exec( + "kubectl", "set", "env", "deployment/awoooi-api", + "ENABLE_NEMOTRON_COLLABORATION=false", + "-n", "awoooi-prod", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + _, stderr = await asyncio.wait_for(proc.communicate(), timeout=15.0) + auto_fixed = proc.returncode == 0 + if not auto_fixed: + logger.error("nemotron_auto_fix_failed", stderr=stderr.decode()[:100]) + except Exception as fix_err: + logger.error("nemotron_auto_fix_error", error=str(fix_err)) + + alert = InfraAlertMessage( + component="Nemotron NIM (NVIDIA API)", + status=nemo_status, + impact="所有 incident Nemotron Tool Calling 將 100% 超時", + auto_fixed=auto_fixed, + fix_action=fix_action, ) - logger.error("nemotron_health_alert_sent", status=nemo_status) + await self.send_notification(alert.format(), parse_mode="HTML") + logger.error("nemotron_health_alert_sent", status=nemo_status, auto_fixed=auto_fixed) logger.info("telegram_heartbeat_sent", nemotron_ok=nemo_ok) return True