fix(infra-alert): Nemotron 異常告警套用標準模板 + 真正自動修復
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

1. 新增 InfraAlertMessage dataclass — 基礎設施異常的標準告警格式
   (之前 Nemotron 告警是硬編碼文字,不走任何模板)

2. 偵測 Nemotron 異常時自動執行修復:
   kubectl set env deployment/awoooi-api ENABLE_NEMOTRON_COLLABORATION=false -n awoooi-prod
   (之前只是把指令印在訊息裡,從未執行)

3. 告警顯示自動修復結果 (✅ 已自動修復 / ❌ 失敗)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-03 15:29:20 +08:00
parent e93ada0452
commit d522c51deb

View File

@@ -990,6 +990,38 @@ class WeeklyReportMessage:
return message[:900]
@dataclass
class InfraAlertMessage:
    """
    Infrastructure anomaly alert message (INFRA_ALERT).

    2026-04-03 ogt: added — standard alert format for infrastructure
    anomalies such as Nemotron/NIM.

    Purpose: notifications about system-component anomalies that are not
    incidents (AI provider, DB, external API, ...).
    Buttons: none (informational alert).
    """

    component: str  # component name (e.g., "Nemotron NIM")
    status: str  # status description (e.g., a timeout notice)
    impact: str  # description of the impact
    auto_fixed: bool = False  # whether the automatic fix succeeded
    fix_action: str = ""  # the remediation action that was run

    @staticmethod
    def _clip(text: str, limit: int) -> str:
        """Return *text* HTML-escaped and at most *limit* characters long.

        The raw text is cut to *limit* first; if escaping then pushes the
        result past *limit*, it is cut again and any entity left incomplete
        at the end (e.g. ``&am``) is dropped, so the result never ends in a
        broken entity.
        """
        escaped = html.escape(text[:limit])
        if len(escaped) <= limit:
            return escaped
        clipped = escaped[:limit]
        # After html.escape every "&" starts an entity, so a trailing "&"
        # with no ";" after it marks an entity that was cut in half.
        last_amp = clipped.rfind("&")
        if last_amp != -1 and ";" not in clipped[last_amp:]:
            clipped = clipped[:last_amp]
        return clipped

    def format(self) -> str:
        """Format the alert as Telegram HTML.

        Every field is escaped and clipped to its own budget (component 100,
        status 300, impact 150, fix action 100 characters) *before* it is
        placed between the fixed markup. Previously only the assembled,
        already-escaped message was sliced, so a long component or status
        could cut the text inside a ``<b>`` pair or an entity and drop the
        auto-fix result block.

        Returns:
            The HTML message, at most 900 characters long.
        """
        separator = "━━━━━━━━━━━━━━━━━━━"
        if self.auto_fixed:
            verdict = "✅ <b>已自動修復</b>"
            detail = self._clip(self.fix_action, 100)
        else:
            verdict = "❌ <b>自動修復失敗</b>"
            # Fall back to a fixed notice when no fix action was recorded.
            detail = self._clip(self.fix_action or "無可用修復方案", 100)
        fix_block = f"{separator}\n{verdict}\n{detail}\n"

        component = self._clip(self.component, 100)
        status = self._clip(self.status, 300)
        impact = self._clip(self.impact, 150)

        message = (
            f"🚨 <b>基礎設施異常</b>\n"
            f"{separator}\n"
            f"⚙️ <b>{component}</b>: {status}\n"
            f"📛 影響: {impact}\n"
            f"{fix_block}"
        )
        # The per-field budgets keep the message well below 900 characters;
        # the slice is kept only as a backstop for the documented cap.
        return message[:900]
# =============================================================================
# Risk Level Emoji Mapping
# =============================================================================
@@ -2992,15 +3024,37 @@ class TelegramGateway:
await self.send_notification(text)
self._last_message_time = datetime.now(UTC)
# Nemotron 異常時額外發告警
# Nemotron 異常時:自動修復 + 標準格式告警
if not nemo_ok:
await self.send_notification(
f"🚨 <b>Nemotron 異常告警</b>\n\n"
f"NVIDIA NIM API 不可用: <code>{nemo_status}</code>\n"
f"影響: 所有 incident 的 Nemotron Tool Calling 將 100% 超時\n"
f"緩解: <code>kubectl set env deployment/awoooi-api ENABLE_NEMOTRON_COLLABORATION=false -n awoooi-prod</code>"
fix_action = "kubectl set env deployment/awoooi-api ENABLE_NEMOTRON_COLLABORATION=false -n awoooi-prod"
auto_fixed = False
# 自動修復: 關閉 Nemotron 協作避免每個 incident 白等 30s
try:
import asyncio
proc = await asyncio.create_subprocess_exec(
"kubectl", "set", "env", "deployment/awoooi-api",
"ENABLE_NEMOTRON_COLLABORATION=false",
"-n", "awoooi-prod",
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
_, stderr = await asyncio.wait_for(proc.communicate(), timeout=15.0)
auto_fixed = proc.returncode == 0
if not auto_fixed:
logger.error("nemotron_auto_fix_failed", stderr=stderr.decode()[:100])
except Exception as fix_err:
logger.error("nemotron_auto_fix_error", error=str(fix_err))
alert = InfraAlertMessage(
component="Nemotron NIM (NVIDIA API)",
status=nemo_status,
impact="所有 incident Nemotron Tool Calling 將 100% 超時",
auto_fixed=auto_fixed,
fix_action=fix_action,
)
logger.error("nemotron_health_alert_sent", status=nemo_status)
await self.send_notification(alert.format(), parse_mode="HTML")
logger.error("nemotron_health_alert_sent", status=nemo_status, auto_fixed=auto_fixed)
logger.info("telegram_heartbeat_sent", nemotron_ok=nemo_ok)
return True