fix(infra-alert): Nemotron 異常告警套用標準模板 + 真正自動修復
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
1. 新增 InfraAlertMessage dataclass — 基礎設施異常的標準告警格式 (之前 Nemotron 告警是硬編碼文字,不走任何模板)
2. 偵測 Nemotron 異常時自動執行修復: kubectl set env ENABLE_NEMOTRON_COLLABORATION=false (之前只是把指令印在訊息裡,從未執行)
3. 告警顯示自動修復結果 (✅ 已自動修復 / ❌ 失敗)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -990,6 +990,38 @@ class WeeklyReportMessage:
|
||||
return message[:900]
|
||||
|
||||
|
||||
@dataclass
class InfraAlertMessage:
    """Standard Telegram alert for infrastructure-component anomalies (INFRA_ALERT).

    2026-04-03 ogt: added to give Nemotron/NIM-style infrastructure anomalies
    a standard alert format.

    Purpose: notifications about system-component anomalies that are not
    incidents (AI provider, DB, external API, etc.).
    Buttons: none (informational alert).
    """

    component: str  # Component name (e.g., "Nemotron NIM")
    status: str  # Status description (e.g., "⚠️ timeout (>10s)")
    impact: str  # Impact description
    auto_fixed: bool = False  # Whether the anomaly was auto-fixed
    fix_action: str = ""  # The remediation action that was executed

    # Per-field budgets, measured in *escaped* characters. Their sum plus the
    # fixed markup (about 100 characters) stays below the 900-character cap
    # the message had before, so no final slice of the HTML is needed.
    _COMPONENT_MAX = 100
    _STATUS_MAX = 300
    _IMPACT_MAX = 150
    _FIX_ACTION_MAX = 100

    @staticmethod
    def _escape_within(text: str, budget: int) -> str:
        """HTML-escape ``text``, stopping before the output exceeds ``budget``.

        Escaping is done one character at a time so the result never ends in
        the middle of an entity such as ``&amp;``.
        """
        pieces = []
        used = 0
        for char in text:
            escaped = html.escape(char)
            if used + len(escaped) > budget:
                break
            pieces.append(escaped)
            used += len(escaped)
        return "".join(pieces)

    def format(self) -> str:
        """Render the alert as Telegram HTML.

        Every field is escaped and clipped to its own budget *before* the
        message is assembled. The previous implementation sliced the finished
        HTML at 900 characters, which could cut through a tag or entity (or
        drop the fix-result block entirely) when ``status`` or ``component``
        was long.

        Returns:
            The formatted message, at most 900 characters long, always ending
            with the auto-fix result block.
        """
        separator = "━━━━━━━━━━━━━━━━━━━\n"
        component = self._escape_within(self.component, self._COMPONENT_MAX)
        status = self._escape_within(self.status, self._STATUS_MAX)
        impact = self._escape_within(self.impact, self._IMPACT_MAX)
        action = self._escape_within(self.fix_action, self._FIX_ACTION_MAX)

        if self.auto_fixed:
            fix_block = f"{separator}✅ <b>已自動修復</b>\n└ {action}\n"
        else:
            # An empty action means there was nothing to run; say so explicitly.
            fix_block = (
                f"{separator}❌ <b>自動修復失敗</b>\n"
                f"└ {action or '無可用修復方案'}\n"
            )

        return (
            f"🚨 <b>基礎設施異常</b>\n"
            f"{separator}"
            f"⚙️ <b>{component}</b>: {status}\n"
            f"📛 影響: {impact}\n"
            f"{fix_block}"
        )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Risk Level Emoji Mapping
|
||||
# =============================================================================
|
||||
@@ -2992,15 +3024,37 @@ class TelegramGateway:
|
||||
await self.send_notification(text)
|
||||
self._last_message_time = datetime.now(UTC)
|
||||
|
||||
# Nemotron 異常時額外發告警
|
||||
# Nemotron 異常時:自動修復 + 標準格式告警
|
||||
if not nemo_ok:
|
||||
await self.send_notification(
|
||||
f"🚨 <b>Nemotron 異常告警</b>\n\n"
|
||||
f"NVIDIA NIM API 不可用: <code>{nemo_status}</code>\n"
|
||||
f"影響: 所有 incident 的 Nemotron Tool Calling 將 100% 超時\n"
|
||||
f"緩解: <code>kubectl set env deployment/awoooi-api ENABLE_NEMOTRON_COLLABORATION=false -n awoooi-prod</code>"
|
||||
fix_action = "kubectl set env deployment/awoooi-api ENABLE_NEMOTRON_COLLABORATION=false -n awoooi-prod"
|
||||
auto_fixed = False
|
||||
|
||||
# 自動修復: 關閉 Nemotron 協作避免每個 incident 白等 30s
|
||||
try:
|
||||
import asyncio
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
"kubectl", "set", "env", "deployment/awoooi-api",
|
||||
"ENABLE_NEMOTRON_COLLABORATION=false",
|
||||
"-n", "awoooi-prod",
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
_, stderr = await asyncio.wait_for(proc.communicate(), timeout=15.0)
|
||||
auto_fixed = proc.returncode == 0
|
||||
if not auto_fixed:
|
||||
logger.error("nemotron_auto_fix_failed", stderr=stderr.decode()[:100])
|
||||
except Exception as fix_err:
|
||||
logger.error("nemotron_auto_fix_error", error=str(fix_err))
|
||||
|
||||
alert = InfraAlertMessage(
|
||||
component="Nemotron NIM (NVIDIA API)",
|
||||
status=nemo_status,
|
||||
impact="所有 incident Nemotron Tool Calling 將 100% 超時",
|
||||
auto_fixed=auto_fixed,
|
||||
fix_action=fix_action,
|
||||
)
|
||||
logger.error("nemotron_health_alert_sent", status=nemo_status)
|
||||
await self.send_notification(alert.format(), parse_mode="HTML")
|
||||
logger.error("nemotron_health_alert_sent", status=nemo_status, auto_fixed=auto_fixed)
|
||||
|
||||
logger.info("telegram_heartbeat_sent", nemotron_ok=nemo_ok)
|
||||
return True
|
||||
|
||||
Reference in New Issue
Block a user