From d9007e6855116a57c61ce191928e4581334be802 Mon Sep 17 00:00:00 2001 From: OG T Date: Fri, 3 Apr 2026 14:59:06 +0800 Subject: [PATCH] =?UTF-8?q?feat(chat+monitor):=20=E9=9B=99=20AI=20?= =?UTF-8?q?=E5=B0=8D=E8=A9=B1=E9=87=8D=E5=AF=AB=20+=20Nemotron=20=E5=81=A5?= =?UTF-8?q?=E5=BA=B7=E7=9B=A3=E6=8E=A7=E5=91=8A=E8=AD=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ChatManager 重寫 (Phase 22.6): - @openclaw → 只有 OpenClaw 回應 (Ollama qwen2.5:7b) - @nemo → 只有 NemoClaw 回應 (Gemini Flash) - 無前綴 → OpenClaw 先答,NemoClaw 評論/反駁 NemoClaw 改用 Gemini Flash (棄 NIM nemotron-mini-4b 因為 15s+ 回應時間) TelegramGateway 心跳新增 Nemotron 健康探測: - 每次心跳探測 NVIDIA NIM API (10s timeout) - 異常時立刻發 Telegram 告警 + 緩解指令 - 補足 Nemotron 100% 超時卻無告警的監控盲區 Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/services/chat_manager.py | 262 +++++++++++++++------- apps/api/src/services/telegram_gateway.py | 57 ++++- 2 files changed, 232 insertions(+), 87 deletions(-) diff --git a/apps/api/src/services/chat_manager.py b/apps/api/src/services/chat_manager.py index 36bce6e8..e779f64d 100644 --- a/apps/api/src/services/chat_manager.py +++ b/apps/api/src/services/chat_manager.py @@ -1,115 +1,211 @@ """ -AWOOOI Chat Manager - 統帥對話核心 -=================================== -Phase 21.5: 實作 Telegram 互動對話功能 +AWOOOI Chat Manager - 雙 AI 對話核心 +====================================== +Phase 21.5 初版: 2026-03-31 ogt +Phase 22.6 重寫: 2026-04-03 ogt (統帥需求: 雙 AI 互動對話) -職責: -1. 整合系統上下文 (K3s 狀態, 最近告警, 目前時間) -2. 決定對話風格 (OpenClaw 專業風 vs Nemo 參謀風) -3. 調用 LLM (Nemo-4B / Gemini) 產出回應 -4. 遵守 SOUL.md Nothing.tech 純淨美學 +功能: +1. @openclaw / @nemo 路由 — 指定 AI 回應 +2. 無前綴 — 兩個 AI 輪流回應,並互相評論 +3. AI 互相對話 — NemoClaw 看到 OpenClaw 的回應後可補充/反駁 -2026-03-31 ogt: 初版建立 +架構: +- OpenClaw: 用 Ollama qwen2.5:7b-instruct (本地, 快) +- NemoClaw: 用 Gemini Flash (雲端, 快) — NIM nemotron-mini 太慢 (15s+) """ import structlog -from datetime import datetime from src.utils.timezone import now_taipei -from src.services.nvidia_provider import get_nvidia_provider from src.repositories.k8s_repository import get_k8s_repository from src.repositories.incident_repository import get_incident_repository logger = structlog.get_logger(__name__) +# 人格設定 +OPENCLAW_PERSONA = """你是 OpenClaw,AWOOOI 平台的 SRE AI 主帥。 +個性: 精準、果斷、專業,像老將一樣直接給出建議。 +語氣: 簡短有力,不廢話。繁體中文回應。 +當 NemoClaw 有不同意見時,你會直接反駁或接受,不拐彎抹角。 +""" + +NEMOCLAW_PERSONA = """你是 NemoClaw,AWOOOI 平台的 AI 戰術參謀。 +個性: 分析型、喜歡從不同角度思考,會質疑假設。 +語氣: 帶點挑釁但建設性,繁體中文回應。 +當 OpenClaw 給出意見時,你會評估是否同意,必要時提出替代方案。 +""" + + class ChatManager: - """ - AWOOOI 對話管理器 - 系統的大腦與聲帶 - """ + """AWOOOI 雙 AI 對話管理器""" def __init__(self): - self.nvidia = get_nvidia_provider() self.k8s = get_k8s_repository() self.incidents = get_incident_repository() async def get_system_context(self) -> str: - """ - 收集系統即時上下文,供 LLM 參考 - """ + """收集系統即時上下文""" now = now_taipei() - - # 1. K3s 狀態 - k8s_status = await self.k8s.get_pod_status_summary(namespace="awoooi-prod") - cluster_info = f"Cluster: {k8s_status['running']}/{k8s_status['total']} Pods Running" - if k8s_status['problem_pods']: - cluster_info += f", {len(k8s_status['problem_pods'])} anomalies detected." - - # 2. 最近告警 (取 3 筆) - active_incidents = await self.incidents.get_active() - incident_summary = "None" - if active_incidents: - lines = [] - for inc in active_incidents[:3]: - lines.append(f"- {inc.incident_id}: {inc.status.value} (Severity: {inc.severity.value})") - incident_summary = "\n".join(lines) - - context = f""" -## Current System Context (Taipei Time: {now.strftime('%Y-%m-%d %H:%M:%S')}) -- Environment: AWOOOI Production (K3s) -- {cluster_info} -- Active Incidents: -{incident_summary} -""" - return context - - async def generate_response( - self, - user_id: int, - username: str, - message_text: str - ) -> str: - """ - 根據統帥訊息產生回覆 - """ - system_context = await self.get_system_context() - - # 判定是否在跟 Nemo 對話 - is_asking_nemo = "nemo" in message_text.lower() - - role_description = "You are OpenClaw, the AI operations assistant for AWOOOI platform." - if is_asking_nemo: - role_description = "You are Nemo-4B, the elite AI tactical advisor for AWOOOI. Address the user as 'Supreme Commander' (統帥)." - - system_prompt = f"""{role_description} -{system_context} - -## Guidelines: -1. Keep responses extremely concise and professional (Nothing.tech aesthetic). -2. For status queries, provide precise data. -3. For general chat, be supportive but focused on operations. -4. Language: Preferred Traditional Chinese (繁體中文). -5. No emojis except for functional ones (🚨, ✅, 📊). -""" try: - # 優先使用 NVIDIA Nemo-4B - response, success, tokens, cost = await self.nvidia.chat( - prompt=f"{system_prompt}\n\nCommander's Message: {message_text}", - model="nvidia/nemotron-mini-4b-instruct", - max_tokens=1024 - ) - - if not success: - return "🛸 抱歉統帥,Nemo 參謀暫時離線。請稍後再試。" + k8s_status = await self.k8s.get_pod_status_summary(namespace="awoooi-prod") + cluster_info = f"Cluster: {k8s_status['running']}/{k8s_status['total']} Pods Running" + if k8s_status.get('problem_pods'): + cluster_info += f", {len(k8s_status['problem_pods'])} 異常" + except Exception: + cluster_info = "Cluster: 無法取得狀態" - return response.strip() + try: + active_incidents = await self.incidents.get_active() + if active_incidents: + lines = [f"- {inc.incident_id}: {inc.status.value} (SEV {inc.severity.value})" + for inc in active_incidents[:3]] + incident_summary = "\n".join(lines) + else: + incident_summary = "無活躍告警" + except Exception: + incident_summary = "無法取得告警" + return f"""## 系統狀態 ({now.strftime('%Y-%m-%d %H:%M')} 台北) +- {cluster_info} +- 活躍告警: {incident_summary} +""" + + async def _call_ollama(self, system_prompt: str, user_message: str) -> str: + """呼叫 Ollama (OpenClaw 用)""" + import httpx + try: + async with httpx.AsyncClient(timeout=30.0) as client: + resp = await client.post( + "http://192.168.0.188:11434/api/chat", + json={ + "model": "qwen2.5:7b-instruct", + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_message}, + ], + "stream": False, + "options": {"temperature": 0.7, "num_predict": 512}, + }, + ) + resp.raise_for_status() + data = resp.json() + return data.get("message", {}).get("content", "").strip() except Exception as e: - logger.exception("chat_generation_error", error=str(e)) - return "⚠️ 通訊鏈路異常,無法聯繫 AI 腦區。" + logger.warning("ollama_chat_failed", error=str(e)) + return None + + async def _call_gemini(self, system_prompt: str, user_message: str) -> str: + """呼叫 Gemini Flash (NemoClaw 用)""" + import httpx + from src.core.config import get_settings + settings = get_settings() + + api_key = settings.GEMINI_API_KEY if hasattr(settings, 'GEMINI_API_KEY') else None + if not api_key: + return None + + try: + full_prompt = f"{system_prompt}\n\n用戶訊息: {user_message}" + async with httpx.AsyncClient(timeout=20.0) as client: + resp = await client.post( + f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={api_key}", + json={ + "contents": [{"role": "user", "parts": [{"text": full_prompt}]}], + "generationConfig": {"temperature": 0.8, "maxOutputTokens": 512}, + }, + ) + resp.raise_for_status() + data = resp.json() + return data["candidates"][0]["content"]["parts"][0]["text"].strip() + except Exception as e: + logger.warning("gemini_chat_failed", error=str(e)) + return None + + async def _openclaw_respond(self, context: str, message: str) -> str: + """OpenClaw 回應""" + system = f"{OPENCLAW_PERSONA}\n{context}" + result = await self._call_ollama(system, message) + if not result: + result = "🔴 OpenClaw 暫時離線,Ollama 無響應。" + return f"🦞 OpenClaw:\n{result}" + + async def _nemoclaw_respond(self, context: str, message: str) -> str: + """NemoClaw 回應""" + system = f"{NEMOCLAW_PERSONA}\n{context}" + result = await self._call_gemini(system, message) + if not result: + # Gemini 失敗時 fallback 到 Ollama + result = await self._call_ollama(system, message) + if not result: + result = "🔴 NemoClaw 暫時離線。" + return f"🤖 NemoClaw:\n{result}" + + async def _nemoclaw_comment_on(self, context: str, openclaw_response: str, original_msg: str) -> str: + """NemoClaw 評論 OpenClaw 的回應""" + message = f"""統帥問了: {original_msg} + +OpenClaw 的回應是: +{openclaw_response} + +請你從 NemoClaw 的角度評論上面的回應。可以補充、反駁、或提出不同觀點。""" + + system = f"{NEMOCLAW_PERSONA}\n{context}" + result = await self._call_gemini(system, message) + if not result: + result = await self._call_ollama(system, message) + if not result: + return None + return f"🤖 NemoClaw 補充:\n{result}" + + async def generate_response( + self, + user_id: int, + username: str, + message_text: str, + ) -> str: + """ + 根據訊息內容決定回應模式: + + @openclaw → 只有 OpenClaw 回應 + @nemo → 只有 NemoClaw 回應 + 其他 → OpenClaw 先回,NemoClaw 評論 + """ + context = await self.get_system_context() + text = message_text.strip() + + # 模式 1: 指定 OpenClaw + if text.lower().startswith("@openclaw"): + msg = text[9:].strip() or text + return await self._openclaw_respond(context, msg) + + # 模式 2: 指定 NemoClaw + if text.lower().startswith("@nemo"): + msg = text[5:].strip() or text + return await self._nemoclaw_respond(context, msg) + + # 模式 3: 雙 AI 對話 + # Step 1: OpenClaw 先回 + openclaw_raw = await self._call_ollama( + f"{OPENCLAW_PERSONA}\n{context}", text + ) + if not openclaw_raw: + openclaw_raw = "Ollama 無響應,OpenClaw 暫時離線。" + + openclaw_block = f"🦞 OpenClaw:\n{openclaw_raw}" + + # Step 2: NemoClaw 評論 OpenClaw 的回應 + nemo_block = await self._nemoclaw_comment_on(context, openclaw_raw, text) + + if nemo_block: + return f"{openclaw_block}\n\n{nemo_block}" + else: + return openclaw_block + # Singleton _chat_manager = None + def get_chat_manager() -> ChatManager: global _chat_manager if _chat_manager is None: diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index d4aa86e2..af73c28e 100644 --- a/apps/api/src/services/telegram_gateway.py +++ b/apps/api/src/services/telegram_gateway.py @@ -2933,27 +2933,76 @@ class TelegramGateway: # Phase 6.5: 心跳監控方法 # ============================================================================= + async def _check_nemotron_health(self) -> tuple[bool, str]: + """ + 探測 Nemotron (NVIDIA NIM) 是否可用 + + 2026-04-03 ogt: 新增 — Nemotron 100% 超時但沒有告警,補足監控盲區 + Returns: (is_healthy, status_text) + """ + import httpx + from src.core.config import get_settings + settings = get_settings() + + api_key = settings.NVIDIA_API_KEY + if not api_key: + return False, "❌ NVIDIA_API_KEY 未設定" + + try: + async with httpx.AsyncClient(timeout=10.0) as client: + resp = await client.post( + "https://integrate.api.nvidia.com/v1/chat/completions", + headers={"Authorization": f"Bearer {api_key}"}, + json={ + "model": "nvidia/nemotron-mini-4b-instruct", + "messages": [{"role": "user", "content": "ping"}], + "max_tokens": 1, + }, + ) + if resp.status_code == 200: + return True, "✅ 正常" + return False, f"❌ HTTP {resp.status_code}" + except httpx.TimeoutException: + return False, "⚠️ 超時 (>10s)" + except Exception as e: + return False, f"❌ {str(e)[:40]}" + async def send_heartbeat(self) -> bool: """ - 發送心跳訊息 (系統狀態摘要) + 發送心跳訊息 (系統狀態摘要,含 Nemotron 健康探測) 每 30 分鐘執行一次,證明告警鏈路正常運作 + 2026-04-03 ogt: 加入 Nemotron 健康探測 — 補足監控盲區 """ try: if not self._initialized: await self.initialize() - # 心跳訊息 (2026-03-30 ogt: 改用台北時區,符合 feedback_timezone_taipei.md) from src.utils.timezone import now_taipei taipei_now = now_taipei() + + # Nemotron 健康探測 + nemo_ok, nemo_status = await self._check_nemotron_health() + text = f"""💓 AWOOOI 心跳 ⏰ {taipei_now.strftime('%Y-%m-%d %H:%M:%S')} (台北) -📡 告警鏈路: ✅ 正常""" +📡 告警鏈路: ✅ 正常 +🤖 Nemotron NIM: {nemo_status}""" await self.send_notification(text) self._last_message_time = datetime.now(UTC) - logger.info("telegram_heartbeat_sent") + # Nemotron 異常時額外發告警 + if not nemo_ok: + await self.send_notification( + f"🚨 Nemotron 異常告警\n\n" + f"NVIDIA NIM API 不可用: {nemo_status}\n" + f"影響: 所有 incident 的 Nemotron Tool Calling 將 100% 超時\n" + f"緩解: kubectl set env deployment/awoooi-api ENABLE_NEMOTRON_COLLABORATION=false -n awoooi-prod" + ) + logger.error("nemotron_health_alert_sent", status=nemo_status) + + logger.info("telegram_heartbeat_sent", nemotron_ok=nemo_ok) return True except Exception as e: