feat(chat+monitor): 雙 AI 對話重寫 + Nemotron 健康監控告警
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 6m56s
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 6m56s
ChatManager 重寫 (Phase 22.6):
- @openclaw <msg> → 只有 OpenClaw 回應 (Ollama qwen2.5:7b)
- @nemo <msg> → 只有 NemoClaw 回應 (Gemini Flash)
- 無前綴 → OpenClaw 先答,NemoClaw 評論/反駁

NemoClaw 改用 Gemini Flash (棄 NIM nemotron-mini-4b 因為 15s+ 回應時間)

TelegramGateway 心跳新增 Nemotron 健康探測:
- 每次心跳探測 NVIDIA NIM API (10s timeout)
- 異常時立刻發 Telegram 告警 + 緩解指令
- 補足 Nemotron 100% 超時卻無告警的監控盲區

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,115 +1,211 @@
|
||||
"""
|
||||
AWOOOI Chat Manager - 統帥對話核心
|
||||
===================================
|
||||
Phase 21.5: 實作 Telegram 互動對話功能
|
||||
AWOOOI Chat Manager - 雙 AI 對話核心
|
||||
======================================
|
||||
Phase 21.5 初版: 2026-03-31 ogt
|
||||
Phase 22.6 重寫: 2026-04-03 ogt (統帥需求: 雙 AI 互動對話)
|
||||
|
||||
職責:
|
||||
1. 整合系統上下文 (K3s 狀態, 最近告警, 目前時間)
|
||||
2. 決定對話風格 (OpenClaw 專業風 vs Nemo 參謀風)
|
||||
3. 調用 LLM (Nemo-4B / Gemini) 產出回應
|
||||
4. 遵守 SOUL.md Nothing.tech 純淨美學
|
||||
功能:
|
||||
1. @openclaw / @nemo 路由 — 指定 AI 回應
|
||||
2. 無前綴 — 兩個 AI 輪流回應,並互相評論
|
||||
3. AI 互相對話 — NemoClaw 看到 OpenClaw 的回應後可補充/反駁
|
||||
|
||||
2026-03-31 ogt: 初版建立
|
||||
架構:
|
||||
- OpenClaw: 用 Ollama qwen2.5:7b-instruct (本地, 快)
|
||||
- NemoClaw: 用 Gemini Flash (雲端, 快) — NIM nemotron-mini 太慢 (15s+)
|
||||
"""
|
||||
|
||||
import structlog
|
||||
from datetime import datetime
|
||||
from src.utils.timezone import now_taipei
|
||||
from src.services.nvidia_provider import get_nvidia_provider
|
||||
from src.repositories.k8s_repository import get_k8s_repository
|
||||
from src.repositories.incident_repository import get_incident_repository
|
||||
|
||||
logger = structlog.get_logger(__name__)
|
||||
|
||||
# 人格設定
|
||||
OPENCLAW_PERSONA = """你是 OpenClaw,AWOOOI 平台的 SRE AI 主帥。
|
||||
個性: 精準、果斷、專業,像老將一樣直接給出建議。
|
||||
語氣: 簡短有力,不廢話。繁體中文回應。
|
||||
當 NemoClaw 有不同意見時,你會直接反駁或接受,不拐彎抹角。
|
||||
"""
|
||||
|
||||
NEMOCLAW_PERSONA = """你是 NemoClaw,AWOOOI 平台的 AI 戰術參謀。
|
||||
個性: 分析型、喜歡從不同角度思考,會質疑假設。
|
||||
語氣: 帶點挑釁但建設性,繁體中文回應。
|
||||
當 OpenClaw 給出意見時,你會評估是否同意,必要時提出替代方案。
|
||||
"""
|
||||
|
||||
|
||||
class ChatManager:
|
||||
"""
|
||||
AWOOOI 對話管理器 - 系統的大腦與聲帶
|
||||
"""
|
||||
"""AWOOOI 雙 AI 對話管理器"""
|
||||
|
||||
def __init__(self):
|
||||
self.nvidia = get_nvidia_provider()
|
||||
self.k8s = get_k8s_repository()
|
||||
self.incidents = get_incident_repository()
|
||||
|
||||
async def get_system_context(self) -> str:
|
||||
"""
|
||||
收集系統即時上下文,供 LLM 參考
|
||||
"""
|
||||
"""收集系統即時上下文"""
|
||||
now = now_taipei()
|
||||
|
||||
# 1. K3s 狀態
|
||||
k8s_status = await self.k8s.get_pod_status_summary(namespace="awoooi-prod")
|
||||
cluster_info = f"Cluster: {k8s_status['running']}/{k8s_status['total']} Pods Running"
|
||||
if k8s_status['problem_pods']:
|
||||
cluster_info += f", {len(k8s_status['problem_pods'])} anomalies detected."
|
||||
|
||||
# 2. 最近告警 (取 3 筆)
|
||||
active_incidents = await self.incidents.get_active()
|
||||
incident_summary = "None"
|
||||
if active_incidents:
|
||||
lines = []
|
||||
for inc in active_incidents[:3]:
|
||||
lines.append(f"- {inc.incident_id}: {inc.status.value} (Severity: {inc.severity.value})")
|
||||
incident_summary = "\n".join(lines)
|
||||
|
||||
context = f"""
|
||||
## Current System Context (Taipei Time: {now.strftime('%Y-%m-%d %H:%M:%S')})
|
||||
- Environment: AWOOOI Production (K3s)
|
||||
- {cluster_info}
|
||||
- Active Incidents:
|
||||
{incident_summary}
|
||||
"""
|
||||
return context
|
||||
|
||||
async def generate_response(
|
||||
self,
|
||||
user_id: int,
|
||||
username: str,
|
||||
message_text: str
|
||||
) -> str:
|
||||
"""
|
||||
根據統帥訊息產生回覆
|
||||
"""
|
||||
system_context = await self.get_system_context()
|
||||
|
||||
# 判定是否在跟 Nemo 對話
|
||||
is_asking_nemo = "nemo" in message_text.lower()
|
||||
|
||||
role_description = "You are OpenClaw, the AI operations assistant for AWOOOI platform."
|
||||
if is_asking_nemo:
|
||||
role_description = "You are Nemo-4B, the elite AI tactical advisor for AWOOOI. Address the user as 'Supreme Commander' (統帥)."
|
||||
|
||||
system_prompt = f"""{role_description}
|
||||
{system_context}
|
||||
|
||||
## Guidelines:
|
||||
1. Keep responses extremely concise and professional (Nothing.tech aesthetic).
|
||||
2. For status queries, provide precise data.
|
||||
3. For general chat, be supportive but focused on operations.
|
||||
4. Language: Preferred Traditional Chinese (繁體中文).
|
||||
5. No emojis except for functional ones (🚨, ✅, 📊).
|
||||
"""
|
||||
|
||||
try:
|
||||
# 優先使用 NVIDIA Nemo-4B
|
||||
response, success, tokens, cost = await self.nvidia.chat(
|
||||
prompt=f"{system_prompt}\n\nCommander's Message: {message_text}",
|
||||
model="nvidia/nemotron-mini-4b-instruct",
|
||||
max_tokens=1024
|
||||
)
|
||||
|
||||
if not success:
|
||||
return "🛸 抱歉統帥,Nemo 參謀暫時離線。請稍後再試。"
|
||||
k8s_status = await self.k8s.get_pod_status_summary(namespace="awoooi-prod")
|
||||
cluster_info = f"Cluster: {k8s_status['running']}/{k8s_status['total']} Pods Running"
|
||||
if k8s_status.get('problem_pods'):
|
||||
cluster_info += f", {len(k8s_status['problem_pods'])} 異常"
|
||||
except Exception:
|
||||
cluster_info = "Cluster: 無法取得狀態"
|
||||
|
||||
return response.strip()
|
||||
try:
|
||||
active_incidents = await self.incidents.get_active()
|
||||
if active_incidents:
|
||||
lines = [f"- {inc.incident_id}: {inc.status.value} (SEV {inc.severity.value})"
|
||||
for inc in active_incidents[:3]]
|
||||
incident_summary = "\n".join(lines)
|
||||
else:
|
||||
incident_summary = "無活躍告警"
|
||||
except Exception:
|
||||
incident_summary = "無法取得告警"
|
||||
|
||||
return f"""## 系統狀態 ({now.strftime('%Y-%m-%d %H:%M')} 台北)
|
||||
- {cluster_info}
|
||||
- 活躍告警: {incident_summary}
|
||||
"""
|
||||
|
||||
async def _call_ollama(self, system_prompt: str, user_message: str) -> str | None:
    """Send one chat turn to the Ollama server (used for OpenClaw).

    Args:
        system_prompt: Text sent as the ``system`` message.
        user_message: Text sent as the ``user`` message.

    Returns:
        The stripped reply text, or ``None`` when the request or response
        parsing fails. A response without message content yields ``""``.
        Callers in this class test ``if not result`` and substitute their
        own fallback text, so both cases are treated as "no answer".
    """
    import httpx

    payload = {
        "model": "qwen2.5:7b-instruct",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ],
        "stream": False,
        "options": {"temperature": 0.7, "num_predict": 512},
    }
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            resp = await client.post(
                "http://192.168.0.188:11434/api/chat",
                json=payload,
            )
            resp.raise_for_status()
            data = resp.json()
            return data.get("message", {}).get("content", "").strip()
    except Exception as e:
        # Deliberate best-effort boundary: report the failure as None (not as
        # a user-facing error string) so the callers' `if not result`
        # fallbacks are actually reachable.
        logger.warning("ollama_chat_failed", error=str(e))
        return None
|
||||
|
||||
async def _call_gemini(self, system_prompt: str, user_message: str) -> str | None:
    """Send one prompt to Gemini Flash (used for NemoClaw).

    The system prompt and the user message are concatenated into a single
    user turn.

    Args:
        system_prompt: Persona plus context text placed before the message.
        user_message: The message to answer.

    Returns:
        The stripped text of the first candidate, or ``None`` when no
        ``GEMINI_API_KEY`` is configured or the request/parsing fails.
    """
    import httpx
    from src.core.config import get_settings

    api_key = getattr(get_settings(), "GEMINI_API_KEY", None)
    if not api_key:
        return None

    full_prompt = f"{system_prompt}\n\n用戶訊息: {user_message}"
    try:
        async with httpx.AsyncClient(timeout=20.0) as client:
            resp = await client.post(
                "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent",
                # Send the key as a header, not as `?key=` in the URL: HTTP
                # error messages include the request URL and are logged
                # below, which would write the key into the logs.
                headers={"x-goog-api-key": api_key},
                json={
                    "contents": [{"role": "user", "parts": [{"text": full_prompt}]}],
                    "generationConfig": {"temperature": 0.8, "maxOutputTokens": 512},
                },
            )
            resp.raise_for_status()
            data = resp.json()
            return data["candidates"][0]["content"]["parts"][0]["text"].strip()
    except Exception as e:
        # Best-effort: any transport, HTTP or response-shape error means
        # "no answer"; callers fall back to Ollama or an offline notice.
        logger.warning("gemini_chat_failed", error=str(e))
        return None
|
||||
|
||||
async def _openclaw_respond(self, context: str, message: str) -> str:
    """Produce OpenClaw's reply block for a single message.

    Args:
        context: System context text appended to the OpenClaw persona.
        message: The message OpenClaw should answer.

    Returns:
        A string starting with the ``🦞 <b>OpenClaw:</b>`` header followed by
        the model reply, or by an offline notice when Ollama gave no answer.
    """
    import html

    system = f"{OPENCLAW_PERSONA}\n{context}"
    reply = await self._call_ollama(system, message)
    if reply:
        # The header uses HTML markup (presumably sent with Telegram's HTML
        # parse mode — confirm at the send site), so model output must not
        # inject "<", ">" or "&" that would break entity parsing.
        body = html.escape(reply, quote=False)
    else:
        body = "🔴 OpenClaw 暫時離線,Ollama 無響應。"
    return f"🦞 <b>OpenClaw:</b>\n{body}"
|
||||
|
||||
async def _nemoclaw_respond(self, context: str, message: str) -> str:
    """Produce NemoClaw's reply block for a single message.

    Gemini is tried first; when it yields nothing, the same prompt is sent
    to Ollama as a fallback.

    Args:
        context: System context text appended to the NemoClaw persona.
        message: The message NemoClaw should answer.

    Returns:
        A string starting with the ``🤖 <b>NemoClaw:</b>`` header followed by
        the model reply, or by an offline notice when both backends failed.
    """
    import html

    system = f"{NEMOCLAW_PERSONA}\n{context}"
    reply = await self._call_gemini(system, message)
    if not reply:
        # Gemini unavailable or unconfigured: fall back to the local model.
        reply = await self._call_ollama(system, message)

    if reply:
        # The header uses HTML markup (presumably sent with Telegram's HTML
        # parse mode — confirm at the send site); escape model output so
        # stray "<", ">" or "&" cannot break entity parsing.
        body = html.escape(reply, quote=False)
    else:
        body = "🔴 NemoClaw 暫時離線。"
    return f"🤖 <b>NemoClaw:</b>\n{body}"
|
||||
|
||||
async def _nemoclaw_comment_on(
    self, context: str, openclaw_response: str, original_msg: str
) -> str | None:
    """Ask NemoClaw to comment on OpenClaw's answer.

    Gemini is tried first, then Ollama as a fallback.

    Args:
        context: System context text appended to the NemoClaw persona.
        openclaw_response: OpenClaw's raw (unformatted) answer.
        original_msg: The message that OpenClaw was answering.

    Returns:
        A string starting with the ``🤖 <b>NemoClaw 補充:</b>`` header, or
        ``None`` when neither backend produced a comment.
    """
    import html

    message = (
        f"統帥問了: {original_msg}\n"
        "\n"
        "OpenClaw 的回應是:\n"
        f"{openclaw_response}\n"
        "\n"
        "請你從 NemoClaw 的角度評論上面的回應。可以補充、反駁、或提出不同觀點。"
    )
    system = f"{NEMOCLAW_PERSONA}\n{context}"

    comment = await self._call_gemini(system, message)
    if not comment:
        comment = await self._call_ollama(system, message)
    if not comment:
        return None

    # The header uses HTML markup (presumably sent with Telegram's HTML parse
    # mode — confirm at the send site); escape model output so stray "<",
    # ">" or "&" cannot break entity parsing.
    return f"🤖 <b>NemoClaw 補充:</b>\n{html.escape(comment, quote=False)}"
|
||||
|
||||
async def generate_response(
    self,
    user_id: int,
    username: str,
    message_text: str,
) -> str:
    """Route a message to one or both AIs and return the formatted reply.

    Routing, decided by the (case-insensitive) start of the message:

    * ``@openclaw <msg>`` → only OpenClaw answers.
    * ``@nemo <msg>`` or ``@nemoclaw <msg>`` → only NemoClaw answers.
    * anything else → OpenClaw answers first, then NemoClaw comments on
      that answer.

    When a mention has no text after it, the whole message (the mention
    itself) is forwarded instead of an empty string.

    Args:
        user_id: Sender id (not used by the routing itself).
        username: Sender name (not used by the routing itself).
        message_text: The raw incoming message.

    Returns:
        The reply text, containing one or two AI blocks.
    """
    import html

    context = await self.get_system_context()
    text = message_text.strip()
    lowered = text.lower()

    # "@nemoclaw" must be tested before "@nemo": otherwise "@nemoclaw hi"
    # matches the shorter prefix and "claw hi" is forwarded as the message.
    routes = (
        ("@openclaw", self._openclaw_respond),
        ("@nemoclaw", self._nemoclaw_respond),
        ("@nemo", self._nemoclaw_respond),
    )
    for prefix, responder in routes:
        if lowered.startswith(prefix):
            msg = text[len(prefix):].strip() or text
            return await responder(context, msg)

    # Dual-AI mode, step 1: OpenClaw answers first.
    openclaw_raw = await self._call_ollama(f"{OPENCLAW_PERSONA}\n{context}", text)
    if openclaw_raw:
        # Escape only the displayed copy; NemoClaw still receives the raw
        # text. The header uses HTML markup (presumably sent with Telegram's
        # HTML parse mode — confirm at the send site).
        openclaw_shown = html.escape(openclaw_raw, quote=False)
    else:
        openclaw_raw = openclaw_shown = "Ollama 無響應,OpenClaw 暫時離線。"
    openclaw_block = f"🦞 <b>OpenClaw:</b>\n{openclaw_shown}"

    # Step 2: NemoClaw comments on OpenClaw's answer (None when unavailable).
    nemo_block = await self._nemoclaw_comment_on(context, openclaw_raw, text)
    if nemo_block:
        return f"{openclaw_block}\n\n{nemo_block}"
    return openclaw_block
|
||||
|
||||
|
||||
# Singleton
|
||||
_chat_manager = None
|
||||
|
||||
|
||||
def get_chat_manager() -> ChatManager:
|
||||
global _chat_manager
|
||||
if _chat_manager is None:
|
||||
|
||||
@@ -2933,27 +2933,76 @@ class TelegramGateway:
|
||||
# Phase 6.5: 心跳監控方法
|
||||
# =============================================================================
|
||||
|
||||
async def _check_nemotron_health(self) -> tuple[bool, str]:
    """Probe whether Nemotron on the NVIDIA NIM API is usable.

    Sends a minimal one-token chat completion request with a 10 second
    timeout.

    2026-04-03 ogt: added because Nemotron was timing out 100% of the time
    without any alert being raised (monitoring blind spot).

    Returns:
        ``(is_healthy, status_text)``. ``is_healthy`` is ``True`` only for
        an HTTP 200 response; ``status_text`` is a short human-readable
        status for display in the heartbeat message.
    """
    import httpx
    from src.core.config import get_settings

    api_key = get_settings().NVIDIA_API_KEY
    if not api_key:
        return False, "❌ NVIDIA_API_KEY 未設定"

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(
                "https://integrate.api.nvidia.com/v1/chat/completions",
                headers={"Authorization": f"Bearer {api_key}"},
                json={
                    "model": "nvidia/nemotron-mini-4b-instruct",
                    "messages": [{"role": "user", "content": "ping"}],
                    "max_tokens": 1,
                },
            )
    except httpx.TimeoutException:
        return False, "⚠️ 超時 (>10s)"
    except Exception as e:
        # Some exceptions stringify to "", which would leave a bare "❌ " in
        # the alert; fall back to the exception class name in that case.
        detail = str(e) or type(e).__name__
        return False, f"❌ {detail[:40]}"

    if resp.status_code == 200:
        return True, "✅ 正常"
    return False, f"❌ HTTP {resp.status_code}"
|
||||
|
||||
async def send_heartbeat(self) -> bool:
|
||||
"""
|
||||
發送心跳訊息 (系統狀態摘要)
|
||||
發送心跳訊息 (系統狀態摘要,含 Nemotron 健康探測)
|
||||
|
||||
每 30 分鐘執行一次,證明告警鏈路正常運作
|
||||
2026-04-03 ogt: 加入 Nemotron 健康探測 — 補足監控盲區
|
||||
"""
|
||||
try:
|
||||
if not self._initialized:
|
||||
await self.initialize()
|
||||
|
||||
# 心跳訊息 (2026-03-30 ogt: 改用台北時區,符合 feedback_timezone_taipei.md)
|
||||
from src.utils.timezone import now_taipei
|
||||
taipei_now = now_taipei()
|
||||
|
||||
# Nemotron 健康探測
|
||||
nemo_ok, nemo_status = await self._check_nemotron_health()
|
||||
|
||||
text = f"""💓 <b>AWOOOI 心跳</b>
|
||||
⏰ {taipei_now.strftime('%Y-%m-%d %H:%M:%S')} (台北)
|
||||
📡 告警鏈路: ✅ 正常"""
|
||||
📡 告警鏈路: ✅ 正常
|
||||
🤖 Nemotron NIM: {nemo_status}"""
|
||||
|
||||
await self.send_notification(text)
|
||||
self._last_message_time = datetime.now(UTC)
|
||||
|
||||
logger.info("telegram_heartbeat_sent")
|
||||
# Nemotron 異常時額外發告警
|
||||
if not nemo_ok:
|
||||
await self.send_notification(
|
||||
f"🚨 <b>Nemotron 異常告警</b>\n\n"
|
||||
f"NVIDIA NIM API 不可用: <code>{nemo_status}</code>\n"
|
||||
f"影響: 所有 incident 的 Nemotron Tool Calling 將 100% 超時\n"
|
||||
f"緩解: <code>kubectl set env deployment/awoooi-api ENABLE_NEMOTRON_COLLABORATION=false -n awoooi-prod</code>"
|
||||
)
|
||||
logger.error("nemotron_health_alert_sent", status=nemo_status)
|
||||
|
||||
logger.info("telegram_heartbeat_sent", nemotron_ok=nemo_ok)
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user