feat(chat+monitor): 雙 AI 對話重寫 + Nemotron 健康監控告警
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 6m56s

ChatManager 重寫 (Phase 22.6):
- @openclaw <msg> → 只有 OpenClaw 回應 (Ollama qwen2.5:7b)
- @nemo <msg>     → 只有 NemoClaw 回應 (Gemini Flash)
- 無前綴           → OpenClaw 先答,NemoClaw 評論/反駁

NemoClaw 改用 Gemini Flash (棄 NIM nemotron-mini-4b 因為 15s+ 回應時間)

TelegramGateway 心跳新增 Nemotron 健康探測:
- 每次心跳探測 NVIDIA NIM API (10s timeout)
- 異常時立刻發 Telegram 告警 + 緩解指令
- 補足 Nemotron 100% 超時卻無告警的監控盲區

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-03 14:59:06 +08:00
parent c1834a7156
commit d9007e6855
2 changed files with 232 additions and 87 deletions

View File

@@ -1,115 +1,211 @@
"""
AWOOOI Chat Manager - 統帥對話核心
===================================
Phase 21.5: 實作 Telegram 互動對話功能
AWOOOI Chat Manager - 雙 AI 對話核心
======================================
Phase 21.5 初版: 2026-03-31 ogt
Phase 22.6 重寫: 2026-04-03 ogt (統帥需求: 雙 AI 互動對話)
職責:
1. 整合系統上下文 (K3s 狀態, 最近告警, 目前時間)
2. 決定對話風格 (OpenClaw 專業風 vs Nemo 參謀風)
3. 調用 LLM (Nemo-4B / Gemini) 產出回應
4. 遵守 SOUL.md Nothing.tech 純淨美學
功能:
1. @openclaw / @nemo 路由 — 指定 AI 回應
2. 無前綴 — 兩個 AI 輪流回應,並互相評論
3. AI 互相對話 — NemoClaw 看到 OpenClaw 的回應後可補充/反駁
2026-03-31 ogt: 初版建立
架構:
- OpenClaw: 用 Ollama qwen2.5:7b-instruct (本地, 快)
- NemoClaw: 用 Gemini Flash (雲端, 快) — NIM nemotron-mini 太慢 (15s+)
"""
import structlog
from datetime import datetime
from src.utils.timezone import now_taipei
from src.services.nvidia_provider import get_nvidia_provider
from src.repositories.k8s_repository import get_k8s_repository
from src.repositories.incident_repository import get_incident_repository
logger = structlog.get_logger(__name__)
# Persona definitions.
# Each constant is a prompt preamble (written in Traditional Chinese) that is
# joined with the live system context to form the system prompt for one AI.
# The text below is sent to the model verbatim, so it is runtime data: do not
# reformat or translate it.

# OpenClaw: the decisive, terse SRE lead persona.
OPENCLAW_PERSONA = """你是 OpenClawAWOOOI 平台的 SRE AI 主帥。
個性: 精準、果斷、專業,像老將一樣直接給出建議。
語氣: 簡短有力,不廢話。繁體中文回應。
當 NemoClaw 有不同意見時,你會直接反駁或接受,不拐彎抹角。
"""
# NemoClaw: the analytical adviser persona that challenges OpenClaw's answers.
NEMOCLAW_PERSONA = """你是 NemoClawAWOOOI 平台的 AI 戰術參謀。
個性: 分析型、喜歡從不同角度思考,會質疑假設。
語氣: 帶點挑釁但建設性,繁體中文回應。
當 OpenClaw 給出意見時,你會評估是否同意,必要時提出替代方案。
"""
class ChatManager:
"""
AWOOOI 對話管理器 - 系統的大腦與聲帶
"""
"""AWOOOI 雙 AI 對話管理器"""
def __init__(self):
    """Bind the provider and repository accessors used by the manager."""
    # Each accessor is a module-level factory imported at the top of the file.
    self.nvidia = get_nvidia_provider()
    self.k8s = get_k8s_repository()
    self.incidents = get_incident_repository()
async def get_system_context(self) -> str:
"""
收集系統即時上下文,供 LLM 參考
"""
"""收集系統即時上下文"""
now = now_taipei()
# 1. K3s 狀態
k8s_status = await self.k8s.get_pod_status_summary(namespace="awoooi-prod")
cluster_info = f"Cluster: {k8s_status['running']}/{k8s_status['total']} Pods Running"
if k8s_status['problem_pods']:
cluster_info += f", {len(k8s_status['problem_pods'])} anomalies detected."
# 2. 最近告警 (取 3 筆)
active_incidents = await self.incidents.get_active()
incident_summary = "None"
if active_incidents:
lines = []
for inc in active_incidents[:3]:
lines.append(f"- {inc.incident_id}: {inc.status.value} (Severity: {inc.severity.value})")
incident_summary = "\n".join(lines)
context = f"""
## Current System Context (Taipei Time: {now.strftime('%Y-%m-%d %H:%M:%S')})
- Environment: AWOOOI Production (K3s)
- {cluster_info}
- Active Incidents:
{incident_summary}
"""
return context
async def generate_response(
self,
user_id: int,
username: str,
message_text: str
) -> str:
"""
根據統帥訊息產生回覆
"""
system_context = await self.get_system_context()
# 判定是否在跟 Nemo 對話
is_asking_nemo = "nemo" in message_text.lower()
role_description = "You are OpenClaw, the AI operations assistant for AWOOOI platform."
if is_asking_nemo:
role_description = "You are Nemo-4B, the elite AI tactical advisor for AWOOOI. Address the user as 'Supreme Commander' (統帥)."
system_prompt = f"""{role_description}
{system_context}
## Guidelines:
1. Keep responses extremely concise and professional (Nothing.tech aesthetic).
2. For status queries, provide precise data.
3. For general chat, be supportive but focused on operations.
4. Language: Preferred Traditional Chinese (繁體中文).
5. No emojis except for functional ones (🚨, ✅, 📊).
"""
try:
# 優先使用 NVIDIA Nemo-4B
response, success, tokens, cost = await self.nvidia.chat(
prompt=f"{system_prompt}\n\nCommander's Message: {message_text}",
model="nvidia/nemotron-mini-4b-instruct",
max_tokens=1024
)
if not success:
return "🛸 抱歉統帥Nemo 參謀暫時離線。請稍後再試。"
k8s_status = await self.k8s.get_pod_status_summary(namespace="awoooi-prod")
cluster_info = f"Cluster: {k8s_status['running']}/{k8s_status['total']} Pods Running"
if k8s_status.get('problem_pods'):
cluster_info += f", {len(k8s_status['problem_pods'])} 異常"
except Exception:
cluster_info = "Cluster: 無法取得狀態"
return response.strip()
try:
active_incidents = await self.incidents.get_active()
if active_incidents:
lines = [f"- {inc.incident_id}: {inc.status.value} (SEV {inc.severity.value})"
for inc in active_incidents[:3]]
incident_summary = "\n".join(lines)
else:
incident_summary = "無活躍告警"
except Exception:
incident_summary = "無法取得告警"
return f"""## 系統狀態 ({now.strftime('%Y-%m-%d %H:%M')} 台北)
- {cluster_info}
- 活躍告警: {incident_summary}
"""
async def _call_ollama(self, system_prompt: str, user_message: str) -> str | None:
    """Send one chat turn to the Ollama server (the OpenClaw backend).

    Args:
        system_prompt: Persona text plus system context; sent as the
            ``system`` message.
        user_message: The text to answer; sent as the ``user`` message.

    Returns:
        The stripped reply text, or ``None`` when the request fails or the
        reply is empty. Callers test the result with ``if not result`` to
        choose a fallback, so a failure must never be reported as a
        non-empty string.
    """
    import httpx

    payload = {
        "model": "qwen2.5:7b-instruct",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ],
        # Non-streaming: the whole reply arrives in a single JSON document.
        "stream": False,
        "options": {"temperature": 0.7, "num_predict": 512},
    }
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            # NOTE(review): the endpoint is hard-coded to a LAN address;
            # consider moving it to configuration.
            resp = await client.post(
                "http://192.168.0.188:11434/api/chat",
                json=payload,
            )
            resp.raise_for_status()
            data = resp.json()
            content = data.get("message", {}).get("content", "").strip()
            return content or None
    except Exception as e:
        # Best-effort call: log and signal failure with None so that the
        # caller's offline/fallback handling is actually reached.
        logger.warning("ollama_chat_failed", error=str(e))
        return None
async def _call_gemini(self, system_prompt: str, user_message: str) -> str | None:
    """Send one prompt to Gemini Flash (the NemoClaw backend).

    The system prompt and the user message are merged into a single user
    turn before being sent.

    Args:
        system_prompt: Persona text plus system context.
        user_message: The text to answer.

    Returns:
        The stripped text of the first candidate, or ``None`` when no
        ``GEMINI_API_KEY`` is configured, the request fails, or the response
        does not have the expected shape.
    """
    import httpx
    from src.core.config import get_settings

    api_key = getattr(get_settings(), "GEMINI_API_KEY", None)
    if not api_key:
        return None

    full_prompt = f"{system_prompt}\n\n用戶訊息: {user_message}"
    try:
        async with httpx.AsyncClient(timeout=20.0) as client:
            resp = await client.post(
                "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent",
                # The key travels in a header rather than the query string:
                # httpx error messages include the request URL, so a key in
                # the URL would end up in the warning log below.
                headers={"x-goog-api-key": api_key},
                json={
                    "contents": [{"role": "user", "parts": [{"text": full_prompt}]}],
                    "generationConfig": {"temperature": 0.8, "maxOutputTokens": 512},
                },
            )
            resp.raise_for_status()
            data = resp.json()
            return data["candidates"][0]["content"]["parts"][0]["text"].strip()
    except Exception as e:
        # Covers transport errors, non-2xx statuses and unexpected response
        # shapes (missing candidates/parts); the caller falls back on None.
        logger.warning("gemini_chat_failed", error=str(e))
        return None
async def _openclaw_respond(self, context: str, message: str) -> str:
    """Produce OpenClaw's formatted reply to ``message``.

    Args:
        context: System-context text appended to the OpenClaw persona.
        message: The user's message.

    Returns:
        An HTML-formatted block: a bold "OpenClaw" header followed by the
        model's reply, or by an offline notice when the model returned
        nothing.
    """
    import html

    system = f"{OPENCLAW_PERSONA}\n{context}"
    result = await self._call_ollama(system, message)
    if result:
        # The reply is embedded in HTML markup (<b>...</b>), so model output
        # containing '<', '>' or '&' (e.g. "kubectl logs <pod>") must be
        # escaped or it would be parsed as markup.
        body = html.escape(result, quote=False)
    else:
        body = "🔴 OpenClaw 暫時離線Ollama 無響應。"
    return f"🦞 <b>OpenClaw:</b>\n{body}"
async def _nemoclaw_respond(self, context: str, message: str) -> str:
    """Produce NemoClaw's formatted reply to ``message``.

    Gemini is tried first; if it yields nothing, the same prompt is sent to
    Ollama as a fallback.

    Args:
        context: System-context text appended to the NemoClaw persona.
        message: The user's message.

    Returns:
        An HTML-formatted block: a bold "NemoClaw" header followed by the
        model's reply, or by an offline notice when both backends returned
        nothing.
    """
    import html

    system = f"{NEMOCLAW_PERSONA}\n{context}"
    result = await self._call_gemini(system, message)
    if not result:
        # Gemini unavailable: fall back to the Ollama backend.
        result = await self._call_ollama(system, message)
    if result:
        # The reply is embedded in HTML markup (<b>...</b>), so model output
        # must be escaped to keep '<', '>' and '&' from being read as markup.
        body = html.escape(result, quote=False)
    else:
        body = "🔴 NemoClaw 暫時離線。"
    return f"🤖 <b>NemoClaw:</b>\n{body}"
async def _nemoclaw_comment_on(
    self, context: str, openclaw_response: str, original_msg: str
) -> str | None:
    """Ask NemoClaw to comment on OpenClaw's answer.

    Gemini is tried first; if it yields nothing, the same prompt is sent to
    Ollama as a fallback.

    Args:
        context: System-context text appended to the NemoClaw persona.
        openclaw_response: OpenClaw's raw (unformatted) answer.
        original_msg: The user's original message.

    Returns:
        An HTML-formatted block with a bold "NemoClaw" header and the
        comment, or ``None`` when neither backend produced a reply.
    """
    import html

    # Builds exactly the same prompt text as a multi-line literal would,
    # without depending on source indentation.
    message = (
        f"統帥問了: {original_msg}\n"
        "OpenClaw 的回應是:\n"
        f"{openclaw_response}\n"
        "請你從 NemoClaw 的角度評論上面的回應。可以補充、反駁、或提出不同觀點。"
    )
    system = f"{NEMOCLAW_PERSONA}\n{context}"

    result = await self._call_gemini(system, message)
    if not result:
        # Gemini unavailable: fall back to the Ollama backend.
        result = await self._call_ollama(system, message)
    if not result:
        return None
    # The comment is embedded in HTML markup (<b>...</b>), so model output
    # must be escaped to keep '<', '>' and '&' from being read as markup.
    return f"🤖 <b>NemoClaw 補充:</b>\n{html.escape(result, quote=False)}"
async def generate_response(
    self,
    user_id: int,
    username: str,
    message_text: str,
) -> str:
    """Route an incoming chat message to one or both AI personas.

    Routing is decided by a case-insensitive prefix of the stripped message:

    * ``@openclaw <msg>``: only OpenClaw answers.
    * ``@nemo <msg>`` or ``@nemoclaw <msg>``: only NemoClaw answers.
    * anything else: OpenClaw answers first, then NemoClaw comments on
      that answer.

    When a prefix is present but nothing follows it, the whole text is
    forwarded as the message.

    Args:
        user_id: Sender id. Part of the interface; not used in this method.
        username: Sender name. Part of the interface; not used in this method.
        message_text: Raw message text.

    Returns:
        The HTML-formatted reply: one block for a routed message, or the
        OpenClaw block optionally followed by NemoClaw's comment block.
    """
    import html

    context = await self.get_system_context()
    text = message_text.strip()
    lowered = text.lower()

    # Longest prefix first: "@nemoclaw" must be tried before "@nemo",
    # otherwise "@nemoclaw hi" is forwarded as "claw hi".
    routes = (
        ("@openclaw", self._openclaw_respond),
        ("@nemoclaw", self._nemoclaw_respond),
        ("@nemo", self._nemoclaw_respond),
    )
    for prefix, responder in routes:
        if lowered.startswith(prefix):
            msg = text[len(prefix):].strip() or text
            return await responder(context, msg)

    # Dual-AI mode, step 1: OpenClaw answers. The raw text is kept so it can
    # be handed to NemoClaw unformatted.
    openclaw_raw = await self._call_ollama(f"{OPENCLAW_PERSONA}\n{context}", text)
    if not openclaw_raw:
        openclaw_raw = "Ollama 無響應OpenClaw 暫時離線。"
    # Escape only the displayed copy: the reply sits inside HTML markup, so
    # '<', '>' and '&' in model output must not be parsed as tags.
    openclaw_block = (
        f"🦞 <b>OpenClaw:</b>\n{html.escape(openclaw_raw, quote=False)}"
    )

    # Step 2: NemoClaw comments on OpenClaw's answer (None when unavailable).
    nemo_block = await self._nemoclaw_comment_on(context, openclaw_raw, text)
    if nemo_block:
        return f"{openclaw_block}\n\n{nemo_block}"
    return openclaw_block
# Singleton
_chat_manager = None
def get_chat_manager() -> ChatManager:
global _chat_manager
if _chat_manager is None:

View File

@@ -2933,27 +2933,76 @@ class TelegramGateway:
# Phase 6.5: 心跳監控方法
# =============================================================================
async def _check_nemotron_health(self) -> tuple[bool, str]:
    """Probe the NVIDIA NIM chat-completions endpoint for availability.

    Sends a one-token "ping" completion request with a 10-second timeout.
    Added 2026-04-03 (ogt) to close a monitoring blind spot: Nemotron was
    timing out on every call without any alert being raised.

    Returns:
        ``(is_healthy, status_text)``. ``is_healthy`` is True only for an
        HTTP 200 response. ``status_text`` is a short human-readable status;
        exception text in it is HTML-escaped because the heartbeat embeds
        the status inside HTML-formatted messages.
    """
    import html

    import httpx
    from src.core.config import get_settings

    api_key = get_settings().NVIDIA_API_KEY
    if not api_key:
        return False, "❌ NVIDIA_API_KEY 未設定"

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(
                "https://integrate.api.nvidia.com/v1/chat/completions",
                headers={"Authorization": f"Bearer {api_key}"},
                json={
                    "model": "nvidia/nemotron-mini-4b-instruct",
                    "messages": [{"role": "user", "content": "ping"}],
                    # Smallest possible completion: only reachability matters.
                    "max_tokens": 1,
                },
            )
            if resp.status_code == 200:
                return True, "✅ 正常"
            return False, f"❌ HTTP {resp.status_code}"
    except httpx.TimeoutException:
        return False, "⚠️ 超時 (>10s)"
    except Exception as e:
        # Some exceptions carry an empty message; fall back to the class name
        # so the status is never blank. Truncate before escaping so an HTML
        # entity is never cut in half.
        detail = str(e)[:40] or type(e).__name__
        return False, html.escape(detail, quote=False)
async def send_heartbeat(self) -> bool:
"""
發送心跳訊息 (系統狀態摘要)
發送心跳訊息 (系統狀態摘要,含 Nemotron 健康探測)
每 30 分鐘執行一次,證明告警鏈路正常運作
2026-04-03 ogt: 加入 Nemotron 健康探測 — 補足監控盲區
"""
try:
if not self._initialized:
await self.initialize()
# 心跳訊息 (2026-03-30 ogt: 改用台北時區,符合 feedback_timezone_taipei.md)
from src.utils.timezone import now_taipei
taipei_now = now_taipei()
# Nemotron 健康探測
nemo_ok, nemo_status = await self._check_nemotron_health()
text = f"""💓 <b>AWOOOI 心跳</b>
{taipei_now.strftime('%Y-%m-%d %H:%M:%S')} (台北)
📡 告警鏈路: ✅ 正常"""
📡 告警鏈路: ✅ 正常
🤖 Nemotron NIM: {nemo_status}"""
await self.send_notification(text)
self._last_message_time = datetime.now(UTC)
logger.info("telegram_heartbeat_sent")
# Nemotron 異常時額外發告警
if not nemo_ok:
await self.send_notification(
f"🚨 <b>Nemotron 異常告警</b>\n\n"
f"NVIDIA NIM API 不可用: <code>{nemo_status}</code>\n"
f"影響: 所有 incident 的 Nemotron Tool Calling 將 100% 超時\n"
f"緩解: <code>kubectl set env deployment/awoooi-api ENABLE_NEMOTRON_COLLABORATION=false -n awoooi-prod</code>"
)
logger.error("nemotron_health_alert_sent", status=nemo_status)
logger.info("telegram_heartbeat_sent", nemotron_ok=nemo_ok)
return True
except Exception as e: