From 15aabd6ac5805bf4fd2eb4409b0cf5f09a1cffc8 Mon Sep 17 00:00:00 2001 From: OG T Date: Fri, 3 Apr 2026 16:36:16 +0800 Subject: [PATCH] =?UTF-8?q?fix(chat+nim):=20=E4=BF=AE=E5=BE=A9=E9=A6=96?= =?UTF-8?q?=E5=B8=AD=E6=9E=B6=E6=A7=8B=E5=B8=AB=20Review=20I1-I4=20+=20S3?= =?UTF-8?q?=20=E5=9B=9B=E9=A0=85=E9=87=8D=E8=A6=81=E5=95=8F=E9=A1=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I1: chat_manager._call_openclaw timeout=30.0 → 讀 settings.OPENCLAW_TIMEOUT I2: nvidia_provider.py stale comment "45" → "55" 對齊 ConfigMap I3: asyncio.shield 移除 — shield 超時後 task 繼續跑但無人等待 (silent leak) I4: ChatManager.__init__ 移除 repo 實例 (leWOOOgo 禁 Service 持有 repository) S3: _check_nemotron_health probe 10s → 25s + /v1/models 輕量端點 Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/services/chat_manager.py | 15 ++++++---- apps/api/src/services/nvidia_provider.py | 2 +- apps/api/src/services/telegram_gateway.py | 15 ++++------ docs/LOGBOOK.md | 35 +++++++++++++++++++++++ 4 files changed, 51 insertions(+), 16 deletions(-) diff --git a/apps/api/src/services/chat_manager.py b/apps/api/src/services/chat_manager.py index e12c1e1f..e3a521b0 100644 --- a/apps/api/src/services/chat_manager.py +++ b/apps/api/src/services/chat_manager.py @@ -41,15 +41,16 @@ class ChatManager: """AWOOOI 雙 AI 對話管理器""" def __init__(self): - self.k8s = get_k8s_repository() - self.incidents = get_incident_repository() + pass # 2026-04-03 ogt: 移除 repo 實例化,leWOOOgo 規範禁止 Service 持有 repository async def get_system_context(self) -> str: """收集系統即時上下文""" now = now_taipei() + k8s = get_k8s_repository() + incidents = get_incident_repository() try: - k8s_status = await self.k8s.get_pod_status_summary(namespace="awoooi-prod") + k8s_status = await k8s.get_pod_status_summary(namespace="awoooi-prod") cluster_info = f"Cluster: {k8s_status['running']}/{k8s_status['total']} Pods Running" if k8s_status.get('problem_pods'): cluster_info += f", {len(k8s_status['problem_pods'])} 異常" @@ -57,7 +58,7 @@ class ChatManager: cluster_info = "Cluster: 無法取得狀態" try: - active_incidents = await self.incidents.get_active() + active_incidents = await incidents.get_active() if active_incidents: lines = [f"- {inc.incident_id}: {inc.status.value} (SEV {inc.severity.value})" for inc in active_incidents[:3]] @@ -84,9 +85,10 @@ class ChatManager: settings = get_settings() openclaw_url = getattr(settings, 'OPENCLAW_URL', 'http://192.168.0.188:8088') + openclaw_timeout = float(getattr(settings, 'OPENCLAW_TIMEOUT', 30.0)) try: # OpenClaw 沒有通用 chat endpoint,用 analyze/incident 傳入對話內容 - async with httpx.AsyncClient(timeout=30.0) as client: + async with httpx.AsyncClient(timeout=openclaw_timeout) as client: resp = await client.post( f"{openclaw_url}/api/v1/analyze/incident", json={ @@ -167,8 +169,9 @@ class ChatManager: ) # OpenClaw 最多等 40s(含 context 取得時間),NemoClaw 最多等 60s + # 2026-04-03 ogt: 移除 asyncio.shield — shield 會在超時後讓 task 繼續跑但無人等待,造成 silent leak try: - openclaw_raw = await asyncio.wait_for(asyncio.shield(openclaw_task), timeout=40.0) + openclaw_raw = await asyncio.wait_for(openclaw_task, timeout=40.0) except asyncio.TimeoutError: openclaw_raw = None diff --git a/apps/api/src/services/nvidia_provider.py b/apps/api/src/services/nvidia_provider.py index f65aabac..d76d7e4a 100644 --- a/apps/api/src/services/nvidia_provider.py +++ b/apps/api/src/services/nvidia_provider.py @@ -119,7 +119,7 @@ NVIDIA_DEFAULT_MODEL = "nvidia/nemotron-mini-4b-instruct" # 請求超時 (秒) # 2026-04-01 ogt: 設為 30s (平衡點) -# 2026-04-03 ogt: 改從 config 讀取,與 NEMOTRON_TIMEOUT_SECONDS=45 對齊 +# 2026-04-03 ogt: 改從 config 讀取,與 NEMOTRON_TIMEOUT_SECONDS=55 對齊 # Memory 記載 NIM 免費 tier 延遲 11-45s,30s 硬編碼導致慢請求全超時 def _get_nvidia_timeout() -> float: try: diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py index 335a3767..288b5a18 100644 --- a/apps/api/src/services/telegram_gateway.py +++ b/apps/api/src/services/telegram_gateway.py @@ -2980,22 +2980,19 @@ class TelegramGateway: if not api_key: return False, "❌ NVIDIA_API_KEY 未設定" + # 2026-04-03 ogt: 用 /v1/models 輕量端點探測,避免觸發推理計費 + # timeout 改為 25s — NIM 免費 tier 冷啟動可能需要 15-20s try: - async with httpx.AsyncClient(timeout=10.0) as client: - resp = await client.post( - "https://integrate.api.nvidia.com/v1/chat/completions", + async with httpx.AsyncClient(timeout=25.0) as client: + resp = await client.get( + "https://integrate.api.nvidia.com/v1/models", headers={"Authorization": f"Bearer {api_key}"}, - json={ - "model": "nvidia/nemotron-mini-4b-instruct", - "messages": [{"role": "user", "content": "ping"}], - "max_tokens": 1, - }, ) if resp.status_code == 200: return True, "✅ 正常" return False, f"❌ HTTP {resp.status_code}" except httpx.TimeoutException: - return False, "⚠️ 超時 (>10s)" + return False, "⚠️ 超時 (>25s)" except Exception as e: return False, f"❌ {str(e)[:40]}" diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 0468b0f8..d816eff9 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -5,6 +5,41 @@ --- +## 📍 當前狀態 (2026-04-03 Phase 22.6 雙 AI 對話 + 首席架構師 Code Review) + +| 項目 | 狀態 | Commit/備註 | +|------|------|-------------| +| **Phase 22.6 chat_manager 重寫** | ✅ 雙 AI (@openclaw/@nemo/混合模式) | be247d6 | +| **NEMOTRON_TIMEOUT 30→55s** | ✅ ConfigMap + kubectl set env | k8s configmap | +| **nvidia_provider.py 讀 config** | ✅ 不再硬編碼 30s | — | +| **費用變更審批憲法第五章** | ✅ HARD_RULES + Memory + CLAUDE.md | — | +| **I1: openclaw timeout 硬編碼** | ✅ 改讀 OPENCLAW_TIMEOUT config | — | +| **I2: stale 註解 45→55** | ✅ nvidia_provider.py comment 修正 | — | +| **I3: asyncio.shield task leak** | ✅ 移除 shield,改直接 wait_for | — | +| **I4: ChatManager 持有 repo** | ✅ 移至 get_system_context() 本地變數 | — | +| **S3: NIM 探測 10s timeout** | ✅ 改 25s + 用 /v1/models 輕量端點 | — | +| **首席架構師 Review 評分** | 85/100 — 4 Important 已全修 | — | + +**下一步**: 等待 CI 部署驗證,Ollama on 188 仍需手動重啟 + +--- + +## 📍 當前狀態 (2026-04-03 首席架構師 Code Review — Layout 對齊 + Phase 24 命名收尾) + +| 項目 | 狀態 | 備註 | +|------|------|------| +| **sidebar top 修正** | ✅ top:0→top:68px,sidebar 不再蓋住 header | | +| **app-layout 對齊** | ✅ pt-[68px] + ml-[224px],消除 32px 水平空隙 | | +| **page.tsx calc** | ✅ calc(100vh-64px)→calc(100vh-68px) | | +| **Metrics Strip 7指標** | ✅ 完整對齊 figma-v2 設計 | | +| **test_nvidia_provider.py** | ✅ "nvidia" key → "openclaw_nemo" 對齊 Phase 24 | | +| **ai_rate_limiter.py** | ✅ RATE_LIMITS/COST_LIMITS "nvidia"→"openclaw_nemo" | | +| **Review 評分** | 88/100 — 通過,3項警告,0項違規 | | + +**下一步**: 無緊急待做 + +--- + ## 📍 當前狀態 (2026-04-03 Phase 24 收尾 + KB + Monitoring 修復) | 項目 | 狀態 | Commit |