From 15aabd6ac5805bf4fd2eb4409b0cf5f09a1cffc8 Mon Sep 17 00:00:00 2001
From: OG T <ogt@WOOOMacMiniM4.local>
Date: Fri, 3 Apr 2026 16:36:16 +0800
Subject: [PATCH] =?UTF-8?q?fix(chat+nim):=20=E4=BF=AE=E5=BE=A9=E9=A6=96?=
 =?UTF-8?q?=E5=B8=AD=E6=9E=B6=E6=A7=8B=E5=B8=AB=20Review=20I1-I4=20+=20S3?=
 =?UTF-8?q?=20=E5=9B=9B=E9=A0=85=E9=87=8D=E8=A6=81=E5=95=8F=E9=A1=8C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I1: chat_manager._call_openclaw timeout=30.0 → 讀 settings.OPENCLAW_TIMEOUT (未設定時預設 30.0)
I2: nvidia_provider.py stale comment "45" → "55" 對齊 ConfigMap
I3: asyncio.shield 移除 — shield 超時後 task 繼續跑但無人等待 (silent leak)；移除後 wait_for 超時會直接取消 task
I4: ChatManager.__init__ 移除 repo 實例 (leWOOOgo 禁 Service 持有 repository)
S3: _check_nemotron_health probe 10s → 25s + /v1/models 輕量端點

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 apps/api/src/services/chat_manager.py     | 15 ++++++----
 apps/api/src/services/nvidia_provider.py  |  2 +-
 apps/api/src/services/telegram_gateway.py | 15 ++++------
 docs/LOGBOOK.md                           | 35 +++++++++++++++++++++++
 4 files changed, 51 insertions(+), 16 deletions(-)

diff --git a/apps/api/src/services/chat_manager.py b/apps/api/src/services/chat_manager.py
index e12c1e1f..e3a521b0 100644
--- a/apps/api/src/services/chat_manager.py
+++ b/apps/api/src/services/chat_manager.py
@@ -41,15 +41,16 @@ class ChatManager:
     """AWOOOI 雙 AI 對話管理器"""
 
     def __init__(self):
-        self.k8s = get_k8s_repository()
-        self.incidents = get_incident_repository()
+        pass  # 2026-04-03 ogt: 移除 repo 實例化，leWOOOgo 規範禁止 Service 持有 repository
 
     async def get_system_context(self) -> str:
         """收集系統即時上下文"""
         now = now_taipei()
+        k8s = get_k8s_repository()
+        incidents = get_incident_repository()
 
         try:
-            k8s_status = await self.k8s.get_pod_status_summary(namespace="awoooi-prod")
+            k8s_status = await k8s.get_pod_status_summary(namespace="awoooi-prod")
             cluster_info = f"Cluster: {k8s_status['running']}/{k8s_status['total']} Pods Running"
             if k8s_status.get('problem_pods'):
                 cluster_info += f", {len(k8s_status['problem_pods'])} 異常"
@@ -57,7 +58,7 @@ class ChatManager:
             cluster_info = "Cluster: 無法取得狀態"
 
         try:
-            active_incidents = await self.incidents.get_active()
+            active_incidents = await incidents.get_active()
             if active_incidents:
                 lines = [f"- {inc.incident_id}: {inc.status.value} (SEV {inc.severity.value})"
                          for inc in active_incidents[:3]]
@@ -84,9 +85,10 @@ class ChatManager:
         settings = get_settings()
 
         openclaw_url = getattr(settings, 'OPENCLAW_URL', 'http://192.168.0.188:8088')
+        openclaw_timeout = float(getattr(settings, 'OPENCLAW_TIMEOUT', 30.0))
         try:
             # OpenClaw 沒有通用 chat endpoint，用 analyze/incident 傳入對話內容
-            async with httpx.AsyncClient(timeout=30.0) as client:
+            async with httpx.AsyncClient(timeout=openclaw_timeout) as client:
                 resp = await client.post(
                     f"{openclaw_url}/api/v1/analyze/incident",
                     json={
@@ -167,8 +169,9 @@ class ChatManager:
         )
 
         # OpenClaw 最多等 40s（含 context 取得時間），NemoClaw 最多等 60s
+        # 2026-04-03 ogt: 移除 asyncio.shield — shield 會讓 task 在超時後繼續跑但無人等待 (silent leak)；現在 wait_for 超時會直接取消 openclaw_task
         try:
-            openclaw_raw = await asyncio.wait_for(asyncio.shield(openclaw_task), timeout=40.0)
+            openclaw_raw = await asyncio.wait_for(openclaw_task, timeout=40.0)
         except asyncio.TimeoutError:
             openclaw_raw = None
 
diff --git a/apps/api/src/services/nvidia_provider.py b/apps/api/src/services/nvidia_provider.py
index f65aabac..d76d7e4a 100644
--- a/apps/api/src/services/nvidia_provider.py
+++ b/apps/api/src/services/nvidia_provider.py
@@ -119,7 +119,7 @@ NVIDIA_DEFAULT_MODEL = "nvidia/nemotron-mini-4b-instruct"
 
 # 請求超時 (秒)
 # 2026-04-01 ogt: 設為 30s (平衡點)
-# 2026-04-03 ogt: 改從 config 讀取，與 NEMOTRON_TIMEOUT_SECONDS=45 對齊
+# 2026-04-03 ogt: 改從 config 讀取，與 NEMOTRON_TIMEOUT_SECONDS=55 對齊
 # Memory 記載 NIM 免費 tier 延遲 11-45s，30s 硬編碼導致慢請求全超時
 def _get_nvidia_timeout() -> float:
     try:
diff --git a/apps/api/src/services/telegram_gateway.py b/apps/api/src/services/telegram_gateway.py
index 335a3767..288b5a18 100644
--- a/apps/api/src/services/telegram_gateway.py
+++ b/apps/api/src/services/telegram_gateway.py
@@ -2980,22 +2980,19 @@ class TelegramGateway:
         if not api_key:
             return False, "❌ NVIDIA_API_KEY 未設定"
 
+        # 2026-04-03 ogt: 用 /v1/models 輕量端點探測，避免觸發推理計費
+        # timeout 改為 25s — NIM 免費 tier 冷啟動可能需要 15-20s
         try:
-            async with httpx.AsyncClient(timeout=10.0) as client:
-                resp = await client.post(
-                    "https://integrate.api.nvidia.com/v1/chat/completions",
+            async with httpx.AsyncClient(timeout=25.0) as client:
+                resp = await client.get(
+                    "https://integrate.api.nvidia.com/v1/models",
                     headers={"Authorization": f"Bearer {api_key}"},
-                    json={
-                        "model": "nvidia/nemotron-mini-4b-instruct",
-                        "messages": [{"role": "user", "content": "ping"}],
-                        "max_tokens": 1,
-                    },
                 )
                 if resp.status_code == 200:
                     return True, "✅ 正常"
                 return False, f"❌ HTTP {resp.status_code}"
         except httpx.TimeoutException:
-            return False, "⚠️ 超時 (>10s)"
+            return False, "⚠️ 超時 (>25s)"
         except Exception as e:
             return False, f"❌ {str(e)[:40]}"
 
diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md
index 0468b0f8..d816eff9 100644
--- a/docs/LOGBOOK.md
+++ b/docs/LOGBOOK.md
@@ -5,6 +5,41 @@
 
 ---
 
+## 📍 當前狀態 (2026-04-03 Phase 22.6 雙 AI 對話 + 首席架構師 Code Review)
+
+| 項目 | 狀態 | Commit/備註 |
+|------|------|-------------|
+| **Phase 22.6 chat_manager 重寫** | ✅ 雙 AI (@openclaw/@nemo/混合模式) | be247d6 |
+| **NEMOTRON_TIMEOUT_SECONDS 30→55s** | ✅ ConfigMap + kubectl set env | k8s configmap |
+| **nvidia_provider.py 讀 config** | ✅ 不再硬編碼 30s | — |
+| **費用變更審批憲法第五章** | ✅ HARD_RULES + Memory + CLAUDE.md | — |
+| **I1: openclaw timeout 硬編碼** | ✅ 改讀 OPENCLAW_TIMEOUT config | — |
+| **I2: stale 註解 45→55** | ✅ nvidia_provider.py comment 修正 | — |
+| **I3: asyncio.shield task leak** | ✅ 移除 shield，改直接 wait_for | — |
+| **I4: ChatManager 持有 repo** | ✅ 移至 get_system_context() 本地變數 | — |
+| **S3: NIM 探測 10s timeout** | ✅ 改 25s + 用 /v1/models 輕量端點 | — |
+| **首席架構師 Review 評分** | 85/100 — 4 Important 已全修 | — |
+
+**下一步**: 等待 CI 部署驗證，Ollama on 188 仍需手動重啟
+
+---
+
+## 📍 當前狀態 (2026-04-03 首席架構師 Code Review — Layout 對齊 + Phase 24 命名收尾)
+
+| 項目 | 狀態 | 備註 |
+|------|------|------|
+| **sidebar top 修正** | ✅ top:0→top:68px，sidebar 不再蓋住 header | |
+| **app-layout 對齊** | ✅ pt-[68px] + ml-[224px]，消除 32px 水平空隙 | |
+| **page.tsx calc** | ✅ calc(100vh-64px)→calc(100vh-68px) | |
+| **Metrics Strip 7指標** | ✅ 完整對齊 figma-v2 設計 | |
+| **test_nvidia_provider.py** | ✅ "nvidia" key → "openclaw_nemo" 對齊 Phase 24 | |
+| **ai_rate_limiter.py** | ✅ RATE_LIMITS/COST_LIMITS "nvidia"→"openclaw_nemo" | |
+| **Review 評分** | 88/100 — 通過，3項警告，0項違規 | |
+
+**下一步**: 無緊急待做
+
+---
+
 ## 📍 當前狀態 (2026-04-03 Phase 24 收尾 + KB + Monitoring 修復)
 
 | 項目 | 狀態 | Commit |