From dd0a778e1fbe0a2ad52a91abd5113313bdce26ad Mon Sep 17 00:00:00 2001
From: OG T <ogt@WOOOMacMiniM4.local>
Date: Tue, 14 Apr 2026 15:51:23 +0800
Subject: [PATCH] =?UTF-8?q?feat(GAP-B4):=20LLM=20=E8=B6=85=E6=99=82?=
 =?UTF-8?q?=E9=99=8D=E7=B4=9A=E6=89=B6=E6=A2=AF=20=E2=80=94=20=E7=B2=BE?=
 =?UTF-8?q?=E7=A2=BA=E5=8C=96=E5=85=A7=E5=B1=A4=20timeout?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_dual_engine_analyze 強化（2026-04-14 Claude Sonnet 4.6）:
- OpenClaw LLM 呼叫加上獨立的 25s hard timeout（在外層 30s 預算內保留 5s 給後續處理）
- 超時時記錄明確的 llm_timeout_fallback 日誌，並立即降級至 Expert System
- NemoClaw second opinion 加上 3s timeout（僅為 advisory，逾時即略過，不拖累主流程）
- 保留外層 decide() 30s wait_for 作為 defence-in-depth

為何要做：
- 若只靠外層 30s，LLM 卡住時會把整段時間預算吃光，thread pool 也可能飢餓
- 內層 25s 更早降級 → Expert System 仍能在 SLA 內回應
- LLM timeout 與其他異常用不同日誌標記，便於 SLO-2 監控

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
---
 apps/api/src/services/decision_manager.py | 38 ++++++++++++++++++-----
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py
index fe7ac1af..7ca7f642 100644
--- a/apps/api/src/services/decision_manager.py
+++ b/apps/api/src/services/decision_manager.py
@@ -1643,12 +1643,18 @@ class DecisionManager:
             if context_parts:
                 llm_expert_context["diagnosis_context"] = "\n\n".join(context_parts)
 
-            llm_result, provider, success = await self._openclaw.generate_incident_proposal_with_tools(
-                incident_id=incident.incident_id,
-                severity=incident.severity.value,
-                signals=signals_dict,
-                affected_services=incident.affected_services,
-                expert_context=llm_expert_context if llm_expert_context else None,
+            # GAP-B4 (2026-04-14 Claude Sonnet 4.6): LLM 25s hard timeout，
+            # 比外層 decide() 30s wait_for 更嚴格，留 5s 給 YAML risk override + NemoClaw second opinion
+            # Timeout → 明確 llm_timeout_fallback 日誌，返回 expert_result 而非等外層觸發
+            llm_result, provider, success = await asyncio.wait_for(
+                self._openclaw.generate_incident_proposal_with_tools(
+                    incident_id=incident.incident_id,
+                    severity=incident.severity.value,
+                    signals=signals_dict,
+                    affected_services=incident.affected_services,
+                    expert_context=llm_expert_context if llm_expert_context else None,
+                ),
+                timeout=25.0,
             )
 
             if success and llm_result:
@@ -1685,7 +1691,12 @@ class DecisionManager:
                 _conf = float(result.get("confidence", 1.0))
                 if _conf < 0.7:
                     try:
-                        _advisory = await _nemoclaw_second_opinion(incident, result)
+                        # GAP-B4 (2026-04-14 Claude Sonnet 4.6): NemoClaw 是 advisory，
+                        # 3s timeout 保護主決策流程不被拖累
+                        _advisory = await asyncio.wait_for(
+                            _nemoclaw_second_opinion(incident, result),
+                            timeout=3.0,
+                        )
                         if _advisory:
                             result["advisory_note"] = _advisory
                             logger.info(
@@ -1693,12 +1704,23 @@ class DecisionManager:
                                 incident_id=incident.incident_id,
                                 confidence=_conf,
                             )
+                    except asyncio.TimeoutError:
+                        logger.warning("nemoclaw_second_opinion_timeout",
+                                       incident_id=incident.incident_id)
                     except Exception as _soe:
                         logger.warning("nemoclaw_second_opinion_failed",
                                        incident_id=incident.incident_id, error=str(_soe))
 
                 return result
 
+        except asyncio.TimeoutError:
+            # GAP-B4: LLM 超時 → 明確標記，降級 Expert System
+            logger.warning(
+                "llm_timeout_fallback",
+                incident_id=incident.incident_id,
+                timeout_sec=25.0,
+                action="降級 Expert System",
+            )
         except Exception as e:
             logger.warning(
                 "dual_engine_llm_failed",
@@ -1706,7 +1728,7 @@ class DecisionManager:
                 error=str(e),
             )
 
-        # LLM 失敗，使用 Expert System
+        # LLM 失敗/超時，使用 Expert System
         logger.info(
             "dual_engine_expert_fallback",
             incident_id=incident.incident_id,