feat(GAP-B4): LLM 超時降級扶梯 — 精確化內層 timeout

_dual_engine_analyze 強化（2026-04-14 Claude Sonnet 4.6）:

- OpenClaw LLM 呼叫加上獨立的 25s hard timeout（留 5s 給後續處理）
- 超時時記錄明確的 llm_timeout_fallback 日誌，立即降級 Expert System
- NemoClaw second opinion 加 3s timeout（屬 advisory，不拖累主流程）
- 保留外層 decide() 30s wait_for 作為 defence-in-depth

為何要做：

- LLM 卡死時會把外層 30s 整段吃光，thread pool 可能飢餓
- 內層 25s 更早降級 → Expert System 仍能在 SLA 內回應
- LLM timeout 與其他異常用不同日誌標記，便於 SLO-2 監控

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2026-04-14 15:51:23 +08:00
parent dedd7c2c17
commit dd0a778e1f
1 changed files with 30 additions and 8 deletions
--- a/apps/api/src/services/decision_manager.py
+++ b/apps/api/src/services/decision_manager.py
@@ -1643,12 +1643,18 @@ class DecisionManager:
            if context_parts:
                llm_expert_context["diagnosis_context"] = "\n\n".join(context_parts)

-            llm_result, provider, success = await self._openclaw.generate_incident_proposal_with_tools(
-                incident_id=incident.incident_id,
-                severity=incident.severity.value,
-                signals=signals_dict,
-                affected_services=incident.affected_services,
-                expert_context=llm_expert_context if llm_expert_context else None,
+            # GAP-B4 (2026-04-14 Claude Sonnet 4.6): LLM 25s hard timeout，
+            # 比外層 decide() 30s wait_for 更嚴格，留 5s 給 YAML risk override + NemoClaw second opinion
+            # Timeout → 明確 llm_timeout_fallback 日誌，返回 expert_result 而非等外層觸發
+            llm_result, provider, success = await asyncio.wait_for(
+                self._openclaw.generate_incident_proposal_with_tools(
+                    incident_id=incident.incident_id,
+                    severity=incident.severity.value,
+                    signals=signals_dict,
+                    affected_services=incident.affected_services,
+                    expert_context=llm_expert_context if llm_expert_context else None,
+                ),
+                timeout=25.0,
            )

            if success and llm_result:
@@ -1685,7 +1691,12 @@ class DecisionManager:
                _conf = float(result.get("confidence", 1.0))
                if _conf < 0.7:
                    try:
-                        _advisory = await _nemoclaw_second_opinion(incident, result)
+                        # GAP-B4 (2026-04-14 Claude Sonnet 4.6): NemoClaw 是 advisory，
+                        # 3s timeout 保護主決策流程不被拖累
+                        _advisory = await asyncio.wait_for(
+                            _nemoclaw_second_opinion(incident, result),
+                            timeout=3.0,
+                        )
                        if _advisory:
                            result["advisory_note"] = _advisory
                            logger.info(
@@ -1693,12 +1704,23 @@ class DecisionManager:
                                incident_id=incident.incident_id,
                                confidence=_conf,
                            )
+                    except asyncio.TimeoutError:
+                        logger.warning("nemoclaw_second_opinion_timeout",
+                                       incident_id=incident.incident_id)
                    except Exception as _soe:
                        logger.warning("nemoclaw_second_opinion_failed",
                                       incident_id=incident.incident_id, error=str(_soe))

                return result

+        except asyncio.TimeoutError:
+            # GAP-B4: LLM 超時 → 明確標記，降級 Expert System
+            logger.warning(
+                "llm_timeout_fallback",
+                incident_id=incident.incident_id,
+                timeout_sec=25.0,
+                action="降級 Expert System",
+            )
        except Exception as e:
            logger.warning(
                "dual_engine_llm_failed",
@@ -1706,7 +1728,7 @@ class DecisionManager:
                error=str(e),
            )

-        # LLM 失敗，使用 Expert System
+        # LLM 失敗/超時，使用 Expert System
        logger.info(
            "dual_engine_expert_fallback",
            incident_id=incident.incident_id,