From dd0a778e1fbe0a2ad52a91abd5113313bdce26ad Mon Sep 17 00:00:00 2001 From: OG T Date: Tue, 14 Apr 2026 15:51:23 +0800 Subject: [PATCH] =?UTF-8?q?feat(GAP-B4):=20LLM=20=E8=B6=85=E6=99=82?= =?UTF-8?q?=E9=99=8D=E7=B4=9A=E6=89=B6=E6=A2=AF=20=E2=80=94=20=E7=B2=BE?= =?UTF-8?q?=E7=A2=BA=E5=8C=96=E5=85=A7=E5=B1=A4=20timeout?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _dual_engine_analyze 強化(2026-04-14 Claude Sonnet 4.6): - OpenClaw LLM 呼叫獨立 25s hard timeout(留 5s 給後續處理) - 超時時明確 llm_timeout_fallback 日誌,立即降級 Expert System - NemoClaw second opinion 加 3s timeout(advisory 不拖累主流程) - 保留外層 decide() 30s wait_for 作為 defence-in-depth 為何要做: - 外層 30s 會把 LLM 卡死整段吃光,thread pool 可能飢餓 - 內層 25s 更早降級 → Expert System 仍能在 SLA 內回應 - LLM timeout 與其他異常用不同日誌標記,便於 SLO-2 監控 Co-Authored-By: Claude Haiku 4.5 --- apps/api/src/services/decision_manager.py | 38 ++++++++++++++++++----- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index fe7ac1af..7ca7f642 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -1643,12 +1643,18 @@ class DecisionManager: if context_parts: llm_expert_context["diagnosis_context"] = "\n\n".join(context_parts) - llm_result, provider, success = await self._openclaw.generate_incident_proposal_with_tools( - incident_id=incident.incident_id, - severity=incident.severity.value, - signals=signals_dict, - affected_services=incident.affected_services, - expert_context=llm_expert_context if llm_expert_context else None, + # GAP-B4 (2026-04-14 Claude Sonnet 4.6): LLM 25s hard timeout, + # 比外層 decide() 30s wait_for 更嚴格,留 5s 給 YAML risk override + NemoClaw second opinion + # Timeout → 明確 llm_timeout_fallback 日誌,返回 expert_result 而非等外層觸發 + llm_result, provider, success = await asyncio.wait_for( + self._openclaw.generate_incident_proposal_with_tools( + incident_id=incident.incident_id, + severity=incident.severity.value, + signals=signals_dict, + affected_services=incident.affected_services, + expert_context=llm_expert_context if llm_expert_context else None, + ), + timeout=25.0, ) if success and llm_result: @@ -1685,7 +1691,12 @@ class DecisionManager: _conf = float(result.get("confidence", 1.0)) if _conf < 0.7: try: - _advisory = await _nemoclaw_second_opinion(incident, result) + # GAP-B4 (2026-04-14 Claude Sonnet 4.6): NemoClaw 是 advisory, + # 3s timeout 保護主決策流程不被拖累 + _advisory = await asyncio.wait_for( + _nemoclaw_second_opinion(incident, result), + timeout=3.0, + ) if _advisory: result["advisory_note"] = _advisory logger.info( @@ -1693,12 +1704,23 @@ class DecisionManager: incident_id=incident.incident_id, confidence=_conf, ) + except asyncio.TimeoutError: + logger.warning("nemoclaw_second_opinion_timeout", + incident_id=incident.incident_id) except Exception as _soe: logger.warning("nemoclaw_second_opinion_failed", incident_id=incident.incident_id, error=str(_soe)) return result + except asyncio.TimeoutError: + # GAP-B4: LLM 超時 → 明確標記,降級 Expert System + logger.warning( + "llm_timeout_fallback", + incident_id=incident.incident_id, + timeout_sec=25.0, + action="降級 Expert System", + ) except Exception as e: logger.warning( "dual_engine_llm_failed", @@ -1706,7 +1728,7 @@ class DecisionManager: error=str(e), ) - # LLM 失敗,使用 Expert System + # LLM 失敗/超時,使用 Expert System logger.info( "dual_engine_expert_fallback", incident_id=incident.incident_id,