diff --git a/apps/api/src/services/decision_manager.py b/apps/api/src/services/decision_manager.py index fe7ac1af..7ca7f642 100644 --- a/apps/api/src/services/decision_manager.py +++ b/apps/api/src/services/decision_manager.py @@ -1643,12 +1643,18 @@ class DecisionManager: if context_parts: llm_expert_context["diagnosis_context"] = "\n\n".join(context_parts) - llm_result, provider, success = await self._openclaw.generate_incident_proposal_with_tools( - incident_id=incident.incident_id, - severity=incident.severity.value, - signals=signals_dict, - affected_services=incident.affected_services, - expert_context=llm_expert_context if llm_expert_context else None, + # GAP-B4 (2026-04-14 Claude Sonnet 4.6): LLM 25s hard timeout, + # 比外層 decide() 30s wait_for 更嚴格,留 5s 給 YAML risk override + NemoClaw second opinion + # Timeout → 明確 llm_timeout_fallback 日誌,返回 expert_result 而非等外層觸發 + llm_result, provider, success = await asyncio.wait_for( + self._openclaw.generate_incident_proposal_with_tools( + incident_id=incident.incident_id, + severity=incident.severity.value, + signals=signals_dict, + affected_services=incident.affected_services, + expert_context=llm_expert_context if llm_expert_context else None, + ), + timeout=25.0, ) if success and llm_result: @@ -1685,7 +1691,12 @@ class DecisionManager: _conf = float(result.get("confidence", 1.0)) if _conf < 0.7: try: - _advisory = await _nemoclaw_second_opinion(incident, result) + # GAP-B4 (2026-04-14 Claude Sonnet 4.6): NemoClaw 是 advisory, + # 3s timeout 保護主決策流程不被拖累 + _advisory = await asyncio.wait_for( + _nemoclaw_second_opinion(incident, result), + timeout=3.0, + ) if _advisory: result["advisory_note"] = _advisory logger.info( @@ -1693,12 +1704,23 @@ class DecisionManager: incident_id=incident.incident_id, confidence=_conf, ) + except asyncio.TimeoutError: + logger.warning("nemoclaw_second_opinion_timeout", + incident_id=incident.incident_id) except Exception as _soe: logger.warning("nemoclaw_second_opinion_failed", incident_id=incident.incident_id, error=str(_soe)) return result + except asyncio.TimeoutError: + # GAP-B4: LLM 超時 → 明確標記,降級 Expert System + logger.warning( + "llm_timeout_fallback", + incident_id=incident.incident_id, + timeout_sec=25.0, + action="降級 Expert System", + ) except Exception as e: logger.warning( "dual_engine_llm_failed", @@ -1706,7 +1728,7 @@ class DecisionManager: error=str(e), ) - # LLM 失敗,使用 Expert System + # LLM 失敗/超時,使用 Expert System logger.info( "dual_engine_expert_fallback", incident_id=incident.incident_id,