feat(GAP-B4): LLM 超時降級扶梯 — 精確化內層 timeout
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 14m19s
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 14m19s
_dual_engine_analyze 強化(2026-04-14 Claude Sonnet 4.6):
- OpenClaw LLM 呼叫獨立 25s hard timeout(留 5s 給後續處理)
- 超時時明確記錄 llm_timeout_fallback 日誌,立即降級 Expert System
- NemoClaw second opinion 加 3s timeout(advisory 不拖累主流程)
- 保留外層 decide() 30s wait_for 作為 defence-in-depth

為何要做:
- 外層 30s 會把 LLM 卡死整段吃光,thread pool 可能飢餓
- 內層 25s 更早降級 → Expert System 仍能在 SLA 內回應
- LLM timeout 與其他異常用不同日誌標記,便於 SLO-2 監控

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1643,12 +1643,18 @@ class DecisionManager:
|
||||
if context_parts:
|
||||
llm_expert_context["diagnosis_context"] = "\n\n".join(context_parts)
|
||||
|
||||
llm_result, provider, success = await self._openclaw.generate_incident_proposal_with_tools(
|
||||
incident_id=incident.incident_id,
|
||||
severity=incident.severity.value,
|
||||
signals=signals_dict,
|
||||
affected_services=incident.affected_services,
|
||||
expert_context=llm_expert_context if llm_expert_context else None,
|
||||
# GAP-B4 (2026-04-14 Claude Sonnet 4.6): LLM 25s hard timeout,
|
||||
# 比外層 decide() 30s wait_for 更嚴格,留 5s 給 YAML risk override + NemoClaw second opinion
|
||||
# Timeout → 明確 llm_timeout_fallback 日誌,返回 expert_result 而非等外層觸發
|
||||
llm_result, provider, success = await asyncio.wait_for(
|
||||
self._openclaw.generate_incident_proposal_with_tools(
|
||||
incident_id=incident.incident_id,
|
||||
severity=incident.severity.value,
|
||||
signals=signals_dict,
|
||||
affected_services=incident.affected_services,
|
||||
expert_context=llm_expert_context if llm_expert_context else None,
|
||||
),
|
||||
timeout=25.0,
|
||||
)
|
||||
|
||||
if success and llm_result:
|
||||
@@ -1685,7 +1691,12 @@ class DecisionManager:
|
||||
_conf = float(result.get("confidence", 1.0))
|
||||
if _conf < 0.7:
|
||||
try:
|
||||
_advisory = await _nemoclaw_second_opinion(incident, result)
|
||||
# GAP-B4 (2026-04-14 Claude Sonnet 4.6): NemoClaw 是 advisory,
|
||||
# 3s timeout 保護主決策流程不被拖累
|
||||
_advisory = await asyncio.wait_for(
|
||||
_nemoclaw_second_opinion(incident, result),
|
||||
timeout=3.0,
|
||||
)
|
||||
if _advisory:
|
||||
result["advisory_note"] = _advisory
|
||||
logger.info(
|
||||
@@ -1693,12 +1704,23 @@ class DecisionManager:
|
||||
incident_id=incident.incident_id,
|
||||
confidence=_conf,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning("nemoclaw_second_opinion_timeout",
|
||||
incident_id=incident.incident_id)
|
||||
except Exception as _soe:
|
||||
logger.warning("nemoclaw_second_opinion_failed",
|
||||
incident_id=incident.incident_id, error=str(_soe))
|
||||
|
||||
return result
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
# GAP-B4: LLM 超時 → 明確標記,降級 Expert System
|
||||
logger.warning(
|
||||
"llm_timeout_fallback",
|
||||
incident_id=incident.incident_id,
|
||||
timeout_sec=25.0,
|
||||
action="降級 Expert System",
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"dual_engine_llm_failed",
|
||||
@@ -1706,7 +1728,7 @@ class DecisionManager:
|
||||
error=str(e),
|
||||
)
|
||||
|
||||
# LLM 失敗,使用 Expert System
|
||||
# LLM 失敗/超時,使用 Expert System
|
||||
logger.info(
|
||||
"dual_engine_expert_fallback",
|
||||
incident_id=incident.incident_id,
|
||||
|
||||
Reference in New Issue
Block a user