feat(GAP-B4): LLM 超時降級扶梯 — 精確化內層 timeout
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 14m19s

_dual_engine_analyze 強化(2026-04-14 Claude Sonnet 4.6):
- OpenClaw LLM 呼叫獨立 25s hard timeout(留 5s 給後續處理)
- 超時時明確記錄 llm_timeout_fallback 日誌,並立即降級至 Expert System
- NemoClaw second opinion 加 3s timeout(advisory 不拖累主流程)
- 保留外層 decide() 30s wait_for 作為 defence-in-depth

為何要做:
- 若只靠外層 30s timeout,LLM 卡死時會把整段 30s 吃光,thread pool 可能飢餓
- 內層 25s 更早降級 → Expert System 仍能在 SLA 內回應
- LLM timeout 與其他異常用不同日誌標記,便於 SLO-2 監控

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-14 15:51:23 +08:00
parent dedd7c2c17
commit dd0a778e1f

View File

@@ -1643,12 +1643,18 @@ class DecisionManager:
if context_parts:
llm_expert_context["diagnosis_context"] = "\n\n".join(context_parts)
llm_result, provider, success = await self._openclaw.generate_incident_proposal_with_tools(
incident_id=incident.incident_id,
severity=incident.severity.value,
signals=signals_dict,
affected_services=incident.affected_services,
expert_context=llm_expert_context if llm_expert_context else None,
# GAP-B4 (2026-04-14 Claude Sonnet 4.6): LLM 25s hard timeout
# 比外層 decide() 30s wait_for 更嚴格,留 5s 給 YAML risk override + NemoClaw second opinion
# Timeout → 明確 llm_timeout_fallback 日誌,返回 expert_result 而非等外層觸發
llm_result, provider, success = await asyncio.wait_for(
self._openclaw.generate_incident_proposal_with_tools(
incident_id=incident.incident_id,
severity=incident.severity.value,
signals=signals_dict,
affected_services=incident.affected_services,
expert_context=llm_expert_context if llm_expert_context else None,
),
timeout=25.0,
)
if success and llm_result:
@@ -1685,7 +1691,12 @@ class DecisionManager:
_conf = float(result.get("confidence", 1.0))
if _conf < 0.7:
try:
_advisory = await _nemoclaw_second_opinion(incident, result)
# GAP-B4 (2026-04-14 Claude Sonnet 4.6): NemoClaw 是 advisory
# 3s timeout 保護主決策流程不被拖累
_advisory = await asyncio.wait_for(
_nemoclaw_second_opinion(incident, result),
timeout=3.0,
)
if _advisory:
result["advisory_note"] = _advisory
logger.info(
@@ -1693,12 +1704,23 @@ class DecisionManager:
incident_id=incident.incident_id,
confidence=_conf,
)
except asyncio.TimeoutError:
logger.warning("nemoclaw_second_opinion_timeout",
incident_id=incident.incident_id)
except Exception as _soe:
logger.warning("nemoclaw_second_opinion_failed",
incident_id=incident.incident_id, error=str(_soe))
return result
except asyncio.TimeoutError:
# GAP-B4: LLM 超時 → 明確標記,降級 Expert System
logger.warning(
"llm_timeout_fallback",
incident_id=incident.incident_id,
timeout_sec=25.0,
action="降級 Expert System",
)
except Exception as e:
logger.warning(
"dual_engine_llm_failed",
@@ -1706,7 +1728,7 @@ class DecisionManager:
error=str(e),
)
# LLM 失敗,使用 Expert System
# LLM 失敗/超時,使用 Expert System
logger.info(
"dual_engine_expert_fallback",
incident_id=incident.incident_id,