diff --git a/apps/api/src/services/openclaw.py b/apps/api/src/services/openclaw.py index 54ed2792..9705beda 100644 --- a/apps/api/src/services/openclaw.py +++ b/apps/api/src/services/openclaw.py @@ -1635,45 +1635,66 @@ Focus on: ) return proposal, provider, True - # Step 3: 呼叫 Nemotron Tool Calling + # Step 3: 呼叫 Nemotron Tool Calling — 🔴 必須等到有結果,不可跳過 + # 2026-04-07 ogt: 統帥指示 Nemotron 不能跳過,必須等到處理完成 logger.info( "nemotron_collaboration_start", incident_id=incident_id, risk_level=risk_level, ) - try: - nemotron_result = await self._call_nemotron_tools( - incident_id=incident_id, - reasoning=proposal.get("reasoning", ""), - target_resource=proposal.get("target_resource", ""), - suggested_action=proposal.get("action", ""), - namespace=proposal.get("namespace", "awoooi-prod"), - ) + max_retries = 2 + last_error = None + for attempt in range(1, max_retries + 1): + try: + nemotron_result = await self._call_nemotron_tools( + incident_id=incident_id, + reasoning=proposal.get("reasoning", ""), + target_resource=proposal.get("target_resource", ""), + suggested_action=proposal.get("action", ""), + namespace=proposal.get("namespace", "awoooi-prod"), + ) - proposal["nemotron_enabled"] = True - proposal["nemotron_tools"] = nemotron_result.get("tools", []) - proposal["nemotron_validation"] = nemotron_result.get("validation", "⏳ 驗證中") - proposal["nemotron_latency_ms"] = nemotron_result.get("latency_ms", 0.0) + proposal["nemotron_enabled"] = True + proposal["nemotron_tools"] = nemotron_result.get("tools", []) + proposal["nemotron_validation"] = nemotron_result.get("validation", "⏳ 驗證中") + proposal["nemotron_latency_ms"] = nemotron_result.get("latency_ms", 0.0) - logger.info( - "nemotron_collaboration_complete", - incident_id=incident_id, - tools_count=len(proposal["nemotron_tools"]), - validation=proposal["nemotron_validation"], - latency_ms=proposal["nemotron_latency_ms"], - ) + logger.info( + "nemotron_collaboration_complete", + incident_id=incident_id, + tools_count=len(proposal["nemotron_tools"]), + validation=proposal["nemotron_validation"], + latency_ms=proposal["nemotron_latency_ms"], + attempt=attempt, + ) + last_error = None + break # 成功,跳出重試迴圈 - except Exception as e: - # Nemotron 失敗不阻塞主流程,降級為純 OpenClaw - logger.warning( - "nemotron_collaboration_failed", + except Exception as e: + last_error = e + logger.warning( + "nemotron_collaboration_retry", + incident_id=incident_id, + error=str(e), + attempt=attempt, + max_retries=max_retries, + ) + if attempt < max_retries: + import asyncio + await asyncio.sleep(2) # 重試前等 2 秒 + + # 重試全部失敗 — 仍然標記 enabled 並顯示失敗狀態(不隱藏) + if last_error is not None: + logger.error( + "nemotron_collaboration_exhausted", incident_id=incident_id, - error=str(e), + error=str(last_error), + retries=max_retries, ) - proposal["nemotron_enabled"] = False - proposal["nemotron_tools"] = None - proposal["nemotron_validation"] = "❌ 呼叫失敗" + proposal["nemotron_enabled"] = True # 🔴 仍然顯示區塊,讓統帥知道失敗了 + proposal["nemotron_tools"] = [] + proposal["nemotron_validation"] = f"❌ {max_retries}次重試均失敗" proposal["nemotron_latency_ms"] = 0.0 return proposal, provider, True @@ -1776,8 +1797,8 @@ Focus on: ] try: - # 設置超時 - timeout = settings.NEMOTRON_TIMEOUT_SECONDS + # 2026-04-07 ogt: 統帥指示不可跳過 Nemotron,用 120 秒寬裕超時 + timeout = 120 result = await asyncio.wait_for( nvidia.tool_call( @@ -1822,16 +1843,13 @@ Focus on: except asyncio.TimeoutError: latency_ms = (time.time() - start_time) * 1000 - logger.warning( + logger.error( "nemotron_tool_call_timeout", incident_id=incident_id, - timeout_seconds=settings.NEMOTRON_TIMEOUT_SECONDS, + timeout_seconds=timeout, ) - return { - "tools": [], - "validation": "⏳ 呼叫超時", - "latency_ms": latency_ms, - } + # 超時也拋出,讓外層重試 + raise except Exception as e: latency_ms = (time.time() - start_time) * 1000