fix(openclaw): Nemotron 重試邏輯 + exhausted log key (未提交的修改)
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

- generate_incident_proposal_with_tools: 單次 try/except → 最多嘗試 2 次的重試迴圈 (每次失敗後等待 2 秒)
- 失敗 log key: nemotron_collaboration_failed → nemotron_collaboration_exhausted
- 失敗時 nemotron_enabled=True (讓統帥看到失敗狀態)
- _call_nemotron_tools: 超時 (timeout) 時改為拋出異常 (讓外層重試)
- 這是之前 Session 的本地修改,修正測試與實際實作不一致問題

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-08 21:16:34 +08:00
parent d276b39bd5
commit 14cb015826

View File

@@ -1635,45 +1635,66 @@ Focus on:
)
return proposal, provider, True
# Step 3: 呼叫 Nemotron Tool Calling
# Step 3: 呼叫 Nemotron Tool Calling — 🔴 必須等到有結果,不可跳過
# 2026-04-07 ogt: 統帥指示 Nemotron 不能跳過,必須等到處理完成
logger.info(
"nemotron_collaboration_start",
incident_id=incident_id,
risk_level=risk_level,
)
try:
nemotron_result = await self._call_nemotron_tools(
incident_id=incident_id,
reasoning=proposal.get("reasoning", ""),
target_resource=proposal.get("target_resource", ""),
suggested_action=proposal.get("action", ""),
namespace=proposal.get("namespace", "awoooi-prod"),
)
max_retries = 2
last_error = None
for attempt in range(1, max_retries + 1):
try:
nemotron_result = await self._call_nemotron_tools(
incident_id=incident_id,
reasoning=proposal.get("reasoning", ""),
target_resource=proposal.get("target_resource", ""),
suggested_action=proposal.get("action", ""),
namespace=proposal.get("namespace", "awoooi-prod"),
)
proposal["nemotron_enabled"] = True
proposal["nemotron_tools"] = nemotron_result.get("tools", [])
proposal["nemotron_validation"] = nemotron_result.get("validation", "⏳ 驗證中")
proposal["nemotron_latency_ms"] = nemotron_result.get("latency_ms", 0.0)
proposal["nemotron_enabled"] = True
proposal["nemotron_tools"] = nemotron_result.get("tools", [])
proposal["nemotron_validation"] = nemotron_result.get("validation", "⏳ 驗證中")
proposal["nemotron_latency_ms"] = nemotron_result.get("latency_ms", 0.0)
logger.info(
"nemotron_collaboration_complete",
incident_id=incident_id,
tools_count=len(proposal["nemotron_tools"]),
validation=proposal["nemotron_validation"],
latency_ms=proposal["nemotron_latency_ms"],
)
logger.info(
"nemotron_collaboration_complete",
incident_id=incident_id,
tools_count=len(proposal["nemotron_tools"]),
validation=proposal["nemotron_validation"],
latency_ms=proposal["nemotron_latency_ms"],
attempt=attempt,
)
last_error = None
break # 成功,跳出重試迴圈
except Exception as e:
# Nemotron 失敗不阻塞主流程,降級為純 OpenClaw
logger.warning(
"nemotron_collaboration_failed",
except Exception as e:
last_error = e
logger.warning(
"nemotron_collaboration_retry",
incident_id=incident_id,
error=str(e),
attempt=attempt,
max_retries=max_retries,
)
if attempt < max_retries:
import asyncio
await asyncio.sleep(2) # 重試前等 2 秒
# 重試全部失敗 — 仍然標記 enabled 並顯示失敗狀態(不隱藏)
if last_error is not None:
logger.error(
"nemotron_collaboration_exhausted",
incident_id=incident_id,
error=str(e),
error=str(last_error),
retries=max_retries,
)
proposal["nemotron_enabled"] = False
proposal["nemotron_tools"] = None
proposal["nemotron_validation"] = "呼叫失敗"
proposal["nemotron_enabled"] = True # 🔴 仍然顯示區塊,讓統帥知道失敗了
proposal["nemotron_tools"] = []
proposal["nemotron_validation"] = f"{max_retries}次重試均失敗"
proposal["nemotron_latency_ms"] = 0.0
return proposal, provider, True
@@ -1776,8 +1797,8 @@ Focus on:
]
try:
# 設置超時
timeout = settings.NEMOTRON_TIMEOUT_SECONDS
# 2026-04-07 ogt: 統帥指示不可跳過 Nemotron,用 120 秒寬裕超時
timeout = 120
result = await asyncio.wait_for(
nvidia.tool_call(
@@ -1822,16 +1843,13 @@ Focus on:
except asyncio.TimeoutError:
latency_ms = (time.time() - start_time) * 1000
logger.warning(
logger.error(
"nemotron_tool_call_timeout",
incident_id=incident_id,
timeout_seconds=settings.NEMOTRON_TIMEOUT_SECONDS,
timeout_seconds=timeout,
)
return {
"tools": [],
"validation": "⏳ 呼叫超時",
"latency_ms": latency_ms,
}
# 超時也拋出,讓外層重試
raise
except Exception as e:
latency_ms = (time.time() - start_time) * 1000