From 9538f6cca42c4c1e4eb8d09c9d6780dc6049534a Mon Sep 17 00:00:00 2001 From: OG T Date: Thu, 16 Apr 2026 02:28:05 +0800 Subject: [PATCH] =?UTF-8?q?fix(agents):=20=E4=BF=AE=E6=AD=A3=20Agent=205s?= =?UTF-8?q?=20timeout=20=E5=B0=8E=E8=87=B4=20LLM=20=E6=8E=A8=E7=90=86?= =?UTF-8?q?=E5=85=A8=E9=83=A8=E5=A4=B1=E6=95=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 根本原因: deepseek-r1:14b 實測推理 2.2-27.3s avg 10.6s 但 Diagnostician/Critic/Solver/Reviewer 全部使用 timeout_sec=5.0 (開發機測試值) → 67% 的 Agent 推理 timeout → 降級 confidence=20% → 自動修復從不觸發 修復: - _PER_AGENT_TIMEOUT_SEC: 5s → 25s (覆蓋 avg 的 2.3x buffer) - GLOBAL_TIMEOUT_SEC: 30s → 90s (2 個序列 Agent + 1 組並行 Reviewer/Critic = 3 個階段 × 25s + buffer) - 明確傳遞 timeout_sec 給所有 4 個 Agent 呼叫 預期效果: 正常告警 AI 分析 confidence ≥ 0.5 → 觸發自動修復 2026-04-16 Claude Sonnet 4.6 Asia/Taipei Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/services/agent_orchestrator.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/apps/api/src/services/agent_orchestrator.py b/apps/api/src/services/agent_orchestrator.py index ac21a23f..589959da 100644 --- a/apps/api/src/services/agent_orchestrator.py +++ b/apps/api/src/services/agent_orchestrator.py @@ -64,7 +64,14 @@ if TYPE_CHECKING: logger = structlog.get_logger(__name__) # 全局超時(所有 Agent 加起來) -GLOBAL_TIMEOUT_SEC = 30.0 +# 2026-04-16 Claude Sonnet 4.6: deepseek-r1:14b 實測 2.2-27.3s avg 10.6s +# 原 30s 對 3 個序列階段每個只剩 10s → 頻繁 timeout → confidence=20% +# 調整: 每 Agent 25s, 2 個序列 + 1 組並行 = 3 個階段, 最差 75s + buffer = 90s +GLOBAL_TIMEOUT_SEC = 90.0 + +# 每個 Agent 個別超時(預設 5s 是開發機測試值,生產需對應 LLM 延遲) +# deepseek-r1:14b avg 10.6s, 99th percentile ~30s +_PER_AGENT_TIMEOUT_SEC = 25.0 # Redis Stream key STREAM_KEY = "aiops:p2:events" @@ -173,7 +180,7 @@ async def _debate( # ── Step 1: Diagnostician ────────────────────────────────────────────── diagnostician = get_diagnostician_agent() - diagnosis = await diagnostician.run(snapshot) + diagnosis = await diagnostician.run(snapshot, 
timeout_sec=_PER_AGENT_TIMEOUT_SEC) await _record_turn( session_id=session_id, incident_id=incident_id, @@ -187,7 +194,7 @@ async def _debate( # ── Step 2: Solver ───────────────────────────────────────────────────── solver = get_solver_agent() - plan = await solver.run(diagnosis) + plan = await solver.run(diagnosis, timeout_sec=_PER_AGENT_TIMEOUT_SEC) await _record_turn( session_id=session_id, incident_id=incident_id, @@ -204,8 +211,8 @@ async def _debate( critic = get_critic_agent() verdict, critic_report = await asyncio.gather( - reviewer.run(plan), - critic.run(diagnosis, plan), + reviewer.run(plan, timeout_sec=_PER_AGENT_TIMEOUT_SEC), + critic.run(diagnosis, plan, timeout_sec=_PER_AGENT_TIMEOUT_SEC), ) await asyncio.gather(