fix(agents): 修正 Agent 5s timeout 導致 LLM 推理全部失敗

根本原因: deepseek-r1:14b 實測推理 2.2-27.3s avg 10.6s 但 Diagnostician/Critic/Solver/Reviewer 全部使用 timeout_sec=5.0 (開發機測試值) → 67% 的 Agent 推理 timeout → 降級 confidence=20% → 自動修復從不觸發修復: - _PER_AGENT_TIMEOUT_SEC: 5s → 25s (覆蓋 avg 的 2.3x buffer) - GLOBAL_TIMEOUT_SEC: 30s → 90s (3個序列Agent × 25s + buffer) - 明確傳遞 timeout_sec 給所有 4 個 Agent 呼叫預期效果: 正常告警 AI 分析 confidence ≥ 0.5 → 觸發自動修復 2026-04-16 Claude Sonnet 4.6 Asia/Taipei Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-16 02:28:05 +08:00
parent a07daf7e3f
commit 9538f6cca4
1 changed files with 12 additions and 5 deletions
--- a/apps/api/src/services/agent_orchestrator.py
+++ b/apps/api/src/services/agent_orchestrator.py
@@ -64,7 +64,14 @@ if TYPE_CHECKING:
 logger = structlog.get_logger(__name__)

 # 全局超時（所有 Agent 加起來）
-GLOBAL_TIMEOUT_SEC = 30.0
+# 2026-04-16 Claude Sonnet 4.6: deepseek-r1:14b 實測 2.2-27.3s avg 10.6s
+# 原 30s 對 3 個序列 Agent 每個只剩 10s → 頻繁 timeout → confidence=20%
+# 調整: 每 Agent 25s, 3個序列+1組並行 = 最差 75s + buffer = 90s
+GLOBAL_TIMEOUT_SEC = 90.0
+
+# 每個 Agent 個別超時（預設 5s 是開發機測試值，生產需對應 LLM 延遲）
+# deepseek-r1:14b avg 10.6s, 99th percentile ~30s
+_PER_AGENT_TIMEOUT_SEC = 25.0

 # Redis Stream key
 STREAM_KEY = "aiops:p2:events"
@@ -173,7 +180,7 @@ async def _debate(

    # ── Step 1: Diagnostician ──────────────────────────────────────────────
    diagnostician = get_diagnostician_agent()
-    diagnosis = await diagnostician.run(snapshot)
+    diagnosis = await diagnostician.run(snapshot, timeout_sec=_PER_AGENT_TIMEOUT_SEC)
    await _record_turn(
        session_id=session_id,
        incident_id=incident_id,
@@ -187,7 +194,7 @@ async def _debate(

    # ── Step 2: Solver ─────────────────────────────────────────────────────
    solver = get_solver_agent()
-    plan = await solver.run(diagnosis)
+    plan = await solver.run(diagnosis, timeout_sec=_PER_AGENT_TIMEOUT_SEC)
    await _record_turn(
        session_id=session_id,
        incident_id=incident_id,
@@ -204,8 +211,8 @@ async def _debate(
    critic = get_critic_agent()

    verdict, critic_report = await asyncio.gather(
-        reviewer.run(plan),
-        critic.run(diagnosis, plan),
+        reviewer.run(plan, timeout_sec=_PER_AGENT_TIMEOUT_SEC),
+        critic.run(diagnosis, plan, timeout_sec=_PER_AGENT_TIMEOUT_SEC),
    )

    await asyncio.gather(