From 9538f6cca42c4c1e4eb8d09c9d6780dc6049534a Mon Sep 17 00:00:00 2001
From: OG T <ogt@WOOOMacMiniM4.local>
Date: Thu, 16 Apr 2026 02:28:05 +0800
Subject: [PATCH] =?UTF-8?q?fix(agents):=20=E4=BF=AE=E6=AD=A3=20Agent=205s?=
 =?UTF-8?q?=20timeout=20=E5=B0=8E=E8=87=B4=20LLM=20=E6=8E=A8=E7=90=86?=
 =?UTF-8?q?=E5=85=A8=E9=83=A8=E5=A4=B1=E6=95=97?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

根本原因: deepseek-r1:14b 實測推理 2.2-27.3s avg 10.6s
但 Diagnostician/Critic/Solver/Reviewer 全部使用 timeout_sec=5.0 (開發機測試值)
→ 約 67% 的 Agent 推理逾時 (timeout) → 降級為 confidence=0.2 (20%)，低於 0.5 門檻 → 自動修復從不觸發

修復:
- _PER_AGENT_TIMEOUT_SEC: 5s → 25s (約為實測 avg 10.6s 的 2.4 倍；注意實測最大值 27.3s 仍會超過此上限)
- GLOBAL_TIMEOUT_SEC: 30s → 90s (3 個序列階段 Diagnostician → Solver → Reviewer/Critic 並行，每階段 25s = 75s，加 15s buffer)
- 明確傳遞 timeout_sec 給所有 4 個 Agent 呼叫

預期效果: 正常告警 AI 分析 confidence ≥ 0.5 → 觸發自動修復

2026-04-16 Claude Sonnet 4.6 Asia/Taipei

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 apps/api/src/services/agent_orchestrator.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/apps/api/src/services/agent_orchestrator.py b/apps/api/src/services/agent_orchestrator.py
index ac21a23f..589959da 100644
--- a/apps/api/src/services/agent_orchestrator.py
+++ b/apps/api/src/services/agent_orchestrator.py
@@ -64,7 +64,14 @@ if TYPE_CHECKING:
 logger = structlog.get_logger(__name__)
 
 # 全局超時（所有 Agent 加起來）
-GLOBAL_TIMEOUT_SEC = 30.0
+# 2026-04-16 Claude Sonnet 4.6: deepseek-r1:14b 實測 2.2-27.3s avg 10.6s
+# 原 30s 對 3 個序列 Agent 每個只剩 10s → 頻繁 timeout → confidence=20%
+# 調整: 每 Agent 25s, 3 個序列階段 (Diagnostician → Solver → Reviewer‖Critic 並行) = 最差 75s + 15s buffer = 90s
+GLOBAL_TIMEOUT_SEC = 90.0
+
+# 每個 Agent 個別超時（預設 5s 是開發機測試值，生產需對應 LLM 延遲）
+# deepseek-r1:14b avg 10.6s, 99th percentile ~30s (實測最大 27.3s) → 25s 仍會截斷最慢的尾端請求
+_PER_AGENT_TIMEOUT_SEC = 25.0
 
 # Redis Stream key
 STREAM_KEY = "aiops:p2:events"
@@ -173,7 +180,7 @@ async def _debate(
 
     # ── Step 1: Diagnostician ──────────────────────────────────────────────
     diagnostician = get_diagnostician_agent()
-    diagnosis = await diagnostician.run(snapshot)
+    diagnosis = await diagnostician.run(snapshot, timeout_sec=_PER_AGENT_TIMEOUT_SEC)
     await _record_turn(
         session_id=session_id,
         incident_id=incident_id,
@@ -187,7 +194,7 @@ async def _debate(
 
     # ── Step 2: Solver ─────────────────────────────────────────────────────
     solver = get_solver_agent()
-    plan = await solver.run(diagnosis)
+    plan = await solver.run(diagnosis, timeout_sec=_PER_AGENT_TIMEOUT_SEC)
     await _record_turn(
         session_id=session_id,
         incident_id=incident_id,
@@ -204,8 +211,8 @@ async def _debate(
     critic = get_critic_agent()
 
     verdict, critic_report = await asyncio.gather(
-        reviewer.run(plan),
-        critic.run(diagnosis, plan),
+        reviewer.run(plan, timeout_sec=_PER_AGENT_TIMEOUT_SEC),
+        critic.run(diagnosis, plan, timeout_sec=_PER_AGENT_TIMEOUT_SEC),
     )
 
     await asyncio.gather(