From 7e3cc8b3b0dac5618e3fea2798efc1785c8da3da Mon Sep 17 00:00:00 2001 From: OG T Date: Thu, 16 Apr 2026 02:54:34 +0800 Subject: [PATCH] =?UTF-8?q?fix(agents):=20=E7=A7=BB=E9=99=A4=E4=BA=BA?= =?UTF-8?q?=E5=B7=A5=20per-agent=20timeout=EF=BC=8CLLM=20=E5=BF=85?= =?UTF-8?q?=E9=A0=88=E7=AD=89=E5=AE=8C=E6=95=B4=E5=9B=9E=E6=87=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 原設計 asyncio.wait_for(timeout_sec=25s) 是任意截斷, 只要 LLM 超過時限就降級為 confidence=20%,根本沒有分析。 正確做法: - 移除所有 4 個 agent 的 asyncio.wait_for() 包裝 - 只留 except Exception 捕真實異常(連線失敗、模型崩潰) - 全流程由 Orchestrator GLOBAL_TIMEOUT_SEC=90s 防掛死 - _PER_AGENT_TIMEOUT_SEC 常數廢棄移除 影響:LLM 推理多久就等多久,不再人工截斷, deepseek-r1:14b 等模型得以完整輸出分析結果。 2026-04-16 ogt + Claude Sonnet 4.6 Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/agents/critic_agent.py | 17 ++++------------- apps/api/src/agents/diagnostician_agent.py | 19 ++++++------------- apps/api/src/agents/reviewer_agent.py | 19 +++++-------------- apps/api/src/agents/solver_agent.py | 17 ++++------------- apps/api/src/services/agent_orchestrator.py | 14 +++++++------- 5 files changed, 26 insertions(+), 60 deletions(-) diff --git a/apps/api/src/agents/critic_agent.py b/apps/api/src/agents/critic_agent.py index 1a202805..7529e08e 100644 --- a/apps/api/src/agents/critic_agent.py +++ b/apps/api/src/agents/critic_agent.py @@ -20,7 +20,6 @@ ADR-082: Phase 2 多 Agent 協作 from __future__ import annotations -import asyncio import hashlib import time from typing import Any @@ -63,7 +62,7 @@ class CriticAgent(BaseAgent): self, diagnosis: DiagnosisReport, plan: ActionPlan, - timeout_sec: float = 5.0, + timeout_sec: float = 0.0, # noqa: ARG002 — 已廢棄,保留簽名相容性 ) -> CriticReport: """ 批判性審查診斷和方案。 @@ -71,18 +70,15 @@ class CriticAgent(BaseAgent): Args: diagnosis: Diagnostician 輸出 plan: Solver 輸出 - timeout_sec: 熔斷超時 + timeout_sec: 已廢棄 (2026-04-16 ogt) — LLM 等完整回應,真實異常才降級 Returns: - CriticReport(熔斷時 degraded=True,challenges 為空) + CriticReport(真實異常時 degraded=True) """ start_ms = int(time.monotonic() * 1000) try: - report = await asyncio.wait_for( - self._critique(diagnosis, plan), - timeout=timeout_sec, - ) + report = await self._critique(diagnosis, plan) report.latency_ms = int(time.monotonic() * 1000) - start_ms logger.info( "critic_done", @@ -93,11 +89,6 @@ class CriticAgent(BaseAgent): ) return report - except asyncio.TimeoutError: - latency = int(time.monotonic() * 1000) - start_ms - logger.warning("critic_timeout", timeout_sec=timeout_sec) - return self._degraded_report(latency, "timeout") - except Exception: latency = int(time.monotonic() * 1000) - start_ms logger.exception("critic_error") diff --git a/apps/api/src/agents/diagnostician_agent.py b/apps/api/src/agents/diagnostician_agent.py index 7974c079..15367e13 100644 --- a/apps/api/src/agents/diagnostician_agent.py +++ b/apps/api/src/agents/diagnostician_agent.py @@ -18,7 +18,6 @@ ADR-082: Phase 2 多 Agent 協作 from __future__ import annotations -import asyncio import hashlib import json import time @@ -62,25 +61,24 @@ class DiagnosticianAgent(BaseAgent): async def run( self, snapshot: "EvidenceSnapshot", - timeout_sec: float = 5.0, + timeout_sec: float = 0.0, # noqa: ARG002 — 已廢棄,保留簽名相容性 ) -> DiagnosisReport: """ 執行根因分析。 Args: snapshot: Phase 1 感官快照 - timeout_sec: 熔斷超時(預設 5s) + timeout_sec: 已廢棄(2026-04-16 ogt + Claude Sonnet 4.6 — LLM 必須等完整回應) + 降級只在真正異常(連線失敗、模型崩潰)時觸發, + 全流程由 Orchestrator GLOBAL_TIMEOUT_SEC 防掛死 Returns: - DiagnosisReport(熔斷時 degraded=True,vote=ABSTAIN) + DiagnosisReport(真實異常時 degraded=True,vote=ABSTAIN) """ start_ms = int(time.monotonic() * 1000) try: - report = await asyncio.wait_for( - self._analyze(snapshot), - timeout=timeout_sec, - ) + report = await self._analyze(snapshot) report.latency_ms = int(time.monotonic() * 1000) - start_ms logger.info( "diagnostician_done", @@ -92,11 +90,6 @@ class DiagnosticianAgent(BaseAgent): ) return report - except asyncio.TimeoutError: - latency = int(time.monotonic() * 1000) - start_ms - logger.warning("diagnostician_timeout", timeout_sec=timeout_sec) - return self._degraded_report(snapshot, latency, reason="timeout") - except Exception: latency = int(time.monotonic() * 1000) - start_ms logger.exception("diagnostician_error") diff --git a/apps/api/src/agents/reviewer_agent.py b/apps/api/src/agents/reviewer_agent.py index 6437c243..4de51ef7 100644 --- a/apps/api/src/agents/reviewer_agent.py +++ b/apps/api/src/agents/reviewer_agent.py @@ -26,7 +26,6 @@ ADR-082: Phase 2 多 Agent 協作 from __future__ import annotations -import asyncio import hashlib import re import time @@ -77,21 +76,21 @@ class ReviewerAgent(BaseAgent): async def run( self, plan: ActionPlan, - timeout_sec: float = 5.0, + timeout_sec: float = 0.0, # noqa: ARG002 — 已廢棄,保留簽名相容性 ) -> ReviewVerdict: """ 審查方案安全性。 Args: plan: Solver 輸出的方案 - timeout_sec: 熔斷超時 + timeout_sec: 已廢棄 (2026-04-16 ogt) — LLM 等完整回應,真實異常才降級 Returns: - ReviewVerdict(熔斷時 degraded=True,使用 static rule 降級) + ReviewVerdict(真實異常時 degraded=True) """ start_ms = int(time.monotonic() * 1000) - # 1. 硬核靜態檢查(不依賴 LLM)— 先於超時保護 + # 1. 硬核靜態檢查(不依賴 LLM)— HARD_RULES 優先 hard_blocked = [ c.action for c in plan.candidates if _is_hard_blocked(c.action) @@ -108,10 +107,7 @@ class ReviewerAgent(BaseAgent): ) try: - verdict = await asyncio.wait_for( - self._review(plan), - timeout=timeout_sec, - ) + verdict = await self._review(plan) verdict.latency_ms = int(time.monotonic() * 1000) - start_ms logger.info( "reviewer_done", @@ -122,11 +118,6 @@ class ReviewerAgent(BaseAgent): ) return verdict - except asyncio.TimeoutError: - latency = int(time.monotonic() * 1000) - start_ms - logger.warning("reviewer_timeout", timeout_sec=timeout_sec) - return self._degraded_verdict(plan, latency, "timeout") - except Exception: latency = int(time.monotonic() * 1000) - start_ms logger.exception("reviewer_error") diff --git a/apps/api/src/agents/solver_agent.py b/apps/api/src/agents/solver_agent.py index 627eb918..d62f6dcb 100644 --- a/apps/api/src/agents/solver_agent.py +++ b/apps/api/src/agents/solver_agent.py @@ -19,7 +19,6 @@ ADR-082: Phase 2 多 Agent 協作 from __future__ import annotations -import asyncio import hashlib import time from typing import Any @@ -54,17 +53,17 @@ class SolverAgent(BaseAgent): async def run( self, diagnosis: DiagnosisReport, - timeout_sec: float = 5.0, + timeout_sec: float = 0.0, # noqa: ARG002 — 已廢棄,保留簽名相容性 ) -> ActionPlan: """ 根據診斷報告產出修復計畫。 Args: diagnosis: Diagnostician 輸出 - timeout_sec: 熔斷超時 + timeout_sec: 已廢棄 (2026-04-16 ogt) — LLM 等完整回應,真實異常才降級 Returns: - ActionPlan(熔斷時 degraded=True) + ActionPlan(真實異常時 degraded=True) """ start_ms = int(time.monotonic() * 1000) @@ -80,10 +79,7 @@ class SolverAgent(BaseAgent): ) try: - plan = await asyncio.wait_for( - self._solve(diagnosis), - timeout=timeout_sec, - ) + plan = await self._solve(diagnosis) plan.latency_ms = int(time.monotonic() * 1000) - start_ms logger.info( "solver_done", @@ -93,11 +89,6 @@ class SolverAgent(BaseAgent): ) return plan - except asyncio.TimeoutError: - latency = int(time.monotonic() * 1000) - start_ms - logger.warning("solver_timeout", timeout_sec=timeout_sec) - return self._degraded_plan(diagnosis, latency, "timeout") - except Exception: latency = int(time.monotonic() * 1000) - start_ms logger.exception("solver_error") diff --git a/apps/api/src/services/agent_orchestrator.py b/apps/api/src/services/agent_orchestrator.py index 589959da..780a4d29 100644 --- a/apps/api/src/services/agent_orchestrator.py +++ b/apps/api/src/services/agent_orchestrator.py @@ -69,9 +69,9 @@ logger = structlog.get_logger(__name__) # 調整: 每 Agent 25s, 3個序列+1組並行 = 最差 75s + buffer = 90s GLOBAL_TIMEOUT_SEC = 90.0 -# 每個 Agent 個別超時(預設 5s 是開發機測試值,生產需對應 LLM 延遲) -# deepseek-r1:14b avg 10.6s, 99th percentile ~30s -_PER_AGENT_TIMEOUT_SEC = 25.0 +# 2026-04-16 ogt + Claude Sonnet 4.6: 移除 _PER_AGENT_TIMEOUT_SEC +# LLM 必須等到完整回應,不得人工截斷。降級只在真正異常(連線失敗、模型崩潰)觸發。 +# 全流程由 GLOBAL_TIMEOUT_SEC 防掛死即可。 # Redis Stream key STREAM_KEY = "aiops:p2:events" @@ -180,7 +180,7 @@ async def _debate( # ── Step 1: Diagnostician ────────────────────────────────────────────── diagnostician = get_diagnostician_agent() - diagnosis = await diagnostician.run(snapshot, timeout_sec=_PER_AGENT_TIMEOUT_SEC) + diagnosis = await diagnostician.run(snapshot) await _record_turn( session_id=session_id, incident_id=incident_id, @@ -194,7 +194,7 @@ async def _debate( # ── Step 2: Solver ───────────────────────────────────────────────────── solver = get_solver_agent() - plan = await solver.run(diagnosis, timeout_sec=_PER_AGENT_TIMEOUT_SEC) + plan = await solver.run(diagnosis) await _record_turn( session_id=session_id, incident_id=incident_id, @@ -211,8 +211,8 @@ async def _debate( critic = get_critic_agent() verdict, critic_report = await asyncio.gather( - reviewer.run(plan, timeout_sec=_PER_AGENT_TIMEOUT_SEC), - critic.run(diagnosis, plan, timeout_sec=_PER_AGENT_TIMEOUT_SEC), + reviewer.run(plan), + critic.run(diagnosis, plan), ) await asyncio.gather(