fix(agents): 移除人工 per-agent timeout,LLM 必須等完整回應
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
原設計以 asyncio.wait_for(..., timeout=timeout_sec)(Orchestrator 傳入 25s)任意截斷,
只要 LLM 超過時限就降級為 confidence=20%,等同沒有實際分析。
正確做法:
- 移除所有 4 個 agent 的 asyncio.wait_for() 包裝
- 只留 except Exception 捕真實異常(連線失敗、模型崩潰)
- 全流程由 Orchestrator GLOBAL_TIMEOUT_SEC=90s 防掛死
- _PER_AGENT_TIMEOUT_SEC 常數廢棄移除
影響:在 GLOBAL_TIMEOUT_SEC(90s)上限內,LLM 推理多久就等多久,不再有 per-agent 人工截斷,
deepseek-r1:14b 等模型得以完整輸出分析結果。
2026-04-16 ogt + Claude Sonnet 4.6
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -20,7 +20,6 @@ ADR-082: Phase 2 多 Agent 協作
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import time
|
||||
from typing import Any
|
||||
@@ -63,7 +62,7 @@ class CriticAgent(BaseAgent):
|
||||
self,
|
||||
diagnosis: DiagnosisReport,
|
||||
plan: ActionPlan,
|
||||
timeout_sec: float = 5.0,
|
||||
timeout_sec: float = 0.0, # noqa: ARG002 — 已廢棄,保留簽名相容性
|
||||
) -> CriticReport:
|
||||
"""
|
||||
批判性審查診斷和方案。
|
||||
@@ -71,18 +70,15 @@ class CriticAgent(BaseAgent):
|
||||
Args:
|
||||
diagnosis: Diagnostician 輸出
|
||||
plan: Solver 輸出
|
||||
timeout_sec: 熔斷超時
|
||||
timeout_sec: 已廢棄 (2026-04-16 ogt) — LLM 等完整回應,真實異常才降級
|
||||
|
||||
Returns:
|
||||
CriticReport(熔斷時 degraded=True,challenges 為空)
|
||||
CriticReport(真實異常時 degraded=True)
|
||||
"""
|
||||
start_ms = int(time.monotonic() * 1000)
|
||||
|
||||
try:
|
||||
report = await asyncio.wait_for(
|
||||
self._critique(diagnosis, plan),
|
||||
timeout=timeout_sec,
|
||||
)
|
||||
report = await self._critique(diagnosis, plan)
|
||||
report.latency_ms = int(time.monotonic() * 1000) - start_ms
|
||||
logger.info(
|
||||
"critic_done",
|
||||
@@ -93,11 +89,6 @@ class CriticAgent(BaseAgent):
|
||||
)
|
||||
return report
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
latency = int(time.monotonic() * 1000) - start_ms
|
||||
logger.warning("critic_timeout", timeout_sec=timeout_sec)
|
||||
return self._degraded_report(latency, "timeout")
|
||||
|
||||
except Exception:
|
||||
latency = int(time.monotonic() * 1000) - start_ms
|
||||
logger.exception("critic_error")
|
||||
|
||||
@@ -18,7 +18,6 @@ ADR-082: Phase 2 多 Agent 協作
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import json
|
||||
import time
|
||||
@@ -62,25 +61,24 @@ class DiagnosticianAgent(BaseAgent):
|
||||
async def run(
|
||||
self,
|
||||
snapshot: "EvidenceSnapshot",
|
||||
timeout_sec: float = 5.0,
|
||||
timeout_sec: float = 0.0, # noqa: ARG002 — 已廢棄,保留簽名相容性
|
||||
) -> DiagnosisReport:
|
||||
"""
|
||||
執行根因分析。
|
||||
|
||||
Args:
|
||||
snapshot: Phase 1 感官快照
|
||||
timeout_sec: 熔斷超時(預設 5s)
|
||||
timeout_sec: 已廢棄(2026-04-16 ogt + Claude Sonnet 4.6 — LLM 必須等完整回應)
|
||||
降級只在真正異常(連線失敗、模型崩潰)時觸發,
|
||||
全流程由 Orchestrator GLOBAL_TIMEOUT_SEC 防掛死
|
||||
|
||||
Returns:
|
||||
DiagnosisReport(熔斷時 degraded=True,vote=ABSTAIN)
|
||||
DiagnosisReport(真實異常時 degraded=True,vote=ABSTAIN)
|
||||
"""
|
||||
start_ms = int(time.monotonic() * 1000)
|
||||
|
||||
try:
|
||||
report = await asyncio.wait_for(
|
||||
self._analyze(snapshot),
|
||||
timeout=timeout_sec,
|
||||
)
|
||||
report = await self._analyze(snapshot)
|
||||
report.latency_ms = int(time.monotonic() * 1000) - start_ms
|
||||
logger.info(
|
||||
"diagnostician_done",
|
||||
@@ -92,11 +90,6 @@ class DiagnosticianAgent(BaseAgent):
|
||||
)
|
||||
return report
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
latency = int(time.monotonic() * 1000) - start_ms
|
||||
logger.warning("diagnostician_timeout", timeout_sec=timeout_sec)
|
||||
return self._degraded_report(snapshot, latency, reason="timeout")
|
||||
|
||||
except Exception:
|
||||
latency = int(time.monotonic() * 1000) - start_ms
|
||||
logger.exception("diagnostician_error")
|
||||
|
||||
@@ -26,7 +26,6 @@ ADR-082: Phase 2 多 Agent 協作
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import re
|
||||
import time
|
||||
@@ -77,21 +76,21 @@ class ReviewerAgent(BaseAgent):
|
||||
async def run(
|
||||
self,
|
||||
plan: ActionPlan,
|
||||
timeout_sec: float = 5.0,
|
||||
timeout_sec: float = 0.0, # noqa: ARG002 — 已廢棄,保留簽名相容性
|
||||
) -> ReviewVerdict:
|
||||
"""
|
||||
審查方案安全性。
|
||||
|
||||
Args:
|
||||
plan: Solver 輸出的方案
|
||||
timeout_sec: 熔斷超時
|
||||
timeout_sec: 已廢棄 (2026-04-16 ogt) — LLM 等完整回應,真實異常才降級
|
||||
|
||||
Returns:
|
||||
ReviewVerdict(熔斷時 degraded=True,使用 static rule 降級)
|
||||
ReviewVerdict(真實異常時 degraded=True)
|
||||
"""
|
||||
start_ms = int(time.monotonic() * 1000)
|
||||
|
||||
# 1. 硬核靜態檢查(不依賴 LLM)— 先於超時保護
|
||||
# 1. 硬核靜態檢查(不依賴 LLM)— HARD_RULES 優先
|
||||
hard_blocked = [
|
||||
c.action for c in plan.candidates
|
||||
if _is_hard_blocked(c.action)
|
||||
@@ -108,10 +107,7 @@ class ReviewerAgent(BaseAgent):
|
||||
)
|
||||
|
||||
try:
|
||||
verdict = await asyncio.wait_for(
|
||||
self._review(plan),
|
||||
timeout=timeout_sec,
|
||||
)
|
||||
verdict = await self._review(plan)
|
||||
verdict.latency_ms = int(time.monotonic() * 1000) - start_ms
|
||||
logger.info(
|
||||
"reviewer_done",
|
||||
@@ -122,11 +118,6 @@ class ReviewerAgent(BaseAgent):
|
||||
)
|
||||
return verdict
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
latency = int(time.monotonic() * 1000) - start_ms
|
||||
logger.warning("reviewer_timeout", timeout_sec=timeout_sec)
|
||||
return self._degraded_verdict(plan, latency, "timeout")
|
||||
|
||||
except Exception:
|
||||
latency = int(time.monotonic() * 1000) - start_ms
|
||||
logger.exception("reviewer_error")
|
||||
|
||||
@@ -19,7 +19,6 @@ ADR-082: Phase 2 多 Agent 協作
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import time
|
||||
from typing import Any
|
||||
@@ -54,17 +53,17 @@ class SolverAgent(BaseAgent):
|
||||
async def run(
|
||||
self,
|
||||
diagnosis: DiagnosisReport,
|
||||
timeout_sec: float = 5.0,
|
||||
timeout_sec: float = 0.0, # noqa: ARG002 — 已廢棄,保留簽名相容性
|
||||
) -> ActionPlan:
|
||||
"""
|
||||
根據診斷報告產出修復計畫。
|
||||
|
||||
Args:
|
||||
diagnosis: Diagnostician 輸出
|
||||
timeout_sec: 熔斷超時
|
||||
timeout_sec: 已廢棄 (2026-04-16 ogt) — LLM 等完整回應,真實異常才降級
|
||||
|
||||
Returns:
|
||||
ActionPlan(熔斷時 degraded=True)
|
||||
ActionPlan(真實異常時 degraded=True)
|
||||
"""
|
||||
start_ms = int(time.monotonic() * 1000)
|
||||
|
||||
@@ -80,10 +79,7 @@ class SolverAgent(BaseAgent):
|
||||
)
|
||||
|
||||
try:
|
||||
plan = await asyncio.wait_for(
|
||||
self._solve(diagnosis),
|
||||
timeout=timeout_sec,
|
||||
)
|
||||
plan = await self._solve(diagnosis)
|
||||
plan.latency_ms = int(time.monotonic() * 1000) - start_ms
|
||||
logger.info(
|
||||
"solver_done",
|
||||
@@ -93,11 +89,6 @@ class SolverAgent(BaseAgent):
|
||||
)
|
||||
return plan
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
latency = int(time.monotonic() * 1000) - start_ms
|
||||
logger.warning("solver_timeout", timeout_sec=timeout_sec)
|
||||
return self._degraded_plan(diagnosis, latency, "timeout")
|
||||
|
||||
except Exception:
|
||||
latency = int(time.monotonic() * 1000) - start_ms
|
||||
logger.exception("solver_error")
|
||||
|
||||
@@ -69,9 +69,9 @@ logger = structlog.get_logger(__name__)
|
||||
# 調整: 每 Agent 25s, 3個序列+1組並行 = 最差 75s + buffer = 90s(此為舊 per-agent 超時時期的推算;per-agent 超時移除後,90s 為全流程唯一上限)
|
||||
GLOBAL_TIMEOUT_SEC = 90.0
|
||||
|
||||
# 每個 Agent 個別超時(預設 5s 是開發機測試值,生產需對應 LLM 延遲)
|
||||
# deepseek-r1:14b avg 10.6s, 99th percentile ~30s
|
||||
_PER_AGENT_TIMEOUT_SEC = 25.0
|
||||
# 2026-04-16 ogt + Claude Sonnet 4.6: 移除 _PER_AGENT_TIMEOUT_SEC
|
||||
# LLM 必須等到完整回應,不得人工截斷。降級只在真正異常(連線失敗、模型崩潰)觸發。
|
||||
# 全流程由 GLOBAL_TIMEOUT_SEC 防掛死即可。
|
||||
|
||||
# Redis Stream key
|
||||
STREAM_KEY = "aiops:p2:events"
|
||||
@@ -180,7 +180,7 @@ async def _debate(
|
||||
|
||||
# ── Step 1: Diagnostician ──────────────────────────────────────────────
|
||||
diagnostician = get_diagnostician_agent()
|
||||
diagnosis = await diagnostician.run(snapshot, timeout_sec=_PER_AGENT_TIMEOUT_SEC)
|
||||
diagnosis = await diagnostician.run(snapshot)
|
||||
await _record_turn(
|
||||
session_id=session_id,
|
||||
incident_id=incident_id,
|
||||
@@ -194,7 +194,7 @@ async def _debate(
|
||||
|
||||
# ── Step 2: Solver ─────────────────────────────────────────────────────
|
||||
solver = get_solver_agent()
|
||||
plan = await solver.run(diagnosis, timeout_sec=_PER_AGENT_TIMEOUT_SEC)
|
||||
plan = await solver.run(diagnosis)
|
||||
await _record_turn(
|
||||
session_id=session_id,
|
||||
incident_id=incident_id,
|
||||
@@ -211,8 +211,8 @@ async def _debate(
|
||||
critic = get_critic_agent()
|
||||
|
||||
verdict, critic_report = await asyncio.gather(
|
||||
reviewer.run(plan, timeout_sec=_PER_AGENT_TIMEOUT_SEC),
|
||||
critic.run(diagnosis, plan, timeout_sec=_PER_AGENT_TIMEOUT_SEC),
|
||||
reviewer.run(plan),
|
||||
critic.run(diagnosis, plan),
|
||||
)
|
||||
|
||||
await asyncio.gather(
|
||||
|
||||
Reference in New Issue
Block a user