fix(agents): 移除人工 per-agent timeout,LLM 必須等完整回應
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled

原設計以 asyncio.wait_for(..., timeout=timeout_sec)(timeout_sec=25s)包裝每個 agent,是任意截斷,
只要 LLM 超過時限就降級為 confidence=20%,根本沒有分析。

正確做法:
- 移除所有 4 個 agent 的 asyncio.wait_for() 包裝
- 只留 except Exception 捕真實異常(連線失敗、模型崩潰)
- 全流程由 Orchestrator GLOBAL_TIMEOUT_SEC=90s 防掛死
- _PER_AGENT_TIMEOUT_SEC 常數廢棄移除

影響:在 GLOBAL_TIMEOUT_SEC=90s 的全流程上限內,LLM 推理多久就等多久,不再以 per-agent 時限人工截斷,
      deepseek-r1:14b 等模型得以完整輸出分析結果。

2026-04-16 ogt + Claude Sonnet 4.6

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-16 02:54:34 +08:00
parent 5a3a649f8a
commit 7e3cc8b3b0
5 changed files with 26 additions and 60 deletions

View File

@@ -20,7 +20,6 @@ ADR-082: Phase 2 多 Agent 協作
from __future__ import annotations
import asyncio
import hashlib
import time
from typing import Any
@@ -63,7 +62,7 @@ class CriticAgent(BaseAgent):
self,
diagnosis: DiagnosisReport,
plan: ActionPlan,
timeout_sec: float = 5.0,
timeout_sec: float = 0.0, # noqa: ARG002 — 已廢棄,保留簽名相容性
) -> CriticReport:
"""
批判性審查診斷和方案。
@@ -71,18 +70,15 @@ class CriticAgent(BaseAgent):
Args:
diagnosis: Diagnostician 輸出
plan: Solver 輸出
timeout_sec: 熔斷超時
timeout_sec: 已廢棄 (2026-04-16 ogt) — LLM 等完整回應,真實異常才降級
Returns:
CriticReport(熔斷時 degraded=True,challenges 為空)
CriticReport(真實異常時 degraded=True)
"""
start_ms = int(time.monotonic() * 1000)
try:
report = await asyncio.wait_for(
self._critique(diagnosis, plan),
timeout=timeout_sec,
)
report = await self._critique(diagnosis, plan)
report.latency_ms = int(time.monotonic() * 1000) - start_ms
logger.info(
"critic_done",
@@ -93,11 +89,6 @@ class CriticAgent(BaseAgent):
)
return report
except asyncio.TimeoutError:
latency = int(time.monotonic() * 1000) - start_ms
logger.warning("critic_timeout", timeout_sec=timeout_sec)
return self._degraded_report(latency, "timeout")
except Exception:
latency = int(time.monotonic() * 1000) - start_ms
logger.exception("critic_error")

View File

@@ -18,7 +18,6 @@ ADR-082: Phase 2 多 Agent 協作
from __future__ import annotations
import asyncio
import hashlib
import json
import time
@@ -62,25 +61,24 @@ class DiagnosticianAgent(BaseAgent):
async def run(
self,
snapshot: "EvidenceSnapshot",
timeout_sec: float = 5.0,
timeout_sec: float = 0.0, # noqa: ARG002 — 已廢棄,保留簽名相容性
) -> DiagnosisReport:
"""
執行根因分析。
Args:
snapshot: Phase 1 感官快照
timeout_sec: 熔斷超時(預設 5s)
timeout_sec: 已廢棄(2026-04-16 ogt + Claude Sonnet 4.6) — LLM 必須等完整回應
降級只在真正異常(連線失敗、模型崩潰)時觸發,
全流程由 Orchestrator GLOBAL_TIMEOUT_SEC 防掛死
Returns:
DiagnosisReport(熔斷時 degraded=True,vote=ABSTAIN)
DiagnosisReport(真實異常時 degraded=True,vote=ABSTAIN)
"""
start_ms = int(time.monotonic() * 1000)
try:
report = await asyncio.wait_for(
self._analyze(snapshot),
timeout=timeout_sec,
)
report = await self._analyze(snapshot)
report.latency_ms = int(time.monotonic() * 1000) - start_ms
logger.info(
"diagnostician_done",
@@ -92,11 +90,6 @@ class DiagnosticianAgent(BaseAgent):
)
return report
except asyncio.TimeoutError:
latency = int(time.monotonic() * 1000) - start_ms
logger.warning("diagnostician_timeout", timeout_sec=timeout_sec)
return self._degraded_report(snapshot, latency, reason="timeout")
except Exception:
latency = int(time.monotonic() * 1000) - start_ms
logger.exception("diagnostician_error")

View File

@@ -26,7 +26,6 @@ ADR-082: Phase 2 多 Agent 協作
from __future__ import annotations
import asyncio
import hashlib
import re
import time
@@ -77,21 +76,21 @@ class ReviewerAgent(BaseAgent):
async def run(
self,
plan: ActionPlan,
timeout_sec: float = 5.0,
timeout_sec: float = 0.0, # noqa: ARG002 — 已廢棄,保留簽名相容性
) -> ReviewVerdict:
"""
審查方案安全性。
Args:
plan: Solver 輸出的方案
timeout_sec: 熔斷超時
timeout_sec: 已廢棄 (2026-04-16 ogt) — LLM 等完整回應,真實異常才降級
Returns:
ReviewVerdict(熔斷時 degraded=True,使用 static rule 降級)
ReviewVerdict(真實異常時 degraded=True)
"""
start_ms = int(time.monotonic() * 1000)
# 1. 硬核靜態檢查(不依賴 LLM) — 先於超時保護
# 1. 硬核靜態檢查(不依賴 LLM,HARD_RULES 優先)
hard_blocked = [
c.action for c in plan.candidates
if _is_hard_blocked(c.action)
@@ -108,10 +107,7 @@ class ReviewerAgent(BaseAgent):
)
try:
verdict = await asyncio.wait_for(
self._review(plan),
timeout=timeout_sec,
)
verdict = await self._review(plan)
verdict.latency_ms = int(time.monotonic() * 1000) - start_ms
logger.info(
"reviewer_done",
@@ -122,11 +118,6 @@ class ReviewerAgent(BaseAgent):
)
return verdict
except asyncio.TimeoutError:
latency = int(time.monotonic() * 1000) - start_ms
logger.warning("reviewer_timeout", timeout_sec=timeout_sec)
return self._degraded_verdict(plan, latency, "timeout")
except Exception:
latency = int(time.monotonic() * 1000) - start_ms
logger.exception("reviewer_error")

View File

@@ -19,7 +19,6 @@ ADR-082: Phase 2 多 Agent 協作
from __future__ import annotations
import asyncio
import hashlib
import time
from typing import Any
@@ -54,17 +53,17 @@ class SolverAgent(BaseAgent):
async def run(
self,
diagnosis: DiagnosisReport,
timeout_sec: float = 5.0,
timeout_sec: float = 0.0, # noqa: ARG002 — 已廢棄,保留簽名相容性
) -> ActionPlan:
"""
根據診斷報告產出修復計畫。
Args:
diagnosis: Diagnostician 輸出
timeout_sec: 熔斷超時
timeout_sec: 已廢棄 (2026-04-16 ogt) — LLM 等完整回應,真實異常才降級
Returns:
ActionPlan(熔斷時 degraded=True)
ActionPlan(真實異常時 degraded=True)
"""
start_ms = int(time.monotonic() * 1000)
@@ -80,10 +79,7 @@ class SolverAgent(BaseAgent):
)
try:
plan = await asyncio.wait_for(
self._solve(diagnosis),
timeout=timeout_sec,
)
plan = await self._solve(diagnosis)
plan.latency_ms = int(time.monotonic() * 1000) - start_ms
logger.info(
"solver_done",
@@ -93,11 +89,6 @@ class SolverAgent(BaseAgent):
)
return plan
except asyncio.TimeoutError:
latency = int(time.monotonic() * 1000) - start_ms
logger.warning("solver_timeout", timeout_sec=timeout_sec)
return self._degraded_plan(diagnosis, latency, "timeout")
except Exception:
latency = int(time.monotonic() * 1000) - start_ms
logger.exception("solver_error")

View File

@@ -69,9 +69,9 @@ logger = structlog.get_logger(__name__)
# 調整(歷史估算): 原以每 Agent 25s 計, 3個序列+1組並行 = 最差 75s + buffer = 90s
GLOBAL_TIMEOUT_SEC = 90.0
# 每個 Agent 個別超時(預設 5s 是開發機測試值,生產需對應 LLM 延遲)
# deepseek-r1:14b avg 10.6s, 99th percentile ~30s
_PER_AGENT_TIMEOUT_SEC = 25.0
# 2026-04-16 ogt + Claude Sonnet 4.6: 移除 _PER_AGENT_TIMEOUT_SEC
# LLM 必須等到完整回應,不得人工截斷。降級只在真正異常(連線失敗、模型崩潰)觸發。
# 全流程由 GLOBAL_TIMEOUT_SEC 防掛死即可。
# Redis Stream key
STREAM_KEY = "aiops:p2:events"
@@ -180,7 +180,7 @@ async def _debate(
# ── Step 1: Diagnostician ──────────────────────────────────────────────
diagnostician = get_diagnostician_agent()
diagnosis = await diagnostician.run(snapshot, timeout_sec=_PER_AGENT_TIMEOUT_SEC)
diagnosis = await diagnostician.run(snapshot)
await _record_turn(
session_id=session_id,
incident_id=incident_id,
@@ -194,7 +194,7 @@ async def _debate(
# ── Step 2: Solver ─────────────────────────────────────────────────────
solver = get_solver_agent()
plan = await solver.run(diagnosis, timeout_sec=_PER_AGENT_TIMEOUT_SEC)
plan = await solver.run(diagnosis)
await _record_turn(
session_id=session_id,
incident_id=incident_id,
@@ -211,8 +211,8 @@ async def _debate(
critic = get_critic_agent()
verdict, critic_report = await asyncio.gather(
reviewer.run(plan, timeout_sec=_PER_AGENT_TIMEOUT_SEC),
critic.run(diagnosis, plan, timeout_sec=_PER_AGENT_TIMEOUT_SEC),
reviewer.run(plan),
critic.run(diagnosis, plan),
)
await asyncio.gather(