fix(api): incidents list 不再同步等待 AI 決策 (效能修復)
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
問題: GET /api/v1/incidents 在迴圈中對每個 incident 依序 await AI 分析 (每筆 timeout 120-180s)
多個活躍 incident 時,最壞情況的總等待時間為 incident 數量 × timeout → 前端完全無法載入
修復:
- list endpoint 只查 Redis 已快取的決策 token (立即返回)
- 無快取時回 decision=null,背景 fire-and-forget 觸發 AI
- 前端對有興趣的 incident 再 GET 單筆端點取得決策結果
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -143,37 +143,46 @@ async def list_incidents() -> IncidentListResponse:
|
||||
|
||||
incidents.sort(key=safe_created_at, reverse=True)
|
||||
|
||||
# Phase 6.5: 為每個事件生成決策令牌 (非同步並行)
|
||||
# 2026-04-09 Claude Sonnet 4.6: 效能修復 — list endpoint 不同步等待 AI
|
||||
# 原設計: 每個 incident await AI 決策 (120-180s timeout),多 incident 時乘積爆炸
|
||||
# 修復: 只取已存在的決策 token,若無則背景觸發生成,前端 poll 單筆 GET 取得結果
|
||||
import asyncio
|
||||
|
||||
responses = []
|
||||
background_tasks = []
|
||||
|
||||
for incident in incidents:
|
||||
try:
|
||||
# P0/P1 給更短的 timeout (緊急)
|
||||
# 2026-03-27 ogt: 增加超時 (Ollama CPU 模式 llama3.2:3b 約 2-3 分鐘)
|
||||
timeout = 120.0 if incident.severity in (Severity.P0, Severity.P1) else 180.0
|
||||
|
||||
decision_token = await decision_manager.get_or_create_decision(
|
||||
incident=incident,
|
||||
timeout_sec=timeout,
|
||||
)
|
||||
|
||||
decision_info = DecisionInfo(
|
||||
token=decision_token.token,
|
||||
state=decision_token.state.value,
|
||||
proposal_data=decision_token.proposal_data,
|
||||
proposal_id=decision_token.proposal_id,
|
||||
)
|
||||
|
||||
responses.append(IncidentResponse.from_incident(incident, decision_info))
|
||||
|
||||
# 只查已快取的決策 (不等待 AI,立即返回)
|
||||
existing = await decision_manager._find_existing_token(incident.incident_id)
|
||||
if existing:
|
||||
decision_info = DecisionInfo(
|
||||
token=existing.token,
|
||||
state=existing.state.value,
|
||||
proposal_data=existing.proposal_data,
|
||||
proposal_id=existing.proposal_id,
|
||||
)
|
||||
responses.append(IncidentResponse.from_incident(incident, decision_info))
|
||||
else:
|
||||
# 無快取 → 背景觸發,本次返回 None(前端看到 decision=null 會 poll)
|
||||
responses.append(IncidentResponse.from_incident(incident, None))
|
||||
timeout = 120.0 if incident.severity in (Severity.P0, Severity.P1) else 180.0
|
||||
background_tasks.append(
|
||||
decision_manager.get_or_create_decision(incident=incident, timeout_sec=timeout)
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"decision_generation_failed",
|
||||
incident_id=incident.incident_id,
|
||||
error=str(e),
|
||||
)
|
||||
# 即使決策生成失敗,也返回事件 (不含 decision)
|
||||
responses.append(IncidentResponse.from_incident(incident, None))
|
||||
|
||||
# 背景觸發 AI 決策(fire-and-forget,不阻塞 response)
|
||||
if background_tasks:
|
||||
for task in background_tasks:
|
||||
asyncio.create_task(task)
|
||||
|
||||
logger.info(
|
||||
"incidents_listed",
|
||||
count=len(incidents),
|
||||
|
||||
Reference in New Issue
Block a user