fix(api): bound ai route status checks
All checks were successful
CD Pipeline / tests (push) Successful in 5m39s
Code Review / ai-code-review (push) Successful in 11s
CD Pipeline / build-and-deploy (push) Successful in 4m10s
CD Pipeline / post-deploy-checks (push) Successful in 1m34s

This commit is contained in:
Your Name
2026-05-24 12:38:46 +08:00
parent b17acbb043
commit bdccb80ed7
2 changed files with 61 additions and 1 deletion

View File

@@ -8,6 +8,7 @@ ADR-106AwoooP Agent Platform
from __future__ import annotations
import asyncio
import re
import uuid
from collections import defaultdict
@@ -60,6 +61,7 @@ _MAX_EVENTS = 100
_MAX_TIMELINE_ITEMS = 100
_MAX_LIST_CONTEXT_ROWS = 500
_MAX_STEP_SUMMARY_CHARS = 128
_AI_ROUTE_STATUS_SELECT_TIMEOUT_SECONDS = 12.0
_REMEDIATION_HISTORY_LIMIT = 20
_INCIDENT_ID_RE = re.compile(r"\bINC-\d{8}-[A-Z0-9]{4,}\b")
_REMEDIATION_STATUS_FILTERS = {
@@ -503,7 +505,33 @@ async def get_ai_route_status(
checked_at = _utc_now_naive()
try:
route = await get_ollama_failover_manager().select_provider(task_type=workload)
route = await asyncio.wait_for(
get_ollama_failover_manager().select_provider(task_type=workload),
timeout=_AI_ROUTE_STATUS_SELECT_TIMEOUT_SECONDS,
)
except asyncio.TimeoutError:
logger.warning(
"ai_route_status_check_timeout",
workload_type=workload,
timeout_seconds=_AI_ROUTE_STATUS_SELECT_TIMEOUT_SECONDS,
)
return {
"schema_version": _AI_ROUTE_STATUS_SCHEMA_VERSION,
"workload_type": workload,
"policy_order": policy_order,
"selected_provider": None,
"selected_url": None,
"selected_model": None,
"fallback_chain": [],
"route_reason": "route_check_timeout",
"route_source": "ollama_failover_manager",
"route_error": (
f"route status timed out after "
f"{_AI_ROUTE_STATUS_SELECT_TIMEOUT_SECONDS:g}s"
),
"health": {},
"checked_at": checked_at,
}
except Exception as exc:
return {
"schema_version": _AI_ROUTE_STATUS_SCHEMA_VERSION,

View File

@@ -1,3 +1,4 @@
import asyncio
from datetime import datetime
from decimal import Decimal
from types import SimpleNamespace
@@ -6,6 +7,7 @@ from uuid import UUID
import pytest
from fastapi import HTTPException
import src.services.platform_operator_service as platform_operator_service
from src.api.v1.platform.operator_runs import (
AiRouteStatusResponse,
ListCicdEventsResponse,
@@ -1182,6 +1184,36 @@ def test_ai_route_status_response_preserves_route_fields() -> None:
assert dumped["selected_provider"] == "ollama_gcp_a"
@pytest.mark.asyncio
async def test_ai_route_status_times_out_before_slow_provider_checks(monkeypatch) -> None:
    """A provider selection slower than the timeout yields the timeout payload."""

    class SlowFailoverManager:
        """Stub manager whose provider selection outlasts the patched timeout."""

        async def select_provider(self, task_type: str = "general") -> None:
            await asyncio.sleep(0.05)

    # The timeout is patched far below the stub's 0.05s sleep, so the
    # bounded selection call expires before the stub can return.
    patched_attributes = (
        ("_AI_ROUTE_STATUS_SELECT_TIMEOUT_SECONDS", 0.001),
        ("get_ollama_failover_manager", SlowFailoverManager),
    )
    for attribute_name, replacement in patched_attributes:
        monkeypatch.setattr(platform_operator_service, attribute_name, replacement)

    payload = await platform_operator_service.get_ai_route_status("deep_rca")

    assert payload["route_reason"] == "route_check_timeout"
    assert payload["route_error"] == "route status timed out after 0.001s"
    assert payload["selected_provider"] is None

    # The policy order must still be reported even though no route was selected.
    reported_order = [entry["provider_name"] for entry in payload["policy_order"]]
    assert reported_order == [
        "ollama_gcp_a",
        "ollama_gcp_b",
        "ollama_local",
        "gemini",
    ]
def test_ai_route_workload_validation_rejects_unknown_value() -> None:
assert _validate_ai_route_workload(" hermes ") == "hermes"
with pytest.raises(HTTPException) as exc_info: