fix(api): bound ai route status checks
This commit is contained in:
@@ -8,6 +8,7 @@ ADR-106(AwoooP Agent Platform)
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
@@ -60,6 +61,7 @@ _MAX_EVENTS = 100
|
||||
_MAX_TIMELINE_ITEMS = 100
|
||||
_MAX_LIST_CONTEXT_ROWS = 500
|
||||
_MAX_STEP_SUMMARY_CHARS = 128
|
||||
_AI_ROUTE_STATUS_SELECT_TIMEOUT_SECONDS = 12.0
|
||||
_REMEDIATION_HISTORY_LIMIT = 20
|
||||
_INCIDENT_ID_RE = re.compile(r"\bINC-\d{8}-[A-Z0-9]{4,}\b")
|
||||
_REMEDIATION_STATUS_FILTERS = {
|
||||
@@ -503,7 +505,33 @@ async def get_ai_route_status(
|
||||
checked_at = _utc_now_naive()
|
||||
|
||||
try:
|
||||
route = await get_ollama_failover_manager().select_provider(task_type=workload)
|
||||
route = await asyncio.wait_for(
|
||||
get_ollama_failover_manager().select_provider(task_type=workload),
|
||||
timeout=_AI_ROUTE_STATUS_SELECT_TIMEOUT_SECONDS,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
logger.warning(
|
||||
"ai_route_status_check_timeout",
|
||||
workload_type=workload,
|
||||
timeout_seconds=_AI_ROUTE_STATUS_SELECT_TIMEOUT_SECONDS,
|
||||
)
|
||||
return {
|
||||
"schema_version": _AI_ROUTE_STATUS_SCHEMA_VERSION,
|
||||
"workload_type": workload,
|
||||
"policy_order": policy_order,
|
||||
"selected_provider": None,
|
||||
"selected_url": None,
|
||||
"selected_model": None,
|
||||
"fallback_chain": [],
|
||||
"route_reason": "route_check_timeout",
|
||||
"route_source": "ollama_failover_manager",
|
||||
"route_error": (
|
||||
f"route status timed out after "
|
||||
f"{_AI_ROUTE_STATUS_SELECT_TIMEOUT_SECONDS:g}s"
|
||||
),
|
||||
"health": {},
|
||||
"checked_at": checked_at,
|
||||
}
|
||||
except Exception as exc:
|
||||
return {
|
||||
"schema_version": _AI_ROUTE_STATUS_SCHEMA_VERSION,
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
from decimal import Decimal
|
||||
from types import SimpleNamespace
|
||||
@@ -6,6 +7,7 @@ from uuid import UUID
|
||||
import pytest
|
||||
from fastapi import HTTPException
|
||||
|
||||
import src.services.platform_operator_service as platform_operator_service
|
||||
from src.api.v1.platform.operator_runs import (
|
||||
AiRouteStatusResponse,
|
||||
ListCicdEventsResponse,
|
||||
@@ -1182,6 +1184,36 @@ def test_ai_route_status_response_preserves_route_fields() -> None:
|
||||
assert dumped["selected_provider"] == "ollama_gcp_a"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_ai_route_status_times_out_before_slow_provider_checks(monkeypatch) -> None:
    """Check that a slow provider selection yields a timeout payload.

    The failover manager is replaced with a stub whose ``select_provider``
    sleeps for longer than the (patched) selection timeout. The returned
    status must carry the ``route_check_timeout`` reason, a matching error
    message, no selected provider, and the full policy order.
    """

    class _StalledFailoverManager:
        async def select_provider(self, task_type: str = "general") -> None:
            # 0.05s sleep against the 0.001s timeout patched in below.
            await asyncio.sleep(0.05)

    # Both overrides target the service module: a tiny timeout plus a factory
    # that hands back the stalled manager.
    overrides = {
        "_AI_ROUTE_STATUS_SELECT_TIMEOUT_SECONDS": 0.001,
        "get_ollama_failover_manager": lambda: _StalledFailoverManager(),
    }
    for attr_name, replacement in overrides.items():
        monkeypatch.setattr(platform_operator_service, attr_name, replacement)

    status = await platform_operator_service.get_ai_route_status("deep_rca")

    assert status["route_reason"] == "route_check_timeout"
    assert status["route_error"] == "route status timed out after 0.001s"
    assert status["selected_provider"] is None

    provider_names = [entry["provider_name"] for entry in status["policy_order"]]
    expected_order = [
        "ollama_gcp_a",
        "ollama_gcp_b",
        "ollama_local",
        "gemini",
    ]
    assert provider_names == expected_order
|
||||
|
||||
|
||||
def test_ai_route_workload_validation_rejects_unknown_value() -> None:
|
||||
assert _validate_ai_route_workload(" hermes ") == "hermes"
|
||||
with pytest.raises(HTTPException) as exc_info:
|
||||
|
||||
Reference in New Issue
Block a user