fix(api): bound ai route status checks
This commit is contained in:
@@ -8,6 +8,7 @@ ADR-106(AwoooP Agent Platform)
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
import re
|
import re
|
||||||
import uuid
|
import uuid
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
@@ -60,6 +61,7 @@ _MAX_EVENTS = 100
|
|||||||
_MAX_TIMELINE_ITEMS = 100
|
_MAX_TIMELINE_ITEMS = 100
|
||||||
_MAX_LIST_CONTEXT_ROWS = 500
|
_MAX_LIST_CONTEXT_ROWS = 500
|
||||||
_MAX_STEP_SUMMARY_CHARS = 128
|
_MAX_STEP_SUMMARY_CHARS = 128
|
||||||
|
_AI_ROUTE_STATUS_SELECT_TIMEOUT_SECONDS = 12.0
|
||||||
_REMEDIATION_HISTORY_LIMIT = 20
|
_REMEDIATION_HISTORY_LIMIT = 20
|
||||||
_INCIDENT_ID_RE = re.compile(r"\bINC-\d{8}-[A-Z0-9]{4,}\b")
|
_INCIDENT_ID_RE = re.compile(r"\bINC-\d{8}-[A-Z0-9]{4,}\b")
|
||||||
_REMEDIATION_STATUS_FILTERS = {
|
_REMEDIATION_STATUS_FILTERS = {
|
||||||
@@ -503,7 +505,33 @@ async def get_ai_route_status(
|
|||||||
checked_at = _utc_now_naive()
|
checked_at = _utc_now_naive()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
route = await get_ollama_failover_manager().select_provider(task_type=workload)
|
route = await asyncio.wait_for(
|
||||||
|
get_ollama_failover_manager().select_provider(task_type=workload),
|
||||||
|
timeout=_AI_ROUTE_STATUS_SELECT_TIMEOUT_SECONDS,
|
||||||
|
)
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
logger.warning(
|
||||||
|
"ai_route_status_check_timeout",
|
||||||
|
workload_type=workload,
|
||||||
|
timeout_seconds=_AI_ROUTE_STATUS_SELECT_TIMEOUT_SECONDS,
|
||||||
|
)
|
||||||
|
return {
|
||||||
|
"schema_version": _AI_ROUTE_STATUS_SCHEMA_VERSION,
|
||||||
|
"workload_type": workload,
|
||||||
|
"policy_order": policy_order,
|
||||||
|
"selected_provider": None,
|
||||||
|
"selected_url": None,
|
||||||
|
"selected_model": None,
|
||||||
|
"fallback_chain": [],
|
||||||
|
"route_reason": "route_check_timeout",
|
||||||
|
"route_source": "ollama_failover_manager",
|
||||||
|
"route_error": (
|
||||||
|
f"route status timed out after "
|
||||||
|
f"{_AI_ROUTE_STATUS_SELECT_TIMEOUT_SECONDS:g}s"
|
||||||
|
),
|
||||||
|
"health": {},
|
||||||
|
"checked_at": checked_at,
|
||||||
|
}
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
return {
|
return {
|
||||||
"schema_version": _AI_ROUTE_STATUS_SCHEMA_VERSION,
|
"schema_version": _AI_ROUTE_STATUS_SCHEMA_VERSION,
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import asyncio
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from decimal import Decimal
|
from decimal import Decimal
|
||||||
from types import SimpleNamespace
|
from types import SimpleNamespace
|
||||||
@@ -6,6 +7,7 @@ from uuid import UUID
|
|||||||
import pytest
|
import pytest
|
||||||
from fastapi import HTTPException
|
from fastapi import HTTPException
|
||||||
|
|
||||||
|
import src.services.platform_operator_service as platform_operator_service
|
||||||
from src.api.v1.platform.operator_runs import (
|
from src.api.v1.platform.operator_runs import (
|
||||||
AiRouteStatusResponse,
|
AiRouteStatusResponse,
|
||||||
ListCicdEventsResponse,
|
ListCicdEventsResponse,
|
||||||
@@ -1182,6 +1184,36 @@ def test_ai_route_status_response_preserves_route_fields() -> None:
|
|||||||
assert dumped["selected_provider"] == "ollama_gcp_a"
|
assert dumped["selected_provider"] == "ollama_gcp_a"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_ai_route_status_times_out_before_slow_provider_checks(monkeypatch) -> None:
    """Check that a stalled provider selection yields the timeout payload.

    The failover manager is replaced with a stub whose ``select_provider``
    sleeps for 0.05s while the module's select timeout is patched down to
    0.001s. The response is then expected to carry the
    ``route_check_timeout`` reason, the formatted timeout message, no
    selected provider, and the four-entry policy order asserted below.
    """

    class _StalledFailoverManager:
        """Stub manager whose selection sleeps longer than the patched timeout."""

        async def select_provider(self, task_type: str = "general") -> None:
            await asyncio.sleep(0.05)

    # Shrink the timeout well below the stub's sleep so the bounded
    # selection gives up first.
    monkeypatch.setattr(
        platform_operator_service, "_AI_ROUTE_STATUS_SELECT_TIMEOUT_SECONDS", 0.001
    )
    # The class itself serves as the zero-argument factory: calling it
    # returns a fresh stub instance.
    monkeypatch.setattr(
        platform_operator_service, "get_ollama_failover_manager", _StalledFailoverManager
    )

    response = await platform_operator_service.get_ai_route_status("deep_rca")

    assert response["route_reason"] == "route_check_timeout"
    assert response["route_error"] == "route status timed out after 0.001s"
    assert response["selected_provider"] is None

    expected_policy_names = ["ollama_gcp_a", "ollama_gcp_b", "ollama_local", "gemini"]
    actual_policy_names = [entry["provider_name"] for entry in response["policy_order"]]
    assert actual_policy_names == expected_policy_names
|
||||||
|
|
||||||
|
|
||||||
def test_ai_route_workload_validation_rejects_unknown_value() -> None:
|
def test_ai_route_workload_validation_rejects_unknown_value() -> None:
|
||||||
assert _validate_ai_route_workload(" hermes ") == "hermes"
|
assert _validate_ai_route_workload(" hermes ") == "hermes"
|
||||||
with pytest.raises(HTTPException) as exc_info:
|
with pytest.raises(HTTPException) as exc_info:
|
||||||
|
|||||||
Reference in New Issue
Block a user