diff --git a/apps/api/src/services/platform_operator_service.py b/apps/api/src/services/platform_operator_service.py index cdf4490e..ada65e3e 100644 --- a/apps/api/src/services/platform_operator_service.py +++ b/apps/api/src/services/platform_operator_service.py @@ -8,6 +8,7 @@ ADR-106(AwoooP Agent Platform) from __future__ import annotations +import asyncio import re import uuid from collections import defaultdict @@ -60,6 +61,7 @@ _MAX_EVENTS = 100 _MAX_TIMELINE_ITEMS = 100 _MAX_LIST_CONTEXT_ROWS = 500 _MAX_STEP_SUMMARY_CHARS = 128 +_AI_ROUTE_STATUS_SELECT_TIMEOUT_SECONDS = 12.0 _REMEDIATION_HISTORY_LIMIT = 20 _INCIDENT_ID_RE = re.compile(r"\bINC-\d{8}-[A-Z0-9]{4,}\b") _REMEDIATION_STATUS_FILTERS = { @@ -503,7 +505,33 @@ async def get_ai_route_status( checked_at = _utc_now_naive() try: - route = await get_ollama_failover_manager().select_provider(task_type=workload) + route = await asyncio.wait_for( + get_ollama_failover_manager().select_provider(task_type=workload), + timeout=_AI_ROUTE_STATUS_SELECT_TIMEOUT_SECONDS, + ) + except asyncio.TimeoutError: + logger.warning( + "ai_route_status_check_timeout", + workload_type=workload, + timeout_seconds=_AI_ROUTE_STATUS_SELECT_TIMEOUT_SECONDS, + ) + return { + "schema_version": _AI_ROUTE_STATUS_SCHEMA_VERSION, + "workload_type": workload, + "policy_order": policy_order, + "selected_provider": None, + "selected_url": None, + "selected_model": None, + "fallback_chain": [], + "route_reason": "route_check_timeout", + "route_source": "ollama_failover_manager", + "route_error": ( + f"route status timed out after " + f"{_AI_ROUTE_STATUS_SELECT_TIMEOUT_SECONDS:g}s" + ), + "health": {}, + "checked_at": checked_at, + } except Exception as exc: return { "schema_version": _AI_ROUTE_STATUS_SCHEMA_VERSION, diff --git a/apps/api/tests/test_awooop_operator_timeline_labels.py b/apps/api/tests/test_awooop_operator_timeline_labels.py index 477022c4..6a1cca69 100644 --- a/apps/api/tests/test_awooop_operator_timeline_labels.py +++ b/apps/api/tests/test_awooop_operator_timeline_labels.py @@ -1,3 +1,4 @@ +import asyncio from datetime import datetime from decimal import Decimal from types import SimpleNamespace @@ -6,6 +7,7 @@ from uuid import UUID import pytest from fastapi import HTTPException +import src.services.platform_operator_service as platform_operator_service from src.api.v1.platform.operator_runs import ( AiRouteStatusResponse, ListCicdEventsResponse, @@ -1182,6 +1184,36 @@ def test_ai_route_status_response_preserves_route_fields() -> None: assert dumped["selected_provider"] == "ollama_gcp_a" +@pytest.mark.asyncio +async def test_ai_route_status_times_out_before_slow_provider_checks(monkeypatch) -> None: + class SlowFailoverManager: + async def select_provider(self, task_type: str = "general") -> None: + await asyncio.sleep(0.05) + + monkeypatch.setattr( + platform_operator_service, + "_AI_ROUTE_STATUS_SELECT_TIMEOUT_SECONDS", + 0.001, + ) + monkeypatch.setattr( + platform_operator_service, + "get_ollama_failover_manager", + lambda: SlowFailoverManager(), + ) + + response = await platform_operator_service.get_ai_route_status("deep_rca") + + assert response["route_reason"] == "route_check_timeout" + assert response["route_error"] == "route status timed out after 0.001s" + assert response["selected_provider"] is None + assert [item["provider_name"] for item in response["policy_order"]] == [ + "ollama_gcp_a", + "ollama_gcp_b", + "ollama_local", + "gemini", + ] + + def test_ai_route_workload_validation_rejects_unknown_value() -> None: assert _validate_ai_route_workload(" hermes ") == "hermes" with pytest.raises(HTTPException) as exc_info: