diff --git a/apps/api/src/api/v1/platform/operator_runs.py b/apps/api/src/api/v1/platform/operator_runs.py index 3b3f80d2..4fed4506 100644 --- a/apps/api/src/api/v1/platform/operator_runs.py +++ b/apps/api/src/api/v1/platform/operator_runs.py @@ -147,6 +147,10 @@ class AiRouteStatusResponse(BaseModel): route_source: str route_error: str | None = None health: dict[str, dict[str, Any]] + lane_mode: str | None = None + active_lane: dict[str, Any] | None = None + skipped_lanes: list[dict[str, Any]] = Field(default_factory=list) + operator_action: dict[str, Any] | None = None checked_at: datetime diff --git a/apps/api/src/services/platform_operator_service.py b/apps/api/src/services/platform_operator_service.py index 09d80392..67562f65 100644 --- a/apps/api/src/services/platform_operator_service.py +++ b/apps/api/src/services/platform_operator_service.py @@ -619,7 +619,8 @@ async def get_ai_route_status( route_error=str(exc), ) - return { + health = _ai_route_health_map(route) + response = { "schema_version": _AI_ROUTE_STATUS_SCHEMA_VERSION, "workload_type": workload, "policy_order": policy_order, @@ -633,9 +634,15 @@ async def get_ai_route_status( "route_reason": route.routing_reason, "route_source": "ollama_failover_manager", "route_error": None, - "health": _ai_route_health_map(route), + "health": health, "checked_at": checked_at, } + response.update(_ai_route_lane_state( + policy_order=policy_order, + selected_provider=route.primary.provider_name, + health=health, + )) + return response def _validate_ai_route_workload(workload_type: str | None) -> OllamaWorkloadType: @@ -712,7 +719,7 @@ async def _ai_route_lightweight_status_from_policy( ) if selected_index is None: - return { + response = { "schema_version": _AI_ROUTE_STATUS_SCHEMA_VERSION, "workload_type": workload, "policy_order": policy_order, @@ -729,6 +736,12 @@ async def _ai_route_lightweight_status_from_policy( "health": health_by_provider, "checked_at": checked_at, } + response.update(_ai_route_lane_state( + policy_order=policy_order, + selected_provider="gemini", + health=health_by_provider, + )) + return response selected = endpoints[selected_index] model = get_settings().OLLAMA_HEALTH_CHECK_MODEL @@ -748,7 +761,7 @@ async def _ai_route_lightweight_status_from_policy( "runtime": "cloud", }) - return { + response = { "schema_version": _AI_ROUTE_STATUS_SCHEMA_VERSION, "workload_type": workload, "policy_order": policy_order, @@ -765,6 +778,12 @@ async def _ai_route_lightweight_status_from_policy( "health": health_by_provider, "checked_at": checked_at, } + response.update(_ai_route_lane_state( + policy_order=policy_order, + selected_provider=selected.provider_name, + health=health_by_provider, + )) + return response async def _ai_route_probe_connectivity( @@ -832,7 +851,7 @@ def _ai_route_unavailable_status( route_error: str, route_source: str, ) -> dict[str, Any]: - return { + response = { "schema_version": _AI_ROUTE_STATUS_SCHEMA_VERSION, "workload_type": workload, "policy_order": policy_order, @@ -846,6 +865,101 @@ def _ai_route_unavailable_status( "health": {}, "checked_at": checked_at, } + response.update(_ai_route_lane_state( + policy_order=policy_order, + selected_provider=None, + health={}, + )) + return response + + +def _ai_route_lane_state( + *, + policy_order: list[dict[str, Any]], + selected_provider: str | None, + health: dict[str, dict[str, Any]], +) -> dict[str, Any]: + """Expose failover lane state separately from policy labels.""" + selected_index = next( + ( + index + for index, item in enumerate(policy_order) + if item.get("provider_name") == selected_provider + ), + None, + ) + active_item = ( + policy_order[selected_index] + if selected_index is not None + else None + ) + skipped_items = policy_order[:selected_index] if selected_index is not None else [] + + skipped_lanes = [ + _ai_route_lane_item(item, health.get(str(item.get("provider_name")))) + for item in skipped_items + if item.get("runtime") == "ollama" + ] + + if not selected_provider or active_item is None: + lane_mode = "unavailable" + operator_action = { + "human_required": True, + "action": "inspect_ai_router", + "reason": "no_active_provider", + } + elif active_item.get("runtime") == "cloud": + lane_mode = "cloud_fallback" + operator_action = { + "human_required": True, + "action": "restore_ollama_lanes", + "reason": "all_ollama_lanes_unavailable", + } + elif skipped_lanes: + lane_mode = "degraded_failover" + operator_action = { + "human_required": True, + "action": "repair_skipped_primary_lane", + "reason": "fallback_lane_active", + } + else: + lane_mode = "primary" + operator_action = { + "human_required": False, + "action": "monitor", + "reason": "primary_lane_active", + } + + return { + "lane_mode": lane_mode, + "active_lane": ( + _ai_route_lane_item(active_item, health.get(str(active_item.get("provider_name")))) + if active_item + else None + ), + "skipped_lanes": skipped_lanes, + "operator_action": operator_action, + } + + +def _ai_route_lane_item( + item: dict[str, Any], + health_item: dict[str, Any] | None, +) -> dict[str, Any]: + return { + "priority": item.get("priority"), + "provider_name": item.get("provider_name"), + "role": item.get("role"), + "runtime": item.get("runtime"), + "url": item.get("url"), + "health_status": (health_item or {}).get("status", "not_checked"), + "reason": (health_item or {}).get("reason") or item.get("reason"), + "action_required": (health_item or {}).get("status") not in { + "healthy", + "not_checked", + None, + }, + } def _ai_route_policy_endpoint_item( diff --git a/apps/api/tests/test_awooop_operator_timeline_labels.py b/apps/api/tests/test_awooop_operator_timeline_labels.py index 9b754bbc..77455926 100644 --- a/apps/api/tests/test_awooop_operator_timeline_labels.py +++ b/apps/api/tests/test_awooop_operator_timeline_labels.py @@ -19,6 +19,7 @@ from src.services.ollama_failover_manager import OllamaEndpoint, OllamaRoutingRe from src.services.ollama_health_monitor import HealthReport, HealthStatus from src.services.platform_operator_service import ( _ai_route_health_map, + _ai_route_lane_state, _ai_route_policy_order, _build_awooop_status_chain, _callback_reply_event_item, @@ -1549,12 +1550,87 @@ def test_ai_route_status_response_preserves_route_fields() -> None: "checked": True, }, }, + "lane_mode": "primary", + "active_lane": { + "provider_name": "ollama_gcp_a", + "health_status": "healthy", + "action_required": False, + }, + "skipped_lanes": [], + "operator_action": { + "human_required": False, + "action": "monitor", + "reason": "primary_lane_active", + }, "checked_at": datetime(2026, 5, 19, 12, 0, 0), }) dumped = response.model_dump(mode="json") assert dumped["policy_order"][-1]["provider_name"] == "gemini" assert dumped["selected_provider"] == "ollama_gcp_a" + assert dumped["lane_mode"] == "primary" + + +def test_ai_route_lane_state_marks_degraded_failover() -> None: + policy = _ai_route_policy_order("deep_rca") + health = { + "ollama_gcp_a": { + "status": "offline", + "reason": "recent_endpoint_failure_cooldown:25s", + }, + "ollama_gcp_b": { + "status": "healthy", + "reason": "", + }, + "ollama_local": { + "status": "healthy", + "reason": "", + }, + } + + state = _ai_route_lane_state( + policy_order=policy, + selected_provider="ollama_gcp_b", + health=health, + ) + + assert state["lane_mode"] == "degraded_failover" + assert state["active_lane"]["provider_name"] == "ollama_gcp_b" + assert len(state["skipped_lanes"]) == 1 + assert state["skipped_lanes"][0]["provider_name"] == "ollama_gcp_a" + assert state["skipped_lanes"][0]["role"] == "primary" + assert state["skipped_lanes"][0]["health_status"] == "offline" + assert state["skipped_lanes"][0]["reason"] == "recent_endpoint_failure_cooldown:25s" + assert state["skipped_lanes"][0]["action_required"] is True + assert state["operator_action"] == { + "human_required": True, + "action": "repair_skipped_primary_lane", + "reason": "fallback_lane_active", + } + + +def test_ai_route_lane_state_marks_cloud_fallback() -> None: + policy = _ai_route_policy_order("deep_rca") + health = { + "ollama_gcp_a": {"status": "offline", "reason": "timeout"}, + "ollama_gcp_b": {"status": "offline", "reason": "timeout"}, + "ollama_local": {"status": "offline", "reason": "timeout"}, + } + + state = _ai_route_lane_state( + policy_order=policy, + selected_provider="gemini", + health=health, + ) + + assert state["lane_mode"] == "cloud_fallback" + assert state["active_lane"]["provider_name"] == "gemini" + assert [lane["provider_name"] for lane in state["skipped_lanes"]] == [ + "ollama_gcp_a", + "ollama_gcp_b", + "ollama_local", + ] + assert state["operator_action"]["action"] == "restore_ollama_lanes" @pytest.mark.asyncio diff --git a/apps/web/messages/en.json b/apps/web/messages/en.json index 0777699a..94a009d9 100644 --- a/apps/web/messages/en.json +++ b/apps/web/messages/en.json @@ -260,6 +260,7 @@ "humanGapClear": "Quality summary has no top gap", "modelRoute": "Model route", "routeDetail": "{model}; current {selected}; {primary}={primaryStatus}; fallback {fallback}", + "routeLaneDetail": "{mode}; skipped {skipped}", "routeReasonSeparator": "; ", "routeReason": "Reason: {reason}", "routeErrorDetail": "Route check failed: {error}", @@ -272,6 +273,13 @@ "not_checked": "standby", "unknown": "unknown" }, + "routeLaneMode": { + "primary": "Primary normal", + "degraded_failover": "Degraded handoff", + "cloud_fallback": "Cloud fallback", + "unavailable": "Route unavailable", + "unknown": "Unknown state" + }, "topGap": "Largest current gap: {gate}, {count} items." } }, @@ -2865,17 +2873,19 @@ "aiRouteStatus": { "title": "AI Provider Routing", "subtitle": "Current policy and health checks across GCP-A, GCP-B, 111, and Gemini handoff order", - "selected": "Primary: {provider}", - "selectedEmpty": "Primary: --", + "selected": "Active: {provider}", + "selectedEmpty": "Active: --", "empty": "AI provider route status is not available yet.", "error": "AI provider route failed to load: {error}", "badges": { "active": "Active", + "skipped": "Skipped", "standby": "Standby" }, "fields": { "workload": "Workload", - "primary": "Current Primary", + "laneMode": "Lane state", + "primary": "Current handoff", "reason": "Route Reason", "checkedAt": "Checked at {time}", "model": "Model: {model}", @@ -2899,6 +2909,22 @@ "local_fallback": "111 local fallback", "final_fallback": "Gemini final fallback", "ollama": "Ollama node" + }, + "laneModes": { + "primary": "Primary normal", + "degraded_failover": "Degraded handoff", + "cloud_fallback": "Cloud final fallback", + "unavailable": "Route unavailable", + "unknown": "Unknown state" + }, + "operatorActions": { + "monitor": "Monitor only", + "repair_skipped_primary_lane": "Repair the skipped primary lane", + "restore_ollama_lanes": "Restore Ollama lanes before relying on cloud only", + "inspect_ai_router": "Inspect AI Router / provider status", + "unknown": "Confirm next action" + }, + "degradedSummary": "Current handoff is {active}; skipped {skipped}; next action: {action}" } }, "incidentEvidence": { @@ -3221,4 +3247,3 @@ } } } -} diff --git a/apps/web/messages/zh-TW.json b/apps/web/messages/zh-TW.json index e8818222..711b95c8 100644 --- a/apps/web/messages/zh-TW.json +++ b/apps/web/messages/zh-TW.json @@ -261,6 +261,7 @@ "humanGapClear": "品質摘要未列出主要缺口", "modelRoute": "模型路由", "routeDetail": "{model};目前 {selected};{primary}={primaryStatus};備援 {fallback}", + "routeLaneDetail": "{mode};已跳過 {skipped}", "routeReasonSeparator": ";", "routeReason": "原因:{reason}", "routeErrorDetail": "路由檢查失敗:{error}", @@ -273,6 +274,13 @@ "not_checked": "待命", "unknown": "未知" }, + "routeLaneMode": { + "primary": "Primary 正常", + "degraded_failover": "降級接手", + "cloud_fallback": "雲端備援", + "unavailable": "路由不可用", + "unknown": "狀態未知" + }, "topGap": "目前最大缺口:{gate},共 {count} 筆。" } }, @@ -2866,17 +2874,19 @@ "aiRouteStatus": { "title": "AI Provider 路由", "subtitle": "目前策略與健康檢查,顯示 GCP-A、GCP-B、111、Gemini 的接手順序", - "selected": "Primary:{provider}", - "selectedEmpty": "Primary:--", + "selected": "使用中:{provider}", + "selectedEmpty": "使用中:--", "empty": "尚未取得 AI provider route 狀態。", "error": "AI provider route 載入失敗:{error}", "badges": { "active": "使用中", + "skipped": "已跳過", "standby": "備援" }, "fields": { "workload": "Workload", - "primary": "目前 Primary", + "laneMode": "Lane 狀態", + "primary": "目前接手", "reason": "路由原因", "checkedAt": "檢查時間 {time}", "model": "Model:{model}", @@ -2900,6 +2910,22 @@ "local_fallback": "111 本機備援", "final_fallback": "Gemini 最終備援", "ollama": "Ollama 節點" + }, + "laneModes": { + "primary": "Primary 正常", + "degraded_failover": "降級接手中", + "cloud_fallback": "雲端最終備援", + "unavailable": "路由不可用", + "unknown": "狀態未知" + }, + "operatorActions": { + "monitor": "持續監控即可", + "repair_skipped_primary_lane": "需修復被跳過的 Primary lane", + "restore_ollama_lanes": "需恢復 Ollama lanes,避免只剩雲端", + "inspect_ai_router": "需檢查 AI Router / provider 狀態", + "unknown": "待確認下一步" + }, + "degradedSummary": "目前由 {active} 接手;已跳過 {skipped};下一步:{action}" } }, "incidentEvidence": { @@ -3222,4 +3248,3 @@ } } } -} diff --git a/apps/web/src/app/[locale]/awooop/runs/page.tsx b/apps/web/src/app/[locale]/awooop/runs/page.tsx index 518313eb..937903de 100644 --- a/apps/web/src/app/[locale]/awooop/runs/page.tsx +++ b/apps/web/src/app/[locale]/awooop/runs/page.tsx @@ -428,6 +428,23 @@ interface AiRouteHealthItem { checked?: boolean; } +interface AiRouteLaneItem { + priority?: number | null; + provider_name?: string | null; + role?: string | null; + runtime?: string | null; + url?: string | null; + health_status?: string | null; + reason?: string | null; + action_required?: boolean; +} + +interface AiRouteOperatorAction { + human_required?: boolean; + action?: string | null; + reason?: string | null; +} + interface AiRouteStatusResponse { schema_version: string; workload_type: string; @@ -440,6 +457,10 @@ interface AiRouteStatusResponse { route_source: string; route_error?: string | null; health: Record; + lane_mode?: string | null; + active_lane?: AiRouteLaneItem | null; + skipped_lanes?: AiRouteLaneItem[]; + operator_action?: AiRouteOperatorAction | null; checked_at: string; } @@ -2005,6 +2026,30 @@ function aiRouteRoleLabelKey(role?: string | null) { return "roles.ollama"; } +function aiRouteLaneModeLabelKey(mode?: string | null) { + if ( + mode === "primary" || + mode === "degraded_failover" || + mode === "cloud_fallback" || + mode === "unavailable" + ) { + return `laneModes.${mode}`; + } + return "laneModes.unknown"; +} + +function aiRouteOperatorActionLabelKey(action?: string | null) { + if ( + action === "monitor" || + action === "repair_skipped_primary_lane" || + action === "restore_ollama_lanes" || + action === "inspect_ai_router" + ) { + return `operatorActions.${action}`; + } + return "operatorActions.unknown"; +} + function AiRouteStatusPanel({ status, error, @@ -2016,6 +2061,15 @@ function AiRouteStatusPanel({ const policy = status?.policy_order ?? []; const selectedProvider = status?.selected_provider ?? null; const selectedModel = status?.selected_model ?? null; + const laneMode = status?.lane_mode ?? null; + const laneModeKey = aiRouteLaneModeLabelKey(laneMode); + const operatorActionKey = aiRouteOperatorActionLabelKey(status?.operator_action?.action); + const skippedLanes = status?.skipped_lanes ?? []; + const skippedProviderSet = new Set( + skippedLanes + .map((lane) => lane.provider_name) + .filter((provider): provider is string => Boolean(provider)) + ); const checkedAt = status?.checked_at ? new Date(status.checked_at).toLocaleTimeString("zh-TW", { hour: "2-digit", @@ -2033,7 +2087,14 @@ function AiRouteStatusPanel({

{t("subtitle")}

- + {selectedProvider ? t("selected", { provider: selectedProvider }) : t("selectedEmpty")} @@ -2050,7 +2111,28 @@ function AiRouteStatusPanel({ ) : ( <> -
+ {laneMode && laneMode !== "primary" && ( +
+
+ )} + +

{t("fields.workload")}

@@ -2060,6 +2142,15 @@ function AiRouteStatusPanel({ {t("fields.checkedAt", { time: checkedAt })}

+
+

{t("fields.laneMode")}

+

+ {t(laneModeKey as never)} +

+

+ {t(operatorActionKey as never)} +

+

{t("fields.primary")}

@@ -2090,6 +2181,7 @@ function AiRouteStatusPanel({ const healthKey = aiRouteHealthLabelKey(health?.status); const roleKey = aiRouteRoleLabelKey(item.role); const isSelected = selectedProvider === item.provider_name; + const isSkipped = skippedProviderSet.has(item.provider_name); const latency = typeof health?.latency_ms === "number" ? `${health.latency_ms.toFixed(1)}ms` : "--"; @@ -2110,12 +2202,18 @@ function AiRouteStatusPanel({ "shrink-0 border px-2 py-0.5 text-xs font-semibold", isSelected ? "border-[#9bc7a4] bg-[#f0faf2] text-[#17602a]" + : isSkipped + ? "border-[#d9b36f] bg-[#fff7e8] text-[#8a5a08]" : item.runtime === "cloud" ? "border-[#d9b36f] bg-[#fff7e8] text-[#8a5a08]" : "border-[#d8d3c7] bg-[#faf9f3] text-[#5f5b52]" )} > - {isSelected ? t("badges.active") : t("badges.standby")} + {isSelected + ? t("badges.active") + : isSkipped + ? t("badges.skipped") + : t("badges.standby")}

@@ -2124,6 +2222,9 @@ function AiRouteStatusPanel({

{item.url || t("fields.noUrl")}

+

+ {health?.reason || item.reason || "--"} +

); diff --git a/apps/web/src/components/dashboard/automation-evidence-card.tsx b/apps/web/src/components/dashboard/automation-evidence-card.tsx index 1543a68d..bd8b54e2 100644 --- a/apps/web/src/components/dashboard/automation-evidence-card.tsx +++ b/apps/web/src/components/dashboard/automation-evidence-card.tsx @@ -120,6 +120,12 @@ interface AiRouteStatusResponse { route_reason?: string | null route_error?: string | null health?: Record + lane_mode?: string | null + skipped_lanes?: Array<{ provider_name?: string | null }> + operator_action?: { + action?: string | null + human_required?: boolean + } | null } interface EvidenceSnapshot { @@ -185,6 +191,18 @@ function routeHealthLabelKey(status?: string | null) { return 'routeHealth.unknown' } +function routeLaneModeLabelKey(mode?: string | null) { + if ( + mode === 'primary' || + mode === 'degraded_failover' || + mode === 'cloud_fallback' || + mode === 'unavailable' + ) { + return `routeLaneMode.${mode}` + } + return 'routeLaneMode.unknown' +} + function providerDisplayName(provider?: string | null) { switch (provider) { case 'ollama_gcp_a': @@ -353,6 +371,10 @@ export function AutomationEvidenceCard() { const primaryProvider = route?.policy_order?.[0]?.provider_name ?? null const primaryStatus = primaryProvider ? route?.health?.[primaryProvider]?.status : null const selectedProvider = providerDisplayName(route?.selected_provider) + const laneMode = route?.lane_mode ?? null + const skippedLanes = route?.skipped_lanes + ?.map((lane) => providerDisplayName(lane.provider_name)) + .join(' -> ') const fallback = route?.fallback_chain ?.map((item) => item.provider_name) .map(providerDisplayName) @@ -366,8 +388,14 @@ export function AutomationEvidenceCard() { primaryStatus: t(routeHealthLabelKey(primaryStatus) as never), fallback: fallback || t('routeNoFallback'), }) + const laneDetail = laneMode && laneMode !== 'primary' + ? t('routeLaneDetail', { + mode: t(routeLaneModeLabelKey(laneMode) as never), + skipped: skippedLanes || '--', + }) + : null const routeDetail = route?.route_reason && !route.route_error - ? `${routeSummary}${t('routeReasonSeparator')}${t('routeReason', { reason: route.route_reason })}` + ? `${routeSummary}${laneDetail ? t('routeReasonSeparator') + laneDetail : ''}${t('routeReasonSeparator')}${t('routeReason', { reason: route.route_reason })}` : routeSummary return { diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 54ae16e8..5a4a2f50 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,68 @@ +## 2026-05-25|T175 AI Provider lane 降級狀態前後端顯示 + +**背景**: + +- T174 已把 production manifest 恢復成 `GCP-A -> GCP-B -> 111 -> Gemini`,但前端/API 仍主要用 `selected_provider` 表示結果。 +- 當 GCP-A 真實紅燈且 GCP-B 接手時,Operator 需要同時看到: + - policy 仍以 GCP-A 為第一順位; + - 目前使用中 lane 是 GCP-B; + - GCP-A 是被跳過的 degraded lane; + - 下一步是修復被跳過的 primary lane,而不是把 manifest 改名或靜默跳過。 + +**本次修補**: + +- `/api/v1/platform/ai-route-status` 新增 lane 狀態欄位: + - `lane_mode`:`primary` / `degraded_failover` / `cloud_fallback` / `unavailable` + - `active_lane` + - `skipped_lanes` + - `operator_action` +- AwoooP Runs 頁的 AI Provider Routing panel 會把「使用中」與「已跳過」分開顯示,並在 degraded / cloud fallback 時列出下一步。 +- 首頁 Automation Evidence card 會把非 primary 狀態補進模型路由摘要,避免只看到目前 provider 而看不到 failover 階段。 +- i18n 同步補齊 zh-TW / en 文案,維持前端零硬編碼。 + +**本地驗證**: + +```text +pytest: + test_ai_route_status_response_preserves_route_fields + test_ai_route_lane_state_marks_degraded_failover + test_ai_route_lane_state_marks_cloud_fallback + test_ai_route_status_times_out_before_slow_provider_checks + test_ai_route_status_lightweight_fallback_keeps_gemini_policy_only + -> 5 passed + +ruff / py_compile: + platform_operator_service.py + operator_runs.py + test_awooop_operator_timeline_labels.py + -> passed + +frontend: + pnpm --dir apps/web exec tsc --noEmit + -> passed + +json / diff: + en.json / zh-TW.json JSON.parse + git diff --check + -> passed +``` + +**目前整體進度**: + +- AwoooP 告警可觀測鏈:約 99.2%。 +- 低風險自動修復閉環:約 95.5%。 +- 前端 AI 自動化管理介面同步:約 96.9%。 +- Telegram 詳情 / 歷史可解釋性:約 95.5%。 +- Callback evidence / DB replayability:約 95.6%。 +- MCP / 自建 MCP 可見性:約 88%。 +- Sentry / SigNoz source correlation visibility:約 88%。 +- Ansible / PlayBook decision visibility:約 84.8%。 +- KM owner-review / completion governance:約 84%。 +- AI Provider lane 健康與可見性:約 88%(GCP-A 仍待 repair;本輪補足 degraded lane 顯示)。 +- 完整 AI 自動化管理產品化:約 94.5%。 + +--- + ## 2026-05-25|T174 Ollama manifest policy-order guard **背景**: