feat(health): expose ollama provider chain
This commit is contained in:
@@ -11,7 +11,7 @@ Endpoints:
|
||||
Components Checked:
|
||||
- PostgreSQL (192.168.0.188:5432)
|
||||
- Redis (192.168.0.188:6380)
|
||||
- Ollama (settings.OLLAMA_URL / ADR-110 provider pool)
|
||||
- Ollama ADR-110 provider pool (GCP-A -> GCP-B -> 111)
|
||||
- OpenClaw (192.168.0.188:8089)
|
||||
- SigNoz (192.168.0.188:3301)
|
||||
"""
|
||||
@@ -26,9 +26,11 @@ from pydantic import BaseModel
|
||||
from src.core.config import settings
|
||||
from src.core.logging import get_logger
|
||||
from src.services.health_check_service import get_health_check_service
|
||||
from src.services.ollama_endpoint_resolver import resolve_ollama_order
|
||||
|
||||
router = APIRouter()
|
||||
logger = get_logger("awoooi.health")
|
||||
CORE_COMPONENTS = ("api", "postgresql", "redis", "ollama", "openclaw", "signoz")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
@@ -50,6 +52,7 @@ class HealthResponse(BaseModel):
|
||||
mock_mode: bool
|
||||
timestamp: datetime
|
||||
components: dict[str, ComponentHealth]
|
||||
ollama_route_order: list[str] = []
|
||||
|
||||
|
||||
# =============================================================================
|
||||
@@ -106,8 +109,82 @@ async def check_redis() -> ComponentHealth:
|
||||
|
||||
|
||||
async def check_ollama() -> ComponentHealth:
|
||||
"""Async Ollama health check via /api/tags"""
|
||||
return await _http_health_check("ollama", settings.OLLAMA_URL, "/api/tags")
|
||||
"""Async aggregate Ollama health check via ADR-110 provider chain."""
|
||||
aggregate, _details = await check_ollama_provider_chain()
|
||||
return aggregate
|
||||
|
||||
|
||||
async def check_ollama_provider_chain() -> tuple[ComponentHealth, dict[str, ComponentHealth]]:
    """
    Probe every configured Ollama endpoint and summarise route availability.

    Returns a pair ``(aggregate, details)``. ``details`` maps each probed
    provider name to its individual ``/api/tags`` result; ``aggregate`` is
    the single ``ollama`` component derived from those results:
    - up: the first provider in route order is ``ollama_gcp_a`` and it is up
    - degraded: otherwise, at least one endpoint is up (reported as fallback)
    - down: nothing is configured, or no endpoint is up
    """
    # Keep only entries that carry a URL and are not the "unconfigured" placeholder.
    candidates = [
        entry
        for entry in resolve_ollama_order("healthcheck")
        if entry.url and entry.provider_name != "ollama_unconfigured"
    ]
    if not candidates:
        return ComponentHealth(status="down", error="no Ollama endpoints configured"), {}

    # Probe all endpoints concurrently; gather returns results in input order.
    probes = [
        _http_health_check(entry.provider_name, entry.url, "/api/tags")
        for entry in candidates
    ]
    outcomes = await asyncio.gather(*probes)

    details = {}
    for entry, outcome in zip(candidates, outcomes, strict=False):
        details[entry.provider_name] = outcome

    # Happy path: the head of the route is GCP-A and it answered.
    head_name = candidates[0].provider_name
    head_health = details[head_name]
    if head_name == "ollama_gcp_a" and head_health.status == "up":
        return head_health, details

    # Otherwise report the first reachable endpoint (in route order) as the fallback.
    for entry in candidates:
        live = details[entry.provider_name]
        if live.status != "up":
            continue
        degraded = ComponentHealth(
            status="degraded",
            latency_ms=live.latency_ms,
            error=f"primary unavailable; fallback active: {entry.provider_name}",
        )
        return degraded, details

    # Nothing answered: fold every per-endpoint error (or status) into one message.
    summary = ", ".join(
        f"{name}={result.error or result.status}"
        for name, result in details.items()
    )
    unavailable = ComponentHealth(
        status="down",
        error=f"all Ollama endpoints unavailable: {summary}",
    )
    return unavailable, details
|
||||
|
||||
|
||||
async def check_openclaw() -> ComponentHealth:
|
||||
@@ -120,6 +197,30 @@ async def check_signoz() -> ComponentHealth:
|
||||
return await _http_health_check("signoz", settings.SIGNOZ_URL, "/api/v1/health")
|
||||
|
||||
|
||||
def _determine_overall_status(
    components: dict[str, ComponentHealth],
) -> Literal["healthy", "degraded", "unhealthy"]:
    """Derive the overall health verdict from the core components only.

    Entries whose key is not listed in ``CORE_COMPONENTS`` are ignored when
    counting down/degraded components.

    Returns:
        "unhealthy" when postgresql or redis is down (or absent), or when at
        least three core components are down; "degraded" when any core
        component is down or degraded; "healthy" otherwise.
    """
    core_statuses = [
        components[key].status for key in CORE_COMPONENTS if key in components
    ]
    down_total = sum(1 for status in core_statuses if status == "down")
    any_degraded = any(status == "degraded" for status in core_statuses)

    # Critical stores: a missing entry is treated the same as a down one.
    for critical in ("postgresql", "redis"):
        if critical not in components or components[critical].status == "down":
            return "unhealthy"

    if down_total >= 3:
        return "unhealthy"
    if down_total or any_degraded:
        return "degraded"
    return "healthy"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Endpoints
|
||||
# =============================================================================
|
||||
@@ -142,34 +243,28 @@ async def get_health() -> HealthResponse:
|
||||
results = await asyncio.gather(
|
||||
check_postgresql(),
|
||||
check_redis(),
|
||||
check_ollama(),
|
||||
check_ollama_provider_chain(),
|
||||
check_openclaw(),
|
||||
check_signoz(),
|
||||
)
|
||||
|
||||
ollama_aggregate, ollama_details = results[2]
|
||||
components = {
|
||||
"api": ComponentHealth(status="up", latency_ms=0.0),
|
||||
"postgresql": results[0],
|
||||
"redis": results[1],
|
||||
"ollama": results[2],
|
||||
"ollama": ollama_aggregate,
|
||||
"openclaw": results[3],
|
||||
"signoz": results[4],
|
||||
}
|
||||
components.update(ollama_details)
|
||||
|
||||
# Determine overall status
|
||||
statuses = [c.status for c in components.values()]
|
||||
down_count = statuses.count("down")
|
||||
degraded_count = statuses.count("degraded")
|
||||
|
||||
# Critical services: postgresql, redis
|
||||
critical_down = components["postgresql"].status == "down" or components["redis"].status == "down"
|
||||
|
||||
if critical_down or down_count >= 3:
|
||||
overall_status: Literal["healthy", "degraded", "unhealthy"] = "unhealthy"
|
||||
elif down_count >= 1 or degraded_count > 0:
|
||||
overall_status = "degraded"
|
||||
else:
|
||||
overall_status = "healthy"
|
||||
overall_status = _determine_overall_status(components)
|
||||
ollama_route_order = [
|
||||
selection.provider_name
|
||||
for selection in resolve_ollama_order("healthcheck")
|
||||
if selection.url and selection.provider_name != "ollama_unconfigured"
|
||||
]
|
||||
|
||||
logger.info(
|
||||
"health_check_complete",
|
||||
@@ -185,6 +280,7 @@ async def get_health() -> HealthResponse:
|
||||
mock_mode=settings.MOCK_MODE,
|
||||
timestamp=datetime.now(UTC),
|
||||
components=components,
|
||||
ollama_route_order=ollama_route_order,
|
||||
)
|
||||
|
||||
|
||||
|
||||
68
apps/api/tests/test_health_ollama_provider_chain.py
Normal file
68
apps/api/tests/test_health_ollama_provider_chain.py
Normal file
@@ -0,0 +1,68 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from src.api.v1 import health
|
||||
|
||||
|
||||
def _set_ollama_settings(monkeypatch: pytest.MonkeyPatch) -> None:
    """Patch the three Ollama URL settings on ``health.settings`` with fake hosts."""
    fake_urls = {
        "OLLAMA_URL": "http://gcp-a:11434",
        "OLLAMA_SECONDARY_URL": "http://gcp-b:11434",
        "OLLAMA_FALLBACK_URL": "http://local-111:11434",
    }
    for attr, url in fake_urls.items():
        monkeypatch.setattr(health.settings, attr, url)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_ollama_provider_chain_reports_fallback_when_primary_down(monkeypatch: pytest.MonkeyPatch) -> None:
    """With GCP-A down and GCP-B up, the aggregate is degraded and names GCP-B."""
    _set_ollama_settings(monkeypatch)

    # Canned probe results per provider; anything not listed answers "up" in 9 ms.
    canned = {
        "ollama_gcp_a": {"status": "down", "error": "timeout"},
        "ollama_gcp_b": {"status": "up", "latency_ms": 42.0},
    }
    default_reply = {"status": "up", "latency_ms": 9.0}

    async def fake_http_check(name: str, _url: str, _path: str) -> health.ComponentHealth:
        return health.ComponentHealth(**canned.get(name, default_reply))

    monkeypatch.setattr(health, "_http_health_check", fake_http_check)

    aggregate, details = await health.check_ollama_provider_chain()

    assert aggregate.status == "degraded"
    assert aggregate.latency_ms == 42.0
    assert aggregate.error == "primary unavailable; fallback active: ollama_gcp_b"

    expected_statuses = {
        "ollama_gcp_a": "down",
        "ollama_gcp_b": "up",
        "ollama_local": "up",
    }
    for provider, expected in expected_statuses.items():
        assert details[provider].status == expected
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_ollama_provider_chain_reports_all_endpoints_when_down(monkeypatch: pytest.MonkeyPatch) -> None:
    """When every probe fails, the aggregate is down and lists each endpoint's error."""
    _set_ollama_settings(monkeypatch)

    async def fake_http_check(name: str, _url: str, _path: str) -> health.ComponentHealth:
        # Every endpoint fails with an error that embeds its own provider name.
        return health.ComponentHealth(status="down", error=f"{name} timeout")

    monkeypatch.setattr(health, "_http_health_check", fake_http_check)

    aggregate, details = await health.check_ollama_provider_chain()

    providers = ("ollama_gcp_a", "ollama_gcp_b", "ollama_local")
    message = aggregate.error or ""

    assert aggregate.status == "down"
    for provider in providers:
        assert f"{provider}={provider} timeout" in message
    assert set(details) == set(providers)
|
||||
|
||||
|
||||
def test_overall_status_uses_aggregate_ollama_not_endpoint_details() -> None:
    """Per-endpoint entries must not affect the verdict; only the aggregate counts."""
    status_by_name = {
        "api": "up",
        "postgresql": "up",
        "redis": "up",
        "ollama": "degraded",
        "openclaw": "up",
        "signoz": "up",
        # Endpoint-level detail rows: present in the payload, outside the core set.
        "ollama_gcp_a": "down",
        "ollama_gcp_b": "up",
        "ollama_local": "up",
    }
    components = {
        name: health.ComponentHealth(status=status)
        for name, status in status_by_name.items()
    }

    verdict = health._determine_overall_status(components)

    assert verdict == "degraded"
|
||||
@@ -210,6 +210,13 @@
|
||||
"viewAllAuth": "View All Authorizations",
|
||||
"viewAllReport": "View Full Report",
|
||||
"aiModelStatus": "AI Model Status",
|
||||
"aiModelRoles": {
|
||||
"primary": "Primary",
|
||||
"backup": "Backup",
|
||||
"local": "Local",
|
||||
"agent": "Agent",
|
||||
"provider": "Provider"
|
||||
},
|
||||
"loading": "Loading...",
|
||||
"trendUp": "↑{pct}%",
|
||||
"searchPlaceholderShort": "Search...",
|
||||
|
||||
@@ -211,6 +211,13 @@
|
||||
"viewAllAuth": "查看全部授權",
|
||||
"viewAllReport": "查看完整報表",
|
||||
"aiModelStatus": "AI 模型狀態",
|
||||
"aiModelRoles": {
|
||||
"primary": "主用",
|
||||
"backup": "備援",
|
||||
"local": "本機",
|
||||
"agent": "Agent",
|
||||
"provider": "Provider"
|
||||
},
|
||||
"loading": "載入中...",
|
||||
"trendUp": "↑{pct}%",
|
||||
"searchPlaceholderShort": "搜尋...",
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'use client'
|
||||
|
||||
/**
|
||||
* AIModelStatus — AI 模型狀態 2×2 網格
|
||||
* AIModelStatus — AI provider route health grid
|
||||
* Sprint 5R S9: 設計稿 L531-545
|
||||
* @created 2026-04-09 Claude Opus 4.6 Asia/Taipei
|
||||
*/
|
||||
@@ -13,33 +13,66 @@ const API_BASE = process.env.NEXT_PUBLIC_API_URL ?? ''
|
||||
|
||||
interface ModelInfo {
|
||||
name: string
|
||||
tag: string
|
||||
healthy: boolean
|
||||
role: 'primary' | 'backup' | 'local' | 'agent' | 'provider'
|
||||
status: 'up' | 'down' | 'degraded' | 'unknown'
|
||||
latencyMs?: number | null
|
||||
}
|
||||
|
||||
interface HealthComponent {
|
||||
status?: 'up' | 'down' | 'degraded'
|
||||
latency_ms?: number | null
|
||||
}
|
||||
|
||||
interface HealthResponse {
|
||||
components?: Record<string, HealthComponent>
|
||||
ollama_route_order?: string[]
|
||||
}
|
||||
|
||||
const PROVIDER_LABELS: Record<string, string> = {
|
||||
ollama_gcp_a: 'GCP-A',
|
||||
ollama_gcp_b: 'GCP-B',
|
||||
ollama_local: '111',
|
||||
openclaw: 'OpenClaw',
|
||||
}
|
||||
|
||||
const PROVIDER_ROLES: Record<string, ModelInfo['role']> = {
|
||||
ollama_gcp_a: 'primary',
|
||||
ollama_gcp_b: 'backup',
|
||||
ollama_local: 'local',
|
||||
openclaw: 'agent',
|
||||
}
|
||||
|
||||
/**
 * Map a provider status to the colour of its indicator dot.
 * Any status other than 'up', 'degraded' or 'down' falls back to grey.
 */
function statusColor(status: ModelInfo['status']) {
  switch (status) {
    case 'up':
      return '#22C55E'
    case 'degraded':
      return '#F59E0B'
    case 'down':
      return '#cc2200'
    default:
      return '#87867f'
  }
}
|
||||
|
||||
export function AIModelStatus() {
|
||||
const t = useTranslations('dashboard')
|
||||
const [models, setModels] = useState<ModelInfo[]>([
|
||||
{ name: 'OpenClaw Nemo', tag: 'local', healthy: false },
|
||||
{ name: 'Ollama gemma3', tag: 'local', healthy: false },
|
||||
{ name: 'Gemini Pro', tag: 'cloud', healthy: false },
|
||||
{ name: 'NVIDIA NIM', tag: 'cloud', healthy: false },
|
||||
{ name: 'GCP-A', role: 'primary', status: 'unknown' },
|
||||
{ name: 'GCP-B', role: 'backup', status: 'unknown' },
|
||||
{ name: '111', role: 'local', status: 'unknown' },
|
||||
{ name: 'OpenClaw', role: 'agent', status: 'unknown' },
|
||||
])
|
||||
|
||||
useEffect(() => {
|
||||
fetch(`${API_BASE}/api/v1/health`)
|
||||
.then(r => r.ok ? r.json() : null)
|
||||
.then(d => {
|
||||
.then((d: HealthResponse | null) => {
|
||||
if (!d?.components) return
|
||||
setModels(prev => prev.map(m => {
|
||||
if (m.name.includes('OpenClaw') && d.components.openclaw) return { ...m, healthy: d.components.openclaw.status === 'up' }
|
||||
if (m.name.includes('Ollama') && d.components.ollama) return { ...m, healthy: d.components.ollama.status === 'up' }
|
||||
// 2026-04-09 Claude Sonnet 4.6: 移除假數據 — /api/v1/health 無 gemini/nvidia component
|
||||
// cloud 模型狀態未知,保持 false,不顯示假綠燈
|
||||
if (m.name.includes('Gemini') && d.components.gemini) return { ...m, healthy: d.components.gemini.status === 'up' }
|
||||
if (m.name.includes('NVIDIA') && d.components.nvidia) return { ...m, healthy: d.components.nvidia.status === 'up' }
|
||||
return m
|
||||
}))
|
||||
const routeOrder = d.ollama_route_order?.length
|
||||
? d.ollama_route_order
|
||||
: ['ollama_gcp_a', 'ollama_gcp_b', 'ollama_local']
|
||||
const providerKeys = [...routeOrder, 'openclaw']
|
||||
setModels(providerKeys.map(key => ({
|
||||
name: PROVIDER_LABELS[key] ?? key,
|
||||
role: PROVIDER_ROLES[key] ?? 'provider',
|
||||
status: d.components?.[key]?.status ?? 'unknown',
|
||||
latencyMs: d.components?.[key]?.latency_ms,
|
||||
})))
|
||||
})
|
||||
.catch(() => {})
|
||||
}, [])
|
||||
@@ -62,9 +95,13 @@ export function AIModelStatus() {
|
||||
border: '0.5px solid #e0ddd4', borderRadius: 6, padding: '6px 8px',
|
||||
display: 'flex', alignItems: 'center', gap: 6,
|
||||
}}>
|
||||
<span style={{ width: 5, height: 5, borderRadius: '50%', background: m.healthy ? '#22C55E' : '#cc2200', flexShrink: 0 }} />
|
||||
<span style={{ width: 5, height: 5, borderRadius: '50%', background: statusColor(m.status), flexShrink: 0 }} />
|
||||
<span style={{ fontSize: 12, fontWeight: 500, color: '#141413' }}>{m.name}</span>
|
||||
<span style={{ fontSize: 10, color: '#87867f', marginLeft: 'auto' }}>{m.tag}</span>
|
||||
<span style={{ fontSize: 10, color: '#87867f', marginLeft: 'auto' }}>
|
||||
{typeof m.latencyMs === 'number'
|
||||
? `${Math.round(m.latencyMs)}ms`
|
||||
: t(`aiModelRoles.${m.role}` as never)}
|
||||
</span>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
|
||||
@@ -54,7 +54,12 @@ export const apiClient = {
|
||||
status: 'healthy' | 'degraded' | 'unhealthy'
|
||||
version: string
|
||||
timestamp: string
|
||||
components: Record<string, 'up' | 'down'>
|
||||
components: Record<string, {
|
||||
status: 'up' | 'down' | 'degraded'
|
||||
latency_ms?: number | null
|
||||
error?: string | null
|
||||
}>
|
||||
ollama_route_order?: string[]
|
||||
}>(res)
|
||||
},
|
||||
|
||||
|
||||
Reference in New Issue
Block a user