feat(health): expose ollama provider chain
All checks were successful
CD Pipeline / tests (push) Successful in 6m8s
Code Review / ai-code-review (push) Successful in 10s
CD Pipeline / build-and-deploy (push) Successful in 4m38s
CD Pipeline / post-deploy-checks (push) Successful in 1m42s

This commit is contained in:
Your Name
2026-05-24 11:44:37 +08:00
parent 06dfdf7ead
commit 9bac5718da
6 changed files with 259 additions and 39 deletions

View File

@@ -11,7 +11,7 @@ Endpoints:
Components Checked:
- PostgreSQL (192.168.0.188:5432)
- Redis (192.168.0.188:6380)
- Ollama (settings.OLLAMA_URL / ADR-110 provider pool)
- Ollama ADR-110 provider pool (GCP-A -> GCP-B -> 111)
- OpenClaw (192.168.0.188:8089)
- SigNoz (192.168.0.188:3301)
"""
@@ -26,9 +26,11 @@ from pydantic import BaseModel
from src.core.config import settings
from src.core.logging import get_logger
from src.services.health_check_service import get_health_check_service
from src.services.ollama_endpoint_resolver import resolve_ollama_order
router = APIRouter()
logger = get_logger("awoooi.health")
CORE_COMPONENTS = ("api", "postgresql", "redis", "ollama", "openclaw", "signoz")
# =============================================================================
@@ -50,6 +52,7 @@ class HealthResponse(BaseModel):
mock_mode: bool
timestamp: datetime
components: dict[str, ComponentHealth]
ollama_route_order: list[str] = []
# =============================================================================
@@ -106,8 +109,82 @@ async def check_redis() -> ComponentHealth:
async def check_ollama() -> ComponentHealth:
"""Async Ollama health check via /api/tags"""
return await _http_health_check("ollama", settings.OLLAMA_URL, "/api/tags")
"""Async aggregate Ollama health check via ADR-110 provider chain."""
aggregate, _details = await check_ollama_provider_chain()
return aggregate
async def check_ollama_provider_chain() -> tuple[ComponentHealth, dict[str, ComponentHealth]]:
    """
    Probe every configured Ollama endpoint and summarise the route.

    Endpoints come from ``resolve_ollama_order("healthcheck")``; entries with
    no URL or named ``ollama_unconfigured`` are ignored. All remaining
    endpoints are probed concurrently on ``/api/tags``.

    Returns:
        A ``(aggregate, details)`` pair. ``details`` maps each provider name
        to its own probe result. ``aggregate`` is:

        - the first endpoint's own result when that endpoint is
          ``ollama_gcp_a`` and it is up;
        - ``degraded`` (carrying the fallback's latency) when some other
          endpoint in the order is the first one found up;
        - ``down`` when nothing is configured or nothing answered.
    """
    endpoints = []
    for candidate in resolve_ollama_order("healthcheck"):
        if not candidate.url or candidate.provider_name == "ollama_unconfigured":
            continue
        endpoints.append(candidate)

    if not endpoints:
        return (
            ComponentHealth(status="down", error="no Ollama endpoints configured"),
            {},
        )

    # One probe per endpoint, awaited together; gather keeps input order.
    probes = [
        _http_health_check(endpoint.provider_name, endpoint.url, "/api/tags")
        for endpoint in endpoints
    ]
    outcomes = await asyncio.gather(*probes)
    details = {
        endpoint.provider_name: outcomes[position]
        for position, endpoint in enumerate(endpoints)
    }

    head = endpoints[0]
    head_health = details[head.provider_name]
    if head_health.status == "up" and head.provider_name == "ollama_gcp_a":
        return head_health, details

    # Walk the route in priority order and keep the first endpoint that is up.
    reachable = [
        endpoint
        for endpoint in endpoints
        if details[endpoint.provider_name].status == "up"
    ]
    first_available = reachable[0] if reachable else None
    if first_available:
        active = details[first_available.provider_name]
        degraded = ComponentHealth(
            status="degraded",
            latency_ms=active.latency_ms,
            error=f"primary unavailable; fallback active: {first_available.provider_name}",
        )
        return degraded, details

    # Nothing answered: fold every endpoint's error (or bare status) into one message.
    failure_parts = [
        f"{provider}={health.error or health.status}"
        for provider, health in details.items()
    ]
    errors = ", ".join(failure_parts)
    unreachable = ComponentHealth(
        status="down",
        error=f"all Ollama endpoints unavailable: {errors}",
    )
    return unreachable, details
async def check_openclaw() -> ComponentHealth:
@@ -120,6 +197,30 @@ async def check_signoz() -> ComponentHealth:
return await _http_health_check("signoz", settings.SIGNOZ_URL, "/api/v1/health")
def _determine_overall_status(
    components: dict[str, ComponentHealth],
) -> Literal["healthy", "degraded", "unhealthy"]:
    """
    Reduce per-component health to a single overall status.

    Only the names listed in ``CORE_COMPONENTS`` are counted, so extra
    entries in ``components`` do not affect the result.

    Returns:
        ``"unhealthy"`` when postgresql or redis is down (a missing entry
        counts as down) or when three or more core components are down;
        ``"degraded"`` when any core component is down or degraded;
        ``"healthy"`` otherwise.
    """
    down_total = 0
    any_degraded = False
    for name in CORE_COMPONENTS:
        if name not in components:
            continue
        state = components[name].status
        if state == "down":
            down_total += 1
        elif state == "degraded":
            any_degraded = True

    # The datastores are critical: absent from the mapping is treated like down.
    critical_down = any(
        name not in components or components[name].status == "down"
        for name in ("postgresql", "redis")
    )

    if critical_down or down_total >= 3:
        return "unhealthy"
    if down_total or any_degraded:
        return "degraded"
    return "healthy"
# =============================================================================
# Endpoints
# =============================================================================
@@ -142,34 +243,28 @@ async def get_health() -> HealthResponse:
results = await asyncio.gather(
check_postgresql(),
check_redis(),
check_ollama(),
check_ollama_provider_chain(),
check_openclaw(),
check_signoz(),
)
ollama_aggregate, ollama_details = results[2]
components = {
"api": ComponentHealth(status="up", latency_ms=0.0),
"postgresql": results[0],
"redis": results[1],
"ollama": results[2],
"ollama": ollama_aggregate,
"openclaw": results[3],
"signoz": results[4],
}
components.update(ollama_details)
# Determine overall status
statuses = [c.status for c in components.values()]
down_count = statuses.count("down")
degraded_count = statuses.count("degraded")
# Critical services: postgresql, redis
critical_down = components["postgresql"].status == "down" or components["redis"].status == "down"
if critical_down or down_count >= 3:
overall_status: Literal["healthy", "degraded", "unhealthy"] = "unhealthy"
elif down_count >= 1 or degraded_count > 0:
overall_status = "degraded"
else:
overall_status = "healthy"
overall_status = _determine_overall_status(components)
ollama_route_order = [
selection.provider_name
for selection in resolve_ollama_order("healthcheck")
if selection.url and selection.provider_name != "ollama_unconfigured"
]
logger.info(
"health_check_complete",
@@ -185,6 +280,7 @@ async def get_health() -> HealthResponse:
mock_mode=settings.MOCK_MODE,
timestamp=datetime.now(UTC),
components=components,
ollama_route_order=ollama_route_order,
)

View File

@@ -0,0 +1,68 @@
from __future__ import annotations
import pytest
from src.api.v1 import health
def _set_ollama_settings(monkeypatch: pytest.MonkeyPatch) -> None:
    """Patch the three Ollama URL settings on ``health.settings`` with fake hosts."""
    fake_urls = {
        "OLLAMA_URL": "http://gcp-a:11434",
        "OLLAMA_SECONDARY_URL": "http://gcp-b:11434",
        "OLLAMA_FALLBACK_URL": "http://local-111:11434",
    }
    for setting_name, url in fake_urls.items():
        monkeypatch.setattr(health.settings, setting_name, url)
@pytest.mark.asyncio
async def test_ollama_provider_chain_reports_fallback_when_primary_down(monkeypatch: pytest.MonkeyPatch) -> None:
    """With GCP-A down and the others up, the aggregate is degraded on GCP-B."""
    _set_ollama_settings(monkeypatch)

    # Canned probe results per provider; anything not listed answers quickly.
    canned = {
        "ollama_gcp_a": {"status": "down", "error": "timeout"},
        "ollama_gcp_b": {"status": "up", "latency_ms": 42.0},
    }

    async def fake_http_check(name: str, _url: str, _path: str) -> health.ComponentHealth:
        fields = canned.get(name, {"status": "up", "latency_ms": 9.0})
        return health.ComponentHealth(**fields)

    monkeypatch.setattr(health, "_http_health_check", fake_http_check)

    aggregate, details = await health.check_ollama_provider_chain()

    assert aggregate.status == "degraded"
    assert aggregate.latency_ms == 42.0
    assert aggregate.error == "primary unavailable; fallback active: ollama_gcp_b"
    expected_states = (
        ("ollama_gcp_a", "down"),
        ("ollama_gcp_b", "up"),
        ("ollama_local", "up"),
    )
    for provider, expected in expected_states:
        assert details[provider].status == expected
@pytest.mark.asyncio
async def test_ollama_provider_chain_reports_all_endpoints_when_down(monkeypatch: pytest.MonkeyPatch) -> None:
    """When every probe fails, the aggregate is down and names each endpoint's error."""
    _set_ollama_settings(monkeypatch)

    async def fake_http_check(name: str, _url: str, _path: str) -> health.ComponentHealth:
        return health.ComponentHealth(status="down", error=f"{name} timeout")

    monkeypatch.setattr(health, "_http_health_check", fake_http_check)

    aggregate, details = await health.check_ollama_provider_chain()
    providers = ("ollama_gcp_a", "ollama_gcp_b", "ollama_local")

    assert aggregate.status == "down"
    message = aggregate.error or ""
    for provider in providers:
        assert f"{provider}={provider} timeout" in message
    assert set(details) == set(providers)
def test_overall_status_uses_aggregate_ollama_not_endpoint_details() -> None:
    """A down per-endpoint entry must not count beyond the aggregate ``ollama`` status."""
    states = {
        "api": "up",
        "postgresql": "up",
        "redis": "up",
        "ollama": "degraded",
        "openclaw": "up",
        "signoz": "up",
        # Per-endpoint detail rows that sit next to the aggregate entry.
        "ollama_gcp_a": "down",
        "ollama_gcp_b": "up",
        "ollama_local": "up",
    }
    components = {
        name: health.ComponentHealth(status=state)
        for name, state in states.items()
    }

    overall = health._determine_overall_status(components)

    assert overall == "degraded"

View File

@@ -210,6 +210,13 @@
"viewAllAuth": "View All Authorizations",
"viewAllReport": "View Full Report",
"aiModelStatus": "AI Model Status",
"aiModelRoles": {
"primary": "Primary",
"backup": "Backup",
"local": "Local",
"agent": "Agent",
"provider": "Provider"
},
"loading": "Loading...",
"trendUp": "↑{pct}%",
"searchPlaceholderShort": "Search...",

View File

@@ -211,6 +211,13 @@
"viewAllAuth": "查看全部授權",
"viewAllReport": "查看完整報表",
"aiModelStatus": "AI 模型狀態",
"aiModelRoles": {
"primary": "主用",
"backup": "備援",
"local": "本機",
"agent": "Agent",
"provider": "Provider"
},
"loading": "載入中...",
"trendUp": "↑{pct}%",
"searchPlaceholderShort": "搜尋...",

View File

@@ -1,7 +1,7 @@
'use client'
/**
* AIModelStatus — AI 模型狀態 2×2 網格
* AIModelStatus — AI provider route health grid
* Sprint 5R S9: 設計稿 L531-545
* @created 2026-04-09 Claude Opus 4.6 Asia/Taipei
*/
@@ -13,33 +13,66 @@ const API_BASE = process.env.NEXT_PUBLIC_API_URL ?? ''
interface ModelInfo {
name: string
tag: string
healthy: boolean
role: 'primary' | 'backup' | 'local' | 'agent' | 'provider'
status: 'up' | 'down' | 'degraded' | 'unknown'
latencyMs?: number | null
}
/** One entry of the `components` map in the /api/v1/health response, as read by this widget. */
interface HealthComponent {
// Absent status is rendered as 'unknown' by the caller.
status?: 'up' | 'down' | 'degraded'
// Shown as rounded milliseconds when it is a number; otherwise the role label is shown.
latency_ms?: number | null
}
/** The subset of the /api/v1/health payload that this widget consumes. */
interface HealthResponse {
// Keyed by component/provider name (e.g. 'ollama_gcp_a', 'openclaw').
components?: Record<string, HealthComponent>
// Provider keys in route order; when missing or empty the widget uses a built-in default order.
ollama_route_order?: string[]
}
// Display name for each health component key; keys without an entry are shown verbatim.
const PROVIDER_LABELS: Record<string, string> = {
ollama_gcp_a: 'GCP-A',
ollama_gcp_b: 'GCP-B',
// NOTE(review): '111' presumably names the local host at the end of the provider chain; confirm.
ollama_local: '111',
openclaw: 'OpenClaw',
}
// Role shown for each health component key; keys without an entry fall back to 'provider' at the call site.
// Each role value is used to build an `aiModelRoles.<role>` translation key.
const PROVIDER_ROLES: Record<string, ModelInfo['role']> = {
ollama_gcp_a: 'primary',
ollama_gcp_b: 'backup',
ollama_local: 'local',
openclaw: 'agent',
}
/**
 * Map a provider status to the colour of its indicator dot.
 * 'up' is green, 'degraded' amber, 'down' red; anything else is neutral grey.
 */
function statusColor(status: ModelInfo['status']) {
  switch (status) {
    case 'up':
      return '#22C55E'
    case 'degraded':
      return '#F59E0B'
    case 'down':
      return '#cc2200'
    default:
      return '#87867f'
  }
}
export function AIModelStatus() {
const t = useTranslations('dashboard')
const [models, setModels] = useState<ModelInfo[]>([
{ name: 'OpenClaw Nemo', tag: 'local', healthy: false },
{ name: 'Ollama gemma3', tag: 'local', healthy: false },
{ name: 'Gemini Pro', tag: 'cloud', healthy: false },
{ name: 'NVIDIA NIM', tag: 'cloud', healthy: false },
{ name: 'GCP-A', role: 'primary', status: 'unknown' },
{ name: 'GCP-B', role: 'backup', status: 'unknown' },
{ name: '111', role: 'local', status: 'unknown' },
{ name: 'OpenClaw', role: 'agent', status: 'unknown' },
])
useEffect(() => {
fetch(`${API_BASE}/api/v1/health`)
.then(r => r.ok ? r.json() : null)
.then(d => {
.then((d: HealthResponse | null) => {
if (!d?.components) return
setModels(prev => prev.map(m => {
if (m.name.includes('OpenClaw') && d.components.openclaw) return { ...m, healthy: d.components.openclaw.status === 'up' }
if (m.name.includes('Ollama') && d.components.ollama) return { ...m, healthy: d.components.ollama.status === 'up' }
// 2026-04-09 Claude Sonnet 4.6: 移除假數據 — /api/v1/health 無 gemini/nvidia component
// cloud 模型狀態未知,保持 false不顯示假綠燈
if (m.name.includes('Gemini') && d.components.gemini) return { ...m, healthy: d.components.gemini.status === 'up' }
if (m.name.includes('NVIDIA') && d.components.nvidia) return { ...m, healthy: d.components.nvidia.status === 'up' }
return m
}))
const routeOrder = d.ollama_route_order?.length
? d.ollama_route_order
: ['ollama_gcp_a', 'ollama_gcp_b', 'ollama_local']
const providerKeys = [...routeOrder, 'openclaw']
setModels(providerKeys.map(key => ({
name: PROVIDER_LABELS[key] ?? key,
role: PROVIDER_ROLES[key] ?? 'provider',
status: d.components?.[key]?.status ?? 'unknown',
latencyMs: d.components?.[key]?.latency_ms,
})))
})
.catch(() => {})
}, [])
@@ -62,9 +95,13 @@ export function AIModelStatus() {
border: '0.5px solid #e0ddd4', borderRadius: 6, padding: '6px 8px',
display: 'flex', alignItems: 'center', gap: 6,
}}>
<span style={{ width: 5, height: 5, borderRadius: '50%', background: m.healthy ? '#22C55E' : '#cc2200', flexShrink: 0 }} />
<span style={{ width: 5, height: 5, borderRadius: '50%', background: statusColor(m.status), flexShrink: 0 }} />
<span style={{ fontSize: 12, fontWeight: 500, color: '#141413' }}>{m.name}</span>
<span style={{ fontSize: 10, color: '#87867f', marginLeft: 'auto' }}>{m.tag}</span>
<span style={{ fontSize: 10, color: '#87867f', marginLeft: 'auto' }}>
{typeof m.latencyMs === 'number'
? `${Math.round(m.latencyMs)}ms`
: t(`aiModelRoles.${m.role}` as never)}
</span>
</div>
))}
</div>

View File

@@ -54,7 +54,12 @@ export const apiClient = {
status: 'healthy' | 'degraded' | 'unhealthy'
version: string
timestamp: string
components: Record<string, 'up' | 'down'>
components: Record<string, {
status: 'up' | 'down' | 'degraded'
latency_ms?: number | null
error?: string | null
}>
ollama_route_order?: string[]
}>(res)
},