diff --git a/apps/api/src/api/v1/monitoring.py b/apps/api/src/api/v1/monitoring.py
new file mode 100644
index 00000000..3ccfb75b
--- /dev/null
+++ b/apps/api/src/api/v1/monitoring.py
@@ -0,0 +1,253 @@
+"""
+Monitoring Status API
+=====================
+Probe the status of the monitoring tools: Grafana / Prometheus / SigNoz / Gitea.
+
+All probes are issued from the backend so internal IPs are never exposed to
+the frontend.
+
+Created: 2026-04-03 (Taipei time zone)
+Author: Claude Code
+"""
+
+import asyncio
+from datetime import UTC, datetime
+from typing import Any
+
+import httpx
+from fastapi import APIRouter
+
+from src.core.logging import get_logger
+
+logger = get_logger(__name__)
+
+router = APIRouter(prefix="/monitoring", tags=["Monitoring"])
+
+# =============================================================================
+# Internal service endpoints (backend-only)
+# =============================================================================
+
+# Single source of truth for every path the probes request; the probe
+# functions below build their URLs from this table instead of hard-coding them.
+SERVICES = {
+    "grafana": {
+        "base": "http://192.168.0.188:3000",
+        "health": "/api/health",
+        "build": "/api/frontend/settings",
+        "search": "/api/search?type=dash-db",
+    },
+    "prometheus": {
+        "base": "http://192.168.0.110:9090",
+        "health": "/-/healthy",
+        "build": "/api/v1/status/buildinfo",
+        "rules": "/api/v1/rules",
+    },
+    "signoz": {
+        "base": "http://192.168.0.188:3301",
+        "health": "/api/v1/health",
+    },
+    "gitea": {
+        "base": "http://192.168.0.110:3001",
+        "health": "/-/readiness",
+        "version": "/api/v1/version",
+    },
+}
+
+# Per-request timeout, in seconds.
+TIMEOUT = 3.0
+
+
+def _down(name: str, description: str, **extra: Any) -> dict:
+    """Build the payload reported for a tool whose health probe failed."""
+    return {
+        "name": name,
+        "status": "down",
+        "version": None,
+        "stats": None,
+        "description": description,
+        **extra,
+    }
+
+
+async def _get_json(client: httpx.AsyncClient, url: str, *, event: str) -> Any:
+    """Fetch optional JSON detail from ``url`` without ever raising.
+
+    Used only for enrichment (version, counters) requested after a tool has
+    already passed its health check, so a failure here must not flip that
+    tool to "down".
+
+    Args:
+        client: Shared HTTP client.
+        url: Absolute URL to GET.
+        event: Log event name emitted when the request or decoding fails.
+
+    Returns:
+        The decoded JSON body on HTTP 200, otherwise ``None``.
+    """
+    try:
+        resp = await client.get(url, timeout=TIMEOUT)
+        return resp.json() if resp.status_code == 200 else None
+    except Exception as e:  # best-effort boundary: the detail is optional
+        logger.warning(event, error=str(e))
+        return None
+
+
+async def _probe_grafana(client: httpx.AsyncClient) -> dict:
+    """Probe Grafana's health endpoint and, when up, count its dashboards."""
+    svc = SERVICES["grafana"]
+    base = svc["base"]
+    description = "監控面板 · 指標視覺化"
+    try:
+        r = await client.get(f"{base}{svc['health']}", timeout=TIMEOUT)
+        if r.status_code == 200:
+            version = r.json().get("version", "—")
+            # Dashboard count is optional: a failed search must not mark a
+            # healthy Grafana as down.
+            dashboards = await _get_json(
+                client, f"{base}{svc['search']}", event="grafana_search_failed"
+            )
+            dash_count = len(dashboards) if isinstance(dashboards, list) else None
+            return {
+                "name": "Grafana",
+                "status": "up",
+                "version": version,
+                "stats": f"面板 {dash_count} 個" if dash_count is not None else None,
+                "description": description,
+            }
+    except Exception as e:
+        logger.warning("grafana_probe_failed", error=str(e))
+    return _down("Grafana", description)
+
+
+async def _probe_prometheus(client: httpx.AsyncClient) -> dict:
+    """Probe Prometheus and, when healthy, report version and rule counts."""
+    svc = SERVICES["prometheus"]
+    base = svc["base"]
+    description = "時序資料庫 · 告警規則"
+    try:
+        health_r = await client.get(f"{base}{svc['health']}", timeout=TIMEOUT)
+        if health_r.status_code == 200:
+            build = await _get_json(
+                client, f"{base}{svc['build']}", event="prometheus_buildinfo_failed"
+            )
+            version = None
+            if isinstance(build, dict):
+                version = build.get("data", {}).get("version")
+
+            rules_payload = await _get_json(
+                client, f"{base}{svc['rules']}", event="prometheus_rules_failed"
+            )
+            rules_count = None
+            firing_count = 0
+            if isinstance(rules_payload, dict):
+                groups = rules_payload.get("data", {}).get("groups", [])
+                rules = [rule for g in groups for rule in g.get("rules", [])]
+                rules_count = len(rules)
+                firing_count = sum(1 for rule in rules if rule.get("state") == "firing")
+
+            stats_parts = []
+            if rules_count is not None:
+                stats_parts.append(f"規則 {rules_count} 條")
+            if firing_count > 0:
+                stats_parts.append(f"{firing_count} 觸發")
+            return {
+                "name": "Prometheus",
+                "status": "up",
+                "version": version,
+                "stats": " · ".join(stats_parts) if stats_parts else None,
+                "description": description,
+                "firing_count": firing_count,
+            }
+    except Exception as e:
+        logger.warning("prometheus_probe_failed", error=str(e))
+    return _down("Prometheus", description, firing_count=0)
+
+
+async def _probe_signoz(client: httpx.AsyncClient) -> dict:
+    """Probe SigNoz's health endpoint, falling back to its root page."""
+    svc = SERVICES["signoz"]
+    base = svc["base"]
+    description = "可觀測性平台"
+    up = {
+        "name": "SigNoz",
+        "status": "up",
+        "version": None,
+        "stats": "APM · 追蹤 · 日誌",
+        "description": description,
+    }
+    try:
+        r = await client.get(f"{base}{svc['health']}", timeout=TIMEOUT)
+        if r.status_code == 200:
+            return up
+    except Exception as e:
+        logger.warning("signoz_probe_failed", error=str(e))
+    # Fallback: the health endpoint is missing or failing, so try the root page.
+    try:
+        r2 = await client.get(f"{base}/", timeout=TIMEOUT)
+        if r2.status_code in (200, 301, 302):
+            return up
+    except Exception as e:
+        # Log instead of silently swallowing so a real outage is diagnosable.
+        logger.warning("signoz_root_probe_failed", error=str(e))
+    return _down("SigNoz", description)
+
+
+async def _probe_gitea(client: httpx.AsyncClient) -> dict:
+    """Probe Gitea's readiness endpoint and, when ready, fetch its version."""
+    svc = SERVICES["gitea"]
+    base = svc["base"]
+    description = "代碼倉庫 · Pipeline"
+    try:
+        r = await client.get(f"{base}{svc['health']}", timeout=TIMEOUT)
+        if r.status_code == 200:
+            ver = await _get_json(
+                client, f"{base}{svc['version']}", event="gitea_version_failed"
+            )
+            return {
+                "name": "Gitea",
+                "status": "up",
+                "version": ver.get("version") if isinstance(ver, dict) else None,
+                "stats": "CI/CD · Git 倉庫",
+                "description": description,
+            }
+    except Exception as e:
+        logger.warning("gitea_probe_failed", error=str(e))
+    return _down("Gitea", description)
+
+
+@router.get("/status")
+async def get_monitoring_status() -> dict:
+    """
+    Probe every monitoring tool concurrently.
+
+    Returns:
+        dict with a ``tools`` list (name/status/version/stats/description/
+        checked_at per tool) and a top-level ``checked_at`` timestamp.
+    """
+    probes = {
+        "grafana": _probe_grafana,
+        "prometheus": _probe_prometheus,
+        "signoz": _probe_signoz,
+        "gitea": _probe_gitea,
+    }
+    async with httpx.AsyncClient(follow_redirects=True) as client:
+        results = await asyncio.gather(
+            *(probe(client) for probe in probes.values()),
+            return_exceptions=True,
+        )
+
+    # One timestamp for the whole response so tools and envelope agree.
+    checked_at = datetime.now(UTC).isoformat()
+    tools = []
+    for key, result in zip(probes, results):
+        # gather(return_exceptions=True) can hand back BaseException
+        # instances (e.g. CancelledError), which are not Exception subclasses.
+        if isinstance(result, BaseException):
+            logger.warning("monitoring_probe_crashed", tool=key, error=repr(result))
+            continue
+        tools.append({**result, "checked_at": checked_at})
+
+    return {
+        "tools": tools,
+        "checked_at": checked_at,
+    }
diff --git a/apps/api/src/main.py b/apps/api/src/main.py
index 383dbd07..e4225961 100644
--- a/apps/api/src/main.py
+++ b/apps/api/src/main.py
@@ -57,6 +57,7 @@ from src.api.v1 import (
 from src.api.v1 import (
     signoz_webhook as signoz_webhook_v1,  # Phase 21: SignOz → Telegram (ADR-037)
 )
+from src.api.v1 import monitoring as monitoring_v1  # 2026-04-03: 監控工具狀態
 from src.api.v1 
import stats as stats_v1 # Phase 6.5: Statistics Analytics from src.api.v1 import telegram as telegram_v1 # Phase 5.4: Telegram Gateway from src.api.v1 import terminal as terminal_v1 # Phase 19.1: Omni-Terminal SSE @@ -402,6 +403,9 @@ app.include_router( app.include_router( stats_v1.router, prefix="/api/v1", tags=["Statistics"] ) # Phase 6.5: Statistics Analytics +app.include_router( + monitoring_v1.router, prefix="/api/v1", tags=["Monitoring"] +) # 2026-04-03: 監控工具狀態 app.include_router( github_webhook_v1.router, prefix="/api/v1", tags=["GitHub Webhook"] ) # Phase 13.1: GitHub → OpenClaw diff --git a/apps/web/messages/en.json b/apps/web/messages/en.json index 92967a27..f10012c3 100644 --- a/apps/web/messages/en.json +++ b/apps/web/messages/en.json @@ -143,7 +143,13 @@ "infrastructure": "INFRASTRUCTURE", "podHealth": "POD Health", "allRunning": "All Running", - "servicesUp": "Services Up" + "servicesUp": "Services Up", + "monitoringTools": "Monitoring Tools", + "monitoringStatus": { + "up": "OK", + "down": "Down", + "unknown": "Unknown" + } }, "openclaw": { "name": "OpenClaw", diff --git a/apps/web/messages/zh-TW.json b/apps/web/messages/zh-TW.json index 43df1577..b043f7fc 100644 --- a/apps/web/messages/zh-TW.json +++ b/apps/web/messages/zh-TW.json @@ -144,7 +144,13 @@ "infrastructure": "基礎架構", "podHealth": "POD 健康", "allRunning": "全部運行中", - "servicesUp": "服務上線" + "servicesUp": "服務上線", + "monitoringTools": "監控工具", + "monitoringStatus": { + "up": "正常", + "down": "離線", + "unknown": "未知" + } }, "openclaw": { "name": "OpenClaw", diff --git a/apps/web/src/app/[locale]/page.tsx b/apps/web/src/app/[locale]/page.tsx index e19c119c..c441aad1 100644 --- a/apps/web/src/app/[locale]/page.tsx +++ b/apps/web/src/app/[locale]/page.tsx @@ -8,10 +8,12 @@ * 統帥鐵律: 使用真實數據 Hook,禁止假數據! 
* * @updated 2026-04-02 Claude Code — Metrics Strip 7指標視覺強化 + * @updated 2026-04-03 Claude Code — 監控工具區塊 (Grafana/Prometheus/SigNoz/Gitea) * 串接: incidents(count/P0/MTTR/autoRemediation) + dashboard(serviceHealth/pendingApprovals/podHealth) */ import { useTranslations } from 'next-intl' +import { useState, useEffect } from 'react' import { useGlobalPulseMetrics } from '@/hooks/useGlobalPulseMetrics' import { useIncidents } from '@/hooks/useIncidents' import { useHosts, useDashboardStore } from '@/stores/dashboard.store' @@ -20,6 +22,8 @@ import { OpenClawPanel } from '@/components/ai/openclaw-panel' import { HostGrid, type HostInfo } from '@/components/infra/host-grid' import { AppLayout } from '@/components/layout' +const API_BASE = process.env.NEXT_PUBLIC_API_URL ?? '' + // ============================================================================= // Types // ============================================================================= @@ -55,6 +59,96 @@ function MiniSparkline({ values, color }: { values: number[]; color: string }) { ) } +// ============================================================================= +// Monitoring Tools Component +// ============================================================================= + +interface MonitoringTool { + name: string + status: string + version: string | null + stats: string | null + description: string + firing_count?: number + checked_at: string +} + +function MonitoringTools() { + const [tools, setTools] = useState([]) + const [loading, setLoading] = useState(true) + + useEffect(() => { + const load = () => { + fetch(`${API_BASE}/api/v1/monitoring/status`) + .then(r => r.json()) + .then(d => { setTools(d.tools ?? []); setLoading(false) }) + .catch(() => setLoading(false)) + } + load() + const t = setInterval(load, 60000) + return () => clearInterval(t) + }, []) + + const TOOL_ICONS: Record = { + Grafana: '📊', + Prometheus: '🔥', + SigNoz: '🔭', + Gitea: '🐙', + } + + if (loading) return ( +
+ 載入中... +
+ ) + + return ( +
+ {tools.map((tool, i) => { + const isUp = tool.status === 'up' + const hasFiring = (tool.firing_count ?? 0) > 0 + return ( +
+
{TOOL_ICONS[tool.name] ?? '⚙️'}
+
+
+ {tool.name} + + + {isUp ? (hasFiring ? `${tool.firing_count} 觸發` : '正常') : '離線'} + +
+
+ {tool.description} + {tool.version && · v{tool.version}} +
+ {tool.stats && ( +
{tool.stats}
+ )} +
+
+ {new Date(tool.checked_at).toLocaleTimeString('zh-TW', { timeZone: 'Asia/Taipei', hour: '2-digit', minute: '2-digit' })} +
+
+ ) + })} +
+ ) +} + // ============================================================================= // Main Page // ============================================================================= @@ -414,6 +508,29 @@ export default function Home({ params }: { params: { locale: string } }) { }))} /> + {/* 監控工具 */} +
+
+
+ {tDashboard('monitoringTools')} +
+ +
+