feat(monitoring): 監控工具區塊 — Grafana/Prometheus/SigNoz/Gitea 狀態

- 新增 GET /api/v1/monitoring/status,asyncio.gather 並行探測四工具
- 前端 MonitoringTools 元件,60s 輪詢顯示狀態/版本/統計
- 新增 monitoringTools i18n key

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-03 00:27:47 +08:00
parent 30b7b10f01
commit ce11fcdc3a
5 changed files with 309 additions and 2 deletions

View File

@@ -0,0 +1,174 @@
"""
Monitoring Status API
=====================
探測各監控工具狀態：Grafana / Prometheus / SigNoz / Gitea
所有探測從後端發出,不暴露內網 IP 給前端。
建立時間: 2026-04-03 (台北時區)
建立者: Claude Code
"""
import asyncio
from datetime import UTC, datetime
import httpx
from fastapi import APIRouter
from src.core.logging import get_logger
logger = get_logger(__name__)
router = APIRouter(prefix="/monitoring", tags=["Monitoring"])
# =============================================================================
# Internal service endpoints (backend-only)
# =============================================================================
# Per-tool base URL plus the relative paths of the endpoints it exposes.
# Insertion order is: grafana, prometheus, signoz, gitea.
SERVICES: dict[str, dict[str, str]] = {
    # Grafana: health check and frontend settings endpoint.
    "grafana": {
        "base": "http://192.168.0.188:3000",
        "health": "/api/health",
        "build": "/api/frontend/settings",
    },
    # Prometheus: health check, build info and rules listing.
    "prometheus": {
        "base": "http://192.168.0.110:9090",
        "health": "/-/healthy",
        "build": "/api/v1/status/buildinfo",
        "rules": "/api/v1/rules",
    },
    # SigNoz: health check only.
    "signoz": {
        "base": "http://192.168.0.188:3301",
        "health": "/api/v1/health",
    },
    # Gitea: readiness check only.
    "gitea": {
        "base": "http://192.168.0.110:3001",
        "health": "/-/readiness",
    },
}

# Value passed as ``timeout=`` to every probe request.
TIMEOUT: float = 3.0
async def _probe_grafana(client: httpx.AsyncClient) -> dict:
    """Probe Grafana and summarise its state for the dashboard.

    The health endpoint alone decides "up" versus "down". The dashboard count
    is an optional extra: if that second request fails or is rejected, Grafana
    is still reported as up, only without ``stats``.

    Args:
        client: Shared async HTTP client used for every request.

    Returns:
        A dict with ``name``, ``status`` ("up" or "down"), ``version``,
        ``stats`` and ``description``.
    """
    svc = SERVICES["grafana"]
    base = svc["base"]
    description = "監控面板 · 指標視覺化"
    down = {
        "name": "Grafana",
        "status": "down",
        "version": None,
        "stats": None,
        "description": description,
    }

    # Step 1: the health check. Any failure here (network error, non-200,
    # unparsable body) means the tool is reported as down.
    try:
        r = await client.get(f"{base}{svc['health']}", timeout=TIMEOUT)
        if r.status_code != 200:
            return down
        version = r.json().get("version", "")
    except Exception as e:  # a probe must never raise into the endpoint
        logger.warning("grafana_probe_failed", error=str(e))
        return down

    # Step 2: best-effort dashboard count. A failure here must not turn a
    # healthy Grafana into "down", so it has its own try block.
    dash_count = None
    try:
        dash_r = await client.get(f"{base}/api/search?type=dash-db", timeout=TIMEOUT)
        if dash_r.status_code == 200:
            dash_count = len(dash_r.json())
    except Exception as e:
        logger.warning("grafana_dashboard_count_failed", error=str(e))

    return {
        "name": "Grafana",
        "status": "up",
        "version": version,
        "stats": f"面板 {dash_count}" if dash_count is not None else None,
        "description": description,
    }
async def _probe_prometheus(client: httpx.AsyncClient) -> dict:
    """Probe Prometheus and summarise its state for the dashboard.

    The health endpoint alone decides "up" versus "down". Build info and rule
    statistics are optional extras fetched afterwards; if either lookup fails,
    Prometheus is still reported as up with the corresponding field left empty.

    Args:
        client: Shared async HTTP client used for every request.

    Returns:
        A dict with ``name``, ``status`` ("up" or "down"), ``version``,
        ``stats``, ``description`` and ``firing_count`` (number of rules whose
        ``state`` is ``"firing"``; 0 when unknown).
    """
    svc = SERVICES["prometheus"]
    base = svc["base"]
    description = "時序資料庫 · 告警規則"
    down = {
        "name": "Prometheus",
        "status": "down",
        "version": None,
        "stats": None,
        "description": description,
        "firing_count": 0,
    }

    # Step 1: the health check decides up/down.
    try:
        health_r = await client.get(f"{base}{svc['health']}", timeout=TIMEOUT)
        if health_r.status_code != 200:
            return down
    except Exception as e:  # a probe must never raise into the endpoint
        logger.warning("prometheus_probe_failed", error=str(e))
        return down

    # Step 2: best-effort version lookup.
    version = None
    try:
        build_r = await client.get(f"{base}{svc['build']}", timeout=TIMEOUT)
        if build_r.status_code == 200:
            # ``or {}`` guards against an explicit null ``data`` field.
            version = (build_r.json().get("data") or {}).get("version")
    except Exception as e:
        logger.warning("prometheus_buildinfo_failed", error=str(e))

    # Step 3: best-effort rule statistics.
    rules_count = None
    firing_count = 0
    try:
        rules_r = await client.get(f"{base}{svc['rules']}", timeout=TIMEOUT)
        if rules_r.status_code == 200:
            groups = (rules_r.json().get("data") or {}).get("groups") or []
            rules = [rule for g in groups for rule in g.get("rules", [])]
            rules_count = len(rules)
            firing_count = sum(1 for rule in rules if rule.get("state") == "firing")
    except Exception as e:
        logger.warning("prometheus_rules_failed", error=str(e))

    stats_parts = []
    if rules_count is not None:
        stats_parts.append(f"規則 {rules_count}")
    if firing_count > 0:
        stats_parts.append(f"{firing_count} 觸發")

    return {
        "name": "Prometheus",
        "status": "up",
        "version": version,
        "stats": " · ".join(stats_parts) if stats_parts else None,
        "description": description,
        "firing_count": firing_count,
    }
async def _probe_signoz(client: httpx.AsyncClient) -> dict:
    """Probe SigNoz and summarise its state for the dashboard.

    The health endpoint is tried first. If it does not answer 200 (or the
    request fails), the root page is tried as a fallback; a 200/301/302 there
    also counts as up. Failures of either attempt are logged.

    Args:
        client: Shared async HTTP client used for every request.

    Returns:
        A dict with ``name``, ``status`` ("up" or "down"), ``version``
        (always ``None``), ``stats`` and ``description``.
    """
    svc = SERVICES["signoz"]
    base = svc["base"]
    description = "可觀測性平台"
    up = {
        "name": "SigNoz",
        "status": "up",
        "version": None,
        "stats": "APM · 追蹤 · 日誌",
        "description": description,
    }

    try:
        r = await client.get(f"{base}{svc['health']}", timeout=TIMEOUT)
        if r.status_code == 200:
            return up
    except Exception as e:  # a probe must never raise into the endpoint
        logger.warning("signoz_probe_failed", error=str(e))

    # Fallback: try root
    try:
        r2 = await client.get(f"{base}/", timeout=TIMEOUT)
        if r2.status_code in (200, 301, 302):
            return up
    except Exception as e:
        # Logged rather than swallowed so a "down" result can be diagnosed.
        logger.warning("signoz_fallback_probe_failed", error=str(e))

    return {
        "name": "SigNoz",
        "status": "down",
        "version": None,
        "stats": None,
        "description": description,
    }
async def _probe_gitea(client: httpx.AsyncClient) -> dict:
    """Probe Gitea and summarise its state for the dashboard.

    The readiness endpoint alone decides "up" versus "down". The version
    lookup is an optional extra: if it fails, Gitea is still reported as up
    with ``version`` set to ``None``.

    Args:
        client: Shared async HTTP client used for every request.

    Returns:
        A dict with ``name``, ``status`` ("up" or "down"), ``version``,
        ``stats`` and ``description``.
    """
    svc = SERVICES["gitea"]
    base = svc["base"]
    description = "代碼倉庫 · Pipeline"
    down = {
        "name": "Gitea",
        "status": "down",
        "version": None,
        "stats": None,
        "description": description,
    }

    # Step 1: the readiness check decides up/down.
    try:
        r = await client.get(f"{base}{svc['health']}", timeout=TIMEOUT)
        if r.status_code != 200:
            return down
    except Exception as e:  # a probe must never raise into the endpoint
        logger.warning("gitea_probe_failed", error=str(e))
        return down

    # Step 2: best-effort version lookup via the API.
    version = None
    try:
        ver_r = await client.get(f"{base}/api/v1/version", timeout=TIMEOUT)
        if ver_r.status_code == 200:
            version = ver_r.json().get("version")
    except Exception as e:
        logger.warning("gitea_version_failed", error=str(e))

    return {
        "name": "Gitea",
        "status": "up",
        "version": version,
        "stats": "CI/CD · Git 倉庫",
        "description": description,
    }
@router.get("/status")
async def get_monitoring_status() -> dict:
    """Probe all monitoring tools concurrently and return their status.

    Returns:
        A dict with ``tools`` (a list of per-tool dicts, each holding
        name/status/version/stats/description plus ``checked_at``) and a
        top-level ``checked_at``. All timestamps are the same UTC ISO-8601
        string, taken once after the probes finish.
    """
    async with httpx.AsyncClient(follow_redirects=True) as client:
        results = await asyncio.gather(
            _probe_grafana(client),
            _probe_prometheus(client),
            _probe_signoz(client),
            _probe_gitea(client),
            return_exceptions=True,
        )

    # One timestamp for the whole response keeps every entry consistent.
    checked_at = datetime.now(UTC).isoformat()

    tools = []
    for r in results:
        # With return_exceptions=True a result may be any BaseException
        # (e.g. CancelledError), not only an Exception subclass; unpacking
        # one with ** would raise TypeError.
        if isinstance(r, BaseException):
            logger.warning("monitoring_probe_crashed", error=repr(r))
            continue
        tools.append({**r, "checked_at": checked_at})

    return {
        "tools": tools,
        "checked_at": checked_at,
    }

View File

@@ -57,6 +57,7 @@ from src.api.v1 import (
from src.api.v1 import (
signoz_webhook as signoz_webhook_v1, # Phase 21: SignOz → Telegram (ADR-037)
)
from src.api.v1 import monitoring as monitoring_v1 # 2026-04-03: 監控工具狀態
from src.api.v1 import stats as stats_v1 # Phase 6.5: Statistics Analytics
from src.api.v1 import telegram as telegram_v1 # Phase 5.4: Telegram Gateway
from src.api.v1 import terminal as terminal_v1 # Phase 19.1: Omni-Terminal SSE
@@ -402,6 +403,9 @@ app.include_router(
app.include_router(
stats_v1.router, prefix="/api/v1", tags=["Statistics"]
) # Phase 6.5: Statistics Analytics
app.include_router(
monitoring_v1.router, prefix="/api/v1", tags=["Monitoring"]
) # 2026-04-03: 監控工具狀態
app.include_router(
github_webhook_v1.router, prefix="/api/v1", tags=["GitHub Webhook"]
) # Phase 13.1: GitHub → OpenClaw

View File

@@ -143,7 +143,13 @@
"infrastructure": "INFRASTRUCTURE",
"podHealth": "POD Health",
"allRunning": "All Running",
"servicesUp": "Services Up"
"servicesUp": "Services Up",
"monitoringTools": "Monitoring Tools",
"monitoringStatus": {
"up": "OK",
"down": "Down",
"unknown": "Unknown"
}
},
"openclaw": {
"name": "OpenClaw",

View File

@@ -144,7 +144,13 @@
"infrastructure": "基礎架構",
"podHealth": "POD 健康",
"allRunning": "全部運行中",
"servicesUp": "服務上線"
"servicesUp": "服務上線",
"monitoringTools": "監控工具",
"monitoringStatus": {
"up": "正常",
"down": "離線",
"unknown": "未知"
}
},
"openclaw": {
"name": "OpenClaw",

View File

@@ -8,10 +8,12 @@
* 統帥鐵律: 使用真實數據 Hook，禁止假數據
*
* @updated 2026-04-02 Claude Code — Metrics Strip 7指標視覺強化
* @updated 2026-04-03 Claude Code — 監控工具區塊 (Grafana/Prometheus/SigNoz/Gitea)
* 串接: incidents(count/P0/MTTR/autoRemediation) + dashboard(serviceHealth/pendingApprovals/podHealth)
*/
import { useTranslations } from 'next-intl'
import { useState, useEffect } from 'react'
import { useGlobalPulseMetrics } from '@/hooks/useGlobalPulseMetrics'
import { useIncidents } from '@/hooks/useIncidents'
import { useHosts, useDashboardStore } from '@/stores/dashboard.store'
@@ -20,6 +22,8 @@ import { OpenClawPanel } from '@/components/ai/openclaw-panel'
import { HostGrid, type HostInfo } from '@/components/infra/host-grid'
import { AppLayout } from '@/components/layout'
const API_BASE = process.env.NEXT_PUBLIC_API_URL ?? ''
// =============================================================================
// Types
// =============================================================================
@@ -55,6 +59,96 @@ function MiniSparkline({ values, color }: { values: number[]; color: string }) {
)
}
// =============================================================================
// Monitoring Tools Component
// =============================================================================
/** One entry of the `tools` array returned by GET /api/v1/monitoring/status. */
interface MonitoringTool {
// Display name; also used as the React list key and for the icon lookup.
name: string
// The component treats 'up' as healthy and any other value as offline.
status: string
// Rendered as "v<version>" after the description when truthy.
version: string | null
// Optional summary line rendered under the description when truthy.
stats: string | null
// Short description shown under the tool name.
description: string
// When greater than 0 the badge turns amber and shows this count; treated as 0 if absent.
firing_count?: number
// Timestamp string parsed with `new Date(...)` and displayed in Asia/Taipei time.
checked_at: string
}
/**
 * Monitoring tools panel.
 *
 * Fetches `/api/v1/monitoring/status` on mount and then every 60 seconds, and
 * renders one row per tool with a status badge, description, optional version,
 * optional stats line and the time of the last check.
 *
 * On unmount the polling timer is cleared and any in-flight request is
 * aborted, so a late response can never update state of an unmounted
 * component. A non-OK HTTP response is treated as a failed poll and leaves the
 * previously loaded tool list untouched.
 */
function MonitoringTools() {
  const [tools, setTools] = useState<MonitoringTool[]>([])
  const [loading, setLoading] = useState(true)

  useEffect(() => {
    // One controller for the lifetime of this effect; aborting it on cleanup
    // cancels whatever request is still pending.
    const controller = new AbortController()
    const load = () => {
      fetch(`${API_BASE}/api/v1/monitoring/status`, { signal: controller.signal })
        .then(r => {
          // Do not parse error pages as data.
          if (!r.ok) throw new Error(`monitoring status HTTP ${r.status}`)
          return r.json()
        })
        .then(d => {
          if (controller.signal.aborted) return
          setTools(d.tools ?? [])
          setLoading(false)
        })
        .catch(() => {
          // An abort means the component is gone: nothing left to update.
          if (!controller.signal.aborted) setLoading(false)
        })
    }
    load()
    const t = setInterval(load, 60000)
    return () => {
      clearInterval(t)
      controller.abort()
    }
  }, [])

  const TOOL_ICONS: Record<string, string> = {
    Grafana: '📊',
    Prometheus: '🔥',
    SigNoz: '🔭',
    Gitea: '🐙',
  }

  if (loading) return (
    <div style={{ padding: '12px 14px', fontSize: 12, color: '#87867f', fontFamily: 'var(--font-body), monospace' }}>
      ...
    </div>
  )

  return (
    <div>
      {tools.map((tool, i) => {
        const isUp = tool.status === 'up'
        // Only Prometheus reports firing_count; a missing value counts as 0.
        const hasFiring = (tool.firing_count ?? 0) > 0
        return (
          <div key={tool.name} style={{
            padding: '10px 14px',
            borderBottom: i < tools.length - 1 ? '0.5px solid #f0ede4' : 'none',
            display: 'flex',
            alignItems: 'center',
            gap: 10,
          }}>
            <div style={{ fontSize: 18, flexShrink: 0, width: 24, textAlign: 'center' }}>{TOOL_ICONS[tool.name] ?? '⚙️'}</div>
            <div style={{ flex: 1, minWidth: 0 }}>
              <div style={{ display: 'flex', alignItems: 'center', gap: 6, marginBottom: 2 }}>
                <span style={{ fontSize: 13, fontWeight: 700, color: '#141413', fontFamily: 'var(--font-body), monospace' }}>{tool.name}</span>
                {/* Badge colour: green = up, amber = up with firing alerts, red = down */}
                <span style={{
                  display: 'inline-flex', alignItems: 'center', gap: 3,
                  fontSize: 10, fontWeight: 600,
                  color: isUp ? (hasFiring ? '#F59E0B' : '#22C55E') : '#cc2200',
                  background: isUp ? (hasFiring ? 'rgba(245,158,11,0.08)' : 'rgba(34,197,94,0.08)') : 'rgba(204,34,0,0.08)',
                  border: `0.5px solid ${isUp ? (hasFiring ? 'rgba(245,158,11,0.25)' : 'rgba(34,197,94,0.25)') : 'rgba(204,34,0,0.25)'}`,
                  borderRadius: 4, padding: '1px 5px',
                }}>
                  <span style={{ width: 4, height: 4, borderRadius: '50%', background: 'currentColor', display: 'inline-block' }} />
                  {isUp ? (hasFiring ? `${tool.firing_count} 觸發` : '正常') : '離線'}
                </span>
              </div>
              <div style={{ fontSize: 11, color: '#87867f', fontFamily: 'var(--font-body), monospace' }}>
                {tool.description}
                {tool.version && <span style={{ color: '#c0bdb4' }}> · v{tool.version}</span>}
              </div>
              {tool.stats && (
                <div style={{ fontSize: 11, color: '#a0a09a', fontFamily: 'var(--font-body), monospace', marginTop: 1 }}>{tool.stats}</div>
              )}
            </div>
            <div style={{ fontSize: 10, color: '#c0bdb4', fontFamily: 'var(--font-body), monospace', flexShrink: 0, textAlign: 'right' }}>
              {new Date(tool.checked_at).toLocaleTimeString('zh-TW', { timeZone: 'Asia/Taipei', hour: '2-digit', minute: '2-digit' })}
            </div>
          </div>
        )
      })}
    </div>
  )
}
// =============================================================================
// Main Page
// =============================================================================
@@ -414,6 +508,29 @@ export default function Home({ params }: { params: { locale: string } }) {
}))} />
</div>
{/* 監控工具 */}
<div style={{
background: '#fff',
border: '0.5px solid #e0ddd4',
borderRadius: 12,
overflow: 'hidden',
boxShadow: '0 1px 4px rgba(0,0,0,0.05)',
flexShrink: 0,
}}>
<div style={{
padding: '10px 14px',
borderBottom: '0.5px solid #e0ddd4',
fontSize: 14, fontWeight: 700, color: '#141413',
letterSpacing: '0.5px',
fontFamily: 'var(--font-body), monospace', background: '#faf9f3',
display: 'flex', alignItems: 'center', gap: 8,
}}>
<div style={{ width: 6, height: 6, borderRadius: '50%', background: '#d97757', flexShrink: 0 }} />
{tDashboard('monitoringTools')}
</div>
<MonitoringTools />
</div>
</div>
</div>
</div>