移除 Router 層直接 DB 存取,遵循 leWOOOgo 積木化原則: - 新增 IMetricsRepository Protocol (interfaces.py) - 新增 MetricsDBRepository 封裝 DB 查詢 - 新增 MetricsService 封裝業務邏輯 - Router 層只做 HTTP 轉發 架構: Router → Service → Repository → PostgreSQL Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
382 lines
11 KiB
Python
382 lines
11 KiB
Python
"""
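Metrics Service - gold-metrics service layer
============================================
Phase 17 technical-debt fix: logic extracted out of the Router layer,
where it violated the layering rules.

Responsibilities:
- Aggregate SignOz and DB data
- Compute health status (healthy/warning/critical)
- Generate trend data

Design principles:
- The Service depends on the Repository through a Protocol (DI)
- Business logic is concentrated in this layer
- The Router only forwards HTTP requests

Version: v1.0
Created: 2026-03-26 (Taipei time zone)
Author: Claude Code (Phase 17 technical-debt fix)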
"""
|
||
|
||
from datetime import UTC, datetime
|
||
from typing import Any
|
||
|
||
import structlog
|
||
|
||
from src.repositories.interfaces import IMetricsRepository
|
||
from src.repositories.metrics_repository import get_metrics_repository
|
||
from src.services.signoz_client import get_signoz_client
|
||
|
||
# Module-level structlog logger, named after this module.
logger = structlog.get_logger(__name__)
|
||
|
||
|
||
# =============================================================================
|
||
# Data Classes
|
||
# =============================================================================
|
||
|
||
|
||
class GoldMetricResult:
|
||
"""單一黃金指標結果"""
|
||
|
||
def __init__(
|
||
self,
|
||
label: str,
|
||
value: float | str,
|
||
unit: str | None,
|
||
trend: list[float],
|
||
status: str,
|
||
) -> None:
|
||
self.label = label
|
||
self.value = value
|
||
self.unit = unit
|
||
self.trend = trend
|
||
self.status = status
|
||
|
||
|
||
class GoldMetricsResult:
    """Aggregated gold-metrics result for one service.

    Plain value holder: every constructor argument is stored verbatim on the
    instance under the same name.

    Attributes:
        timestamp: Moment the result was produced.
        service_name: Name of the service the metrics belong to.
        metrics: The individual ``GoldMetricResult`` entries.
        raw_data: Optional dictionary of underlying raw values; defaults to
            ``None`` when not supplied.
    """

    def __init__(
        self,
        timestamp: datetime,
        service_name: str,
        metrics: list[GoldMetricResult],
        raw_data: dict[str, Any] | None = None,
    ) -> None:
        # Attribute order (timestamp, service_name, metrics, raw_data) is kept stable.
        self.timestamp, self.service_name = timestamp, service_name
        self.metrics, self.raw_data = metrics, raw_data
|
||
|
||
|
||
# =============================================================================
|
||
# MetricsService
|
||
# =============================================================================
|
||
|
||
|
||
class MetricsService:
|
||
"""
|
||
黃金指標服務
|
||
|
||
職責:
|
||
1. 從 SignOz 取得 RPS, Error Rate, P99
|
||
2. 從 Repository 取得 AI Success Rate
|
||
3. 計算健康狀態
|
||
4. 組合成統一的 GoldMetrics 回應
|
||
|
||
使用方式:
|
||
service = MetricsService()
|
||
result = await service.get_gold_metrics("awoooi-api", 10)
|
||
"""
|
||
|
||
def __init__(
|
||
self,
|
||
metrics_repo: IMetricsRepository | None = None,
|
||
) -> None:
|
||
"""
|
||
依賴注入建構函數
|
||
|
||
Args:
|
||
metrics_repo: Metrics Repository (預設使用 Singleton)
|
||
"""
|
||
self._metrics_repo = metrics_repo or get_metrics_repository()
|
||
|
||
# =========================================================================
|
||
# Gold Metrics
|
||
# =========================================================================
|
||
|
||
async def get_gold_metrics(
|
||
self,
|
||
service_name: str = "awoooi-api",
|
||
time_window_minutes: int = 10,
|
||
) -> GoldMetricsResult:
|
||
"""
|
||
獲取黃金指標 (Gold Metrics)
|
||
|
||
統帥鐵律:
|
||
- 所有數據必須來自 SignOz 真實血脈
|
||
- AI Success 來自 AuditLog 真實統計
|
||
- 無數據時顯示 0,嚴禁造假
|
||
|
||
Returns:
|
||
GoldMetricsResult with RPS, Error Rate, P99, AI Success
|
||
"""
|
||
logger.info(
|
||
"gold_metrics_fetch",
|
||
service=service_name,
|
||
window_minutes=time_window_minutes,
|
||
)
|
||
|
||
metrics_list: list[GoldMetricResult] = []
|
||
raw_data: dict[str, Any] = {}
|
||
|
||
# =====================================================================
|
||
# 1. SignOz Gold Metrics (RPS, Error Rate, P99)
|
||
# =====================================================================
|
||
signoz_metrics = await self._fetch_signoz_metrics(
|
||
service_name,
|
||
time_window_minutes,
|
||
)
|
||
metrics_list.extend(signoz_metrics["metrics"])
|
||
raw_data.update(signoz_metrics["raw_data"])
|
||
|
||
# =====================================================================
|
||
# 2. AI Success Rate (from Repository)
|
||
# =====================================================================
|
||
ai_metrics = await self._fetch_ai_success_metrics(hours=24)
|
||
metrics_list.append(ai_metrics["metric"])
|
||
raw_data.update(ai_metrics["raw_data"])
|
||
|
||
return GoldMetricsResult(
|
||
timestamp=datetime.now(UTC),
|
||
service_name=service_name,
|
||
metrics=metrics_list,
|
||
raw_data=raw_data,
|
||
)
|
||
|
||
async def _fetch_signoz_metrics(
|
||
self,
|
||
service_name: str,
|
||
time_window_minutes: int,
|
||
) -> dict[str, Any]:
|
||
"""
|
||
從 SignOz 取得 RPS, Error Rate, P99
|
||
|
||
統帥鐵律: SignOz 斷線時顯示 0,非假數據
|
||
"""
|
||
metrics: list[GoldMetricResult] = []
|
||
raw_data: dict[str, Any] = {}
|
||
|
||
try:
|
||
signoz = get_signoz_client()
|
||
gold = await signoz.get_gold_metrics(
|
||
service_name=service_name,
|
||
time_window_minutes=time_window_minutes,
|
||
)
|
||
|
||
# RPS
|
||
rps_status = self._calculate_rps_status(gold.rps)
|
||
rps_trend = self._generate_simulated_trend(gold.rps)
|
||
metrics.append(GoldMetricResult(
|
||
label="RPS",
|
||
value=round(gold.rps, 1),
|
||
unit="req/s",
|
||
trend=rps_trend,
|
||
status=rps_status,
|
||
))
|
||
|
||
# Error Rate
|
||
error_status = self._calculate_error_status(gold.error_rate)
|
||
error_trend = self._generate_simulated_trend(gold.error_rate)
|
||
metrics.append(GoldMetricResult(
|
||
label="Error Rate",
|
||
value=round(gold.error_rate, 2),
|
||
unit="%",
|
||
trend=error_trend,
|
||
status=error_status,
|
||
))
|
||
|
||
# P99 Latency
|
||
p99_status = self._calculate_latency_status(gold.p99_latency_ms)
|
||
p99_trend = self._generate_simulated_trend(gold.p99_latency_ms)
|
||
metrics.append(GoldMetricResult(
|
||
label="P99 Latency",
|
||
value=round(gold.p99_latency_ms, 0),
|
||
unit="ms",
|
||
trend=p99_trend,
|
||
status=p99_status,
|
||
))
|
||
|
||
raw_data["signoz"] = {
|
||
"rps": gold.rps,
|
||
"error_rate": gold.error_rate,
|
||
"p99_latency_ms": gold.p99_latency_ms,
|
||
"total_requests": gold.total_requests,
|
||
"error_count": gold.error_count,
|
||
}
|
||
|
||
except Exception as e:
|
||
logger.warning("signoz_metrics_error", error=str(e))
|
||
# 統帥鐵律: SignOz 斷線時顯示 0,非假數據
|
||
metrics.extend([
|
||
GoldMetricResult(
|
||
label="RPS",
|
||
value=0,
|
||
unit="req/s",
|
||
trend=[0] * 10,
|
||
status="critical",
|
||
),
|
||
GoldMetricResult(
|
||
label="Error Rate",
|
||
value=0,
|
||
unit="%",
|
||
trend=[0] * 10,
|
||
status="critical",
|
||
),
|
||
GoldMetricResult(
|
||
label="P99 Latency",
|
||
value=0,
|
||
unit="ms",
|
||
trend=[0] * 10,
|
||
status="critical",
|
||
),
|
||
])
|
||
raw_data["signoz_error"] = str(e)
|
||
|
||
return {"metrics": metrics, "raw_data": raw_data}
|
||
|
||
async def _fetch_ai_success_metrics(
|
||
self,
|
||
hours: int = 24,
|
||
) -> dict[str, Any]:
|
||
"""
|
||
從 Repository 取得 AI Success Rate
|
||
|
||
統帥鐵律: 若無數據,回傳真實的 0,嚴禁造假
|
||
"""
|
||
# 從 Repository 取得數據
|
||
success_rate, executed, total = await self._metrics_repo.get_ai_success_rate(
|
||
hours=hours,
|
||
)
|
||
trend = await self._metrics_repo.get_ai_success_trend(
|
||
hours=hours,
|
||
points=10,
|
||
)
|
||
|
||
ai_status = self._calculate_ai_success_status(success_rate)
|
||
|
||
metric = GoldMetricResult(
|
||
label="AI Success",
|
||
value=round(success_rate, 1),
|
||
unit="%",
|
||
trend=trend,
|
||
status=ai_status,
|
||
)
|
||
|
||
raw_data = {
|
||
"ai_success": {
|
||
"rate": success_rate,
|
||
"executed": executed,
|
||
"total": total,
|
||
"hours": hours,
|
||
}
|
||
}
|
||
|
||
logger.info(
|
||
"ai_success_rate_calculated",
|
||
success_rate=success_rate,
|
||
executed=executed,
|
||
total=total,
|
||
hours=hours,
|
||
)
|
||
|
||
return {"metric": metric, "raw_data": raw_data}
|
||
|
||
# =========================================================================
|
||
# Health Check
|
||
# =========================================================================
|
||
|
||
async def check_health(self) -> dict[str, Any]:
|
||
"""
|
||
Metrics 子系統健康檢查
|
||
|
||
快速檢查 SignOz 連線狀態
|
||
"""
|
||
try:
|
||
signoz = get_signoz_client()
|
||
results = await signoz._query_clickhouse("SELECT 1")
|
||
clickhouse_ok = len(results) > 0
|
||
except Exception as e:
|
||
clickhouse_ok = False
|
||
logger.warning("clickhouse_health_check_failed", error=str(e))
|
||
|
||
return {
|
||
"status": "healthy" if clickhouse_ok else "degraded",
|
||
"clickhouse": "connected" if clickhouse_ok else "disconnected",
|
||
"timestamp": datetime.now(UTC).isoformat(),
|
||
}
|
||
|
||
# =========================================================================
|
||
# Status Calculation Helpers
|
||
# =========================================================================
|
||
|
||
@staticmethod
|
||
def _calculate_rps_status(rps: float) -> str:
|
||
"""計算 RPS 健康狀態"""
|
||
if rps < 1000:
|
||
return "healthy"
|
||
if rps < 5000:
|
||
return "warning"
|
||
return "critical"
|
||
|
||
@staticmethod
|
||
def _calculate_error_status(error_rate: float) -> str:
|
||
"""計算 Error Rate 健康狀態"""
|
||
if error_rate < 1:
|
||
return "healthy"
|
||
if error_rate < 5:
|
||
return "warning"
|
||
return "critical"
|
||
|
||
@staticmethod
|
||
def _calculate_latency_status(p99_ms: float) -> str:
|
||
"""計算 P99 Latency 健康狀態"""
|
||
if p99_ms < 200:
|
||
return "healthy"
|
||
if p99_ms < 500:
|
||
return "warning"
|
||
return "critical"
|
||
|
||
@staticmethod
|
||
def _calculate_ai_success_status(success_rate: float) -> str:
|
||
"""計算 AI Success Rate 健康狀態"""
|
||
if success_rate >= 90:
|
||
return "healthy"
|
||
if success_rate >= 70:
|
||
return "warning"
|
||
return "critical"
|
||
|
||
@staticmethod
|
||
def _generate_simulated_trend(base_value: float, points: int = 10) -> list[float]:
|
||
"""
|
||
生成模擬趨勢數據 (SignOz 不提供歷史數據時使用)
|
||
|
||
注意: 這是暫時方案,未來應從 SignOz 取得真實歷史數據
|
||
"""
|
||
return [base_value * (0.9 + i * 0.02) for i in range(points)]
|
||
|
||
|
||
# =============================================================================
|
||
# Singleton
|
||
# =============================================================================
|
||
|
||
# Module-level cache for the shared MetricsService; None until first requested.
_metrics_service: MetricsService | None = None


def get_metrics_service() -> MetricsService:
    """Return the shared MetricsService, constructing it on the first call."""
    global _metrics_service
    if _metrics_service is not None:
        return _metrics_service
    _metrics_service = MetricsService()
    return _metrics_service
|