Files
awoooi/apps/api/src/services/metrics_service.py
OG T e7f361db50 refactor(api): Phase 17 metrics.py Router 層違規修復
移除 Router 層直接 DB 存取,遵循 leWOOOgo 積木化原則:
- 新增 IMetricsRepository Protocol (interfaces.py)
- 新增 MetricsDBRepository 封裝 DB 查詢
- 新增 MetricsService 封裝業務邏輯
- Router 層只做 HTTP 轉發

架構: Router → Service → Repository → PostgreSQL

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-26 10:01:57 +08:00

382 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Metrics Service - 黃金指標服務層
=================================
Phase 17 技術債修復: Router 層違規抽取
職責:
- 聚合 SignOz + DB 數據
- 計算健康狀態 (healthy/warning/critical)
- 生成趨勢數據
設計原則:
- Service 透過 Protocol 依賴 Repository (DI)
- 業務邏輯集中於此層
- Router 只做 HTTP 轉發
版本: v1.0
建立: 2026-03-26 (台北時區)
建立者: Claude Code (Phase 17 技術債修復)
"""
from datetime import UTC, datetime
from typing import Any
import structlog
from src.repositories.interfaces import IMetricsRepository
from src.repositories.metrics_repository import get_metrics_repository
from src.services.signoz_client import get_signoz_client
logger = structlog.get_logger(__name__)
# =============================================================================
# Data Classes
# =============================================================================
class GoldMetricResult:
"""單一黃金指標結果"""
def __init__(
self,
label: str,
value: float | str,
unit: str | None,
trend: list[float],
status: str,
) -> None:
self.label = label
self.value = value
self.unit = unit
self.trend = trend
self.status = status
class GoldMetricsResult:
"""黃金指標聚合結果"""
def __init__(
self,
timestamp: datetime,
service_name: str,
metrics: list[GoldMetricResult],
raw_data: dict[str, Any] | None = None,
) -> None:
self.timestamp = timestamp
self.service_name = service_name
self.metrics = metrics
self.raw_data = raw_data
# =============================================================================
# MetricsService
# =============================================================================
class MetricsService:
"""
黃金指標服務
職責:
1. 從 SignOz 取得 RPS, Error Rate, P99
2. 從 Repository 取得 AI Success Rate
3. 計算健康狀態
4. 組合成統一的 GoldMetrics 回應
使用方式:
service = MetricsService()
result = await service.get_gold_metrics("awoooi-api", 10)
"""
def __init__(
self,
metrics_repo: IMetricsRepository | None = None,
) -> None:
"""
依賴注入建構函數
Args:
metrics_repo: Metrics Repository (預設使用 Singleton)
"""
self._metrics_repo = metrics_repo or get_metrics_repository()
# =========================================================================
# Gold Metrics
# =========================================================================
async def get_gold_metrics(
self,
service_name: str = "awoooi-api",
time_window_minutes: int = 10,
) -> GoldMetricsResult:
"""
獲取黃金指標 (Gold Metrics)
統帥鐵律:
- 所有數據必須來自 SignOz 真實血脈
- AI Success 來自 AuditLog 真實統計
- 無數據時顯示 0嚴禁造假
Returns:
GoldMetricsResult with RPS, Error Rate, P99, AI Success
"""
logger.info(
"gold_metrics_fetch",
service=service_name,
window_minutes=time_window_minutes,
)
metrics_list: list[GoldMetricResult] = []
raw_data: dict[str, Any] = {}
# =====================================================================
# 1. SignOz Gold Metrics (RPS, Error Rate, P99)
# =====================================================================
signoz_metrics = await self._fetch_signoz_metrics(
service_name,
time_window_minutes,
)
metrics_list.extend(signoz_metrics["metrics"])
raw_data.update(signoz_metrics["raw_data"])
# =====================================================================
# 2. AI Success Rate (from Repository)
# =====================================================================
ai_metrics = await self._fetch_ai_success_metrics(hours=24)
metrics_list.append(ai_metrics["metric"])
raw_data.update(ai_metrics["raw_data"])
return GoldMetricsResult(
timestamp=datetime.now(UTC),
service_name=service_name,
metrics=metrics_list,
raw_data=raw_data,
)
async def _fetch_signoz_metrics(
self,
service_name: str,
time_window_minutes: int,
) -> dict[str, Any]:
"""
從 SignOz 取得 RPS, Error Rate, P99
統帥鐵律: SignOz 斷線時顯示 0非假數據
"""
metrics: list[GoldMetricResult] = []
raw_data: dict[str, Any] = {}
try:
signoz = get_signoz_client()
gold = await signoz.get_gold_metrics(
service_name=service_name,
time_window_minutes=time_window_minutes,
)
# RPS
rps_status = self._calculate_rps_status(gold.rps)
rps_trend = self._generate_simulated_trend(gold.rps)
metrics.append(GoldMetricResult(
label="RPS",
value=round(gold.rps, 1),
unit="req/s",
trend=rps_trend,
status=rps_status,
))
# Error Rate
error_status = self._calculate_error_status(gold.error_rate)
error_trend = self._generate_simulated_trend(gold.error_rate)
metrics.append(GoldMetricResult(
label="Error Rate",
value=round(gold.error_rate, 2),
unit="%",
trend=error_trend,
status=error_status,
))
# P99 Latency
p99_status = self._calculate_latency_status(gold.p99_latency_ms)
p99_trend = self._generate_simulated_trend(gold.p99_latency_ms)
metrics.append(GoldMetricResult(
label="P99 Latency",
value=round(gold.p99_latency_ms, 0),
unit="ms",
trend=p99_trend,
status=p99_status,
))
raw_data["signoz"] = {
"rps": gold.rps,
"error_rate": gold.error_rate,
"p99_latency_ms": gold.p99_latency_ms,
"total_requests": gold.total_requests,
"error_count": gold.error_count,
}
except Exception as e:
logger.warning("signoz_metrics_error", error=str(e))
# 統帥鐵律: SignOz 斷線時顯示 0非假數據
metrics.extend([
GoldMetricResult(
label="RPS",
value=0,
unit="req/s",
trend=[0] * 10,
status="critical",
),
GoldMetricResult(
label="Error Rate",
value=0,
unit="%",
trend=[0] * 10,
status="critical",
),
GoldMetricResult(
label="P99 Latency",
value=0,
unit="ms",
trend=[0] * 10,
status="critical",
),
])
raw_data["signoz_error"] = str(e)
return {"metrics": metrics, "raw_data": raw_data}
async def _fetch_ai_success_metrics(
self,
hours: int = 24,
) -> dict[str, Any]:
"""
從 Repository 取得 AI Success Rate
統帥鐵律: 若無數據,回傳真實的 0嚴禁造假
"""
# 從 Repository 取得數據
success_rate, executed, total = await self._metrics_repo.get_ai_success_rate(
hours=hours,
)
trend = await self._metrics_repo.get_ai_success_trend(
hours=hours,
points=10,
)
ai_status = self._calculate_ai_success_status(success_rate)
metric = GoldMetricResult(
label="AI Success",
value=round(success_rate, 1),
unit="%",
trend=trend,
status=ai_status,
)
raw_data = {
"ai_success": {
"rate": success_rate,
"executed": executed,
"total": total,
"hours": hours,
}
}
logger.info(
"ai_success_rate_calculated",
success_rate=success_rate,
executed=executed,
total=total,
hours=hours,
)
return {"metric": metric, "raw_data": raw_data}
# =========================================================================
# Health Check
# =========================================================================
async def check_health(self) -> dict[str, Any]:
"""
Metrics 子系統健康檢查
快速檢查 SignOz 連線狀態
"""
try:
signoz = get_signoz_client()
results = await signoz._query_clickhouse("SELECT 1")
clickhouse_ok = len(results) > 0
except Exception as e:
clickhouse_ok = False
logger.warning("clickhouse_health_check_failed", error=str(e))
return {
"status": "healthy" if clickhouse_ok else "degraded",
"clickhouse": "connected" if clickhouse_ok else "disconnected",
"timestamp": datetime.now(UTC).isoformat(),
}
# =========================================================================
# Status Calculation Helpers
# =========================================================================
@staticmethod
def _calculate_rps_status(rps: float) -> str:
"""計算 RPS 健康狀態"""
if rps < 1000:
return "healthy"
if rps < 5000:
return "warning"
return "critical"
@staticmethod
def _calculate_error_status(error_rate: float) -> str:
"""計算 Error Rate 健康狀態"""
if error_rate < 1:
return "healthy"
if error_rate < 5:
return "warning"
return "critical"
@staticmethod
def _calculate_latency_status(p99_ms: float) -> str:
"""計算 P99 Latency 健康狀態"""
if p99_ms < 200:
return "healthy"
if p99_ms < 500:
return "warning"
return "critical"
@staticmethod
def _calculate_ai_success_status(success_rate: float) -> str:
"""計算 AI Success Rate 健康狀態"""
if success_rate >= 90:
return "healthy"
if success_rate >= 70:
return "warning"
return "critical"
@staticmethod
def _generate_simulated_trend(base_value: float, points: int = 10) -> list[float]:
"""
生成模擬趨勢數據 (SignOz 不提供歷史數據時使用)
注意: 這是暫時方案,未來應從 SignOz 取得真實歷史數據
"""
return [base_value * (0.9 + i * 0.02) for i in range(points)]
# =============================================================================
# Singleton
# =============================================================================
_metrics_service: MetricsService | None = None


def get_metrics_service() -> MetricsService:
    """Return the module-wide ``MetricsService``, creating it on first call."""
    global _metrics_service
    instance = _metrics_service
    if instance is None:
        # First call: build the service and cache it for later calls.
        instance = MetricsService()
        _metrics_service = instance
    return instance