Files
awoooi/apps/api/src/services/token_counter.py
Your Name dccdcdbaf5
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 9m45s
fix(flywheel): unblock action safety and Claude fallback
2026-04-29 21:51:18 +08:00

676 lines
22 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Token Counter Service - Phase 13.3 #88 AI Token Dashboard
=========================================================
Token 用量監控,整合 SigNoz OTEL Metrics + Langfuse
功能:
- 記錄每次 LLM 呼叫的 input/output tokens
- 按 provider 分類統計
- 成本估算 (Gemini/Claude 有成本,Ollama 免費)
- 每日/每月 Token 預算監控
- 超標時通知切換到本地模型
SigNoz 指標:
- llm.tokens.input (Counter) - 輸入 Token 數
- llm.tokens.output (Counter) - 輸出 Token 數
- llm.cost.usd (Counter) - 累計成本
- llm.latency.ms (Histogram) - 延遲分佈
- llm.requests.total (Counter) - 總請求數
- llm.requests.failed (Counter) - 失敗請求數
版本: v1.0
建立: 2026-03-26 14:30 (台北時區)
建立者: Claude Code
最後修改: 2026-03-26 14:30 (台北時區)
修改者: Claude Code
變更紀錄:
| 版本 | 日期 | 執行者 | 變更內容 |
|------|------|--------|----------|
| v1.0 | 2026-03-26 | Claude Code | Phase 13.3 #88 初始實作 |
"""
import time
from dataclasses import dataclass, field
from datetime import UTC, datetime
from typing import Protocol
import structlog
from opentelemetry import metrics
from opentelemetry.metrics import Counter, Histogram, Meter
from src.core.config import settings
logger = structlog.get_logger(__name__)
# =============================================================================
# Constants - Cost Per 1K Tokens (USD)
# =============================================================================
# Price in USD per 1,000 tokens for each provider (from models.json).
COST_PER_1K_TOKENS = dict(
    ollama=0.0,  # local model, free of charge
    gemini=0.001,  # Gemini 1.5 Flash
    claude=0.005,  # Claude Haiku 4.5 conservative output-side estimate
)

# Budget thresholds (from models.json monitoring.alerts).
DAILY_COST_THRESHOLD_USD = 5.0
MONTHLY_COST_THRESHOLD_USD = 10.0

# Tokens each paid provider may consume per day.
DAILY_TOKEN_BUDGET = dict(
    gemini=100_000,  # 100K tokens per day
    claude=50_000,  # 50K tokens per day
)

# Tokens each paid provider may consume per month.
MONTHLY_TOKEN_BUDGET = dict(
    gemini=2_000_000,  # 2M tokens per month
    claude=500_000,  # 500K tokens per month
)

# Usage percentage at which an early warning is raised (70%).
ALERT_THRESHOLD_PERCENT = 70
# =============================================================================
# Data Classes
# =============================================================================
@dataclass
class TokenUsage:
"""單次 LLM 呼叫的 Token 使用量"""
input_tokens: int
output_tokens: int
total_tokens: int = field(init=False)
provider: str
model: str
latency_ms: float = 0.0
success: bool = True
error_message: str | None = None
timestamp: datetime = field(default_factory=lambda: datetime.now(UTC))
def __post_init__(self):
self.total_tokens = self.input_tokens + self.output_tokens
@property
def estimated_cost_usd(self) -> float:
"""估算成本 (USD)"""
cost_per_1k = COST_PER_1K_TOKENS.get(self.provider.lower(), 0.0)
return (self.total_tokens / 1000) * cost_per_1k
@dataclass
class ProviderStats:
"""Provider 統計"""
provider: str
total_input_tokens: int = 0
total_output_tokens: int = 0
total_requests: int = 0
failed_requests: int = 0
total_latency_ms: float = 0.0
total_cost_usd: float = 0.0
period_start: datetime = field(default_factory=lambda: datetime.now(UTC))
@property
def total_tokens(self) -> int:
return self.total_input_tokens + self.total_output_tokens
@property
def success_rate(self) -> float:
if self.total_requests == 0:
return 100.0
return ((self.total_requests - self.failed_requests) / self.total_requests) * 100
@property
def avg_latency_ms(self) -> float:
if self.total_requests == 0:
return 0.0
return self.total_latency_ms / self.total_requests
@dataclass
class BudgetStatus:
    """Snapshot of one provider's token budget consumption."""

    provider: str
    # Current-day figures.
    daily_tokens_used: int
    daily_tokens_budget: int
    daily_cost_usd: float
    # Current-month figures.
    monthly_tokens_used: int
    monthly_tokens_budget: int
    monthly_cost_usd: float
    # Evaluation flags and a human-readable hint; all default to "nothing wrong".
    is_over_budget: bool = False
    alert_triggered: bool = False
    recommendation: str = ""
# =============================================================================
# Interface (Protocol for DI)
# =============================================================================
class ITokenCounter(Protocol):
    """Structural interface a token counter must satisfy (used for DI)."""

    def record_usage(self, usage: TokenUsage) -> None:
        """Record the token usage of one LLM call."""

    def get_provider_stats(self, provider: str) -> ProviderStats:
        """Return the accumulated statistics for ``provider``."""

    def get_budget_status(self, provider: str) -> BudgetStatus:
        """Return the current budget status for ``provider``."""

    def should_fallback_to_local(self, provider: str) -> tuple[bool, str]:
        """Return ``(should_fallback, reason)`` for switching to the local model."""
# =============================================================================
# Token Counter Implementation
# =============================================================================
class TokenCounter:
    """
    In-memory LLM token accounting with optional OpenTelemetry metrics export.

    Every recorded call updates three sets of per-provider statistics
    (all-time, current UTC day, current UTC month) and, when metrics are
    enabled, the ``llm.*`` OTEL instruments. Daily and monthly statistics are
    cleared lazily the first time the counter is used after the UTC day or
    month has rolled over.

    NOTE(review): the module description mentions a Langfuse integration, but
    nothing in this class calls Langfuse - confirm where that happens.

    Usage:
        counter = get_token_counter()
        counter.record_usage(TokenUsage(
            input_tokens=500,
            output_tokens=200,
            provider="ollama",
            model="qwen2.5:7b-instruct",
            latency_ms=1500,
        ))
    """

    # Providers that always appear in the summary, even with no usage yet.
    _SUMMARY_PROVIDERS = ("ollama", "gemini", "claude")

    def __init__(self):
        """Create empty statistics and set up the OTEL instruments."""
        self._provider_stats: dict[str, ProviderStats] = {}
        self._daily_stats: dict[str, ProviderStats] = {}
        self._monthly_stats: dict[str, ProviderStats] = {}

        # One clock reading for both baselines so they cannot straddle midnight.
        now = datetime.now(UTC)
        self._last_daily_reset: datetime = now.replace(
            hour=0, minute=0, second=0, microsecond=0
        )
        self._last_monthly_reset: datetime = now.replace(
            day=1, hour=0, minute=0, second=0, microsecond=0
        )

        # OTEL instruments stay None when metrics are disabled or setup fails.
        self._meter: Meter | None = None
        self._input_tokens_counter: Counter | None = None
        self._output_tokens_counter: Counter | None = None
        self._cost_counter: Counter | None = None
        self._latency_histogram: Histogram | None = None
        self._request_counter: Counter | None = None
        self._failed_counter: Counter | None = None
        self._init_metrics()

    def _init_metrics(self) -> None:
        """Create the OTEL meter and instruments.

        Does nothing when ``settings.OTEL_ENABLED`` is false or
        ``settings.MOCK_MODE`` is true. Any exception during setup is logged as
        a warning and swallowed, so instruments created before the failure
        remain set and the rest stay ``None``.
        """
        if not settings.OTEL_ENABLED or settings.MOCK_MODE:
            logger.info("otel_metrics_disabled", reason="OTEL_ENABLED=false or MOCK_MODE=true")
            return
        try:
            # Obtain a meter from the globally configured MeterProvider.
            self._meter = metrics.get_meter(
                name="awoooi.llm",
                version=settings.VERSION,
            )
            # Counters
            self._input_tokens_counter = self._meter.create_counter(
                name="llm.tokens.input",
                description="LLM input tokens count",
                unit="tokens",
            )
            self._output_tokens_counter = self._meter.create_counter(
                name="llm.tokens.output",
                description="LLM output tokens count",
                unit="tokens",
            )
            self._cost_counter = self._meter.create_counter(
                name="llm.cost.usd",
                description="Estimated LLM cost in USD",
                unit="USD",
            )
            self._request_counter = self._meter.create_counter(
                name="llm.requests.total",
                description="Total LLM requests",
                unit="requests",
            )
            self._failed_counter = self._meter.create_counter(
                name="llm.requests.failed",
                description="Failed LLM requests",
                unit="requests",
            )
            # Histogram (latency distribution)
            self._latency_histogram = self._meter.create_histogram(
                name="llm.latency.ms",
                description="LLM request latency in milliseconds",
                unit="ms",
            )
            logger.info(
                "otel_llm_metrics_initialized",
                meter_name="awoooi.llm",
            )
        except Exception as e:
            # Best effort: metrics must never prevent the counter from working.
            logger.warning(
                "otel_metrics_init_failed",
                error=str(e),
            )

    def _reset_if_needed(self) -> None:
        """Clear daily/monthly statistics once the UTC day/month has changed."""
        now = datetime.now(UTC)

        # Daily reset
        today_start = now.replace(hour=0, minute=0, second=0, microsecond=0)
        if today_start > self._last_daily_reset:
            logger.info(
                "daily_stats_reset",
                previous_date=self._last_daily_reset.isoformat(),
            )
            self._daily_stats = {}
            self._last_daily_reset = today_start

        # Monthly reset
        month_start = now.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
        if month_start > self._last_monthly_reset:
            logger.info(
                "monthly_stats_reset",
                previous_month=self._last_monthly_reset.isoformat(),
            )
            self._monthly_stats = {}
            self._last_monthly_reset = month_start

    def _get_or_create_stats(
        self, provider: str, stats_dict: dict[str, ProviderStats]
    ) -> ProviderStats:
        """Return the entry for ``provider`` in ``stats_dict``, adding it if absent."""
        if provider not in stats_dict:
            stats_dict[provider] = ProviderStats(provider=provider)
        return stats_dict[provider]

    @staticmethod
    def _usage_percent(used: int, budget: int) -> float:
        """Return ``used`` as a percentage of ``budget``; 0 when no budget is set."""
        # Integer 0 is kept for the no-budget case so rounded output is unchanged.
        return (used / budget * 100) if budget > 0 else 0

    def record_usage(self, usage: TokenUsage) -> None:
        """
        Record the token usage of one LLM call.

        Updates, in order:
        1. the in-memory statistics (all-time / daily / monthly),
        2. the OTEL instruments, when they were initialised,
        3. a structured ``token_usage_recorded`` log entry,
        and finally evaluates the budget alerts for the provider.

        Args:
            usage: The call to record; its provider name is lower-cased before
                being used as the statistics key and metric attribute.
        """
        self._reset_if_needed()
        provider = usage.provider.lower()
        # The cost is a computed property; evaluate it once for the whole call.
        cost_usd = usage.estimated_cost_usd
        attributes = {
            "provider": provider,
            "model": usage.model,
            "environment": settings.ENVIRONMENT,
        }

        # Update the in-memory statistics.
        for stats_dict in (self._provider_stats, self._daily_stats, self._monthly_stats):
            stats = self._get_or_create_stats(provider, stats_dict)
            stats.total_input_tokens += usage.input_tokens
            stats.total_output_tokens += usage.output_tokens
            stats.total_requests += 1
            stats.total_latency_ms += usage.latency_ms
            stats.total_cost_usd += cost_usd
            if not usage.success:
                stats.failed_requests += 1

        # Emit OTEL metrics (each instrument is None when metrics are disabled).
        if self._input_tokens_counter is not None:
            self._input_tokens_counter.add(usage.input_tokens, attributes)
        if self._output_tokens_counter is not None:
            self._output_tokens_counter.add(usage.output_tokens, attributes)
        if self._cost_counter is not None and cost_usd > 0:
            # The counter accepts int or float; the cost is added directly as
            # a float USD amount rather than being scaled to micro-USD.
            self._cost_counter.add(cost_usd, attributes)
        if self._request_counter is not None:
            self._request_counter.add(1, attributes)
        if not usage.success and self._failed_counter is not None:
            self._failed_counter.add(1, attributes)
        if self._latency_histogram is not None and usage.latency_ms > 0:
            self._latency_histogram.record(usage.latency_ms, attributes)

        # Structured log entry.
        logger.info(
            "token_usage_recorded",
            provider=provider,
            model=usage.model,
            input_tokens=usage.input_tokens,
            output_tokens=usage.output_tokens,
            total_tokens=usage.total_tokens,
            latency_ms=round(usage.latency_ms, 2),
            cost_usd=round(cost_usd, 6),
            success=usage.success,
        )

        # Budget alerts.
        self._check_budget_alert(provider)

    def _check_budget_alert(self, provider: str) -> None:
        """Log a warning at the alert threshold and an error once over budget."""
        status = self.get_budget_status(provider)
        if status.alert_triggered:
            logger.warning(
                "llm_budget_alert",
                provider=provider,
                daily_usage_percent=round(
                    self._usage_percent(status.daily_tokens_used, status.daily_tokens_budget),
                    1,
                ),
                monthly_usage_percent=round(
                    self._usage_percent(
                        status.monthly_tokens_used, status.monthly_tokens_budget
                    ),
                    1,
                ),
                recommendation=status.recommendation,
            )
        if status.is_over_budget:
            logger.error(
                "llm_budget_exceeded",
                provider=provider,
                daily_tokens_used=status.daily_tokens_used,
                monthly_tokens_used=status.monthly_tokens_used,
                recommendation=status.recommendation,
            )

    def get_provider_stats(self, provider: str) -> ProviderStats:
        """Return the all-time statistics for ``provider`` (case-insensitive)."""
        return self._get_or_create_stats(provider.lower(), self._provider_stats)

    def get_daily_stats(self, provider: str) -> ProviderStats:
        """Return the current-day statistics for ``provider`` (case-insensitive)."""
        self._reset_if_needed()
        return self._get_or_create_stats(provider.lower(), self._daily_stats)

    def get_monthly_stats(self, provider: str) -> ProviderStats:
        """Return the current-month statistics for ``provider`` (case-insensitive)."""
        self._reset_if_needed()
        return self._get_or_create_stats(provider.lower(), self._monthly_stats)

    def get_budget_status(self, provider: str) -> BudgetStatus:
        """Evaluate the daily and monthly token budgets for ``provider``.

        Providers without an entry in ``DAILY_TOKEN_BUDGET`` /
        ``MONTHLY_TOKEN_BUDGET`` have a budget of 0, which is treated as "no
        budget": their usage percentage is 0 and no alert is raised.

        Returns:
            A ``BudgetStatus`` whose ``alert_triggered`` is set once either
            usage percentage reaches ``ALERT_THRESHOLD_PERCENT`` and whose
            ``is_over_budget`` is set once either reaches 100.
        """
        # A single rollover check covers both lookups below.
        self._reset_if_needed()
        provider = provider.lower()
        daily_stats = self._get_or_create_stats(provider, self._daily_stats)
        monthly_stats = self._get_or_create_stats(provider, self._monthly_stats)
        daily_budget = DAILY_TOKEN_BUDGET.get(provider, 0)
        monthly_budget = MONTHLY_TOKEN_BUDGET.get(provider, 0)

        # Usage percentages
        daily_usage_percent = self._usage_percent(daily_stats.total_tokens, daily_budget)
        monthly_usage_percent = self._usage_percent(monthly_stats.total_tokens, monthly_budget)

        # Alert state
        alert_triggered = (
            daily_usage_percent >= ALERT_THRESHOLD_PERCENT
            or monthly_usage_percent >= ALERT_THRESHOLD_PERCENT
        )
        is_over_budget = daily_usage_percent >= 100 or monthly_usage_percent >= 100

        # Recommendation text (runtime strings are kept as-is)
        recommendation = ""
        if is_over_budget:
            recommendation = "建議切換到本地模型 (Ollama) 以節省成本"
        elif alert_triggered:
            recommendation = f"接近預算上限 ({max(daily_usage_percent, monthly_usage_percent):.1f}%),考慮減少 {provider} 呼叫"

        return BudgetStatus(
            provider=provider,
            daily_tokens_used=daily_stats.total_tokens,
            daily_tokens_budget=daily_budget,
            daily_cost_usd=daily_stats.total_cost_usd,
            monthly_tokens_used=monthly_stats.total_tokens,
            monthly_tokens_budget=monthly_budget,
            monthly_cost_usd=monthly_stats.total_cost_usd,
            is_over_budget=is_over_budget,
            alert_triggered=alert_triggered,
            recommendation=recommendation,
        )

    def should_fallback_to_local(self, provider: str) -> tuple[bool, str]:
        """
        Decide whether calls to ``provider`` should fall back to the local model.

        Returns:
            ``(should_fallback, reason)``. Fallback is requested only when the
            provider is over budget; reaching the alert threshold yields
            ``False`` with an advisory reason.
        """
        if provider.lower() == "ollama":
            return False, "Already using local model"
        status = self.get_budget_status(provider)
        if status.is_over_budget:
            return True, f"Budget exceeded for {provider}: {status.recommendation}"
        if status.alert_triggered:
            # At or above the alert threshold a fallback is optional, not forced.
            return False, f"Near budget threshold for {provider}: {status.recommendation}"
        return False, "Budget OK"

    def get_all_stats_summary(self) -> dict:
        """Return a summary of daily, monthly and budget figures per provider.

        The three default providers are always listed. Any other provider with
        at least one recorded request in the current day or month is appended
        in alphabetical order, so its usage is not silently left out. The
        ``total`` section aggregates the current-day figures of every listed
        provider.
        """
        self._reset_if_needed()

        # Collect the extra providers before the loop: the stats getters used
        # inside it may insert entries into the dictionaries.
        extra_providers = sorted(
            {
                name
                for stats_dict in (self._daily_stats, self._monthly_stats)
                for name, stats in stats_dict.items()
                if stats.total_requests > 0
            }.difference(self._SUMMARY_PROVIDERS)
        )

        summary = {
            "timestamp": datetime.now(UTC).isoformat(),
            "providers": {},
            "total": {
                "input_tokens": 0,
                "output_tokens": 0,
                "cost_usd": 0.0,
                "requests": 0,
            },
        }
        totals = summary["total"]

        for provider in (*self._SUMMARY_PROVIDERS, *extra_providers):
            daily = self.get_daily_stats(provider)
            monthly = self.get_monthly_stats(provider)
            budget = self.get_budget_status(provider)
            summary["providers"][provider] = {
                "daily": {
                    "input_tokens": daily.total_input_tokens,
                    "output_tokens": daily.total_output_tokens,
                    "total_tokens": daily.total_tokens,
                    "cost_usd": round(daily.total_cost_usd, 4),
                    "requests": daily.total_requests,
                    "success_rate": round(daily.success_rate, 1),
                    "avg_latency_ms": round(daily.avg_latency_ms, 1),
                },
                "monthly": {
                    "input_tokens": monthly.total_input_tokens,
                    "output_tokens": monthly.total_output_tokens,
                    "total_tokens": monthly.total_tokens,
                    "cost_usd": round(monthly.total_cost_usd, 4),
                    "requests": monthly.total_requests,
                },
                "budget": {
                    "daily_budget": budget.daily_tokens_budget,
                    "daily_usage_percent": round(
                        self._usage_percent(
                            budget.daily_tokens_used, budget.daily_tokens_budget
                        ),
                        1,
                    ),
                    "monthly_budget": budget.monthly_tokens_budget,
                    "monthly_usage_percent": round(
                        self._usage_percent(
                            budget.monthly_tokens_used, budget.monthly_tokens_budget
                        ),
                        1,
                    ),
                    "is_over_budget": budget.is_over_budget,
                    "alert_triggered": budget.alert_triggered,
                },
            }
            # Accumulate the current-day totals.
            totals["input_tokens"] += daily.total_input_tokens
            totals["output_tokens"] += daily.total_output_tokens
            totals["cost_usd"] += daily.total_cost_usd
            totals["requests"] += daily.total_requests

        totals["cost_usd"] = round(totals["cost_usd"], 4)
        return summary
# =============================================================================
# Helper: Usage Tracker Context Manager
# =============================================================================
class UsageTracker:
    """
    Context manager that times an LLM call and records its token usage.

    On exit, one ``TokenUsage`` is built from the measured latency and
    whatever was supplied through ``set_tokens`` / ``mark_failed`` and passed
    to the shared token counter. An exception leaving the ``with`` body marks
    the call as failed and is not suppressed. Works with both ``with`` and
    ``async with``.

    Usage:
        async with UsageTracker("ollama", "qwen2.5:7b-instruct") as tracker:
            result = await call_llm(prompt)
            tracker.set_tokens(input_tokens=500, output_tokens=200)
    """

    def __init__(self, provider: str, model: str):
        """Remember the provider/model and bind to the shared token counter."""
        self.provider = provider
        self.model = model
        self.start_time: float = 0
        self.input_tokens: int = 0
        self.output_tokens: int = 0
        self.success: bool = True
        self.error_message: str | None = None
        self._counter = get_token_counter()

    def __enter__(self):
        """Start the latency timer and return the tracker itself."""
        self.start_time = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop the timer and record the usage; exceptions are not suppressed."""
        elapsed_ms = (time.perf_counter() - self.start_time) * 1000

        # An exception from the with-body overrides any earlier state.
        raised = exc_type is not None
        if raised:
            self.success, self.error_message = False, str(exc_val)

        self._counter.record_usage(
            TokenUsage(
                input_tokens=self.input_tokens,
                output_tokens=self.output_tokens,
                provider=self.provider,
                model=self.model,
                latency_ms=elapsed_ms,
                success=self.success,
                error_message=self.error_message,
            )
        )

    async def __aenter__(self):
        """Async entry; delegates to ``__enter__``."""
        return self.__enter__()

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async exit; delegates to ``__exit__``."""
        return self.__exit__(exc_type, exc_val, exc_tb)

    def set_tokens(self, input_tokens: int, output_tokens: int) -> None:
        """Set the token counts to be recorded on exit."""
        self.input_tokens, self.output_tokens = input_tokens, output_tokens

    def mark_failed(self, error_message: str) -> None:
        """Mark the call as failed with the given error message."""
        self.success, self.error_message = False, error_message
# =============================================================================
# Singleton
# =============================================================================
# Module-level cache for the shared instance; None until first requested.
_token_counter: TokenCounter | None = None


def get_token_counter() -> TokenCounter:
    """Return the shared ``TokenCounter``, creating it on first use."""
    global _token_counter
    instance = _token_counter
    if instance is None:
        instance = TokenCounter()
        _token_counter = instance
    return instance
def reset_token_counter() -> None:
    """Discard the cached ``TokenCounter`` instance (intended for tests)."""
    # Rebinding through the module namespace is equivalent to a ``global`` assignment.
    globals()["_token_counter"] = None
# =============================================================================
# Convenience Functions
# =============================================================================
def record_token_usage(
    provider: str,
    model: str,
    input_tokens: int,
    output_tokens: int,
    latency_ms: float = 0.0,
    success: bool = True,
    error_message: str | None = None,
) -> None:
    """Convenience wrapper: record one LLM call on the shared token counter.

    Builds a ``TokenUsage`` from the arguments and passes it to
    ``get_token_counter().record_usage``.
    """
    fields = {
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "provider": provider,
        "model": model,
        "latency_ms": latency_ms,
        "success": success,
        "error_message": error_message,
    }
    # Build the record before fetching the counter, as the original did.
    usage = TokenUsage(**fields)
    counter = get_token_counter()
    counter.record_usage(usage)
def should_use_local_model(provider: str) -> tuple[bool, str]:
    """Convenience wrapper around ``should_fallback_to_local``.

    Returns:
        ``(should_fallback, reason)`` exactly as produced by the shared
        token counter's ``should_fallback_to_local``.
    """
    counter = get_token_counter()
    decision = counter.should_fallback_to_local(provider)
    return decision