""" Token Counter Service - Phase 13.3 #88 AI Token Dashboard ========================================================= Token 用量監控,整合 SignOz OTEL Metrics + Langfuse 功能: - 記錄每次 LLM 呼叫的 input/output tokens - 按 provider 分類統計 - 成本估算 (Gemini/Claude 有成本,Ollama 免費) - 每日/每月 Token 預算監控 - 超標時通知切換到本地模型 SignOz 指標: - llm.tokens.input (Counter) - 輸入 Token 數 - llm.tokens.output (Counter) - 輸出 Token 數 - llm.cost.usd (Counter) - 累計成本 - llm.latency.ms (Histogram) - 延遲分佈 - llm.requests.total (Counter) - 總請求數 - llm.requests.failed (Counter) - 失敗請求數 版本: v1.0 建立: 2026-03-26 14:30 (台北時區) 建立者: Claude Code 最後修改: 2026-03-26 14:30 (台北時區) 修改者: Claude Code 變更紀錄: | 版本 | 日期 | 執行者 | 變更內容 | |------|------|--------|----------| | v1.0 | 2026-03-26 | Claude Code | Phase 13.3 #88 初始實作 | """ import time from dataclasses import dataclass, field from datetime import UTC, datetime from typing import Protocol import structlog from opentelemetry import metrics from opentelemetry.metrics import Counter, Histogram, Meter from src.core.config import settings logger = structlog.get_logger(__name__) # ============================================================================= # Constants - Cost Per 1K Tokens (USD) # ============================================================================= # 成本定義 (from models.json) COST_PER_1K_TOKENS = { "ollama": 0.0, # 本地免費 "gemini": 0.001, # Gemini 1.5 Flash "claude": 0.005, # Claude Haiku 4.5 conservative output-side estimate } # 預算閾值 (from models.json monitoring.alerts) DAILY_COST_THRESHOLD_USD = 5.0 MONTHLY_COST_THRESHOLD_USD = 10.0 DAILY_TOKEN_BUDGET = { "gemini": 100_000, # 每日 100K tokens "claude": 50_000, # 每日 50K tokens } MONTHLY_TOKEN_BUDGET = { "gemini": 2_000_000, # 每月 2M tokens "claude": 500_000, # 每月 500K tokens } ALERT_THRESHOLD_PERCENT = 70 # 70% 預警 # ============================================================================= # Data Classes # ============================================================================= @dataclass class TokenUsage: """單次 LLM 呼叫的 Token 使用量""" input_tokens: int output_tokens: int total_tokens: int = field(init=False) provider: str model: str latency_ms: float = 0.0 success: bool = True error_message: str | None = None timestamp: datetime = field(default_factory=lambda: datetime.now(UTC)) def __post_init__(self): self.total_tokens = self.input_tokens + self.output_tokens @property def estimated_cost_usd(self) -> float: """估算成本 (USD)""" cost_per_1k = COST_PER_1K_TOKENS.get(self.provider.lower(), 0.0) return (self.total_tokens / 1000) * cost_per_1k @dataclass class ProviderStats: """Provider 統計""" provider: str total_input_tokens: int = 0 total_output_tokens: int = 0 total_requests: int = 0 failed_requests: int = 0 total_latency_ms: float = 0.0 total_cost_usd: float = 0.0 period_start: datetime = field(default_factory=lambda: datetime.now(UTC)) @property def total_tokens(self) -> int: return self.total_input_tokens + self.total_output_tokens @property def success_rate(self) -> float: if self.total_requests == 0: return 100.0 return ((self.total_requests - self.failed_requests) / self.total_requests) * 100 @property def avg_latency_ms(self) -> float: if self.total_requests == 0: return 0.0 return self.total_latency_ms / self.total_requests @dataclass class BudgetStatus: """預算狀態""" provider: str daily_tokens_used: int daily_tokens_budget: int daily_cost_usd: float monthly_tokens_used: int monthly_tokens_budget: int monthly_cost_usd: float is_over_budget: bool = False alert_triggered: bool = False recommendation: str = "" # ============================================================================= # Interface (Protocol for DI) # ============================================================================= class ITokenCounter(Protocol): """Token Counter Interface""" def record_usage(self, usage: TokenUsage) -> None: """記錄 Token 使用""" ... def get_provider_stats(self, provider: str) -> ProviderStats: """取得 Provider 統計""" ... def get_budget_status(self, provider: str) -> BudgetStatus: """取得預算狀態""" ... def should_fallback_to_local(self, provider: str) -> tuple[bool, str]: """檢查是否應該 fallback 到本地模型""" ... # ============================================================================= # Token Counter Implementation # ============================================================================= class TokenCounter: """ Token 計數器 - OTEL Metrics + Langfuse 整合 使用 OpenTelemetry Metrics API 將指標送到 SignOz, 同時整合 Langfuse 記錄詳細的 LLM trace。 Usage: counter = get_token_counter() counter.record_usage(TokenUsage( input_tokens=500, output_tokens=200, provider="ollama", model="qwen2.5:7b-instruct", latency_ms=1500, )) """ def __init__(self): self._provider_stats: dict[str, ProviderStats] = {} self._daily_stats: dict[str, ProviderStats] = {} self._monthly_stats: dict[str, ProviderStats] = {} self._last_daily_reset: datetime = datetime.now(UTC).replace( hour=0, minute=0, second=0, microsecond=0 ) self._last_monthly_reset: datetime = datetime.now(UTC).replace( day=1, hour=0, minute=0, second=0, microsecond=0 ) # OTEL Metrics 初始化 self._meter: Meter | None = None self._input_tokens_counter: Counter | None = None self._output_tokens_counter: Counter | None = None self._cost_counter: Counter | None = None self._latency_histogram: Histogram | None = None self._request_counter: Counter | None = None self._failed_counter: Counter | None = None self._init_metrics() def _init_metrics(self) -> None: """初始化 OTEL Metrics""" if not settings.OTEL_ENABLED or settings.MOCK_MODE: logger.info("otel_metrics_disabled", reason="OTEL_ENABLED=false or MOCK_MODE=true") return try: # 取得 MeterProvider self._meter = metrics.get_meter( name="awoooi.llm", version=settings.VERSION, ) # 建立 Counters self._input_tokens_counter = self._meter.create_counter( name="llm.tokens.input", description="LLM input tokens count", unit="tokens", ) self._output_tokens_counter = self._meter.create_counter( name="llm.tokens.output", description="LLM output tokens count", unit="tokens", ) self._cost_counter = self._meter.create_counter( name="llm.cost.usd", description="Estimated LLM cost in USD", unit="USD", ) self._request_counter = self._meter.create_counter( name="llm.requests.total", description="Total LLM requests", unit="requests", ) self._failed_counter = self._meter.create_counter( name="llm.requests.failed", description="Failed LLM requests", unit="requests", ) # 建立 Histogram (延遲分佈) self._latency_histogram = self._meter.create_histogram( name="llm.latency.ms", description="LLM request latency in milliseconds", unit="ms", ) logger.info( "otel_llm_metrics_initialized", meter_name="awoooi.llm", ) except Exception as e: logger.warning( "otel_metrics_init_failed", error=str(e), ) def _reset_if_needed(self) -> None: """檢查並重置每日/每月統計""" now = datetime.now(UTC) # 每日重置 today_start = now.replace(hour=0, minute=0, second=0, microsecond=0) if today_start > self._last_daily_reset: logger.info( "daily_stats_reset", previous_date=self._last_daily_reset.isoformat(), ) self._daily_stats = {} self._last_daily_reset = today_start # 每月重置 month_start = now.replace(day=1, hour=0, minute=0, second=0, microsecond=0) if month_start > self._last_monthly_reset: logger.info( "monthly_stats_reset", previous_month=self._last_monthly_reset.isoformat(), ) self._monthly_stats = {} self._last_monthly_reset = month_start def _get_or_create_stats( self, provider: str, stats_dict: dict[str, ProviderStats] ) -> ProviderStats: """取得或建立 Provider 統計""" if provider not in stats_dict: stats_dict[provider] = ProviderStats(provider=provider) return stats_dict[provider] def record_usage(self, usage: TokenUsage) -> None: """ 記錄 Token 使用量 同時更新: 1. 內存統計 (總計/每日/每月) 2. OTEL Metrics (SignOz) 3. Langfuse (如果有 trace context) """ self._reset_if_needed() provider = usage.provider.lower() attributes = { "provider": provider, "model": usage.model, "environment": settings.ENVIRONMENT, } # 更新內存統計 for stats_dict in [self._provider_stats, self._daily_stats, self._monthly_stats]: stats = self._get_or_create_stats(provider, stats_dict) stats.total_input_tokens += usage.input_tokens stats.total_output_tokens += usage.output_tokens stats.total_requests += 1 stats.total_latency_ms += usage.latency_ms stats.total_cost_usd += usage.estimated_cost_usd if not usage.success: stats.failed_requests += 1 # 發送 OTEL Metrics if self._input_tokens_counter: self._input_tokens_counter.add(usage.input_tokens, attributes) if self._output_tokens_counter: self._output_tokens_counter.add(usage.output_tokens, attributes) if self._cost_counter and usage.estimated_cost_usd > 0: # Counter 只接受整數或 float,成本用 micro-USD (乘以 1,000,000) # 或直接用 float self._cost_counter.add(usage.estimated_cost_usd, attributes) if self._request_counter: self._request_counter.add(1, attributes) if not usage.success and self._failed_counter: self._failed_counter.add(1, attributes) if self._latency_histogram and usage.latency_ms > 0: self._latency_histogram.record(usage.latency_ms, attributes) # 記錄日誌 logger.info( "token_usage_recorded", provider=provider, model=usage.model, input_tokens=usage.input_tokens, output_tokens=usage.output_tokens, total_tokens=usage.total_tokens, latency_ms=round(usage.latency_ms, 2), cost_usd=round(usage.estimated_cost_usd, 6), success=usage.success, ) # 檢查預算告警 self._check_budget_alert(provider) def _check_budget_alert(self, provider: str) -> None: """檢查預算告警""" status = self.get_budget_status(provider) if status.alert_triggered: logger.warning( "llm_budget_alert", provider=provider, daily_usage_percent=round( (status.daily_tokens_used / status.daily_tokens_budget * 100) if status.daily_tokens_budget > 0 else 0, 1, ), monthly_usage_percent=round( (status.monthly_tokens_used / status.monthly_tokens_budget * 100) if status.monthly_tokens_budget > 0 else 0, 1, ), recommendation=status.recommendation, ) if status.is_over_budget: logger.error( "llm_budget_exceeded", provider=provider, daily_tokens_used=status.daily_tokens_used, monthly_tokens_used=status.monthly_tokens_used, recommendation=status.recommendation, ) def get_provider_stats(self, provider: str) -> ProviderStats: """取得 Provider 總計統計""" return self._get_or_create_stats(provider.lower(), self._provider_stats) def get_daily_stats(self, provider: str) -> ProviderStats: """取得 Provider 每日統計""" self._reset_if_needed() return self._get_or_create_stats(provider.lower(), self._daily_stats) def get_monthly_stats(self, provider: str) -> ProviderStats: """取得 Provider 每月統計""" self._reset_if_needed() return self._get_or_create_stats(provider.lower(), self._monthly_stats) def get_budget_status(self, provider: str) -> BudgetStatus: """取得預算狀態""" self._reset_if_needed() provider = provider.lower() daily_stats = self.get_daily_stats(provider) monthly_stats = self.get_monthly_stats(provider) daily_budget = DAILY_TOKEN_BUDGET.get(provider, 0) monthly_budget = MONTHLY_TOKEN_BUDGET.get(provider, 0) # 計算使用率 daily_usage_percent = ( (daily_stats.total_tokens / daily_budget * 100) if daily_budget > 0 else 0 ) monthly_usage_percent = ( (monthly_stats.total_tokens / monthly_budget * 100) if monthly_budget > 0 else 0 ) # 判斷告警狀態 alert_triggered = ( daily_usage_percent >= ALERT_THRESHOLD_PERCENT or monthly_usage_percent >= ALERT_THRESHOLD_PERCENT ) is_over_budget = daily_usage_percent >= 100 or monthly_usage_percent >= 100 # 建議 recommendation = "" if is_over_budget: recommendation = "建議切換到本地模型 (Ollama) 以節省成本" elif alert_triggered: recommendation = f"接近預算上限 ({max(daily_usage_percent, monthly_usage_percent):.1f}%),考慮減少 {provider} 呼叫" return BudgetStatus( provider=provider, daily_tokens_used=daily_stats.total_tokens, daily_tokens_budget=daily_budget, daily_cost_usd=daily_stats.total_cost_usd, monthly_tokens_used=monthly_stats.total_tokens, monthly_tokens_budget=monthly_budget, monthly_cost_usd=monthly_stats.total_cost_usd, is_over_budget=is_over_budget, alert_triggered=alert_triggered, recommendation=recommendation, ) def should_fallback_to_local(self, provider: str) -> tuple[bool, str]: """ 檢查是否應該 fallback 到本地模型 Returns: (should_fallback, reason) """ if provider.lower() == "ollama": return False, "Already using local model" status = self.get_budget_status(provider) if status.is_over_budget: return True, f"Budget exceeded for {provider}: {status.recommendation}" if status.alert_triggered: # 70% 以上時,可選擇 fallback return False, f"Near budget threshold for {provider}: {status.recommendation}" return False, "Budget OK" def get_all_stats_summary(self) -> dict: """取得所有 Provider 統計摘要""" self._reset_if_needed() summary = { "timestamp": datetime.now(UTC).isoformat(), "providers": {}, "total": { "input_tokens": 0, "output_tokens": 0, "cost_usd": 0.0, "requests": 0, }, } for provider in ["ollama", "gemini", "claude"]: daily = self.get_daily_stats(provider) monthly = self.get_monthly_stats(provider) budget = self.get_budget_status(provider) summary["providers"][provider] = { "daily": { "input_tokens": daily.total_input_tokens, "output_tokens": daily.total_output_tokens, "total_tokens": daily.total_tokens, "cost_usd": round(daily.total_cost_usd, 4), "requests": daily.total_requests, "success_rate": round(daily.success_rate, 1), "avg_latency_ms": round(daily.avg_latency_ms, 1), }, "monthly": { "input_tokens": monthly.total_input_tokens, "output_tokens": monthly.total_output_tokens, "total_tokens": monthly.total_tokens, "cost_usd": round(monthly.total_cost_usd, 4), "requests": monthly.total_requests, }, "budget": { "daily_budget": budget.daily_tokens_budget, "daily_usage_percent": round( (budget.daily_tokens_used / budget.daily_tokens_budget * 100) if budget.daily_tokens_budget > 0 else 0, 1, ), "monthly_budget": budget.monthly_tokens_budget, "monthly_usage_percent": round( (budget.monthly_tokens_used / budget.monthly_tokens_budget * 100) if budget.monthly_tokens_budget > 0 else 0, 1, ), "is_over_budget": budget.is_over_budget, "alert_triggered": budget.alert_triggered, }, } # 累計總計 summary["total"]["input_tokens"] += daily.total_input_tokens summary["total"]["output_tokens"] += daily.total_output_tokens summary["total"]["cost_usd"] += daily.total_cost_usd summary["total"]["requests"] += daily.total_requests summary["total"]["cost_usd"] = round(summary["total"]["cost_usd"], 4) return summary # ============================================================================= # Helper: Usage Tracker Context Manager # ============================================================================= class UsageTracker: """ Token 使用追蹤器 - Context Manager 自動計時並記錄 Token 使用 Usage: async with UsageTracker("ollama", "qwen2.5:7b-instruct") as tracker: result = await call_llm(prompt) tracker.set_tokens(input_tokens=500, output_tokens=200) """ def __init__(self, provider: str, model: str): self.provider = provider self.model = model self.start_time: float = 0 self.input_tokens: int = 0 self.output_tokens: int = 0 self.success: bool = True self.error_message: str | None = None self._counter = get_token_counter() def __enter__(self): self.start_time = time.perf_counter() return self def __exit__(self, exc_type, exc_val, exc_tb): latency_ms = (time.perf_counter() - self.start_time) * 1000 if exc_type is not None: self.success = False self.error_message = str(exc_val) usage = TokenUsage( input_tokens=self.input_tokens, output_tokens=self.output_tokens, provider=self.provider, model=self.model, latency_ms=latency_ms, success=self.success, error_message=self.error_message, ) self._counter.record_usage(usage) async def __aenter__(self): return self.__enter__() async def __aexit__(self, exc_type, exc_val, exc_tb): return self.__exit__(exc_type, exc_val, exc_tb) def set_tokens(self, input_tokens: int, output_tokens: int) -> None: """設定 Token 數量""" self.input_tokens = input_tokens self.output_tokens = output_tokens def mark_failed(self, error_message: str) -> None: """標記失敗""" self.success = False self.error_message = error_message # ============================================================================= # Singleton # ============================================================================= _token_counter: TokenCounter | None = None def get_token_counter() -> TokenCounter: """取得 TokenCounter 單例""" global _token_counter if _token_counter is None: _token_counter = TokenCounter() return _token_counter def reset_token_counter() -> None: """重置單例 (用於測試)""" global _token_counter _token_counter = None # ============================================================================= # Convenience Functions # ============================================================================= def record_token_usage( provider: str, model: str, input_tokens: int, output_tokens: int, latency_ms: float = 0.0, success: bool = True, error_message: str | None = None, ) -> None: """便捷函數: 記錄 Token 使用""" usage = TokenUsage( input_tokens=input_tokens, output_tokens=output_tokens, provider=provider, model=model, latency_ms=latency_ms, success=success, error_message=error_message, ) get_token_counter().record_usage(usage) def should_use_local_model(provider: str) -> tuple[bool, str]: """便捷函數: 檢查是否應該使用本地模型""" return get_token_counter().should_fallback_to_local(provider)