From b00f3184500a4834cee051fcb9a8ac63e04a7951 Mon Sep 17 00:00:00 2001 From: OG T Date: Mon, 23 Mar 2026 00:41:51 +0800 Subject: [PATCH] fix(api): correct OTEL gRPC endpoint format and SigNoz query table Root cause analysis: 1. The OTEL gRPC endpoint had an http:// prefix, which is invalid for gRPC 2. The SigNoz query was targeting the wrong table (signoz_metrics.distributed_samples_v4) 3. It should query signoz_traces.distributed_signoz_index_v2 for trace data Fixes: - Remove the http:// prefix from OTEL_EXPORTER_OTLP_ENDPOINT (gRPC needs host:port) - Update the SigNoz client to query the traces table instead of the metrics table - Fix timestamp format (nanoseconds for DateTime64(9)) - Count errors via statusCode (0=Unset, 1=Ok, 2=Error) This should enable OTEL traces to reach SigNoz and GlobalPulse to show real metrics. Co-Authored-By: Claude Opus 4.5 --- apps/api/src/core/config.py | 4 +- apps/api/src/services/signoz_client.py | 55 +++++++++++++------------- k8s/awoooi-prod/04-configmap.yaml | 3 +- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/apps/api/src/core/config.py b/apps/api/src/core/config.py index 602e1d7a..d3ec6c78 100644 --- a/apps/api/src/core/config.py +++ b/apps/api/src/core/config.py @@ -130,8 +130,8 @@ class Settings(BaseSettings): description="Enable OpenTelemetry tracing (disable in MOCK_MODE)", ) OTEL_EXPORTER_OTLP_ENDPOINT: str = Field( - default="http://192.168.0.188:24317", - description="SigNoz OTLP gRPC endpoint (Host port 24317 -> Container 4317)", + default="192.168.0.188:24317", + description="SigNoz OTLP gRPC endpoint (Host port 24317 -> Container 4317) - NO http:// prefix for gRPC", ) OTEL_SERVICE_NAME: str = Field( default="awoooi-api", diff --git a/apps/api/src/services/signoz_client.py b/apps/api/src/services/signoz_client.py index 21564b8c..c8f08a83 100644 --- a/apps/api/src/services/signoz_client.py +++ b/apps/api/src/services/signoz_client.py @@ -249,22 +249,23 @@ class SignOzClient: time_range_end=end_time, ) - # 計算 Unix 毫秒時間戳 - start_ms = 
int(start_time.timestamp() * 1000) - end_ms = int(end_time.timestamp() * 1000) + # ===================================================================== + # Query 1: RPS & Error Rate (從 traces 表直接計算) + # ===================================================================== + # 使用 signoz_traces.distributed_signoz_index_v2 表 + # statusCode: 0=Unset, 1=Ok, 2=Error + # 計算 Unix 納秒時間戳 (ClickHouse DateTime64(9) 格式) + start_ns = int(start_time.timestamp() * 1_000_000_000) + end_ns = int(end_time.timestamp() * 1_000_000_000) - # ===================================================================== - # Query 1: RPS & Error Rate (signoz_calls_total) - # ===================================================================== rps_query = f""" SELECT count() as total_requests, - countIf(JSONExtractString(labels, 'status_code') >= '400') as error_count - FROM signoz_metrics.distributed_samples_v4 + countIf(statusCode = 2) as error_count + FROM signoz_traces.distributed_signoz_index_v2 WHERE - metric_name = 'signoz_calls_total' - AND unix_milli BETWEEN {start_ms} AND {end_ms} - AND JSONExtractString(labels, 'service_name') LIKE '%{service_name}%' + timestamp BETWEEN toDateTime64({start_ns}, 9) AND toDateTime64({end_ns}, 9) + AND serviceName LIKE '%{service_name}%' """ rps_results = await self._query_clickhouse(rps_query) @@ -280,41 +281,39 @@ class SignOzClient: metrics.rps = total / (time_window_minutes * 60) # ===================================================================== - # Query 2: Latency Percentiles (signoz_latency) + # Query 2: Latency Percentiles (從 traces 表的 durationNano) # ===================================================================== latency_query = f""" SELECT - quantile(0.50)(value) as p50, - quantile(0.95)(value) as p95, - quantile(0.99)(value) as p99 - FROM signoz_metrics.distributed_samples_v4 + quantile(0.50)(durationNano / 1000000.0) as p50, + quantile(0.95)(durationNano / 1000000.0) as p95, + quantile(0.99)(durationNano / 1000000.0) as p99 + FROM 
signoz_traces.distributed_signoz_index_v2 WHERE - metric_name IN ('signoz_latency_count', 'signoz_db_latency_sum') - AND unix_milli BETWEEN {start_ms} AND {end_ms} - AND JSONExtractString(labels, 'service_name') LIKE '%{service_name}%' + timestamp BETWEEN toDateTime64({start_ns}, 9) AND toDateTime64({end_ns}, 9) + AND serviceName LIKE '%{service_name}%' """ latency_results = await self._query_clickhouse(latency_query) if latency_results: row = latency_results[0] - metrics.p50_latency_ms = float(row.get("p50", 0)) - metrics.p95_latency_ms = float(row.get("p95", 0)) - metrics.p99_latency_ms = float(row.get("p99", 0)) + metrics.p50_latency_ms = float(row.get("p50", 0) or 0) + metrics.p95_latency_ms = float(row.get("p95", 0) or 0) + metrics.p99_latency_ms = float(row.get("p99", 0) or 0) # ===================================================================== # Query 3: Trend Analysis (對比前一時間窗) # ===================================================================== - prev_start_ms = int((start_time - timedelta(minutes=time_window_minutes)).timestamp() * 1000) - prev_end_ms = start_ms + prev_start_ns = int((start_time - timedelta(minutes=time_window_minutes)).timestamp() * 1_000_000_000) + prev_end_ns = start_ns trend_query = f""" SELECT count() as prev_requests - FROM signoz_metrics.distributed_samples_v4 + FROM signoz_traces.distributed_signoz_index_v2 WHERE - metric_name = 'signoz_calls_total' - AND unix_milli BETWEEN {prev_start_ms} AND {prev_end_ms} - AND JSONExtractString(labels, 'service_name') LIKE '%{service_name}%' + timestamp BETWEEN toDateTime64({prev_start_ns}, 9) AND toDateTime64({prev_end_ns}, 9) + AND serviceName LIKE '%{service_name}%' """ trend_results = await self._query_clickhouse(trend_query) diff --git a/k8s/awoooi-prod/04-configmap.yaml b/k8s/awoooi-prod/04-configmap.yaml index 06b63b62..c8b3935c 100644 --- a/k8s/awoooi-prod/04-configmap.yaml +++ b/k8s/awoooi-prod/04-configmap.yaml @@ -20,8 +20,9 @@ data: SIGNOZ_URL: "http://192.168.0.188:3301" # 
OTEL 可觀測性 (P0 核心神經) + # 注意: gRPC endpoint 不需要 http:// 前綴 OTEL_ENABLED: "true" - OTEL_EXPORTER_OTLP_ENDPOINT: "http://192.168.0.188:24317" + OTEL_EXPORTER_OTLP_ENDPOINT: "192.168.0.188:24317" OTEL_SERVICE_NAME: "awoooi-api" # 應用配置