fix(api): correct OTEL gRPC endpoint format and SigNoz query table
Root cause analysis:
1. The OTEL gRPC endpoint had an http:// prefix, which is invalid for gRPC.
2. The SigNoz query was targeting the wrong table (signoz_metrics.distributed_samples_v4).
3. Trace data should be queried from signoz_traces.distributed_signoz_index_v2.

Fixes:
- Remove the http:// prefix from OTEL_EXPORTER_OTLP_ENDPOINT (gRPC needs host:port).
- Update the SigNoz client to query the traces table instead of the metrics table.
- Fix the timestamp format (nanoseconds for DateTime64(9)).
- Count errors by span status code (statusCode: 0=Unset, 1=Ok, 2=Error).

This should enable OTEL traces to reach SigNoz and GlobalPulse to show real metrics.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -130,8 +130,8 @@ class Settings(BaseSettings):
|
||||
description="Enable OpenTelemetry tracing (disable in MOCK_MODE)",
|
||||
)
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT: str = Field(
|
||||
default="http://192.168.0.188:24317",
|
||||
description="SigNoz OTLP gRPC endpoint (Host port 24317 -> Container 4317)",
|
||||
default="192.168.0.188:24317",
|
||||
description="SigNoz OTLP gRPC endpoint (Host port 24317 -> Container 4317) - NO http:// prefix for gRPC",
|
||||
)
|
||||
OTEL_SERVICE_NAME: str = Field(
|
||||
default="awoooi-api",
|
||||
|
||||
@@ -249,22 +249,23 @@ class SignOzClient:
|
||||
time_range_end=end_time,
|
||||
)
|
||||
|
||||
# 計算 Unix 毫秒時間戳
|
||||
start_ms = int(start_time.timestamp() * 1000)
|
||||
end_ms = int(end_time.timestamp() * 1000)
|
||||
# =====================================================================
|
||||
# Query 1: RPS & Error Rate (從 traces 表直接計算)
|
||||
# =====================================================================
|
||||
# 使用 signoz_traces.distributed_signoz_index_v2 表
|
||||
# statusCode: 0=Unset, 1=Ok, 2=Error
|
||||
# 計算 Unix 納秒時間戳 (ClickHouse DateTime64(9) 格式)
|
||||
start_ns = int(start_time.timestamp() * 1_000_000_000)
|
||||
end_ns = int(end_time.timestamp() * 1_000_000_000)
|
||||
|
||||
# =====================================================================
|
||||
# Query 1: RPS & Error Rate (signoz_calls_total)
|
||||
# =====================================================================
|
||||
rps_query = f"""
|
||||
SELECT
|
||||
count() as total_requests,
|
||||
countIf(JSONExtractString(labels, 'status_code') >= '400') as error_count
|
||||
FROM signoz_metrics.distributed_samples_v4
|
||||
countIf(statusCode = 2) as error_count
|
||||
FROM signoz_traces.distributed_signoz_index_v2
|
||||
WHERE
|
||||
metric_name = 'signoz_calls_total'
|
||||
AND unix_milli BETWEEN {start_ms} AND {end_ms}
|
||||
AND JSONExtractString(labels, 'service_name') LIKE '%{service_name}%'
|
||||
timestamp BETWEEN toDateTime64({start_ns}, 9) AND toDateTime64({end_ns}, 9)
|
||||
AND serviceName LIKE '%{service_name}%'
|
||||
"""
|
||||
|
||||
rps_results = await self._query_clickhouse(rps_query)
|
||||
@@ -280,41 +281,39 @@ class SignOzClient:
|
||||
metrics.rps = total / (time_window_minutes * 60)
|
||||
|
||||
# =====================================================================
|
||||
# Query 2: Latency Percentiles (signoz_latency)
|
||||
# Query 2: Latency Percentiles (從 traces 表的 durationNano)
|
||||
# =====================================================================
|
||||
latency_query = f"""
|
||||
SELECT
|
||||
quantile(0.50)(value) as p50,
|
||||
quantile(0.95)(value) as p95,
|
||||
quantile(0.99)(value) as p99
|
||||
FROM signoz_metrics.distributed_samples_v4
|
||||
quantile(0.50)(durationNano / 1000000.0) as p50,
|
||||
quantile(0.95)(durationNano / 1000000.0) as p95,
|
||||
quantile(0.99)(durationNano / 1000000.0) as p99
|
||||
FROM signoz_traces.distributed_signoz_index_v2
|
||||
WHERE
|
||||
metric_name IN ('signoz_latency_count', 'signoz_db_latency_sum')
|
||||
AND unix_milli BETWEEN {start_ms} AND {end_ms}
|
||||
AND JSONExtractString(labels, 'service_name') LIKE '%{service_name}%'
|
||||
timestamp BETWEEN toDateTime64({start_ns}, 9) AND toDateTime64({end_ns}, 9)
|
||||
AND serviceName LIKE '%{service_name}%'
|
||||
"""
|
||||
|
||||
latency_results = await self._query_clickhouse(latency_query)
|
||||
|
||||
if latency_results:
|
||||
row = latency_results[0]
|
||||
metrics.p50_latency_ms = float(row.get("p50", 0))
|
||||
metrics.p95_latency_ms = float(row.get("p95", 0))
|
||||
metrics.p99_latency_ms = float(row.get("p99", 0))
|
||||
metrics.p50_latency_ms = float(row.get("p50", 0) or 0)
|
||||
metrics.p95_latency_ms = float(row.get("p95", 0) or 0)
|
||||
metrics.p99_latency_ms = float(row.get("p99", 0) or 0)
|
||||
|
||||
# =====================================================================
|
||||
# Query 3: Trend Analysis (對比前一時間窗)
|
||||
# =====================================================================
|
||||
prev_start_ms = int((start_time - timedelta(minutes=time_window_minutes)).timestamp() * 1000)
|
||||
prev_end_ms = start_ms
|
||||
prev_start_ns = int((start_time - timedelta(minutes=time_window_minutes)).timestamp() * 1_000_000_000)
|
||||
prev_end_ns = start_ns
|
||||
|
||||
trend_query = f"""
|
||||
SELECT count() as prev_requests
|
||||
FROM signoz_metrics.distributed_samples_v4
|
||||
FROM signoz_traces.distributed_signoz_index_v2
|
||||
WHERE
|
||||
metric_name = 'signoz_calls_total'
|
||||
AND unix_milli BETWEEN {prev_start_ms} AND {prev_end_ms}
|
||||
AND JSONExtractString(labels, 'service_name') LIKE '%{service_name}%'
|
||||
timestamp BETWEEN toDateTime64({prev_start_ns}, 9) AND toDateTime64({prev_end_ns}, 9)
|
||||
AND serviceName LIKE '%{service_name}%'
|
||||
"""
|
||||
|
||||
trend_results = await self._query_clickhouse(trend_query)
|
||||
|
||||
@@ -20,8 +20,9 @@ data:
|
||||
SIGNOZ_URL: "http://192.168.0.188:3301"
|
||||
|
||||
# OTEL 可觀測性 (P0 核心神經)
|
||||
# 注意: gRPC endpoint 不需要 http:// 前綴
|
||||
OTEL_ENABLED: "true"
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT: "http://192.168.0.188:24317"
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT: "192.168.0.188:24317"
|
||||
OTEL_SERVICE_NAME: "awoooi-api"
|
||||
|
||||
# 應用配置
|
||||
|
||||
Reference in New Issue
Block a user