fix(api): correct OTEL gRPC endpoint format and SigNoz query table

Root cause analysis:
1. The OTEL gRPC endpoint had an http:// prefix, which is invalid for gRPC
2. The SigNoz query was targeting the wrong table (signoz_metrics.distributed_samples_v4)
3. Trace data lives in signoz_traces.distributed_signoz_index_v2, which is the table the query should target

Fixes:
- Remove http:// prefix from OTEL_EXPORTER_OTLP_ENDPOINT (gRPC needs host:port)
- Update SignOz client to query traces table instead of metrics table
- Fix timestamp format: build query bounds in nanoseconds instead of milliseconds (for DateTime64(9))
- Count errors with statusCode = 2 (statusCode: 0=Unset, 1=Ok, 2=Error)

This should allow OTEL traces to reach SigNoz, and allow GlobalPulse to show real metrics.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-03-23 00:41:51 +08:00
parent fea6524f35
commit b00f318450
3 changed files with 31 additions and 31 deletions

View File

@@ -130,8 +130,8 @@ class Settings(BaseSettings):
description="Enable OpenTelemetry tracing (disable in MOCK_MODE)",
)
OTEL_EXPORTER_OTLP_ENDPOINT: str = Field(
default="http://192.168.0.188:24317",
description="SigNoz OTLP gRPC endpoint (Host port 24317 -> Container 4317)",
default="192.168.0.188:24317",
description="SigNoz OTLP gRPC endpoint (Host port 24317 -> Container 4317) - NO http:// prefix for gRPC",
)
OTEL_SERVICE_NAME: str = Field(
default="awoooi-api",

View File

@@ -249,22 +249,23 @@ class SignOzClient:
time_range_end=end_time,
)
# 計算 Unix 毫秒時間戳
start_ms = int(start_time.timestamp() * 1000)
end_ms = int(end_time.timestamp() * 1000)
# =====================================================================
# Query 1: RPS & Error Rate (從 traces 表直接計算)
# =====================================================================
# 使用 signoz_traces.distributed_signoz_index_v2 表
# statusCode: 0=Unset, 1=Ok, 2=Error
# 計算 Unix 納秒時間戳 (ClickHouse DateTime64(9) 格式)
start_ns = int(start_time.timestamp() * 1_000_000_000)
end_ns = int(end_time.timestamp() * 1_000_000_000)
# =====================================================================
# Query 1: RPS & Error Rate (signoz_calls_total)
# =====================================================================
rps_query = f"""
SELECT
count() as total_requests,
countIf(JSONExtractString(labels, 'status_code') >= '400') as error_count
FROM signoz_metrics.distributed_samples_v4
countIf(statusCode = 2) as error_count
FROM signoz_traces.distributed_signoz_index_v2
WHERE
metric_name = 'signoz_calls_total'
AND unix_milli BETWEEN {start_ms} AND {end_ms}
AND JSONExtractString(labels, 'service_name') LIKE '%{service_name}%'
timestamp BETWEEN toDateTime64({start_ns}, 9) AND toDateTime64({end_ns}, 9)
AND serviceName LIKE '%{service_name}%'
"""
rps_results = await self._query_clickhouse(rps_query)
@@ -280,41 +281,39 @@ class SignOzClient:
metrics.rps = total / (time_window_minutes * 60)
# =====================================================================
# Query 2: Latency Percentiles (signoz_latency)
# Query 2: Latency Percentiles (從 traces 表的 durationNano)
# =====================================================================
latency_query = f"""
SELECT
quantile(0.50)(value) as p50,
quantile(0.95)(value) as p95,
quantile(0.99)(value) as p99
FROM signoz_metrics.distributed_samples_v4
quantile(0.50)(durationNano / 1000000.0) as p50,
quantile(0.95)(durationNano / 1000000.0) as p95,
quantile(0.99)(durationNano / 1000000.0) as p99
FROM signoz_traces.distributed_signoz_index_v2
WHERE
metric_name IN ('signoz_latency_count', 'signoz_db_latency_sum')
AND unix_milli BETWEEN {start_ms} AND {end_ms}
AND JSONExtractString(labels, 'service_name') LIKE '%{service_name}%'
timestamp BETWEEN toDateTime64({start_ns}, 9) AND toDateTime64({end_ns}, 9)
AND serviceName LIKE '%{service_name}%'
"""
latency_results = await self._query_clickhouse(latency_query)
if latency_results:
row = latency_results[0]
metrics.p50_latency_ms = float(row.get("p50", 0))
metrics.p95_latency_ms = float(row.get("p95", 0))
metrics.p99_latency_ms = float(row.get("p99", 0))
metrics.p50_latency_ms = float(row.get("p50", 0) or 0)
metrics.p95_latency_ms = float(row.get("p95", 0) or 0)
metrics.p99_latency_ms = float(row.get("p99", 0) or 0)
# =====================================================================
# Query 3: Trend Analysis (對比前一時間窗)
# =====================================================================
prev_start_ms = int((start_time - timedelta(minutes=time_window_minutes)).timestamp() * 1000)
prev_end_ms = start_ms
prev_start_ns = int((start_time - timedelta(minutes=time_window_minutes)).timestamp() * 1_000_000_000)
prev_end_ns = start_ns
trend_query = f"""
SELECT count() as prev_requests
FROM signoz_metrics.distributed_samples_v4
FROM signoz_traces.distributed_signoz_index_v2
WHERE
metric_name = 'signoz_calls_total'
AND unix_milli BETWEEN {prev_start_ms} AND {prev_end_ms}
AND JSONExtractString(labels, 'service_name') LIKE '%{service_name}%'
timestamp BETWEEN toDateTime64({prev_start_ns}, 9) AND toDateTime64({prev_end_ns}, 9)
AND serviceName LIKE '%{service_name}%'
"""
trend_results = await self._query_clickhouse(trend_query)

View File

@@ -20,8 +20,9 @@ data:
SIGNOZ_URL: "http://192.168.0.188:3301"
# OTEL 可觀測性 (P0 核心神經)
# 注意: gRPC endpoint 不需要 http:// 前綴
OTEL_ENABLED: "true"
OTEL_EXPORTER_OTLP_ENDPOINT: "http://192.168.0.188:24317"
OTEL_EXPORTER_OTLP_ENDPOINT: "192.168.0.188:24317"
OTEL_SERVICE_NAME: "awoooi-api"
# 應用配置