Files
awoooi/apps/api/src/services/signoz_client.py
OG T 579da38b8b feat(api): Phase 13 智能路由 + CI/CD 整合 (#74-88)
Phase 13.1 CI/CD Integration:
- #76 workflow_run handler for CI failure diagnosis
- #77 SignOz log query (query_logs, error_logs_summary MCP)
- #78 CIAutoRepairService with risk-based execution decisions

Phase 13.3 Smart Routing:
- #85 Intent Classifier v2.0 (rule engine + LLM fallback)
- #86 Complexity Scorer (9-dimension scoring)
- #87 AI Router v3.0 (routing decision matrix)
- #88 Token Counter (OTEL + Langfuse integration)

New files:
- services/ci_auto_repair.py (risk stratification)
- services/model_registry.py (centralized model config)
- services/token_counter.py (677 lines)
- Skill 08: Model Router Expert
- Skill 09: Strangler Pattern Expert
- ADR-023: Smart Routing Architecture
- ADR-024: API Layer Architecture

Tests:
- phase11-conversational.spec.ts (E2E tests)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-26 15:32:52 +08:00

577 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
SignOz Client - 全能視力中心 (戰略校正版)
==========================================
統帥鐵律: 嚴禁 Prometheus 碎片化SignOz 為唯一真相來源
Features:
- ClickHouse 直查 (繞過需認證的 SignOz API)
- Gold Metrics 擷取 (P99 Latency, Error Rate, RPS)
- 動態時間範圍 Trace URL 生成
- 趨勢圖表數據提取 (供 AI 分析)
架構:
- SignOz Query Service: 192.168.0.188:3301 (需認證)
- ClickHouse HTTP API: 192.168.0.188:8123 (直查)
"""
import json
import time
from dataclasses import dataclass, field
from datetime import UTC, datetime, timedelta
import structlog
from src.core.config import settings
from src.core.http_client import get_clickhouse_client
logger = structlog.get_logger(__name__)
# =============================================================================
# SignOz Data Models
# =============================================================================
@dataclass
class GoldMetrics:
"""
Gold Metrics - RED Methodology (Rate, Errors, Duration)
SRE 黃金指標:
- RPS (Requests Per Second): 流量
- Error Rate: 錯誤率 (%)
- P99 Latency: 99th percentile 延遲 (ms)
"""
service_name: str
namespace: str
time_range_start: datetime
time_range_end: datetime
# Rate
rps: float = 0.0
rps_trend: str = "stable" # up, down, stable
# Errors
error_rate: float = 0.0 # percentage
error_count: int = 0
total_requests: int = 0
# Duration
p50_latency_ms: float = 0.0
p95_latency_ms: float = 0.0
p99_latency_ms: float = 0.0
latency_trend: str = "stable"
# Raw data for AI analysis
raw_metrics: dict = field(default_factory=dict)
def to_summary(self) -> str:
"""生成 AI 分析摘要"""
trend_emoji = {"up": "📈", "down": "📉", "stable": "➡️"}
error_emoji = "🟢" if self.error_rate < 1 else ("🟡" if self.error_rate < 5 else "🔴")
return (
f"📊 Gold Metrics ({self.service_name})\n"
f"• RPS: {self.rps:.1f} {trend_emoji.get(self.rps_trend, '➡️')}\n"
f"• Error Rate: {error_emoji} {self.error_rate:.2f}%\n"
f"• P99 Latency: {self.p99_latency_ms:.0f}ms {trend_emoji.get(self.latency_trend, '➡️')}"
)
def to_telegram_block(self) -> str:
"""生成 Telegram 卡片區塊 (HTML)"""
trend_emoji = {"up": "📈", "down": "📉", "stable": "➡️"}
error_emoji = "🟢" if self.error_rate < 1 else ("🟡" if self.error_rate < 5 else "🔴")
return (
f"📊 <b>SignOz 指標</b>\n"
f"├ RPS: <code>{self.rps:.1f}</code> {trend_emoji.get(self.rps_trend, '➡️')}\n"
f"├ Error: {error_emoji} <code>{self.error_rate:.2f}%</code>\n"
f"└ P99: <code>{self.p99_latency_ms:.0f}ms</code> {trend_emoji.get(self.latency_trend, '➡️')}"
)
@dataclass
class SignOzTraceLink:
"""動態 SignOz Trace 連結"""
base_url: str
service_name: str
start_time: datetime
end_time: datetime
namespace: str = "default"
def generate_url(self) -> str:
"""
生成帶時間參數的 Trace URL
格式: http://host:port/traces?service=xxx&start=timestamp&end=timestamp
"""
start_ns = int(self.start_time.timestamp() * 1_000_000_000)
end_ns = int(self.end_time.timestamp() * 1_000_000_000)
return (
f"{self.base_url}/traces?"
f"service={self.service_name}&"
f"start={start_ns}&"
f"end={end_ns}"
)
# =============================================================================
# SignOz Client
# =============================================================================
class SignOzClient:
"""
SignOz Client - 直查 ClickHouse (永久架構版)
統帥鐵律: 禁止 subprocess+curl使用 Lifespan 管理的 httpx.AsyncClient
使用 ClickHouse HTTP API 繞過需認證的 SignOz Query Service
"""
def __init__(self):
self.signoz_url = settings.SIGNOZ_URL # http://192.168.0.188:3301
self.clickhouse_url = settings.CLICKHOUSE_URL # http://192.168.0.188:8123
async def close(self) -> None:
"""關閉連線 (由 Lifespan 統一管理,此處為相容性保留)"""
pass # HTTP Client 由 src.core.http_client 管理
# =========================================================================
# ClickHouse Direct Queries (永久架構)
# =========================================================================
async def _query_clickhouse(self, query: str) -> list[dict]:
"""
執行 ClickHouse 查詢 (原生 httpx非 curl)
統帥鐵律:
- 使用 Lifespan 管理的 httpx.AsyncClient
- trust_env=False 防止 HTTP_PROXY 干擾
- < 50ms 延遲目標
ClickHouse HTTP API: POST body = SQL, 加 FORMAT JSONEachRow 到查詢末尾
"""
# 加入 FORMAT JSONEachRow 到查詢末尾
formatted_query = query.strip().rstrip(";") + " FORMAT JSONEachRow"
start_time = time.perf_counter()
try:
# 取得 Lifespan 管理的 Client
client = await get_clickhouse_client()
logger.debug(
"clickhouse_query_start",
base_url=self.clickhouse_url,
query_preview=formatted_query[:80],
)
# 原生 httpx POST 請求
response = await client.post(
"/", # base_url 已設定,只需 path
content=formatted_query,
)
elapsed_ms = (time.perf_counter() - start_time) * 1000
# 檢查 HTTP 狀態
if response.status_code != 200:
logger.warning(
"clickhouse_query_http_error",
status_code=response.status_code,
response_text=response.text[:200],
elapsed_ms=round(elapsed_ms, 2),
)
return []
# 解析 JSONEachRow 格式 (每行一個 JSON 物件)
results = []
for line in response.text.strip().split("\n"):
if line:
try:
results.append(json.loads(line))
except json.JSONDecodeError:
continue
logger.info(
"clickhouse_query_success",
result_count=len(results),
elapsed_ms=round(elapsed_ms, 2),
method="httpx_native", # 🎯 統帥要求: 原生 httpx非 curl
)
return results
except Exception as e:
elapsed_ms = (time.perf_counter() - start_time) * 1000
logger.warning(
"clickhouse_query_failed",
error=str(e),
error_type=type(e).__name__,
query=query[:100],
elapsed_ms=round(elapsed_ms, 2),
)
return []
# =========================================================================
# Gold Metrics Extraction
# =========================================================================
async def get_gold_metrics(
self,
service_name: str,
namespace: str = "default",
time_window_minutes: int = 10,
) -> GoldMetrics:
"""
從 SignOz/ClickHouse 擷取 Gold Metrics
查詢過去 N 分鐘的:
- signoz_calls_total: RPS + Error Count
- signoz_latency.bucket: P50/P95/P99 延遲
Args:
service_name: 服務名稱 (如 api-gateway, harbor-core)
namespace: K8s namespace
time_window_minutes: 時間窗口 (分鐘)
Returns:
GoldMetrics: 黃金指標數據
"""
now = datetime.now(UTC)
start_time = now - timedelta(minutes=time_window_minutes)
end_time = now
# 初始化 metrics
metrics = GoldMetrics(
service_name=service_name,
namespace=namespace,
time_range_start=start_time,
time_range_end=end_time,
)
# =====================================================================
# Query 1: RPS & Error Rate (從 traces 表直接計算)
# =====================================================================
# 使用 signoz_traces.distributed_signoz_index_v3 表
# statusCode: 0=Unset, 1=Ok, 2=Error
# 使用 INTERVAL 語法避免 Decimal overflow
rps_query = f"""
SELECT
count() as total_requests,
countIf(statusCode = 2) as error_count
FROM signoz_traces.distributed_signoz_index_v3
WHERE
timestamp > now() - INTERVAL {time_window_minutes} MINUTE
AND serviceName LIKE '%{service_name}%'
"""
rps_results = await self._query_clickhouse(rps_query)
if rps_results:
row = rps_results[0]
total = int(row.get("total_requests", 0))
errors = int(row.get("error_count", 0))
metrics.total_requests = total
metrics.error_count = errors
metrics.error_rate = (errors / total * 100) if total > 0 else 0.0
metrics.rps = total / (time_window_minutes * 60)
# =====================================================================
# Query 2: Latency Percentiles (從 traces 表的 durationNano)
# =====================================================================
latency_query = f"""
SELECT
quantile(0.50)(durationNano / 1000000.0) as p50,
quantile(0.95)(durationNano / 1000000.0) as p95,
quantile(0.99)(durationNano / 1000000.0) as p99
FROM signoz_traces.distributed_signoz_index_v3
WHERE
timestamp > now() - INTERVAL {time_window_minutes} MINUTE
AND serviceName LIKE '%{service_name}%'
"""
latency_results = await self._query_clickhouse(latency_query)
if latency_results:
row = latency_results[0]
metrics.p50_latency_ms = float(row.get("p50", 0) or 0)
metrics.p95_latency_ms = float(row.get("p95", 0) or 0)
metrics.p99_latency_ms = float(row.get("p99", 0) or 0)
# =====================================================================
# Query 3: Trend Analysis (對比前一時間窗)
# =====================================================================
trend_query = f"""
SELECT count() as prev_requests
FROM signoz_traces.distributed_signoz_index_v3
WHERE
timestamp BETWEEN now() - INTERVAL {time_window_minutes * 2} MINUTE AND now() - INTERVAL {time_window_minutes} MINUTE
AND serviceName LIKE '%{service_name}%'
"""
trend_results = await self._query_clickhouse(trend_query)
if trend_results:
prev_total = int(trend_results[0].get("prev_requests", 0))
if prev_total > 0:
change_pct = (metrics.total_requests - prev_total) / prev_total * 100
if change_pct > 10:
metrics.rps_trend = "up"
elif change_pct < -10:
metrics.rps_trend = "down"
else:
metrics.rps_trend = "stable"
logger.info(
"signoz_gold_metrics_fetched",
service=service_name,
rps=metrics.rps,
error_rate=metrics.error_rate,
p99_latency=metrics.p99_latency_ms,
)
return metrics
# =========================================================================
# Trace URL Generation
# =========================================================================
def generate_trace_url(
self,
service_name: str,
alert_timestamp: datetime | None = None,
window_minutes: int = 5,
) -> str:
"""
生成動態時間範圍的 SignOz Trace URL
告警發生時間 ± window_minutes
Args:
service_name: 服務名稱
alert_timestamp: 告警發生時間 (預設為現在)
window_minutes: 前後時間窗口 (分鐘)
Returns:
str: SignOz Trace URL with timestamps
"""
if alert_timestamp is None:
alert_timestamp = datetime.now(UTC)
link = SignOzTraceLink(
base_url=self.signoz_url,
service_name=service_name,
start_time=alert_timestamp - timedelta(minutes=window_minutes),
end_time=alert_timestamp + timedelta(minutes=window_minutes),
)
return link.generate_url()
# =========================================================================
# System Metrics (CPU, Memory, Disk)
# =========================================================================
async def get_system_metrics(
self,
_host: str = "192.168.0.188", # Reserved for future host filtering
time_window_minutes: int = 5,
) -> dict:
"""
擷取系統指標 (system.cpu.time, system.disk.io)
用於 High CPU / Disk Full 告警分析
"""
now = datetime.now(UTC)
start_ms = int((now - timedelta(minutes=time_window_minutes)).timestamp() * 1000)
end_ms = int(now.timestamp() * 1000)
cpu_query = f"""
SELECT
avg(value) as cpu_avg,
max(value) as cpu_max
FROM signoz_metrics.distributed_samples_v4
WHERE
metric_name = 'system.cpu.time'
AND unix_milli BETWEEN {start_ms} AND {end_ms}
"""
disk_query = f"""
SELECT
sum(value) as disk_io_bytes
FROM signoz_metrics.distributed_samples_v4
WHERE
metric_name = 'system.disk.io'
AND unix_milli BETWEEN {start_ms} AND {end_ms}
"""
cpu_results = await self._query_clickhouse(cpu_query)
disk_results = await self._query_clickhouse(disk_query)
return {
"cpu": cpu_results[0] if cpu_results else {},
"disk": disk_results[0] if disk_results else {},
"time_range": {
"start": start_ms,
"end": end_ms,
},
}
# =========================================================================
# Log Query (Phase 13.1 #77)
# =========================================================================
async def get_logs(
self,
service_name: str | None = None,
severity: str | None = None,
search_text: str | None = None,
time_window_minutes: int = 30,
limit: int = 100,
) -> list[dict]:
"""
從 SignOz/ClickHouse 查詢日誌 (Phase 13.1 #77)
SignOz 日誌儲存在 signoz_logs.distributed_logs 表
Schema: timestamp, severity_text, body, resources, attributes
Args:
service_name: 服務名稱 (過濾 resources.service.name)
severity: 日誌級別 (ERROR, WARN, INFO, DEBUG)
search_text: 日誌內容搜尋文字
time_window_minutes: 時間窗口 (分鐘)
limit: 返回筆數上限
Returns:
list[dict]: 日誌記錄列表
"""
now = datetime.now(UTC)
start_ns = int((now - timedelta(minutes=time_window_minutes)).timestamp() * 1_000_000_000)
end_ns = int(now.timestamp() * 1_000_000_000)
# 構建 WHERE 條件
conditions = [
f"timestamp >= {start_ns}",
f"timestamp <= {end_ns}",
]
if service_name:
# SignOz 儲存 service.name 在 resources 欄位
conditions.append(f"resources['service.name'] = '{service_name}'")
if severity:
# 支援多個級別 (如 'ERROR,WARN')
severities = [s.strip().upper() for s in severity.split(",")]
severity_list = ", ".join([f"'{s}'" for s in severities])
conditions.append(f"severity_text IN ({severity_list})")
if search_text:
# 日誌內容搜尋 (避免 SQL injection)
safe_text = search_text.replace("'", "''")
conditions.append(f"body LIKE '%{safe_text}%'")
where_clause = " AND ".join(conditions)
query = f"""
SELECT
timestamp,
severity_text,
body,
resources,
attributes,
trace_id,
span_id
FROM signoz_logs.distributed_logs
WHERE {where_clause}
ORDER BY timestamp DESC
LIMIT {limit}
"""
results = await self._query_clickhouse(query)
# 格式化結果
formatted_logs = []
for row in results:
formatted_logs.append({
"timestamp": row.get("timestamp"),
"severity": row.get("severity_text", "UNKNOWN"),
"message": row.get("body", ""),
"service": row.get("resources", {}).get("service.name", "unknown"),
"trace_id": row.get("trace_id", ""),
"span_id": row.get("span_id", ""),
"attributes": row.get("attributes", {}),
})
logger.info(
"signoz_logs_query_completed",
service_name=service_name,
severity=severity,
result_count=len(formatted_logs),
time_window_minutes=time_window_minutes,
)
return formatted_logs
async def get_error_logs_summary(
self,
service_name: str,
time_window_minutes: int = 60,
) -> dict:
"""
取得錯誤日誌摘要 (Phase 13.1 #77 - CI 診斷用)
統計各類錯誤的出現次數和代表性訊息
"""
now = datetime.now(UTC)
start_ns = int((now - timedelta(minutes=time_window_minutes)).timestamp() * 1_000_000_000)
end_ns = int(now.timestamp() * 1_000_000_000)
query = f"""
SELECT
severity_text,
count() as count,
any(body) as sample_message
FROM signoz_logs.distributed_logs
WHERE
timestamp >= {start_ns}
AND timestamp <= {end_ns}
AND resources['service.name'] = '{service_name}'
AND severity_text IN ('ERROR', 'FATAL', 'CRITICAL')
GROUP BY severity_text
ORDER BY count DESC
LIMIT 10
"""
results = await self._query_clickhouse(query)
return {
"service_name": service_name,
"time_window_minutes": time_window_minutes,
"error_summary": results,
"total_errors": sum(r.get("count", 0) for r in results),
}
# =============================================================================
# Singleton
# =============================================================================
_signoz_client: SignOzClient | None = None
def get_signoz_client() -> SignOzClient:
"""取得全域 SignOz Client 實例"""
global _signoz_client
if _signoz_client is None:
_signoz_client = SignOzClient()
return _signoz_client
async def close_signoz_client() -> None:
"""關閉 SignOz Client"""
global _signoz_client
if _signoz_client:
await _signoz_client.close()
_signoz_client = None