""" SignOz Client - 全能視力中心 (戰略校正版) ========================================== 統帥鐵律: 嚴禁 Prometheus 碎片化,SignOz 為唯一真相來源 Features: - ClickHouse 直查 (繞過需認證的 SignOz API) - Gold Metrics 擷取 (P99 Latency, Error Rate, RPS) - 動態時間範圍 Trace URL 生成 - 趨勢圖表數據提取 (供 AI 分析) 架構: - SignOz Query Service: 192.168.0.188:3301 (需認證) - ClickHouse HTTP API: 192.168.0.188:8123 (直查) """ import json import time from dataclasses import dataclass, field from datetime import UTC, datetime, timedelta import structlog from src.core.config import settings from src.core.http_client import get_clickhouse_client logger = structlog.get_logger(__name__) # ============================================================================= # SignOz Data Models # ============================================================================= @dataclass class GoldMetrics: """ Gold Metrics - RED Methodology (Rate, Errors, Duration) SRE 黃金指標: - RPS (Requests Per Second): 流量 - Error Rate: 錯誤率 (%) - P99 Latency: 99th percentile 延遲 (ms) """ service_name: str namespace: str time_range_start: datetime time_range_end: datetime # Rate rps: float = 0.0 rps_trend: str = "stable" # up, down, stable # Errors error_rate: float = 0.0 # percentage error_count: int = 0 total_requests: int = 0 # Duration p50_latency_ms: float = 0.0 p95_latency_ms: float = 0.0 p99_latency_ms: float = 0.0 latency_trend: str = "stable" # Raw data for AI analysis raw_metrics: dict = field(default_factory=dict) def to_summary(self) -> str: """生成 AI 分析摘要""" trend_emoji = {"up": "📈", "down": "📉", "stable": "➡️"} error_emoji = "🟢" if self.error_rate < 1 else ("🟡" if self.error_rate < 5 else "🔴") return ( f"📊 Gold Metrics ({self.service_name})\n" f"• RPS: {self.rps:.1f} {trend_emoji.get(self.rps_trend, '➡️')}\n" f"• Error Rate: {error_emoji} {self.error_rate:.2f}%\n" f"• P99 Latency: {self.p99_latency_ms:.0f}ms {trend_emoji.get(self.latency_trend, '➡️')}" ) def to_telegram_block(self) -> str: """生成 Telegram 卡片區塊 (HTML)""" trend_emoji = {"up": "📈", "down": "📉", "stable": "➡️"} error_emoji = "🟢" if self.error_rate < 1 else ("🟡" if self.error_rate < 5 else "🔴") return ( f"📊 SignOz 指標\n" f"├ RPS: {self.rps:.1f} {trend_emoji.get(self.rps_trend, '➡️')}\n" f"├ Error: {error_emoji} {self.error_rate:.2f}%\n" f"└ P99: {self.p99_latency_ms:.0f}ms {trend_emoji.get(self.latency_trend, '➡️')}" ) @dataclass class SignOzTraceLink: """動態 SignOz Trace 連結""" base_url: str service_name: str start_time: datetime end_time: datetime namespace: str = "default" def generate_url(self) -> str: """ 生成帶時間參數的 Trace URL 格式: http://host:port/traces?service=xxx&start=timestamp&end=timestamp """ start_ns = int(self.start_time.timestamp() * 1_000_000_000) end_ns = int(self.end_time.timestamp() * 1_000_000_000) return ( f"{self.base_url}/traces?" f"service={self.service_name}&" f"start={start_ns}&" f"end={end_ns}" ) # ============================================================================= # SignOz Client # ============================================================================= class SignOzClient: """ SignOz Client - 直查 ClickHouse (永久架構版) 統帥鐵律: 禁止 subprocess+curl,使用 Lifespan 管理的 httpx.AsyncClient 使用 ClickHouse HTTP API 繞過需認證的 SignOz Query Service """ def __init__(self): self.signoz_url = settings.SIGNOZ_URL # http://192.168.0.188:3301 self.clickhouse_url = settings.CLICKHOUSE_URL # http://192.168.0.188:8123 async def close(self) -> None: """關閉連線 (由 Lifespan 統一管理,此處為相容性保留)""" pass # HTTP Client 由 src.core.http_client 管理 # ========================================================================= # ClickHouse Direct Queries (永久架構) # ========================================================================= async def _query_clickhouse(self, query: str) -> list[dict]: """ 執行 ClickHouse 查詢 (原生 httpx,非 curl) 統帥鐵律: - 使用 Lifespan 管理的 httpx.AsyncClient - trust_env=False 防止 HTTP_PROXY 干擾 - < 50ms 延遲目標 ClickHouse HTTP API: POST body = SQL, 加 FORMAT JSONEachRow 到查詢末尾 """ # 加入 FORMAT JSONEachRow 到查詢末尾 formatted_query = query.strip().rstrip(";") + " FORMAT JSONEachRow" start_time = time.perf_counter() try: # 取得 Lifespan 管理的 Client client = await get_clickhouse_client() logger.debug( "clickhouse_query_start", base_url=self.clickhouse_url, query_preview=formatted_query[:80], ) # 原生 httpx POST 請求 response = await client.post( "/", # base_url 已設定,只需 path content=formatted_query, ) elapsed_ms = (time.perf_counter() - start_time) * 1000 # 檢查 HTTP 狀態 if response.status_code != 200: logger.warning( "clickhouse_query_http_error", status_code=response.status_code, response_text=response.text[:200], elapsed_ms=round(elapsed_ms, 2), ) return [] # 解析 JSONEachRow 格式 (每行一個 JSON 物件) results = [] for line in response.text.strip().split("\n"): if line: try: results.append(json.loads(line)) except json.JSONDecodeError: continue logger.info( "clickhouse_query_success", result_count=len(results), elapsed_ms=round(elapsed_ms, 2), method="httpx_native", # 🎯 統帥要求: 原生 httpx,非 curl ) return results except Exception as e: elapsed_ms = (time.perf_counter() - start_time) * 1000 logger.warning( "clickhouse_query_failed", error=str(e), error_type=type(e).__name__, query=query[:100], elapsed_ms=round(elapsed_ms, 2), ) return [] # ========================================================================= # Gold Metrics Extraction # ========================================================================= async def get_gold_metrics( self, service_name: str, namespace: str = "default", time_window_minutes: int = 10, ) -> GoldMetrics: """ 從 SignOz/ClickHouse 擷取 Gold Metrics 查詢過去 N 分鐘的: - signoz_calls_total: RPS + Error Count - signoz_latency.bucket: P50/P95/P99 延遲 Args: service_name: 服務名稱 (如 api-gateway, harbor-core) namespace: K8s namespace time_window_minutes: 時間窗口 (分鐘) Returns: GoldMetrics: 黃金指標數據 """ now = datetime.now(UTC) start_time = now - timedelta(minutes=time_window_minutes) end_time = now # 初始化 metrics metrics = GoldMetrics( service_name=service_name, namespace=namespace, time_range_start=start_time, time_range_end=end_time, ) # ===================================================================== # Query 1: RPS & Error Rate (從 traces 表直接計算) # ===================================================================== # 使用 signoz_traces.distributed_signoz_index_v3 表 # statusCode: 0=Unset, 1=Ok, 2=Error # 使用 INTERVAL 語法避免 Decimal overflow rps_query = f""" SELECT count() as total_requests, countIf(statusCode = 2) as error_count FROM signoz_traces.distributed_signoz_index_v3 WHERE timestamp > now() - INTERVAL {time_window_minutes} MINUTE AND serviceName LIKE '%{service_name}%' """ rps_results = await self._query_clickhouse(rps_query) if rps_results: row = rps_results[0] total = int(row.get("total_requests", 0)) errors = int(row.get("error_count", 0)) metrics.total_requests = total metrics.error_count = errors metrics.error_rate = (errors / total * 100) if total > 0 else 0.0 metrics.rps = total / (time_window_minutes * 60) # ===================================================================== # Query 2: Latency Percentiles (從 traces 表的 durationNano) # ===================================================================== latency_query = f""" SELECT quantile(0.50)(durationNano / 1000000.0) as p50, quantile(0.95)(durationNano / 1000000.0) as p95, quantile(0.99)(durationNano / 1000000.0) as p99 FROM signoz_traces.distributed_signoz_index_v3 WHERE timestamp > now() - INTERVAL {time_window_minutes} MINUTE AND serviceName LIKE '%{service_name}%' """ latency_results = await self._query_clickhouse(latency_query) if latency_results: row = latency_results[0] metrics.p50_latency_ms = float(row.get("p50", 0) or 0) metrics.p95_latency_ms = float(row.get("p95", 0) or 0) metrics.p99_latency_ms = float(row.get("p99", 0) or 0) # ===================================================================== # Query 3: Trend Analysis (對比前一時間窗) # ===================================================================== trend_query = f""" SELECT count() as prev_requests FROM signoz_traces.distributed_signoz_index_v3 WHERE timestamp BETWEEN now() - INTERVAL {time_window_minutes * 2} MINUTE AND now() - INTERVAL {time_window_minutes} MINUTE AND serviceName LIKE '%{service_name}%' """ trend_results = await self._query_clickhouse(trend_query) if trend_results: prev_total = int(trend_results[0].get("prev_requests", 0)) if prev_total > 0: change_pct = (metrics.total_requests - prev_total) / prev_total * 100 if change_pct > 10: metrics.rps_trend = "up" elif change_pct < -10: metrics.rps_trend = "down" else: metrics.rps_trend = "stable" logger.info( "signoz_gold_metrics_fetched", service=service_name, rps=metrics.rps, error_rate=metrics.error_rate, p99_latency=metrics.p99_latency_ms, ) return metrics # ========================================================================= # Trace URL Generation # ========================================================================= def generate_trace_url( self, service_name: str, alert_timestamp: datetime | None = None, window_minutes: int = 5, ) -> str: """ 生成動態時間範圍的 SignOz Trace URL 告警發生時間 ± window_minutes Args: service_name: 服務名稱 alert_timestamp: 告警發生時間 (預設為現在) window_minutes: 前後時間窗口 (分鐘) Returns: str: SignOz Trace URL with timestamps """ if alert_timestamp is None: alert_timestamp = datetime.now(UTC) link = SignOzTraceLink( base_url=self.signoz_url, service_name=service_name, start_time=alert_timestamp - timedelta(minutes=window_minutes), end_time=alert_timestamp + timedelta(minutes=window_minutes), ) return link.generate_url() # ========================================================================= # System Metrics (CPU, Memory, Disk) # ========================================================================= async def get_system_metrics( self, _host: str = "192.168.0.188", # Reserved for future host filtering time_window_minutes: int = 5, ) -> dict: """ 擷取系統指標 (system.cpu.time, system.disk.io) 用於 High CPU / Disk Full 告警分析 """ now = datetime.now(UTC) start_ms = int((now - timedelta(minutes=time_window_minutes)).timestamp() * 1000) end_ms = int(now.timestamp() * 1000) cpu_query = f""" SELECT avg(value) as cpu_avg, max(value) as cpu_max FROM signoz_metrics.distributed_samples_v4 WHERE metric_name = 'system.cpu.time' AND unix_milli BETWEEN {start_ms} AND {end_ms} """ disk_query = f""" SELECT sum(value) as disk_io_bytes FROM signoz_metrics.distributed_samples_v4 WHERE metric_name = 'system.disk.io' AND unix_milli BETWEEN {start_ms} AND {end_ms} """ cpu_results = await self._query_clickhouse(cpu_query) disk_results = await self._query_clickhouse(disk_query) return { "cpu": cpu_results[0] if cpu_results else {}, "disk": disk_results[0] if disk_results else {}, "time_range": { "start": start_ms, "end": end_ms, }, } # ========================================================================= # Log Query (Phase 13.1 #77) # ========================================================================= async def get_logs( self, service_name: str | None = None, severity: str | None = None, search_text: str | None = None, time_window_minutes: int = 30, limit: int = 100, ) -> list[dict]: """ 從 SignOz/ClickHouse 查詢日誌 (Phase 13.1 #77) SignOz v0.8+ 日誌儲存在 signoz_logs.distributed_logs_v2 表 Schema: timestamp, severity_text, body, resources_string, attributes_string Args: service_name: 服務名稱 (過濾 resources.service.name) severity: 日誌級別 (ERROR, WARN, INFO, DEBUG) search_text: 日誌內容搜尋文字 time_window_minutes: 時間窗口 (分鐘) limit: 返回筆數上限 Returns: list[dict]: 日誌記錄列表 """ now = datetime.now(UTC) start_ns = int((now - timedelta(minutes=time_window_minutes)).timestamp() * 1_000_000_000) end_ns = int(now.timestamp() * 1_000_000_000) # 構建 WHERE 條件 conditions = [ f"timestamp >= {start_ns}", f"timestamp <= {end_ns}", ] if service_name: # SignOz 儲存 service.name 在 resources 欄位 safe_service = service_name.replace("'", "''") conditions.append(f"resources_string['service.name'] = '{safe_service}'") if severity: # 支援多個級別 (如 'ERROR,WARN') severities = [s.strip().upper() for s in severity.split(",")] severity_list = ", ".join([f"'{s}'" for s in severities]) conditions.append(f"severity_text IN ({severity_list})") if search_text: # 日誌內容搜尋 (避免 SQL injection) safe_text = search_text.replace("'", "''") conditions.append(f"body LIKE '%{safe_text}%'") where_clause = " AND ".join(conditions) query = f""" SELECT timestamp, severity_text, body, resources_string AS resources, attributes_string AS attributes, trace_id, span_id FROM signoz_logs.distributed_logs_v2 WHERE {where_clause} ORDER BY timestamp DESC LIMIT {limit} """ results = await self._query_clickhouse(query) # 格式化結果 formatted_logs = [] for row in results: formatted_logs.append({ "timestamp": row.get("timestamp"), "severity": row.get("severity_text", "UNKNOWN"), "message": row.get("body", ""), "service": row.get("resources", {}).get("service.name", "unknown"), "trace_id": row.get("trace_id", ""), "span_id": row.get("span_id", ""), "attributes": row.get("attributes", {}), }) logger.info( "signoz_logs_query_completed", service_name=service_name, severity=severity, result_count=len(formatted_logs), time_window_minutes=time_window_minutes, ) return formatted_logs async def get_error_logs_summary( self, service_name: str, time_window_minutes: int = 60, ) -> dict: """ 取得錯誤日誌摘要 (Phase 13.1 #77 - CI 診斷用) 統計各類錯誤的出現次數和代表性訊息 """ now = datetime.now(UTC) start_ns = int((now - timedelta(minutes=time_window_minutes)).timestamp() * 1_000_000_000) end_ns = int(now.timestamp() * 1_000_000_000) query = f""" SELECT severity_text, count() as count, any(body) as sample_message FROM signoz_logs.distributed_logs_v2 WHERE timestamp >= {start_ns} AND timestamp <= {end_ns} AND resources_string['service.name'] = '{service_name.replace("'", "''")}' AND severity_text IN ('ERROR', 'FATAL', 'CRITICAL') GROUP BY severity_text ORDER BY count DESC LIMIT 10 """ results = await self._query_clickhouse(query) return { "service_name": service_name, "time_window_minutes": time_window_minutes, "error_summary": results, "total_errors": sum(r.get("count", 0) for r in results), } # ============================================================================= # Singleton # ============================================================================= _signoz_client: SignOzClient | None = None def get_signoz_client() -> SignOzClient: """取得全域 SignOz Client 實例""" global _signoz_client if _signoz_client is None: _signoz_client = SignOzClient() return _signoz_client async def close_signoz_client() -> None: """關閉 SignOz Client""" global _signoz_client if _signoz_client: await _signoz_client.close() _signoz_client = None