Phase 13.1 CI/CD Integration: - #76 workflow_run handler for CI failure diagnosis - #77 SignOz log query (query_logs, error_logs_summary MCP) - #78 CIAutoRepairService with risk-based execution decisions Phase 13.3 Smart Routing: - #85 Intent Classifier v2.0 (rule engine + LLM fallback) - #86 Complexity Scorer (9-dimension scoring) - #87 AI Router v3.0 (routing decision matrix) - #88 Token Counter (OTEL + Langfuse integration) New files: - services/ci_auto_repair.py (risk stratification) - services/model_registry.py (centralized model config) - services/token_counter.py (677 lines) - Skill 08: Model Router Expert - Skill 09: Strangler Pattern Expert - ADR-023: Smart Routing Architecture - ADR-024: API Layer Architecture Tests: - phase11-conversational.spec.ts (E2E tests) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
577 lines
19 KiB
Python
577 lines
19 KiB
Python
"""
SignOz Client - 全能視力中心 (戰略校正版)
==========================================
統帥鐵律: 嚴禁 Prometheus 碎片化,SignOz 為唯一真相來源

Features:
- ClickHouse 直查 (繞過需認證的 SignOz API)
- Gold Metrics 擷取 (P99 Latency, Error Rate, RPS)
- 動態時間範圍 Trace URL 生成
- 趨勢圖表數據提取 (供 AI 分析)

架構:
- SignOz Query Service: 192.168.0.188:3301 (需認證)
- ClickHouse HTTP API: 192.168.0.188:8123 (直查)
"""
|
||
|
||
import json
import time
from dataclasses import dataclass, field
from datetime import UTC, datetime, timedelta
from urllib.parse import quote

import structlog

from src.core.config import settings
from src.core.http_client import get_clickhouse_client
|
||
|
||
logger = structlog.get_logger(__name__)
|
||
|
||
|
||
# =============================================================================
|
||
# SignOz Data Models
|
||
# =============================================================================
|
||
|
||
@dataclass
class GoldMetrics:
    """
    Gold metrics for a single service, organised by the RED method.

    RED = Rate, Errors, Duration:
    - rps: requests per second (traffic)
    - error_rate: share of failed requests, as a percentage
    - p50/p95/p99 latency: duration percentiles in milliseconds
    """

    service_name: str
    namespace: str
    time_range_start: datetime
    time_range_end: datetime

    # Rate
    rps: float = 0.0
    rps_trend: str = "stable"  # one of: up, down, stable

    # Errors
    error_rate: float = 0.0  # percentage
    error_count: int = 0
    total_requests: int = 0

    # Duration
    p50_latency_ms: float = 0.0
    p95_latency_ms: float = 0.0
    p99_latency_ms: float = 0.0
    latency_trend: str = "stable"

    # Raw data kept for downstream (AI) analysis
    raw_metrics: dict = field(default_factory=dict)

    def _trend_icon(self, trend: str) -> str:
        """Return the arrow emoji for a trend label; unknown labels read as stable."""
        if trend == "up":
            return "📈"
        if trend == "down":
            return "📉"
        return "➡️"

    def _error_icon(self) -> str:
        """Return a traffic-light emoji: green below 1%, yellow below 5%, red otherwise."""
        if self.error_rate < 1:
            return "🟢"
        if self.error_rate < 5:
            return "🟡"
        return "🔴"

    def to_summary(self) -> str:
        """Render a plain-text, four-line summary intended for AI analysis."""
        rows = [
            f"📊 Gold Metrics ({self.service_name})",
            f"• RPS: {self.rps:.1f} {self._trend_icon(self.rps_trend)}",
            f"• Error Rate: {self._error_icon()} {self.error_rate:.2f}%",
            f"• P99 Latency: {self.p99_latency_ms:.0f}ms {self._trend_icon(self.latency_trend)}",
        ]
        return "\n".join(rows)

    def to_telegram_block(self) -> str:
        """Render the metrics as an HTML-formatted block for a Telegram card."""
        rows = [
            "📊 <b>SignOz 指標</b>",
            f"├ RPS: <code>{self.rps:.1f}</code> {self._trend_icon(self.rps_trend)}",
            f"├ Error: {self._error_icon()} <code>{self.error_rate:.2f}%</code>",
            f"└ P99: <code>{self.p99_latency_ms:.0f}ms</code> {self._trend_icon(self.latency_trend)}",
        ]
        return "\n".join(rows)
|
||
|
||
|
||
@dataclass
class SignOzTraceLink:
    """
    Builds a SignOz trace-explorer link for one service and time range.

    Attributes:
        base_url: Root URL of the SignOz UI (a trailing slash is tolerated).
        service_name: Service to filter traces by.
        start_time: Start of the time range.
        end_time: End of the time range.
        namespace: Stored on the link but not included in the generated URL.
    """

    base_url: str
    service_name: str
    start_time: datetime
    end_time: datetime
    namespace: str = "default"

    def generate_url(self) -> str:
        """
        Return the trace URL with the time range expressed in nanoseconds.

        Format: ``<base_url>/traces?service=<name>&start=<ns>&end=<ns>``

        The service name is percent-encoded so that characters such as
        ``&``, ``=``, ``#`` or spaces cannot corrupt the query string.
        """
        start_ns = int(self.start_time.timestamp() * 1_000_000_000)
        end_ns = int(self.end_time.timestamp() * 1_000_000_000)

        # Strip a trailing slash so the result never contains "//traces".
        root = self.base_url.rstrip("/")
        # safe="" also encodes "/", which is not meaningful inside a query value.
        service = quote(self.service_name, safe="")

        return f"{root}/traces?service={service}&start={start_ns}&end={end_ns}"
|
||
|
||
|
||
# =============================================================================
|
||
# SignOz Client
|
||
# =============================================================================
|
||
|
||
class SignOzClient:
|
||
"""
|
||
SignOz Client - 直查 ClickHouse (永久架構版)
|
||
|
||
統帥鐵律: 禁止 subprocess+curl,使用 Lifespan 管理的 httpx.AsyncClient
|
||
使用 ClickHouse HTTP API 繞過需認證的 SignOz Query Service
|
||
"""
|
||
|
||
def __init__(self):
|
||
self.signoz_url = settings.SIGNOZ_URL # http://192.168.0.188:3301
|
||
self.clickhouse_url = settings.CLICKHOUSE_URL # http://192.168.0.188:8123
|
||
|
||
async def close(self) -> None:
|
||
"""關閉連線 (由 Lifespan 統一管理,此處為相容性保留)"""
|
||
pass # HTTP Client 由 src.core.http_client 管理
|
||
|
||
# =========================================================================
|
||
# ClickHouse Direct Queries (永久架構)
|
||
# =========================================================================
|
||
|
||
async def _query_clickhouse(self, query: str) -> list[dict]:
|
||
"""
|
||
執行 ClickHouse 查詢 (原生 httpx,非 curl)
|
||
|
||
統帥鐵律:
|
||
- 使用 Lifespan 管理的 httpx.AsyncClient
|
||
- trust_env=False 防止 HTTP_PROXY 干擾
|
||
- < 50ms 延遲目標
|
||
|
||
ClickHouse HTTP API: POST body = SQL, 加 FORMAT JSONEachRow 到查詢末尾
|
||
"""
|
||
# 加入 FORMAT JSONEachRow 到查詢末尾
|
||
formatted_query = query.strip().rstrip(";") + " FORMAT JSONEachRow"
|
||
|
||
start_time = time.perf_counter()
|
||
|
||
try:
|
||
# 取得 Lifespan 管理的 Client
|
||
client = await get_clickhouse_client()
|
||
|
||
logger.debug(
|
||
"clickhouse_query_start",
|
||
base_url=self.clickhouse_url,
|
||
query_preview=formatted_query[:80],
|
||
)
|
||
|
||
# 原生 httpx POST 請求
|
||
response = await client.post(
|
||
"/", # base_url 已設定,只需 path
|
||
content=formatted_query,
|
||
)
|
||
|
||
elapsed_ms = (time.perf_counter() - start_time) * 1000
|
||
|
||
# 檢查 HTTP 狀態
|
||
if response.status_code != 200:
|
||
logger.warning(
|
||
"clickhouse_query_http_error",
|
||
status_code=response.status_code,
|
||
response_text=response.text[:200],
|
||
elapsed_ms=round(elapsed_ms, 2),
|
||
)
|
||
return []
|
||
|
||
# 解析 JSONEachRow 格式 (每行一個 JSON 物件)
|
||
results = []
|
||
for line in response.text.strip().split("\n"):
|
||
if line:
|
||
try:
|
||
results.append(json.loads(line))
|
||
except json.JSONDecodeError:
|
||
continue
|
||
|
||
logger.info(
|
||
"clickhouse_query_success",
|
||
result_count=len(results),
|
||
elapsed_ms=round(elapsed_ms, 2),
|
||
method="httpx_native", # 🎯 統帥要求: 原生 httpx,非 curl
|
||
)
|
||
|
||
return results
|
||
|
||
except Exception as e:
|
||
elapsed_ms = (time.perf_counter() - start_time) * 1000
|
||
logger.warning(
|
||
"clickhouse_query_failed",
|
||
error=str(e),
|
||
error_type=type(e).__name__,
|
||
query=query[:100],
|
||
elapsed_ms=round(elapsed_ms, 2),
|
||
)
|
||
return []
|
||
|
||
# =========================================================================
|
||
# Gold Metrics Extraction
|
||
# =========================================================================
|
||
|
||
async def get_gold_metrics(
|
||
self,
|
||
service_name: str,
|
||
namespace: str = "default",
|
||
time_window_minutes: int = 10,
|
||
) -> GoldMetrics:
|
||
"""
|
||
從 SignOz/ClickHouse 擷取 Gold Metrics
|
||
|
||
查詢過去 N 分鐘的:
|
||
- signoz_calls_total: RPS + Error Count
|
||
- signoz_latency.bucket: P50/P95/P99 延遲
|
||
|
||
Args:
|
||
service_name: 服務名稱 (如 api-gateway, harbor-core)
|
||
namespace: K8s namespace
|
||
time_window_minutes: 時間窗口 (分鐘)
|
||
|
||
Returns:
|
||
GoldMetrics: 黃金指標數據
|
||
"""
|
||
now = datetime.now(UTC)
|
||
start_time = now - timedelta(minutes=time_window_minutes)
|
||
end_time = now
|
||
|
||
# 初始化 metrics
|
||
metrics = GoldMetrics(
|
||
service_name=service_name,
|
||
namespace=namespace,
|
||
time_range_start=start_time,
|
||
time_range_end=end_time,
|
||
)
|
||
|
||
# =====================================================================
|
||
# Query 1: RPS & Error Rate (從 traces 表直接計算)
|
||
# =====================================================================
|
||
# 使用 signoz_traces.distributed_signoz_index_v3 表
|
||
# statusCode: 0=Unset, 1=Ok, 2=Error
|
||
# 使用 INTERVAL 語法避免 Decimal overflow
|
||
|
||
rps_query = f"""
|
||
SELECT
|
||
count() as total_requests,
|
||
countIf(statusCode = 2) as error_count
|
||
FROM signoz_traces.distributed_signoz_index_v3
|
||
WHERE
|
||
timestamp > now() - INTERVAL {time_window_minutes} MINUTE
|
||
AND serviceName LIKE '%{service_name}%'
|
||
"""
|
||
|
||
rps_results = await self._query_clickhouse(rps_query)
|
||
|
||
if rps_results:
|
||
row = rps_results[0]
|
||
total = int(row.get("total_requests", 0))
|
||
errors = int(row.get("error_count", 0))
|
||
|
||
metrics.total_requests = total
|
||
metrics.error_count = errors
|
||
metrics.error_rate = (errors / total * 100) if total > 0 else 0.0
|
||
metrics.rps = total / (time_window_minutes * 60)
|
||
|
||
# =====================================================================
|
||
# Query 2: Latency Percentiles (從 traces 表的 durationNano)
|
||
# =====================================================================
|
||
latency_query = f"""
|
||
SELECT
|
||
quantile(0.50)(durationNano / 1000000.0) as p50,
|
||
quantile(0.95)(durationNano / 1000000.0) as p95,
|
||
quantile(0.99)(durationNano / 1000000.0) as p99
|
||
FROM signoz_traces.distributed_signoz_index_v3
|
||
WHERE
|
||
timestamp > now() - INTERVAL {time_window_minutes} MINUTE
|
||
AND serviceName LIKE '%{service_name}%'
|
||
"""
|
||
|
||
latency_results = await self._query_clickhouse(latency_query)
|
||
|
||
if latency_results:
|
||
row = latency_results[0]
|
||
metrics.p50_latency_ms = float(row.get("p50", 0) or 0)
|
||
metrics.p95_latency_ms = float(row.get("p95", 0) or 0)
|
||
metrics.p99_latency_ms = float(row.get("p99", 0) or 0)
|
||
|
||
# =====================================================================
|
||
# Query 3: Trend Analysis (對比前一時間窗)
|
||
# =====================================================================
|
||
trend_query = f"""
|
||
SELECT count() as prev_requests
|
||
FROM signoz_traces.distributed_signoz_index_v3
|
||
WHERE
|
||
timestamp BETWEEN now() - INTERVAL {time_window_minutes * 2} MINUTE AND now() - INTERVAL {time_window_minutes} MINUTE
|
||
AND serviceName LIKE '%{service_name}%'
|
||
"""
|
||
|
||
trend_results = await self._query_clickhouse(trend_query)
|
||
|
||
if trend_results:
|
||
prev_total = int(trend_results[0].get("prev_requests", 0))
|
||
if prev_total > 0:
|
||
change_pct = (metrics.total_requests - prev_total) / prev_total * 100
|
||
if change_pct > 10:
|
||
metrics.rps_trend = "up"
|
||
elif change_pct < -10:
|
||
metrics.rps_trend = "down"
|
||
else:
|
||
metrics.rps_trend = "stable"
|
||
|
||
logger.info(
|
||
"signoz_gold_metrics_fetched",
|
||
service=service_name,
|
||
rps=metrics.rps,
|
||
error_rate=metrics.error_rate,
|
||
p99_latency=metrics.p99_latency_ms,
|
||
)
|
||
|
||
return metrics
|
||
|
||
# =========================================================================
|
||
# Trace URL Generation
|
||
# =========================================================================
|
||
|
||
def generate_trace_url(
|
||
self,
|
||
service_name: str,
|
||
alert_timestamp: datetime | None = None,
|
||
window_minutes: int = 5,
|
||
) -> str:
|
||
"""
|
||
生成動態時間範圍的 SignOz Trace URL
|
||
|
||
告警發生時間 ± window_minutes
|
||
|
||
Args:
|
||
service_name: 服務名稱
|
||
alert_timestamp: 告警發生時間 (預設為現在)
|
||
window_minutes: 前後時間窗口 (分鐘)
|
||
|
||
Returns:
|
||
str: SignOz Trace URL with timestamps
|
||
"""
|
||
if alert_timestamp is None:
|
||
alert_timestamp = datetime.now(UTC)
|
||
|
||
link = SignOzTraceLink(
|
||
base_url=self.signoz_url,
|
||
service_name=service_name,
|
||
start_time=alert_timestamp - timedelta(minutes=window_minutes),
|
||
end_time=alert_timestamp + timedelta(minutes=window_minutes),
|
||
)
|
||
|
||
return link.generate_url()
|
||
|
||
# =========================================================================
|
||
# System Metrics (CPU, Memory, Disk)
|
||
# =========================================================================
|
||
|
||
async def get_system_metrics(
|
||
self,
|
||
_host: str = "192.168.0.188", # Reserved for future host filtering
|
||
time_window_minutes: int = 5,
|
||
) -> dict:
|
||
"""
|
||
擷取系統指標 (system.cpu.time, system.disk.io)
|
||
|
||
用於 High CPU / Disk Full 告警分析
|
||
"""
|
||
now = datetime.now(UTC)
|
||
start_ms = int((now - timedelta(minutes=time_window_minutes)).timestamp() * 1000)
|
||
end_ms = int(now.timestamp() * 1000)
|
||
|
||
cpu_query = f"""
|
||
SELECT
|
||
avg(value) as cpu_avg,
|
||
max(value) as cpu_max
|
||
FROM signoz_metrics.distributed_samples_v4
|
||
WHERE
|
||
metric_name = 'system.cpu.time'
|
||
AND unix_milli BETWEEN {start_ms} AND {end_ms}
|
||
"""
|
||
|
||
disk_query = f"""
|
||
SELECT
|
||
sum(value) as disk_io_bytes
|
||
FROM signoz_metrics.distributed_samples_v4
|
||
WHERE
|
||
metric_name = 'system.disk.io'
|
||
AND unix_milli BETWEEN {start_ms} AND {end_ms}
|
||
"""
|
||
|
||
cpu_results = await self._query_clickhouse(cpu_query)
|
||
disk_results = await self._query_clickhouse(disk_query)
|
||
|
||
return {
|
||
"cpu": cpu_results[0] if cpu_results else {},
|
||
"disk": disk_results[0] if disk_results else {},
|
||
"time_range": {
|
||
"start": start_ms,
|
||
"end": end_ms,
|
||
},
|
||
}
|
||
|
||
# =========================================================================
|
||
# Log Query (Phase 13.1 #77)
|
||
# =========================================================================
|
||
|
||
async def get_logs(
|
||
self,
|
||
service_name: str | None = None,
|
||
severity: str | None = None,
|
||
search_text: str | None = None,
|
||
time_window_minutes: int = 30,
|
||
limit: int = 100,
|
||
) -> list[dict]:
|
||
"""
|
||
從 SignOz/ClickHouse 查詢日誌 (Phase 13.1 #77)
|
||
|
||
SignOz 日誌儲存在 signoz_logs.distributed_logs 表
|
||
Schema: timestamp, severity_text, body, resources, attributes
|
||
|
||
Args:
|
||
service_name: 服務名稱 (過濾 resources.service.name)
|
||
severity: 日誌級別 (ERROR, WARN, INFO, DEBUG)
|
||
search_text: 日誌內容搜尋文字
|
||
time_window_minutes: 時間窗口 (分鐘)
|
||
limit: 返回筆數上限
|
||
|
||
Returns:
|
||
list[dict]: 日誌記錄列表
|
||
"""
|
||
now = datetime.now(UTC)
|
||
start_ns = int((now - timedelta(minutes=time_window_minutes)).timestamp() * 1_000_000_000)
|
||
end_ns = int(now.timestamp() * 1_000_000_000)
|
||
|
||
# 構建 WHERE 條件
|
||
conditions = [
|
||
f"timestamp >= {start_ns}",
|
||
f"timestamp <= {end_ns}",
|
||
]
|
||
|
||
if service_name:
|
||
# SignOz 儲存 service.name 在 resources 欄位
|
||
conditions.append(f"resources['service.name'] = '{service_name}'")
|
||
|
||
if severity:
|
||
# 支援多個級別 (如 'ERROR,WARN')
|
||
severities = [s.strip().upper() for s in severity.split(",")]
|
||
severity_list = ", ".join([f"'{s}'" for s in severities])
|
||
conditions.append(f"severity_text IN ({severity_list})")
|
||
|
||
if search_text:
|
||
# 日誌內容搜尋 (避免 SQL injection)
|
||
safe_text = search_text.replace("'", "''")
|
||
conditions.append(f"body LIKE '%{safe_text}%'")
|
||
|
||
where_clause = " AND ".join(conditions)
|
||
|
||
query = f"""
|
||
SELECT
|
||
timestamp,
|
||
severity_text,
|
||
body,
|
||
resources,
|
||
attributes,
|
||
trace_id,
|
||
span_id
|
||
FROM signoz_logs.distributed_logs
|
||
WHERE {where_clause}
|
||
ORDER BY timestamp DESC
|
||
LIMIT {limit}
|
||
"""
|
||
|
||
results = await self._query_clickhouse(query)
|
||
|
||
# 格式化結果
|
||
formatted_logs = []
|
||
for row in results:
|
||
formatted_logs.append({
|
||
"timestamp": row.get("timestamp"),
|
||
"severity": row.get("severity_text", "UNKNOWN"),
|
||
"message": row.get("body", ""),
|
||
"service": row.get("resources", {}).get("service.name", "unknown"),
|
||
"trace_id": row.get("trace_id", ""),
|
||
"span_id": row.get("span_id", ""),
|
||
"attributes": row.get("attributes", {}),
|
||
})
|
||
|
||
logger.info(
|
||
"signoz_logs_query_completed",
|
||
service_name=service_name,
|
||
severity=severity,
|
||
result_count=len(formatted_logs),
|
||
time_window_minutes=time_window_minutes,
|
||
)
|
||
|
||
return formatted_logs
|
||
|
||
async def get_error_logs_summary(
|
||
self,
|
||
service_name: str,
|
||
time_window_minutes: int = 60,
|
||
) -> dict:
|
||
"""
|
||
取得錯誤日誌摘要 (Phase 13.1 #77 - CI 診斷用)
|
||
|
||
統計各類錯誤的出現次數和代表性訊息
|
||
"""
|
||
now = datetime.now(UTC)
|
||
start_ns = int((now - timedelta(minutes=time_window_minutes)).timestamp() * 1_000_000_000)
|
||
end_ns = int(now.timestamp() * 1_000_000_000)
|
||
|
||
query = f"""
|
||
SELECT
|
||
severity_text,
|
||
count() as count,
|
||
any(body) as sample_message
|
||
FROM signoz_logs.distributed_logs
|
||
WHERE
|
||
timestamp >= {start_ns}
|
||
AND timestamp <= {end_ns}
|
||
AND resources['service.name'] = '{service_name}'
|
||
AND severity_text IN ('ERROR', 'FATAL', 'CRITICAL')
|
||
GROUP BY severity_text
|
||
ORDER BY count DESC
|
||
LIMIT 10
|
||
"""
|
||
|
||
results = await self._query_clickhouse(query)
|
||
|
||
return {
|
||
"service_name": service_name,
|
||
"time_window_minutes": time_window_minutes,
|
||
"error_summary": results,
|
||
"total_errors": sum(r.get("count", 0) for r in results),
|
||
}
|
||
|
||
|
||
# =============================================================================
|
||
# Singleton
|
||
# =============================================================================
|
||
|
||
# Process-wide SignOzClient instance, created on first use.
_signoz_client: SignOzClient | None = None


def get_signoz_client() -> SignOzClient:
    """Return the shared SignOzClient, creating it on the first call."""
    global _signoz_client
    if _signoz_client is not None:
        return _signoz_client
    _signoz_client = SignOzClient()
    return _signoz_client
|
||
|
||
|
||
async def close_signoz_client() -> None:
    """Close the shared SignOzClient, if one exists, and clear the reference."""
    global _signoz_client
    if _signoz_client is None:
        return
    await _signoz_client.close()
    _signoz_client = None
|