- Python: ruff --fix 修復 280 個 lint 錯誤 - lewooogo-core: src/ 目錄未追蹤,導致 CI eslint 失敗 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
222 lines
6.6 KiB
Python
222 lines
6.6 KiB
Python
"""
|
||
AWOOOI OpenTelemetry Configuration
|
||
==================================
|
||
P0 基礎設施: 可觀測性鐵律
|
||
|
||
Traces → SigNoz (192.168.0.188:4317)
|
||
|
||
四主機架構強制校驗:
|
||
| IP | 允許 OTEL? |
|
||
|-----------------|-----------|
|
||
| 192.168.0.110 | ❌ 禁止 |
|
||
| 192.168.0.112 | ❌ 禁止 |
|
||
| 192.168.0.188 | ✅ 唯一 |
|
||
| 192.168.0.120 | ❌ 禁止 |
|
||
|
||
優雅降級 (Graceful Degradation):
|
||
- OTEL 連線失敗不會導致 API 崩潰
|
||
- 使用 BatchSpanProcessor 非同步傳輸
|
||
- 連線超時後自動跳過追蹤
|
||
"""
|
||
|
||
import logging
|
||
|
||
from opentelemetry import trace
|
||
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
|
||
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
|
||
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
|
||
from opentelemetry.instrumentation.logging import LoggingInstrumentor
|
||
from opentelemetry.sdk.resources import SERVICE_NAME, SERVICE_VERSION, Resource
|
||
from opentelemetry.sdk.trace import TracerProvider
|
||
from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
||
|
||
from src.core.config import settings
|
||
|
||
# Standard-library logger for this module; structlog is deliberately not used
# here to avoid a circular dependency.
_logger = logging.getLogger("awoooi.telemetry")

# Process-wide telemetry state, managed by setup_telemetry() and
# shutdown_telemetry():
#   _tracer_provider - the active TracerProvider, or None when tracing is off
#   _initialized     - True once setup_telemetry() has completed successfully
_tracer_provider: TracerProvider | None = None
_initialized: bool = False
|
||
|
||
|
||
def _validate_endpoint() -> bool:
    """Enforce the host rule for the configured OTLP endpoint.

    Reads ``settings.OTEL_EXPORTER_OTLP_ENDPOINT`` and accepts it only when it
    is a string that contains ``192.168.0.188`` and contains none of the
    forbidden host addresses. Every rejection is logged at error level.

    A non-string value (for example an unset endpoint that is ``None``) is
    rejected instead of raising ``TypeError``: ``setup_telemetry`` calls this
    validator outside its ``try`` block, so an exception here would escape and
    break application startup.

    Returns:
        bool: True if the endpoint passes both checks, False otherwise.
    """
    endpoint = settings.OTEL_EXPORTER_OTLP_ENDPOINT

    # The endpoint must reference the single permitted collector host.
    # The isinstance guard keeps the ``in`` test from raising on None.
    if not isinstance(endpoint, str) or "192.168.0.188" not in endpoint:
        _logger.error(
            f"四主機架構違規! OTEL Endpoint 必須指向 192.168.0.188, "
            f"當前: {endpoint}"
        )
        return False

    # Reject endpoints that also mention any host that must never receive
    # telemetry (e.g. inside a path or query string).
    forbidden_hosts = (
        "192.168.0.110",
        "192.168.0.112",
        "192.168.0.120",
        "192.168.0.121",
    )
    for host in forbidden_hosts:
        if host in endpoint:
            _logger.error(
                f"四主機架構違規! OTEL Endpoint 禁止指向 {host}, "
                "必須使用 192.168.0.188"
            )
            return False

    return True
|
||
|
||
|
||
def setup_telemetry(app) -> bool:
    """Initialize OpenTelemetry tracing with graceful degradation.

    Builds a TracerProvider that exports spans over OTLP/gRPC through a
    BatchSpanProcessor, registers it as the global tracer provider, and
    instruments the FastAPI app, HTTPX clients and the logging module.

    Args:
        app: FastAPI application instance to instrument.

    Returns:
        bool: True if telemetry is initialized (by this call or an earlier
        one), False if it was skipped or initialization failed.

    Graceful Degradation:
        - Skipped when ``settings.MOCK_MODE`` is true.
        - Skipped when ``settings.OTEL_ENABLED`` is false.
        - Skipped when the endpoint fails ``_validate_endpoint()``.
        - Any exception during initialization is logged and swallowed so the
          API keeps running; a partially built provider is shut down and the
          module state is reset, so no half-configured provider is left
          behind.
    """
    global _tracer_provider, _initialized

    # Feature switches: nothing is created when telemetry is turned off.
    if settings.MOCK_MODE:
        _logger.info("OTEL 已停用 (MOCK_MODE=true)")
        return False

    if not settings.OTEL_ENABLED:
        _logger.info("OTEL 已停用 (OTEL_ENABLED=false)")
        return False

    # Host rule: only the permitted collector may receive telemetry.
    if not _validate_endpoint():
        _logger.warning("OTEL 初始化失敗: 四主機架構校驗未通過")
        return False

    # Guard against repeated initialization.
    if _initialized:
        _logger.debug("OTEL 已初始化,跳過")
        return True

    provider = None
    try:
        # Resource attributes identify this service in the tracing backend.
        resource = Resource.create({
            SERVICE_NAME: settings.OTEL_SERVICE_NAME,
            SERVICE_VERSION: settings.VERSION,
            "deployment.environment": settings.ENVIRONMENT,
            "service.namespace": "awoooi",
        })

        provider = TracerProvider(resource=resource)
        _tracer_provider = provider

        # OTLP exporter over gRPC.
        otlp_exporter = OTLPSpanExporter(
            endpoint=settings.OTEL_EXPORTER_OTLP_ENDPOINT,
            insecure=True,  # internal network, no TLS
            timeout=5,  # 5-second timeout to avoid blocking
        )

        # BatchSpanProcessor exports spans in batches; max_queue_size bounds
        # how many spans are buffered.
        span_processor = BatchSpanProcessor(
            otlp_exporter,
            max_queue_size=2048,  # maximum queued spans
            max_export_batch_size=512,  # spans per export batch
            schedule_delay_millis=5000,  # 5 seconds between batches
        )

        provider.add_span_processor(span_processor)
        trace.set_tracer_provider(provider)

        # Automatic FastAPI request tracing, excluding health-check style URLs.
        FastAPIInstrumentor.instrument_app(
            app,
            tracer_provider=provider,
            excluded_urls="health,healthz,ready,metrics",
        )

        # Automatic tracing of outgoing HTTPX calls (Ollama, ClawBot, etc.).
        HTTPXClientInstrumentor().instrument(tracer_provider=provider)

        # Logging instrumentation (injects trace_id / span_id into records).
        LoggingInstrumentor().instrument(
            tracer_provider=provider,
            set_logging_format=True,
        )

        _initialized = True
        _logger.info(
            f"OTEL 初始化成功: "
            f"service={settings.OTEL_SERVICE_NAME}, "
            f"endpoint={settings.OTEL_EXPORTER_OTLP_ENDPOINT}"
        )
        return True

    except Exception as e:
        # Graceful degradation: an OTEL failure must not stop the API.
        _logger.warning(
            f"OTEL 初始化失敗 (API 將繼續運作): {type(e).__name__}: {e}"
        )

        # Release whatever was built before the failure so a half-configured
        # provider (and its span processor) is not left running while this
        # function reports that telemetry is off.
        if provider is not None:
            try:
                provider.shutdown()
            except Exception as cleanup_error:
                _logger.warning(f"OTEL 關閉時發生錯誤: {cleanup_error}")
        _tracer_provider = None
        _initialized = False
        return False
|
||
|
||
|
||
def shutdown_telemetry() -> None:
    """Shut down the tracer provider, if one exists, and reset module state.

    Calls ``shutdown()`` on the stored TracerProvider so pending spans can be
    sent before the process exits. Errors raised while shutting down are
    logged as warnings and never propagated. Afterwards the module state is
    cleared regardless of the outcome. Does nothing when no provider is set.
    """
    global _tracer_provider, _initialized

    provider = _tracer_provider
    if provider is None:
        return

    try:
        provider.shutdown()
        _logger.info("OTEL 已關閉")
    except Exception as exc:
        _logger.warning(f"OTEL 關閉時發生錯誤: {exc}")
    finally:
        # Always clear the state so a later setup_telemetry() starts clean.
        _tracer_provider = None
        _initialized = False
|
||
|
||
|
||
def get_tracer(name: str = "awoooi"):
    """Return a tracer for manual instrumentation.

    The tracer is obtained from the ``opentelemetry.trace`` API using ``name``
    and ``settings.VERSION``.

    Args:
        name: Name passed to ``trace.get_tracer``; defaults to ``"awoooi"``.

    Usage:
        tracer = get_tracer("my_module")
        with tracer.start_as_current_span("my_operation") as span:
            span.set_attribute("key", "value")
            # ... do work ...
    """
    version = settings.VERSION
    return trace.get_tracer(name, version)
|
||
|
||
|
||
def get_current_trace_id() -> str | None:
    """Return the current trace ID for log correlation.

    Returns:
        The trace ID as a 32-character, zero-padded lowercase hex string, or
        None when there is no current span or its span context is missing or
        not valid.
    """
    current_span = trace.get_current_span()
    span_context = (
        current_span.get_span_context() if current_span is not None else None
    )

    if span_context is None or not span_context.is_valid:
        return None

    return f"{span_context.trace_id:032x}"
|