Files
awoooi/apps/api/src/core/telemetry.py
OG T 6f049877fc fix(lint): ruff auto-fix + lewooogo-core src 加入 git
- Python: ruff --fix 修復 280 個 lint 錯誤
- lewooogo-core: src/ 目錄未追蹤,導致 CI eslint 失敗

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-23 23:51:37 +08:00

222 lines
6.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
AWOOOI OpenTelemetry Configuration
==================================
P0 基礎設施: 可觀測性鐵律
Traces → SigNoz (192.168.0.188:4317)
四主機架構強制校驗:
| IP | 允許 OTEL? |
|-----------------|-----------|
| 192.168.0.110 | ❌ 禁止 |
| 192.168.0.112 | ❌ 禁止 |
| 192.168.0.188 | ✅ 唯一 |
| 192.168.0.120 | ❌ 禁止 |
優雅降級 (Graceful Degradation):
- OTEL 連線失敗不會導致 API 崩潰
- 使用 BatchSpanProcessor 非同步傳輸
- 連線超時後自動跳過追蹤
"""
import logging
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.logging import LoggingInstrumentor
from opentelemetry.sdk.resources import SERVICE_NAME, SERVICE_VERSION, Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from src.core.config import settings
# Module logger. The stdlib logging module is used here instead of structlog
# to avoid a circular dependency.
_logger = logging.getLogger("awoooi.telemetry")
# Module-level state shared by setup_telemetry() and shutdown_telemetry():
# - _tracer_provider: the provider created by setup_telemetry(), or None
#   when none exists (initial state, or after shutdown_telemetry()).
# - _initialized: set to True once setup_telemetry() has completed
#   successfully; reset to False by shutdown_telemetry().
_tracer_provider: TracerProvider | None = None
_initialized: bool = False
def _validate_endpoint() -> bool:
    """
    Enforce the four-host architecture rule for the OTLP endpoint.

    The *host* component of ``settings.OTEL_EXPORTER_OTLP_ENDPOINT`` must be
    exactly 192.168.0.188. The host is extracted by parsing the endpoint
    rather than by substring search, so an endpoint that merely contains the
    allowed address somewhere else (path, query string, or as the prefix of a
    longer hostname) is rejected. Both ``scheme://host:port`` and the
    scheme-less ``host:port`` form are accepted.

    Returns:
        bool: True when the endpoint host is the allowed one. False otherwise
        (including a missing, empty or unparseable endpoint); an error
        describing the violation is logged in that case.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlsplit

    allowed_host = "192.168.0.188"
    forbidden_hosts = (
        "192.168.0.110",
        "192.168.0.112",
        "192.168.0.120",
        "192.168.0.121",
    )

    endpoint = settings.OTEL_EXPORTER_OTLP_ENDPOINT
    # A missing or non-string setting is treated as "no host" instead of
    # letting a TypeError escape from the validation.
    raw = endpoint.strip() if isinstance(endpoint, str) else ""

    try:
        parts = urlsplit(raw)
        if not parts.netloc:
            # Scheme-less "host:port": prefix "//" so that urlsplit treats
            # the leading component as the network location.
            parts = urlsplit(f"//{raw}")
        host = parts.hostname
    except ValueError:
        # urlsplit rejects malformed network locations (e.g. bad brackets).
        host = None

    # Report an explicitly forbidden host with the more specific message.
    if host in forbidden_hosts:
        _logger.error(
            f"四主機架構違規! OTEL Endpoint 禁止指向 {host}, "
            f"必須使用 192.168.0.188"
        )
        return False

    if host != allowed_host:
        _logger.error(
            f"四主機架構違規! OTEL Endpoint 必須指向 192.168.0.188, "
            f"當前: {endpoint}"
        )
        return False

    return True
def setup_telemetry(app) -> bool:
    """
    Wire OpenTelemetry tracing into the application with graceful degradation.

    Args:
        app: FastAPI application instance to instrument.

    Returns:
        bool: True when telemetry is active (initialized by this call or by
        an earlier one), False when initialization was skipped or failed.

    Graceful degradation:
        - settings.MOCK_MODE is truthy: initialization is skipped.
        - settings.OTEL_ENABLED is falsy: initialization is skipped.
        - Endpoint validation fails: initialization is skipped.
        - Any exception raised during setup is logged as a warning and
          swallowed, so the caller can continue without tracing.
    """
    global _tracer_provider, _initialized

    # Feature switches: bail out before touching any OTEL object.
    if settings.MOCK_MODE:
        _logger.info("OTEL 已停用 (MOCK_MODE=true)")
        return False

    if not settings.OTEL_ENABLED:
        _logger.info("OTEL 已停用 (OTEL_ENABLED=false)")
        return False

    # Four-host architecture check on the configured endpoint.
    endpoint_ok = _validate_endpoint()
    if not endpoint_ok:
        _logger.warning("OTEL 初始化失敗: 四主機架構校驗未通過")
        return False

    # Guard against repeated initialization.
    if _initialized:
        _logger.debug("OTEL 已初始化,跳過")
        return True

    try:
        # Resource attributes identifying this service.
        service_name = settings.OTEL_SERVICE_NAME
        service_attributes = {
            SERVICE_NAME: service_name,
            SERVICE_VERSION: settings.VERSION,
            "deployment.environment": settings.ENVIRONMENT,
            "service.namespace": "awoooi",
        }
        service_resource = Resource.create(service_attributes)

        provider = TracerProvider(resource=service_resource)
        _tracer_provider = provider

        # OTLP exporter over gRPC: plaintext (internal network, no TLS)
        # with a 5 second timeout.
        endpoint = settings.OTEL_EXPORTER_OTLP_ENDPOINT
        exporter = OTLPSpanExporter(endpoint=endpoint, insecure=True, timeout=5)

        # Spans go through a BatchSpanProcessor configured with a bounded
        # queue, a maximum batch size and a 5 second schedule delay.
        batch_options = {
            "max_queue_size": 2048,
            "max_export_batch_size": 512,
            "schedule_delay_millis": 5000,
        }
        provider.add_span_processor(BatchSpanProcessor(exporter, **batch_options))
        trace.set_tracer_provider(provider)

        # Automatic FastAPI tracing; health-check style routes are excluded.
        FastAPIInstrumentor.instrument_app(
            app,
            tracer_provider=provider,
            excluded_urls="health,healthz,ready,metrics",
        )

        # Automatic tracing of outgoing HTTPX calls.
        httpx_instrumentor = HTTPXClientInstrumentor()
        httpx_instrumentor.instrument(tracer_provider=provider)

        # Logging instrumentation (trace/span ids in log records).
        logging_instrumentor = LoggingInstrumentor()
        logging_instrumentor.instrument(
            tracer_provider=provider,
            set_logging_format=True,
        )

        _initialized = True
        _logger.info(
            f"OTEL 初始化成功: service={service_name}, endpoint={endpoint}"
        )
        return True
    except Exception as exc:
        # Graceful degradation: a telemetry failure must not stop the API.
        failure_kind = type(exc).__name__
        _logger.warning(
            f"OTEL 初始化失敗 (API 將繼續運作): {failure_kind}: {exc}"
        )
        return False
def shutdown_telemetry() -> None:
    """
    Shut the tracer provider down and reset the module state.

    Does nothing when no provider exists. Otherwise the provider's
    ``shutdown()`` is called; an exception raised by it is logged as a
    warning and not propagated. In either outcome the module-level provider
    reference is cleared and the initialized flag is reset.
    """
    global _tracer_provider, _initialized

    provider = _tracer_provider
    if provider is None:
        return

    try:
        provider.shutdown()
        _logger.info("OTEL 已關閉")
    except Exception as exc:
        _logger.warning(f"OTEL 關閉時發生錯誤: {exc}")
    finally:
        # Reset state whether or not shutdown() succeeded.
        _tracer_provider = None
        _initialized = False
def get_tracer(name: str = "awoooi"):
    """
    Return a tracer for manual instrumentation.

    The tracer is requested from the OpenTelemetry ``trace`` API with the
    given name and ``settings.VERSION`` as its version.

    Args:
        name: Name passed to ``trace.get_tracer`` (defaults to "awoooi").

    Usage:
        tracer = get_tracer("my_module")
        with tracer.start_as_current_span("my_operation") as span:
            span.set_attribute("key", "value")
            # ... do work ...
    """
    version = settings.VERSION
    return trace.get_tracer(name, version)
def get_current_trace_id() -> str | None:
    """
    Return the current trace ID for log correlation.

    Returns:
        The trace ID as a 32-character, zero-padded lowercase hex string, or
        None when there is no current span, no span context, or the span
        context is not valid.
    """
    active_span = trace.get_current_span()
    span_context = (
        active_span.get_span_context() if active_span is not None else None
    )
    if span_context is not None and span_context.is_valid:
        return f"{span_context.trace_id:032x}"
    return None