## P0 安全 / 架構修正
### P0-08 telemetry.py — 移除硬碼 IP assert(ADR-121)
- config.py:新增 OTEL_ALLOWED_ENDPOINTS(預設 192.168.0.188)+ OTEL_FORBIDDEN_ENDPOINTS
- telemetry.py:_validate_endpoint() 改為 config-driven allowlist/forbidlist
- EwoooC 可用 env 覆寫 OTEL_ALLOWED_ENDPOINTS 指向自己的 SigNoz host
### P0-13 mcp_bridge.py — K8s namespace 由 settings 提供
- config.py:新增 AWOOOI_K8S_NAMESPACE(預設 "awoooi-prod")
- mcp_bridge.py:5 處 parameters.get("namespace", "awoooi-prod") → settings.AWOOOI_K8S_NAMESPACE
- EwoooC/Tsenyang 可設自己的 namespace
### P1-24 decision_manager.py — silence key 常數統一
- 新增 from src.services.telegram_gateway import SILENCE_KEY_PREFIX
- f"telegram_silence:{target}" → f"{SILENCE_KEY_PREFIX}{target}"
- 消除跨兩處重複定義(ADR-118 No Island Coding 原則)
## Phase 1 Task 1.7 Integration Tests
- tests/integration/test_awooop_phase1_schema.py:31 個測試案例
- awooop_projects CHECK 約束(4 cases)
- revision 不可變性 trigger(5 cases:draft 可改、published 鎖住、身份欄不可改、非法流轉、DELETE 禁止)
- awooop_published_revisions VIEW draft/published 隔離(2 cases)
- active_pointer_guard(3 cases:不可指向 draft、可指向 active、跨租戶 mismatch)
- RLS fail-closed(3 cases:未設/錯設/正確設 project_id)
- outbox FK + dedup(2 cases)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
397 lines
12 KiB
Python
397 lines
12 KiB
Python
"""
|
||
AWOOOI OpenTelemetry Configuration
|
||
==================================
|
||
P0 基礎設施: 可觀測性鐵律
|
||
|
||
Traces + Metrics → SigNoz (192.168.0.188:24317)
|
||
|
||
四主機架構強制校驗(允許 host 由 OTEL_ALLOWED_ENDPOINTS 設定,預設 192.168.0.188):
|
||
| IP | 允許 OTEL? |
|
||
|-----------------|-----------|
|
||
| 192.168.0.110 | ❌ 禁止 |
|
||
| 192.168.0.112 | ❌ 禁止 |
|
||
| 192.168.0.188 | ✅ 預設 |
|
||
| 192.168.0.120 | ❌ 禁止 |
|
||
|
||
P0-08 修正(ADR-121,2026-05-04 ogt + Claude Sonnet 4.6):
|
||
移除硬碼 IP assert,改為 config-driven allowed/forbidden 清單。
|
||
EwoooC 可用 OTEL_ALLOWED_ENDPOINTS env 覆寫指向自己的 SigNoz host。
|
||
|
||
優雅降級 (Graceful Degradation):
|
||
- OTEL 連線失敗不會導致 API 崩潰
|
||
- 使用 BatchSpanProcessor 非同步傳輸
|
||
- 使用 PeriodicExportingMetricReader 批量送出指標
|
||
- 連線超時後自動跳過追蹤
|
||
|
||
Phase 13.3 新增:
|
||
- OTEL Metrics (llm.tokens.*, llm.cost.*, llm.latency.*)
|
||
- Token Counter 整合
|
||
|
||
版本: v1.1
|
||
最後修改: 2026-03-26 14:30 (台北時區)
|
||
修改者: Claude Code
|
||
|
||
變更紀錄:
|
||
| 版本 | 日期 | 執行者 | 變更內容 |
|
||
|------|------|--------|----------|
|
||
| v1.0 | - | - | 初始 Traces 實作 |
|
||
| v1.1 | 2026-03-26 | Claude Code | Phase 13.3 新增 Metrics Provider |
|
||
"""
|
||
|
||
import logging
|
||
|
||
from opentelemetry import metrics, trace
|
||
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
|
||
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
|
||
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
|
||
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
|
||
from opentelemetry.instrumentation.logging import LoggingInstrumentor
|
||
from opentelemetry.sdk.metrics import MeterProvider
|
||
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
|
||
from opentelemetry.sdk.resources import SERVICE_NAME, SERVICE_VERSION, Resource
|
||
from opentelemetry.sdk.trace import TracerProvider
|
||
from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
||
|
||
from src.core.config import settings
|
||
|
||
# Plain stdlib logger rather than structlog: using structlog from this module
# would introduce a circular dependency.
_logger = logging.getLogger("awoooi.telemetry")

# Module-level telemetry state, assigned by setup_telemetry().
_initialized: bool = False
_meter_provider: MeterProvider | None = None
_tracer_provider: TracerProvider | None = None
|
||
|
||
|
||
def _validate_endpoint() -> bool:
    """Check the configured OTLP endpoint against the allow/forbid host lists.

    Config-driven validation (P0-08, ADR-121). The endpoint is read from
    ``settings.OTEL_EXPORTER_OTLP_ENDPOINT`` and compared with
    ``settings.OTEL_FORBIDDEN_ENDPOINTS`` and ``settings.OTEL_ALLOWED_ENDPOINTS``.
    The forbid list is evaluated first, so a forbidden match always wins.

    Returns:
        bool: True when no forbidden entry occurs in the endpoint and at least
        one allowed entry does; False otherwise (the reason is logged).
    """
    endpoint = settings.OTEL_EXPORTER_OTLP_ENDPOINT
    allowed = settings.OTEL_ALLOWED_ENDPOINTS
    forbidden = settings.OTEL_FORBIDDEN_ENDPOINTS

    # NOTE(review): matching is a plain substring test, so an entry such as
    # "192.168.0.11" would also match "192.168.0.110". Presumably both settings
    # are sequences of full host strings -- confirm against config.py.

    # Explicitly forbidden hosts take precedence over the allowlist.
    for host in forbidden:
        if host in endpoint:
            # _logger is a stdlib logging.Logger: it rejects arbitrary keyword
            # arguments (TypeError), so the context goes through lazy %-args.
            _logger.error(
                "otel_endpoint_forbidden_host endpoint=%s forbidden_host=%s",
                endpoint,
                host,
            )
            return False

    # At least one allowed host has to appear in the endpoint.
    if not any(host in endpoint for host in allowed):
        _logger.error(
            "otel_endpoint_not_in_allowlist endpoint=%s allowed=%s",
            endpoint,
            allowed,
        )
        return False

    return True
|
||
|
||
|
||
def setup_telemetry(app) -> bool:
    """Initialize OpenTelemetry traces, metrics and auto-instrumentation.

    Every failure path returns False instead of raising, so a telemetry problem
    never prevents the API from starting (graceful degradation).

    Args:
        app: FastAPI application instance to instrument.

    Returns:
        bool: True if telemetry is initialized (now or by an earlier call),
        False if it is disabled, rejected by endpoint validation, or failed.

    Graceful degradation:
        - ``settings.MOCK_MODE`` true: initialization is skipped.
        - ``settings.OTEL_ENABLED`` false: initialization is skipped.
        - Endpoint validation fails or raises: initialization is skipped.
        - Any exception during setup: logged as a warning, returns False.
    """
    global _tracer_provider, _meter_provider, _initialized

    # Feature switches.
    if settings.MOCK_MODE:
        _logger.info("OTEL 已停用 (MOCK_MODE=true)")
        return False

    if not settings.OTEL_ENABLED:
        _logger.info("OTEL 已停用 (OTEL_ENABLED=false)")
        return False

    # Endpoint allow/forbid validation. It is guarded so that an unexpected
    # exception inside the validator degrades to "telemetry off" rather than
    # aborting application startup.
    try:
        endpoint_ok = _validate_endpoint()
    except Exception as e:
        _logger.warning(
            f"OTEL 初始化失敗 (API 將繼續運作): {type(e).__name__}: {e}"
        )
        return False

    if not endpoint_ok:
        _logger.warning("OTEL 初始化失敗: 四主機架構校驗未通過")
        return False

    # Guard against repeated initialization.
    if _initialized:
        _logger.debug("OTEL 已初始化,跳過")
        return True

    try:
        # Resource attributes identifying this service.
        resource = Resource.create({
            SERVICE_NAME: settings.OTEL_SERVICE_NAME,
            SERVICE_VERSION: settings.VERSION,
            "deployment.environment": settings.ENVIRONMENT,
            "service.namespace": "awoooi",
        })

        # =====================================================================
        # Traces Provider
        # =====================================================================
        _tracer_provider = TracerProvider(resource=resource)

        # OTLP exporter over gRPC.
        otlp_trace_exporter = OTLPSpanExporter(
            endpoint=settings.OTEL_EXPORTER_OTLP_ENDPOINT,
            insecure=True,  # intranet traffic, no TLS
            timeout=5,  # 5 second timeout to avoid blocking
        )

        # BatchSpanProcessor exports spans in batches off the request path;
        # the queue size bounds memory use when the collector is unreachable.
        span_processor = BatchSpanProcessor(
            otlp_trace_exporter,
            max_queue_size=2048,  # maximum queued spans
            max_export_batch_size=512,  # spans per export batch
            schedule_delay_millis=5000,  # 5 seconds between batches
        )

        _tracer_provider.add_span_processor(span_processor)
        trace.set_tracer_provider(_tracer_provider)

        # =====================================================================
        # Metrics Provider (Phase 13.3 #88)
        # =====================================================================
        otlp_metric_exporter = OTLPMetricExporter(
            endpoint=settings.OTEL_EXPORTER_OTLP_ENDPOINT,
            insecure=True,
            timeout=5,
        )

        # PeriodicExportingMetricReader: push metrics every 30 seconds.
        metric_reader = PeriodicExportingMetricReader(
            otlp_metric_exporter,
            export_interval_millis=30000,  # 30 seconds
        )

        _meter_provider = MeterProvider(
            resource=resource,
            metric_readers=[metric_reader],
        )
        metrics.set_meter_provider(_meter_provider)

        _logger.info("OTEL Metrics Provider 初始化成功")

        # =====================================================================
        # Auto Instrumentation
        # =====================================================================
        # FastAPI request tracing, excluding health-check style endpoints.
        FastAPIInstrumentor.instrument_app(
            app,
            tracer_provider=_tracer_provider,
            excluded_urls="health,healthz,ready,metrics",
        )

        # Outgoing HTTPX calls (Ollama, OpenClaw, etc.).
        HTTPXClientInstrumentor().instrument(tracer_provider=_tracer_provider)

        # Log records get trace_id / span_id injected.
        LoggingInstrumentor().instrument(
            tracer_provider=_tracer_provider,
            set_logging_format=True,
        )

        _initialized = True
        _logger.info(
            f"OTEL 初始化成功 (Traces + Metrics): "
            f"service={settings.OTEL_SERVICE_NAME}, "
            f"endpoint={settings.OTEL_EXPORTER_OTLP_ENDPOINT}"
        )
        return True

    except Exception as e:
        # Graceful degradation: an OTEL failure must not block API startup.
        _logger.warning(
            f"OTEL 初始化失敗 (API 將繼續運作): {type(e).__name__}: {e}"
        )
        return False
|
||
|
||
|
||
def shutdown_telemetry() -> None:
    """Shut down the tracer and meter providers created by setup_telemetry().

    Shutting a provider down gives it the chance to export pending data before
    the process exits. Errors are logged and swallowed so that application
    shutdown is never interrupted, and the module state is always reset.
    """
    global _tracer_provider, _meter_provider, _initialized

    if _tracer_provider is not None:
        try:
            _tracer_provider.shutdown()
            _logger.info("OTEL 已關閉")
        except Exception as e:
            _logger.warning(f"OTEL 關閉時發生錯誤: {e}")
        finally:
            _tracer_provider = None

    # The meter provider owns the periodic metric reader; without an explicit
    # shutdown it is never flushed or stopped.
    if _meter_provider is not None:
        try:
            _meter_provider.shutdown()
        except Exception as e:
            _logger.warning(f"OTEL 關閉時發生錯誤: {e}")
        finally:
            _meter_provider = None

    _initialized = False
|
||
|
||
|
||
def get_tracer(name: str = "awoooi"):
    """Return a tracer for manual instrumentation.

    The tracer is requested from the global OpenTelemetry API under ``name``
    and tagged with ``settings.VERSION``.

    Usage:
        tracer = get_tracer("my_module")
        with tracer.start_as_current_span("my_operation") as span:
            span.set_attribute("key", "value")
            # ... do work ...
    """
    instrumentation_version = settings.VERSION
    return trace.get_tracer(name, instrumentation_version)
|
||
|
||
|
||
def get_current_trace_id() -> str | None:
    """Return the active trace ID for log correlation.

    Returns:
        The trace ID as a 32-character lowercase hex string, or None when
        there is no current span or its span context is missing or invalid.
    """
    active_span = trace.get_current_span()
    span_context = None if active_span is None else active_span.get_span_context()

    if span_context is None or not span_context.is_valid:
        return None

    return f"{span_context.trace_id:032x}"
|
||
|
||
|
||
def get_current_span_id() -> str | None:
    """Return the active span ID.

    Returns:
        The span ID as a 16-character lowercase hex string, or None when
        there is no current span or its span context is missing or invalid.
    """
    active_span = trace.get_current_span()
    if active_span is not None:
        span_context = active_span.get_span_context()
        if span_context is not None and span_context.is_valid:
            return f"{span_context.span_id:016x}"
    return None
|
||
|
||
|
||
# =============================================================================
|
||
# Phase 15.2: Redis Trace Context Propagation
|
||
# =============================================================================
|
||
|
||
def get_trace_context() -> dict[str, str] | None:
    """Capture the current trace context for injection into a Redis message.

    Phase 15.2: lets a consumer continue the producer's trace across
    Redis Streams.

    Returns:
        A dict with ``trace_id`` and ``span_id`` (empty string when no span ID
        is available), or None when there is no current trace ID.

    Usage (writing to Redis):
        payload = {**signal.dict(), "_trace_context": get_trace_context()}
        await redis.xadd("stream:signals", payload)
    """
    current_trace, current_span = get_current_trace_id(), get_current_span_id()

    if current_trace:
        return {
            "trace_id": current_trace,
            "span_id": current_span if current_span else "",
        }
    return None
|
||
|
||
|
||
def restore_trace_context(trace_context: dict[str, str] | None):
    """Rebuild a trace context from a Redis message and open a worker span.

    Phase 15.2: worker-side context reconstruction.

    A ``worker_process`` CONSUMER span is always opened. When ``trace_context``
    carries a ``trace_id``, a W3C ``traceparent`` header is built from it and
    extracted so the new span can continue that trace. If extraction raises,
    a fresh span is opened instead and tagged with ``trace.restore_failed``.

    Only the context extraction is guarded. Exceptions raised by the caller's
    ``with`` body propagate unchanged: catching them around the ``yield`` would
    mislabel them as restore failures and make the generator yield a second
    time, which ``contextlib`` reports as a RuntimeError.

    Args:
        trace_context: The ``_trace_context`` value taken from the Redis
            message (``trace_id`` / ``span_id`` hex strings), or None.

    Returns:
        A context manager yielding the opened span.

    Usage (reading from Redis):
        message = await redis.xreadgroup(...)
        trace_ctx = message.get("_trace_context")

        with restore_trace_context(trace_ctx) as span:
            # work done here belongs to the restored trace
            pass
    """
    from contextlib import contextmanager

    from opentelemetry.trace import SpanKind
    from opentelemetry.trace.propagation.tracecontext import (
        TraceContextTextMapPropagator,
    )

    tracer = get_tracer("awoooi.worker")

    @contextmanager
    def _context_manager():
        parent_ctx = None
        trace_id = None
        restored = False
        restore_error = None

        try:
            trace_id = trace_context.get("trace_id") if trace_context else None
            if trace_id:
                # get_trace_context() stores "" when no span ID is available,
                # so fall back on any falsy value, not only on a missing key.
                span_id = trace_context.get("span_id") or "0" * 16

                # Carrier in W3C Trace Context format.
                carrier = {"traceparent": f"00-{trace_id}-{span_id}-01"}
                parent_ctx = TraceContextTextMapPropagator().extract(carrier=carrier)

                # The propagator does not raise on a header it rejects, so
                # check whether a valid remote span context was extracted.
                restored = (
                    trace.get_current_span(parent_ctx).get_span_context().is_valid
                )
        except Exception as e:
            _logger.warning(f"Trace context restore failed: {e}, creating new span")
            parent_ctx = None
            restore_error = str(e)

        # The yield stays outside the try block above so that errors from the
        # caller's with-body are never swallowed here.
        with tracer.start_as_current_span(
            "worker_process",
            context=parent_ctx,
            kind=SpanKind.CONSUMER,
        ) as span:
            if restore_error is not None:
                span.set_attribute("trace.restore_failed", restore_error)
            elif parent_ctx is not None:
                span.set_attribute("trace.restored", restored)
                span.set_attribute("trace.parent_trace_id", trace_id)
            yield span

    return _context_manager()
|