Files
awoooi/apps/api/src/main.py
OG T 670cd5df86
Some checks are pending
CD Pipeline / build-and-deploy (push) Has started running
refactor(flywheel): 首席架構師審查修正 C1/C2/I1/I2/I3/I4/M1
C1 — Repository 層修正 (積木化鐵律):
  新增 PlaybookEmbeddingRepository (pgvector UPSERT)
  playbook_embedding_service 改透過 Repository 存取 DB,不再直接 db.execute(text(...))

C2 — Router 層業務邏輯移入 Service 層:
  create_incident_for_approval + extract_affected_services (去掉底線前綴) 移入 incident_service.py
  webhooks.py 改從 incident_service import,自身不再含業務邏輯

I1 — _infra_jobs 提升為 module-level frozenset (_INFRA_JOB_NAMES),避免每次呼叫重建

I2 — _persist_embeddings_to_db 補齊 PlaybookRAGService / list[Playbook] 型別標注

I3 — embedding 格式顯式化: "[" + ",".join(str(float(x)) for x in embedding) + "]"
  防止 pgvector 因格式差異靜默解析失敗

I4 — import asyncio 移至 main.py 頂層,移除 try 區塊內重複 import

M1 — similarity.py: 移除死代碼 `if union > 0 else 0.0`
  union 在兩個集合都非空時不可能為 0

2026-04-10 Asia/Taipei — Claude Sonnet 4.6
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 11:35:10 +08:00

580 lines
22 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
AWOOOI API - BFF Gateway
========================
ADR-005: BFF Architecture
ADR-006: AI Fallback Strategy
Four Iron Laws:
1. Async-First - All handlers are async def
2. CORS Whitelist - Strict origin control (NO wildcards)
3. Pydantic Config - Type-safe settings with validation
4. structlog - Structured JSON logging
Observability Stack:
- OpenTelemetry → SignOz (Traces + Logs + Metrics)
- Sentry SDK → Sentry Self-Hosted (Error Tracking + Stack Traces)
Version: 1.0.0
Date: 2026-03-20
"""
import asyncio
import os
from collections.abc import AsyncGenerator
from contextlib import asynccontextmanager
import sentry_sdk
import structlog
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response
from prometheus_client import CONTENT_TYPE_LATEST, generate_latest
from sentry_sdk.integrations.fastapi import FastApiIntegration
from sentry_sdk.integrations.starlette import StarletteIntegration
from src.api.v1 import agents as agents_v1 # Phase 9.5: Agent Teams API
from src.api.v1 import ai as ai_v1
from src.api.v1 import approvals as approvals_v1
from src.api.v1 import alert_operation_logs as alert_operation_logs_v1
from src.api.v1 import audit_logs as audit_logs_v1
from src.api.v1 import auto_repair as auto_repair_v1 # #8: 自動升級決策
from src.api.v1 import csrf as csrf_v1 # Phase 20: CSRF Protection
from src.api.v1 import dashboard as dashboard_v1
from src.api.v1 import errors as errors_v1 # #40: Sentry 錯誤 BFF API
from src.api.v1 import (
gitea_webhook as gitea_webhook_v1, # ADR-059: Gitea → OpenClaw (GitHub → Gitea 遷移)
)
# Import API routers
from src.api.v1 import health as health_v1
from src.api.v1 import incidents as incidents_v1 # Phase 6.4: Decision Proposal
from src.api.v1 import knowledge as knowledge_v1 # KB Phase 1: Knowledge Base
from src.api.v1 import learning as learning_v1 # Phase D-G P0: Learning API
from src.api.v1 import metrics as metrics_v1 # Phase 7: Gold Metrics (真實血脈)
from src.api.v1 import playbooks as playbooks_v1 # #7: Playbook 萃取
from src.api.v1 import proposals as proposals_v1 # Phase 6.4h: Proposals CRUD API
from src.api.v1 import (
sentry_webhook as sentry_webhook_v1, # Phase 10.2.1: Sentry → Telegram
)
from src.api.v1 import (
signoz_webhook as signoz_webhook_v1, # Phase 21: SignOz → Telegram (ADR-037)
)
from src.api.v1 import drift as drift_v1 # Phase 25 P2: Config Drift Detection
from src.api.v1 import rag as rag_v1 # Phase 33 ADR-067: RAG 知識庫
from src.api.v1 import monitoring as monitoring_v1 # 2026-04-03: 監控工具狀態
from src.api.v1 import stats as stats_v1 # Phase 6.5: Statistics Analytics
from src.api.v1 import telegram as telegram_v1 # Phase 5.4: Telegram Gateway
from src.api.v1 import terminal as terminal_v1 # Phase 19.1: Omni-Terminal SSE
from src.api.v1 import timeline as timeline_v1
from src.api.v1 import webhooks as webhooks_v1
from src.core.config import settings
from src.core.http_client import close_all_http_clients, init_all_http_clients
from src.core.logging import get_logger, setup_logging
from src.core.redis_client import close_redis_pool, init_redis_pool
from src.core.sse import get_publisher
from src.core.telemetry import setup_telemetry, shutdown_telemetry
# CTO-201: Database & Executor
from src.db.base import close_db, init_db
# Phase 6.4g: lewooogo-brain 積木路由
from src.routers import proposals as proposals_router
# Legacy route imports (to be migrated)
from src.routes import agent, notifications, pipelines, plugins
from src.services.executor import close_executor
# Phase 5: OpenClaw AI Engine
from src.services.openclaw import close_openclaw
from src.services.telegram_gateway import get_telegram_gateway
# Phase 6.1: Event Bus (Signal Worker)
from src.workers import close_signal_worker, init_signal_worker
# =============================================================================
# Initialize Logging (MUST be first)
# =============================================================================
# Configure logging before any other module-level code emits log records.
setup_logging()
logger = get_logger("awoooi.api")
# =============================================================================
# Sentry SDK Initialization (error tracking, complementing SignOz)
# Self-hosted @ 192.168.0.110
# Division of labour: Sentry focuses on error tracking; SignOz focuses on
# traces / logs / metrics.
# Phase 15.3: Deep Linking - inject the OTEL trace_id so events can be
# correlated with SignOz.
# =============================================================================
# Read straight from the environment; when unset, Sentry stays disabled.
SENTRY_DSN = os.getenv("SENTRY_DSN")
def _sentry_before_send(event, hint): # noqa: ARG001 - hint is Sentry callback signature
"""
Phase 15.3: Sentry → SignOz Deep Linking
在每個 Sentry event 中注入 OTEL trace_id
讓 Sentry 錯誤能直接連結到 SignOz Trace。
"""
try:
from src.core.deep_linking import DeepLinking
from src.core.telemetry import get_current_trace_id
trace_id = get_current_trace_id()
if trace_id:
# 注入 trace_id 到 tags (Sentry UI 可搜尋)
if "tags" not in event:
event["tags"] = {}
event["tags"]["otel_trace_id"] = trace_id
event["tags"]["signoz_trace_url"] = DeepLinking.signoz_trace_url(trace_id)
# 注入到 contexts (詳情頁顯示)
if "contexts" not in event:
event["contexts"] = {}
event["contexts"]["signoz"] = {
"trace_id": trace_id,
"trace_url": DeepLinking.signoz_trace_url(trace_id),
"service": "awoooi-api",
}
except Exception:
# Deep Linking 失敗不應影響錯誤上報
pass
return event
if not SENTRY_DSN:
    logger.info("sentry_disabled", reason="SENTRY_DSN not configured")
else:
    sentry_sdk.init(
        dsn=SENTRY_DSN,
        environment=settings.ENVIRONMENT,
        release=f"awoooi-api@{settings.VERSION}",
        # Performance-trace sampling: reduced in production, full elsewhere.
        traces_sample_rate=0.1 if settings.ENVIRONMENT == "production" else 1.0,
        # Deep FastAPI / Starlette integration.
        integrations=[
            FastApiIntegration(transaction_style="endpoint"),
            StarletteIntegration(transaction_style="endpoint"),
        ],
        # Common connectivity failures that are not reported as errors.
        ignore_errors=[
            ConnectionRefusedError,
            TimeoutError,
        ],
        # Do not attach default personally identifiable information.
        send_default_pii=False,
        # Phase 15.3: deep-linking hook.
        before_send=_sentry_before_send,
    )
    # 2026-04-05: unified tags, aligned with the Prometheus / auto_repair
    # layer convention.
    for _tag_key, _tag_value in (
        ("layer", "k8s"),
        ("component", "api"),
        ("host", "k8s-awoooi-prod"),
        ("team", "backend"),
    ):
        sentry_sdk.set_tag(_tag_key, _tag_value)
    # Log only the part after "@" so the DSN key is not written to the logs.
    logger.info("sentry_initialized", dsn=SENTRY_DSN.split("@")[-1])
# =============================================================================
# Application Lifespan
# =============================================================================
def _log_embedding_task_outcome(task: "asyncio.Task[None]") -> None:
    """Log a warning when the background embedding-indexing task failed."""
    if task.cancelled():
        return
    failure = task.exception()
    if failure is not None:
        logger.warning("playbook_embedding_indexing_failed", error=str(failure))


async def _warm_up_working_memory() -> None:
    """Restore unresolved incidents from the database into working memory.

    Loads every ``IncidentRecord`` whose status is ``investigating`` or
    ``mitigating`` and hands it to ``save_to_working_memory``. Per the original
    author's note this exists because working memory is empty after a reboot.
    Any failure is logged as a warning and never aborts startup.
    """
    try:
        from src.services.incident_service import get_incident_service
        from src.db.base import get_db_context
        from src.db.models import IncidentRecord
        from sqlalchemy import select
        from src.models.incident import Incident  # hoisted out of the loop

        incident_service = get_incident_service()
        restored = 0
        async with get_db_context() as db:
            result = await db.execute(
                select(IncidentRecord).where(
                    IncidentRecord.status.in_(["investigating", "mitigating"])
                )
            )
            records = result.scalars().all()
            # Records are converted while the session is still open so their
            # attributes are read from a live session.
            for record in records:
                incident = Incident(
                    incident_id=record.incident_id,
                    status=record.status,
                    severity=record.severity,
                    signals=record.signals or [],
                    affected_services=record.affected_services or [],
                    decision_chain=record.decision_chain,
                    proposal_ids=record.proposal_ids or [],
                    outcome=record.outcome,
                    created_at=record.created_at,
                    updated_at=record.updated_at,
                    resolved_at=record.resolved_at,
                    closed_at=record.closed_at,
                    ttl_days=record.ttl_days,
                    vectorized=record.vectorized,
                )
                if await incident_service.save_to_working_memory(incident):
                    restored += 1
        logger.info("working_memory_warmed_up", restored=restored, total=len(records))
    except Exception as e:
        logger.warning("working_memory_warmup_failed", error=str(e))


@asynccontextmanager
async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
    """Application lifespan: bring services up, yield, then tear them down.

    Startup order: database, HTTP clients, Redis pool, SSE publisher, Telegram
    gateway (optional polling / heartbeat), MCP providers, working-memory
    warm-up, background playbook-embedding indexing, signal worker.

    Shutdown runs in a ``finally`` block so resources are released even when an
    exception or cancellation is thrown in at the yield point.

    Args:
        _app: The FastAPI application (unused).
    """
    # Startup
    logger.info(
        "api_startup",
        version=settings.VERSION,
        environment=settings.ENVIRONMENT,
        mock_mode=settings.MOCK_MODE,
        cors_origins=settings.CORS_ORIGINS,
        ai_fallback_order=settings.AI_FALLBACK_ORDER,
        four_hosts=settings.four_hosts,
        kubeconfig=settings.KUBECONFIG_PATH,
    )

    # CTO-201: initialise the PostgreSQL database (project rule: no SQLite).
    await init_db()
    db_url = settings.DATABASE_URL
    # Log only the part after "@" so credentials stay out of the logs.
    logger.info(
        "database_initialized", url=db_url.split("@")[-1] if "@" in db_url else db_url
    )

    # Phase 5: HTTP clients (ClickHouse, Ollama). Pools are created at startup
    # and released at shutdown.
    await init_all_http_clients()
    logger.info("http_clients_initialized")

    # Phase 6.1.1: Redis pool (multi-sig state persistence).
    await init_redis_pool()
    logger.info("redis_pool_initialized", url=settings.REDIS_URL.split("@")[-1])

    # Start SSE publisher
    publisher = await get_publisher()
    logger.info("sse_publisher_initialized")

    # Phase 5: Telegram gateway.
    # 2026-03-23: by default this API does not long-poll, because a bot token
    # allows only one long-polling instance and OpenClaw (192.168.0.188) is it;
    # this API only sends messages.
    telegram_gw = get_telegram_gateway()
    if settings.TELEGRAM_ENABLE_POLLING:
        await telegram_gw.start_long_polling()
        logger.info("telegram_long_polling_started")
    else:
        logger.info("telegram_polling_disabled", reason="OpenClaw 是唯一 Polling 實例")

    # ADR-015: MCP provider registration (DI pattern).
    from src.plugins.mcp.providers import register_all_providers

    register_all_providers()
    logger.info("mcp_providers_registered")

    # Phase 6.5: Telegram heartbeat monitor (guards against silent failure):
    # a heartbeat every 30 minutes, an alert after 2 hours of silence.
    if settings.OPENCLAW_TG_BOT_TOKEN:
        await telegram_gw.start_heartbeat_monitor(
            heartbeat_interval_minutes=30,
            silence_threshold_hours=2,
        )
        logger.info("telegram_heartbeat_monitor_started")

    # Reboot recovery: warm up working memory from PostgreSQL.
    await _warm_up_working_memory()

    # Phase 4: playbook-embedding cold-start indexing, scheduled as a
    # background task so it does not delay API startup.
    # The event loop keeps only weak references to tasks, so the task is held
    # in a local for the lifetime of this generator to stop it being
    # garbage-collected mid-run.
    embedding_task: asyncio.Task | None = None
    try:
        from src.services.playbook_embedding_service import ensure_playbook_embeddings_indexed

        embedding_task = asyncio.create_task(ensure_playbook_embeddings_indexed())
        # Holding the reference defers asyncio's own "exception was never
        # retrieved" report, so failures are logged explicitly.
        embedding_task.add_done_callback(_log_embedding_task_outcome)
        logger.info("playbook_embedding_indexing_scheduled")
    except Exception as e:
        logger.warning("playbook_embedding_schedule_failed", error=str(e))

    # Phase 6.1: start the signal worker (Redis Streams consumer).
    await init_signal_worker()
    logger.info("signal_worker_initialized")

    try:
        yield
    finally:
        # Shutdown
        # Stop indexing if it is still running; the database it writes to is
        # closed further down.
        if embedding_task is not None and not embedding_task.done():
            embedding_task.cancel()
        # Phase 6.1: stop the signal worker first (it is the consumer).
        await close_signal_worker()
        await publisher.stop()
        await close_executor()
        await close_openclaw()
        # Phase 5.4: close the Telegram gateway.
        telegram_gw = get_telegram_gateway()
        await telegram_gw.close()
        # Phase 33: close the RAG service HTTP client (ADR-067).
        from src.services.knowledge_rag_service import get_knowledge_rag_service

        await get_knowledge_rag_service().close()
        # Phase 5: release HTTP client pools.
        await close_all_http_clients()
        # Phase 6.1.1: release the Redis pool.
        await close_redis_pool()
        await close_db()
        shutdown_telemetry()
        logger.info("api_shutdown", version=settings.VERSION)
# =============================================================================
# FastAPI Application
# =============================================================================
# The ASGI application object. Docs, ReDoc and the OpenAPI schema are all
# served under the /api/v1 path, and startup/shutdown is delegated to
# `lifespan`.
app = FastAPI(
title="AWOOOI API",
description="AWOOOI 智能運維平台 API - 由 leWOOOgo Engine 驅動",
version=settings.VERSION,
docs_url="/api/v1/docs",
redoc_url="/api/v1/redoc",
openapi_url="/api/v1/openapi.json",
lifespan=lifespan,
)
# =============================================================================
# OpenTelemetry Instrumentation (可觀測性鐵律)
# 必須在 Middleware 之前初始化,確保追蹤完整性
# 優雅降級: 失敗不影響 API 啟動
# =============================================================================
# setup_telemetry reports whether instrumentation was enabled; a falsy result
# is only logged, so the API still starts without tracing.
otel_enabled = setup_telemetry(app)
if not otel_enabled:
    logger.warning("otel_disabled", reason="initialization failed or disabled")
else:
    logger.info(
        "otel_initialized",
        service=settings.OTEL_SERVICE_NAME,
        endpoint=settings.OTEL_EXPORTER_OTLP_ENDPOINT,
    )
# =============================================================================
# Middleware
# =============================================================================
# 2026-04-03 ogt: Nginx reverse-proxy fix - make FastAPI trust X-Forwarded-Proto.
# Problem: /api/v1/knowledge (no trailing slash) answered with a 307 redirect
#          whose Location used http://.
# Cause:   FastAPI did not know it was running behind HTTPS, so it redirected
#          back to http://.
# Effect:  with this middleware the 307 Location is https://.
# NOTE(review): trusted_hosts="*" accepts forwarded headers from any peer;
# presumably safe because only Nginx can reach this service - confirm.
from uvicorn.middleware.proxy_headers import ProxyHeadersMiddleware
app.add_middleware(ProxyHeadersMiddleware, trusted_hosts="*")
# CORS - Strict Whitelist (Iron Law #2)
# NO wildcards, NO UAT
# Origins come from settings.CORS_ORIGINS; methods and headers are listed
# explicitly rather than wildcarded.
app.add_middleware(
CORSMiddleware,
allow_origins=settings.CORS_ORIGINS,
allow_credentials=True,
allow_methods=["GET", "POST", "PUT", "DELETE", "PATCH"],
allow_headers=["Authorization", "Content-Type", "X-Request-ID"],
expose_headers=["X-Request-ID"],
)
@app.middleware("http")
async def request_logging_middleware(request: Request, call_next) -> Response:
    """
    Structured request logging middleware.

    Binds a per-request logging context and logs every request with:
    - Request ID (taken from the ``X-Request-ID`` header, or generated when the
      header is missing or empty)
    - HTTP method and path
    - Response status code
    - Request duration in milliseconds

    Args:
        request: The incoming HTTP request.
        call_next: Callable that passes the request to the next handler and
            returns its response.

    Returns:
        The downstream response, with ``X-Request-ID`` set to the request ID.
    """
    import time
    import uuid

    # Generate an ID when the caller sent none, so requests without the header
    # do not all share one placeholder value and can still be correlated.
    request_id = request.headers.get("X-Request-ID") or uuid.uuid4().hex
    start_time = time.perf_counter()

    # Reset and re-bind the context so every log line emitted while handling
    # this request carries its own fields and nothing bound earlier.
    structlog.contextvars.clear_contextvars()
    structlog.contextvars.bind_contextvars(
        request_id=request_id,
        method=request.method,
        path=request.url.path,
    )
    log = get_logger("awoooi.http")
    log.debug("request_start")

    response = await call_next(request)

    duration_ms = (time.perf_counter() - start_time) * 1000
    log.info(
        "request_complete",
        status_code=response.status_code,
        duration_ms=round(duration_ms, 2),
    )
    # Echo the ID so the client can quote it when reporting a problem.
    response.headers["X-Request-ID"] = request_id
    return response
# =============================================================================
# Exception Handlers
# =============================================================================
@app.exception_handler(Exception)
async def global_exception_handler(_request: Request, exc: Exception) -> JSONResponse:
    """
    Last-resort handler for exceptions no other handler dealt with.

    The exception is forwarded to Sentry and logged with its type and message.
    The client receives only a generic 500 payload, so internal details are
    not exposed.

    Args:
        _request: The request being processed (unused).
        exc: The unhandled exception.

    Returns:
        A 500 ``JSONResponse`` with code ``INTERNAL_ERROR``.
    """
    # Explicit hand-off to Sentry.
    sentry_sdk.capture_exception(exc)

    error_log = get_logger("awoooi.error")
    error_log.exception(
        "unhandled_exception",
        exc_type=type(exc).__name__,
        exc_message=str(exc),
    )

    safe_payload = {
        "code": "INTERNAL_ERROR",
        "message": "An internal error occurred",
    }
    return JSONResponse(status_code=500, content=safe_payload)
# =============================================================================
# API Routers - Path-based routing (/api/v1/*)
# =============================================================================
# New v1 API routes
# Registration table: (router, URL prefix, OpenAPI tag). Routers are mounted
# in exactly this order.
_ROUTER_TABLE = (
    (health_v1.router, "/api/v1", "Health"),
    (csrf_v1.router, "/api/v1", "Security"),  # Phase 20
    (dashboard_v1.router, "/api/v1", "Dashboard"),
    (approvals_v1.router, "/api/v1", "HITL Approvals"),
    (ai_v1.router, "/api/v1", "AI Decision"),
    (webhooks_v1.router, "/api/v1", "Webhooks"),
    (timeline_v1.router, "/api/v1", "Timeline"),
    (audit_logs_v1.router, "/api/v1", "Audit Logs"),
    # 2026-04-09: alert_operation_log query API (Sprint 5.2)
    (alert_operation_logs_v1.router, "/api/v1", "Alert Operation Logs"),
    (telegram_v1.router, "/api/v1", "Telegram Gateway"),  # Phase 5.4
    (metrics_v1.router, "/api/v1", "Gold Metrics"),  # Phase 7
    (incidents_v1.router, "/api/v1", "Incidents"),  # Phase 6.4: Decision Proposal
    (proposals_v1.router, "/api/v1", "Proposals"),  # Phase 6.4h: Proposals CRUD
    (agents_v1.router, "/api/v1", "Agent Teams"),  # Phase 9.5
    (stats_v1.router, "/api/v1", "Statistics"),  # Phase 6.5: Statistics Analytics
    (monitoring_v1.router, "/api/v1", "Monitoring"),  # 2026-04-03: monitoring tool status
    (gitea_webhook_v1.router, "/api/v1", "Gitea Webhook"),  # ADR-059: Gitea -> OpenClaw
    (playbooks_v1.router, "/api/v1", "Playbooks"),  # #7: playbook extraction
    (auto_repair_v1.router, "/api/v1", "Auto Repair"),  # #8: automatic escalation decisions
    (drift_v1.router, "/api/v1", "Drift Detection"),  # Phase 25 P2: Config Drift Detection
    (rag_v1.router, "/api/v1", "RAG Knowledge Base"),  # Phase 33 ADR-067
    (errors_v1.router, "/api/v1", "Errors"),  # #40: Sentry errors BFF API
    (sentry_webhook_v1.router, "/api/v1", "Sentry Webhook"),  # Phase 10.2.1: Sentry -> Telegram
    (signoz_webhook_v1.router, "/api/v1", "SignOz Webhook"),  # Phase 21: SignOz -> Telegram (ADR-037)
    (terminal_v1.router, "/api/v1", "Omni-Terminal"),  # Phase 19.1: Omni-Terminal SSE
    (learning_v1.router, "/api/v1", "Learning"),  # Phase D-G P0: learning system API
    (knowledge_v1.router, "/api/v1", "Knowledge Base"),  # KB Phase 1
    # Phase 6.4g: lewooogo-brain (legacy); mounted with an empty prefix.
    (proposals_router.router, "", "Proposals (Legacy)"),
    # Legacy routes (to be migrated to api/v1/)
    (plugins.router, "/api/v1/plugins", "Plugins"),
    (pipelines.router, "/api/v1/pipelines", "Pipelines"),
    (agent.router, "/api/v1/agent", "Agent"),
    (notifications.router, "/api/v1/notifications", "Notifications"),
)
for _router, _prefix, _tag in _ROUTER_TABLE:
    app.include_router(_router, prefix=_prefix, tags=[_tag])
# =============================================================================
# Prometheus Metrics Endpoint
# =============================================================================
# 2026-03-31 ogt: 暴露 Prometheus 指標供告警系統使用
@app.get("/metrics", include_in_schema=False)
async def prometheus_metrics() -> Response:
    """Return the output of ``generate_latest()`` with the Prometheus content type."""
    payload = generate_latest()
    return Response(content=payload, media_type=CONTENT_TYPE_LATEST)
# =============================================================================
# Root Endpoint
# =============================================================================
@app.get("/", include_in_schema=False)
async def root() -> dict:
    """Describe the API: name, version, environment and key entry-point paths."""
    entry_points = {
        "docs": "/api/v1/docs",
        "health": "/api/v1/health",
        "dashboard": "/api/v1/dashboard",
        "stream": "/api/v1/dashboard/stream",
    }
    return {
        "name": "AWOOOI API",
        "version": settings.VERSION,
        "environment": settings.ENVIRONMENT,
        **entry_points,
    }
# =============================================================================
# Entry Point
# =============================================================================
if __name__ == "__main__":
    # Direct execution: serve the app with uvicorn on all interfaces, port
    # 8000; auto-reload and log level come from settings.
    from uvicorn import run as serve

    serve(
        "src.main:app",
        host="0.0.0.0",
        port=8000,
        reload=settings.DEBUG,
        log_level=settings.LOG_LEVEL.lower(),
    )