1203 lines
54 KiB
Python
1203 lines
54 KiB
Python
"""
|
||
AWOOOI API - BFF Gateway
|
||
========================
|
||
ADR-005: BFF Architecture
|
||
ADR-006: AI Fallback Strategy
|
||
|
||
Four Iron Laws:
|
||
1. Async-First - All handlers are async def
|
||
2. CORS Whitelist - Strict origin control (NO wildcards)
|
||
3. Pydantic Config - Type-safe settings with validation
|
||
4. structlog - Structured JSON logging
|
||
|
||
Observability Stack:
|
||
- OpenTelemetry → SignOz (Traces + Logs + Metrics)
|
||
- Sentry SDK → Sentry Self-Hosted (Error Tracking + Stack Traces)
|
||
|
||
Version: 1.0.0
|
||
Date: 2026-03-20
|
||
"""
|
||
|
||
import asyncio
|
||
import os
|
||
from uuid import uuid4
|
||
from collections.abc import AsyncGenerator
|
||
from contextlib import asynccontextmanager
|
||
|
||
import sentry_sdk
|
||
import structlog
|
||
from fastapi import FastAPI, HTTPException, Request
|
||
from fastapi.middleware.cors import CORSMiddleware
|
||
from fastapi.responses import JSONResponse, Response
|
||
from prometheus_client import CONTENT_TYPE_LATEST, generate_latest
|
||
from sentry_sdk.integrations.fastapi import FastApiIntegration
|
||
from sentry_sdk.integrations.starlette import StarletteIntegration
|
||
from uvicorn.middleware.proxy_headers import ProxyHeadersMiddleware
|
||
|
||
from src.api.v1 import agents as agents_v1 # Phase 9.5: Agent Teams API
|
||
from src.api.v1 import ai as ai_v1
|
||
from src.api.v1 import (
|
||
ai_governance as ai_governance_v1, # 2026-05-02: /governance 頁面 3 endpoints
|
||
)
|
||
from src.api.v1 import ai_slo as ai_slo_v1 # Phase 6 ADR-087: AI SLO 自我治理
|
||
from src.api.v1 import aider_events as aider_events_v1 # aider-watch v2 ADR-091
|
||
from src.api.v1 import aiops_kpi as aiops_kpi_v1 # ADR-090 § Phase 7 KPI Dashboard
|
||
from src.api.v1 import (
|
||
aiops_timeline as aiops_timeline_v1, # 2026-04-27 Wave8-X3 B4 timeline endpoint
|
||
)
|
||
from src.api.v1 import alert_operation_logs as alert_operation_logs_v1
|
||
from src.api.v1 import approvals as approvals_v1
|
||
from src.api.v1 import audit_logs as audit_logs_v1
|
||
from src.api.v1 import auto_repair as auto_repair_v1 # #8: 自動升級決策
|
||
from src.api.v1 import csrf as csrf_v1 # Phase 20: CSRF Protection
|
||
from src.api.v1 import dashboard as dashboard_v1
|
||
from src.api.v1 import drift as drift_v1 # Phase 25 P2: Config Drift Detection
|
||
from src.api.v1 import errors as errors_v1 # #40: Sentry 錯誤 BFF API
|
||
from src.api.v1 import (
|
||
gitea_webhook as gitea_webhook_v1, # ADR-059: Gitea → OpenClaw (GitHub → Gitea 遷移)
|
||
)
|
||
|
||
# Import API routers
|
||
from src.api.v1 import health as health_v1
|
||
from src.api.v1 import incidents as incidents_v1 # Phase 6.4: Decision Proposal
|
||
from src.api.v1 import knowledge as knowledge_v1 # KB Phase 1: Knowledge Base
|
||
from src.api.v1 import learning as learning_v1 # Phase D-G P0: Learning API
|
||
from src.api.v1 import metrics as metrics_v1 # Phase 7: Gold Metrics (真實血脈)
|
||
from src.api.v1 import monitoring as monitoring_v1 # 2026-04-03: 監控工具狀態
|
||
from src.api.v1 import notifications as notifications_v1 # 2026-04-10: 通知頻道狀態
|
||
from src.api.v1 import (
|
||
platform as platform_v1, # AwoooP Phase 4: Platform Shell(Shadow Mode)
|
||
)
|
||
from src.api.v1 import playbooks as playbooks_v1 # #7: Playbook 萃取
|
||
from src.api.v1 import proposals as proposals_v1 # Phase 6.4h: Proposals CRUD API
|
||
from src.api.v1 import rag as rag_v1 # Phase 33 ADR-067: RAG 知識庫
|
||
from src.api.v1 import (
|
||
sentry_webhook as sentry_webhook_v1, # Phase 10.2.1: Sentry → Telegram
|
||
)
|
||
from src.api.v1 import (
|
||
signoz_webhook as signoz_webhook_v1, # Phase 21: SignOz → Telegram (ADR-037)
|
||
)
|
||
from src.api.v1 import stats as stats_v1 # Phase 6.5: Statistics Analytics
|
||
from src.api.v1 import telegram as telegram_v1 # Phase 5.4: Telegram Gateway
|
||
from src.api.v1 import telegram_webhook as telegram_webhook_v1 # ADR-094: Webhook入口
|
||
from src.api.v1 import terminal as terminal_v1 # Phase 19.1: Omni-Terminal SSE
|
||
from src.api.v1 import timeline as timeline_v1
|
||
from src.api.v1 import webhooks as webhooks_v1
|
||
from src.core.config import settings
|
||
from src.core.http_client import close_all_http_clients, init_all_http_clients
|
||
from src.core.logging import get_logger, setup_logging
|
||
from src.core.redis_client import (
|
||
close_redis_pool,
|
||
close_worker_redis_pool,
|
||
init_redis_pool,
|
||
)
|
||
from src.core.sse import get_publisher
|
||
from src.core.telemetry import setup_telemetry, shutdown_telemetry
|
||
|
||
# CTO-201: Database & Executor
|
||
from src.db.base import close_db, init_db
|
||
|
||
# Phase 6.4g: lewooogo-brain 積木路由
|
||
from src.routers import proposals as proposals_router
|
||
|
||
# Legacy route imports (to be migrated)
|
||
from src.routes import agent, notifications, pipelines, plugins
|
||
from src.services.adr100_slo_metrics_service import get_adr100_slo_metrics_service
|
||
from src.services.alert_chain_metrics_service import get_alert_chain_metrics_service
|
||
from src.services.executor import close_executor
|
||
from src.services.flywheel_stats_service import get_flywheel_stats_service
|
||
|
||
# Phase 5: OpenClaw AI Engine
|
||
from src.services.openclaw import close_openclaw
|
||
from src.services.telegram_gateway import get_telegram_gateway
|
||
|
||
# Phase 6.1: Event Bus (Signal Worker)
|
||
from src.workers import close_signal_worker, init_signal_worker
|
||
|
||
# =============================================================================
# Initialize Logging (MUST be first)
# =============================================================================
# Logging is configured before anything else at module level so that the
# module-level code below (Sentry setup, lifespan) can log through `logger`.
setup_logging()
logger = get_logger("awoooi.api")

# Path of the Alertmanager webhook route, and the project id that requests to
# that route fall back to when they carry no explicit project identifier
# (see _resolve_request_project_context).
ALERTMANAGER_WEBHOOK_PATH = "/api/v1/webhooks/alertmanager"
ALERTMANAGER_DEFAULT_PROJECT_ID = "awoooi"
|
||
|
||
|
||
def _resolve_request_project_context(request: Request) -> tuple[str | None, str]:
    """Work out which project (tenant) a request belongs to, for RLS scoping.

    Lookup order is the ``X-Project-ID`` header, the ``X-Tenant-ID`` header and
    finally the ``project_id`` query parameter. The first value that is still
    non-empty after stripping surrounding whitespace wins.

    Only the Alertmanager webhook path gets a default project when nothing was
    supplied; every other route stays fail-closed and receives ``None``.

    Args:
        request: The incoming request whose headers, query string and URL path
            are inspected.

    Returns:
        A ``(project_id, source)`` pair. ``project_id`` is ``None`` when no
        project could be determined; ``source`` is a short label describing
        which rule produced the result.
    """
    raw_values = (
        request.headers.get("X-Project-ID"),
        request.headers.get("X-Tenant-ID"),
        request.query_params.get("project_id"),
    )
    # Whitespace-only values count as "not provided".
    explicit_id = next(
        (value.strip() for value in raw_values if value and value.strip()),
        None,
    )
    if explicit_id is not None:
        return explicit_id, "request.header_or_query"

    if request.url.path == ALERTMANAGER_WEBHOOK_PATH:
        return ALERTMANAGER_DEFAULT_PROJECT_ID, "request.alertmanager.default_project"

    return None, "request.project_id.missing"
|
||
|
||
# =============================================================================
# Sentry SDK Initialization (Error Tracking - complements SignOz)
# Self-Hosted @ 192.168.0.110
# Division of labour: Sentry handles error tracking, SignOz handles
# traces / logs / metrics.
# Phase 15.3: Deep Linking - inject the OTEL trace_id so events can be
# correlated with SignOz.
# =============================================================================
# None when the variable is unset; Sentry is only initialized for a truthy DSN.
SENTRY_DSN = os.environ.get("SENTRY_DSN")
|
||
|
||
|
||
def _sentry_before_send(event, hint): # noqa: ARG001 - hint is Sentry callback signature
|
||
"""
|
||
Phase 15.3: Sentry → SignOz Deep Linking
|
||
|
||
在每個 Sentry event 中注入 OTEL trace_id,
|
||
讓 Sentry 錯誤能直接連結到 SignOz Trace。
|
||
"""
|
||
try:
|
||
from src.core.deep_linking import DeepLinking
|
||
from src.core.telemetry import get_current_trace_id
|
||
|
||
trace_id = get_current_trace_id()
|
||
if trace_id:
|
||
# 注入 trace_id 到 tags (Sentry UI 可搜尋)
|
||
if "tags" not in event:
|
||
event["tags"] = {}
|
||
event["tags"]["otel_trace_id"] = trace_id
|
||
event["tags"]["signoz_trace_url"] = DeepLinking.signoz_trace_url(trace_id)
|
||
|
||
# 注入到 contexts (詳情頁顯示)
|
||
if "contexts" not in event:
|
||
event["contexts"] = {}
|
||
event["contexts"]["signoz"] = {
|
||
"trace_id": trace_id,
|
||
"trace_url": DeepLinking.signoz_trace_url(trace_id),
|
||
"service": "awoooi-api",
|
||
}
|
||
except Exception:
|
||
# Deep Linking 失敗不應影響錯誤上報
|
||
pass
|
||
return event
|
||
|
||
|
||
# Sentry is optional: without a DSN the SDK is left uninitialized.
if not SENTRY_DSN:
    logger.info("sentry_disabled", reason="SENTRY_DSN not configured")
else:
    sentry_sdk.init(
        dsn=SENTRY_DSN,
        environment=settings.ENVIRONMENT,
        release=f"awoooi-api@{settings.VERSION}",
        # Performance-monitoring sample rate: 10% in production, 100% elsewhere.
        traces_sample_rate=1.0 if settings.ENVIRONMENT != "production" else 0.1,
        # Deep FastAPI / Starlette integration; transactions are named after
        # the endpoint.
        integrations=[
            FastApiIntegration(transaction_style="endpoint"),
            StarletteIntegration(transaction_style="endpoint"),
        ],
        # Common connectivity failures that are not reported as errors.
        ignore_errors=[
            ConnectionRefusedError,
            TimeoutError,
        ],
        # Do not attach default personally identifiable information to events.
        send_default_pii=False,
        # Phase 15.3: Deep Linking hook (adds the SignOz trace link).
        before_send=_sentry_before_send,
    )
    # 2026-04-05 Claude Code: unified tags, aligned with the Prometheus /
    # auto_repair "layer" convention.
    sentry_sdk.set_tag("layer", "k8s")
    sentry_sdk.set_tag("component", "api")
    sentry_sdk.set_tag("host", "k8s-awoooi-prod")
    sentry_sdk.set_tag("team", "backend")
    # Log only the part of the DSN after the last "@" so the key portion in
    # front of it stays out of the logs.
    logger.info("sentry_initialized", dsn=SENTRY_DSN.split("@")[-1])
|
||
|
||
|
||
# =============================================================================
|
||
# Application Lifespan
|
||
# =============================================================================
|
||
@asynccontextmanager
|
||
async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
|
||
"""Application lifespan events"""
|
||
# AwoooP Phase 2.4 (2026-05-04 ogt): 設定 startup handler 的 project_id context
|
||
# asyncio.create_task() 自動繼承父任務的 ContextVar → 31 個 background loop 全部標記為 awoooi
|
||
from src.core.context import PROJECT_ID
|
||
PROJECT_ID.set("awoooi")
|
||
|
||
# Startup
|
||
logger.info(
|
||
"api_startup",
|
||
version=settings.VERSION,
|
||
environment=settings.ENVIRONMENT,
|
||
mock_mode=settings.MOCK_MODE,
|
||
cors_origins=settings.CORS_ORIGINS,
|
||
ai_fallback_order=settings.AI_FALLBACK_ORDER,
|
||
four_hosts=settings.four_hosts,
|
||
kubeconfig=settings.KUBECONFIG_PATH,
|
||
)
|
||
|
||
# CTO-201: Initialize PostgreSQL database (統帥鐵律: 禁止 SQLite)
|
||
await init_db()
|
||
db_url = settings.DATABASE_URL
|
||
logger.info(
|
||
"database_initialized", url=db_url.split("@")[-1] if "@" in db_url else db_url
|
||
)
|
||
|
||
# Phase 5: Initialize HTTP Clients (ClickHouse, Ollama)
|
||
# 統帥鐵律: 連線池在啟動時建立,關閉時回收
|
||
await init_all_http_clients()
|
||
logger.info("http_clients_initialized")
|
||
|
||
# Phase 6.1.1: Initialize Redis Pool (Multi-Sig 狀態持久化)
|
||
# 統帥鐵律: Redis 連線池在 Lifespan 啟動時建立
|
||
await init_redis_pool()
|
||
logger.info("redis_pool_initialized", url=settings.REDIS_URL.split("@")[-1])
|
||
|
||
# Start SSE publisher
|
||
publisher = await get_publisher()
|
||
logger.info("sse_publisher_initialized")
|
||
|
||
# Phase 5: Telegram Gateway 初始化
|
||
# 2026-03-23 架構修正: AWOOOI API 不做 Long Polling
|
||
# 原因: 同一個 Bot Token 只能有一個 Long Polling 實例
|
||
# OpenClaw (192.168.0.188) 是唯一的 Polling 實例
|
||
# AWOOOI API 只負責發送訊息,不接收
|
||
telegram_gw = get_telegram_gateway()
|
||
if settings.TELEGRAM_ENABLE_POLLING:
|
||
await telegram_gw.start_long_polling()
|
||
logger.info("telegram_long_polling_started")
|
||
else:
|
||
logger.info("telegram_polling_disabled", reason="OpenClaw 是唯一 Polling 實例")
|
||
|
||
# ADR-015: MCP Provider 註冊 (DI 模式)
|
||
from src.plugins.mcp.providers import register_all_providers
|
||
register_all_providers()
|
||
logger.info("mcp_providers_registered")
|
||
|
||
# ADR-081 Phase 1: MCPToolRegistry 初始化(PreDecisionInvestigator 感官工具)
|
||
# 2026-04-16 ogt + Claude Sonnet 4.6: 修復 sensors=0/0 根因 — init 從未在 startup 被呼叫
|
||
try:
|
||
from src.services.mcp_tool_registry import init_mcp_tool_registry
|
||
await init_mcp_tool_registry()
|
||
logger.info("mcp_tool_registry_initialized")
|
||
except Exception as e:
|
||
logger.warning("mcp_tool_registry_init_failed", error=str(e))
|
||
|
||
# Phase 6.5: Telegram 心跳監控(每 30 分鐘發送到 SRE 戰情室群組)
|
||
# 2026-04-16 ogt + Claude Sonnet 4.6: 恢復 — 使用者確認必須繼續在 SRE 戰情室發送
|
||
# 上次停用原因(forwarded_to_separate_group)有誤,群組就是 SRE_GROUP_CHAT_ID
|
||
if settings.OPENCLAW_TG_BOT_TOKEN:
|
||
await telegram_gw.start_heartbeat_monitor(
|
||
heartbeat_interval_minutes=30,
|
||
silence_threshold_hours=2,
|
||
)
|
||
logger.info("telegram_heartbeat_monitor_started", interval_minutes=30)
|
||
else:
|
||
logger.warning("telegram_heartbeat_monitor_skipped", reason="OPENCLAW_TG_BOT_TOKEN not set")
|
||
|
||
# Reboot Recovery: Warm-up Redis Working Memory from PostgreSQL
|
||
# 2026-04-05 ogt: 重開機後 Redis 清空,從 DB restore 未解決的 incidents
|
||
# 統帥批准: 數據必須長久記錄,重開機後自動恢復 Working Memory
|
||
try:
|
||
from sqlalchemy import select
|
||
|
||
from src.db.base import get_db_context
|
||
from src.core.context import clear_project_context, set_project_context
|
||
from src.db.models import IncidentRecord
|
||
from src.models.incident import IncidentStatus
|
||
from src.services.incident_service import get_incident_service
|
||
|
||
startup_ctx_tokens = set_project_context(
|
||
project_id=settings.SYSTEM_NAME,
|
||
source="startup.warmup",
|
||
request_id="startup-warmup",
|
||
)
|
||
|
||
try:
|
||
incident_service = get_incident_service()
|
||
async with get_db_context() as db:
|
||
result = await db.execute(
|
||
select(IncidentRecord).where(
|
||
IncidentRecord.status.in_([
|
||
IncidentStatus.INVESTIGATING,
|
||
IncidentStatus.MITIGATING,
|
||
])
|
||
)
|
||
)
|
||
records = result.scalars().all()
|
||
|
||
restored = 0
|
||
for record in records:
|
||
try:
|
||
incident = incident_service._record_to_incident(record)
|
||
if await incident_service.save_to_working_memory(incident):
|
||
restored += 1
|
||
except Exception as record_error:
|
||
# 舊資料 source 值不合法(node-exporter 等)→ 跳過
|
||
logger.warning(
|
||
"working_memory_warmup_record_skipped",
|
||
incident_id=getattr(record, "incident_id", None),
|
||
error=str(record_error),
|
||
)
|
||
|
||
logger.info(
|
||
"working_memory_warmed_up",
|
||
restored=restored,
|
||
total=len(records),
|
||
startup_project_id=settings.SYSTEM_NAME,
|
||
)
|
||
finally:
|
||
clear_project_context(startup_ctx_tokens)
|
||
except Exception as e:
|
||
logger.warning("working_memory_warmup_failed", error=str(e))
|
||
|
||
# ADR-088: Trust Score 冷啟動 — 從 PostgreSQL 恢復信任分數
|
||
# 解決: Pod 重啟後 TrustScoreManager 記憶歸零,AI 永遠無法累積到 L4 自動放行
|
||
# 2026-04-17 ogt + Claude Sonnet 4.6(亞太): Phase 4 信任持久化
|
||
try:
|
||
from src.repositories.trust_repository import get_trust_repository
|
||
from src.services.trust_engine import get_trust_manager
|
||
|
||
trust_records = await get_trust_repository().load_all()
|
||
loaded = get_trust_manager().bulk_load(trust_records)
|
||
logger.info("trust_scores_warmed_up", loaded=loaded)
|
||
except Exception as e:
|
||
logger.warning("trust_scores_warmup_failed", error=str(e))
|
||
|
||
# Phase 4 飛輪修復: Playbook Embedding 冷啟動索引
|
||
# 2026-04-10 Claude Sonnet 4.6 Asia/Taipei
|
||
# 目的: 確保 playbook_embeddings 表有最新向量,供語義相似度查詢
|
||
# 使用 asyncio.create_task 非阻塞 — 不影響 API 啟動速度
|
||
# Phase ADR-068 2026-04-10: 從 alert_rules.yaml seed Playbook(冪等)
|
||
# 必須在 embedding indexing 之前,確保 playbook 表有資料
|
||
try:
|
||
from src.services.playbook_seed_service import seed_playbooks_from_rules
|
||
asyncio.create_task(seed_playbooks_from_rules())
|
||
logger.info("playbook_seed_scheduled")
|
||
except Exception as e:
|
||
logger.warning("playbook_seed_schedule_failed", error=str(e))
|
||
|
||
# Phase 3.5 ADR-085: Playbook Redis → PG 補寫(一次性遷移 + 啟動時冪等補救)
|
||
# 確保 Redis 中存在但 PG 中缺少的 Playbook 不因 TTL 消失而永久丟失
|
||
# 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 3.5 AI 學習成果持久化
|
||
try:
|
||
from src.repositories.playbook_repository import get_playbook_repository
|
||
asyncio.create_task(get_playbook_repository().backfill_redis_to_pg())
|
||
logger.info("playbook_pg_backfill_scheduled")
|
||
except Exception as e:
|
||
logger.warning("playbook_pg_backfill_schedule_failed", error=str(e))
|
||
|
||
try:
|
||
from src.services.playbook_embedding_service import (
|
||
ensure_playbook_embeddings_indexed,
|
||
)
|
||
asyncio.create_task(ensure_playbook_embeddings_indexed())
|
||
logger.info("playbook_embedding_indexing_scheduled")
|
||
except Exception as e:
|
||
logger.warning("playbook_embedding_schedule_failed", error=str(e))
|
||
|
||
# Phase 6.1: 啟動 Signal Worker (Redis Streams Consumer)
|
||
# 統帥鐵律: Event Bus 解耦告警接收與處理
|
||
await init_signal_worker()
|
||
logger.info("signal_worker_initialized")
|
||
|
||
# BUG-005 修復 2026-04-11: 啟動時掃描 Redis 中所有 state=ready 但未送 Telegram 的 token
|
||
# dedup TTL 10 分鐘過期後,ready decisions 就沒有補送機制 → 長期卡在 ready 無人審核
|
||
try:
|
||
from src.services.decision_manager import get_decision_manager
|
||
asyncio.create_task(get_decision_manager().resend_stale_ready_tokens())
|
||
logger.info("stale_ready_tokens_resend_scheduled")
|
||
except Exception as e:
|
||
logger.warning("stale_ready_tokens_resend_schedule_failed", error=str(e))
|
||
|
||
# 2026-04-16 Claude Sonnet 4.6: 自動 AI 分析 Sweeper(每 90 秒)
|
||
# 修復核心 Gap:Signal Worker 創建 Incident 後無人觸發 AI 分析
|
||
# 除非有人呼叫 GET /api/v1/incidents,否則 Incident 永遠沒有決策
|
||
# Sweeper 定期掃描無 decision token 的 INVESTIGATING incidents → 背景觸發
|
||
try:
|
||
from src.jobs.incident_analysis_sweeper import run_incident_analysis_sweeper
|
||
asyncio.create_task(run_incident_analysis_sweeper())
|
||
logger.info("incident_analysis_sweeper_scheduled", interval_sec=90)
|
||
except Exception as e:
|
||
logger.warning("incident_analysis_sweeper_schedule_failed", error=str(e))
|
||
|
||
# ADR-090 § 資產盤點 cron (2026-04-19 ogt + Claude Opus 4.7 Asia/Taipei)
|
||
# 每 1 小時掃 K8s pods → 寫 asset_inventory + asset_discovery_run + 7 維 coverage
|
||
# 解開 8 張 0 writer 表的第一個 (asset_inventory / asset_discovery_run / asset_coverage_snapshot)
|
||
try:
|
||
from src.jobs.asset_scanner_job import run_asset_scanner_loop
|
||
asyncio.create_task(run_asset_scanner_loop())
|
||
logger.info("asset_scanner_loop_scheduled", interval_sec=3600)
|
||
except Exception as e:
|
||
logger.warning("asset_scanner_loop_schedule_failed", error=str(e))
|
||
|
||
# ADR-090 § Rule Catalog Sync (2026-04-19 ogt + Claude Opus 4.7 Asia/Taipei)
|
||
# 每 1 小時從 Prometheus /api/v1/rules 拉 active rules → UPSERT alert_rule_catalog
|
||
# 解鎖 E3 Hermes 自動建規則: AI 需要 alert_rule_catalog 作為 baseline 才能提案修正
|
||
try:
|
||
from src.jobs.rule_catalog_sync_job import run_rule_catalog_sync_loop
|
||
asyncio.create_task(run_rule_catalog_sync_loop())
|
||
logger.info("rule_catalog_sync_loop_scheduled", interval_sec=3600)
|
||
except Exception as e:
|
||
logger.warning("rule_catalog_sync_loop_schedule_failed", error=str(e))
|
||
|
||
# ADR-090 § Phase 4 NemoTron 容量巡檢 MVP (2026-04-19 ogt + Claude Opus 4.7 Asia/Taipei)
|
||
# 每日 02:00 Taipei 撈 Prometheus node_exporter → 寫 host_capacity_snapshot + violations
|
||
# 解鎖: Phase 4 Holt-Winters 預測有歷史資料 / 容量趨勢分析
|
||
try:
|
||
from src.jobs.capacity_scanner_job import run_capacity_scanner_loop
|
||
asyncio.create_task(run_capacity_scanner_loop())
|
||
logger.info("capacity_scanner_loop_scheduled", daily_trigger_hour_taipei=2)
|
||
except Exception as e:
|
||
logger.warning("capacity_scanner_loop_schedule_failed", error=str(e))
|
||
|
||
# ADR-090 § 合規掃描 MVP (2026-04-19 ogt + Claude Opus 4.7 Asia/Taipei)
|
||
# 每日 03:00 Taipei 遍歷 asset_inventory → 寫 7 維 asset_compliance_snapshot
|
||
# MVP: secret_rotated 真實檢查,其他 6 維占位 'unknown',後續 agent 補
|
||
try:
|
||
from src.jobs.compliance_scanner_job import run_compliance_scanner_loop
|
||
asyncio.create_task(run_compliance_scanner_loop())
|
||
logger.info("compliance_scanner_loop_scheduled", daily_trigger_hour_taipei=3)
|
||
except Exception as e:
|
||
logger.warning("compliance_scanner_loop_schedule_failed", error=str(e))
|
||
|
||
# aider-watch v2 processor (2026-04-20 ADR-091)
|
||
# 消費 signals:aider:events stream → 建 incident + 寫 aider_events 表
|
||
try:
|
||
from src.workers.aider_event_processor import run_aider_event_processor_loop
|
||
logger.info("aider_event_processor: starting Redis stream consumer")
|
||
asyncio.create_task(run_aider_event_processor_loop())
|
||
except Exception as e:
|
||
logger.warning("aider_event_processor_schedule_failed", error=str(e))
|
||
|
||
# ADR-090 § Coverage Evaluator (2026-04-19 ogt + Claude Opus 4.7 Asia/Taipei)
|
||
# 每 1h 把 asset_coverage_snapshot 從 'unknown' 升級成 green/yellow/red
|
||
# 依據: Prometheus targets / alert_rule_catalog labels / knowledge_entries 覆蓋
|
||
try:
|
||
from src.jobs.coverage_evaluator_job import run_coverage_evaluator_loop
|
||
asyncio.create_task(run_coverage_evaluator_loop())
|
||
logger.info("coverage_evaluator_loop_scheduled", interval_sec=3600)
|
||
except Exception as e:
|
||
logger.warning("coverage_evaluator_loop_schedule_failed", error=str(e))
|
||
|
||
# ADR-090 § Rule Stats Updater (2026-04-19 ogt + Claude Opus 4.7 Asia/Taipei)
|
||
# 每 1h 從 incidents + approval_records 計算 rule 統計
|
||
# 解鎖 E3 Hermes: noise_rate > 0.5 的 rule 可被 AI 提案 deprecate
|
||
try:
|
||
from src.jobs.rule_stats_updater_job import run_rule_stats_updater_loop
|
||
asyncio.create_task(run_rule_stats_updater_loop())
|
||
logger.info("rule_stats_updater_loop_scheduled", interval_sec=3600)
|
||
except Exception as e:
|
||
logger.warning("rule_stats_updater_loop_schedule_failed", error=str(e))
|
||
|
||
# ADR-090 § Asset Change Tracker (2026-04-19 ogt + Claude Opus 4.7 Asia/Taipei)
|
||
# 每 1h 比對最近兩次 asset_discovery_run,寫 asset_change_event
|
||
# 解鎖: 資產變化歷史 (added/removed/lifecycle_changed),AI 可追蹤集群演進
|
||
try:
|
||
from src.jobs.asset_change_tracker_job import run_asset_change_tracker_loop
|
||
asyncio.create_task(run_asset_change_tracker_loop())
|
||
logger.info("asset_change_tracker_loop_scheduled", interval_sec=3600)
|
||
except Exception as e:
|
||
logger.warning("asset_change_tracker_loop_schedule_failed", error=str(e))
|
||
|
||
# ADR-090 § Hermes Rule Quality Advisor (2026-04-19 ogt + Claude Opus 4.7 Asia/Taipei)
|
||
# 每日 04:00 Taipei 分析 alert_rule_catalog.noise_rate,對高噪音規則推 Telegram 建議
|
||
# 統帥鐵律: AI 只推建議不自動改 review_status,人工決策 deprecate
|
||
try:
|
||
from src.jobs.hermes_rule_quality_job import run_hermes_rule_quality_loop
|
||
asyncio.create_task(run_hermes_rule_quality_loop())
|
||
logger.info("hermes_rule_quality_loop_scheduled", daily_trigger_hour_taipei=4)
|
||
except Exception as e:
|
||
logger.warning("hermes_rule_quality_loop_schedule_failed", error=str(e))
|
||
|
||
# ADR-090 § Phase 4 Capacity Forecaster (2026-04-19 ogt + Claude Opus 4.7 Asia/Taipei)
|
||
# 每日 05:00 Taipei 用 Prometheus predict_linear 預測未來 7d disk/mem/cpu 飽和
|
||
# 高風險 host 寫 aol(capacity_recommendation) + Telegram 建議
|
||
try:
|
||
from src.jobs.capacity_forecaster_job import run_capacity_forecaster_loop
|
||
asyncio.create_task(run_capacity_forecaster_loop())
|
||
logger.info("capacity_forecaster_loop_scheduled", daily_trigger_hour_taipei=5)
|
||
except Exception as e:
|
||
logger.warning("capacity_forecaster_loop_schedule_failed", error=str(e))
|
||
|
||
# ADR-076 Task 4: 每日 08:00 台北時間自動日度巡檢報告
|
||
# 2026-04-14 Claude Haiku 4.5 Asia/Taipei
|
||
try:
|
||
from src.services.report_generation_service import run_daily_report_loop
|
||
asyncio.create_task(run_daily_report_loop())
|
||
logger.info("daily_report_loop_scheduled", trigger_hour_taipei=8)
|
||
except Exception as e:
|
||
logger.warning("daily_report_loop_schedule_failed", error=str(e))
|
||
|
||
# ADR-073 P2 修復 2026-04-15: 逾期 Approval 自動結案(每小時)
|
||
# 確保 PENDING approval 超過 48h 後觸發 resolve_incident → KM 學習鏈閉環
|
||
try:
|
||
from src.jobs.approval_timeout_resolver import run_approval_timeout_resolver
|
||
asyncio.create_task(run_approval_timeout_resolver())
|
||
logger.info("approval_timeout_resolver_scheduled", interval_sec=3600)
|
||
except Exception as e:
|
||
logger.warning("approval_timeout_resolver_schedule_failed", error=str(e))
|
||
|
||
# T73: 已有完成證據但仍卡在 INVESTIGATING 的舊 incident 小批次收斂。
|
||
# 僅處理 auto-repair success / approval EXECUTION_SUCCESS / approval EXPIRED,
|
||
# 不自動關閉 manual_required 或單純 APPROVED 事件。
|
||
try:
|
||
from src.jobs.incident_lifecycle_reconciler import (
|
||
INTERVAL_SECONDS as INCIDENT_LIFECYCLE_RECONCILER_INTERVAL,
|
||
)
|
||
from src.jobs.incident_lifecycle_reconciler import (
|
||
run_incident_lifecycle_reconciler_loop,
|
||
)
|
||
asyncio.create_task(run_incident_lifecycle_reconciler_loop())
|
||
logger.info(
|
||
"incident_lifecycle_reconciler_scheduled",
|
||
interval_sec=INCIDENT_LIFECYCLE_RECONCILER_INTERVAL,
|
||
)
|
||
except Exception as e:
|
||
logger.warning("incident_lifecycle_reconciler_schedule_failed", error=str(e))
|
||
|
||
# AwoooP Ansible check-mode worker.
|
||
# 只執行 ansible-playbook --check --diff 並回寫 automation_operation_log;
|
||
# apply 仍必須走 approval gate,本 worker 不寫 auto_repair_executions。
|
||
try:
|
||
from src.jobs.awooop_ansible_check_mode_job import (
|
||
run_awooop_ansible_check_mode_loop,
|
||
)
|
||
asyncio.create_task(run_awooop_ansible_check_mode_loop())
|
||
logger.info(
|
||
"awooop_ansible_check_mode_worker_scheduled",
|
||
enabled=settings.ENABLE_AWOOOP_ANSIBLE_CHECK_MODE_WORKER,
|
||
interval_seconds=settings.AWOOOP_ANSIBLE_CHECK_MODE_INTERVAL_SECONDS,
|
||
)
|
||
except Exception as e:
|
||
logger.warning("awooop_ansible_check_mode_worker_schedule_failed", error=str(e))
|
||
|
||
# ADR-083 Phase 3: Evolver Agent(每日)— Playbook 自動合併 + 低信任封存
|
||
# 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 3 初始建立
|
||
try:
|
||
from src.services.playbook_evolver import run_evolver_loop
|
||
asyncio.create_task(run_evolver_loop())
|
||
logger.info("evolver_loop_scheduled", interval_sec=86400)
|
||
except Exception as e:
|
||
logger.warning("evolver_loop_schedule_failed", error=str(e))
|
||
|
||
# ADR-104 T2: LLM Playbook DRAFT governance(每小時)
|
||
try:
|
||
from src.jobs.playbook_generation_governance_job import (
|
||
run_playbook_generation_governance_loop,
|
||
)
|
||
asyncio.create_task(run_playbook_generation_governance_loop())
|
||
logger.info(
|
||
"playbook_generation_governance_loop_scheduled",
|
||
interval_sec=settings.PLAYBOOK_DRAFT_GOVERNANCE_INTERVAL_SECONDS,
|
||
)
|
||
except Exception as e:
|
||
logger.warning("playbook_generation_governance_loop_schedule_failed", error=str(e))
|
||
|
||
# ADR-083 Phase 3: 知識遺忘 Job(每日)— 30d 未引用 KB entry 標記 archived
|
||
# 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 3 初始建立
|
||
try:
|
||
from src.jobs.knowledge_decay_job import run_knowledge_decay_loop
|
||
asyncio.create_task(run_knowledge_decay_loop())
|
||
logger.info("knowledge_decay_loop_scheduled", interval_sec=86400)
|
||
except Exception as e:
|
||
logger.warning("knowledge_decay_loop_schedule_failed", error=str(e))
|
||
|
||
# C1 P1-1 2026-04-28 ogt + Claude Sonnet 4.6: KM Backfill Reconciler(每 5 分鐘)
|
||
# 補救 _backfill_path_a_approval 失敗寫入 km:backfill:dlq 的 Path A related_approval_id 回填
|
||
# Feature Flag: ENABLE_KM_BACKFILL_RECONCILER=false 停用(回滾用)
|
||
try:
|
||
from src.jobs.km_backfill_reconciler_job import run_km_backfill_reconciler_loop
|
||
asyncio.create_task(run_km_backfill_reconciler_loop())
|
||
logger.info("km_backfill_reconciler_loop_scheduled", interval_sec=300)
|
||
except Exception as e:
|
||
logger.warning("km_backfill_reconciler_loop_schedule_failed", error=str(e))
|
||
|
||
# W2 PR-R2 2026-04-28 ogt + Claude Sonnet 4.6: AOL → alert_rule_catalog EWMA Writeback(每 1 小時)
|
||
# 飛輪斷鏈 C2 修復:automation_operation_log 執行結果回灌 alert_rule_catalog.confidence
|
||
# Feature Flag: ENABLE_AOL_WRITEBACK_JOB=false 預設停用(人工驗證後再開)
|
||
# ADR-091 Task T2
|
||
try:
|
||
from src.jobs.aol_to_catalog_writeback_job import run_aol_writeback_loop
|
||
asyncio.create_task(run_aol_writeback_loop())
|
||
logger.info("aol_to_catalog_writeback_loop_scheduled", interval_sec=3600)
|
||
except Exception as e:
|
||
logger.warning("aol_to_catalog_writeback_loop_schedule_failed", error=str(e))
|
||
|
||
# ADR-087 Phase 6: KB 腐爛清理(月度)— 每月 1 號 03:00 台北時間
|
||
# 掃描 knowledge_entries 中腐爛條目(廢棄 K8s API / Prometheus pattern / 180d 未引用)
|
||
# 2026-04-27 P3.1-T3 by Claude
|
||
try:
|
||
from src.utils.timezone import now_taipei
|
||
|
||
async def _run_kb_rot_cleaner_loop() -> None:
    """Run the KB rot cleaner at 03:00 on the 1st of each month, forever.

    Each iteration takes the current time from ``now_taipei()``, computes the
    next "day 1, 03:00" slot, sleeps until then and runs the cleaner. A failed
    cleaner run is logged and the loop moves on to the next slot; any other
    error in the iteration is logged and followed by a one-hour pause.

    The loop ends quietly when the task is cancelled while scheduling, sleeping
    until the slot, or running the cleaner. A cancellation that arrives during
    the one-hour error pause is not caught here and propagates.
    """
    from src.jobs.kb_rot_cleaner import get_kb_rot_cleaner

    # Time-of-day fields shared by every branch below.
    three_am = {"hour": 3, "minute": 0, "second": 0, "microsecond": 0}

    while True:
        try:
            current = now_taipei()
            if current.day == 1 and current.hour < 3:
                # Already on the 1st but before 03:00: run later today.
                target = current.replace(**three_am)
            elif current.month == 12:
                # December rolls over to January of the following year.
                target = current.replace(
                    year=current.year + 1, month=1, day=1, **three_am
                )
            else:
                target = current.replace(month=current.month + 1, day=1, **three_am)

            delay = (target - current).total_seconds()
            logger.info(
                "kb_rot_cleaner_next_run",
                next_run=target.isoformat(),
                sleep_sec=int(delay),
            )
            await asyncio.sleep(delay)

            try:
                outcome = await get_kb_rot_cleaner().run()
                logger.info(
                    "kb_rot_cleaner_completed",
                    stale_count=outcome.stale_count,
                    total=outcome.total_scanned,
                )
            except Exception as run_error:
                logger.exception("kb_rot_cleaner_failed", error=str(run_error))
        except asyncio.CancelledError:
            return
        except Exception as loop_error:
            logger.exception("kb_rot_cleaner_loop_error", error=str(loop_error))
            await asyncio.sleep(3600)  # retry after 1h
|
||
|
||
asyncio.create_task(_run_kb_rot_cleaner_loop())
|
||
logger.info("kb_rot_cleaner_loop_scheduled", trigger="monthly_day1_03h_taipei")
|
||
except Exception as e:
|
||
logger.warning("kb_rot_cleaner_loop_schedule_failed", error=str(e))
|
||
|
||
# ADR-083 Phase 3: Fine-tune JSONL 匯出(每週)— EvidenceSnapshot × AgentSession → JSONL
|
||
# 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 3 初始建立
|
||
try:
|
||
from src.services.finetune_exporter import run_finetune_export_loop
|
||
asyncio.create_task(run_finetune_export_loop())
|
||
logger.info("finetune_export_loop_scheduled", interval_sec=604800)
|
||
except Exception as e:
|
||
logger.warning("finetune_export_loop_schedule_failed", error=str(e))
|
||
|
||
# Phase 4 ADR-084: 主動巡檢每 5 分鐘執行一次
|
||
# 協調 DynamicBaselineService + LogAnomalyDetector + TrendPredictor
|
||
# Shadow Mode 控制:AIOPS_P4_SHADOW_MODE=True 時只記錄,不觸發 Alert
|
||
# 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 4 初始建立
|
||
try:
|
||
from src.services.proactive_inspector import run_proactive_inspector_loop
|
||
asyncio.create_task(run_proactive_inspector_loop())
|
||
logger.info("proactive_inspector_loop_scheduled", interval_sec=300)
|
||
except Exception as e:
|
||
logger.warning("proactive_inspector_schedule_failed", error=str(e))
|
||
|
||
# ADR-087 Phase 6: 離線回放(每 7 天)— 決策一致率基線
|
||
# 2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 6 初始建立
|
||
try:
|
||
from src.jobs.offline_replay_service import run_offline_replay_loop
|
||
asyncio.create_task(run_offline_replay_loop())
|
||
logger.info("offline_replay_loop_scheduled", interval_sec=604800)
|
||
except Exception as e:
|
||
logger.warning("offline_replay_loop_schedule_failed", error=str(e))
|
||
|
||
# ADR-092 (2026-04-20 ogt + Claude Opus 4.7): AI SLO Watchdog — 每 15 分鐘自健診
|
||
# MASTER §1.1:系統必須能感知自身故障,W-1 SLO + W-2 Telegram靜默 + W-3 飛輪成功率
|
||
try:
|
||
from src.jobs.ai_slo_watchdog_job import run_ai_slo_watchdog_loop
|
||
asyncio.create_task(run_ai_slo_watchdog_loop())
|
||
logger.info("ai_slo_watchdog_scheduled", interval_sec=900)
|
||
except Exception as e:
|
||
logger.warning("ai_slo_watchdog_schedule_failed", error=str(e))
|
||
|
||
# 2026-04-26 P2.2 by Claude — GovernanceAgent 4 項自檢(每 1 小時)
|
||
# MASTER P2.2:trust_drift / knowledge_degradation / llm_hallucination / execution_blast_radius
|
||
try:
|
||
from src.services.governance_agent import run_governance_loop
|
||
asyncio.create_task(run_governance_loop())
|
||
logger.info("governance_agent_scheduled", interval_sec=3600)
|
||
except Exception as e:
|
||
logger.warning("governance_agent_schedule_failed", error=str(e))
|
||
|
||
# 2026-05-03 ogt + Claude Sonnet 4.6(亞太): GovernanceDispatcher Wave 2E(每 30s poll)
|
||
try:
|
||
from src.services.governance_dispatcher import run_governance_dispatcher_loop
|
||
asyncio.create_task(run_governance_dispatcher_loop())
|
||
logger.info("governance_dispatcher_scheduled", interval_sec=30)
|
||
except Exception as e:
|
||
logger.warning("governance_dispatcher_schedule_failed", error=str(e))
|
||
|
||
# T90 2026-05-19 ogt + Codex: Hermes KB growth worker(每 5 分鐘)
|
||
# 消費 knowledge_degradation 的 hermes_kb_growth_healthcheck dispatch,
|
||
# 只產生 REVIEW 草稿並停在 owner review,不直接批准或發布 KM。
|
||
try:
|
||
from src.jobs.hermes_kb_growth_worker import run_hermes_kb_growth_loop
|
||
asyncio.create_task(run_hermes_kb_growth_loop())
|
||
logger.info("hermes_kb_growth_worker_scheduled", interval_sec=300)
|
||
except Exception as e:
|
||
logger.warning("hermes_kb_growth_worker_schedule_failed", error=str(e))
|
||
|
||
# 2026-04-25 P1.2 by Claude Engineer-A2 — failover 整合到 ai_router + lifespan
|
||
# OllamaFailoverManager + OllamaAutoRecoveryService 飛輪接線:
|
||
# failover 切換時 → recovery_callback → set_current_primary → Redis 持久化
|
||
# recovery service 每 30s 檢查 → 111 連續 3 次 HEALTHY → 自動切回 → clear_cache
|
||
# 順序:先取 singleton → wire callback → 啟動 recovery service(才能接收 callback)
|
||
try:
|
||
from src.services.ollama_auto_recovery import get_ollama_auto_recovery_service
|
||
from src.services.ollama_failover_manager import get_ollama_failover_manager
|
||
|
||
_failover_mgr = get_ollama_failover_manager()
|
||
_recovery_svc = get_ollama_auto_recovery_service()
|
||
|
||
# wire callback:failover 切換時通知 recovery service 更新 current_primary
|
||
_failover_mgr.set_recovery_callback(_recovery_svc.set_current_primary)
|
||
|
||
# 2026-04-26 critic-H3 hotfix by Claude Opus 4.7 — alerter 必須在 recovery 啟動前注入
|
||
# 原順序:start() 後才注入 → recovery bootstrap immediate-check 若觸發 alert_recovery,
|
||
# alerter 還沒注入 Redis → dedup fail-open,告警會送出且無 dedup 保護(重複告警風險)
|
||
# 修法:configure_alerter() 提前到 start() 之前;Redis pool 在 lifespan 早期已就緒
|
||
try:
|
||
from src.core.redis_client import get_redis
|
||
from src.services.failover_alerter import configure_alerter
|
||
configure_alerter(get_redis())
|
||
logger.info("failover_alerter_configured")
|
||
except Exception as _alerter_err:
|
||
logger.warning("failover_alerter_configure_failed", error=str(_alerter_err))
|
||
|
||
# 啟動 recovery service(從 Redis bootstrap current_primary,並啟動背景監控)
|
||
await _recovery_svc.start()
|
||
|
||
logger.info("ollama_failover_system_started")
|
||
except Exception as e:
|
||
logger.warning("ollama_failover_system_start_failed", error=str(e))
|
||
|
||
# 2026-04-27 P3.2.2 by Claude — AI Provider 版本追蹤(每 1 小時)
|
||
# 探測 5 Provider(ollama/ollama_local/gemini/claude/openclaw_nemo)版本
|
||
# 寫入 ai_provider_version_history;版本變更時 log warning,P3.2.3 alerter 後續整合
|
||
try:
|
||
async def _run_model_version_tracker_loop() -> None:
|
||
from src.services.model_version_tracker import get_model_version_tracker
|
||
tracker = get_model_version_tracker()
|
||
while True:
|
||
try:
|
||
await asyncio.sleep(3600) # 每 1 小時
|
||
result = await tracker.run_probe_cycle()
|
||
logger.info(
|
||
"model_version_probe_cycle_done",
|
||
probed=result["probed"],
|
||
changed=result["changed"],
|
||
)
|
||
except asyncio.CancelledError:
|
||
break
|
||
except Exception as _loop_err:
|
||
logger.exception("model_version_tracker_loop_error", error=str(_loop_err))
|
||
await asyncio.sleep(60) # 錯誤後 1 分鐘重試
|
||
|
||
asyncio.create_task(_run_model_version_tracker_loop())
|
||
logger.info("model_version_tracker_scheduled", interval_sec=3600)
|
||
except Exception as e:
|
||
logger.warning("model_version_tracker_schedule_failed", error=str(e))
|
||
|
||
# AwoooP Phase 4 (2026-05-04 ogt + Claude Sonnet 4.6): Platform Worker(Shadow Mode Shell)
|
||
# ADR-106 Strangler Fig Phase 4:SKIP LOCKED run worker + stale run reaper
|
||
# Shadow mode:is_shadow=True,0 user-visible response,0 destructive tool call
|
||
try:
|
||
from src.workers.platform_worker import start_platform_worker
|
||
await start_platform_worker()
|
||
logger.info("platform_worker_started", mode="shadow")
|
||
except Exception as e:
|
||
logger.warning("platform_worker_start_failed", error=str(e))
|
||
|
||
yield
|
||
|
||
# Shutdown
|
||
# 2026-04-25 P1.2 by Claude Engineer-A2 — 優雅關閉 Ollama failover 背景監控
|
||
# 必須在 Redis pool 關閉之前停止(recovery service 可能仍在寫 Redis)
|
||
try:
|
||
from src.services.ollama_auto_recovery import get_ollama_auto_recovery_service
|
||
await get_ollama_auto_recovery_service().stop()
|
||
logger.info("ollama_failover_system_stopped")
|
||
except Exception as e:
|
||
logger.warning("ollama_failover_system_stop_failed", error=str(e))
|
||
|
||
# 2026-04-27 Wave8-X3 by Claude — B25/B26 drain fix
|
||
# K8s rolling restart:等待 auto_repair fire-and-forget tasks 完成後再關閉
|
||
# 確保 _verify_and_learn / runbook_generator 寫入不被 SIGTERM cancel
|
||
try:
|
||
from src.services.auto_repair_service import get_auto_repair_service
|
||
_repair_svc = get_auto_repair_service()
|
||
if hasattr(_repair_svc, "drain_pending_tasks"):
|
||
_drain_result = await _repair_svc.drain_pending_tasks(timeout=60.0)
|
||
logger.info("auto_repair_drain_complete", **_drain_result)
|
||
except Exception as e:
|
||
logger.warning("auto_repair_drain_failed", error=str(e))
|
||
|
||
# AwoooP Phase 4: Platform Worker 優雅停機(2026-05-04 ogt)
|
||
try:
|
||
from src.workers.platform_worker import stop_platform_worker
|
||
await stop_platform_worker()
|
||
logger.info("platform_worker_stopped")
|
||
except Exception as e:
|
||
logger.warning("platform_worker_stop_failed", error=str(e))
|
||
|
||
# Phase 6.1: 關閉 Signal Worker (先關閉 Consumer)
|
||
await close_signal_worker()
|
||
await close_worker_redis_pool()
|
||
await publisher.stop()
|
||
await close_executor()
|
||
await close_openclaw()
|
||
# Phase 5.4: Close Telegram Gateway
|
||
telegram_gw = get_telegram_gateway()
|
||
await telegram_gw.close()
|
||
# Phase 33: Close RAG Service httpx client (ADR-067)
|
||
from src.services.knowledge_rag_service import get_knowledge_rag_service
|
||
await get_knowledge_rag_service().close()
|
||
# Phase 5: Close HTTP Clients (統帥鐵律: 連線池回收)
|
||
await close_all_http_clients()
|
||
# Phase 6.1.1: Close Redis Pool (統帥鐵律: Redis 連線池回收)
|
||
await close_redis_pool()
|
||
await close_db()
|
||
shutdown_telemetry()
|
||
logger.info("api_shutdown", version=settings.VERSION)
|
||
|
||
|
||
# =============================================================================
|
||
# FastAPI Application
|
||
# =============================================================================
|
||
# The ASGI application object. Interactive docs and the OpenAPI schema are all
# served under the /api/v1 prefix; startup/shutdown work is delegated to the
# `lifespan` context manager defined above.
app = FastAPI(
    lifespan=lifespan,
    title="AWOOOI API",
    version=settings.VERSION,
    description="AWOOOI 智能運維平台 API - 由 leWOOOgo Engine 驅動",
    openapi_url="/api/v1/openapi.json",
    docs_url="/api/v1/docs",
    redoc_url="/api/v1/redoc",
)
|
||
|
||
|
||
# =============================================================================
|
||
# OpenTelemetry Instrumentation (可觀測性鐵律)
|
||
# 必須在 Middleware 之前初始化,確保追蹤完整性
|
||
# 優雅降級: 失敗不影響 API 啟動
|
||
# =============================================================================
|
||
# Instrument the app with OpenTelemetry before any middleware is registered.
# setup_telemetry() reports success as a truthy value; a falsy result is only
# logged, so the API keeps starting without tracing.
otel_enabled = setup_telemetry(app)
if not otel_enabled:
    logger.warning("otel_disabled", reason="initialization failed or disabled")
else:
    logger.info(
        "otel_initialized",
        service=settings.OTEL_SERVICE_NAME,
        endpoint=settings.OTEL_EXPORTER_OTLP_ENDPOINT,
    )
|
||
|
||
|
||
# =============================================================================
|
||
# Middleware
|
||
# =============================================================================
|
||
|
||
# 2026-04-03 ogt: Nginx reverse-proxy fix — make FastAPI trust X-Forwarded-Proto
# so that redirects (e.g. /api/v1/knowledge) behind the HTTPS proxy do not
# produce an http:// Location header.
# NOTE(review): trusted_hosts="*" accepts forwarded headers from any peer;
# presumably the API is only reachable through the proxy — confirm.
app.add_middleware(ProxyHeadersMiddleware, trusted_hosts="*")

# CORS - Strict Whitelist (Iron Law #2)
# NO wildcards, NO UAT: origins come from settings, methods and headers are
# enumerated explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_credentials=True,
    allow_origins=settings.CORS_ORIGINS,
    allow_methods=["GET", "POST", "PUT", "DELETE", "PATCH"],
    allow_headers=["Authorization", "Content-Type", "X-Request-ID"],
    expose_headers=["X-Request-ID"],
)
|
||
|
||
|
||
@app.middleware("http")
async def request_logging_middleware(request: Request, call_next):
    """
    Emit structured logs around every HTTP request.

    For each request this middleware:
    - takes the request ID from the ``X-Request-ID`` header, or generates a UUID4
    - resolves the project context and installs it while the request is handled
    - binds request ID, method, path and project context into structlog contextvars
    - logs the response status code and the duration in milliseconds
    - echoes the request ID back in the ``X-Request-ID`` response header
    """
    from time import perf_counter

    from src.core.context import clear_project_context, get_current_project_context, set_project_context

    header_request_id = request.headers.get("X-Request-ID")
    request_id = header_request_id if header_request_id else str(uuid4())
    project_id, source = _resolve_request_project_context(request)

    context_tokens = set_project_context(project_id=project_id, source=source, request_id=request_id)
    started_at = perf_counter()

    # Drop whatever the previous request bound, then bind this request's
    # fields so every log line emitted while handling it carries them.
    structlog.contextvars.clear_contextvars()
    current_context = get_current_project_context()
    structlog.contextvars.bind_contextvars(
        request_id=request_id,
        method=request.method,
        path=request.url.path,
        project_id=current_context["project_id"],
        project_context_source=current_context["source"],
    )

    http_log = get_logger("awoooi.http")
    http_log.debug("request_start")

    try:
        response = await call_next(request)
    finally:
        # Always undo the project context, even when the handler raised.
        clear_project_context(context_tokens)

    elapsed_ms = (perf_counter() - started_at) * 1000
    completion_fields = {
        "status_code": response.status_code,
        "duration_ms": round(elapsed_ms, 2),
        "project_id": current_context["project_id"],
        "project_context_source": current_context["source"],
        "has_project_context": bool(current_context["project_id"]),
    }
    http_log.info("request_complete", **completion_fields)

    # Let the caller correlate the response with the server-side logs.
    response.headers["X-Request-ID"] = request_id
    return response
|
||
|
||
|
||
@app.get("/api/v1/security/db-context-guard")
async def db_context_guard() -> dict:
    """
    Context guard endpoint (P1-1 runtime evidence).

    - Without a project context (X-Project-ID / X-Tenant-ID / project_id query)
      the request is expected to be answered with 401, showing that RLS is
      fail-closed. NOTE(review): the rejection presumably comes from
      ``get_db_context`` — confirm.
    - With a context, a snapshot of it is returned to support auditing.
    """
    from src.core.context import get_current_project_context
    from src.db.base import get_db_context

    # The snapshot is taken while the DB context is open, as in the original flow.
    async with get_db_context():
        context_snapshot = get_current_project_context()
        return {"status": "ok", "project_context": context_snapshot, "source": "runtime_guard"}
|
||
|
||
|
||
# =============================================================================
|
||
# Exception Handlers
|
||
# =============================================================================
|
||
|
||
|
||
@app.exception_handler(HTTPException)
async def http_exception_handler(_request: Request, exc: HTTPException) -> JSONResponse:
    """Return an HTTPException as-is: same status code, detail and headers.

    Needed for the P1-1 fail-closed evidence: without this handler, intentional
    responses such as 401/403 would fall through to the generic exception
    handler and be reported as 500.
    """
    payload = {"detail": exc.detail}
    return JSONResponse(content=payload, status_code=exc.status_code, headers=exc.headers)
|
||
|
||
|
||
@app.exception_handler(Exception)
async def global_exception_handler(_request: Request, exc: Exception) -> JSONResponse:
    """
    Last-resort handler for exceptions nothing else handled.

    The exception is reported to Sentry and logged in full (type, message and
    traceback) through structlog; the client only receives a generic 500 body,
    so no internal details are exposed.
    """
    # Explicit capture; takes effect only when the Sentry SDK has been initialised.
    sentry_sdk.capture_exception(exc)

    error_log = get_logger("awoooi.error")
    error_log.exception("unhandled_exception", exc_type=type(exc).__name__, exc_message=str(exc))

    safe_body = {"code": "INTERNAL_ERROR", "message": "An internal error occurred"}
    return JSONResponse(status_code=500, content=safe_body)
|
||
|
||
|
||
# =============================================================================
|
||
# API Routers - Path-based routing (/api/v1/*)
|
||
# =============================================================================
|
||
|
||
def _mount_api_routers() -> None:
    """Register every API router on ``app``, in the order listed below.

    Each entry is ``(router, prefix, tag)``. Registration order is kept exactly
    as declared because it determines route-matching precedence.
    """
    v1 = "/api/v1"
    routes = (
        # New v1 API routes
        (health_v1.router, v1, "Health"),
        (csrf_v1.router, v1, "Security"),  # Phase 20
        (dashboard_v1.router, v1, "Dashboard"),
        (approvals_v1.router, v1, "HITL Approvals"),
        (ai_v1.router, v1, "AI Decision"),
        (ai_governance_v1.router, v1, "AI Governance"),  # 2026-05-02: /governance page
        (ai_slo_v1.router, v1, "AI SLO"),  # Phase 6 ADR-087
        (aiops_kpi_v1.router, v1, "AIOps KPI"),  # ADR-090 § Phase 7 Dashboard
        (aiops_timeline_v1.router, v1, "AIOps Timeline"),  # 2026-04-27 Wave8-X3 B4
        (webhooks_v1.router, v1, "Webhooks"),
        (timeline_v1.router, v1, "Timeline"),
        (audit_logs_v1.router, v1, "Audit Logs"),
        # 2026-04-09 Claude Sonnet 4.6: alert_operation_log query API (Sprint 5.2)
        (alert_operation_logs_v1.router, v1, "Alert Operation Logs"),
        (aider_events_v1.router, v1, "Aider Watch"),  # aider-watch v2 ADR-091
        (telegram_v1.router, v1, "Telegram Gateway"),  # Phase 5.4
        (telegram_webhook_v1.router, v1, "Telegram Webhook"),  # ADR-094: webhook entry (reserved for WS4 Hermes NL)
        (metrics_v1.router, v1, "Gold Metrics"),  # Phase 7
        (incidents_v1.router, v1, "Incidents"),  # Phase 6.4: Decision Proposal
        (proposals_v1.router, v1, "Proposals"),  # Phase 6.4h: Proposals CRUD
        (agents_v1.router, v1, "Agent Teams"),  # Phase 9.5: Agent Teams
        (stats_v1.router, v1, "Statistics"),  # Phase 6.5: Statistics Analytics
        (monitoring_v1.router, v1, "Monitoring"),  # 2026-04-03: monitoring tool status
        (gitea_webhook_v1.router, v1, "Gitea Webhook"),  # ADR-059: Gitea → OpenClaw
        (playbooks_v1.router, v1, "Playbooks"),  # #7: playbook extraction
        (auto_repair_v1.router, v1, "Auto Repair"),  # #8: automatic escalation decisions
        (drift_v1.router, v1, "Drift Detection"),  # Phase 25 P2: Config Drift Detection
        (rag_v1.router, v1, "RAG Knowledge Base"),  # Phase 33 ADR-067: RAG knowledge base
        (errors_v1.router, v1, "Errors"),  # #40: Sentry errors BFF API
        (sentry_webhook_v1.router, v1, "Sentry Webhook"),  # Phase 10.2.1: Sentry → Telegram
        (signoz_webhook_v1.router, v1, "SignOz Webhook"),  # Phase 21: SignOz → Telegram (ADR-037)
        (notifications_v1.router, v1, "Notifications"),  # 2026-04-10: notification channel status
        (terminal_v1.router, v1, "Omni-Terminal"),  # Phase 19.1: Omni-Terminal SSE
        (learning_v1.router, v1, "Learning"),  # Phase D-G P0: learning system API
        (knowledge_v1.router, v1, "Knowledge Base"),  # KB Phase 1: Knowledge Base API
        # Phase 6.4g: lewooogo-brain (legacy); mounted without a prefix
        # ("" is include_router's default).
        (proposals_router.router, "", "Proposals (Legacy)"),
        # Legacy routes (to be migrated to api/v1/)
        (plugins.router, "/api/v1/plugins", "Plugins"),
        (pipelines.router, "/api/v1/pipelines", "Pipelines"),
        (agent.router, "/api/v1/agent", "Agent"),
        (notifications.router, "/api/v1/notifications", "Notifications"),
        # AwoooP Phase 4 (2026-05-04 ogt): Platform Shell — Shadow Mode Run API
        (platform_v1.router, "/api/v1/platform", "AwoooP Platform"),
    )
    for router, prefix, tag in routes:
        app.include_router(router, prefix=prefix, tags=[tag])


_mount_api_routers()
|
||
|
||
|
||
# =============================================================================
|
||
# Prometheus Metrics Endpoint
|
||
# =============================================================================
|
||
# 2026-03-31 ogt: 暴露 Prometheus 指標供告警系統使用
|
||
|
||
|
||
@app.get("/metrics", include_in_schema=False)
async def prometheus_metrics() -> Response:
    """Prometheus metrics endpoint for alerting.

    Returns the default ``prometheus_client`` registry output followed by the
    flywheel metrics and the ADR-100 SLO metrics, as one text payload with the
    Prometheus exposition content type. Each extra section is best-effort: a
    failure is logged as a warning (with the error text) and that section is
    left out, so a scrape never fails because of one broken producer.
    """
    # 2026-05-19 Codex — T85 Alert Chain DB evidence refresh.
    # record_alert_chain_success() is a process-local gauge; the first scrape
    # after a deployment may arrive before any new webhook, which makes the
    # smoke test conclude the metric does not exist. Backfill last_success from
    # the durable evidence in AwoooP inbound / alert_operation_log first.
    try:
        await get_alert_chain_metrics_service().refresh_last_success_gauge()
    except Exception as exc:
        logger.warning("prometheus_metrics_alert_chain_evidence_error", error=str(exc))

    content = generate_latest().decode("utf-8")

    # 2026-05-07 ogt + Claude Sonnet 4.6 — INC-20260507-99ADF2 fix.
    # Flywheel metrics (awoooi_flywheel_*) used to be exposed only at
    # /api/v1/stats/flywheel/metrics, so the awoooi-api Prometheus job scraping
    # /metrics never saw them and FlywheelExecutionRateMissing fired forever.
    # Appending them here lets the existing scrape job pick them up.
    try:
        flywheel_metrics = await get_flywheel_stats_service().compute()
        content += flywheel_metrics.to_prometheus_lines()
    except Exception as exc:
        # Record the cause, like the sibling branches do; a bare event name
        # makes a missing flywheel section impossible to diagnose.
        logger.warning("prometheus_metrics_flywheel_error", error=str(exc))

    # 2026-05-14 Codex — T18 ADR-100 SLO emitter.
    # GovernanceAgent reads Prometheus recording rules; if /metrics does not
    # emit the underlying DB totals, the sli:* rules are empty and
    # governance_slo_data_gap is re-raised every hour.
    try:
        content += await get_adr100_slo_metrics_service().to_prometheus_lines()
    except Exception as exc:
        logger.warning("prometheus_metrics_adr100_slo_error", error=str(exc))

    return Response(content=content, media_type=CONTENT_TYPE_LATEST)
|
||
|
||
|
||
# =============================================================================
|
||
# Root Endpoint
|
||
# =============================================================================
|
||
|
||
|
||
@app.get("/", include_in_schema=False)
async def root() -> dict:
    """Describe the API: name, version, environment and main entry-point URLs."""
    api_info = {
        "name": "AWOOOI API",
        "version": settings.VERSION,
        "environment": settings.ENVIRONMENT,
    }
    # Well-known paths a client can follow from the root.
    api_info.update(
        {
            "docs": "/api/v1/docs",
            "health": "/api/v1/health",
            "dashboard": "/api/v1/dashboard",
            "stream": "/api/v1/dashboard/stream",
        }
    )
    return api_info
|
||
|
||
|
||
# =============================================================================
|
||
# Entry Point
|
||
# =============================================================================
|
||
|
||
if __name__ == "__main__":
    # Direct-execution entry point: serve the app with uvicorn on all
    # interfaces, port 8000. Auto-reload follows settings.DEBUG and the log
    # level follows settings.LOG_LEVEL (lower-cased for uvicorn).
    import uvicorn

    server_options = {
        "host": "0.0.0.0",
        "port": 8000,
        "reload": settings.DEBUG,
        "log_level": settings.LOG_LEVEL.lower(),
    }
    uvicorn.run("src.main:app", **server_options)
|