All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 8m22s
實證 rule_stats 跑完後發現 2 條 100% noise_rate 規則:
- PostgreSQLDiskGrowthRate (tp=0 fp=2)
- NoAlertsReceived2Hours (tp=0 fp=1)
加上 MoWoooWorkDown (33%), KubePodCrashLooping (25%)
新增 hermes_rule_quality_job.py (~210 行):
每日 04:00 Taipei 分析 alert_rule_catalog:
- threshold: noise_rate >= 0.7 AND 樣本 >= 5
- 為每條寫 aol('rule_rejected', proposed_action='review_or_deprecate')
- 推 Telegram 摘要給 SRE group
統帥鐵律對齊:
✅ 不自動改 review_status (人工決策 deprecate,AI 只推建議)
✅ threshold 作為「觸發討論」而非「最終決策」
✅ aol(rule_rejected) 留 trail,未來可升級 LLM 辯證
解鎖 E3 Hermes 基礎: 後續可加 LLM 分析假報真因 (expr 缺 for: window、
label match 太寬泛、metric 本身 noisy 等),產出具體改進建議.
Wire main.py lifespan asyncio.create_task()
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
802 lines
34 KiB
Python
802 lines
34 KiB
Python
"""
|
||
AWOOOI API - BFF Gateway
|
||
========================
|
||
ADR-005: BFF Architecture
|
||
ADR-006: AI Fallback Strategy
|
||
|
||
Four Iron Laws:
|
||
1. Async-First - All handlers are async def
|
||
2. CORS Whitelist - Strict origin control (NO wildcards)
|
||
3. Pydantic Config - Type-safe settings with validation
|
||
4. structlog - Structured JSON logging
|
||
|
||
Observability Stack:
|
||
- OpenTelemetry → SignOz (Traces + Logs + Metrics)
|
||
- Sentry SDK → Sentry Self-Hosted (Error Tracking + Stack Traces)
|
||
|
||
Version: 1.0.0
|
||
Date: 2026-03-20
|
||
"""
|
||
|
||
import asyncio
import os
import time
import uuid
from collections.abc import AsyncGenerator
from contextlib import asynccontextmanager

import sentry_sdk
import structlog
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response
from prometheus_client import CONTENT_TYPE_LATEST, generate_latest
from sentry_sdk.integrations.fastapi import FastApiIntegration
from sentry_sdk.integrations.starlette import StarletteIntegration
from uvicorn.middleware.proxy_headers import ProxyHeadersMiddleware

from src.api.v1 import agents as agents_v1  # Phase 9.5: Agent Teams API
from src.api.v1 import ai as ai_v1
from src.api.v1 import ai_slo as ai_slo_v1  # Phase 6 ADR-087: AI SLO self-governance
from src.api.v1 import approvals as approvals_v1
from src.api.v1 import alert_operation_logs as alert_operation_logs_v1
from src.api.v1 import audit_logs as audit_logs_v1
from src.api.v1 import auto_repair as auto_repair_v1  # #8: automatic escalation decisions
from src.api.v1 import csrf as csrf_v1  # Phase 20: CSRF Protection
from src.api.v1 import dashboard as dashboard_v1
from src.api.v1 import errors as errors_v1  # #40: Sentry errors BFF API
from src.api.v1 import (
    gitea_webhook as gitea_webhook_v1,  # ADR-059: Gitea → OpenClaw (GitHub → Gitea migration)
)

# Import API routers
from src.api.v1 import health as health_v1
from src.api.v1 import incidents as incidents_v1  # Phase 6.4: Decision Proposal
from src.api.v1 import knowledge as knowledge_v1  # KB Phase 1: Knowledge Base
from src.api.v1 import learning as learning_v1  # Phase D-G P0: Learning API
from src.api.v1 import metrics as metrics_v1  # Phase 7: Gold Metrics (live data)
from src.api.v1 import playbooks as playbooks_v1  # #7: Playbook extraction
from src.api.v1 import proposals as proposals_v1  # Phase 6.4h: Proposals CRUD API
from src.api.v1 import (
    sentry_webhook as sentry_webhook_v1,  # Phase 10.2.1: Sentry → Telegram
)
from src.api.v1 import (
    signoz_webhook as signoz_webhook_v1,  # Phase 21: SignOz → Telegram (ADR-037)
)
from src.api.v1 import drift as drift_v1  # Phase 25 P2: Config Drift Detection
from src.api.v1 import rag as rag_v1  # Phase 33 ADR-067: RAG knowledge base
from src.api.v1 import monitoring as monitoring_v1  # 2026-04-03: monitoring tool status
from src.api.v1 import notifications as notifications_v1  # 2026-04-10: notification channel status
from src.api.v1 import stats as stats_v1  # Phase 6.5: Statistics Analytics
from src.api.v1 import telegram as telegram_v1  # Phase 5.4: Telegram Gateway
from src.api.v1 import terminal as terminal_v1  # Phase 19.1: Omni-Terminal SSE
from src.api.v1 import timeline as timeline_v1
from src.api.v1 import webhooks as webhooks_v1
from src.core.config import settings
from src.core.feature_flags import aiops_flags  # ADR-080: AI autonomy flywheel feature-flag startup validation
from src.core.http_client import close_all_http_clients, init_all_http_clients
from src.core.logging import get_logger, setup_logging
from src.core.redis_client import close_redis_pool, init_redis_pool
from src.core.sse import get_publisher
from src.core.telemetry import setup_telemetry, shutdown_telemetry

# CTO-201: Database & Executor
from src.db.base import close_db, init_db

# Phase 6.4g: lewooogo-brain building-block routes
from src.routers import proposals as proposals_router

# Legacy route imports (to be migrated)
from src.routes import agent, notifications, pipelines, plugins
from src.services.executor import close_executor

# Phase 5: OpenClaw AI Engine
from src.services.openclaw import close_openclaw
from src.services.telegram_gateway import get_telegram_gateway

# Phase 6.1: Event Bus (Signal Worker)
from src.workers import close_signal_worker, init_signal_worker

# =============================================================================
# Initialize Logging (MUST be first)
# =============================================================================
setup_logging()
logger = get_logger("awoooi.api")

# =============================================================================
# Sentry SDK Initialization (error tracking, complementing SignOz)
# Self-Hosted @ 192.168.0.110
# Division of labour: Sentry focuses on error tracking, SignOz on traces/logs/metrics
# Phase 15.3: Deep Linking - inject the OTEL trace_id so events can be correlated in SignOz
# =============================================================================
# Read straight from the environment; when unset, Sentry stays disabled.
SENTRY_DSN = os.getenv("SENTRY_DSN")


def _sentry_before_send(event, hint): # noqa: ARG001 - hint is Sentry callback signature
|
||
"""
|
||
Phase 15.3: Sentry → SignOz Deep Linking
|
||
|
||
在每個 Sentry event 中注入 OTEL trace_id,
|
||
讓 Sentry 錯誤能直接連結到 SignOz Trace。
|
||
"""
|
||
try:
|
||
from src.core.deep_linking import DeepLinking
|
||
from src.core.telemetry import get_current_trace_id
|
||
|
||
trace_id = get_current_trace_id()
|
||
if trace_id:
|
||
# 注入 trace_id 到 tags (Sentry UI 可搜尋)
|
||
if "tags" not in event:
|
||
event["tags"] = {}
|
||
event["tags"]["otel_trace_id"] = trace_id
|
||
event["tags"]["signoz_trace_url"] = DeepLinking.signoz_trace_url(trace_id)
|
||
|
||
# 注入到 contexts (詳情頁顯示)
|
||
if "contexts" not in event:
|
||
event["contexts"] = {}
|
||
event["contexts"]["signoz"] = {
|
||
"trace_id": trace_id,
|
||
"trace_url": DeepLinking.signoz_trace_url(trace_id),
|
||
"service": "awoooi-api",
|
||
}
|
||
except Exception:
|
||
# Deep Linking 失敗不應影響錯誤上報
|
||
pass
|
||
return event
|
||
|
||
|
||
# Initialise Sentry only when a DSN is configured; otherwise just log that it is off.
if SENTRY_DSN:
    sentry_sdk.init(
        dsn=SENTRY_DSN,
        environment=settings.ENVIRONMENT,
        release=f"awoooi-api@{settings.VERSION}",
        # Performance-monitoring sample rate (lowered in production)
        traces_sample_rate=0.1 if settings.ENVIRONMENT == "production" else 1.0,
        # Deep FastAPI integration
        integrations=[
            FastApiIntegration(transaction_style="endpoint"),
            StarletteIntegration(transaction_style="endpoint"),
        ],
        # Ignore common non-errors
        ignore_errors=[
            ConnectionRefusedError,
            TimeoutError,
        ],
        # Do not attach default personally identifiable information to events
        send_default_pii=False,
        # Phase 15.3: Deep Linking hook
        before_send=_sentry_before_send,
    )
    # 2026-04-05 Claude Code: unified tags, aligned with the Prometheus/auto_repair layer convention
    sentry_sdk.set_tag("layer", "k8s")
    sentry_sdk.set_tag("component", "api")
    sentry_sdk.set_tag("host", "k8s-awoooi-prod")
    sentry_sdk.set_tag("team", "backend")
    # Log only the part after the last "@" so the DSN key is not written to logs.
    logger.info("sentry_initialized", dsn=SENTRY_DSN.split("@")[-1])
else:
    logger.info("sentry_disabled", reason="SENTRY_DSN not configured")


# =============================================================================
# Application Lifespan
# =============================================================================

# Strong references to fire-and-forget background tasks. The event loop keeps
# only weak references to tasks, so a task nobody references may be garbage
# collected before it finishes.
_background_tasks: set[asyncio.Task] = set()

# Upper bound (seconds) on how long shutdown waits for cancelled tasks to unwind.
_BACKGROUND_TASK_SHUTDOWN_TIMEOUT_SEC = 5.0


def _spawn_background(coro) -> asyncio.Task:
    """Schedule *coro* as a background task and keep it referenced until it is done."""
    task = asyncio.create_task(coro)
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)
    return task


async def _cancel_background_tasks() -> None:
    """Cancel every unfinished background task and wait (bounded) for them to stop."""
    pending = [task for task in _background_tasks if not task.done()]
    for task in pending:
        task.cancel()
    if pending:
        # asyncio.wait never raises for task errors and honours the timeout, so
        # a task that ignores cancellation cannot hang shutdown.
        await asyncio.wait(pending, timeout=_BACKGROUND_TASK_SHUTDOWN_TIMEOUT_SEC)


@asynccontextmanager
async def lifespan(_app: FastAPI) -> AsyncGenerator[None, None]:
    """Application lifespan events.

    Startup (before ``yield``): initialises the database, HTTP clients, Redis
    pool, SSE publisher, Telegram gateway, MCP providers/registry and the
    signal worker, warms up working memory and trust scores, and schedules the
    background jobs. Optional steps are wrapped in ``try``/``except`` and only
    log a warning on failure; the unwrapped ``await`` calls abort startup if
    they raise.

    Shutdown (after ``yield``): cancels the background tasks scheduled here,
    then closes the components and connection pools.

    Args:
        _app: The FastAPI application (unused).
    """
    # Startup
    logger.info(
        "api_startup",
        version=settings.VERSION,
        environment=settings.ENVIRONMENT,
        mock_mode=settings.MOCK_MODE,
        cors_origins=settings.CORS_ORIGINS,
        ai_fallback_order=settings.AI_FALLBACK_ORDER,
        four_hosts=settings.four_hosts,
        kubeconfig=settings.KUBECONFIG_PATH,
    )

    # CTO-201: Initialize PostgreSQL database (Commander's iron law: SQLite is forbidden)
    await init_db()
    db_url = settings.DATABASE_URL
    logger.info(
        "database_initialized", url=db_url.split("@")[-1] if "@" in db_url else db_url
    )

    # Phase 5: Initialize HTTP Clients (ClickHouse, Ollama)
    # Commander's iron law: connection pools are created at startup and reclaimed at shutdown
    await init_all_http_clients()
    logger.info("http_clients_initialized")

    # Phase 6.1.1: Initialize Redis Pool (multi-sig state persistence)
    # Commander's iron law: the Redis pool is created during lifespan startup
    await init_redis_pool()
    logger.info("redis_pool_initialized", url=settings.REDIS_URL.split("@")[-1])

    # Start SSE publisher
    publisher = await get_publisher()
    logger.info("sse_publisher_initialized")

    # Phase 5: Telegram Gateway initialisation
    # 2026-03-23 architecture fix: the AWOOOI API does not do long polling.
    # Reason: a single bot token can only have one long-polling instance.
    # OpenClaw (192.168.0.188) is the only polling instance;
    # the AWOOOI API only sends messages and does not receive them.
    telegram_gw = get_telegram_gateway()
    if settings.TELEGRAM_ENABLE_POLLING:
        await telegram_gw.start_long_polling()
        logger.info("telegram_long_polling_started")
    else:
        logger.info("telegram_polling_disabled", reason="OpenClaw 是唯一 Polling 實例")

    # ADR-015: MCP provider registration (DI pattern)
    from src.plugins.mcp.providers import register_all_providers

    register_all_providers()
    logger.info("mcp_providers_registered")

    # ADR-081 Phase 1: MCPToolRegistry initialisation (sensor tools for PreDecisionInvestigator)
    # 2026-04-16 ogt + Claude Sonnet 4.6: fixes the root cause of sensors=0/0 —
    # init was never called at startup
    try:
        from src.services.mcp_tool_registry import init_mcp_tool_registry

        await init_mcp_tool_registry()
        logger.info("mcp_tool_registry_initialized")
    except Exception as e:
        logger.warning("mcp_tool_registry_init_failed", error=str(e))

    # Phase 6.5: Telegram heartbeat monitor (every 30 minutes, sent to the SRE war-room group)
    # 2026-04-16 ogt + Claude Sonnet 4.6: restored — the user confirmed it must keep
    # posting to the SRE war room. The earlier reason for disabling it
    # (forwarded_to_separate_group) was wrong; the group is SRE_GROUP_CHAT_ID.
    if settings.OPENCLAW_TG_BOT_TOKEN:
        await telegram_gw.start_heartbeat_monitor(
            heartbeat_interval_minutes=30,
            silence_threshold_hours=2,
        )
        logger.info("telegram_heartbeat_monitor_started", interval_minutes=30)
    else:
        logger.warning("telegram_heartbeat_monitor_skipped", reason="OPENCLAW_TG_BOT_TOKEN not set")

    # Reboot Recovery: warm up Redis working memory from PostgreSQL
    # 2026-04-05 ogt: Redis is empty after a reboot, so unresolved incidents are restored from the DB
    # Commander-approved: data must be kept long-term and working memory restored automatically
    try:
        from sqlalchemy import select

        from src.db.base import get_db_context
        from src.db.models import IncidentRecord
        from src.models.incident import Incident
        from src.services.incident_service import get_incident_service

        incident_service = get_incident_service()
        restored = 0
        async with get_db_context() as db:
            result = await db.execute(
                select(IncidentRecord).where(
                    IncidentRecord.status.in_(["investigating", "mitigating"])
                )
            )
            records = result.scalars().all()

            # NOTE(review): the original indentation was lost; the loop is kept
            # inside the session scope so ORM attributes remain loadable —
            # confirm against the upstream file.
            for record in records:
                try:
                    incident = Incident(
                        incident_id=record.incident_id,
                        status=record.status,
                        severity=record.severity,
                        signals=record.signals or [],
                        affected_services=record.affected_services or [],
                        decision_chain=record.decision_chain,
                        proposal_ids=record.proposal_ids or [],
                        outcome=record.outcome,
                        created_at=record.created_at,
                        updated_at=record.updated_at,
                        resolved_at=record.resolved_at,
                        closed_at=record.closed_at,
                        ttl_days=record.ttl_days,
                        vectorized=record.vectorized,
                        # ADR-073: the classification fields must be restored,
                        # otherwise everything is written to KM as "unknown"
                        notification_type=record.notification_type,
                        alert_category=record.alert_category,
                    )
                    if await incident_service.save_to_working_memory(incident):
                        restored += 1
                except Exception as e:
                    # Legacy rows with an invalid source value (node-exporter
                    # etc.) are skipped; log at debug so the skip is traceable.
                    logger.debug(
                        "working_memory_restore_skipped",
                        error_type=type(e).__name__,
                        error=str(e),
                    )

        logger.info("working_memory_warmed_up", restored=restored, total=len(records))
    except Exception as e:
        logger.warning("working_memory_warmup_failed", error=str(e))

    # ADR-088: Trust Score cold start — restore trust scores from PostgreSQL
    # Solves: after a pod restart TrustScoreManager's memory is reset to zero, so the AI
    # could never accumulate enough trust to reach L4 auto-approval
    # 2026-04-17 ogt + Claude Sonnet 4.6 (APAC): Phase 4 trust persistence
    try:
        from src.repositories.trust_repository import get_trust_repository
        from src.services.trust_engine import get_trust_manager

        trust_records = await get_trust_repository().load_all()
        loaded = get_trust_manager().bulk_load(trust_records)
        logger.info("trust_scores_warmed_up", loaded=loaded)
    except Exception as e:
        logger.warning("trust_scores_warmup_failed", error=str(e))

    # Phase 4 flywheel fix: cold-start indexing of playbook embeddings
    # 2026-04-10 Claude Sonnet 4.6 Asia/Taipei
    # Purpose: make sure the playbook_embeddings table has up-to-date vectors for
    # semantic-similarity queries. Scheduled as background tasks so API startup
    # is not slowed down.
    # Phase ADR-068 2026-04-10: seed playbooks from alert_rules.yaml (idempotent).
    # Must happen before embedding indexing so that the playbook table has data.
    # NOTE(review): both are scheduled as independent tasks, so only the
    # scheduling order is guaranteed here, not completion order — confirm the
    # indexing job tolerates an empty playbook table.
    try:
        from src.services.playbook_seed_service import seed_playbooks_from_rules

        _spawn_background(seed_playbooks_from_rules())
        logger.info("playbook_seed_scheduled")
    except Exception as e:
        logger.warning("playbook_seed_schedule_failed", error=str(e))

    # Phase 3.5 ADR-085: Playbook Redis → PG backfill (one-off migration + idempotent
    # remediation at startup). Ensures playbooks that exist in Redis but are missing
    # from PG are not lost for good when their TTL expires.
    # 2026-04-15 ogt + Claude Sonnet 4.6 (APAC): Phase 3.5 persistence of AI learning results
    try:
        from src.repositories.playbook_repository import get_playbook_repository

        _spawn_background(get_playbook_repository().backfill_redis_to_pg())
        logger.info("playbook_pg_backfill_scheduled")
    except Exception as e:
        logger.warning("playbook_pg_backfill_schedule_failed", error=str(e))

    try:
        from src.services.playbook_embedding_service import ensure_playbook_embeddings_indexed

        _spawn_background(ensure_playbook_embeddings_indexed())
        logger.info("playbook_embedding_indexing_scheduled")
    except Exception as e:
        logger.warning("playbook_embedding_schedule_failed", error=str(e))

    # Phase 6.1: start the Signal Worker (Redis Streams consumer)
    # Commander's iron law: the event bus decouples alert intake from alert processing
    await init_signal_worker()
    logger.info("signal_worker_initialized")

    # BUG-005 fix 2026-04-11: at startup, scan Redis for every token with state=ready
    # that has not been sent to Telegram. Once the 10-minute dedup TTL expired there
    # was no resend mechanism, so ready decisions stayed stuck with nobody reviewing them.
    try:
        from src.services.decision_manager import get_decision_manager

        _spawn_background(get_decision_manager().resend_stale_ready_tokens())
        logger.info("stale_ready_tokens_resend_scheduled")
    except Exception as e:
        logger.warning("stale_ready_tokens_resend_schedule_failed", error=str(e))

    # 2026-04-16 Claude Sonnet 4.6: automatic AI-analysis sweeper (every 90 seconds)
    # Fixes a core gap: after the Signal Worker creates an incident nothing triggered
    # AI analysis; unless someone called GET /api/v1/incidents the incident never got
    # a decision. The sweeper periodically scans INVESTIGATING incidents that have no
    # decision token and triggers analysis in the background.
    try:
        from src.jobs.incident_analysis_sweeper import run_incident_analysis_sweeper

        _spawn_background(run_incident_analysis_sweeper())
        logger.info("incident_analysis_sweeper_scheduled", interval_sec=90)
    except Exception as e:
        logger.warning("incident_analysis_sweeper_schedule_failed", error=str(e))

    # ADR-090 § asset inventory cron (2026-04-19 ogt + Claude Opus 4.7 Asia/Taipei)
    # Every hour scan K8s pods → write asset_inventory + asset_discovery_run + 7-dimension coverage
    # First writer for the eight tables that had none
    # (asset_inventory / asset_discovery_run / asset_coverage_snapshot)
    try:
        from src.jobs.asset_scanner_job import run_asset_scanner_loop

        _spawn_background(run_asset_scanner_loop())
        logger.info("asset_scanner_loop_scheduled", interval_sec=3600)
    except Exception as e:
        logger.warning("asset_scanner_loop_schedule_failed", error=str(e))

    # ADR-090 § Rule Catalog Sync (2026-04-19 ogt + Claude Opus 4.7 Asia/Taipei)
    # Every hour pull active rules from Prometheus /api/v1/rules → UPSERT alert_rule_catalog
    # Unlocks E3 Hermes automatic rule creation: the AI needs alert_rule_catalog as a
    # baseline before it can propose corrections
    try:
        from src.jobs.rule_catalog_sync_job import run_rule_catalog_sync_loop

        _spawn_background(run_rule_catalog_sync_loop())
        logger.info("rule_catalog_sync_loop_scheduled", interval_sec=3600)
    except Exception as e:
        logger.warning("rule_catalog_sync_loop_schedule_failed", error=str(e))

    # ADR-090 § Phase 4 NemoTron capacity inspection MVP (2026-04-19 ogt + Claude Opus 4.7 Asia/Taipei)
    # Daily at 02:00 Taipei pull Prometheus node_exporter → write host_capacity_snapshot + violations
    # Unlocks: historical data for Phase 4 Holt-Winters forecasting / capacity-trend analysis
    try:
        from src.jobs.capacity_scanner_job import run_capacity_scanner_loop

        _spawn_background(run_capacity_scanner_loop())
        logger.info("capacity_scanner_loop_scheduled", daily_trigger_hour_taipei=2)
    except Exception as e:
        logger.warning("capacity_scanner_loop_schedule_failed", error=str(e))

    # ADR-090 § compliance scan MVP (2026-04-19 ogt + Claude Opus 4.7 Asia/Taipei)
    # Daily at 03:00 Taipei walk asset_inventory → write the 7-dimension asset_compliance_snapshot
    # MVP: secret_rotated is really checked; the other six dimensions are 'unknown'
    # placeholders to be filled in by later agents
    try:
        from src.jobs.compliance_scanner_job import run_compliance_scanner_loop

        _spawn_background(run_compliance_scanner_loop())
        logger.info("compliance_scanner_loop_scheduled", daily_trigger_hour_taipei=3)
    except Exception as e:
        logger.warning("compliance_scanner_loop_schedule_failed", error=str(e))

    # ADR-090 § Coverage Evaluator (2026-04-19 ogt + Claude Opus 4.7 Asia/Taipei)
    # Every hour upgrade asset_coverage_snapshot from 'unknown' to green/yellow/red
    # Based on: Prometheus targets / alert_rule_catalog labels / knowledge_entries coverage
    try:
        from src.jobs.coverage_evaluator_job import run_coverage_evaluator_loop

        _spawn_background(run_coverage_evaluator_loop())
        logger.info("coverage_evaluator_loop_scheduled", interval_sec=3600)
    except Exception as e:
        logger.warning("coverage_evaluator_loop_schedule_failed", error=str(e))

    # ADR-090 § Rule Stats Updater (2026-04-19 ogt + Claude Opus 4.7 Asia/Taipei)
    # Every hour compute rule statistics from incidents + approval_records
    # Unlocks E3 Hermes: rules with noise_rate > 0.5 can be proposed for deprecation by the AI
    try:
        from src.jobs.rule_stats_updater_job import run_rule_stats_updater_loop

        _spawn_background(run_rule_stats_updater_loop())
        logger.info("rule_stats_updater_loop_scheduled", interval_sec=3600)
    except Exception as e:
        logger.warning("rule_stats_updater_loop_schedule_failed", error=str(e))

    # ADR-090 § Asset Change Tracker (2026-04-19 ogt + Claude Opus 4.7 Asia/Taipei)
    # Every hour diff the two most recent asset_discovery_run entries, write asset_change_event
    # Unlocks: asset-change history (added/removed/lifecycle_changed) so the AI can
    # track how the cluster evolves
    try:
        from src.jobs.asset_change_tracker_job import run_asset_change_tracker_loop

        _spawn_background(run_asset_change_tracker_loop())
        logger.info("asset_change_tracker_loop_scheduled", interval_sec=3600)
    except Exception as e:
        logger.warning("asset_change_tracker_loop_schedule_failed", error=str(e))

    # ADR-090 § Hermes Rule Quality Advisor (2026-04-19 ogt + Claude Opus 4.7 Asia/Taipei)
    # Daily at 04:00 Taipei analyse alert_rule_catalog.noise_rate and push Telegram
    # suggestions for high-noise rules
    # Commander's iron law: the AI only pushes suggestions and never changes
    # review_status itself; deprecation is a human decision
    try:
        from src.jobs.hermes_rule_quality_job import run_hermes_rule_quality_loop

        _spawn_background(run_hermes_rule_quality_loop())
        logger.info("hermes_rule_quality_loop_scheduled", daily_trigger_hour_taipei=4)
    except Exception as e:
        logger.warning("hermes_rule_quality_loop_schedule_failed", error=str(e))

    # ADR-076 Task 4: automatic daily inspection report at 08:00 Taipei time
    # 2026-04-14 Claude Haiku 4.5 Asia/Taipei
    try:
        from src.services.report_generation_service import run_daily_report_loop

        _spawn_background(run_daily_report_loop())
        logger.info("daily_report_loop_scheduled", trigger_hour_taipei=8)
    except Exception as e:
        logger.warning("daily_report_loop_schedule_failed", error=str(e))

    # ADR-073 P2 fix 2026-04-15: auto-close overdue approvals (hourly)
    # Ensures a PENDING approval older than 48h triggers resolve_incident → closes
    # the KM learning loop
    try:
        from src.jobs.approval_timeout_resolver import run_approval_timeout_resolver

        _spawn_background(run_approval_timeout_resolver())
        logger.info("approval_timeout_resolver_scheduled", interval_sec=3600)
    except Exception as e:
        logger.warning("approval_timeout_resolver_schedule_failed", error=str(e))

    # ADR-083 Phase 3: Evolver Agent (daily) — automatic playbook merging + low-trust archiving
    # 2026-04-15 ogt + Claude Sonnet 4.6 (APAC): Phase 3 initial version
    try:
        from src.services.playbook_evolver import run_evolver_loop

        _spawn_background(run_evolver_loop())
        logger.info("evolver_loop_scheduled", interval_sec=86400)
    except Exception as e:
        logger.warning("evolver_loop_schedule_failed", error=str(e))

    # ADR-083 Phase 3: knowledge-forgetting job (daily) — KB entries not referenced
    # for 30d are marked archived
    # 2026-04-15 ogt + Claude Sonnet 4.6 (APAC): Phase 3 initial version
    try:
        from src.jobs.knowledge_decay_job import run_knowledge_decay_loop

        _spawn_background(run_knowledge_decay_loop())
        logger.info("knowledge_decay_loop_scheduled", interval_sec=86400)
    except Exception as e:
        logger.warning("knowledge_decay_loop_schedule_failed", error=str(e))

    # ADR-083 Phase 3: fine-tune JSONL export (weekly) — EvidenceSnapshot × AgentSession → JSONL
    # 2026-04-15 ogt + Claude Sonnet 4.6 (APAC): Phase 3 initial version
    try:
        from src.services.finetune_exporter import run_finetune_export_loop

        _spawn_background(run_finetune_export_loop())
        logger.info("finetune_export_loop_scheduled", interval_sec=604800)
    except Exception as e:
        logger.warning("finetune_export_loop_schedule_failed", error=str(e))

    # Phase 4 ADR-084: proactive inspection, run every 5 minutes
    # Coordinates DynamicBaselineService + LogAnomalyDetector + TrendPredictor
    # Shadow-mode control: with AIOPS_P4_SHADOW_MODE=True it only records and does
    # not trigger alerts
    # 2026-04-15 ogt + Claude Sonnet 4.6 (APAC): Phase 4 initial version
    try:
        from src.services.proactive_inspector import run_proactive_inspector_loop

        _spawn_background(run_proactive_inspector_loop())
        logger.info("proactive_inspector_loop_scheduled", interval_sec=300)
    except Exception as e:
        logger.warning("proactive_inspector_schedule_failed", error=str(e))

    # ADR-087 Phase 6: offline replay (every 7 days) — baseline for decision-consistency rate
    # 2026-04-15 ogt + Claude Sonnet 4.6 (APAC): Phase 6 initial version
    try:
        from src.jobs.offline_replay_service import run_offline_replay_loop

        _spawn_background(run_offline_replay_loop())
        logger.info("offline_replay_loop_scheduled", interval_sec=604800)
    except Exception as e:
        logger.warning("offline_replay_loop_schedule_failed", error=str(e))

    yield

    # Shutdown
    # Stop the background jobs first so none of them keeps using the pools
    # that are closed below.
    await _cancel_background_tasks()
    # Phase 6.1: close the Signal Worker (the consumer is closed first)
    await close_signal_worker()
    await publisher.stop()
    await close_executor()
    await close_openclaw()
    # Phase 5.4: Close Telegram Gateway
    telegram_gw = get_telegram_gateway()
    await telegram_gw.close()
    # Phase 33: Close RAG Service httpx client (ADR-067)
    from src.services.knowledge_rag_service import get_knowledge_rag_service

    await get_knowledge_rag_service().close()
    # Phase 5: Close HTTP Clients (Commander's iron law: reclaim connection pools)
    await close_all_http_clients()
    # Phase 6.1.1: Close Redis Pool (Commander's iron law: reclaim the Redis pool)
    await close_redis_pool()
    await close_db()
    shutdown_telemetry()
    logger.info("api_shutdown", version=settings.VERSION)


# =============================================================================
# FastAPI Application
# =============================================================================
app = FastAPI(
    title="AWOOOI API",
    description="AWOOOI 智能運維平台 API - 由 leWOOOgo Engine 驅動",
    version=settings.VERSION,
    docs_url="/api/v1/docs",
    redoc_url="/api/v1/redoc",
    openapi_url="/api/v1/openapi.json",
    lifespan=lifespan,
)


# =============================================================================
# OpenTelemetry Instrumentation (observability iron law)
# Must be initialised before the middleware to keep traces complete
# Graceful degradation: a failure does not prevent the API from starting
# =============================================================================
otel_enabled = setup_telemetry(app)
if otel_enabled:
    logger.info(
        "otel_initialized",
        service=settings.OTEL_SERVICE_NAME,
        endpoint=settings.OTEL_EXPORTER_OTLP_ENDPOINT,
    )
else:
    logger.warning("otel_disabled", reason="initialization failed or disabled")


# =============================================================================
# Middleware
# =============================================================================

# 2026-04-03 ogt: Nginx reverse-proxy fix — make FastAPI trust X-Forwarded-Proto
# Problem solved: /api/v1/knowledge (no trailing slash) produced a 307 redirect
# with an http:// Location
# Cause: FastAPI did not know it sits behind HTTPS, so it redirected to http://
# Effect: with this middleware the 307 Location is https://
# NOTE(review): trusted_hosts="*" accepts forwarded headers from any peer;
# consider restricting it to the proxy's address if the API can be reached
# directly — confirm against the deployment topology.
app.add_middleware(ProxyHeadersMiddleware, trusted_hosts="*")

# CORS - Strict Whitelist (Iron Law #2)
# NO wildcards, NO UAT
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.CORS_ORIGINS,
    allow_credentials=True,
    allow_methods=["GET", "POST", "PUT", "DELETE", "PATCH"],
    allow_headers=["Authorization", "Content-Type", "X-Request-ID"],
    expose_headers=["X-Request-ID"],
)


@app.middleware("http")
async def request_logging_middleware(request: Request, call_next):
    """
    Structured request logging middleware

    Logs every request with:
    - Request ID (taken from the X-Request-ID header, or generated when the
      header is missing or empty)
    - HTTP method and path
    - Response status code
    - Request duration

    Args:
        request: The incoming request.
        call_next: Callable that passes the request on and returns the response.

    Returns:
        The downstream response with the ``X-Request-ID`` header set.
    """
    # Generate an ID when the caller supplies none, so log lines and the
    # response header stay correlatable per request instead of sharing "-".
    request_id = request.headers.get("X-Request-ID") or uuid.uuid4().hex
    start_time = time.perf_counter()

    # Bind request context for all logs in this request
    structlog.contextvars.clear_contextvars()
    structlog.contextvars.bind_contextvars(
        request_id=request_id,
        method=request.method,
        path=request.url.path,
    )

    log = get_logger("awoooi.http")
    log.debug("request_start")

    response = await call_next(request)

    duration_ms = (time.perf_counter() - start_time) * 1000
    log.info(
        "request_complete",
        status_code=response.status_code,
        duration_ms=round(duration_ms, 2),
    )

    # Add request ID to response headers
    response.headers["X-Request-ID"] = request_id
    return response


# =============================================================================
# Exception Handlers
# =============================================================================


@app.exception_handler(Exception)
async def global_exception_handler(_request: Request, exc: Exception) -> JSONResponse:
    """
    Global exception handler with structured logging + Sentry

    Catches all unhandled exceptions and returns a safe error response.
    Full exception details are logged but not exposed to clients.
    The exception is also forwarded to Sentry via the SDK.

    Args:
        _request: The request being handled (unused).
        exc: The unhandled exception.

    Returns:
        A 500 JSON response with a generic error code and message.
    """
    # Forward to Sentry (if the SDK has been initialised)
    sentry_sdk.capture_exception(exc)

    log = get_logger("awoooi.error")
    log.exception(
        "unhandled_exception",
        exc_type=type(exc).__name__,
        exc_message=str(exc),
    )
    return JSONResponse(
        status_code=500,
        content={
            "code": "INTERNAL_ERROR",
            "message": "An internal error occurred",
        },
    )


# =============================================================================
# API Routers - Path-based routing (/api/v1/*)
# =============================================================================

# New v1 API routes
app.include_router(health_v1.router, prefix="/api/v1", tags=["Health"])
app.include_router(csrf_v1.router, prefix="/api/v1", tags=["Security"])  # Phase 20
app.include_router(dashboard_v1.router, prefix="/api/v1", tags=["Dashboard"])
app.include_router(approvals_v1.router, prefix="/api/v1", tags=["HITL Approvals"])
app.include_router(ai_v1.router, prefix="/api/v1", tags=["AI Decision"])
app.include_router(ai_slo_v1.router, prefix="/api/v1", tags=["AI SLO"])  # Phase 6 ADR-087
app.include_router(webhooks_v1.router, prefix="/api/v1", tags=["Webhooks"])
app.include_router(timeline_v1.router, prefix="/api/v1", tags=["Timeline"])
app.include_router(audit_logs_v1.router, prefix="/api/v1", tags=["Audit Logs"])
# 2026-04-09 Claude Sonnet 4.6: alert_operation_log query API (Sprint 5.2)
app.include_router(alert_operation_logs_v1.router, prefix="/api/v1", tags=["Alert Operation Logs"])
# Phase 5.4
app.include_router(telegram_v1.router, prefix="/api/v1", tags=["Telegram Gateway"])
# Phase 7: live data
app.include_router(metrics_v1.router, prefix="/api/v1", tags=["Gold Metrics"])
# Phase 6.4: Decision Proposal
app.include_router(incidents_v1.router, prefix="/api/v1", tags=["Incidents"])
# Phase 6.4h: Proposals CRUD
app.include_router(proposals_v1.router, prefix="/api/v1", tags=["Proposals"])
# Phase 9.5: Agent Teams
app.include_router(agents_v1.router, prefix="/api/v1", tags=["Agent Teams"])
# Phase 6.5: Statistics Analytics
app.include_router(stats_v1.router, prefix="/api/v1", tags=["Statistics"])
# 2026-04-03: monitoring tool status
app.include_router(monitoring_v1.router, prefix="/api/v1", tags=["Monitoring"])
# ADR-059: Gitea → OpenClaw
app.include_router(gitea_webhook_v1.router, prefix="/api/v1", tags=["Gitea Webhook"])
# #7: Playbook extraction
app.include_router(playbooks_v1.router, prefix="/api/v1", tags=["Playbooks"])
# #8: automatic escalation decisions
app.include_router(auto_repair_v1.router, prefix="/api/v1", tags=["Auto Repair"])
# Phase 25 P2: Config Drift Detection
app.include_router(drift_v1.router, prefix="/api/v1", tags=["Drift Detection"])
# Phase 33 ADR-067: RAG knowledge base
app.include_router(rag_v1.router, prefix="/api/v1", tags=["RAG Knowledge Base"])
# #40: Sentry errors BFF API
app.include_router(errors_v1.router, prefix="/api/v1", tags=["Errors"])
# Phase 10.2.1: Sentry → Telegram
app.include_router(sentry_webhook_v1.router, prefix="/api/v1", tags=["Sentry Webhook"])
# Phase 21: SignOz → Telegram (ADR-037)
app.include_router(signoz_webhook_v1.router, prefix="/api/v1", tags=["SignOz Webhook"])
# 2026-04-10: notification channel status
app.include_router(notifications_v1.router, prefix="/api/v1", tags=["Notifications"])
# Phase 19.1: Omni-Terminal SSE
app.include_router(terminal_v1.router, prefix="/api/v1", tags=["Omni-Terminal"])
# Phase D-G P0: learning system API
app.include_router(learning_v1.router, prefix="/api/v1", tags=["Learning"])
# KB Phase 1: Knowledge Base API
app.include_router(knowledge_v1.router, prefix="/api/v1", tags=["Knowledge Base"])
# Phase 6.4g: lewooogo-brain (legacy); registered without a prefix
app.include_router(proposals_router.router, tags=["Proposals (Legacy)"])

# Legacy routes (to be migrated to api/v1/)
app.include_router(plugins.router, prefix="/api/v1/plugins", tags=["Plugins"])
app.include_router(pipelines.router, prefix="/api/v1/pipelines", tags=["Pipelines"])
app.include_router(agent.router, prefix="/api/v1/agent", tags=["Agent"])
app.include_router(
    notifications.router, prefix="/api/v1/notifications", tags=["Notifications"]
)


# =============================================================================
# Prometheus Metrics Endpoint
# =============================================================================
# 2026-03-31 ogt: expose Prometheus metrics for the alerting system


@app.get("/metrics", include_in_schema=False)
async def prometheus_metrics() -> Response:
    """Prometheus metrics endpoint for alerting.

    Returns:
        The output of ``generate_latest()`` served with the
        ``CONTENT_TYPE_LATEST`` media type.
    """
    return Response(
        content=generate_latest(),
        media_type=CONTENT_TYPE_LATEST,
    )


# =============================================================================
# Root Endpoint
# =============================================================================


@app.get("/", include_in_schema=False)
async def root() -> dict:
    """Root endpoint with API info.

    Returns:
        A dict with the API name, the configured version and environment, and
        the paths of the docs, health, dashboard and dashboard-stream endpoints.
    """
    return {
        "name": "AWOOOI API",
        "version": settings.VERSION,
        "environment": settings.ENVIRONMENT,
        "docs": "/api/v1/docs",
        "health": "/api/v1/health",
        "dashboard": "/api/v1/dashboard",
        "stream": "/api/v1/dashboard/stream",
    }


# =============================================================================
# Entry Point
# =============================================================================

if __name__ == "__main__":
    # Imported here so uvicorn is only loaded when the module is run as a script.
    import uvicorn

    uvicorn.run(
        "src.main:app",
        host="0.0.0.0",
        port=8000,
        reload=settings.DEBUG,
        log_level=settings.LOG_LEVEL.lower(),
    )