# apps/api/src/services/preflight_service.py
# Pre-flight safety check service (Q2/Q4 decisions)
# Author: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
# Architecture: leWOOOgo building-block design; depends on
#               ServiceRegistryClient + VeleroClient
# Reference: ADR-062

from __future__ import annotations

import re
import time
from dataclasses import dataclass
from enum import Enum

import structlog

from .service_registry import ServiceRegistryClient, get_service_registry
from .velero_client import VeleroClient, get_velero_client

logger = structlog.get_logger(__name__)

# Keywords that are long enough to be matched as plain substrings of the
# lower-cased alert name.
_LOAD_SUBSTRINGS = ("cpu", "disk", "load", "memory")

# "io" is too short for substring matching: it occurs inside ordinary words
# such as "replication", "connection" or "utilization". These are therefore
# matched only as whole words of the alert name.
_IO_WORDS = frozenset({"io", "iops", "iowait"})

# Splits an alert name into words: handles CamelCase (including acronym runs
# such as "DiskIOHigh" -> Disk, IO, High), digits, and any non-alphanumeric
# separator (snake_case, kebab-case, spaces).
_ALERT_WORD_RE = re.compile(r"[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|[0-9]+")


class PreflightResult(str, Enum):
    """Outcome of a pre-flight check."""

    PASS = "PASS"
    ABORT_BACKUP_EXPIRED = "ABORT_BACKUP_EXPIRED"
    ABORT_HIGH_IO = "ABORT_HIGH_IO"
    SKIP = "SKIP"  # the service does not require a pre-flight check


@dataclass
class PreflightReport:
    """Result of a pre-flight check plus the details that explain it."""

    result: PreflightResult
    # Age of the latest backup in hours, when it was looked up.
    backup_age_hours: float | None = None
    # Name of the emergency backup, set only when triggering it succeeded.
    backup_name_triggered: str | None = None
    # Human-readable explanation of the result.
    reason: str = ""


class PreflightService:
    """
    Pre-flight safety check.

    - Runs only for services registered with requires_pre_backup=True.
    - Backup too old -> abort and request an emergency backup.
    - CPU/IO high-load alert -> never trigger a backup (Q4 decision).
    """

    def __init__(
        self,
        registry: ServiceRegistryClient | None = None,
        velero: VeleroClient | None = None,
    ) -> None:
        """Store the collaborators, falling back to the shared instances."""
        self._registry = registry or get_service_registry()
        self._velero = velero or get_velero_client()

    async def check(
        self,
        service_name: str,
        alert_labels: dict | None = None,
    ) -> PreflightReport:
        """
        Run the pre-flight check for a service.

        Args:
            service_name: Name used to look the service up in the registry.
            alert_labels: Prometheus alert labels; the "alertname" label is
                used to decide whether this is a CPU/IO high-load alert.

        Returns:
            A PreflightReport whose result is SKIP, ABORT_HIGH_IO, PASS or
            ABORT_BACKUP_EXPIRED.
        """
        info = self._registry.get_service(service_name)
        if info is None or not info.requires_pre_backup:
            return PreflightReport(
                result=PreflightResult.SKIP,
                reason="服務不需要 Pre-flight",
            )

        # Q4: a backup must not be triggered while a CPU/IO high-load alert
        # is firing.
        if self._is_high_io_alert(alert_labels):
            logger.warning(f"Pre-flight: {service_name} 屬於 CPU/IO 高負載告警,跳過備份觸發")
            return PreflightReport(
                result=PreflightResult.ABORT_HIGH_IO,
                reason="告警類型為 CPU/IO 高負載,禁止觸發備份(Q4 決策)",
            )

        policies = self._registry.get_backup_policies()
        max_age = policies.get("velero_max_age_hours", 4)

        # NOTE(review): assumes the client always returns a number here; a
        # None would raise TypeError in the comparison below — confirm
        # against VeleroClient.
        age = await self._velero.get_latest_backup_age_hours()

        if age <= max_age:
            return PreflightReport(
                result=PreflightResult.PASS,
                backup_age_hours=age,
                reason=f"備份時間正常 ({age:.1f}h < {max_age}h)",
            )

        # Backup expired -> trigger an emergency backup and abort.
        backup_name = f"emergency-preflight-{int(time.time())}"
        triggered = await self._velero.trigger_emergency_backup(backup_name)
        if triggered:
            follow_up = "緊急備份已啟動: " + backup_name
        else:
            follow_up = "緊急備份啟動失敗,請人工處理"

        return PreflightReport(
            result=PreflightResult.ABORT_BACKUP_EXPIRED,
            backup_age_hours=age,
            backup_name_triggered=backup_name if triggered else None,
            reason=f"備份過期 ({age:.1f}h > {max_age}h)。{follow_up}",
        )

    def _is_high_io_alert(self, labels: dict | None) -> bool:
        """
        Tell whether the alert labels describe a CPU/IO high-load alert.

        The "alertname" label matches when it contains "cpu", "disk", "load"
        or "memory" (case-insensitive substring), or when one of its words is
        "io", "iops" or "iowait". "io" is matched per word so that names such
        as "PostgresReplicationLag" are not misclassified.

        Returns False when labels are missing/empty or carry no usable
        "alertname".
        """
        if not labels:
            return False

        # A label that is present but None must not crash the check.
        alert_name = str(labels.get("alertname") or "")
        lowered = alert_name.lower()
        if any(keyword in lowered for keyword in _LOAD_SUBSTRINGS):
            return True

        # Split on the original casing so CamelCase boundaries are visible.
        words = {word.lower() for word in _ALERT_WORD_RE.findall(alert_name)}
        return not words.isdisjoint(_IO_WORDS)


_preflight_service: PreflightService | None = None


def get_preflight_service() -> PreflightService:
    """Return the module-wide PreflightService, creating it on first use."""
    global _preflight_service
    if _preflight_service is None:
        _preflight_service = PreflightService()
    return _preflight_service


def set_preflight_service(service: PreflightService) -> None:
    """Replace the module-wide instance; for test injection (P4 convention)."""
    global _preflight_service
    _preflight_service = service