Files
awoooi/apps/api/src/services/preflight_service.py
OG T 0f5fecfef5
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 1m40s
fix(sprint5.1): 首席架構師審查修正 — S1×4 S2×2 S3×1
S1-1: service_registry/velero_client/preflight_service 改用 structlog
S1-2: velero_client datetime.now(UTC) 改用 now_taipei()(台北時區鐵律)
S1-3: Guardrail 失敗改為保守拒絕(原放行方向與安全目標相悖)
S1-4: service_registry import 移至模組頂部(移除函數內 import)
S2-1: telegram_gateway T1-T6 六個通知方法補齊 try/except
S2-2: webhooks.py Langfuse URL 改用 settings.LANGFUSE_URL(移除硬寫內網 IP)
S3-3: velero_client trigger_emergency_backup 改為 kubectl apply Backup CRD
      (原 kubectl create backup 語法不存在,審查發現靜默失敗風險)

審查評分: 70/100 → 修正後預計 90+/100

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-08 16:36:18 +08:00

117 lines
3.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# apps/api/src/services/preflight_service.py
# Pre-flight 安全檢查服務 (Q2/Q4 決策)
# 撰寫: Claude Sonnet 4.6 / 2026-04-08 Asia/Taipei
# 架構: leWOOOgo 積木化,依賴 ServiceRegistryClient + VeleroClient
# 參考: ADR-062
from __future__ import annotations

import re
import time
from dataclasses import dataclass
from enum import Enum

import structlog

from .service_registry import ServiceRegistryClient, get_service_registry
from .velero_client import VeleroClient, get_velero_client
# Module-level structlog logger, named after this module.
logger = structlog.get_logger(__name__)
class PreflightResult(str, Enum):
    """Outcome of a pre-flight check.

    Members are also ``str`` instances, so each one compares equal to its
    own name as a plain string.
    """

    # The backup is recent enough.
    PASS = "PASS"
    # The latest backup is older than the allowed maximum age.
    ABORT_BACKUP_EXPIRED = "ABORT_BACKUP_EXPIRED"
    # The triggering alert is a CPU/IO load alert.
    ABORT_HIGH_IO = "ABORT_HIGH_IO"
    # The service does not need a pre-flight check.
    SKIP = "SKIP"
@dataclass
class PreflightReport:
    """Result record returned by a pre-flight check.

    Only ``result`` is required; the other fields default to "not set".
    """

    # Overall outcome of the check.
    result: PreflightResult
    # Age of the latest backup in hours, when it was looked up.
    backup_age_hours: float | None = None
    # Name of the emergency backup, when one was started successfully.
    backup_name_triggered: str | None = None
    # Human-readable explanation of the outcome.
    reason: str = ""
class PreflightService:
    """Pre-flight safety check run before acting on a service.

    Rules implemented by ``check``:

    - Only services registered with ``requires_pre_backup=True`` are
      checked; unknown services and all others are skipped.
    - An alert whose name looks CPU/IO/load related aborts immediately and
      never triggers a backup (Q4 decision).
    - A latest backup that is older than the configured maximum age (or
      whose age is unavailable) aborts and requests an emergency backup.
    """

    # Keywords matched anywhere inside the lower-cased alert name.
    _LOAD_SUBSTRINGS = ("cpu", "disk", "load", "memory")
    # "io" is too short for substring matching (it occurs inside
    # "connection", "expiration", "replication", ...), so IO is only
    # recognised as a whole word of the alert name.
    _IO_WORDS = frozenset({"io", "iops", "iowait"})
    # Splits CamelCase / snake_case / kebab-case names into words, keeping
    # acronym runs together ("HighIOWait" -> "High", "IO", "Wait").
    _WORD_RE = re.compile(r"[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|\d+")

    def __init__(
        self,
        registry: ServiceRegistryClient | None = None,
        velero: VeleroClient | None = None,
    ) -> None:
        """Store the collaborators, falling back to the module-level getters.

        Args:
            registry: Service registry client; ``get_service_registry()`` is
                used when omitted.
            velero: Velero client; ``get_velero_client()`` is used when
                omitted.
        """
        self._registry = registry or get_service_registry()
        self._velero = velero or get_velero_client()

    async def check(
        self,
        service_name: str,
        alert_labels: dict | None = None,
    ) -> PreflightReport:
        """Run the pre-flight check for ``service_name``.

        Args:
            service_name: Name looked up in the service registry.
            alert_labels: Prometheus alert labels; only ``alertname`` is
                read, to detect CPU/IO load alerts.

        Returns:
            A ``PreflightReport`` whose ``result`` is ``SKIP``,
            ``ABORT_HIGH_IO``, ``PASS`` or ``ABORT_BACKUP_EXPIRED``.
        """
        info = self._registry.get_service(service_name)
        if info is None or not info.requires_pre_backup:
            return PreflightReport(
                result=PreflightResult.SKIP,
                reason="服務不需要 Pre-flight",
            )

        # Q4: never trigger a backup while the alert itself is CPU/IO load.
        if self._is_high_io_alert(alert_labels):
            logger.warning(f"Pre-flight: {service_name} 屬於 CPU/IO 高負載告警,跳過備份觸發")
            return PreflightReport(
                result=PreflightResult.ABORT_HIGH_IO,
                reason="告警類型為 CPU/IO 高負載禁止觸發備份Q4 決策)",
            )

        policies = self._registry.get_backup_policies()
        max_age = policies.get("velero_max_age_hours", 4)
        age = await self._velero.get_latest_backup_age_hours()

        # NOTE(review): the Velero client's return type is not visible from
        # this module. A None age is treated as "no usable backup" and takes
        # the conservative abort path instead of raising TypeError on the
        # comparison below — confirm against VeleroClient.
        if age is not None and age <= max_age:
            return PreflightReport(
                result=PreflightResult.PASS,
                backup_age_hours=age,
                reason=f"備份時間正常 ({age:.1f}h < {max_age}h)",
            )

        # Backup expired or age unknown: request an emergency backup, abort.
        backup_name = f"emergency-preflight-{int(time.time())}"
        triggered = await self._velero.trigger_emergency_backup(backup_name)

        if age is None:
            age_summary = "無法取得最新備份時間。"
        else:
            age_summary = f"備份過期 ({age:.1f}h > {max_age}h)。"
        if triggered:
            backup_summary = "緊急備份已啟動: " + backup_name
        else:
            backup_summary = "緊急備份啟動失敗,請人工處理"

        return PreflightReport(
            result=PreflightResult.ABORT_BACKUP_EXPIRED,
            backup_age_hours=age,
            # Only report the backup name when the trigger call succeeded.
            backup_name_triggered=backup_name if triggered else None,
            reason=age_summary + backup_summary,
        )

    def _is_high_io_alert(self, labels: dict | None) -> bool:
        """Return True when the ``alertname`` label looks CPU/IO/load related.

        ``cpu``, ``disk``, ``load`` and ``memory`` match as case-insensitive
        substrings. ``io`` / ``iops`` / ``iowait`` must be a whole word of
        the alert name, so names such as ``TooManyConnections`` are not
        misclassified.

        Args:
            labels: Alert labels; may be ``None`` or empty.

        Returns:
            False when there are no labels or no usable ``alertname``.
        """
        if not labels:
            return False
        # `or ""` also covers an alertname that is present but None.
        alert_name = str(labels.get("alertname") or "")
        lowered = alert_name.lower()
        if any(keyword in lowered for keyword in self._LOAD_SUBSTRINGS):
            return True
        # Split on the original casing so CamelCase boundaries are visible.
        words = (word.lower() for word in self._WORD_RE.findall(alert_name))
        return any(word in self._IO_WORDS for word in words)
# Module-level shared instance; stays None until it is created on first
# request or injected explicitly.
_preflight_service: PreflightService | None = None
def get_preflight_service() -> PreflightService:
    """Return the shared ``PreflightService``, creating it on first use."""
    global _preflight_service
    if _preflight_service is not None:
        return _preflight_service
    # First call: build the instance with its default collaborators.
    _preflight_service = PreflightService()
    return _preflight_service
def set_preflight_service(service: PreflightService) -> None:
    """Replace the shared instance; intended for test injection (P4 rule).

    Args:
        service: Instance that later ``get_preflight_service()`` calls return.
    """
    global _preflight_service
    _preflight_service = service