""" Dry-Run 預演引擎 Phase 2.2: HITL Dry-Run Validation 模擬 K8s 操作的預檢查,回傳 ApprovalCard 所需的 dryRunChecks 格式 """ import re from dataclasses import dataclass from enum import Enum from typing import Literal class CheckStatus(Enum): PASSED = "passed" FAILED = "failed" WARNING = "warning" @dataclass class DryRunCheck: """單項檢查結果""" name: str passed: bool message: str | None = None @dataclass class BlastRadius: """爆炸半徑評估""" affected_pods: int estimated_downtime: str related_services: list[str] data_impact: Literal["NONE", "READ_ONLY", "WRITE", "DESTRUCTIVE"] @dataclass class DryRunResult: """完整 Dry-Run 結果""" checks: list[DryRunCheck] blast_radius: BlastRadius overall_passed: bool risk_level: Literal["low", "medium", "high", "critical"] class MockK8sClient: """ 模擬 K8s Client Phase 2.2: 先用 Mock 資料驗證 API 契約 Phase 3+: 替換為真實 kubernetes-client """ # 模擬的 RBAC 權限表 MOCK_RBAC = { "cluster-admin": ["*"], "developer": ["get", "list", "watch", "create", "update"], "viewer": ["get", "list", "watch"], } # 模擬的資源存在表 MOCK_RESOURCES = { "pods": [ "nginx-frontend-7d4b8c9f5-xk2m3", "nginx-frontend-7d4b8c9f5-ab12c", "nginx-frontend-7d4b8c9f5-de34f", "api-server-8c7d6e5f4-gh56i", "redis-master-0", ], "deployments": ["nginx-frontend", "api-server", "redis"], "services": ["nginx-ingress", "frontend-svc", "api-svc", "redis-svc"], "tables": ["users", "user_sessions", "orders", "products"], } # 模擬的服務依賴圖 MOCK_DEPENDENCIES = { "nginx-frontend": ["nginx-ingress", "frontend-svc", "cdn-cache"], "api-server": ["api-svc", "redis-svc", "postgres"], "redis": ["redis-svc", "api-server"], "user_sessions": ["auth-service", "api-gateway", "user-service"], } def check_rbac(self, role: str, verb: str, resource: str) -> DryRunCheck: """檢查 RBAC 權限""" permissions = self.MOCK_RBAC.get(role, []) has_permission = "*" in permissions or verb in permissions return DryRunCheck( name="RBAC Permission", passed=has_permission, message=role if has_permission else f"Missing {verb} permission", ) def check_syntax(self, operation: str, parameters: dict) -> DryRunCheck: """檢查操作語法""" # 簡單語法驗證 valid = True message = None if operation == "delete_pod": if "pod_name" not in parameters: valid = False message = "Missing pod_name" elif not re.match(r"^[a-z0-9-]+$", parameters.get("pod_name", "")): valid = False message = "Invalid pod name format" elif operation == "scale_deployment": replicas = parameters.get("replicas") if replicas is None or not isinstance(replicas, int): valid = False message = "Invalid replicas value" elif replicas < 0 or replicas > 100: valid = False message = "Replicas must be 0-100" elif operation == "drop_table": if "table_name" not in parameters: valid = False message = "Missing table_name" return DryRunCheck( name="Syntax Valid", passed=valid, message=message, ) def check_resource_exists( self, resource_type: str, resource_name: str ) -> DryRunCheck: """檢查資源是否存在""" resources = self.MOCK_RESOURCES.get(resource_type, []) exists = resource_name in resources return DryRunCheck( name="Resource Exists", passed=exists, message=f"{resource_type[:-1].title()} found" if exists else "Not found", ) def check_replica_count(self, deployment_name: str) -> DryRunCheck: """檢查 Replica 數量 (刪除 Pod 時確保有備援)""" # Mock: 假設所有 deployment 都有 3 replicas replica_count = 3 if deployment_name in self.MOCK_RESOURCES["deployments"] else 0 safe = replica_count > 1 return DryRunCheck( name="Replica Count > 1", passed=safe, message=f"{replica_count} replicas" if safe else "Single replica!", ) def check_backup_available(self, table_name: str) -> DryRunCheck: """檢查是否有近期備份 (資料庫操作)""" # Mock: user_sessions 沒有備份 has_backup = table_name != "user_sessions" return DryRunCheck( name="Backup Available", passed=has_backup, message=None if has_backup else "No recent backup!", ) def get_related_services(self, resource_name: str) -> list[str]: """取得相關服務""" return self.MOCK_DEPENDENCIES.get(resource_name, []) def estimate_downtime(self, operation: str, resource_type: str) -> str: """估算停機時間""" if operation == "delete_pod": return "~2 min" # Pod 重建時間 elif operation == "scale_deployment": return "~30 sec" elif operation == "drop_table": return "0" # 資料庫操作不影響服務可用性 elif operation == "restart_deployment": return "~5 min" return "Unknown" class DryRunEngine: """ Dry-Run 預演引擎 執行操作前的安全檢查,回傳前端 ApprovalCard 所需格式 """ def __init__(self): self.k8s = MockK8sClient() def evaluate( self, operation: str, parameters: dict, user_role: str = "cluster-admin", ) -> DryRunResult: """ 執行 Dry-Run 預演 Args: operation: 操作類型 (delete_pod, scale_deployment, drop_table, etc.) parameters: 操作參數 user_role: 執行者角色 Returns: DryRunResult 包含所有檢查結果與爆炸半徑評估 """ checks: list[DryRunCheck] = [] affected_pods = 0 data_impact: Literal["NONE", "READ_ONLY", "WRITE", "DESTRUCTIVE"] = "NONE" related_services: list[str] = [] # 1. RBAC 權限檢查 verb = self._operation_to_verb(operation) checks.append(self.k8s.check_rbac(user_role, verb, operation)) # 2. 語法檢查 checks.append(self.k8s.check_syntax(operation, parameters)) # 3. 依操作類型執行特定檢查 if operation == "delete_pod": pod_name = parameters.get("pod_name", "") deployment = self._extract_deployment_name(pod_name) checks.append(self.k8s.check_resource_exists("pods", pod_name)) checks.append(self.k8s.check_replica_count(deployment)) affected_pods = 1 related_services = self.k8s.get_related_services(deployment) data_impact = "NONE" elif operation == "scale_deployment": deployment = parameters.get("deployment", "") checks.append(self.k8s.check_resource_exists("deployments", deployment)) affected_pods = abs(parameters.get("replicas", 0) - 3) # 假設原本 3 related_services = self.k8s.get_related_services(deployment) data_impact = "NONE" elif operation == "drop_table": table_name = parameters.get("table_name", "") checks.append(self.k8s.check_resource_exists("tables", table_name)) checks.append(self.k8s.check_backup_available(table_name)) affected_pods = 0 related_services = self.k8s.get_related_services(table_name) data_impact = "DESTRUCTIVE" elif operation == "truncate_table": table_name = parameters.get("table_name", "") checks.append(self.k8s.check_resource_exists("tables", table_name)) checks.append(self.k8s.check_backup_available(table_name)) affected_pods = 0 related_services = self.k8s.get_related_services(table_name) data_impact = "DESTRUCTIVE" elif operation == "update_config": affected_pods = parameters.get("affected_pods", 1) data_impact = "WRITE" # 4. 計算總體結果 overall_passed = all(c.passed for c in checks) risk_level = self._calculate_risk_level(data_impact, affected_pods, overall_passed) return DryRunResult( checks=checks, blast_radius=BlastRadius( affected_pods=affected_pods, estimated_downtime=self.k8s.estimate_downtime(operation, "pods"), related_services=related_services, data_impact=data_impact, ), overall_passed=overall_passed, risk_level=risk_level, ) def _operation_to_verb(self, operation: str) -> str: """操作轉換為 K8s verb""" mapping = { "delete_pod": "delete", "scale_deployment": "update", "drop_table": "delete", "truncate_table": "delete", "update_config": "update", "restart_deployment": "update", } return mapping.get(operation, "get") def _extract_deployment_name(self, pod_name: str) -> str: """從 Pod 名稱提取 Deployment 名稱""" # nginx-frontend-7d4b8c9f5-xk2m3 -> nginx-frontend parts = pod_name.rsplit("-", 2) return parts[0] if len(parts) >= 3 else pod_name def _calculate_risk_level( self, data_impact: str, affected_pods: int, all_checks_passed: bool, ) -> Literal["low", "medium", "high", "critical"]: """計算風險等級""" if not all_checks_passed: return "critical" if data_impact == "DESTRUCTIVE": return "critical" if data_impact == "WRITE" or affected_pods > 5: return "high" if affected_pods > 1: return "medium" return "low" # 全域引擎實例 dry_run_engine = DryRunEngine()