# - apps/api: FastAPI backend with Dockerfile
# - apps/web: Next.js frontend with Dockerfile
# - apps/sensor: Signal collection agent
# - packages: shared packages
# Co-Authored-By: Claude <noreply@anthropic.com>
# 316 lines, 10 KiB, Python
"""
Dry-Run rehearsal engine.

Phase 2.2: HITL Dry-Run Validation

Simulates pre-flight checks for K8s operations and returns the results in the
dryRunChecks format required by the ApprovalCard.
"""
|
|
|
|
import re
|
|
from dataclasses import dataclass
|
|
from enum import Enum
|
|
from typing import Literal
|
|
|
|
|
|
class CheckStatus(Enum):
    """Outcome categories for a dry-run check."""

    PASSED = "passed"
    FAILED = "failed"
    WARNING = "warning"
|
|
|
|
|
|
@dataclass
|
|
class DryRunCheck:
|
|
"""單項檢查結果"""
|
|
name: str
|
|
passed: bool
|
|
message: str | None = None
|
|
|
|
|
|
@dataclass
class BlastRadius:
    """Blast-radius assessment of an operation."""

    # Number of pods the operation touches.
    affected_pods: int
    # Human-readable downtime estimate.
    estimated_downtime: str
    # Names of services related to the target resource.
    related_services: list[str]
    # Severity of the effect on data.
    data_impact: Literal["NONE", "READ_ONLY", "WRITE", "DESTRUCTIVE"]
|
|
|
|
|
|
@dataclass
class DryRunResult:
    """Complete dry-run result."""

    # Every individual check that was executed.
    checks: list[DryRunCheck]
    # Blast-radius assessment for the operation.
    blast_radius: BlastRadius
    # Overall pass/fail verdict for the dry run.
    overall_passed: bool
    # Aggregated risk classification.
    risk_level: Literal["low", "medium", "high", "critical"]
|
|
|
|
|
|
class MockK8sClient:
    """
    Mock K8s client.

    Phase 2.2: validate the API contract against mock data first.
    Phase 3+: replace with the real kubernetes-client.
    """

    # Mock RBAC table: role -> allowed verbs ("*" grants every verb).
    MOCK_RBAC = {
        "cluster-admin": ["*"],
        "developer": ["get", "list", "watch", "create", "update"],
        "viewer": ["get", "list", "watch"],
    }

    # Mock inventory of existing resources, keyed by resource type.
    MOCK_RESOURCES = {
        "pods": [
            "nginx-frontend-7d4b8c9f5-xk2m3",
            "nginx-frontend-7d4b8c9f5-ab12c",
            "nginx-frontend-7d4b8c9f5-de34f",
            "api-server-8c7d6e5f4-gh56i",
            "redis-master-0",
        ],
        "deployments": ["nginx-frontend", "api-server", "redis"],
        "services": ["nginx-ingress", "frontend-svc", "api-svc", "redis-svc"],
        "tables": ["users", "user_sessions", "orders", "products"],
    }

    # Mock service dependency graph: resource name -> related services.
    MOCK_DEPENDENCIES = {
        "nginx-frontend": ["nginx-ingress", "frontend-svc", "cdn-cache"],
        "api-server": ["api-svc", "redis-svc", "postgres"],
        "redis": ["redis-svc", "api-server"],
        "user_sessions": ["auth-service", "api-gateway", "user-service"],
    }

    def check_rbac(self, role: str, verb: str, resource: str) -> DryRunCheck:
        """Check whether ``role`` is allowed to perform ``verb``.

        Args:
            role: Role name looked up in ``MOCK_RBAC``; unknown roles have
                no permissions.
            verb: Verb that must be granted (or covered by ``"*"``).
            resource: Accepted for interface parity; not consulted by the mock.

        Returns:
            A "RBAC Permission" check whose message is the role name on
            success, or a "Missing <verb> permission" text on failure.
        """
        permissions = self.MOCK_RBAC.get(role, [])
        has_permission = "*" in permissions or verb in permissions

        return DryRunCheck(
            name="RBAC Permission",
            passed=has_permission,
            message=role if has_permission else f"Missing {verb} permission",
        )

    def check_syntax(self, operation: str, parameters: dict) -> DryRunCheck:
        """Validate the parameters of an operation.

        Only ``delete_pod``, ``scale_deployment``, ``drop_table`` and
        ``truncate_table`` are validated; every other operation passes.

        Args:
            operation: Operation type.
            parameters: Operation parameters supplied by the caller.

        Returns:
            A "Syntax Valid" check; ``message`` describes the first problem
            found and is None when the parameters are acceptable.
        """
        message = None

        if operation == "delete_pod":
            if "pod_name" not in parameters:
                message = "Missing pod_name"
            else:
                pod_name = parameters["pod_name"]
                # A non-string value must be reported as invalid instead of
                # raising inside ``re``; ``fullmatch`` also rejects a trailing
                # newline, which ``^...$`` with ``match`` would let through.
                if not isinstance(pod_name, str) or not re.fullmatch(
                    r"[a-z0-9-]+", pod_name
                ):
                    message = "Invalid pod name format"

        elif operation == "scale_deployment":
            replicas = parameters.get("replicas")
            # bool is a subclass of int, so exclude it explicitly: True/False
            # are not replica counts.
            if isinstance(replicas, bool) or not isinstance(replicas, int):
                message = "Invalid replicas value"
            elif not 0 <= replicas <= 100:
                message = "Replicas must be 0-100"

        elif operation in ("drop_table", "truncate_table"):
            # Both table operations require the table_name parameter.
            if "table_name" not in parameters:
                message = "Missing table_name"

        return DryRunCheck(
            name="Syntax Valid",
            passed=message is None,
            message=message,
        )

    def check_resource_exists(
        self, resource_type: str, resource_name: str
    ) -> DryRunCheck:
        """Check whether a resource exists in the mock inventory.

        Args:
            resource_type: Key into ``MOCK_RESOURCES`` (e.g. "pods").
            resource_name: Name to look for under that type.

        Returns:
            A "Resource Exists" check. On success the message is the type
            with its last character dropped and title-cased, followed by
            " found" (e.g. "Pod found"); otherwise "Not found".
        """
        resources = self.MOCK_RESOURCES.get(resource_type, [])
        exists = resource_name in resources

        return DryRunCheck(
            name="Resource Exists",
            passed=exists,
            message=f"{resource_type[:-1].title()} found" if exists else "Not found",
        )

    def check_replica_count(self, deployment_name: str) -> DryRunCheck:
        """Check the replica count (ensures redundancy when deleting a pod).

        Args:
            deployment_name: Deployment that owns the pod.

        Returns:
            A "Replica Count > 1" check.
        """
        # Mock: every known deployment is assumed to have 3 replicas.
        # NOTE(review): an unknown deployment yields 0 replicas and is still
        # reported with the "Single replica!" message.
        replica_count = 3 if deployment_name in self.MOCK_RESOURCES["deployments"] else 0
        safe = replica_count > 1

        return DryRunCheck(
            name="Replica Count > 1",
            passed=safe,
            message=f"{replica_count} replicas" if safe else "Single replica!",
        )

    def check_backup_available(self, table_name: str) -> DryRunCheck:
        """Check whether a recent backup exists (database operations).

        Args:
            table_name: Table about to be modified.

        Returns:
            A "Backup Available" check; the message is set only on failure.
        """
        # Mock: user_sessions is the only table without a backup.
        has_backup = table_name != "user_sessions"

        return DryRunCheck(
            name="Backup Available",
            passed=has_backup,
            message=None if has_backup else "No recent backup!",
        )

    def get_related_services(self, resource_name: str) -> list[str]:
        """Return the services related to ``resource_name``.

        Returns:
            A new list on every call (empty when the resource is unknown),
            so callers may mutate it without corrupting ``MOCK_DEPENDENCIES``.
        """
        return list(self.MOCK_DEPENDENCIES.get(resource_name, []))

    def estimate_downtime(self, operation: str, resource_type: str) -> str:
        """Estimate the downtime caused by an operation.

        Args:
            operation: Operation type.
            resource_type: Accepted for interface parity; not consulted by
                the mock.

        Returns:
            A human-readable estimate, or "Unknown" for other operations.
        """
        downtime_by_operation = {
            "delete_pod": "~2 min",  # time for the pod to be recreated
            "scale_deployment": "~30 sec",
            "drop_table": "0",  # database operations do not affect service availability
            "restart_deployment": "~5 min",
        }
        return downtime_by_operation.get(operation, "Unknown")
|
|
|
|
|
|
class DryRunEngine:
    """
    Dry-Run rehearsal engine.

    Runs safety checks before an operation is executed and returns the result
    in the format required by the frontend ApprovalCard.
    """

    def __init__(self):
        self.k8s = MockK8sClient()

    def evaluate(
        self,
        operation: str,
        parameters: dict,
        user_role: str = "cluster-admin",
    ) -> DryRunResult:
        """
        Run the dry-run rehearsal.

        Malformed parameter values (wrong type, missing) never raise here:
        they are reported through the failing checks instead.

        Args:
            operation: Operation type (delete_pod, scale_deployment, drop_table, etc.)
            parameters: Operation parameters
            user_role: Role of the user performing the operation

        Returns:
            DryRunResult containing every check result and the blast-radius
            assessment
        """
        affected_pods = 0
        data_impact: Literal["NONE", "READ_ONLY", "WRITE", "DESTRUCTIVE"] = "NONE"
        related_services: list[str] = []

        # 1. RBAC permission check, 2. syntax check
        verb = self._operation_to_verb(operation)
        checks: list[DryRunCheck] = [
            self.k8s.check_rbac(user_role, verb, operation),
            self.k8s.check_syntax(operation, parameters),
        ]

        # 3. Operation-specific checks
        if operation == "delete_pod":
            pod_name = self._str_param(parameters, "pod_name")
            deployment = self._extract_deployment_name(pod_name)

            checks.append(self.k8s.check_resource_exists("pods", pod_name))
            checks.append(self.k8s.check_replica_count(deployment))

            affected_pods = 1
            related_services = self.k8s.get_related_services(deployment)

        elif operation == "scale_deployment":
            deployment = self._str_param(parameters, "deployment")
            checks.append(self.k8s.check_resource_exists("deployments", deployment))

            # Assumes the deployment currently runs 3 replicas. A missing or
            # non-integer value counts as 0 so the arithmetic cannot raise.
            target_replicas = self._int_param(parameters, "replicas", 0)
            affected_pods = abs(target_replicas - 3)
            related_services = self.k8s.get_related_services(deployment)

        elif operation in ("drop_table", "truncate_table"):
            # Both table operations are destructive and share the same checks.
            table_name = self._str_param(parameters, "table_name")
            checks.append(self.k8s.check_resource_exists("tables", table_name))
            checks.append(self.k8s.check_backup_available(table_name))

            related_services = self.k8s.get_related_services(table_name)
            data_impact = "DESTRUCTIVE"

        elif operation == "update_config":
            affected_pods = self._int_param(parameters, "affected_pods", 1)
            data_impact = "WRITE"

        # 4. Aggregate the overall result
        overall_passed = all(check.passed for check in checks)
        risk_level = self._calculate_risk_level(data_impact, affected_pods, overall_passed)

        return DryRunResult(
            checks=checks,
            blast_radius=BlastRadius(
                affected_pods=affected_pods,
                estimated_downtime=self.k8s.estimate_downtime(operation, "pods"),
                related_services=related_services,
                data_impact=data_impact,
            ),
            overall_passed=overall_passed,
            risk_level=risk_level,
        )

    @staticmethod
    def _str_param(parameters: dict, key: str) -> str:
        """Return ``parameters[key]`` when it is a string, otherwise ""."""
        value = parameters.get(key, "")
        return value if isinstance(value, str) else ""

    @staticmethod
    def _int_param(parameters: dict, key: str, default: int) -> int:
        """Return ``parameters[key]`` when it is a real int, otherwise ``default``."""
        value = parameters.get(key, default)
        # bool is a subclass of int but is never a meaningful count.
        if isinstance(value, bool) or not isinstance(value, int):
            return default
        return value

    def _operation_to_verb(self, operation: str) -> str:
        """Map an operation to its K8s verb.

        Operations missing from the table fall back to "get".
        """
        mapping = {
            "delete_pod": "delete",
            "scale_deployment": "update",
            "drop_table": "delete",
            "truncate_table": "delete",
            "update_config": "update",
            "restart_deployment": "update",
        }
        return mapping.get(operation, "get")

    def _extract_deployment_name(self, pod_name: str) -> str:
        """Extract the deployment name from a pod name.

        The last two dash-separated segments are stripped; names with fewer
        than three segments are returned unchanged.
        """
        # nginx-frontend-7d4b8c9f5-xk2m3 -> nginx-frontend
        parts = pod_name.rsplit("-", 2)
        return parts[0] if len(parts) >= 3 else pod_name

    def _calculate_risk_level(
        self,
        data_impact: str,
        affected_pods: int,
        all_checks_passed: bool,
    ) -> Literal["low", "medium", "high", "critical"]:
        """Compute the risk level.

        Any failed check or a DESTRUCTIVE data impact is "critical"; a WRITE
        impact or more than 5 affected pods is "high"; more than 1 affected
        pod is "medium"; everything else is "low".
        """
        if not all_checks_passed:
            return "critical"
        if data_impact == "DESTRUCTIVE":
            return "critical"
        if data_impact == "WRITE" or affected_pods > 5:
            return "high"
        if affected_pods > 1:
            return "medium"
        return "low"
|
|
|
|
|
|
# Global engine instance
dry_run_engine = DryRunEngine()
|