Files
awoooi/apps/api/src/services/dry_run.py
OG T 196d269b92 feat: add all application source code
- apps/api: FastAPI backend with Dockerfile
- apps/web: Next.js frontend with Dockerfile
- apps/sensor: Signal collection agent
- packages: shared packages

Co-Authored-By: Claude <noreply@anthropic.com>
2026-03-22 18:57:44 +08:00

316 lines
10 KiB
Python

"""
Dry-Run 預演引擎
Phase 2.2: HITL Dry-Run Validation
模擬 K8s 操作的預檢查,回傳 ApprovalCard 所需的 dryRunChecks 格式
"""
import re
from dataclasses import dataclass
from enum import Enum
from typing import Literal
class CheckStatus(Enum):
    """Possible outcomes of a single dry-run check."""

    # The check succeeded.
    PASSED = "passed"
    # The check did not succeed.
    FAILED = "failed"
    # The check raised a warning.
    WARNING = "warning"
@dataclass
class DryRunCheck:
"""單項檢查結果"""
name: str
passed: bool
message: str | None = None
@dataclass
class BlastRadius:
    """Blast-radius assessment for an operation."""

    # Number of pods the operation touches.
    affected_pods: int
    # Human-readable downtime estimate, e.g. "~2 min".
    estimated_downtime: str
    # Names of services related to the targeted resource.
    related_services: list[str]
    # How the operation affects data.
    data_impact: Literal["NONE", "READ_ONLY", "WRITE", "DESTRUCTIVE"]
@dataclass
class DryRunResult:
    """Complete result of a dry-run evaluation."""

    # Every check that was executed, in execution order.
    checks: list[DryRunCheck]
    # Blast-radius assessment of the operation.
    blast_radius: BlastRadius
    # Aggregate pass/fail verdict over the checks.
    overall_passed: bool
    # Risk classification of the operation.
    risk_level: Literal["low", "medium", "high", "critical"]
class MockK8sClient:
    """In-memory stand-in for a Kubernetes client.

    Phase 2.2: mock data is used to validate the API contract.
    Phase 3+: to be replaced by a real kubernetes client.
    """

    # Mock RBAC table: role -> allowed verbs ("*" means every verb).
    MOCK_RBAC = {
        "cluster-admin": ["*"],
        "developer": ["get", "list", "watch", "create", "update"],
        "viewer": ["get", "list", "watch"],
    }

    # Mock inventory: resource type -> names of the resources that exist.
    MOCK_RESOURCES = {
        "pods": [
            "nginx-frontend-7d4b8c9f5-xk2m3",
            "nginx-frontend-7d4b8c9f5-ab12c",
            "nginx-frontend-7d4b8c9f5-de34f",
            "api-server-8c7d6e5f4-gh56i",
            "redis-master-0",
        ],
        "deployments": ["nginx-frontend", "api-server", "redis"],
        "services": ["nginx-ingress", "frontend-svc", "api-svc", "redis-svc"],
        "tables": ["users", "user_sessions", "orders", "products"],
    }

    # Mock service dependency graph: resource name -> related services.
    MOCK_DEPENDENCIES = {
        "nginx-frontend": ["nginx-ingress", "frontend-svc", "cdn-cache"],
        "api-server": ["api-svc", "redis-svc", "postgres"],
        "redis": ["redis-svc", "api-server"],
        "user_sessions": ["auth-service", "api-gateway", "user-service"],
    }

    # Characters accepted in a pod name. Used with fullmatch() because a
    # "$" anchor would also match just before a trailing newline.
    _POD_NAME_PATTERN = re.compile(r"[a-z0-9-]+")

    # Operations that target a database table and must name it.
    _TABLE_OPERATIONS = ("drop_table", "truncate_table")

    # Downtime estimate per operation; anything else is "Unknown".
    _DOWNTIME_ESTIMATES = {
        "delete_pod": "~2 min",  # time for the pod to be recreated
        "scale_deployment": "~30 sec",
        # Database operations do not affect service availability.
        "drop_table": "0",
        "truncate_table": "0",
        "restart_deployment": "~5 min",
    }

    def check_rbac(self, role: str, verb: str, resource: str) -> DryRunCheck:
        """Check whether ``role`` may perform ``verb``.

        Args:
            role: Role name looked up in ``MOCK_RBAC``; unknown roles have
                no permissions.
            verb: Verb that must be granted (or covered by ``"*"``).
            resource: Accepted for interface compatibility; the mock does
                not consult it.

        Returns:
            A "RBAC Permission" check whose message is the role name on
            success, or a "Missing <verb> permission" text on failure.
        """
        permissions = self.MOCK_RBAC.get(role, [])
        has_permission = "*" in permissions or verb in permissions
        return DryRunCheck(
            name="RBAC Permission",
            passed=has_permission,
            message=role if has_permission else f"Missing {verb} permission",
        )

    def check_syntax(self, operation: str, parameters: dict) -> DryRunCheck:
        """Validate the parameters of ``operation``.

        Operations without a validation rule pass unconditionally.

        Args:
            operation: Operation type, e.g. ``"delete_pod"``.
            parameters: Operation parameters to validate.

        Returns:
            A "Syntax Valid" check; ``message`` explains the first problem
            found and is ``None`` when the parameters are acceptable.
        """
        message = None

        if operation == "delete_pod":
            if "pod_name" not in parameters:
                message = "Missing pod_name"
            else:
                pod_name = parameters["pod_name"]
                # Non-string values are reported as a failed check instead
                # of raising TypeError from the regex engine.
                if not isinstance(pod_name, str) or not self._POD_NAME_PATTERN.fullmatch(
                    pod_name
                ):
                    message = "Invalid pod name format"
        elif operation == "scale_deployment":
            replicas = parameters.get("replicas")
            # bool is a subclass of int, so it has to be excluded explicitly.
            if isinstance(replicas, bool) or not isinstance(replicas, int):
                message = "Invalid replicas value"
            elif not 0 <= replicas <= 100:
                message = "Replicas must be 0-100"
        elif operation in self._TABLE_OPERATIONS:
            if "table_name" not in parameters:
                message = "Missing table_name"

        return DryRunCheck(
            name="Syntax Valid",
            passed=message is None,
            message=message,
        )

    def check_resource_exists(
        self, resource_type: str, resource_name: str
    ) -> DryRunCheck:
        """Check whether ``resource_name`` exists among ``resource_type``.

        Args:
            resource_type: Key of ``MOCK_RESOURCES`` (plural form, e.g.
                ``"pods"``); unknown types contain no resources.
            resource_name: Name to look for.

        Returns:
            A "Resource Exists" check. On success the message is the
            resource type without its last character, title-cased, plus
            " found"; otherwise "Not found".
        """
        resources = self.MOCK_RESOURCES.get(resource_type, [])
        exists = resource_name in resources
        return DryRunCheck(
            name="Resource Exists",
            passed=exists,
            message=f"{resource_type[:-1].title()} found" if exists else "Not found",
        )

    def check_replica_count(self, deployment_name: str) -> DryRunCheck:
        """Check that a deployment has redundancy before a pod is deleted.

        Mock behaviour: every known deployment has 3 replicas, unknown
        deployments have 0.

        Args:
            deployment_name: Deployment to inspect.

        Returns:
            A "Replica Count > 1" check that passes when more than one
            replica is available.
        """
        replica_count = 3 if deployment_name in self.MOCK_RESOURCES["deployments"] else 0
        safe = replica_count > 1
        return DryRunCheck(
            name="Replica Count > 1",
            passed=safe,
            message=f"{replica_count} replicas" if safe else "Single replica!",
        )

    def check_backup_available(self, table_name: str) -> DryRunCheck:
        """Check whether a recent backup exists for a table.

        Mock behaviour: only ``user_sessions`` has no backup.

        Args:
            table_name: Table to inspect.

        Returns:
            A "Backup Available" check; the message is set only on failure.
        """
        has_backup = table_name != "user_sessions"
        return DryRunCheck(
            name="Backup Available",
            passed=has_backup,
            message=None if has_backup else "No recent backup!",
        )

    def get_related_services(self, resource_name: str) -> list[str]:
        """Return the services related to ``resource_name``.

        A new list is returned on every call so that callers cannot mutate
        the shared ``MOCK_DEPENDENCIES`` table by accident.

        Args:
            resource_name: Key of ``MOCK_DEPENDENCIES``.

        Returns:
            The related service names, or an empty list for unknown names.
        """
        return list(self.MOCK_DEPENDENCIES.get(resource_name, []))

    def estimate_downtime(self, operation: str, resource_type: str) -> str:
        """Estimate the downtime caused by ``operation``.

        Args:
            operation: Operation type, e.g. ``"delete_pod"``.
            resource_type: Accepted for interface compatibility; the mock
                does not consult it.

        Returns:
            A human-readable estimate, or "Unknown" when the operation has
            no estimate.
        """
        return self._DOWNTIME_ESTIMATES.get(operation, "Unknown")
class DryRunEngine:
    """Dry-run engine.

    Runs safety checks before an operation is executed and returns the
    result in the shape needed by the frontend ApprovalCard.
    """

    # Replica count the mock assumes every deployment is currently running.
    _ASSUMED_CURRENT_REPLICAS = 3

    # Operation type -> K8s verb needed to perform it.
    _VERB_BY_OPERATION = {
        "delete_pod": "delete",
        "scale_deployment": "update",
        "drop_table": "delete",
        "truncate_table": "delete",
        "update_config": "update",
        "restart_deployment": "update",
    }

    def __init__(self):
        """Create an engine backed by the mock K8s client."""
        self.k8s = MockK8sClient()

    def evaluate(
        self,
        operation: str,
        parameters: dict,
        user_role: str = "cluster-admin",
    ) -> DryRunResult:
        """Run the dry-run for ``operation``.

        Malformed parameter values are never allowed to raise here: they
        are reported through the failing checks so that the caller always
        receives a ``DryRunResult``.

        Args:
            operation: Operation type (delete_pod, scale_deployment,
                drop_table, truncate_table, update_config, ...).
            parameters: Operation parameters.
            user_role: Role of the user requesting the operation.

        Returns:
            A ``DryRunResult`` holding every check result and the
            blast-radius assessment.
        """
        # 1. RBAC check, 2. syntax check -- always executed, in this order.
        checks: list[DryRunCheck] = [
            self.k8s.check_rbac(user_role, self._operation_to_verb(operation), operation),
            self.k8s.check_syntax(operation, parameters),
        ]

        affected_pods = 0
        data_impact: Literal["NONE", "READ_ONLY", "WRITE", "DESTRUCTIVE"] = "NONE"
        related_services: list[str] = []

        # 3. Operation-specific checks.
        # NOTE(review): "restart_deployment" has a verb mapping but no branch
        # here, so it is reported with zero affected pods -- confirm intent.
        if operation == "delete_pod":
            pod_name = self._string_param(parameters, "pod_name")
            deployment = self._extract_deployment_name(pod_name)
            checks.append(self.k8s.check_resource_exists("pods", pod_name))
            checks.append(self.k8s.check_replica_count(deployment))
            affected_pods = 1
            related_services = self.k8s.get_related_services(deployment)
        elif operation == "scale_deployment":
            deployment = self._string_param(parameters, "deployment")
            checks.append(self.k8s.check_resource_exists("deployments", deployment))
            replicas = parameters.get("replicas", 0)
            if not isinstance(replicas, int):
                # Already rejected by the syntax check; fall back to the
                # missing-value default instead of raising TypeError below.
                replicas = 0
            affected_pods = abs(replicas - self._ASSUMED_CURRENT_REPLICAS)
            related_services = self.k8s.get_related_services(deployment)
        elif operation in ("drop_table", "truncate_table"):
            table_name = self._string_param(parameters, "table_name")
            checks.append(self.k8s.check_resource_exists("tables", table_name))
            checks.append(self.k8s.check_backup_available(table_name))
            related_services = self.k8s.get_related_services(table_name)
            data_impact = "DESTRUCTIVE"
        elif operation == "update_config":
            requested = parameters.get("affected_pods", 1)
            # Keep BlastRadius.affected_pods an integer whatever was sent.
            affected_pods = requested if isinstance(requested, int) else 1
            data_impact = "WRITE"

        # 4. Aggregate verdict and risk level.
        overall_passed = all(check.passed for check in checks)
        risk_level = self._calculate_risk_level(data_impact, affected_pods, overall_passed)

        return DryRunResult(
            checks=checks,
            blast_radius=BlastRadius(
                affected_pods=affected_pods,
                estimated_downtime=self.k8s.estimate_downtime(operation, "pods"),
                related_services=related_services,
                data_impact=data_impact,
            ),
            overall_passed=overall_passed,
            risk_level=risk_level,
        )

    @staticmethod
    def _string_param(parameters: dict, key: str) -> str:
        """Return ``parameters[key]`` if it is a string, otherwise ``""``."""
        value = parameters.get(key, "")
        return value if isinstance(value, str) else ""

    def _operation_to_verb(self, operation: str) -> str:
        """Map an operation type to a K8s verb; unmapped operations give "get"."""
        return self._VERB_BY_OPERATION.get(operation, "get")

    def _extract_deployment_name(self, pod_name: str) -> str:
        """Derive the deployment name from a pod name.

        The last two dash-separated segments are dropped, e.g.
        ``nginx-frontend-7d4b8c9f5-xk2m3`` -> ``nginx-frontend``. Names with
        fewer than three segments are returned unchanged.
        """
        parts = pod_name.rsplit("-", 2)
        return parts[0] if len(parts) >= 3 else pod_name

    def _calculate_risk_level(
        self,
        data_impact: str,
        affected_pods: int,
        all_checks_passed: bool,
    ) -> Literal["low", "medium", "high", "critical"]:
        """Classify the risk of an operation.

        Args:
            data_impact: Data impact of the operation.
            affected_pods: Number of pods the operation touches.
            all_checks_passed: Whether every check passed.

        Returns:
            "critical" when a check failed or data is destroyed, "high" for
            writes or more than 5 pods, "medium" for more than 1 pod,
            otherwise "low".
        """
        if not all_checks_passed:
            return "critical"
        if data_impact == "DESTRUCTIVE":
            return "critical"
        if data_impact == "WRITE" or affected_pods > 5:
            return "high"
        if affected_pods > 1:
            return "medium"
        return "low"
# Global engine instance, created once when this module is imported.
dry_run_engine = DryRunEngine()