"""
AWOOOI AIOps Phase 5 — Rollback Manager (automatic rollback manager)
=====================================================================

Responsibility: when PostExecutionVerifier judges an execution result to be
failed/degraded, automatically trigger a declarative rollback
(``kubectl rollout undo``).

Design principles:
1. Only Deployments (resources managed by a rollout) are rolled back —
   StatefulSet / DaemonSet require manual handling.
2. Before rolling back: verify that a revision to roll back to exists
   (``kubectl rollout history`` shows at least 2 revisions).
3. After rolling back: wait up to 120s to confirm the rollout converged and
   record the outcome.
4. Failure degradation: if the rollback itself fails → Tier 0 alert + human
   intervention.

Integration point:
  PostExecutionVerifier.assess_recovery()
    → RollbackManager.trigger()
    → KubernetesMCPProvider.execute("kubectl rollout undo")

ADR-086: Phase 5 declarative remediation and blast-radius control

2026-04-15 ogt + Claude Sonnet 4.6 (APAC): Phase 5 initial creation
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING

import structlog

from src.utils.timezone import now_taipei

if TYPE_CHECKING:
    from src.services.declarative_remediation import DeclarativeSpec

logger = structlog.get_logger(__name__)

# Timeout (seconds) passed to `kubectl rollout status` after a rollback.
ROLLBACK_CONVERGENCE_TIMEOUT_SEC = 120

# Maximum number of rollback retries (guards against an infinite loop).
# NOTE(review): not referenced anywhere else in this module — confirm callers.
ROLLBACK_MAX_RETRIES = 1


# ─────────────────────────────────────────────────────────────────────────────
# Data Types
# ─────────────────────────────────────────────────────────────────────────────

@dataclass
class RollbackResult:
    """Outcome of a rollback attempt."""

    success: bool
    incident_id: str
    deployment: str
    namespace: str
    rollback_command: str        # the `kubectl rollout undo` command ("" if never built)
    convergence_confirmed: bool  # True when `rollout status` confirmed convergence
    error: str | None
    triggered_at: str            # ISO-8601 timestamp taken when trigger() started


# ─────────────────────────────────────────────────────────────────────────────
# Main Service
# ─────────────────────────────────────────────────────────────────────────────

class RollbackManager:
    """
    Automatic rollback manager.

    Usage:
        mgr = RollbackManager()
        result = await mgr.trigger(
            incident_id="INC-001",
            spec=declarative_spec,
            verification_result="failed",
        )
    """

    async def trigger(
        self,
        incident_id: str,
        spec: "DeclarativeSpec",
        verification_result: str,
    ) -> RollbackResult:
        """
        Decide from the verification result whether to roll back, and do it.

        Args:
            incident_id: ID of the associated incident.
            spec: The original DeclarativeSpec; ``target``, ``namespace`` and
                ``action`` are read from it.
            verification_result: PostExecutionVerifier verdict; only
                ``"failed"`` and ``"degraded"`` trigger a rollback.

        Returns:
            A RollbackResult describing the outcome. Failures inside the
            rollback steps are reported via ``success=False`` / ``error``
            rather than raised.
        """
        triggered_at = now_taipei().isoformat()

        def build_result(
            *,
            success: bool,
            command: str,
            error: str | None,
            converged: bool = False,
        ) -> RollbackResult:
            # Single place that fills in the fields shared by every outcome.
            return RollbackResult(
                success=success,
                incident_id=incident_id,
                deployment=spec.target,
                namespace=spec.namespace,
                rollback_command=command,
                convergence_confirmed=converged,
                error=error,
                triggered_at=triggered_at,
            )

        # Only failed / degraded verdicts trigger a rollback.
        if verification_result not in ("failed", "degraded"):
            return build_result(
                success=False,
                command="",
                error=f"verification_result={verification_result},無需回滾",
            )

        logger.warning(
            "rollback_triggered",
            incident_id=incident_id,
            deployment=spec.target,
            namespace=spec.namespace,
            verification_result=verification_result,
            original_action=spec.action[:80],
        )

        # NOTE(review): target/namespace are interpolated unquoted; this
        # assumes they were validated upstream as Kubernetes names — confirm.
        rollback_command = (
            f"kubectl rollout undo deployment/{spec.target} -n {spec.namespace}"
        )

        try:
            # 1. Make sure there is a revision to roll back to.
            can_rollback = await self._has_previous_revision(
                spec.target, spec.namespace
            )
            if not can_rollback:
                return build_result(
                    success=False,
                    command=rollback_command,
                    error="無前一個 revision 可回滾(rollout history 只有 1 個版本)",
                )

            # 2. Execute the rollback.
            exec_result = await self._execute_rollback(
                rollback_command, spec.namespace
            )
            if not exec_result:
                return build_result(
                    success=False,
                    command=rollback_command,
                    error="kubectl rollout undo 執行失敗",
                )

            # 3. Confirm convergence with `kubectl rollout status`.
            converged = await self._wait_convergence(spec.target, spec.namespace)

            # The undo command succeeded, so the result is a success even if
            # convergence could not be confirmed; that case carries an error
            # note asking for manual confirmation.
            result = build_result(
                success=True,
                command=rollback_command,
                converged=converged,
                error=None if converged else "rollout status 超時未收斂,人工確認",
            )
            logger.info(
                "rollback_completed",
                incident_id=incident_id,
                deployment=spec.target,
                converged=converged,
            )
            return result

        except Exception as e:
            # Boundary handler: report unexpected errors in the result
            # instead of propagating them to the caller.
            logger.error(
                "rollback_failed_unexpected",
                incident_id=incident_id,
                deployment=spec.target,
                error=str(e),
            )
            return build_result(
                success=False,
                command=rollback_command,
                error=str(e),
            )

    # ──────────────────────────────────────────────────────────────────────────
    # Private Helpers
    # ──────────────────────────────────────────────────────────────────────────

    @staticmethod
    def _count_revisions(history_output: str) -> int:
        """Count revision rows in `kubectl rollout history` output.

        The output starts with a resource-name line (for example
        ``deployment.apps/nginx``) and a ``REVISION  CHANGE-CAUSE`` header;
        only rows whose first column is a number are actual revisions.
        """
        count = 0
        for line in history_output.splitlines():
            columns = line.split()
            if columns and columns[0].isdigit():
                count += 1
        return count

    async def _has_previous_revision(self, deployment: str, namespace: str) -> bool:
        """Return True when the Deployment has at least 2 rollout revisions.

        Any failure while querying the history is logged and treated as
        "cannot roll back" (returns False).
        """
        # Imported at call time, as in the original code.
        from src.services.k8s_mcp import get_kubernetes_mcp

        k8s = get_kubernetes_mcp()
        history_cmd = f"kubectl rollout history deployment/{deployment} -n {namespace}"
        try:
            result = await k8s.execute(history_cmd)
            if not result.success:
                logger.warning(
                    "rollback_history_check_failed",
                    deployment=deployment,
                    error=result.error,
                )
                return False
            # Count numeric revision rows only; counting every non-header
            # line would include the resource-name line and report a
            # single-revision Deployment as having a previous revision.
            return self._count_revisions(result.output or "") >= 2
        except Exception as e:
            logger.warning(
                "rollback_history_error", deployment=deployment, error=str(e)
            )
            return False

    async def _execute_rollback(self, command: str, namespace: str) -> bool:
        """Run the given `kubectl rollout undo` command.

        Args:
            command: The full command string to execute.
            namespace: Not used by this method; the namespace is already part
                of ``command``. Kept for interface compatibility.

        Returns:
            True when the provider reports success, False on a reported
            failure or on any exception (both are logged).
        """
        from src.services.k8s_mcp import get_kubernetes_mcp

        k8s = get_kubernetes_mcp()
        try:
            result = await k8s.execute(command)
            if result.success:
                logger.info("rollback_command_executed", command=command[:120])
                return True
            logger.warning(
                "rollback_command_failed",
                command=command[:120],
                error=result.error,
            )
            return False
        except Exception as e:
            logger.error(
                "rollback_execute_error", command=command[:120], error=str(e)
            )
            return False

    async def _wait_convergence(self, deployment: str, namespace: str) -> bool:
        """Wait for the rollout to converge.

        Runs `kubectl rollout status` with a
        ``--timeout`` of ROLLBACK_CONVERGENCE_TIMEOUT_SEC seconds.

        Returns:
            True only when the command succeeded and its output contains
            "successfully rolled out" (case-insensitive); False otherwise,
            including when the check itself raises.
        """
        from src.services.k8s_mcp import get_kubernetes_mcp

        k8s = get_kubernetes_mcp()
        status_cmd = (
            f"kubectl rollout status deployment/{deployment} -n {namespace} "
            f"--timeout={ROLLBACK_CONVERGENCE_TIMEOUT_SEC}s"
        )
        try:
            result = await k8s.execute(status_cmd)
            converged = (
                result.success
                and "successfully rolled out" in (result.output or "").lower()
            )
            if not converged:
                logger.warning(
                    "rollback_convergence_timeout",
                    deployment=deployment,
                    namespace=namespace,
                    output=(result.output or "")[:200],
                )
            return converged
        except Exception as e:
            logger.warning(
                "rollback_convergence_check_error",
                deployment=deployment,
                error=str(e),
            )
            return False


# ─────────────────────────────────────────────────────────────────────────────
# Singleton
# ─────────────────────────────────────────────────────────────────────────────

_manager: RollbackManager | None = None


def get_rollback_manager() -> RollbackManager:
    """Return the process-wide RollbackManager, creating it on first use."""
    global _manager
    if _manager is None:
        _manager = RollbackManager()
    return _manager