Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
## Phase 5 交付(ADR-086)
### 新增服務(4 個)
- blast_radius_calculator.py: 爆炸半徑計算器(0-100 純函數)
- 18 種 kubectl 動作基礎分 + 命名空間倍率 + 特殊 flag 修正
- HARD_RULES 永擋:delete ns/pv/pvc/clusterrole + rm -rf + DROP TABLE
- 分級:≤10 auto / 11-50 human / 51-99 dual / 100 blocked
- declarative_remediation.py: DeclarativeSpec 不可變規格(frozen dataclass)
- evaluate() 封裝 Blast Radius + dry-run + rollback_plan + constraints
- rollback_plan 從 kubectl 動作類型自動推導(不呼叫 LLM)
- gitops_pr_service.py: Gitea Issue 高風險修復審核(tier=dual)
- 含 Blast Radius + 目標狀態 + 回滾計畫 + 雙人審核流程
- AIOPS_P5_GITOPS_PR flag 守衛
- rollback_manager.py: 驗證失敗自動回滾
- 先驗 rollout history ≥ 2 revision,防止無版本可回滾
- kubectl rollout undo + 120s 收斂等待
### decision_manager.py 接線(AIOPS_P5_BLAST_RADIUS_CHECK)
- _auto_execute() 在安全守衛後、ApprovalRequest 前插入分級守衛
- blocked → 永擋 + 人工審核通知
- dual → 非同步 GitOps Issue + 升級人工審核
- human → 升級人工審核(不自動執行)
- auto(≤10)→ 原有自動執行流程
- 失敗降級:計算異常 → 保守升人工
### learning_service.py
- record_declarative_outcome(): 記錄 DeclarativeSpec 執行結果
anomaly_key=declarative:{incident_id},含 blast_radius_score/tier/rollback
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 5 全部完成
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
265 lines
11 KiB
Python
265 lines
11 KiB
Python
"""
|
||
AWOOOI AIOps Phase 5 — Rollback Manager(自動回滾管理器)
|
||
=========================================================
|
||
職責:當 PostExecutionVerifier 判斷執行結果為 failed/degraded 時,
|
||
自動觸發 Declarative rollback(kubectl rollout undo)。
|
||
|
||
設計原則:
|
||
1. 只回滾 Deployment(Rollout 管理的資源)— StatefulSet / DaemonSet 需人工
|
||
2. 回滾前:驗證有可回滾的版本(kubectl rollout history revision ≥ 2)
|
||
3. 回滾後:等待 120s 確認 rollout 收斂,記錄結果
|
||
4. 失敗降級:回滾自身失敗 → Tier 0 告警 + 人工介入
|
||
|
||
連結點:
|
||
PostExecutionVerifier.assess_recovery() → RollbackManager.trigger()
|
||
→ KubernetesMCPProvider.execute("kubectl rollout undo")
|
||
|
||
ADR-086: Phase 5 Declarative 修復與 Blast Radius 分控
|
||
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 5 初始建立
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from dataclasses import dataclass
|
||
from typing import TYPE_CHECKING
|
||
|
||
import structlog
|
||
|
||
from src.utils.timezone import now_taipei
|
||
|
||
if TYPE_CHECKING:
|
||
from src.services.declarative_remediation import DeclarativeSpec
|
||
|
||
logger = structlog.get_logger(__name__)
|
||
|
||
# Upper bound, in seconds, handed to `kubectl rollout status --timeout=...`
# when waiting for a rollback to converge.
ROLLBACK_CONVERGENCE_TIMEOUT_SEC = 120

# Maximum number of rollback retries (intended to prevent an endless
# rollback loop).
# NOTE(review): not referenced by the code visible in this module — confirm
# where (or whether) this limit is enforced.
ROLLBACK_MAX_RETRIES = 1
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Data Types
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
@dataclass
|
||
class RollbackResult:
|
||
"""回滾執行結果"""
|
||
success: bool
|
||
incident_id: str
|
||
deployment: str
|
||
namespace: str
|
||
rollback_command: str # 執行的 kubectl rollout undo 命令
|
||
convergence_confirmed: bool # rollout status 確認收斂
|
||
error: str | None
|
||
triggered_at: str
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Main Service
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
class RollbackManager:
    """Automatic rollback manager for Kubernetes Deployments.

    ``trigger`` is the entry point. For a ``"failed"`` / ``"degraded"``
    verification result it:

    1. checks that the Deployment has at least two rollout revisions,
    2. runs ``kubectl rollout undo``,
    3. waits for ``kubectl rollout status`` to report convergence.

    Usage:
        mgr = RollbackManager()
        result = await mgr.trigger(
            incident_id="INC-001",
            spec=declarative_spec,
            verification_result="failed",
        )
    """

    async def trigger(
        self,
        incident_id: str,
        spec: "DeclarativeSpec",
        verification_result: str,
    ) -> "RollbackResult":
        """Decide from the verification result whether to roll back, and do so.

        Args:
            incident_id: ID of the related incident.
            spec: The original DeclarativeSpec; ``target``, ``namespace`` and
                ``action`` are read from it.
            verification_result: Verifier verdict; only ``"failed"`` and
                ``"degraded"`` trigger a rollback.

        Returns:
            A RollbackResult describing what happened. Errors raised by the
            rollback steps are caught and reported through the result
            (``success=False`` with ``error`` set) instead of propagating.
            ``success=True`` means the undo command ran; whether the rollout
            converged is reported separately in ``convergence_confirmed``.
        """
        triggered_at = now_taipei().isoformat()

        def make_result(
            *,
            success: bool,
            command: str,
            error: str | None,
            converged: bool = False,
        ) -> "RollbackResult":
            # Single place that fills in the fields shared by every outcome.
            return RollbackResult(
                success=success,
                incident_id=incident_id,
                deployment=spec.target,
                namespace=spec.namespace,
                rollback_command=command,
                convergence_confirmed=converged,
                error=error,
                triggered_at=triggered_at,
            )

        # Any verdict other than failed / degraded means there is nothing to undo.
        if verification_result not in ("failed", "degraded"):
            return make_result(
                success=False,
                command="",
                error=f"verification_result={verification_result},無需回滾",
            )

        logger.warning(
            "rollback_triggered",
            incident_id=incident_id,
            deployment=spec.target,
            namespace=spec.namespace,
            verification_result=verification_result,
            original_action=spec.action[:80],
        )

        rollback_command = (
            f"kubectl rollout undo deployment/{spec.target} -n {spec.namespace}"
        )

        try:
            # Step 1: refuse to roll back when there is no earlier revision.
            if not await self._has_previous_revision(spec.target, spec.namespace):
                return make_result(
                    success=False,
                    command=rollback_command,
                    error="無前一個 revision 可回滾(rollout history 只有 1 個版本)",
                )

            # Step 2: run the undo command.
            if not await self._execute_rollback(rollback_command, spec.namespace):
                return make_result(
                    success=False,
                    command=rollback_command,
                    error="kubectl rollout undo 執行失敗",
                )

            # Step 3: wait for the rollout to converge. A timeout does not
            # flip `success`; it is surfaced via `convergence_confirmed`
            # and `error` so a human can confirm.
            converged = await self._wait_convergence(spec.target, spec.namespace)
            outcome = make_result(
                success=True,
                command=rollback_command,
                converged=converged,
                error=None if converged else "rollout status 超時未收斂,人工確認",
            )

            logger.info(
                "rollback_completed",
                incident_id=incident_id,
                deployment=spec.target,
                converged=converged,
            )
            return outcome

        except Exception as e:
            # Boundary handler: callers expect a result object, not an exception.
            logger.error(
                "rollback_failed_unexpected",
                incident_id=incident_id,
                deployment=spec.target,
                error=str(e),
            )
            return make_result(
                success=False,
                command=rollback_command,
                error=str(e),
            )

    # ──────────────────────────────────────────────────────────────────────────
    # Private Helpers
    # ──────────────────────────────────────────────────────────────────────────

    @staticmethod
    def _count_revisions(history_output: str) -> int:
        """Count revision rows in ``kubectl rollout history`` output.

        Only lines whose first whitespace-separated token is a number are
        counted. This skips the resource-name line (e.g.
        ``deployment.apps/web``) and the ``REVISION  CHANGE-CAUSE`` header,
        neither of which is a revision.
        """
        return sum(
            1
            for line in history_output.splitlines()
            if (tokens := line.split(None, 1)) and tokens[0].isdigit()
        )

    async def _has_previous_revision(self, deployment: str, namespace: str) -> bool:
        """Return True when the Deployment has at least two rollout revisions.

        Returns False when the history command reports failure or raises.
        """
        # Imported at call time, as in the rest of this class.
        from src.services.k8s_mcp import get_kubernetes_mcp

        k8s = get_kubernetes_mcp()
        history_cmd = f"kubectl rollout history deployment/{deployment} -n {namespace}"
        try:
            result = await k8s.execute(history_cmd)
            if not result.success:
                logger.warning(
                    "rollback_history_check_failed",
                    deployment=deployment,
                    error=result.error,
                )
                return False
            # A previous revision exists only if there are two or more
            # numbered revision rows.
            return self._count_revisions(result.output or "") >= 2
        except Exception as e:
            logger.warning("rollback_history_error", deployment=deployment, error=str(e))
            return False

    async def _execute_rollback(self, command: str, namespace: str) -> bool:
        """Run the given ``kubectl rollout undo`` command.

        ``namespace`` is accepted for interface compatibility but is not
        used here; the namespace is already part of ``command``.

        Returns True when the executor reports success, False on a reported
        failure or on any exception.
        """
        from src.services.k8s_mcp import get_kubernetes_mcp

        k8s = get_kubernetes_mcp()
        try:
            result = await k8s.execute(command)
        except Exception as e:
            logger.error("rollback_execute_error", command=command[:120], error=str(e))
            return False

        if not result.success:
            logger.warning(
                "rollback_command_failed",
                command=command[:120],
                error=result.error,
            )
            return False

        logger.info("rollback_command_executed", command=command[:120])
        return True

    async def _wait_convergence(self, deployment: str, namespace: str) -> bool:
        """Wait for the rollout to converge.

        Runs ``kubectl rollout status`` with a timeout of
        ``ROLLBACK_CONVERGENCE_TIMEOUT_SEC`` seconds. Returns True only when
        the command succeeds and its output contains
        "successfully rolled out" (case-insensitive).
        """
        from src.services.k8s_mcp import get_kubernetes_mcp

        k8s = get_kubernetes_mcp()
        status_cmd = (
            f"kubectl rollout status deployment/{deployment} -n {namespace} "
            f"--timeout={ROLLBACK_CONVERGENCE_TIMEOUT_SEC}s"
        )
        try:
            result = await k8s.execute(status_cmd)
            output = result.output or ""
            converged = result.success and "successfully rolled out" in output.lower()
            if not converged:
                logger.warning(
                    "rollback_convergence_timeout",
                    deployment=deployment,
                    namespace=namespace,
                    output=output[:200],
                )
            return converged
        except Exception as e:
            logger.warning(
                "rollback_convergence_check_error",
                deployment=deployment,
                error=str(e),
            )
            return False
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Singleton
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
_manager: RollbackManager | None = None
|
||
|
||
|
||
def get_rollback_manager() -> RollbackManager:
    """Return the module-level RollbackManager, creating it on first call."""
    global _manager
    if _manager is not None:
        return _manager
    # First call: build the instance and cache it for later callers.
    _manager = RollbackManager()
    return _manager
|