Files
awoooi/apps/api/src/services/rollback_manager.py
OG T 655d1a568a
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
feat(Phase 5): Declarative 修復抽象化 + Blast Radius 分控 全部完成
## Phase 5 交付(ADR-086)

### 新增服務(4 個)
- blast_radius_calculator.py: 爆炸半徑計算器(0-100 純函數)
  - 18 種 kubectl 動作基礎分 + 命名空間倍率 + 特殊 flag 修正
  - HARD_RULES 永擋:delete ns/pv/pvc/clusterrole + rm -rf + DROP TABLE
  - 分級:≤10 auto / 11-50 human / 51-99 dual / 100 blocked
- declarative_remediation.py: DeclarativeSpec 不可變規格(frozen dataclass)
  - evaluate() 封裝 Blast Radius + dry-run + rollback_plan + constraints
  - rollback_plan 從 kubectl 動作類型自動推導(不呼叫 LLM)
- gitops_pr_service.py: Gitea Issue 高風險修復審核(tier=dual)
  - 含 Blast Radius + 目標狀態 + 回滾計畫 + 雙人審核流程
  - AIOPS_P5_GITOPS_PR flag 守衛
- rollback_manager.py: 驗證失敗自動回滾
  - 先驗 rollout history ≥ 2 revision,防止無版本可回滾
  - kubectl rollout undo + 120s 收斂等待

### decision_manager.py 接線(AIOPS_P5_BLAST_RADIUS_CHECK)
- _auto_execute() 在安全守衛後、ApprovalRequest 前插入分級守衛
- blocked → 永擋 + 人工審核通知
- dual → 非同步 GitOps Issue + 升級人工審核
- human → 升級人工審核(不自動執行)
- auto(≤10)→ 原有自動執行流程
- 失敗降級:計算異常 → 保守升人工

### learning_service.py
- record_declarative_outcome(): 記錄 DeclarativeSpec 執行結果
  anomaly_key=declarative:{incident_id},含 blast_radius_score/tier/rollback

2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 5 全部完成

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-15 16:06:54 +08:00

265 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
AWOOOI AIOps Phase 5 — Rollback Manager(自動回滾管理器)
=========================================================
職責:當 PostExecutionVerifier 判斷執行結果為 failed/degraded 時,
自動觸發 Declarative rollback(kubectl rollout undo)。
設計原則:
1. 只回滾 Deployment(Rollout 管理的資源)— StatefulSet / DaemonSet 需人工
2. 回滾前驗證有可回滾的版本(kubectl rollout history revision ≥ 2)
3. 回滾後:等待 120s 確認 rollout 收斂,記錄結果
4. 失敗降級:回滾自身失敗 → Tier 0 告警 + 人工介入
連結點:
PostExecutionVerifier.assess_recovery() → RollbackManager.trigger()
→ KubernetesMCPProvider.execute("kubectl rollout undo")
ADR-086: Phase 5 Declarative 修復與 Blast Radius 分控
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 5 初始建立
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import TYPE_CHECKING
import structlog
from src.utils.timezone import now_taipei
if TYPE_CHECKING:
from src.services.declarative_remediation import DeclarativeSpec
logger = structlog.get_logger(__name__)
# Timeout, in seconds, passed to `kubectl rollout status` while waiting for
# the rollout to converge after a rollback.
ROLLBACK_CONVERGENCE_TIMEOUT_SEC = 120
# Maximum number of rollback retries (guards against an infinite loop).
# NOTE(review): not referenced in the visible part of this file — confirm it
# is actually enforced somewhere.
ROLLBACK_MAX_RETRIES = 1
# ─────────────────────────────────────────────────────────────────────────────
# Data Types
# ─────────────────────────────────────────────────────────────────────────────
@dataclass()
class RollbackResult:
    """Outcome of a single rollback attempt.

    Field order is part of the constructor signature and must not change.
    """

    # Overall outcome flag of the rollback attempt.
    success: bool
    # Incident this rollback is associated with.
    incident_id: str
    # Name of the Deployment that was (or would have been) rolled back.
    deployment: str
    # Namespace of that Deployment.
    namespace: str
    # The `kubectl rollout undo` command that was executed ("" when none was built).
    rollback_command: str
    # Whether `kubectl rollout status` confirmed that the rollout converged.
    convergence_confirmed: bool
    # Human-readable failure / warning description, or None when there is none.
    error: str | None
    # ISO-8601 timestamp recorded when the rollback was triggered.
    triggered_at: str
# ─────────────────────────────────────────────────────────────────────────────
# Main Service
# ─────────────────────────────────────────────────────────────────────────────
class RollbackManager:
    """Automatic rollback manager for Kubernetes Deployments.

    When post-execution verification reports ``"failed"`` or ``"degraded"``,
    runs ``kubectl rollout undo`` against the Deployment named by the
    ``DeclarativeSpec`` and then checks that the rollout converged.

    Usage:
        mgr = RollbackManager()
        result = await mgr.trigger(
            incident_id="INC-001",
            spec=declarative_spec,
            verification_result="failed",
        )
    """

    async def trigger(
        self,
        incident_id: str,
        spec: "DeclarativeSpec",
        verification_result: str,
    ) -> RollbackResult:
        """Decide from the verification result whether to roll back, and do it.

        Args:
            incident_id: ID of the related incident.
            spec: The original DeclarativeSpec; its ``target``, ``namespace``
                and ``action`` attributes are read here.
            verification_result: Verifier verdict; only ``"failed"`` and
                ``"degraded"`` trigger a rollback.

        Returns:
            A RollbackResult. Failures of the rollback steps themselves are
            reported through ``success=False`` / ``error`` instead of being
            raised. Note that ``success`` is True as soon as the undo command
            succeeded; whether the rollout converged is reported separately in
            ``convergence_confirmed`` (with an explanatory ``error`` when it
            did not).
        """
        triggered_at = now_taipei().isoformat()

        def outcome(
            *,
            success: bool,
            command: str,
            converged: bool = False,
            error: str | None = None,
        ) -> RollbackResult:
            # Single place that fills in the fields shared by every result.
            return RollbackResult(
                success=success,
                incident_id=incident_id,
                deployment=spec.target,
                namespace=spec.namespace,
                rollback_command=command,
                convergence_confirmed=converged,
                error=error,
                triggered_at=triggered_at,
            )

        # Only failed / degraded verification results trigger a rollback.
        if verification_result not in ("failed", "degraded"):
            return outcome(
                success=False,
                command="",
                error=f"verification_result={verification_result},無需回滾",
            )

        logger.warning(
            "rollback_triggered",
            incident_id=incident_id,
            deployment=spec.target,
            namespace=spec.namespace,
            verification_result=verification_result,
            original_action=spec.action[:80],
        )
        rollback_command = (
            f"kubectl rollout undo deployment/{spec.target} -n {spec.namespace}"
        )

        try:
            # 1. Make sure there is an earlier revision to roll back to.
            can_rollback = await self._has_previous_revision(spec.target, spec.namespace)
            if not can_rollback:
                return outcome(
                    success=False,
                    command=rollback_command,
                    error="無前一個 revision 可回滾(rollout history 只有 1 個版本)",
                )

            # 2. Run the rollback.
            exec_ok = await self._execute_rollback(rollback_command, spec.namespace)
            if not exec_ok:
                return outcome(
                    success=False,
                    command=rollback_command,
                    error="kubectl rollout undo 執行失敗",
                )

            # 3. Wait for convergence, confirmed through `kubectl rollout status`.
            converged = await self._wait_convergence(spec.target, spec.namespace)
            result = outcome(
                success=True,
                command=rollback_command,
                converged=converged,
                error=None if converged else "rollout status 超時未收斂,人工確認",
            )
            logger.info(
                "rollback_completed",
                incident_id=incident_id,
                deployment=spec.target,
                converged=converged,
            )
            return result
        except Exception as e:
            # Boundary handler: callers rely on getting a result object back.
            logger.error(
                "rollback_failed_unexpected",
                incident_id=incident_id,
                deployment=spec.target,
                error=str(e),
            )
            return outcome(success=False, command=rollback_command, error=str(e))

    # ──────────────────────────────────────────────────────────────────────────
    # Private Helpers
    # ──────────────────────────────────────────────────────────────────────────
    @staticmethod
    def _count_revisions(history_output: str) -> int:
        """Count the revision rows in ``kubectl rollout history`` output.

        The output starts with a resource-name line (for example
        ``deployment.apps/web``) followed by a ``REVISION  CHANGE-CAUSE``
        header; neither is a revision. Only rows whose first field is a
        revision number are counted, so those header lines and blank lines
        are ignored.
        """
        count = 0
        for line in history_output.splitlines():
            fields = line.split()
            if fields and fields[0].isdigit():
                count += 1
        return count

    async def _has_previous_revision(self, deployment: str, namespace: str) -> bool:
        """Return True when the Deployment's rollout history has ≥ 2 revisions.

        Returns False when the history command fails or raises, so a rollback
        is never attempted without a confirmed earlier revision.
        """
        from src.services.k8s_mcp import get_kubernetes_mcp

        k8s = get_kubernetes_mcp()
        history_cmd = f"kubectl rollout history deployment/{deployment} -n {namespace}"
        try:
            result = await k8s.execute(history_cmd)
            if not result.success:
                logger.warning(
                    "rollback_history_check_failed",
                    deployment=deployment,
                    error=result.error,
                )
                return False
            # Count real revision rows only; counting every non-header line
            # would let a single-revision Deployment through because of the
            # leading resource-name line.
            return self._count_revisions(result.output or "") >= 2
        except Exception as e:
            logger.warning("rollback_history_error", deployment=deployment, error=str(e))
            return False

    async def _execute_rollback(self, command: str, namespace: str) -> bool:
        """Run the given ``kubectl rollout undo`` command.

        ``namespace`` is accepted for interface compatibility but is not used
        here; the namespace is already part of ``command``.

        Returns:
            True when the command reports success, False when it reports
            failure or raises.
        """
        from src.services.k8s_mcp import get_kubernetes_mcp

        k8s = get_kubernetes_mcp()
        try:
            result = await k8s.execute(command)
            if result.success:
                logger.info("rollback_command_executed", command=command[:120])
                return True
            logger.warning("rollback_command_failed", command=command[:120], error=result.error)
            return False
        except Exception as e:
            logger.error("rollback_execute_error", command=command[:120], error=str(e))
            return False

    async def _wait_convergence(self, deployment: str, namespace: str) -> bool:
        """Wait for the rollout to converge (at most ROLLBACK_CONVERGENCE_TIMEOUT_SEC seconds).

        Returns:
            True only when ``kubectl rollout status`` succeeds and its output
            contains "successfully rolled out"; False otherwise, including
            when the status command raises.
        """
        from src.services.k8s_mcp import get_kubernetes_mcp

        k8s = get_kubernetes_mcp()
        status_cmd = (
            f"kubectl rollout status deployment/{deployment} -n {namespace} "
            f"--timeout={ROLLBACK_CONVERGENCE_TIMEOUT_SEC}s"
        )
        try:
            result = await k8s.execute(status_cmd)
            converged = result.success and "successfully rolled out" in (result.output or "").lower()
            if not converged:
                logger.warning(
                    "rollback_convergence_timeout",
                    deployment=deployment,
                    namespace=namespace,
                    output=(result.output or "")[:200],
                )
            return converged
        except Exception as e:
            logger.warning("rollback_convergence_check_error", deployment=deployment, error=str(e))
            return False
# ─────────────────────────────────────────────────────────────────────────────
# Singleton
# ─────────────────────────────────────────────────────────────────────────────
_manager: RollbackManager | None = None


def get_rollback_manager() -> RollbackManager:
    """Return the module-wide RollbackManager, creating it on first use."""
    global _manager
    instance = _manager
    if instance is None:
        # First call: build the manager and cache it for later callers.
        instance = _manager = RollbackManager()
    return instance