【P0.1】knowledge_extractor_service.py:210 — AttributeError 修復 - Signal.description 欄位不存在(100% 失敗,KM 每天+5 根因) - 改用 alert_name + annotations.summary 拼接文字 【P0.2+P0.3】Gate 9+11 唯讀指令鬆綁 - blast_radius_calculator: kubectl get/top/describe/logs/version → score=1(非 50) - operation_parser: 增加 INVESTIGATE 類型識別(唯讀 kubectl 不回 None) - executor.py: OperationType 新增 INVESTIGATE enum - approval_execution.py: INVESTIGATE 路徑直接呼叫 execute_kubectl_command 【P0.4】MCP SSH/K8s Provider 修復 - decision_manager: params= → parameters=(符合 MCPToolProvider.execute 簽名) - decision_manager: MCPToolResult .get() → .success/.output(dataclass 用法) - decision_manager + ssh_provider: 補入 hosts 120/121(原 default 缺失) - auto_approve: phase2_agent_debate source bypass confidence 閾值 【P0.5】告警規則語義矛盾修復 - alert_rules.yaml: 8 條 kubectl 查詢規則 RESTART_DEPLOYMENT → NO_ACTION (CrashLoopBackOff/PostgreSQL 連線/慢查詢/MinIO 磁碟/K3s 節點/告警鏈路/SSL/CoreDNS 等) - incident_service.py: cAdvisor/CoreDNS 從 general 拆出獨立分類 【P0.6】proactive_inspector 動態基線 PromQL 全修 - 5 個 MONITORED_METRICS PromQL 全部修正(cadvisor label/datname/blackbox) - db_connection_pool: datname="awoooi" → "awoooi_prod" - http_error_rate: 無效 http_requests_total → blackbox probe_success - cpu/memory: namespace label → name=~"k8s_api_awoooi-api.*" Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
242 lines
11 KiB
Python
242 lines
11 KiB
Python
"""
|
||
AWOOOI AIOps Phase 5 — Blast Radius Calculator(爆炸半徑計算器)
|
||
===============================================================
|
||
職責:計算修復動作的爆炸半徑分數(0-100),決定執行分級。
|
||
|
||
分級邏輯:
|
||
≤ 10 → auto 自動執行(低衝擊)
|
||
11-50 → human 需一人審核(中衝擊)
|
||
51-99 → dual 需雙人審核 + GitOps PR(高衝擊)
|
||
100 → blocked HARD_RULES 永擋(任何情況不執行)
|
||
|
||
設計原則:
|
||
- 保守計分:不確定情境一律視為高分(> 50)
|
||
- HARD_RULES 優先:任何永擋 pattern 立刻返回 100,不繼續計算
|
||
- 純函數(Stateless):不依賴 DB/Redis,確保呼叫端可同步執行
|
||
- 可審計:每次計算回傳 reason 記錄計分依據
|
||
|
||
ADR-086: Phase 5 Declarative 修復與 Blast Radius 分控
|
||
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 5 初始建立
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
from dataclasses import dataclass
|
||
|
||
import structlog
|
||
|
||
# Module-level structlog logger, named after this module.
logger = structlog.get_logger(__name__)
|
||
|
||
# ── 分級閾值 ──────────────────────────────────────────────────────────────────
|
||
# Upper bound (inclusive) of each tier; a score of exactly BLAST_BLOCKED is
# reserved for permanently blocked actions.
TIER_AUTO_MAX = 10  # score <= 10 -> "auto"
TIER_HUMAN_MAX = 50  # score 11-50 -> "human"
TIER_DUAL_MAX = 99  # score 51-99 -> "dual"
BLAST_BLOCKED = 100  # score == 100 -> permanent block
|
||
|
||
# ── 基礎分(Kubectl 動作類型)────────────────────────────────────────────────
|
||
# Base score per kubectl action type.
# Each entry is (regex pattern, base_score, reason). Patterns are searched
# case-insensitively against the raw action string by the calculator.
_BASE_SCORES: list[tuple[str, int, str]] = [
    # Read-only query commands (zero impact). Listed first so that they are
    # not swallowed by the conservative default of 50.
    # 2026-04-24 ogt + Claude Sonnet 4.6: Gate 9 fix — kubectl
    # get/top/describe/logs get score=1.
    (r"kubectl\s+get\b", 1, "get 唯讀查詢(零衝擊)"),
    (r"kubectl\s+top\b", 1, "top 唯讀查詢(零衝擊)"),
    (r"kubectl\s+describe\b", 1, "describe 唯讀查詢(零衝擊)"),
    (r"kubectl\s+logs\b", 1, "logs 唯讀查詢(零衝擊)"),
    (r"kubectl\s+version\b", 1, "version 唯讀(零衝擊)"),
    (r"kubectl\s+rollout\s+restart", 10, "rollout restart 低衝擊"),
    (r"kubectl\s+rollout\s+undo", 25, "rollout undo 中衝擊(版本回退)"),
    # `--replicas` may be written as `--replicas=N` or `--replicas N`; accept
    # both so that a space-separated scale-to-zero is not under-scored by
    # falling through to the default.
    (r"kubectl\s+scale.*--replicas[=\s]+[1-9]", 15, "scale up/down 低中衝擊"),
    (r"kubectl\s+scale.*--replicas[=\s]+0", 60, "scale to zero 高衝擊(停服)"),
    (r"kubectl\s+apply", 40, "apply 中高衝擊(配置變更)"),
    (r"kubectl\s+patch", 45, "patch 中高衝擊"),
    (r"kubectl\s+set\s+image", 35, "set image 中衝擊"),
    (r"kubectl\s+delete\s+pod", 30, "delete pod 中衝擊(Pod 重建)"),
    (r"kubectl\s+delete\s+deployment", 75, "delete deployment 高衝擊"),
    (r"kubectl\s+delete\s+service", 70, "delete service 高衝擊(流量中斷)"),
    # Entries scored BLAST_BLOCKED are permanently blocked.
    (r"kubectl\s+delete\s+namespace", BLAST_BLOCKED, "delete namespace 永擋"),
    (r"kubectl\s+delete\s+pv\b", BLAST_BLOCKED, "delete PV 永擋(資料遺失)"),
    (r"kubectl\s+delete\s+pvc\b", BLAST_BLOCKED, "delete PVC 永擋(資料遺失)"),
    (r"kubectl\s+delete\s+clusterrole", BLAST_BLOCKED, "delete ClusterRole 永擋(RBAC 毀損)"),
    (r"kubectl\s+delete\s+secret", 80, "delete secret 高衝擊"),
    (r"kubectl\s+delete\s+configmap", 55, "delete configmap 高衝擊"),
    (r"kubectl\s+exec", 65, "exec 高衝擊(互動式執行)"),
    (r"kubectl\s+cp\b", 50, "cp 中高衝擊"),
    (r"kubectl\s+drain", 80, "drain node 高衝擊"),
    (r"kubectl\s+cordon", 55, "cordon node 高衝擊"),
    (r"kubectl\s+taint", 60, "taint 高衝擊"),
]
|
||
|
||
# ── 永擋命令清單(不含 kubectl 的危險操作)───────────────────────────────────
|
||
# Permanently blocked commands (dangerous operations, not limited to kubectl).
# Each entry is (regex pattern, reason); patterns are searched
# case-insensitively against the raw action string by the calculator.
_HARD_BLOCK_PATTERNS: list[tuple[str, str]] = [
    # `-rf` and `-fr` are the same flag pair; block both spellings.
    (r"rm\s+-(?:rf|fr)", "rm -rf 永擋(資料刪除)"),
    (r"DROP\s+TABLE", "DROP TABLE 永擋(DB 資料刪除)"),
    (r"DROP\s+DATABASE", "DROP DATABASE 永擋"),
    (r"TRUNCATE\s+TABLE", "TRUNCATE TABLE 永擋"),
    (r"kubectl\s+exec.*?--.*?rm\b", "kubectl exec rm 永擋"),
    (r"kubectl\s+exec.*?--.*?kill\b", "kubectl exec kill 永擋"),
]
|
||
|
||
# ── 高風險命名空間 ──────────────────────────────────────────────────────────
|
||
# High-risk namespaces.
_CRITICAL_NAMESPACES = {
    "kube-system",
    "kube-public",
    "kube-node-lease",
    "monitoring",
    "gitea",
}
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Data Types
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
@dataclass
class BlastRadiusResult:
    """Result of a blast-radius calculation."""

    score: int  # 0-100 (100 = permanently blocked)
    tier: str  # "auto" / "human" / "dual" / "blocked"
    reason: str  # how the score was derived (for auditing)
    hard_blocked: bool  # True = permanently blocked by HARD_RULES
    blocked_reason: str | None  # specific reason when permanently blocked
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Calculator
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
class BlastRadiusCalculator:
    """
    Blast-radius calculator.

    Usage:
        calc = BlastRadiusCalculator()
        result = calc.calculate(action="kubectl rollout restart deployment/awoooi-api",
                                namespace="awoooi-prod")
        if result.tier == "auto":
            # safe to execute automatically
    """

    def calculate(
        self,
        action: str,
        namespace: str = "awoooi-prod",
        target: str = "",
    ) -> BlastRadiusResult:
        """
        Compute the blast-radius score of an action.

        Args:
            action: remediation command (kubectl command or description)
            namespace: target namespace
            target: target resource name (accepted for interface
                compatibility; not used by the scoring below)

        Returns:
            BlastRadiusResult with the score, the tier and the scoring
            rationale.
        """
        action_lower = action.lower()
        reasons: list[str] = []

        # 1. Permanently blocked patterns are checked before anything else.
        for pattern, block_reason in _HARD_BLOCK_PATTERNS:
            if re.search(pattern, action, re.IGNORECASE):
                logger.warning(
                    "blast_radius_hard_blocked",
                    action=action[:120],
                    reason=block_reason,
                )
                return BlastRadiusResult(
                    score=BLAST_BLOCKED,
                    tier="blocked",
                    reason=block_reason,
                    hard_blocked=True,
                    blocked_reason=block_reason,
                )

        # 2. Base score by kubectl action type.
        # Conservative default: an unknown action scores 50 (human tier).
        base_score = 50
        matched_base_reason = "未知 kubectl 動作,保守 50 分"

        # Score by the HIGHEST matching entry rather than the first one.
        # The table lists read-only commands first, so first-match-wins would
        # let a compound action such as
        # "kubectl get ns && kubectl delete namespace x" score 1 and bypass
        # the permanently blocked entries further down the table.
        matches = [
            (score, reason)
            for pattern, score, reason in _BASE_SCORES
            if re.search(pattern, action, re.IGNORECASE)
        ]
        if matches:
            # max() keeps the earliest entry on ties, so table order still
            # decides between equally scored patterns.
            base_score, matched_base_reason = max(matches, key=lambda m: m[0])

        if base_score == BLAST_BLOCKED:
            # Log like the hard-block path so every permanent block is audited.
            logger.warning(
                "blast_radius_hard_blocked",
                action=action[:120],
                reason=matched_base_reason,
            )
            return BlastRadiusResult(
                score=BLAST_BLOCKED,
                tier="blocked",
                reason=matched_base_reason,
                hard_blocked=True,
                blocked_reason=matched_base_reason,
            )

        reasons.append(f"基礎分 {base_score}:{matched_base_reason}")

        # 3. Namespace multiplier.
        ns_multiplier = 1.0
        if namespace in _CRITICAL_NAMESPACES:
            ns_multiplier = 2.5
            reasons.append(f"命名空間 {namespace} × 2.5(系統級)")
        elif namespace == "default":
            ns_multiplier = 1.8
            reasons.append("default 命名空間 × 1.8(全域影響)")

        # 4. Additive adjustments for special situations (replicas=0 is
        #    already covered by the base-score table).
        bonus = 0
        if "--force" in action_lower:
            bonus += 20
            reasons.append("+20:--force flag 危險")
        if "kube-system" in action_lower:
            bonus += 40
            reasons.append("+40:kube-system 目標")
        # Use \s+ (as the base-score table does) so extra whitespace between
        # "kubectl" and "delete" cannot dodge the bulk-delete bonus.
        if "all" in action_lower and re.search(r"kubectl\s+delete", action_lower):
            bonus += 30
            reasons.append("+30:delete all 批量刪除")

        # 5. Final score, capped at 99: 100 is reserved for permanent blocks.
        raw_score = base_score * ns_multiplier + bonus
        final_score = min(int(raw_score), 99)

        tier = _score_to_tier(final_score)
        reasons.append(f"最終分 {final_score} → {tier}")

        reason_str = ";".join(reasons)
        logger.debug(
            "blast_radius_calculated",
            score=final_score,
            tier=tier,
            action=action[:80],
            reason=reason_str,
        )
        return BlastRadiusResult(
            score=final_score,
            tier=tier,
            reason=reason_str,
            hard_blocked=False,
            blocked_reason=None,
        )
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Helpers
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
def _score_to_tier(score: int) -> str:
    """Map a numeric score to its tier name.

    Returns "auto" up to TIER_AUTO_MAX, "human" up to TIER_HUMAN_MAX,
    "dual" up to TIER_DUAL_MAX, and "blocked" for anything above that.
    """
    if score <= TIER_AUTO_MAX:
        return "auto"
    if score <= TIER_HUMAN_MAX:
        return "human"
    if score <= TIER_DUAL_MAX:
        return "dual"
    return "blocked"
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Singleton
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Lazily created shared instance; stays None until the getter is first called.
_calculator: BlastRadiusCalculator | None = None


def get_blast_radius_calculator() -> BlastRadiusCalculator:
    """Return the shared module-level calculator, creating it on first use."""
    global _calculator
    if _calculator is None:
        _calculator = BlastRadiusCalculator()
    return _calculator
|