Files
awoooi/apps/api/src/services/blast_radius_calculator.py
Your Name 7f4088bcd0 fix(aiops-p0): 六大病根 P0 全面修復(ADR-092 B4)
【P0.1】knowledge_extractor_service.py:210 — AttributeError 修復
- Signal.description 欄位不存在(100% 失敗,KM 每天+5 根因)
- 改用 alert_name + annotations.summary 拼接文字

【P0.2+P0.3】Gate 9+11 唯讀指令鬆綁
- blast_radius_calculator: kubectl get/top/describe/logs/version → score=1(非 50)
- operation_parser: 增加 INVESTIGATE 類型識別(唯讀 kubectl 不回 None)
- executor.py: OperationType 新增 INVESTIGATE enum
- approval_execution.py: INVESTIGATE 路徑直接呼叫 execute_kubectl_command

【P0.4】MCP SSH/K8s Provider 修復
- decision_manager: params= → parameters=(符合 MCPToolProvider.execute 簽名)
- decision_manager: MCPToolResult .get() → .success/.output(dataclass 用法)
- decision_manager + ssh_provider: 補入 hosts 120/121(原 default 缺失)
- auto_approve: phase2_agent_debate source bypass confidence 閾值

【P0.5】告警規則語義矛盾修復
- alert_rules.yaml: 8 條 kubectl 查詢規則 RESTART_DEPLOYMENT → NO_ACTION
  (CrashLoopBackOff/PostgreSQL 連線/慢查詢/MinIO 磁碟/K3s 節點/告警鏈路/SSL/CoreDNS 等)
- incident_service.py: cAdvisor/CoreDNS 從 general 拆出獨立分類

【P0.6】proactive_inspector 動態基線 PromQL 全修
- 5 個 MONITORED_METRICS PromQL 全部修正(cadvisor label/datname/blackbox)
- db_connection_pool: datname="awoooi" → "awoooi_prod"
- http_error_rate: 無效 http_requests_total → blackbox probe_success
- cpu/memory: namespace label → name=~"k8s_api_awoooi-api.*"

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-24 15:32:23 +08:00

242 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
AWOOOI AIOps Phase 5 — Blast Radius Calculator(爆炸半徑計算器)
===============================================================
職責:計算修復動作的爆炸半徑分數(0-100),決定執行分級。
分級邏輯:
≤ 10 → auto 自動執行(低衝擊)
11-50 → human 需一人審核(中衝擊)
51-99 → dual 需雙人審核 + GitOps PR(高衝擊)
100 → blocked HARD_RULES 永擋(任何情況不執行)
設計原則:
- 保守計分:不確定情境一律視為高分(未知動作預設基礎分 50,至少需人工審核)
- HARD_RULES 優先:任何永擋 pattern 立刻返回 100,不繼續計算
- 純函數(Stateless):不依賴 DB/Redis,確保呼叫端可同步執行
- 可審計:每次計算回傳 reason 記錄計分依據
ADR-086: Phase 5 Declarative 修復與 Blast Radius 分控
2026-04-15 ogt + Claude Sonnet 4.6(亞太): Phase 5 初始建立
"""
from __future__ import annotations
import re
from dataclasses import dataclass
import structlog
logger = structlog.get_logger(__name__)
# ── Tier thresholds ──────────────────────────────────────────────────────────
TIER_AUTO_MAX = 10  # score <= 10  -> "auto"
TIER_HUMAN_MAX = 50  # score 11-50 -> "human"
TIER_DUAL_MAX = 99  # score 51-99  -> "dual"
BLAST_BLOCKED = 100  # score 100   -> permanently blocked

# ── Base scores per kubectl action ───────────────────────────────────────────
# Each entry is (regex pattern, base score, audit reason). The calculator
# applies the patterns with re.search(..., re.IGNORECASE).
#
# Entries are kept in DESCENDING score order so that any consumer scanning for
# the first match gets the most severe applicable score. Listing the read-only
# verbs first would let a compound command such as
# "kubectl get pods; kubectl delete namespace prod" be scored as a harmless
# query.
#
# Resource short names (ns, pv, pvc, deploy, svc, cm, po) are accepted next to
# the long names; otherwise "kubectl delete ns prod" would fall through to the
# default score instead of being permanently blocked.
_BASE_SCORES: list[tuple[str, int, str]] = [
    # Permanently blocked (score 100)
    (r"kubectl\s+delete\s+(?:namespace|ns\b)", BLAST_BLOCKED, "delete namespace 永擋"),
    (r"kubectl\s+delete\s+(?:pv|persistentvolumes?)\b", BLAST_BLOCKED, "delete PV 永擋(資料遺失)"),
    (r"kubectl\s+delete\s+(?:pvc|persistentvolumeclaims?)\b", BLAST_BLOCKED, "delete PVC 永擋(資料遺失)"),
    (r"kubectl\s+delete\s+clusterrole", BLAST_BLOCKED, "delete ClusterRole 永擋(RBAC 毀損)"),
    # High impact (51-99)
    (r"kubectl\s+delete\s+secret", 80, "delete secret 高衝擊"),
    (r"kubectl\s+drain", 80, "drain node 高衝擊"),
    (r"kubectl\s+delete\s+(?:deployment|deploy\b)", 75, "delete deployment 高衝擊"),
    (r"kubectl\s+delete\s+(?:service|svc\b)", 70, "delete service 高衝擊(流量中斷)"),
    (r"kubectl\s+exec", 65, "exec 高衝擊(互動式執行)"),
    (r"kubectl\s+scale.*--replicas=0", 60, "scale to zero 高衝擊(停服)"),
    (r"kubectl\s+taint", 60, "taint 高衝擊"),
    (r"kubectl\s+delete\s+(?:configmap|cm\b)", 55, "delete configmap 高衝擊"),
    (r"kubectl\s+cordon", 55, "cordon node 高衝擊"),
    # Medium impact (11-50)
    (r"kubectl\s+cp\b", 50, "cp 中高衝擊"),
    (r"kubectl\s+patch", 45, "patch 中高衝擊"),
    (r"kubectl\s+apply", 40, "apply 中高衝擊(配置變更)"),
    (r"kubectl\s+set\s+image", 35, "set image 中衝擊"),
    (r"kubectl\s+delete\s+(?:pod|po\b)", 30, "delete pod 中衝擊(Pod 重建)"),
    (r"kubectl\s+rollout\s+undo", 25, "rollout undo 中衝擊(版本回退)"),
    (r"kubectl\s+scale.*--replicas=[1-9]", 15, "scale up/down 低中衝擊"),
    # Low impact (<= 10)
    (r"kubectl\s+rollout\s+restart", 10, "rollout restart 低衝擊"),
    # Read-only queries (score 1), added 2026-04-24 for the Gate 9 fix so they
    # no longer fall through to the default score of 50.
    (r"kubectl\s+get\b", 1, "get 唯讀查詢(零衝擊)"),
    (r"kubectl\s+top\b", 1, "top 唯讀查詢(零衝擊)"),
    (r"kubectl\s+describe\b", 1, "describe 唯讀查詢(零衝擊)"),
    (r"kubectl\s+logs\b", 1, "logs 唯讀查詢(零衝擊)"),
    (r"kubectl\s+version\b", 1, "version 唯讀(零衝擊)"),
]

# ── Permanently blocked commands (dangerous operations beyond plain kubectl) ─
# Each entry is (regex pattern, audit reason).
_HARD_BLOCK_PATTERNS: list[tuple[str, str]] = [
    # `rm` carrying BOTH a recursive and a force flag, in any order or
    # grouping: -rf, -fr, -Rf, "-r -f", "--recursive --force". The leading \b
    # keeps words that merely end in "rm" (e.g. "confirm") from matching.
    (
        r"\brm\b"
        r"(?=(?:\s+-\S+)*\s+-(?:[a-z]*r|-recursive\b))"
        r"(?=(?:\s+-\S+)*\s+-(?:[a-z]*f|-force\b))",
        "rm -rf 永擋(資料刪除)",
    ),
    (r"DROP\s+TABLE", "DROP TABLE 永擋(DB 資料刪除)"),
    (r"DROP\s+DATABASE", "DROP DATABASE 永擋"),
    (r"TRUNCATE\s+TABLE", "TRUNCATE TABLE 永擋"),
    # \brm\b: match the rm command itself, not words such as "alarm".
    (r"kubectl\s+exec.*?--.*?\brm\b", "kubectl exec rm 永擋"),
    # No leading \b on purpose so pkill still matches; killall is covered too.
    (r"kubectl\s+exec.*?--.*?kill(?:all)?\b", "kubectl exec kill 永擋"),
]

# ── High-risk namespaces ─────────────────────────────────────────────────────
_CRITICAL_NAMESPACES = {"kube-system", "kube-public", "kube-node-lease", "monitoring", "gitea"}
# ─────────────────────────────────────────────────────────────────────────────
# Data Types
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class BlastRadiusResult:
    """Outcome of a single blast-radius evaluation."""

    # Final score in the range 0-100; 100 means permanently blocked.
    score: int
    # Execution tier: "auto", "human", "dual" or "blocked".
    tier: str
    # Scoring rationale, kept so the decision can be audited.
    reason: str
    # True when the action was rejected by a permanent-block rule.
    hard_blocked: bool
    # Specific reason for the permanent block; None when not blocked.
    blocked_reason: str | None
# ─────────────────────────────────────────────────────────────────────────────
# Calculator
# ─────────────────────────────────────────────────────────────────────────────
class BlastRadiusCalculator:
    """Score a remediation action's blast radius and map it to an execution tier.

    Usage:
        calc = BlastRadiusCalculator()
        result = calc.calculate(action="kubectl rollout restart deployment/awoooi-api",
                                namespace="awoooi-prod")
        if result.tier == "auto":
            ...  # may be executed automatically
    """

    def calculate(
        self,
        action: str,
        namespace: str = "awoooi-prod",
        target: str = "",
    ) -> BlastRadiusResult:
        """Compute the blast-radius score of ``action``.

        Scoring steps:
          1. Any ``_HARD_BLOCK_PATTERNS`` match returns a blocked result at once.
          2. The base score is the HIGHEST score among all matching
             ``_BASE_SCORES`` entries (50 when nothing matches). Taking the
             maximum rather than the first match keeps a compound command such
             as "kubectl get pods; kubectl delete namespace x" from being
             scored by its harmless part.
          3. The base score is multiplied by 2.5 for namespaces in
             ``_CRITICAL_NAMESPACES`` and by 1.8 for ``default``.
          4. Fixed bonuses are added for ``--force``, for ``kube-system``
             appearing in the command, and for bulk ``kubectl delete ... all``.
          5. The result is truncated to an int and capped at 99, so that 100
             is only ever produced by a permanent block.

        Args:
            action: Remediation command (kubectl command line or description).
            namespace: Target namespace; selects the multiplier in step 3.
            target: Target resource name. Accepted for interface compatibility;
                the scoring logic does not read it.

        Returns:
            BlastRadiusResult with the score, the tier and the audit reason.
        """
        action_lower = action.lower()

        # 1. Permanent-block rules take precedence over any scoring.
        for pattern, block_reason in _HARD_BLOCK_PATTERNS:
            if re.search(pattern, action, re.IGNORECASE):
                return self._blocked(action, block_reason)

        # 2. Base score: the most severe matching rule wins.
        matches = [
            (score, reason)
            for pattern, score, reason in _BASE_SCORES
            if re.search(pattern, action, re.IGNORECASE)
        ]
        if matches:
            base_score, matched_base_reason = max(matches, key=lambda m: m[0])
        else:
            # Conservative default: an unknown action needs human review.
            base_score = 50
            matched_base_reason = "未知 kubectl 動作,保守 50 分"
        if base_score >= BLAST_BLOCKED:
            return self._blocked(action, matched_base_reason)

        reasons: list[str] = [f"基礎分 {base_score}: {matched_base_reason}"]
        if len(matches) > 1:
            # Record that several rules matched, so the audit trail shows why
            # the score is higher than the first verb alone would suggest.
            reasons.append(f"命中 {len(matches)} 條規則,取最高分")

        # 3. Namespace multiplier.
        ns_multiplier = 1.0
        if namespace in _CRITICAL_NAMESPACES:
            ns_multiplier = 2.5
            reasons.append(f"命名空間 {namespace} × 2.5(系統級)")
        elif namespace == "default":
            ns_multiplier = 1.8
            reasons.append("default 命名空間 × 1.8(全域影響)")

        # 4. Additive bonuses for risky flags and targets.
        bonus = 0
        if "--force" in action_lower:
            bonus += 20
            reasons.append("+20: --force flag 危險")
        if "kube-system" in action_lower:
            bonus += 40
            reasons.append("+40: kube-system 目標")
        # \ball\b matches "all", "--all" and "--all-namespaces" but not names
        # that merely contain the letters (install, wallet); \s+ tolerates
        # repeated whitespace between "kubectl" and "delete".
        if re.search(r"kubectl\s+delete\b", action_lower) and re.search(r"\ball\b", action_lower):
            bonus += 30
            reasons.append("+30: delete all 批量刪除")

        # 5. Final score; 100 stays reserved for permanent blocks.
        raw_score = base_score * ns_multiplier + bonus
        final_score = min(int(raw_score), BLAST_BLOCKED - 1)
        tier = _score_to_tier(final_score)
        reasons.append(f"最終分 {final_score} → {tier}")
        reason_str = "; ".join(reasons)

        logger.debug(
            "blast_radius_calculated",
            score=final_score,
            tier=tier,
            action=action[:80],
            reason=reason_str,
        )
        return BlastRadiusResult(
            score=final_score,
            tier=tier,
            reason=reason_str,
            hard_blocked=False,
            blocked_reason=None,
        )

    def _blocked(self, action: str, block_reason: str) -> BlastRadiusResult:
        """Log a permanent block and build the corresponding result."""
        logger.warning(
            "blast_radius_hard_blocked",
            action=action[:120],
            reason=block_reason,
        )
        return BlastRadiusResult(
            score=BLAST_BLOCKED,
            tier="blocked",
            reason=block_reason,
            hard_blocked=True,
            blocked_reason=block_reason,
        )
# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────
def _score_to_tier(score: int) -> str:
    """Return the execution tier name for ``score``.

    The tiers are checked in ascending order of their upper bound; a score
    above every bound is reported as "blocked".
    """
    tier_ceilings = (
        (TIER_AUTO_MAX, "auto"),
        (TIER_HUMAN_MAX, "human"),
        (TIER_DUAL_MAX, "dual"),
    )
    for ceiling, tier_name in tier_ceilings:
        if score <= ceiling:
            return tier_name
    return "blocked"
# ─────────────────────────────────────────────────────────────────────────────
# Singleton
# ─────────────────────────────────────────────────────────────────────────────
_calculator: BlastRadiusCalculator | None = None


def get_blast_radius_calculator() -> BlastRadiusCalculator:
    """Return the module-level calculator, creating it on first use."""
    global _calculator
    instance = _calculator
    if instance is None:
        # First call: build the shared instance and remember it.
        instance = BlastRadiusCalculator()
        _calculator = instance
    return instance