""" Alert Analyzer Service - 告警分析大腦 ====================================== 從 api/v1/webhooks.py 抽取至 services 層 (ADR-024 四層架構,R4 #129) 職責: - 根據告警類型、嚴重度、相關指標,判定風險等級 - 計算爆炸半徑 (Blast Radius) - 組裝 ApprovalRequestCreate 設計原則: - 純業務邏輯層,不存取 Redis/DB - 依賴 K8s 資源名稱正規化工具 (ADR-016) - 可獨立測試 (無外部依賴) 版本: v1.0 建立: 2026-04-01 (台北時區) 建立者: Claude Code (R4 Router 瘦身 #129) """ import hashlib from src.models.approval import ( ApprovalRequestCreate, BlastRadius, DataImpact, DryRunCheck, RiskLevel, ) from src.models.webhook import AlertPayload from src.utils.k8s_naming import normalize_resource_name class AlertAnalyzer: """ 告警分析器 - AWOOOI 核心大腦 根據告警類型、嚴重度、相關指標, 自動判定風險等級、爆炸半徑、處置建議。 搬移自: api/v1/webhooks.py (ADR-024 R4 #129, 2026-04-01 ogt) """ # 告警類型 → 風險等級映射 RISK_MAPPING: dict[str, RiskLevel] = { "k8s_node_failure": RiskLevel.CRITICAL, "k8s_pod_crash": RiskLevel.MEDIUM, "db_connection_timeout": RiskLevel.CRITICAL, "service_404": RiskLevel.MEDIUM, "high_cpu": RiskLevel.MEDIUM, "high_memory": RiskLevel.MEDIUM, "disk_full": RiskLevel.CRITICAL, "ssl_expiry": RiskLevel.LOW, "custom": RiskLevel.MEDIUM, } # 告警類型 → 處置建議映射 ACTION_MAPPING: dict[str, str] = { "k8s_node_failure": "kubectl drain {resource} --ignore-daemonsets", "k8s_pod_crash": "kubectl delete pod {resource} -n {namespace}", "db_connection_timeout": "重啟資料庫連線池並檢查網路", "service_404": "kubectl rollout restart deployment/{resource} -n {namespace}", "high_cpu": "kubectl scale deployment/{resource} --replicas=+2 -n {namespace}", "high_memory": "kubectl delete pod {resource} -n {namespace} (記憶體洩漏清理)", "disk_full": "清理 /var/log 與 /tmp 目錄", "ssl_expiry": "更新 SSL 憑證", "custom": "人工分析處置", } # 告警類型 → 爆炸半徑映射 BLAST_RADIUS_MAPPING: dict[str, dict] = { "k8s_node_failure": {"pods": 10, "downtime": "~5 min", "services": ["all-on-node"]}, "k8s_pod_crash": {"pods": 1, "downtime": "~30s", "services": []}, "db_connection_timeout": {"pods": 0, "downtime": "~2 min", "services": ["api", "auth"]}, "service_404": {"pods": 3, "downtime": "~1 min", "services": []}, "high_cpu": {"pods": 0, "downtime": "0", "services": []}, "high_memory": {"pods": 1, "downtime": "~30s", "services": []}, "disk_full": {"pods": 0, "downtime": "~5 min", "services": ["logging"]}, "ssl_expiry": {"pods": 0, "downtime": "0", "services": ["https"]}, "custom": {"pods": 0, "downtime": "unknown", "services": []}, } @classmethod def analyze(cls, alert: AlertPayload) -> ApprovalRequestCreate: """ 分析告警並生成 ApprovalRequestCreate Phase 18.1.7: 整合 K8s 資源名稱正規化 (ADR-016) Returns: ApprovalRequestCreate 用於建立待簽核卡片 """ # Phase 18.1.7: 先正規化資源名稱 normalized = normalize_resource_name(alert.target_resource, alert.namespace) resolved_resource = normalized.normalized or alert.target_resource resolved_namespace = normalized.namespace or alert.namespace # 1. 判定風險等級 base_risk = cls.RISK_MAPPING.get(alert.alert_type, RiskLevel.MEDIUM) # 嚴重度提升 if alert.severity == "critical" and base_risk != RiskLevel.CRITICAL: risk_level = RiskLevel.CRITICAL else: risk_level = base_risk # 2. 取得處置建議 (使用正規化後的資源名稱) action_template = cls.ACTION_MAPPING.get(alert.alert_type, "人工分析處置") action = action_template.format( resource=resolved_resource, namespace=resolved_namespace, ) # 3. 取得爆炸半徑 blast_info = cls.BLAST_RADIUS_MAPPING.get( alert.alert_type, {"pods": 0, "downtime": "unknown", "services": []}, ) # 判定 data_impact data_impact = DataImpact.NONE if alert.alert_type in ["db_connection_timeout", "disk_full"]: data_impact = DataImpact.WRITE # 4. 建立 Dry-run 檢查項目 dry_run_checks = [ DryRunCheck( name="權限驗證", passed=True, message="cluster-admin", ), DryRunCheck( name="語法驗證", passed=True, message=None, ), DryRunCheck( name="告警來源驗證", passed=True, message=alert.source, ), ] # 如果有 metrics,加入 sigma 分析 if alert.metrics: cpu = alert.metrics.get("cpu_percent", 0) sigma = alert.metrics.get("sigma_deviation", 0) if sigma and abs(sigma) > 2: dry_run_checks.append( DryRunCheck( name="基準線偏差分析", passed=True, message=f"CPU: {cpu:.0f}% (σ: {sigma:+.1f})", ) ) # 5. 組裝 description description = f"[{alert.alert_type}] {alert.message}" if alert.metrics: metrics_str = ", ".join(f"{k}={v}" for k, v in alert.metrics.items()) description += f" | 指標: {metrics_str}" # 6. 建立 ApprovalRequestCreate return ApprovalRequestCreate( action=action, description=description, risk_level=risk_level, blast_radius=BlastRadius( affected_pods=blast_info["pods"], estimated_downtime=blast_info["downtime"], related_services=blast_info["services"] + [alert.target_resource], data_impact=data_impact, ), dry_run_checks=dry_run_checks, requested_by="OpenClaw", ) # [首席架構師] 封裝 generate_alert_fingerprint 為 staticmethod v1.2 2026-04-01 Asia/Taipei @staticmethod def generate_fingerprint(alert: AlertPayload) -> str: """ 生成告警唯一指紋 (SHA256 Hash) 指紋組成: namespace:deployment:alertname:target_resource 使用 alertname(而非 alert_type)確保不同告警名稱不共用指紋。 原本用 alert_type 導致許多不同 alertname 都落入 "custom", 造成同目標上的不同告警互相擋截(ADR-073 修復 2026-04-12 ogt)。 同一個告警名稱 + 同一目標 → 相同指紋 → 觸發 debounce 去重。 不同告警名稱(即使同目標)→ 不同指紋 → 各自建立 Incident。 """ # 從 labels 取得 deployment,如果沒有則用 target_resource deployment = "" if alert.labels: deployment = alert.labels.get("deployment", alert.labels.get("app", "")) if not deployment: deployment = alert.target_resource # alertname 優先取 labels,fallback 到 alert_type(非 Alertmanager 來源) alertname = ( alert.labels.get("alertname", "") if alert.labels else "" ) or alert.alert_type # 組合指紋來源 fingerprint_source = f"{alert.namespace}:{deployment}:{alertname}:{alert.target_resource}" # SHA256 Hash return hashlib.sha256(fingerprint_source.encode()).hexdigest()[:32]