Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
問題根因: 1. generate_fingerprint 用 alert_type(大量 alertname 落入 "custom") → 不同告警名稱同目標共用指紋 → 30 分鐘 debounce 互相擋截 2. classify_alert_early 漏掉 DeadMansSwitch / NoAlertsReceived / PrometheusNotConnectedToAlertmanager → 落入 TYPE-3 一般告警 修復: - alert_analyzer_service.py: 指紋改為 namespace:deployment:alertname:target_resource alertname 取自 labels(Alertmanager),fallback 到 alert_type(其他來源) - incident_service.py: DeadMansSwitch → backup/TYPE-1; NoAlertsReceived + PrometheusNotConnectedToAlertmanager → alertchain_health/TYPE-8M - 補 2 個測試,全套 627 passed Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
211 lines
7.8 KiB
Python
211 lines
7.8 KiB
Python
"""
|
||
Alert Analyzer Service - 告警分析大腦
|
||
======================================
|
||
|
||
從 api/v1/webhooks.py 抽取至 services 層 (ADR-024 四層架構,R4 #129)
|
||
|
||
職責:
|
||
- 根據告警類型、嚴重度、相關指標,判定風險等級
|
||
- 計算爆炸半徑 (Blast Radius)
|
||
- 組裝 ApprovalRequestCreate
|
||
|
||
設計原則:
|
||
- 純業務邏輯層,不存取 Redis/DB
|
||
- 依賴 K8s 資源名稱正規化工具 (ADR-016)
|
||
- 可獨立測試 (無外部依賴)
|
||
|
||
版本: v1.0
|
||
建立: 2026-04-01 (台北時區)
|
||
建立者: Claude Code (R4 Router 瘦身 #129)
|
||
"""
|
||
|
||
import hashlib
|
||
|
||
from src.models.approval import (
|
||
ApprovalRequestCreate,
|
||
BlastRadius,
|
||
DataImpact,
|
||
DryRunCheck,
|
||
RiskLevel,
|
||
)
|
||
from src.models.webhook import AlertPayload
|
||
from src.utils.k8s_naming import normalize_resource_name
|
||
|
||
|
||
class AlertAnalyzer:
    """
    Alert analyzer - the AWOOOI core brain.

    Derives a risk level, a blast radius and a suggested remediation from an
    alert's type, severity and (optional) metrics, and computes the
    fingerprint used to de-duplicate alerts.

    Moved from: api/v1/webhooks.py (ADR-024 R4 #129, 2026-04-01 ogt)
    """

    # Alert type -> baseline risk level.
    RISK_MAPPING: dict[str, RiskLevel] = {
        "k8s_node_failure": RiskLevel.CRITICAL,
        "k8s_pod_crash": RiskLevel.MEDIUM,
        "db_connection_timeout": RiskLevel.CRITICAL,
        "service_404": RiskLevel.MEDIUM,
        "high_cpu": RiskLevel.MEDIUM,
        "high_memory": RiskLevel.MEDIUM,
        "disk_full": RiskLevel.CRITICAL,
        "ssl_expiry": RiskLevel.LOW,
        "custom": RiskLevel.MEDIUM,
    }

    # Alert type -> suggested remediation. Templates may reference the
    # {resource} and {namespace} placeholders filled in by analyze().
    ACTION_MAPPING: dict[str, str] = {
        "k8s_node_failure": "kubectl drain {resource} --ignore-daemonsets",
        "k8s_pod_crash": "kubectl delete pod {resource} -n {namespace}",
        "db_connection_timeout": "重啟資料庫連線池並檢查網路",
        "service_404": "kubectl rollout restart deployment/{resource} -n {namespace}",
        "high_cpu": "kubectl scale deployment/{resource} --replicas=+2 -n {namespace}",
        "high_memory": "kubectl delete pod {resource} -n {namespace} (記憶體洩漏清理)",
        "disk_full": "清理 /var/log 與 /tmp 目錄",
        "ssl_expiry": "更新 SSL 憑證",
        "custom": "人工分析處置",
    }

    # Alert type -> blast-radius estimate (pod count, downtime, services).
    BLAST_RADIUS_MAPPING: dict[str, dict] = {
        "k8s_node_failure": {"pods": 10, "downtime": "~5 min", "services": ["all-on-node"]},
        "k8s_pod_crash": {"pods": 1, "downtime": "~30s", "services": []},
        "db_connection_timeout": {"pods": 0, "downtime": "~2 min", "services": ["api", "auth"]},
        "service_404": {"pods": 3, "downtime": "~1 min", "services": []},
        "high_cpu": {"pods": 0, "downtime": "0", "services": []},
        "high_memory": {"pods": 1, "downtime": "~30s", "services": []},
        "disk_full": {"pods": 0, "downtime": "~5 min", "services": ["logging"]},
        "ssl_expiry": {"pods": 0, "downtime": "0", "services": ["https"]},
        "custom": {"pods": 0, "downtime": "unknown", "services": []},
    }

    @classmethod
    def analyze(cls, alert: AlertPayload) -> ApprovalRequestCreate:
        """
        Analyze an alert and build the matching ApprovalRequestCreate.

        Phase 18.1.7: integrates K8s resource-name normalization (ADR-016).

        Args:
            alert: The incoming alert. This method reads its alert_type,
                severity, target_resource, namespace, message, source and
                metrics attributes.

        Returns:
            An ApprovalRequestCreate used to create a pending-approval card.
        """
        # Phase 18.1.7: normalize the resource name first, keeping the raw
        # values whenever normalization yields nothing usable.
        normalized = normalize_resource_name(alert.target_resource, alert.namespace)
        resolved_resource = normalized.normalized or alert.target_resource
        resolved_namespace = normalized.namespace or alert.namespace

        # 1. Risk level: a "critical" severity always escalates to CRITICAL;
        #    otherwise the per-type baseline applies (MEDIUM for unknown types).
        base_risk = cls.RISK_MAPPING.get(alert.alert_type, RiskLevel.MEDIUM)
        risk_level = RiskLevel.CRITICAL if alert.severity == "critical" else base_risk

        # 2. Suggested action, rendered with the normalized resource names.
        action_template = cls.ACTION_MAPPING.get(alert.alert_type, "人工分析處置")
        action = action_template.format(
            resource=resolved_resource,
            namespace=resolved_namespace,
        )

        # 3. Blast radius, with a neutral estimate for unknown alert types.
        blast_info = cls.BLAST_RADIUS_MAPPING.get(
            alert.alert_type,
            {"pods": 0, "downtime": "unknown", "services": []},
        )

        # Only these two alert types are flagged as having write impact.
        if alert.alert_type in ("db_connection_timeout", "disk_full"):
            data_impact = DataImpact.WRITE
        else:
            data_impact = DataImpact.NONE

        # 4. Dry-run check items that are always reported.
        dry_run_checks = [
            DryRunCheck(
                name="權限驗證",
                passed=True,
                message="cluster-admin",
            ),
            DryRunCheck(
                name="語法驗證",
                passed=True,
                message=None,
            ),
            DryRunCheck(
                name="告警來源驗證",
                passed=True,
                message=alert.source,
            ),
        ]

        # When metrics are present, add a baseline-deviation check if the
        # reported sigma deviation exceeds 2 in either direction.
        metrics = alert.metrics
        if metrics:
            # "or 0" also covers a key that is present with a None value,
            # which would otherwise break the ":.0f" format below.
            cpu = metrics.get("cpu_percent") or 0
            sigma = metrics.get("sigma_deviation") or 0
            if sigma and abs(sigma) > 2:
                dry_run_checks.append(
                    DryRunCheck(
                        name="基準線偏差分析",
                        passed=True,
                        message=f"CPU: {cpu:.0f}% (σ: {sigma:+.1f})",
                    )
                )

        # 5. Description: alert type and message, plus all metrics if any.
        description = f"[{alert.alert_type}] {alert.message}"
        if metrics:
            metrics_str = ", ".join(f"{k}={v}" for k, v in metrics.items())
            description += f" | 指標: {metrics_str}"

        # 6. Assemble the approval request. A new list is built for
        #    related_services so the shared class-level mapping is not mutated.
        return ApprovalRequestCreate(
            action=action,
            description=description,
            risk_level=risk_level,
            blast_radius=BlastRadius(
                affected_pods=blast_info["pods"],
                estimated_downtime=blast_info["downtime"],
                related_services=blast_info["services"] + [alert.target_resource],
                data_impact=data_impact,
            ),
            dry_run_checks=dry_run_checks,
            requested_by="OpenClaw",
        )

    # [Chief architect] generate_alert_fingerprint wrapped as a staticmethod,
    # v1.2 2026-04-01 Asia/Taipei
    @staticmethod
    def generate_fingerprint(alert: AlertPayload) -> str:
        """
        Generate the unique fingerprint of an alert (SHA-256 based).

        Fingerprint source: namespace:deployment:alertname:target_resource

        The alertname (rather than alert_type) is used so that different
        alert names never share a fingerprint. Using alert_type made many
        distinct alertnames collapse into "custom", so different alerts on
        the same target blocked each other (ADR-073 fix, 2026-04-12 ogt).

        Same alert name + same target -> same fingerprint -> debounced.
        Different alert names (even on the same target) -> different
        fingerprints -> each creates its own incident.

        Args:
            alert: The alert to fingerprint. This method reads its labels,
                namespace, target_resource and alert_type attributes.

        Returns:
            The first 32 hexadecimal characters of the SHA-256 digest of
            the fingerprint source string.
        """
        labels = alert.labels or {}

        # Deployment comes from the "deployment" label, then "app", then the
        # target resource. Chaining with "or" means a label that is present
        # but empty falls through to the next candidate instead of hiding it.
        deployment = labels.get("deployment") or labels.get("app") or alert.target_resource

        # Prefer the alertname label; fall back to alert_type for sources
        # other than Alertmanager.
        alertname = labels.get("alertname") or alert.alert_type

        fingerprint_source = f"{alert.namespace}:{deployment}:{alertname}:{alert.target_resource}"

        # Truncated SHA-256 hex digest.
        return hashlib.sha256(fingerprint_source.encode()).hexdigest()[:32]
|