fix(aiops): ADR-072 BUG-008 alertname_to_type 9→49 筆
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
從 9 筆靜態 map 擴充至完整涵蓋 alerts-unified.yml 全 42 個 alertname:
- host_alerts: HostDown/HostHighCpuLoad/HostOutOfMemory/HostOutOfDiskSpace/HostBackupFailed
- k8s: K3sNodeNotReady/KubePodCrashLooping/KubeDeploymentReplicasMismatch/Velero* (8筆)
- database: PostgreSQL*/Redis* (10 筆)
- service_alerts: *Down (8 筆)
- external: *Down/SSLExpiring (5 筆)
- alert_chain: AlertChainBroken*/NoAlerts/Unhealthy (4 筆)
- docker_health: DockerContainerUnhealthy/Exited (2 筆)
- auto_repair: AutoRepairLowSuccessRate/PermanentFixRequired (2 筆)
- 舊版相容: HighCPUUsage/HighMemoryUsage/DiskSpaceLow/SSLCertExpiringSoon/TargetDown

預期效果: 69/112 incidents "custom" → 大幅降低,HostHighCpuLoad → "host_cpu"

BUG-007 確認不需修: alerts-unified.yml 全 42 規則均已有 severity label

2026-04-11 Claude Sonnet 4.6 Asia/Taipei

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1099,17 +1099,67 @@ async def alertmanager_webhook(
|
||||
approval_created=False,
|
||||
)
|
||||
|
||||
# 映射 alertname → alert_type
|
||||
# BUG-008 修復 2026-04-11: 從 9 筆擴充為完整涵蓋 alerts-unified.yml 全部 42 個 alertname
|
||||
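# NOTE: this is still a static alertname -> alert_type lookup table; entries are grouped below by rule group (layer/component), and any alertname not listed falls back to "custom".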
|
||||
alertname_to_type = {
|
||||
"KubePodCrashLooping": "k8s_pod_crash",
|
||||
"KubePodNotReady": "k8s_pod_crash",
|
||||
"KubeNodeNotReady": "k8s_node_failure",
|
||||
"KubeNodeUnreachable": "k8s_node_failure",
|
||||
"HighCPUUsage": "high_cpu",
|
||||
"HighMemoryUsage": "high_memory",
|
||||
"DiskSpaceLow": "disk_full",
|
||||
"SSLCertExpiringSoon": "ssl_expiry",
|
||||
"TargetDown": "service_404",
|
||||
# --- 主機層 (host_alerts) ---
|
||||
"HostDown": "host_down",
|
||||
"HostHighCpuLoad": "host_cpu",
|
||||
"HostOutOfMemory": "host_memory",
|
||||
"HostOutOfDiskSpace": "disk_full",
|
||||
"HostBackupFailed": "backup_failure",
|
||||
# --- K8s 層 (kubernetes_alerts) ---
|
||||
"K3sNodeNotReady": "k8s_node_failure",
|
||||
"KubePodCrashLooping": "k8s_pod_crash",
|
||||
"KubePodNotReady": "k8s_pod_crash",
|
||||
"KubeNodeNotReady": "k8s_node_failure",
|
||||
"KubeNodeUnreachable": "k8s_node_failure",
|
||||
"KubeDeploymentReplicasMismatch": "k8s_deployment_mismatch",
|
||||
"VeleroBackupFailed": "backup_failure",
|
||||
"VeleroBackupNotRun": "backup_failure",
|
||||
# --- 資料庫 (database_alerts / database_detail_alerts) ---
|
||||
"PostgreSQLDown": "database_down",
|
||||
"RedisDown": "database_down",
|
||||
"PostgreSQLHighConnections": "database_performance",
|
||||
"RedisMemoryHigh": "high_memory",
|
||||
"PostgreSQLSlowQueries": "database_performance",
|
||||
"PostgreSQLDeadlocks": "database_performance",
|
||||
"PostgreSQLTooManyConnections": "database_performance",
|
||||
"RedisKeyEviction": "database_performance",
|
||||
"RedisConnectionsHigh": "database_performance",
|
||||
"RedisCommandLatencyHigh": "database_performance",
|
||||
# --- 服務可用性 (service_alerts) ---
|
||||
"OpenClawDown": "service_down",
|
||||
"SignOzDown": "service_down",
|
||||
"SentryDown": "service_down",
|
||||
"HarborDown": "service_down",
|
||||
"GiteaDown": "service_down",
|
||||
"AlertmanagerDown": "service_down",
|
||||
"MinIODown": "service_down",
|
||||
"KaliScannerDown": "service_down",
|
||||
# --- 外部網站 (external_website_alerts) ---
|
||||
"MoWoooWorkDown": "service_404",
|
||||
"TsenyangWebsiteDown": "service_404",
|
||||
"StockWoooWorkDown": "service_404",
|
||||
"BitanWoooWorkDown": "service_404",
|
||||
"ExternalSiteSSLExpiringSoon": "ssl_expiry",
|
||||
# --- 告警鏈路 (alert_chain) ---
|
||||
"AlertChainBroken_Alertmanager": "alert_chain_broken",
|
||||
"AlertChainBroken_Sentry": "alert_chain_broken",
|
||||
"NoAlertsReceived2Hours": "alert_chain_broken",
|
||||
"AlertChainUnhealthy": "alert_chain_broken",
|
||||
# --- Docker 容器 (docker_health_alerts) ---
|
||||
"DockerContainerUnhealthy": "docker_container_unhealthy",
|
||||
"DockerContainerExited": "docker_container_unhealthy",
|
||||
# --- 自動修復監控 (auto_repair) ---
|
||||
"AutoRepairLowSuccessRate": "auto_repair_degraded",
|
||||
"PermanentFixRequired": "auto_repair_degraded",
|
||||
# --- 舊版相容 ---
|
||||
"HighCPUUsage": "high_cpu",
|
||||
"HighMemoryUsage": "high_memory",
|
||||
"DiskSpaceLow": "disk_full",
|
||||
"SSLCertExpiringSoon": "ssl_expiry",
|
||||
"TargetDown": "service_404",
|
||||
}
|
||||
alert_type = alertname_to_type.get(alertname, "custom")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user