fix(aiops): ADR-072 BUG-008 alertname_to_type 9→49 筆
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
從 9 筆靜態 map 擴充至完整涵蓋 alerts-unified.yml 全 42 個 alertname:

- host_alerts: HostDown/HostHighCpuLoad/HostOutOfMemory/HostOutOfDiskSpace/HostBackupFailed (5 筆)
- k8s: K3sNodeNotReady/KubePodCrashLooping/KubeDeploymentReplicasMismatch/Velero* (8 筆)
- database: PostgreSQL*/Redis* (10 筆)
- service_alerts: *Down (8 筆)
- external: *Down/SSLExpiring (5 筆)
- alert_chain: AlertChainBroken*/NoAlerts/Unhealthy (4 筆)
- docker_health: DockerContainerUnhealthy/Exited (2 筆)
- auto_repair: AutoRepairLowSuccessRate/PermanentFixRequired (2 筆)
- 舊版相容: HighCPUUsage/HighMemoryUsage/DiskSpaceLow/SSLCertExpiringSoon/TargetDown (5 筆)

預期效果: 69/112 incidents "custom" → 大幅降低,HostHighCpuLoad → "host_cpu"

BUG-007 確認不需修: alerts-unified.yml 全 42 規則均已有 severity label

2026-04-11 Claude Sonnet 4.6 Asia/Taipei

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1099,17 +1099,67 @@ async def alertmanager_webhook(
|
|||||||
approval_created=False,
|
approval_created=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
# 映射 alertname → alert_type
|
# BUG-008 修復 2026-04-11: 從 9 筆擴充為完整涵蓋 alerts-unified.yml 全部 42 個 alertname
|
||||||
|
# 新對照表依 layer/component 分組整理,取代原本 9 筆的小表;仍為靜態 dict 查表,未列出的 alertname 會回退為 "custom"
|
||||||
alertname_to_type = {
|
alertname_to_type = {
|
||||||
"KubePodCrashLooping": "k8s_pod_crash",
|
# --- 主機層 (host_alerts) ---
|
||||||
"KubePodNotReady": "k8s_pod_crash",
|
"HostDown": "host_down",
|
||||||
"KubeNodeNotReady": "k8s_node_failure",
|
"HostHighCpuLoad": "host_cpu",
|
||||||
"KubeNodeUnreachable": "k8s_node_failure",
|
"HostOutOfMemory": "host_memory",
|
||||||
"HighCPUUsage": "high_cpu",
|
"HostOutOfDiskSpace": "disk_full",
|
||||||
"HighMemoryUsage": "high_memory",
|
"HostBackupFailed": "backup_failure",
|
||||||
"DiskSpaceLow": "disk_full",
|
# --- K8s 層 (kubernetes_alerts) ---
|
||||||
"SSLCertExpiringSoon": "ssl_expiry",
|
"K3sNodeNotReady": "k8s_node_failure",
|
||||||
"TargetDown": "service_404",
|
"KubePodCrashLooping": "k8s_pod_crash",
|
||||||
|
"KubePodNotReady": "k8s_pod_crash",
|
||||||
|
"KubeNodeNotReady": "k8s_node_failure",
|
||||||
|
"KubeNodeUnreachable": "k8s_node_failure",
|
||||||
|
"KubeDeploymentReplicasMismatch": "k8s_deployment_mismatch",
|
||||||
|
"VeleroBackupFailed": "backup_failure",
|
||||||
|
"VeleroBackupNotRun": "backup_failure",
|
||||||
|
# --- 資料庫 (database_alerts / database_detail_alerts) ---
|
||||||
|
"PostgreSQLDown": "database_down",
|
||||||
|
"RedisDown": "database_down",
|
||||||
|
"PostgreSQLHighConnections": "database_performance",
|
||||||
|
"RedisMemoryHigh": "high_memory",
|
||||||
|
"PostgreSQLSlowQueries": "database_performance",
|
||||||
|
"PostgreSQLDeadlocks": "database_performance",
|
||||||
|
"PostgreSQLTooManyConnections": "database_performance",
|
||||||
|
"RedisKeyEviction": "database_performance",
|
||||||
|
"RedisConnectionsHigh": "database_performance",
|
||||||
|
"RedisCommandLatencyHigh": "database_performance",
|
||||||
|
# --- 服務可用性 (service_alerts) ---
|
||||||
|
"OpenClawDown": "service_down",
|
||||||
|
"SignOzDown": "service_down",
|
||||||
|
"SentryDown": "service_down",
|
||||||
|
"HarborDown": "service_down",
|
||||||
|
"GiteaDown": "service_down",
|
||||||
|
"AlertmanagerDown": "service_down",
|
||||||
|
"MinIODown": "service_down",
|
||||||
|
"KaliScannerDown": "service_down",
|
||||||
|
# --- 外部網站 (external_website_alerts) ---
|
||||||
|
"MoWoooWorkDown": "service_404",
|
||||||
|
"TsenyangWebsiteDown": "service_404",
|
||||||
|
"StockWoooWorkDown": "service_404",
|
||||||
|
"BitanWoooWorkDown": "service_404",
|
||||||
|
"ExternalSiteSSLExpiringSoon": "ssl_expiry",
|
||||||
|
# --- 告警鏈路 (alert_chain) ---
|
||||||
|
"AlertChainBroken_Alertmanager": "alert_chain_broken",
|
||||||
|
"AlertChainBroken_Sentry": "alert_chain_broken",
|
||||||
|
"NoAlertsReceived2Hours": "alert_chain_broken",
|
||||||
|
"AlertChainUnhealthy": "alert_chain_broken",
|
||||||
|
# --- Docker 容器 (docker_health_alerts) ---
|
||||||
|
"DockerContainerUnhealthy": "docker_container_unhealthy",
|
||||||
|
"DockerContainerExited": "docker_container_unhealthy",
|
||||||
|
# --- 自動修復監控 (auto_repair) ---
|
||||||
|
"AutoRepairLowSuccessRate": "auto_repair_degraded",
|
||||||
|
"PermanentFixRequired": "auto_repair_degraded",
|
||||||
|
# --- 舊版相容 ---
|
||||||
|
"HighCPUUsage": "high_cpu",
|
||||||
|
"HighMemoryUsage": "high_memory",
|
||||||
|
"DiskSpaceLow": "disk_full",
|
||||||
|
"SSLCertExpiringSoon": "ssl_expiry",
|
||||||
|
"TargetDown": "service_404",
|
||||||
}
|
}
|
||||||
alert_type = alertname_to_type.get(alertname, "custom")
|
alert_type = alertname_to_type.get(alertname, "custom")
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user