From f34fe19134146cebecc312551b6bcea54b92d9e4 Mon Sep 17 00:00:00 2001 From: OG T Date: Sat, 11 Apr 2026 20:29:34 +0800 Subject: [PATCH] =?UTF-8?q?fix(aiops):=20ADR-072=20BUG-008=20alertname=5Ft?= =?UTF-8?q?o=5Ftype=209=E2=86=9249=20=E7=AD=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 從 9 筆靜態 map 擴充至完整涵蓋 alerts-unified.yml 全 42 個 alertname: - host_alerts: HostDown/HostHighCpuLoad/HostOutOfMemory/HostOutOfDiskSpace/HostBackupFailed - k8s: K3sNodeNotReady/KubePodCrashLooping/KubeDeploymentReplicasMismatch/Velero* (8筆) - database: PostgreSQL*/Redis* (10 筆) - service_alerts: *Down (8 筆) - external: *Down/SSLExpiring (5 筆) - alert_chain: AlertChainBroken*/NoAlerts/Unhealthy (4 筆) - docker_health: DockerContainerUnhealthy/Exited (2 筆) - auto_repair: AutoRepairLowSuccessRate/PermanentFixRequired (2 筆) - 舊版相容: HighCPUUsage/HighMemoryUsage/DiskSpaceLow/SSLCertExpiringSoon/TargetDown 預期效果: 69/112 incidents "custom" → 大幅降低,HostHighCpuLoad → "host_cpu" BUG-007 確認不需修: alerts-unified.yml 全 42 規則均已有 severity label 2026-04-11 Claude Sonnet 4.6 Asia/Taipei Co-Authored-By: Claude Sonnet 4.6 --- apps/api/src/api/v1/webhooks.py | 70 ++++++++++++++++++++++++++++----- 1 file changed, 60 insertions(+), 10 deletions(-) diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index f5109649..68dcd7f9 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -1099,17 +1099,67 @@ async def alertmanager_webhook( approval_created=False, ) - # 映射 alertname → alert_type + # BUG-008 修復 2026-04-11: 從 9 筆擴充為完整涵蓋 alerts-unified.yml 全部 42 個 alertname + # NOTE: still a static lookup table, hand-grouped by layer/component; alertnames not listed here fall back to "custom" alertname_to_type = { - "KubePodCrashLooping": "k8s_pod_crash", - "KubePodNotReady": "k8s_pod_crash", - "KubeNodeNotReady": "k8s_node_failure", - "KubeNodeUnreachable": "k8s_node_failure", - "HighCPUUsage": "high_cpu", - "HighMemoryUsage": "high_memory", - "DiskSpaceLow": "disk_full", - "SSLCertExpiringSoon": 
"ssl_expiry", - "TargetDown": "service_404", + # --- 主機層 (host_alerts) --- + "HostDown": "host_down", + "HostHighCpuLoad": "host_cpu", + "HostOutOfMemory": "host_memory", + "HostOutOfDiskSpace": "disk_full", + "HostBackupFailed": "backup_failure", + # --- K8s 層 (kubernetes_alerts) --- + "K3sNodeNotReady": "k8s_node_failure", + "KubePodCrashLooping": "k8s_pod_crash", + "KubePodNotReady": "k8s_pod_crash", + "KubeNodeNotReady": "k8s_node_failure", + "KubeNodeUnreachable": "k8s_node_failure", + "KubeDeploymentReplicasMismatch": "k8s_deployment_mismatch", + "VeleroBackupFailed": "backup_failure", + "VeleroBackupNotRun": "backup_failure", + # --- 資料庫 (database_alerts / database_detail_alerts) --- + "PostgreSQLDown": "database_down", + "RedisDown": "database_down", + "PostgreSQLHighConnections": "database_performance", + "RedisMemoryHigh": "high_memory", + "PostgreSQLSlowQueries": "database_performance", + "PostgreSQLDeadlocks": "database_performance", + "PostgreSQLTooManyConnections": "database_performance", + "RedisKeyEviction": "database_performance", + "RedisConnectionsHigh": "database_performance", + "RedisCommandLatencyHigh": "database_performance", + # --- 服務可用性 (service_alerts) --- + "OpenClawDown": "service_down", + "SignOzDown": "service_down", + "SentryDown": "service_down", + "HarborDown": "service_down", + "GiteaDown": "service_down", + "AlertmanagerDown": "service_down", + "MinIODown": "service_down", + "KaliScannerDown": "service_down", + # --- 外部網站 (external_website_alerts) --- + "MoWoooWorkDown": "service_404", + "TsenyangWebsiteDown": "service_404", + "StockWoooWorkDown": "service_404", + "BitanWoooWorkDown": "service_404", + "ExternalSiteSSLExpiringSoon": "ssl_expiry", + # --- 告警鏈路 (alert_chain) --- + "AlertChainBroken_Alertmanager": "alert_chain_broken", + "AlertChainBroken_Sentry": "alert_chain_broken", + "NoAlertsReceived2Hours": "alert_chain_broken", + "AlertChainUnhealthy": "alert_chain_broken", + # --- Docker 容器 (docker_health_alerts) --- + 
"DockerContainerUnhealthy": "docker_container_unhealthy", + "DockerContainerExited": "docker_container_unhealthy", + # --- 自動修復監控 (auto_repair) --- + "AutoRepairLowSuccessRate": "auto_repair_degraded", + "PermanentFixRequired": "auto_repair_degraded", + # --- 舊版相容 --- + "HighCPUUsage": "high_cpu", + "HighMemoryUsage": "high_memory", + "DiskSpaceLow": "disk_full", + "SSLCertExpiringSoon": "ssl_expiry", + "TargetDown": "service_404", } alert_type = alertname_to_type.get(alertname, "custom")