Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
問題根因: 1. generate_fingerprint 用 alert_type(大量 alertname 落入 "custom") → 不同告警名稱同目標共用指紋 → 30 分鐘 debounce 互相擋截 2. classify_alert_early 漏掉 DeadMansSwitch / NoAlertsReceived / PrometheusNotConnectedToAlertmanager → 落入 TYPE-3 一般告警 修復: - alert_analyzer_service.py: 指紋改為 namespace:deployment:alertname:target_resource alertname 取自 labels(Alertmanager),fallback 到 alert_type(其他來源) - incident_service.py: DeadMansSwitch → backup/TYPE-1; NoAlertsReceived + PrometheusNotConnectedToAlertmanager → alertchain_health/TYPE-8M - 補 2 個測試,全套 627 passed Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
305 lines
11 KiB
Python
305 lines
11 KiB
Python
"""
|
||
Unit tests for the classify_alert_early() classifier — ADR-073 Phase 2-2 + ADR-075

Covers 13 classification rules across the 14 alert categories listed below:
  config_drift (TYPE-4D)
  alertchain_health, flywheel_health (TYPE-8M) — beats severity
  info, backup (TYPE-1)
  host_resource, infrastructure, kubernetes, database,
  storage, devops_tool, external_site, ssl_cert, general (TYPE-3)

2026-04-12 ogt (ADR-073 P2-2 test hardening + seven new ADR-075 categories)
|
||
"""
|
||
|
||
import pytest
|
||
from src.services.incident_service import classify_alert_early
|
||
|
||
|
||
# --------------------------------------------------------------------------- #
|
||
# TYPE-4D: Config Drift
|
||
# --------------------------------------------------------------------------- #
|
||
|
||
class TestConfigDrift:
    """Config-drift alert names must map to ``config_drift`` / ``TYPE-4D``."""

    def test_configuration_drift(self):
        """ConfigurationDrift at critical severity is a config-drift alert."""
        category, notify_type = classify_alert_early(
            "ConfigurationDrift", "critical", {}
        )
        assert (category, notify_type) == ("config_drift", "TYPE-4D")

    def test_kube_config_drift(self):
        """KubeConfigDrift at warning severity is a config-drift alert."""
        category, notify_type = classify_alert_early(
            "KubeConfigDrift", "warning", {}
        )
        assert (category, notify_type) == ("config_drift", "TYPE-4D")

    def test_config_drift_case_sensitive(self):
        """A lower-cased variant must not be treated as config drift."""
        # Variants that are not on the whitelist must not hit config_drift.
        category, _ = classify_alert_early("configurationdrift", "critical", {})
        assert category != "config_drift"
|
||
|
||
|
||
# --------------------------------------------------------------------------- #
|
||
# TYPE-1: Info / Heartbeat
|
||
# --------------------------------------------------------------------------- #
|
||
|
||
class TestInfoAlerts:
    """Informational / heartbeat alerts (TYPE-1) and their backup edge cases."""

    def test_severity_info(self):
        """severity=info yields the ``info`` category with TYPE-1."""
        category, notify_type = classify_alert_early("SomeAlert", "info", {})
        assert (category, notify_type) == ("info", "TYPE-1")

    def test_severity_none(self):
        """severity=none yields the ``info`` category with TYPE-1."""
        category, notify_type = classify_alert_early("SomeAlert", "none", {})
        assert (category, notify_type) == ("info", "TYPE-1")

    def test_backup_keyword_info_only(self):
        """A backup-named alert at severity=info is TYPE-1."""
        # severity=info -> the severity rule matches first, TYPE-1.
        _, notify_type = classify_alert_early("BackupJobComplete", "info", {})
        assert notify_type == "TYPE-1"

    def test_backup_keyword_warning_not_type1(self):
        """A backup-named alert at severity=warning is TYPE-3."""
        # BackupJobFailed with severity=warning -> falls through to the prefix
        # rules; it must not be TYPE-1.
        _, notify_type = classify_alert_early("BackupJobFailed", "warning", {})
        assert notify_type == "TYPE-3"

    def test_watchdog_heartbeat(self):
        """Watchdog at severity=none is TYPE-1."""
        # Watchdog (Alertmanager heartbeat) with severity=none -> the severity
        # rule matches first, TYPE-1.
        _, notify_type = classify_alert_early("Watchdog", "none", {})
        assert notify_type == "TYPE-1"

    def test_deadmansswitch_heartbeat(self):
        """DeadMansSwitch at severity=warning is ``backup`` / TYPE-1."""
        # DeadMansSwitch heartbeat -> TYPE-1 (added 2026-04-12 ogt).
        category, notify_type = classify_alert_early(
            "DeadMansSwitch", "warning", {}
        )
        assert (category, notify_type) == ("backup", "TYPE-1")

    def test_backup_critical_not_type1(self):
        """A critical backup alert is TYPE-3."""
        # Critical backup alerts should go through their own prefix rather
        # than being treated as purely informational.
        _, notify_type = classify_alert_early("BACKUP_MISSING", "critical", {})
        assert notify_type == "TYPE-3"
|
||
|
||
|
||
# --------------------------------------------------------------------------- #
|
||
# TYPE-3: Infrastructure (Docker / Host)
|
||
# --------------------------------------------------------------------------- #
|
||
|
||
class TestInfrastructure:
    """Docker alerts stay ``infrastructure``; Host* alerts are ``host_resource``."""

    def test_docker_prefix(self):
        """A Docker-prefixed critical alert is ``infrastructure`` / TYPE-3."""
        category, notify_type = classify_alert_early(
            "DockerContainerOOM", "critical", {}
        )
        # Docker -> stays in the infrastructure category.
        assert (category, notify_type) == ("infrastructure", "TYPE-3")

    def test_docker_restart(self):
        """A Docker-prefixed warning alert is ``infrastructure``."""
        category, _ = classify_alert_early(
            "DockerContainerRestarting", "warning", {}
        )
        assert category == "infrastructure"

    # ADR-075: Host* was split out of infrastructure into host_resource.
    def test_host_prefix_is_host_resource(self):
        """A Host-prefixed warning alert is ``host_resource`` / TYPE-3."""
        category, notify_type = classify_alert_early(
            "HostHighCpuLoad", "warning", {}
        )
        assert (category, notify_type) == ("host_resource", "TYPE-3")

    def test_host_down(self):
        """HostDown is ``host_resource``."""
        category, _ = classify_alert_early("HostDown", "critical", {})
        assert category == "host_resource"

    def test_host_memory(self):
        """HostOutOfMemory is ``host_resource``."""
        category, _ = classify_alert_early("HostOutOfMemory", "warning", {})
        assert category == "host_resource"

    def test_host_disk(self):
        """HostOutOfDiskSpace is ``host_resource``."""
        category, _ = classify_alert_early("HostOutOfDiskSpace", "warning", {})
        assert category == "host_resource"
|
||
|
||
|
||
# --------------------------------------------------------------------------- #
|
||
# ADR-075: alertchain_health (TYPE-8M)
|
||
# --------------------------------------------------------------------------- #
|
||
|
||
class TestAlertchainHealth:
    """ADR-075: alert-chain health alerts map to ``alertchain_health`` / TYPE-8M."""

    @pytest.mark.parametrize(
        "alertname",
        [
            "AlertChainBroken_Alertmanager",
            "AlertChainBroken_Sentry",
            "NoAlertsReceived2Hours",
            "AlertChainUnhealthy",
            "NoAlertsReceived",
            "PrometheusNotConnectedToAlertmanager",
        ],
    )
    def test_alertchain_alerts(self, alertname):
        """Each listed alert name at critical severity is alert-chain health."""
        category, notify_type = classify_alert_early(alertname, "critical", {})
        assert (category, notify_type) == ("alertchain_health", "TYPE-8M")

    def test_alertchain_beats_severity_info(self):
        """The alert-chain classification is expected even at severity=info."""
        # Even with severity=info, AlertChainBroken must be alertchain_health.
        category, notify_type = classify_alert_early(
            "AlertChainBroken_Alertmanager", "info", {}
        )
        assert (category, notify_type) == ("alertchain_health", "TYPE-8M")
|
||
|
||
|
||
# --------------------------------------------------------------------------- #
|
||
# ADR-075: flywheel_health (TYPE-8M)
|
||
# --------------------------------------------------------------------------- #
|
||
|
||
class TestFlywheelHealth:
    """ADR-075: flywheel alerts map to ``flywheel_health`` / TYPE-8M."""

    def test_auto_repair_low_success(self):
        """AutoRepairLowSuccessRate at warning severity is flywheel health."""
        category, notify_type = classify_alert_early(
            "AutoRepairLowSuccessRate", "warning", {}
        )
        assert (category, notify_type) == ("flywheel_health", "TYPE-8M")

    def test_permanent_fix_required(self):
        """PermanentFixRequired at warning severity is flywheel health."""
        category, notify_type = classify_alert_early(
            "PermanentFixRequired", "warning", {}
        )
        assert (category, notify_type) == ("flywheel_health", "TYPE-8M")

    def test_flywheel_prefix(self):
        """A Flywheel-prefixed critical alert is flywheel health."""
        category, notify_type = classify_alert_early(
            "FlywheelPlaybookZero", "critical", {}
        )
        assert (category, notify_type) == ("flywheel_health", "TYPE-8M")

    def test_flywheel_beats_severity_info(self):
        """The flywheel classification is expected even at severity=info."""
        category, notify_type = classify_alert_early(
            "AutoRepairLowSuccessRate", "info", {}
        )
        assert (category, notify_type) == ("flywheel_health", "TYPE-8M")
|
||
|
||
|
||
# --------------------------------------------------------------------------- #
|
||
# ADR-075: storage (TYPE-3)
|
||
# --------------------------------------------------------------------------- #
|
||
|
||
class TestStorage:
    """ADR-075: storage alerts map to ``storage`` / TYPE-3."""

    def test_minio_down(self):
        """MinIODown at critical severity is a storage alert."""
        category, notify_type = classify_alert_early("MinIODown", "critical", {})
        assert (category, notify_type) == ("storage", "TYPE-3")
|
||
|
||
|
||
# --------------------------------------------------------------------------- #
|
||
# ADR-075: devops_tool (TYPE-3)
|
||
# --------------------------------------------------------------------------- #
|
||
|
||
class TestDevopsTool:
    """ADR-075: DevOps tooling alerts map to ``devops_tool`` / TYPE-3."""

    @pytest.mark.parametrize(
        "alertname",
        [
            "OpenClawDown",
            "SignOzDown",
            "GiteaDown",
            "HarborDown",
            "SentryDown",
            "AlertmanagerDown",
            "KaliScannerDown",
            "GiteaCIPipelineFailed",
        ],
    )
    def test_devops_tools(self, alertname):
        """Each listed alert name at critical severity is a devops-tool alert."""
        category, notify_type = classify_alert_early(alertname, "critical", {})
        assert (category, notify_type) == ("devops_tool", "TYPE-3")
|
||
|
||
|
||
# --------------------------------------------------------------------------- #
|
||
# ADR-075: external_site (TYPE-3)
|
||
# --------------------------------------------------------------------------- #
|
||
|
||
class TestExternalSite:
    """ADR-075: external site alerts map to ``external_site`` / TYPE-3."""

    @pytest.mark.parametrize(
        "alertname",
        [
            "MoWoooWorkDown",
            "TsenyangWebsiteDown",
            "StockWoooWorkDown",
            "BitanWoooWorkDown",
        ],
    )
    def test_external_sites(self, alertname):
        """Each listed alert name at critical severity is an external-site alert."""
        category, notify_type = classify_alert_early(alertname, "critical", {})
        assert (category, notify_type) == ("external_site", "TYPE-3")
|
||
|
||
|
||
# --------------------------------------------------------------------------- #
|
||
# ADR-075: ssl_cert (TYPE-3)
|
||
# --------------------------------------------------------------------------- #
|
||
|
||
class TestSslCert:
    """ADR-075: certificate alerts map to ``ssl_cert`` / TYPE-3."""

    def test_external_site_ssl(self):
        """ExternalSiteSSLExpiringSoon at warning severity is an ssl_cert alert."""
        category, notify_type = classify_alert_early(
            "ExternalSiteSSLExpiringSoon", "warning", {}
        )
        assert (category, notify_type) == ("ssl_cert", "TYPE-3")

    def test_tls_cert(self):
        """TLSCertExpiryCritical at critical severity is an ssl_cert alert."""
        category, notify_type = classify_alert_early(
            "TLSCertExpiryCritical", "critical", {}
        )
        assert (category, notify_type) == ("ssl_cert", "TYPE-3")
|
||
|
||
|
||
# --------------------------------------------------------------------------- #
|
||
# TYPE-3: Kubernetes
|
||
# --------------------------------------------------------------------------- #
|
||
|
||
class TestKubernetes:
    """Kubernetes-related alerts map to ``kubernetes`` / TYPE-3."""

    @pytest.mark.parametrize(
        "alertname",
        [
            "KubePodCrashLooping",
            "PodHighMemory",
            "DeploymentReplicasMismatch",
            "NodeNotReady",
            "ArgoCDSyncFailed",
        ],
    )
    def test_k8s_prefixes(self, alertname):
        """Each listed alert name at critical severity is a kubernetes alert."""
        category, notify_type = classify_alert_early(alertname, "critical", {})
        assert (category, notify_type) == ("kubernetes", "TYPE-3")

    def test_velero_backup_failed_is_kubernetes(self):
        """A critical Velero backup failure is ``kubernetes`` / TYPE-3."""
        # VeleroBackupFailed with severity=critical -> the backup rule does not
        # match; the Velero prefix applies -> kubernetes, TYPE-3.
        category, notify_type = classify_alert_early(
            "VeleroBackupFailed", "critical", {}
        )
        assert (category, notify_type) == ("kubernetes", "TYPE-3")

    def test_velero_backup_success_info_is_type1(self):
        """A Velero backup success at severity=info is TYPE-1."""
        # VeleroBackupSuccess with severity=info -> TYPE-1.
        _, notify_type = classify_alert_early("VeleroBackupSuccess", "info", {})
        assert notify_type == "TYPE-1"
|
||
|
||
|
||
# --------------------------------------------------------------------------- #
|
||
# TYPE-3: Database
|
||
# --------------------------------------------------------------------------- #
|
||
|
||
class TestDatabase:
    """Database alerts map to ``database`` / TYPE-3."""

    def test_postgres(self):
        """PostgresDown at critical severity is a database alert."""
        category, notify_type = classify_alert_early("PostgresDown", "critical", {})
        assert (category, notify_type) == ("database", "TYPE-3")

    def test_redis(self):
        """RedisMemoryHigh at warning severity is a database alert."""
        category, notify_type = classify_alert_early(
            "RedisMemoryHigh", "warning", {}
        )
        assert (category, notify_type) == ("database", "TYPE-3")
|
||
|
||
|
||
# --------------------------------------------------------------------------- #
|
||
# TYPE-3: General (fallback)
|
||
# --------------------------------------------------------------------------- #
|
||
|
||
class TestGeneral:
    """Unrecognised alert names fall back to ``general`` / TYPE-3."""

    def test_unknown_alert(self):
        """An unknown alert name at warning severity is ``general`` / TYPE-3."""
        category, notify_type = classify_alert_early(
            "SomeUnknownAlert", "warning", {}
        )
        assert (category, notify_type) == ("general", "TYPE-3")

    def test_empty_alertname(self):
        """An empty alert name at warning severity is ``general`` / TYPE-3."""
        category, notify_type = classify_alert_early("", "warning", {})
        assert (category, notify_type) == ("general", "TYPE-3")
|
||
|
||
|
||
# --------------------------------------------------------------------------- #
|
||
# 優先順序驗證 — config_drift 和 info 應優先於 prefix 規則
|
||
# --------------------------------------------------------------------------- #
|
||
|
||
class TestPriority:
    """Rule-ordering checks: config_drift and info win over prefix rules."""

    def test_config_drift_beats_severity(self):
        """ConfigurationDrift is config_drift at severity=warning too."""
        # ConfigurationDrift must hit config_drift even with severity=warning.
        category, _ = classify_alert_early("ConfigurationDrift", "warning", {})
        assert category == "config_drift"

    def test_info_severity_beats_docker_prefix(self):
        """severity=info takes precedence over the Docker prefix."""
        # Docker prefix + severity=info -> the info rule matches first.
        category, _ = classify_alert_early("DockerContainerOOM", "info", {})
        assert category == "info"
|