245 lines
9.5 KiB
Python
245 lines
9.5 KiB
Python
"""
|
||
GAP-A4: 規則 Action 模板 Placeholder 解析修復測試
|
||
====================================================
|
||
建立: 2026-04-14 台北時間 Claude Sonnet 4.6
|
||
|
||
Bug 現象(修復前):
|
||
- Prometheus 告警無 deployment label → target 退回 alertname 或 "unknown"
|
||
- 規則引擎產生垃圾指令: `kubectl rollout restart deployment HostHighCpuLoad`
|
||
- GAP-A1 防注入閘擋下 → 自動修復路徑卡死 → 飛輪沉默 8.3 小時
|
||
|
||
修復邏輯:
|
||
1. _extract_vars 多層 label 查找:deployment > app > statefulset > pod(去後綴) > container
|
||
2. _is_bad_target 垃圾 target 識別(unknown / alertname / IP:port / 含空白等)
|
||
3. match_rule 後置驗證:bad target → 清空 kubectl_command → 降級 LLM
|
||
|
||
🔴 遵循「禁止 Mock 測試鐵律」- 純邏輯不需 DB/Redis
|
||
"""
|
||
|
||
import pytest
|
||
|
||
from src.services.alert_rule_engine import (
|
||
_extract_vars,
|
||
_is_bad_target,
|
||
_strip_pod_suffix,
|
||
match_rule,
|
||
)
|
||
|
||
# =============================================================================
|
||
# _strip_pod_suffix
|
||
# =============================================================================
|
||
|
||
|
||
class TestStripPodSuffix:
    """Pod names are reduced to their Deployment/StatefulSet base name."""

    @pytest.mark.parametrize(
        ("pod", "expected"),
        [
            # Deployment format (RS hash of 5-10 chars + pod hash of 5 chars)
            ("awoooi-api-7d6b776f78-4sgjl", "awoooi-api"),
            ("api-server-5f8g9-x2m4k", "api-server"),
            ("nginx-deployment-abc12345-xyz89", "nginx-deployment"),
            # StatefulSet format (trailing ordinal)
            ("postgresql-0", "postgresql"),
            ("redis-1", "redis"),
            ("mongo-replica-2", "mongo-replica"),
            # No suffix at all (bare Deployment name or Service name)
            ("postgresql", "postgresql"),
            ("awoooi-api", "awoooi-api"),
        ],
    )
    def test_strip(self, pod, expected):
        """Each pod name must map to the expected base name."""
        base_name = _strip_pod_suffix(pod)
        assert base_name == expected
|
||
|
||
|
||
# =============================================================================
|
||
# _is_bad_target
|
||
# =============================================================================
|
||
|
||
|
||
class TestIsBadTarget:
    """Recognition of garbage targets by _is_bad_target."""

    # Alertname handed to _is_bad_target in every case below.
    _ALERTNAME = "HostHighCpuLoad"

    @pytest.mark.parametrize(
        "target",
        [
            "",
            "unknown",
            "none",
            "null",
            "HostHighCpuLoad",  # equal to the alertname
            "192.168.0.110:9100",  # IP:port
            "192.168.0.110",  # bare IP
            "awoooi prod",  # contains whitespace
            "service(x)",  # contains parentheses
            '"quoted"',  # contains quotes
            "{target}",  # unresolved placeholder
        ],
    )
    def test_bad(self, target):
        """Every listed value must be flagged as a bad target."""
        verdict = _is_bad_target(target, self._ALERTNAME)
        assert verdict is True

    @pytest.mark.parametrize(
        "target",
        [
            "awoooi-api",
            "postgresql",
            "my-svc-v2",
            "kube-state-metrics",
        ],
    )
    def test_good(self, target):
        """Every listed value must be accepted as a usable target."""
        verdict = _is_bad_target(target, self._ALERTNAME)
        assert verdict is False
|
||
|
||
|
||
# =============================================================================
|
||
# _extract_vars — 核心 GAP-A4 場景
|
||
# =============================================================================
|
||
|
||
|
||
class TestExtractVarsGapA4:
    """Root cause of the 8.3h flywheel silence: target resolved to alertname/unknown."""

    def test_target_equals_alertname_returns_unknown(self):
        """Real bug scenario: target_resource == alertname -> target == 'unknown'."""
        ctx = {
            "target_resource": "HostHighCpuLoad",
            "labels": {"alertname": "HostHighCpuLoad", "instance": "192.168.0.110:9100"},
            "namespace": "awoooi-prod",
        }
        # Named `extracted` rather than `vars` so the builtin is not shadowed.
        extracted = _extract_vars(ctx)
        assert extracted["target"] == "unknown"

    def test_deployment_label_priority(self):
        """labels.deployment is the most authoritative source."""
        ctx = {
            "target_resource": "awoooi-api-7d6b776f78-4sgjl",
            "labels": {
                "alertname": "KubePodCrashLooping",
                "deployment": "awoooi-api",
                "pod": "awoooi-api-7d6b776f78-4sgjl",
            },
            "namespace": "awoooi-prod",
        }
        assert _extract_vars(ctx)["target"] == "awoooi-api"

    def test_pod_label_strips_suffix(self):
        """With only a pod label, the RS + pod hash suffix is stripped."""
        ctx = {
            "target_resource": "awoooi-api-7d6b776f78-4sgjl",
            "labels": {"alertname": "KubePodCrashLooping", "pod": "awoooi-api-7d6b776f78-4sgjl"},
            "namespace": "awoooi-prod",
        }
        assert _extract_vars(ctx)["target"] == "awoooi-api"

    def test_app_label_fallback(self):
        """Without deployment/pod labels, the app label is used."""
        ctx = {
            "target_resource": "prometheus-server-xyz",
            "labels": {"alertname": "PrometheusDown", "app": "prometheus"},
            "namespace": "monitoring",
        }
        assert _extract_vars(ctx)["target"] == "prometheus"

    def test_statefulset_label(self):
        """The statefulset label takes precedence over the pod label."""
        ctx = {
            "target_resource": "postgresql-0",
            "labels": {"alertname": "PostgreSQLDown", "statefulset": "postgresql", "pod": "postgresql-0"},
            "namespace": "awoooi-prod",
        }
        assert _extract_vars(ctx)["target"] == "postgresql"

    def test_ip_port_target_rejected(self):
        """An IP:port target_resource falls back to 'unknown' (not a deployment name)."""
        ctx = {
            "target_resource": "192.168.0.110:9100",
            "labels": {"alertname": "HostDown", "instance": "192.168.0.110:9100"},
            "namespace": "awoooi-prod",
        }
        assert _extract_vars(ctx)["target"] == "unknown"

    def test_clean_target_resource_accepted(self):
        """A clean target_resource is used as-is."""
        ctx = {
            "target_resource": "awoooi-web",
            "labels": {"alertname": "HighRequestLatency"},
            "namespace": "awoooi-prod",
        }
        assert _extract_vars(ctx)["target"] == "awoooi-web"

    def test_systemd_unit_label_extracted_for_host_rule_templates(self):
        """Systemd runner alert labels must fill {unit} in SSH diagnostics."""
        ctx = {
            "target_resource": "SystemdRunnerWatchdogEnabled",
            "labels": {
                "alertname": "SystemdRunnerWatchdogEnabled",
                "instance": "192.168.0.110:9100",
                "unit": "actions.runner.owenhytsai-awoooi.awoooi-110.service",
            },
            "namespace": "awoooi-prod",
        }
        # Named `extracted` rather than `vars` so the builtin is not shadowed.
        extracted = _extract_vars(ctx)
        assert extracted["unit"] == "actions.runner.owenhytsai-awoooi.awoooi-110.service"
|
||
|
||
|
||
# =============================================================================
|
||
# match_rule 後置驗證 — 最後一道防線
|
||
# =============================================================================
|
||
|
||
|
||
class TestMatchRuleRejection:
    """With a garbage target, kubectl_command must be emptied (fallback to the LLM)."""

    def test_bad_target_discards_kubectl_command(self):
        """HostHighCpuLoad with target=unknown must not yield a bad kubectl target."""
        labels = {"alertname": "HostHighCpuLoad", "instance": "192.168.0.110:9100"}
        ctx = {
            "alert_type": "high_cpu",
            "severity": "warning",
            "source": "prometheus",
            "target_resource": "HostHighCpuLoad",
            "namespace": "awoooi-prod",
            "message": "Host CPU > 90%",
            "labels": labels,
        }
        result = match_rule(ctx)
        # No matching rule means there is nothing to verify.
        if result is None:
            return
        # A rule may match a host SSH diagnostic, but HostHighCpuLoad must
        # never be treated as a K8s target.
        command = result["kubectl_command"]
        assert command == "" or command.startswith("ssh "), (
            f"bad target 不應組裝 kubectl 指令, got: {command!r}"
        )
        assert "deployment/HostHighCpuLoad" not in command

    def test_good_target_preserves_kubectl_command(self):
        """With a real deployment name, kubectl_command is assembled normally."""
        labels = {
            "alertname": "KubePodCrashLooping",
            "deployment": "awoooi-api",
            "pod": "awoooi-api-7d6b776f78-4sgjl",
        }
        ctx = {
            "alert_type": "k8s_pod_crash",
            "severity": "critical",
            "source": "alertmanager",
            "target_resource": "awoooi-api-7d6b776f78-4sgjl",
            "namespace": "awoooi-prod",
            "message": "Pod CrashLoopBackOff",
            "labels": labels,
        }
        result = match_rule(ctx)
        # Only checked when a rule matched and produced a non-empty command;
        # in that case the command must name awoooi-api.
        if result is None:
            return
        command = result.get("kubectl_command")
        if not command:
            return
        assert "awoooi-api" in command
        for forbidden in ("unknown", "KubePodCrashLooping"):
            assert forbidden not in command

    def test_systemd_runner_rule_preserves_unit_ssh_command(self):
        """SystemdRunner* must keep a filled read-only SSH diagnostic command."""
        unit_name = "actions.runner.owenhytsai-awoooi.awoooi-110.service"
        ctx = {
            "alert_type": "infrastructure",
            "severity": "warning",
            "source": "prometheus",
            "target_resource": "SystemdRunnerWatchdogEnabled",
            "namespace": "awoooi-prod",
            "message": "runner watchdog enabled",
            "labels": {
                "alertname": "SystemdRunnerWatchdogEnabled",
                "instance": "192.168.0.110:9100",
                "unit": unit_name,
            },
        }
        result = match_rule(ctx)
        assert result is not None
        assert result["rule_id"] == "systemd_runner_baseline_alert"
        command = result["kubectl_command"]
        assert command.startswith("ssh 192.168.0.110 ")
        assert unit_name in command
        # The {unit} placeholder must have been substituted.
        assert "{unit}" not in command
|