""" GAP-A4: 規則 Action 模板 Placeholder 解析修復測試 ==================================================== 建立: 2026-04-14 台北時間 Claude Sonnet 4.6 Bug 現象(修復前): - Prometheus 告警無 deployment label → target 退回 alertname 或 "unknown" - 規則引擎產生垃圾指令: `kubectl rollout restart deployment HostHighCpuLoad` - GAP-A1 防注入閘擋下 → 自動修復路徑卡死 → 飛輪沉默 8.3 小時 修復邏輯: 1. _extract_vars 多層 label 查找:deployment > app > statefulset > pod(去後綴) > container 2. _is_bad_target 垃圾 target 識別(unknown / alertname / IP:port / 含空白等) 3. match_rule 後置驗證:bad target → 清空 kubectl_command → 降級 LLM 🔴 遵循「禁止 Mock 測試鐵律」- 純邏輯不需 DB/Redis """ import pytest from src.services.alert_rule_engine import ( _extract_vars, _is_bad_target, _strip_pod_suffix, match_rule, ) # ============================================================================= # _strip_pod_suffix # ============================================================================= class TestStripPodSuffix: """Pod 名稱還原 Deployment/StatefulSet base name""" @pytest.mark.parametrize("pod,expected", [ # Deployment 格式(RS hash 5-10 chars + pod hash 5 chars) ("awoooi-api-7d6b776f78-4sgjl", "awoooi-api"), ("api-server-5f8g9-x2m4k", "api-server"), ("nginx-deployment-abc12345-xyz89", "nginx-deployment"), # StatefulSet 格式 ("postgresql-0", "postgresql"), ("redis-1", "redis"), ("mongo-replica-2", "mongo-replica"), # 無後綴(裸 Deployment 名或 Service 名) ("postgresql", "postgresql"), ("awoooi-api", "awoooi-api"), ]) def test_strip(self, pod, expected): assert _strip_pod_suffix(pod) == expected # ============================================================================= # _is_bad_target # ============================================================================= class TestIsBadTarget: """垃圾 target 識別""" @pytest.mark.parametrize("target", [ "", "unknown", "none", "null", "HostHighCpuLoad", # == alertname "192.168.0.110:9100", # IP:port "192.168.0.110", # 純 IP "awoooi prod", # 含空白 "service(x)", # 含括號 '"quoted"', # 含引號 "{target}", # 未解析 placeholder ]) def test_bad(self, target): assert _is_bad_target(target, "HostHighCpuLoad") is True @pytest.mark.parametrize("target", [ "awoooi-api", "postgresql", "my-svc-v2", "kube-state-metrics", ]) def test_good(self, target): assert _is_bad_target(target, "HostHighCpuLoad") is False # ============================================================================= # _extract_vars — 核心 GAP-A4 場景 # ============================================================================= class TestExtractVarsGapA4: """修復 8.3h 飛輪沈默的真因:target=alertname/unknown""" def test_target_equals_alertname_returns_unknown(self): """真實 bug 場景:target_resource == alertname → target='unknown'""" ctx = { "target_resource": "HostHighCpuLoad", "labels": {"alertname": "HostHighCpuLoad", "instance": "192.168.0.110:9100"}, "namespace": "awoooi-prod", } vars = _extract_vars(ctx) assert vars["target"] == "unknown" def test_deployment_label_priority(self): """labels.deployment 是最權威的來源""" ctx = { "target_resource": "awoooi-api-7d6b776f78-4sgjl", "labels": { "alertname": "KubePodCrashLooping", "deployment": "awoooi-api", "pod": "awoooi-api-7d6b776f78-4sgjl", }, "namespace": "awoooi-prod", } assert _extract_vars(ctx)["target"] == "awoooi-api" def test_pod_label_strips_suffix(self): """只有 pod label 時,去除 RS+pod hash 後綴""" ctx = { "target_resource": "awoooi-api-7d6b776f78-4sgjl", "labels": {"alertname": "KubePodCrashLooping", "pod": "awoooi-api-7d6b776f78-4sgjl"}, "namespace": "awoooi-prod", } assert _extract_vars(ctx)["target"] == "awoooi-api" def test_app_label_fallback(self): """無 deployment/pod 時,app label 可用""" ctx = { "target_resource": "prometheus-server-xyz", "labels": {"alertname": "PrometheusDown", "app": "prometheus"}, "namespace": "monitoring", } assert _extract_vars(ctx)["target"] == "prometheus" def test_statefulset_label(self): """StatefulSet label 優先於 pod""" ctx = { "target_resource": "postgresql-0", "labels": {"alertname": "PostgreSQLDown", "statefulset": "postgresql", "pod": "postgresql-0"}, "namespace": "awoooi-prod", } assert _extract_vars(ctx)["target"] == "postgresql" def test_ip_port_target_rejected(self): """target_resource 是 IP:port → 退回 unknown(不可組成 deployment 名)""" ctx = { "target_resource": "192.168.0.110:9100", "labels": {"alertname": "HostDown", "instance": "192.168.0.110:9100"}, "namespace": "awoooi-prod", } assert _extract_vars(ctx)["target"] == "unknown" def test_clean_target_resource_accepted(self): """乾淨的 target_resource 可直接用""" ctx = { "target_resource": "awoooi-web", "labels": {"alertname": "HighRequestLatency"}, "namespace": "awoooi-prod", } assert _extract_vars(ctx)["target"] == "awoooi-web" def test_systemd_unit_label_extracted_for_host_rule_templates(self): """Systemd runner alert labels must fill {unit} in SSH diagnostics.""" ctx = { "target_resource": "SystemdRunnerWatchdogEnabled", "labels": { "alertname": "SystemdRunnerWatchdogEnabled", "instance": "192.168.0.110:9100", "unit": "actions.runner.owenhytsai-awoooi.awoooi-110.service", }, "namespace": "awoooi-prod", } vars = _extract_vars(ctx) assert vars["unit"] == "actions.runner.owenhytsai-awoooi.awoooi-110.service" # ============================================================================= # match_rule 後置驗證 — 最後一道防線 # ============================================================================= class TestMatchRuleRejection: """垃圾 target 時 kubectl_command 必須被清空(降級 LLM)""" def test_bad_target_discards_kubectl_command(self): """HostHighCpuLoad target=unknown → 不得組裝成壞 kubectl target。""" ctx = { "alert_type": "high_cpu", "severity": "warning", "source": "prometheus", "target_resource": "HostHighCpuLoad", "namespace": "awoooi-prod", "message": "Host CPU > 90%", "labels": {"alertname": "HostHighCpuLoad", "instance": "192.168.0.110:9100"}, } result = match_rule(ctx) # 規則可能匹配 host SSH 診斷;但不能把 HostHighCpuLoad 當成 K8s target。 if result is not None: command = result["kubectl_command"] assert command == "" or command.startswith("ssh "), \ f"bad target 不應組裝 kubectl 指令, got: {command!r}" assert "deployment/HostHighCpuLoad" not in command def test_good_target_preserves_kubectl_command(self): """真實 deployment 名稱時,kubectl_command 正常組裝""" ctx = { "alert_type": "k8s_pod_crash", "severity": "critical", "source": "alertmanager", "target_resource": "awoooi-api-7d6b776f78-4sgjl", "namespace": "awoooi-prod", "message": "Pod CrashLoopBackOff", "labels": { "alertname": "KubePodCrashLooping", "deployment": "awoooi-api", "pod": "awoooi-api-7d6b776f78-4sgjl", }, } result = match_rule(ctx) # 若有匹配規則且 suggested_action 含 kubectl,則命令應含 awoooi-api if result is not None and result.get("kubectl_command"): assert "awoooi-api" in result["kubectl_command"] assert "unknown" not in result["kubectl_command"] assert "KubePodCrashLooping" not in result["kubectl_command"] def test_systemd_runner_rule_preserves_unit_ssh_command(self): """SystemdRunner* must keep a filled read-only SSH diagnostic command.""" ctx = { "alert_type": "infrastructure", "severity": "warning", "source": "prometheus", "target_resource": "SystemdRunnerWatchdogEnabled", "namespace": "awoooi-prod", "message": "runner watchdog enabled", "labels": { "alertname": "SystemdRunnerWatchdogEnabled", "instance": "192.168.0.110:9100", "unit": "actions.runner.owenhytsai-awoooi.awoooi-110.service", }, } result = match_rule(ctx) assert result is not None assert result["rule_id"] == "systemd_runner_baseline_alert" assert result["kubectl_command"].startswith("ssh 192.168.0.110 ") assert "actions.runner.owenhytsai-awoooi.awoooi-110.service" in result["kubectl_command"] assert "{unit}" not in result["kubectl_command"]