Files
awoooi/apps/api/tests/test_gap_a4_placeholder_resolution.py
Your Name e2ab879636
Some checks failed
CD Pipeline / tests (push) Failing after 52s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Successful in 11s
fix(alerts): correct telegram execution truth
2026-05-31 13:58:39 +08:00

245 lines
9.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
GAP-A4: 規則 Action 模板 Placeholder 解析修復測試
====================================================
建立: 2026-04-14 台北時間 Claude Sonnet 4.6
Bug 現象(修復前):
- Prometheus 告警無 deployment label → target 退回 alertname 或 "unknown"
- 規則引擎產生垃圾指令: `kubectl rollout restart deployment HostHighCpuLoad`
- GAP-A1 防注入閘擋下 → 自動修復路徑卡死 → 飛輪沉默 8.3 小時
修復邏輯:
1. _extract_vars 多層 label 查找: deployment > app > statefulset > pod(去後綴) > container
2. _is_bad_target 垃圾 target 識別 (unknown / alertname / IP:port / 含空白等)
3. match_rule 後置驗證: bad target → 清空 kubectl_command → 降級 LLM
🔴 遵循「禁止 Mock 測試鐵律」- 純邏輯不需 DB/Redis
"""
import pytest
from src.services.alert_rule_engine import (
_extract_vars,
_is_bad_target,
_strip_pod_suffix,
match_rule,
)
# =============================================================================
# _strip_pod_suffix
# =============================================================================
class TestStripPodSuffix:
    """Reduce a pod name to its Deployment/StatefulSet base name."""

    # (pod name, expected base name) pairs fed to the parametrized test.
    _CASES = [
        # Deployment format: ReplicaSet hash (5-10 chars) + pod hash (5 chars)
        ("awoooi-api-7d6b776f78-4sgjl", "awoooi-api"),
        ("api-server-5f8g9-x2m4k", "api-server"),
        ("nginx-deployment-abc12345-xyz89", "nginx-deployment"),
        # StatefulSet format (trailing numeric ordinal)
        ("postgresql-0", "postgresql"),
        ("redis-1", "redis"),
        ("mongo-replica-2", "mongo-replica"),
        # No suffix at all (bare Deployment or Service name)
        ("postgresql", "postgresql"),
        ("awoooi-api", "awoooi-api"),
    ]

    @pytest.mark.parametrize("pod,expected", _CASES)
    def test_strip(self, pod, expected):
        """The stripped pod name must equal the expected base name."""
        base_name = _strip_pod_suffix(pod)
        assert base_name == expected
# =============================================================================
# _is_bad_target
# =============================================================================
class TestIsBadTarget:
    """Detection of garbage targets."""

    # Alert name passed as the second argument in every check below.
    _ALERTNAME = "HostHighCpuLoad"

    # Targets that must be flagged as bad.
    _REJECTED = [
        "",
        "unknown",
        "none",
        "null",
        "HostHighCpuLoad",  # identical to the alertname
        "192.168.0.110:9100",  # IP:port
        "192.168.0.110",  # bare IP
        "awoooi prod",  # contains whitespace
        "service(x)",  # contains parentheses
        '"quoted"',  # contains quotes
        "{target}",  # unresolved placeholder
    ]

    # Targets that must be accepted as usable names.
    _ACCEPTED = [
        "awoooi-api",
        "postgresql",
        "my-svc-v2",
        "kube-state-metrics",
    ]

    @pytest.mark.parametrize("target", _REJECTED)
    def test_bad(self, target):
        """Every rejected sample must yield exactly ``True``."""
        verdict = _is_bad_target(target, self._ALERTNAME)
        assert verdict is True

    @pytest.mark.parametrize("target", _ACCEPTED)
    def test_good(self, target):
        """Every accepted sample must yield exactly ``False``."""
        verdict = _is_bad_target(target, self._ALERTNAME)
        assert verdict is False
# =============================================================================
# _extract_vars — 核心 GAP-A4 場景
# =============================================================================
class TestExtractVarsGapA4:
    """Root cause of the 8.3h flywheel silence: target=alertname/unknown.

    Each test builds an alert context dict (``target_resource``, ``labels``,
    ``namespace``) and checks the variables returned by ``_extract_vars``.
    """

    def test_target_equals_alertname_returns_unknown(self):
        """Real bug scenario: target_resource == alertname -> target='unknown'."""
        ctx = {
            "target_resource": "HostHighCpuLoad",
            "labels": {"alertname": "HostHighCpuLoad", "instance": "192.168.0.110:9100"},
            "namespace": "awoooi-prod",
        }
        # Named ``extracted`` rather than ``vars`` to avoid shadowing the builtin.
        extracted = _extract_vars(ctx)
        assert extracted["target"] == "unknown"

    def test_deployment_label_priority(self):
        """labels.deployment is the most authoritative source."""
        ctx = {
            "target_resource": "awoooi-api-7d6b776f78-4sgjl",
            "labels": {
                "alertname": "KubePodCrashLooping",
                "deployment": "awoooi-api",
                "pod": "awoooi-api-7d6b776f78-4sgjl",
            },
            "namespace": "awoooi-prod",
        }
        assert _extract_vars(ctx)["target"] == "awoooi-api"

    def test_pod_label_strips_suffix(self):
        """With only a pod label, the RS + pod hash suffix is stripped."""
        ctx = {
            "target_resource": "awoooi-api-7d6b776f78-4sgjl",
            "labels": {"alertname": "KubePodCrashLooping", "pod": "awoooi-api-7d6b776f78-4sgjl"},
            "namespace": "awoooi-prod",
        }
        assert _extract_vars(ctx)["target"] == "awoooi-api"

    def test_app_label_fallback(self):
        """Without deployment/pod labels, the app label is used."""
        ctx = {
            "target_resource": "prometheus-server-xyz",
            "labels": {"alertname": "PrometheusDown", "app": "prometheus"},
            "namespace": "monitoring",
        }
        assert _extract_vars(ctx)["target"] == "prometheus"

    def test_statefulset_label(self):
        """The statefulset label takes precedence over the pod label."""
        ctx = {
            "target_resource": "postgresql-0",
            "labels": {"alertname": "PostgreSQLDown", "statefulset": "postgresql", "pod": "postgresql-0"},
            "namespace": "awoooi-prod",
        }
        assert _extract_vars(ctx)["target"] == "postgresql"

    def test_ip_port_target_rejected(self):
        """An IP:port target_resource falls back to 'unknown'.

        Such a value cannot form a deployment name.
        """
        ctx = {
            "target_resource": "192.168.0.110:9100",
            "labels": {"alertname": "HostDown", "instance": "192.168.0.110:9100"},
            "namespace": "awoooi-prod",
        }
        assert _extract_vars(ctx)["target"] == "unknown"

    def test_clean_target_resource_accepted(self):
        """A clean target_resource is used as-is."""
        ctx = {
            "target_resource": "awoooi-web",
            "labels": {"alertname": "HighRequestLatency"},
            "namespace": "awoooi-prod",
        }
        assert _extract_vars(ctx)["target"] == "awoooi-web"

    def test_systemd_unit_label_extracted_for_host_rule_templates(self):
        """Systemd runner alert labels must fill {unit} in SSH diagnostics."""
        ctx = {
            "target_resource": "SystemdRunnerWatchdogEnabled",
            "labels": {
                "alertname": "SystemdRunnerWatchdogEnabled",
                "instance": "192.168.0.110:9100",
                "unit": "actions.runner.owenhytsai-awoooi.awoooi-110.service",
            },
            "namespace": "awoooi-prod",
        }
        # Named ``extracted`` rather than ``vars`` to avoid shadowing the builtin.
        extracted = _extract_vars(ctx)
        assert extracted["unit"] == "actions.runner.owenhytsai-awoooi.awoooi-110.service"
# =============================================================================
# match_rule 後置驗證 — 最後一道防線
# =============================================================================
class TestMatchRuleRejection:
    """With a garbage target, kubectl_command must be cleared (degrade to LLM)."""

    def test_bad_target_discards_kubectl_command(self):
        """HostHighCpuLoad must never be assembled into a bad kubectl target."""
        alert_labels = {"alertname": "HostHighCpuLoad", "instance": "192.168.0.110:9100"}
        ctx = {
            "alert_type": "high_cpu",
            "severity": "warning",
            "source": "prometheus",
            "target_resource": "HostHighCpuLoad",
            "namespace": "awoooi-prod",
            "message": "Host CPU > 90%",
            "labels": alert_labels,
        }
        result = match_rule(ctx)
        if result is None:
            # No rule matched, so there is no command to inspect.
            return

        # A rule may match host SSH diagnostics, but HostHighCpuLoad must not
        # be treated as a K8s target.
        command = result["kubectl_command"]
        assert command == "" or command.startswith("ssh "), (
            f"bad target 不應組裝 kubectl 指令, got: {command!r}"
        )
        assert "deployment/HostHighCpuLoad" not in command

    def test_good_target_preserves_kubectl_command(self):
        """With a real deployment name, kubectl_command is assembled normally."""
        alert_labels = {
            "alertname": "KubePodCrashLooping",
            "deployment": "awoooi-api",
            "pod": "awoooi-api-7d6b776f78-4sgjl",
        }
        ctx = {
            "alert_type": "k8s_pod_crash",
            "severity": "critical",
            "source": "alertmanager",
            "target_resource": "awoooi-api-7d6b776f78-4sgjl",
            "namespace": "awoooi-prod",
            "message": "Pod CrashLoopBackOff",
            "labels": alert_labels,
        }
        result = match_rule(ctx)
        # Only when a rule matched and produced a non-empty command is there
        # anything to verify; otherwise the test passes without assertions.
        if result is None or not result.get("kubectl_command"):
            return

        command = result["kubectl_command"]
        assert "awoooi-api" in command
        for forbidden in ("unknown", "KubePodCrashLooping"):
            assert forbidden not in command

    def test_systemd_runner_rule_preserves_unit_ssh_command(self):
        """SystemdRunner* must keep a filled read-only SSH diagnostic command."""
        unit_name = "actions.runner.owenhytsai-awoooi.awoooi-110.service"
        ctx = {
            "alert_type": "infrastructure",
            "severity": "warning",
            "source": "prometheus",
            "target_resource": "SystemdRunnerWatchdogEnabled",
            "namespace": "awoooi-prod",
            "message": "runner watchdog enabled",
            "labels": {
                "alertname": "SystemdRunnerWatchdogEnabled",
                "instance": "192.168.0.110:9100",
                "unit": unit_name,
            },
        }
        result = match_rule(ctx)
        assert result is not None
        assert result["rule_id"] == "systemd_runner_baseline_alert"

        command = result["kubectl_command"]
        assert command.startswith("ssh 192.168.0.110 ")
        assert unit_name in command
        # The {unit} placeholder must have been substituted.
        assert "{unit}" not in command