Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
🚨 真因診斷(統帥逮到): API log 顯示最近 1 小時爆發大量 auto_execute_blocked_unresolved_placeholder: - action: "kubectl rollout restart deployment HostHighCpuLoad" ← target=alertname - action: "kubectl rollout restart deployment unknown" - action: "kubectl scale deployment unknown --replicas=3" 根因:alert_rule_engine._extract_vars() target 解析邏輯不夠強健, 當 Prometheus 告警無 deployment label 時,退回 alertname 或 "unknown", 產生垃圾指令。GAP-A1 防注入閘正確攔下,但自動修復路徑因此卡死, KM 不寫入 → 飛輪沈默。 修復(三層防護): 1. 新增 _strip_pod_suffix() — K8s Pod 名稱還原 Deployment base - Deployment 格式: awoooi-api-7d6b776f78-4sgjl → awoooi-api - StatefulSet: postgresql-0 → postgresql - Legacy: my-job-x2m4k → my-job 2. 新增 _is_bad_target() — 垃圾 target 識別 - 空串 / "unknown" / "none" / "null" - target == alertname 本身 - IP:port 格式、純 IP、含空白/括號/引號 - 未解析 {placeholder} 3. 重寫 _extract_vars() — 多層 label 查找(權威優先): deployment > app > statefulset > pod(去後綴) > container > service > target_resource 每層都過 _is_bad_target 驗證,全失敗 → target="unknown" 4. match_rule() 後置雙驗證: - bad target → 清空 kubectl_command (降級 LLM) - 殘留 { or } → 清空 kubectl_command (模板未填完) 測試覆蓋: - 33 個新單元測試(GAP-A4 四大場景全覆蓋) - 214/214 回歸測試全過 影響: - 原本產出「kubectl rollout restart deployment HostHighCpuLoad」的路徑 → 現在會 `rule_kubectl_command_discarded_bad_target` 並降級 LLM - LLM 若能從錯誤 log 推理真實 deployment,飛輪恢復正常運轉 - 若 LLM 也無解,進 TYPE-4 人工扶梯 2026-04-14 Claude Sonnet 4.6(MASTER 藍圖之外的隱性 Bug 殲滅) Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
208 lines
7.8 KiB
Python
208 lines
7.8 KiB
Python
"""
|
||
GAP-A4: 規則 Action 模板 Placeholder 解析修復測試
|
||
====================================================
|
||
建立: 2026-04-14 台北時間 Claude Sonnet 4.6
|
||
|
||
Bug 現象(修復前):
|
||
- Prometheus 告警無 deployment label → target 退回 alertname 或 "unknown"
|
||
- 規則引擎產生垃圾指令: `kubectl rollout restart deployment HostHighCpuLoad`
|
||
- GAP-A1 防注入閘擋下 → 自動修復路徑卡死 → 飛輪沉默 8.3 小時
|
||
|
||
修復邏輯:
|
||
1. _extract_vars 多層 label 查找:deployment > app > statefulset > pod(去後綴) > container
|
||
2. _is_bad_target 垃圾 target 識別(unknown / alertname / IP:port / 含空白等)
|
||
3. match_rule 後置驗證:bad target → 清空 kubectl_command → 降級 LLM
|
||
|
||
🔴 遵循「禁止 Mock 測試鐵律」- 純邏輯不需 DB/Redis
|
||
"""
|
||
|
||
import pytest
|
||
|
||
from src.services.alert_rule_engine import (
|
||
_extract_vars,
|
||
_is_bad_target,
|
||
_strip_pod_suffix,
|
||
match_rule,
|
||
)
|
||
|
||
|
||
# =============================================================================
|
||
# _strip_pod_suffix
|
||
# =============================================================================
|
||
|
||
|
||
class TestStripPodSuffix:
    """Check that pod names are reduced to their Deployment/StatefulSet base name."""

    # (pod name, expected base name) pairs fed to test_strip.
    _CASES = [
        # Deployment-style names (RS hash of 5-10 chars + pod hash of 5 chars)
        ("awoooi-api-7d6b776f78-4sgjl", "awoooi-api"),
        ("api-server-5f8g9-x2m4k", "api-server"),
        ("nginx-deployment-abc12345-xyz89", "nginx-deployment"),
        # StatefulSet-style names (trailing ordinal)
        ("postgresql-0", "postgresql"),
        ("redis-1", "redis"),
        ("mongo-replica-2", "mongo-replica"),
        # No suffix at all (bare Deployment or Service name) - returned as-is
        ("postgresql", "postgresql"),
        ("awoooi-api", "awoooi-api"),
    ]

    @pytest.mark.parametrize(("pod", "expected"), _CASES)
    def test_strip(self, pod, expected):
        """The stripped name must equal the expected base name."""
        base_name = _strip_pod_suffix(pod)
        assert base_name == expected
|
||
|
||
|
||
# =============================================================================
|
||
# _is_bad_target
|
||
# =============================================================================
|
||
|
||
|
||
class TestIsBadTarget:
    """Check that garbage targets are recognised and clean names are accepted."""

    # Alert name handed to _is_bad_target as its second argument in every case.
    _ALERTNAME = "HostHighCpuLoad"

    # Values that must be flagged as unusable targets.
    _REJECTED = [
        "",
        "unknown",
        "none",
        "null",
        "HostHighCpuLoad",     # identical to the alertname
        "192.168.0.110:9100",  # IP:port
        "192.168.0.110",       # bare IP
        "awoooi prod",         # contains whitespace
        "service(x)",          # contains parentheses
        '"quoted"',            # contains quotes
        "{target}",            # unresolved placeholder
    ]

    # Values that must be accepted as targets.
    _ACCEPTED = [
        "awoooi-api",
        "postgresql",
        "my-svc-v2",
        "kube-state-metrics",
    ]

    @pytest.mark.parametrize("target", _REJECTED)
    def test_bad(self, target):
        """Every rejected value must yield exactly True."""
        verdict = _is_bad_target(target, self._ALERTNAME)
        assert verdict is True

    @pytest.mark.parametrize("target", _ACCEPTED)
    def test_good(self, target):
        """Every accepted value must yield exactly False."""
        verdict = _is_bad_target(target, self._ALERTNAME)
        assert verdict is False
|
||
|
||
|
||
# =============================================================================
|
||
# _extract_vars — 核心 GAP-A4 場景
|
||
# =============================================================================
|
||
|
||
|
||
class TestExtractVarsGapA4:
    """Regression tests for the root cause of the 8.3h flywheel silence.

    The failure mode was the extracted target resolving to the alertname or
    to "unknown".
    """

    @staticmethod
    def _resolved_target(ctx):
        """Return the "target" entry of the variables extracted from ``ctx``."""
        extracted = _extract_vars(ctx)
        return extracted["target"]

    def test_target_equals_alertname_returns_unknown(self):
        """Real bug scenario: target_resource == alertname -> target is "unknown"."""
        labels = {"alertname": "HostHighCpuLoad", "instance": "192.168.0.110:9100"}
        ctx = {
            "target_resource": "HostHighCpuLoad",
            "labels": labels,
            "namespace": "awoooi-prod",
        }
        assert self._resolved_target(ctx) == "unknown"

    def test_deployment_label_priority(self):
        """labels.deployment is the most authoritative source."""
        # NOTE(review): the pod label here presumably strips to the same
        # "awoooi-api", so this case alone may not distinguish the deployment
        # label from the pod fallback - confirm if precedence matters.
        labels = {
            "alertname": "KubePodCrashLooping",
            "deployment": "awoooi-api",
            "pod": "awoooi-api-7d6b776f78-4sgjl",
        }
        ctx = {
            "target_resource": "awoooi-api-7d6b776f78-4sgjl",
            "labels": labels,
            "namespace": "awoooi-prod",
        }
        assert self._resolved_target(ctx) == "awoooi-api"

    def test_pod_label_strips_suffix(self):
        """With only a pod label, the RS + pod hash suffix is removed."""
        labels = {
            "alertname": "KubePodCrashLooping",
            "pod": "awoooi-api-7d6b776f78-4sgjl",
        }
        ctx = {
            "target_resource": "awoooi-api-7d6b776f78-4sgjl",
            "labels": labels,
            "namespace": "awoooi-prod",
        }
        assert self._resolved_target(ctx) == "awoooi-api"

    def test_app_label_fallback(self):
        """Without deployment/pod labels, the app label is used."""
        labels = {"alertname": "PrometheusDown", "app": "prometheus"}
        ctx = {
            "target_resource": "prometheus-server-xyz",
            "labels": labels,
            "namespace": "monitoring",
        }
        assert self._resolved_target(ctx) == "prometheus"

    def test_statefulset_label(self):
        """The statefulset label takes precedence over the pod label."""
        # NOTE(review): "postgresql-0" presumably strips to "postgresql" as
        # well, so this case alone may not prove the precedence - confirm.
        labels = {
            "alertname": "PostgreSQLDown",
            "statefulset": "postgresql",
            "pod": "postgresql-0",
        }
        ctx = {
            "target_resource": "postgresql-0",
            "labels": labels,
            "namespace": "awoooi-prod",
        }
        assert self._resolved_target(ctx) == "postgresql"

    def test_ip_port_target_rejected(self):
        """An IP:port target_resource falls back to "unknown".

        Such a value cannot form a deployment name.
        """
        labels = {"alertname": "HostDown", "instance": "192.168.0.110:9100"}
        ctx = {
            "target_resource": "192.168.0.110:9100",
            "labels": labels,
            "namespace": "awoooi-prod",
        }
        assert self._resolved_target(ctx) == "unknown"

    def test_clean_target_resource_accepted(self):
        """A clean target_resource is used directly."""
        labels = {"alertname": "HighRequestLatency"}
        ctx = {
            "target_resource": "awoooi-web",
            "labels": labels,
            "namespace": "awoooi-prod",
        }
        assert self._resolved_target(ctx) == "awoooi-web"
|
||
|
||
|
||
# =============================================================================
|
||
# match_rule 後置驗證 — 最後一道防線
|
||
# =============================================================================
|
||
|
||
|
||
class TestMatchRuleRejection:
    """With a garbage target, kubectl_command must be emptied (downgrade to LLM)."""

    def test_bad_target_discards_kubectl_command(self):
        """Real bug: HostHighCpuLoad with an unusable target -> empty kubectl_command."""
        labels = {"alertname": "HostHighCpuLoad", "instance": "192.168.0.110:9100"}
        ctx = {
            "alert_type": "high_cpu",
            "severity": "warning",
            "source": "prometheus",
            "target_resource": "HostHighCpuLoad",
            "namespace": "awoooi-prod",
            "message": "Host CPU > 90%",
            "labels": labels,
        }
        result = match_rule(ctx)

        # No rule matched: nothing to check.
        if result is None:
            return

        # A rule may match (host_high_cpu), but kubectl_command must be empty.
        assert result["kubectl_command"] == "", (
            f"bad target 應導致 kubectl_command 清空, got: {result['kubectl_command']!r}"
        )

    def test_good_target_preserves_kubectl_command(self):
        """With a real deployment name, kubectl_command is assembled normally."""
        labels = {
            "alertname": "KubePodCrashLooping",
            "deployment": "awoooi-api",
            "pod": "awoooi-api-7d6b776f78-4sgjl",
        }
        ctx = {
            "alert_type": "k8s_pod_crash",
            "severity": "critical",
            "source": "alertmanager",
            "target_resource": "awoooi-api-7d6b776f78-4sgjl",
            "namespace": "awoooi-prod",
            "message": "Pod CrashLoopBackOff",
            "labels": labels,
        }
        result = match_rule(ctx)

        # Only check the command when a rule matched and produced a non-empty
        # kubectl_command; otherwise there is nothing to verify.
        if result is None or not result.get("kubectl_command"):
            return

        command = result["kubectl_command"]
        # The command must name the deployment, not a fallback or the alertname.
        assert "awoooi-api" in command
        assert "unknown" not in command
        assert "KubePodCrashLooping" not in command
|