Files
awoooi/apps/api/tests/test_gap_a4_placeholder_resolution.py
OG T 10b74affcf
Some checks failed
CD Pipeline / build-and-deploy (push) Has been cancelled
fix(GAP-A4): 規則 Action 模板 placeholder 解析修復 — 解開 8.3h 飛輪沉默
🚨 真因診斷(統帥逮到):
API log 顯示最近 1 小時爆發大量 auto_execute_blocked_unresolved_placeholder:
  - action: "kubectl rollout restart deployment HostHighCpuLoad"  ← target=alertname
  - action: "kubectl rollout restart deployment unknown"
  - action: "kubectl scale deployment unknown --replicas=3"

根因:alert_rule_engine._extract_vars() target 解析邏輯不夠強健,
當 Prometheus 告警無 deployment label 時,退回 alertname 或 "unknown",
產生垃圾指令。GAP-A1 防注入閘正確攔下,但自動修復路徑因此卡死,
KM 不寫入 → 飛輪沉默。

修復(三層防護):

1. 新增 _strip_pod_suffix() — K8s Pod 名稱還原 Deployment base
   - Deployment 格式: awoooi-api-7d6b776f78-4sgjl → awoooi-api
   - StatefulSet: postgresql-0 → postgresql
   - Legacy: my-job-x2m4k → my-job

2. 新增 _is_bad_target() — 垃圾 target 識別
   - 空串 / "unknown" / "none" / "null"
   - target == alertname 本身
   - IP:port 格式、純 IP、含空白/括號/引號
   - 未解析 {placeholder}

3. 重寫 _extract_vars() — 多層 label 查找(權威優先):
   deployment > app > statefulset > pod(去後綴) > container > service > target_resource
   每層都過 _is_bad_target 驗證,全失敗 → target="unknown"

4. match_rule() 後置雙驗證:
   - bad target → 清空 kubectl_command (降級 LLM)
   - 殘留 { or } → 清空 kubectl_command (模板未填完)

測試覆蓋:
- 33 個新單元測試(GAP-A4 四大場景全覆蓋)
- 214/214 回歸測試全過

影響:
- 原本產出「kubectl rollout restart deployment HostHighCpuLoad」的路徑
  → 現在會 `rule_kubectl_command_discarded_bad_target` 並降級 LLM
- LLM 若能從錯誤 log 推理真實 deployment,飛輪恢復正常運轉
- 若 LLM 也無解,進 TYPE-4 人工扶梯

2026-04-14 Claude Sonnet 4.6(MASTER 藍圖之外的隱性 Bug 殲滅)

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2026-04-14 18:43:29 +08:00

208 lines
7.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
GAP-A4: 規則 Action 模板 Placeholder 解析修復測試
====================================================
建立: 2026-04-14 台北時間 Claude Sonnet 4.6
Bug 現象(修復前):
- Prometheus 告警無 deployment label → target 退回 alertname 或 "unknown"
- 規則引擎產生垃圾指令: `kubectl rollout restart deployment HostHighCpuLoad`
- GAP-A1 防注入閘擋下 → 自動修復路徑卡死 → 飛輪沉默 8.3 小時
修復邏輯:
1. _extract_vars 多層 label 查找(deployment > app > statefulset > pod(去後綴) > container)
2. _is_bad_target 垃圾 target 識別(unknown / alertname / IP:port / 含空白等)
3. match_rule 後置驗證(bad target → 清空 kubectl_command → 降級 LLM)
🔴 遵循「禁止 Mock 測試鐵律」- 純邏輯不需 DB/Redis
"""
import pytest
from src.services.alert_rule_engine import (
_extract_vars,
_is_bad_target,
_strip_pod_suffix,
match_rule,
)
# =============================================================================
# _strip_pod_suffix
# =============================================================================
class TestStripPodSuffix:
    """Check that a Pod name is reduced to its Deployment/StatefulSet base name."""

    # (pod name, expected base name) pairs fed to ``test_strip``.
    _CASES = [
        # Deployment format: ReplicaSet hash (5-10 chars) + pod hash (5 chars).
        ("awoooi-api-7d6b776f78-4sgjl", "awoooi-api"),
        ("api-server-5f8g9-x2m4k", "api-server"),
        ("nginx-deployment-abc12345-xyz89", "nginx-deployment"),
        # StatefulSet format: trailing ordinal index.
        ("postgresql-0", "postgresql"),
        ("redis-1", "redis"),
        ("mongo-replica-2", "mongo-replica"),
        # No suffix at all (bare Deployment or Service name) -> unchanged.
        ("postgresql", "postgresql"),
        ("awoooi-api", "awoooi-api"),
    ]

    @pytest.mark.parametrize("pod,expected", _CASES)
    def test_strip(self, pod, expected):
        """``_strip_pod_suffix(pod)`` must equal the expected base name."""
        base_name = _strip_pod_suffix(pod)
        assert base_name == expected
# =============================================================================
# _is_bad_target
# =============================================================================
class TestIsBadTarget:
    """Check the classification of garbage vs. usable remediation targets."""

    # Alert name passed as the second argument in every case below.
    _ALERTNAME = "HostHighCpuLoad"

    # Targets that must be flagged as bad.
    _BAD_TARGETS = [
        "", "unknown", "none", "null",
        "HostHighCpuLoad",       # identical to the alertname
        "192.168.0.110:9100",    # IP:port
        "192.168.0.110",         # bare IP
        "awoooi prod",           # contains whitespace
        "service(x)",            # contains parentheses
        '"quoted"',              # contains quotes
        "{target}",              # unresolved placeholder
    ]

    # Targets that must be accepted.
    _GOOD_TARGETS = [
        "awoooi-api",
        "postgresql",
        "my-svc-v2",
        "kube-state-metrics",
    ]

    @pytest.mark.parametrize("target", _BAD_TARGETS)
    def test_bad(self, target):
        """Every garbage target is reported as bad (strictly ``True``)."""
        verdict = _is_bad_target(target, self._ALERTNAME)
        assert verdict is True

    @pytest.mark.parametrize("target", _GOOD_TARGETS)
    def test_good(self, target):
        """Every well-formed workload name is reported as usable (strictly ``False``)."""
        verdict = _is_bad_target(target, self._ALERTNAME)
        assert verdict is False
# =============================================================================
# _extract_vars — 核心 GAP-A4 場景
# =============================================================================
class TestExtractVarsGapA4:
    """Core GAP-A4 scenarios for ``_extract_vars``.

    Covers the root cause of the 8.3h flywheel silence: the target resolving
    to the alertname or to "unknown".
    """

    @staticmethod
    def _build_ctx(target_resource, labels, namespace="awoooi-prod"):
        """Assemble an alert context dict (keys: target_resource, labels, namespace)."""
        return {
            "target_resource": target_resource,
            "labels": labels,
            "namespace": namespace,
        }

    def test_target_equals_alertname_returns_unknown(self):
        """Real bug scenario: target_resource == alertname -> target == 'unknown'."""
        ctx = self._build_ctx(
            "HostHighCpuLoad",
            {"alertname": "HostHighCpuLoad", "instance": "192.168.0.110:9100"},
        )
        resolved = _extract_vars(ctx)
        assert resolved["target"] == "unknown"

    def test_deployment_label_priority(self):
        """``labels.deployment`` is the most authoritative source."""
        pod_name = "awoooi-api-7d6b776f78-4sgjl"
        ctx = self._build_ctx(
            pod_name,
            {
                "alertname": "KubePodCrashLooping",
                "deployment": "awoooi-api",
                "pod": pod_name,
            },
        )
        resolved = _extract_vars(ctx)
        assert resolved["target"] == "awoooi-api"

    def test_pod_label_strips_suffix(self):
        """With only a pod label, the ReplicaSet + pod hash suffix is removed."""
        pod_name = "awoooi-api-7d6b776f78-4sgjl"
        ctx = self._build_ctx(
            pod_name,
            {"alertname": "KubePodCrashLooping", "pod": pod_name},
        )
        resolved = _extract_vars(ctx)
        assert resolved["target"] == "awoooi-api"

    def test_app_label_fallback(self):
        """Without deployment/pod labels, the app label is used."""
        ctx = self._build_ctx(
            "prometheus-server-xyz",
            {"alertname": "PrometheusDown", "app": "prometheus"},
            namespace="monitoring",
        )
        resolved = _extract_vars(ctx)
        assert resolved["target"] == "prometheus"

    def test_statefulset_label(self):
        """The statefulset label takes precedence over the pod label."""
        ctx = self._build_ctx(
            "postgresql-0",
            {
                "alertname": "PostgreSQLDown",
                "statefulset": "postgresql",
                "pod": "postgresql-0",
            },
        )
        resolved = _extract_vars(ctx)
        assert resolved["target"] == "postgresql"

    def test_ip_port_target_rejected(self):
        """An IP:port target_resource falls back to 'unknown' (not a deployment name)."""
        endpoint = "192.168.0.110:9100"
        ctx = self._build_ctx(
            endpoint,
            {"alertname": "HostDown", "instance": endpoint},
        )
        resolved = _extract_vars(ctx)
        assert resolved["target"] == "unknown"

    def test_clean_target_resource_accepted(self):
        """A clean target_resource is used as-is."""
        ctx = self._build_ctx(
            "awoooi-web",
            {"alertname": "HighRequestLatency"},
        )
        resolved = _extract_vars(ctx)
        assert resolved["target"] == "awoooi-web"
# =============================================================================
# match_rule 後置驗證 — 最後一道防線
# =============================================================================
class TestMatchRuleRejection:
    """Post-validation in ``match_rule``: the last line of defence.

    With a garbage target, ``kubectl_command`` must be emptied; per the
    original notes this downgrades handling to the LLM path.
    """

    def test_bad_target_discards_kubectl_command(self):
        """Real bug: HostHighCpuLoad with target=unknown -> kubectl_command is emptied."""
        labels = {"alertname": "HostHighCpuLoad", "instance": "192.168.0.110:9100"}
        ctx = {
            "alert_type": "high_cpu",
            "severity": "warning",
            "source": "prometheus",
            "target_resource": "HostHighCpuLoad",
            "namespace": "awoooi-prod",
            "message": "Host CPU > 90%",
            "labels": labels,
        }

        result = match_rule(ctx)

        # A rule may or may not match (e.g. host_high_cpu); nothing to check
        # when match_rule yields None.
        if result is None:
            return
        # When a rule did match, its kubectl_command must be empty.
        assert result["kubectl_command"] == "", (
            f"bad target 應導致 kubectl_command 清空, got: {result['kubectl_command']!r}"
        )

    def test_good_target_preserves_kubectl_command(self):
        """With a real deployment name, kubectl_command is assembled normally."""
        labels = {
            "alertname": "KubePodCrashLooping",
            "deployment": "awoooi-api",
            "pod": "awoooi-api-7d6b776f78-4sgjl",
        }
        ctx = {
            "alert_type": "k8s_pod_crash",
            "severity": "critical",
            "source": "alertmanager",
            "target_resource": "awoooi-api-7d6b776f78-4sgjl",
            "namespace": "awoooi-prod",
            "message": "Pod CrashLoopBackOff",
            "labels": labels,
        }

        result = match_rule(ctx)

        # Only meaningful when a rule matched and produced a non-empty
        # command; otherwise this test intentionally asserts nothing.
        if result is None or not result.get("kubectl_command"):
            return

        command = result["kubectl_command"]
        # The command must name the real deployment ...
        assert "awoooi-api" in command
        # ... and must not contain the fallback target or the alertname.
        assert "unknown" not in command
        assert "KubePodCrashLooping" not in command