fix(ai): improve docker repair verification signals
This commit is contained in:
@@ -574,6 +574,8 @@ def _classify_non_success_failure(row: dict[str, Any]) -> str:
|
||||
return "verifier_target_missing_pod"
|
||||
if not bool(row.get("auto_success")):
|
||||
return "auto_repair_execution_failed"
|
||||
if "mcp:ssh_diagnose" in combined or "ssh_diagnose" in combined:
|
||||
return "observe_only_playbook"
|
||||
|
||||
result = str(row.get("verification_result") or "").lower()
|
||||
if result in {"failed", "timeout"}:
|
||||
@@ -615,6 +617,13 @@ def _remediation_for_failure_class(failure_class: str) -> dict[str, str]:
|
||||
"owner": "solver_or_operator",
|
||||
"reason": "execution_failed_after_route_normalization",
|
||||
}
|
||||
if failure_class == "observe_only_playbook":
|
||||
return {
|
||||
"status": "needs_playbook_ticket",
|
||||
"action": "promote_diagnostic_to_repair_playbook",
|
||||
"owner": "solver_or_operator",
|
||||
"reason": "auto_repair_only_collected_evidence",
|
||||
}
|
||||
if failure_class in {"verification_failed", "verification_timeout"}:
|
||||
return {
|
||||
"status": "manual_review",
|
||||
@@ -639,6 +648,8 @@ def _next_step_for_failure_class(failure_class: str) -> str:
|
||||
return "map_verifier_target"
|
||||
if failure_class == "auto_repair_execution_failed":
|
||||
return "review_auto_repair_execution"
|
||||
if failure_class == "observe_only_playbook":
|
||||
return "author_mutating_repair_step"
|
||||
if failure_class in {"verification_failed", "verification_timeout"}:
|
||||
return "escalate_verification_failure"
|
||||
return "review_degraded_verification"
|
||||
|
||||
@@ -149,6 +149,19 @@ _EXTERNAL_SITE_ALERTNAMES = {
|
||||
_K3S_NODE_ALERTNAMES = {"K3sNodeDown", "K3sVIPDown"}
|
||||
|
||||
|
||||
def _build_verification_action_taken(playbook_id: str, executed_steps: list[str]) -> str:
|
||||
"""Build a compact verifier action string with the real executed step shape."""
|
||||
base = f"auto_repair:{playbook_id}"
|
||||
if not executed_steps:
|
||||
return base
|
||||
|
||||
joined = " | ".join(step.strip() for step in executed_steps if step.strip())
|
||||
joined = re.sub(r"\s+", " ", joined)
|
||||
if not joined:
|
||||
return base
|
||||
return f"{base} steps={joined[:900]}"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Auto Repair Service Interface
|
||||
# =============================================================================
|
||||
@@ -600,7 +613,10 @@ class AutoRepairService:
|
||||
from src.services.post_execution_verifier import get_post_execution_verifier
|
||||
from src.services.learning_service import get_learning_service
|
||||
|
||||
_action_taken = f"auto_repair:{playbook.playbook_id}"
|
||||
_action_taken = _build_verification_action_taken(
|
||||
playbook.playbook_id,
|
||||
executed_steps,
|
||||
)
|
||||
_verifier = get_post_execution_verifier()
|
||||
_learning = get_learning_service()
|
||||
|
||||
|
||||
@@ -175,10 +175,7 @@ class MCPToolRegistry:
|
||||
# workload/node locator prevents host alerts such as HostErrorLogFlood
|
||||
# from being misrouted into Kubernetes tools just because an upstream
|
||||
# bridge added namespace="infra".
|
||||
has_k8s_locator = any(
|
||||
labels.get(key)
|
||||
for key in ("deployment", "pod", "node", "container")
|
||||
)
|
||||
has_k8s_locator = _has_k8s_locator(alertname, labels)
|
||||
|
||||
# 依優先度排序後篩選
|
||||
sorted_tools = sorted(self._tools, key=lambda t: t.priority)
|
||||
@@ -256,6 +253,23 @@ def _select_provider_balanced_tools(
|
||||
return selected
|
||||
|
||||
|
||||
def _has_k8s_locator(alertname: str, labels: dict[str, Any]) -> bool:
|
||||
"""Return true only when labels can identify a Kubernetes workload/node.
|
||||
|
||||
Docker exporters also use a generic ``container`` label. Treating that
|
||||
label as a pod locator made DockerContainer* alerts call Kubernetes pod
|
||||
tools with an empty pod name, which polluted post-repair verification.
|
||||
"""
|
||||
if any(labels.get(key) for key in ("deployment", "pod", "node")):
|
||||
return True
|
||||
|
||||
alert = alertname or ""
|
||||
if alert.startswith(("Kube", "Pod", "Deploy", "Node", "Velero", "ArgoCD")):
|
||||
return bool(labels.get("namespace") and labels.get("container"))
|
||||
|
||||
return False
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# 工具自動分類(根據 tool name 推斷感官維度)
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -255,6 +255,8 @@ class PostExecutionVerifier:
|
||||
"pod_name": labels.get("pod", labels.get("name", "")),
|
||||
"deployment": labels.get("deployment", ""),
|
||||
"host": labels.get("instance", "").split(":")[0] or labels.get("host", ""),
|
||||
"container_name": _extract_container_name(labels),
|
||||
"filter_name": _extract_container_name(labels),
|
||||
"query": _build_prometheus_query(alertname, labels),
|
||||
}
|
||||
|
||||
@@ -495,7 +497,13 @@ def _assess_recovery(
|
||||
"'success': true",
|
||||
'"success": true',
|
||||
]
|
||||
if any(sig in post_str for sig in success_signals):
|
||||
has_success_signal = any(sig in post_str for sig in success_signals)
|
||||
if not has_success_signal and _docker_state_indicates_running(post_str):
|
||||
has_success_signal = True
|
||||
|
||||
if has_success_signal:
|
||||
if _is_observe_only_action(action_taken):
|
||||
return "degraded"
|
||||
# 但如果 pre_state 已經是 running,可能是無效操作
|
||||
if pre_str and any(sig in pre_str for sig in success_signals):
|
||||
# 如果執行的是 restart,即使 pre/post 都 Running 也算 success
|
||||
@@ -533,6 +541,53 @@ def _get_labels(incident: "Incident") -> dict[str, Any]:
|
||||
return {}
|
||||
|
||||
|
||||
def _extract_container_name(labels: dict[str, Any]) -> str:
|
||||
"""Resolve Docker container target labels for post-execution SSH sensors."""
|
||||
for key in ("filter_name", "container_name", "container", "resource", "name"):
|
||||
value = str(labels.get(key) or "").strip()
|
||||
if value and "{" not in value and "}" not in value:
|
||||
return value
|
||||
return ""
|
||||
|
||||
|
||||
def _is_observe_only_action(action_taken: str) -> bool:
|
||||
"""Return true when the executed step collected evidence but did not mutate state."""
|
||||
lowered = (action_taken or "").lower()
|
||||
observe_tokens = (
|
||||
"mcp:ssh_diagnose",
|
||||
"ssh_diagnose",
|
||||
"docker stats",
|
||||
"ps aux",
|
||||
"free -h",
|
||||
"df -h",
|
||||
)
|
||||
mutation_tokens = (
|
||||
"restart",
|
||||
"delete",
|
||||
"rollout",
|
||||
"scale",
|
||||
"patch",
|
||||
"apply",
|
||||
"prune",
|
||||
"truncate",
|
||||
"clear",
|
||||
)
|
||||
return any(token in lowered for token in observe_tokens) and not any(
|
||||
token in lowered for token in mutation_tokens
|
||||
)
|
||||
|
||||
|
||||
def _docker_state_indicates_running(post_str: str) -> bool:
|
||||
"""Recognize Docker-specific healthy/running output without matching uptime."""
|
||||
if not any(token in post_str for token in ("docker ps", "docker inspect", "docker stats")):
|
||||
return False
|
||||
return bool(
|
||||
re.search(r'\bup\s+\d', post_str)
|
||||
or re.search(r'["\']status["\']\s*:\s*["\']running["\']', post_str)
|
||||
or re.search(r'["\']running["\']\s*:\s*true', post_str)
|
||||
)
|
||||
|
||||
|
||||
def _build_prometheus_query(alertname: str, labels: dict[str, Any]) -> str:
|
||||
"""Build a non-empty PromQL probe for post-execution metric sensors."""
|
||||
|
||||
|
||||
@@ -274,3 +274,45 @@ def test_verification_coverage_payload_skips_when_no_auto_repair():
|
||||
assert payload["reason"] == "no_auto_repair_executions_24h"
|
||||
assert payload["evaluable"] is False
|
||||
assert payload["remediation_queue"]["total"] == 0
|
||||
|
||||
|
||||
def test_verification_coverage_payload_routes_observe_only_playbook_to_ticket():
    """A degraded row whose evidence is only ``mcp:ssh_diagnose`` becomes a playbook ticket."""
    coverage_counts = {
        "total_auto": 1,
        "successful_auto": 1,
        "verified_auto": 1,
        "verified_success": 0,
        "verified_non_success": 1,
        "unverified_auto": 0,
    }
    # The auto repair reports success, but the only evidence is a diagnostic command.
    observe_only_row = {
        "auto_repair_id": "are-3",
        "incident_id": "INC-3",
        "incident_status": "INVESTIGATING",
        "incident_severity": "P2",
        "alert_category": "infrastructure",
        "alertname": "DockerContainerMemoryLimitPressure",
        "auto_success": True,
        "playbook_id": "PB-3",
        "playbook_name": "Docker pressure diagnostic playbook",
        "triggered_by": "auto_repair",
        "risk_level": "LOW",
        "verification_result": "degraded",
        "auto_error": "",
        "post_state_text": '{"ssh_diagnose": {"command": "docker stats --no-stream api"}}',
        "evidence_summary": "SUCCESS: mcp:ssh_diagnose",
        "auto_created_at": None,
        "verification_collected_at": None,
    }

    payload = _build_verification_coverage_payload(
        coverage_counts,
        [],
        [observe_only_row],
    )

    first_item = payload["recent_non_success"][0]
    assert first_item["failure_class"] == "observe_only_playbook"
    assert first_item["next_step"] == "author_mutating_repair_step"
    assert first_item["remediation_status"] == "needs_playbook_ticket"
    assert first_item["remediation_action"] == "promote_diagnostic_to_repair_playbook"
    assert payload["remediation_queue"]["needs_human"] == 1
|
||||
|
||||
@@ -225,10 +225,13 @@ async def test_auto_repair_success_triggers_verify_and_learn(monkeypatch):
|
||||
assert len(stub_verifier.calls) == 1, "verifier.verify() 應被呼叫一次"
|
||||
assert stub_verifier.calls[0]["incident_id"] == incident.incident_id
|
||||
assert stub_verifier.calls[0]["snapshot"] is None
|
||||
assert stub_verifier.calls[0]["action_taken"].startswith(f"auto_repair:{playbook.playbook_id}")
|
||||
assert "steps=Step 1:" in stub_verifier.calls[0]["action_taken"]
|
||||
|
||||
assert len(stub_learning.verification_calls) == 1, "record_verification_result 應被呼叫一次"
|
||||
call = stub_learning.verification_calls[0]
|
||||
assert call["incident_id"] == incident.incident_id
|
||||
assert call["action_taken"] == stub_verifier.calls[0]["action_taken"]
|
||||
assert call["verification_result"] == "success"
|
||||
assert call["matched_playbook_id"] == playbook.playbook_id
|
||||
|
||||
|
||||
@@ -387,6 +387,34 @@ class TestSuggestTools:
|
||||
assert "prometheus_query" in names
|
||||
assert "kubectl_describe" not in names
|
||||
|
||||
@pytest.mark.asyncio
async def test_docker_container_alert_does_not_treat_container_as_pod_locator(self):
    """A Docker alert with only namespace/container labels must not get Kubernetes tools."""
    registry = MCPToolRegistry()
    ssh_provider = _StubProvider("ssh_host", ["ssh_diagnose", "ssh_get_container_status"])
    k8s_provider = _StubProvider("kubernetes", ["kubectl_describe", "k8s_get_pod_logs"])
    prometheus_provider = _StubProvider("prometheus", ["prometheus_query"])
    # Registration order is kept as in the original: kubernetes, ssh, prometheus.
    for provider in (k8s_provider, ssh_provider, prometheus_provider):
        await registry.register_provider(provider)

    docker_labels = {
        "namespace": "default",
        "container": "momo-pro-system",
        "container_name": "momo-pro-system",
        "host": "188",
    }
    suggested = registry.suggest_tools(
        alertname="DockerContainerMemoryLimitPressure",
        incident_labels=docker_labels,
        max_tools=8,
    )
    names = [entry.tool.name for entry in suggested]

    for expected_tool in ("ssh_diagnose", "ssh_get_container_status", "prometheus_query"):
        assert expected_tool in names
    for k8s_tool in ("kubectl_describe", "k8s_get_pod_logs"):
        assert k8s_tool not in names
|
||||
|
||||
def test_get_all_tools_returns_all(self):
|
||||
registry = MCPToolRegistry()
|
||||
provider = _StubProvider("test", [])
|
||||
|
||||
@@ -136,6 +136,23 @@ class TestAssessRecovery:
|
||||
action = "auto_repair_playbook:PB-TEST mcp:ssh_diagnose docker stats"
|
||||
assert _assess_recovery(pre, post, action) == "degraded"
|
||||
|
||||
def test_diagnosis_only_without_pre_state_is_not_verified_repair(self):
    """A read-only SSH diagnosis with no pre-state must not be counted as a real repair."""
    diagnose_output = {
        "command": "docker stats --no-stream api",
        "stdout": '"Status": "running"',
    }
    post_state = {"ssh_diagnose": diagnose_output}
    action_taken = "auto_repair:PB-TEST steps=Step 1: docker stats -> SUCCESS: mcp:ssh_diagnose"

    verdict = _assess_recovery(None, post_state, action_taken)

    assert verdict == "degraded"
|
||||
|
||||
def test_docker_running_after_restart_is_success(self):
    """Seeing docker ps/inspect report running after a Docker restart verifies as success."""
    container_status = {
        "command": "docker ps -a --filter name=api",
        "stdout": "api Up 12 seconds",
    }
    post_state = {"ssh_get_container_status": container_status}
    action_taken = "auto_repair:PB-TEST steps=Step 1: docker restart api -> SUCCESS"

    verdict = _assess_recovery(None, post_state, action_taken)

    assert verdict == "success"
|
||||
|
||||
def test_pre_running_post_running_delete_is_success(self):
|
||||
"""kubectl delete 動作,前後都 Running → success"""
|
||||
pre = {"status": "Running"}
|
||||
@@ -417,6 +434,25 @@ class _PrometheusRegistry:
|
||||
]
|
||||
|
||||
|
||||
class _DockerRegistry:
    """Registry stub that always suggests a single ``ssh_get_container_status`` tool."""

    def __init__(self, provider: _CaptureProvider) -> None:
        # Provider attached to the tool returned by suggest_tools.
        self.provider = provider

    def suggest_tools(self, alertname: str = "", incident_labels: dict | None = None) -> list[RegisteredTool]:
        """Return the one container-status tool; both arguments are ignored."""
        container_status_tool = MCPTool(
            name="ssh_get_container_status",
            description="",
            input_schema={},
            server_name="capture",
        )
        registered = RegisteredTool(
            tool=container_status_tool,
            provider=self.provider,
            dimensions=[SensorDimension.D3_METRICS],
        )
        return [registered]
|
||||
|
||||
|
||||
class _DbContext:
|
||||
async def __aenter__(self) -> object:
|
||||
return object()
|
||||
@@ -501,6 +537,23 @@ class TestCollectPostStateAuditContext:
|
||||
assert 'host="110"' in query
|
||||
assert 'container_name="momo-scheduler"' in query
|
||||
|
||||
@pytest.mark.asyncio
async def test_collect_post_state_sends_docker_container_target(self):
    """Post-state collection passes the Docker container name as both target parameters."""
    capture = _CaptureProvider()
    verifier = PostExecutionVerifier()
    verifier._registry = _DockerRegistry(capture)

    incident = _stub_incident(alertname="DockerContainerUnhealthy")
    docker_labels = {
        "host": "188",
        "container_name": "momo-pro-system",
    }
    incident.signals[0].labels.update(docker_labels)

    await verifier._collect_post_state(incident)

    sent_parameters = capture.seen_parameters
    assert sent_parameters is not None
    assert sent_parameters["container_name"] == "momo-pro-system"
    assert sent_parameters["filter_name"] == "momo-pro-system"
|
||||
|
||||
|
||||
class TestPrometheusQueryBuilder:
|
||||
def test_docker_memory_alert_query_is_not_empty(self):
|
||||
|
||||
Reference in New Issue
Block a user