fix(ai): improve docker repair verification signals
Some checks failed
CD Pipeline / tests (push) Successful in 1m22s
Code Review / ai-code-review (push) Successful in 13s
CD Pipeline / build-and-deploy (push) Successful in 4m10s
CD Pipeline / post-deploy-checks (push) Has been cancelled

This commit is contained in:
Your Name
2026-06-01 19:27:36 +08:00
parent 2ce53829fc
commit 7f3722c7f7
8 changed files with 228 additions and 6 deletions

View File

@@ -574,6 +574,8 @@ def _classify_non_success_failure(row: dict[str, Any]) -> str:
return "verifier_target_missing_pod"
if not bool(row.get("auto_success")):
return "auto_repair_execution_failed"
if "mcp:ssh_diagnose" in combined or "ssh_diagnose" in combined:
return "observe_only_playbook"
result = str(row.get("verification_result") or "").lower()
if result in {"failed", "timeout"}:
@@ -615,6 +617,13 @@ def _remediation_for_failure_class(failure_class: str) -> dict[str, str]:
"owner": "solver_or_operator",
"reason": "execution_failed_after_route_normalization",
}
if failure_class == "observe_only_playbook":
return {
"status": "needs_playbook_ticket",
"action": "promote_diagnostic_to_repair_playbook",
"owner": "solver_or_operator",
"reason": "auto_repair_only_collected_evidence",
}
if failure_class in {"verification_failed", "verification_timeout"}:
return {
"status": "manual_review",
@@ -639,6 +648,8 @@ def _next_step_for_failure_class(failure_class: str) -> str:
return "map_verifier_target"
if failure_class == "auto_repair_execution_failed":
return "review_auto_repair_execution"
if failure_class == "observe_only_playbook":
return "author_mutating_repair_step"
if failure_class in {"verification_failed", "verification_timeout"}:
return "escalate_verification_failure"
return "review_degraded_verification"

View File

@@ -149,6 +149,19 @@ _EXTERNAL_SITE_ALERTNAMES = {
_K3S_NODE_ALERTNAMES = {"K3sNodeDown", "K3sVIPDown"}
def _build_verification_action_taken(playbook_id: str, executed_steps: list[str]) -> str:
"""Build a compact verifier action string with the real executed step shape."""
base = f"auto_repair:{playbook_id}"
if not executed_steps:
return base
joined = " | ".join(step.strip() for step in executed_steps if step.strip())
joined = re.sub(r"\s+", " ", joined)
if not joined:
return base
return f"{base} steps={joined[:900]}"
# =============================================================================
# Auto Repair Service Interface
# =============================================================================
@@ -600,7 +613,10 @@ class AutoRepairService:
from src.services.post_execution_verifier import get_post_execution_verifier
from src.services.learning_service import get_learning_service
_action_taken = f"auto_repair:{playbook.playbook_id}"
_action_taken = _build_verification_action_taken(
playbook.playbook_id,
executed_steps,
)
_verifier = get_post_execution_verifier()
_learning = get_learning_service()

View File

@@ -175,10 +175,7 @@ class MCPToolRegistry:
# workload/node locator prevents host alerts such as HostErrorLogFlood
# from being misrouted into Kubernetes tools just because an upstream
# bridge added namespace="infra".
has_k8s_locator = any(
labels.get(key)
for key in ("deployment", "pod", "node", "container")
)
has_k8s_locator = _has_k8s_locator(alertname, labels)
# 依優先度排序後篩選
sorted_tools = sorted(self._tools, key=lambda t: t.priority)
@@ -256,6 +253,23 @@ def _select_provider_balanced_tools(
return selected
def _has_k8s_locator(alertname: str, labels: dict[str, Any]) -> bool:
"""Return true only when labels can identify a Kubernetes workload/node.
Docker exporters also use a generic ``container`` label. Treating that
label as a pod locator made DockerContainer* alerts call Kubernetes pod
tools with an empty pod name, which polluted post-repair verification.
"""
if any(labels.get(key) for key in ("deployment", "pod", "node")):
return True
alert = alertname or ""
if alert.startswith(("Kube", "Pod", "Deploy", "Node", "Velero", "ArgoCD")):
return bool(labels.get("namespace") and labels.get("container"))
return False
# ─────────────────────────────────────────────────────────────────────────────
# 工具自動分類(根據 tool name 推斷感官維度)
# ─────────────────────────────────────────────────────────────────────────────

View File

@@ -255,6 +255,8 @@ class PostExecutionVerifier:
"pod_name": labels.get("pod", labels.get("name", "")),
"deployment": labels.get("deployment", ""),
"host": labels.get("instance", "").split(":")[0] or labels.get("host", ""),
"container_name": _extract_container_name(labels),
"filter_name": _extract_container_name(labels),
"query": _build_prometheus_query(alertname, labels),
}
@@ -495,7 +497,13 @@ def _assess_recovery(
"'success': true",
'"success": true',
]
if any(sig in post_str for sig in success_signals):
has_success_signal = any(sig in post_str for sig in success_signals)
if not has_success_signal and _docker_state_indicates_running(post_str):
has_success_signal = True
if has_success_signal:
if _is_observe_only_action(action_taken):
return "degraded"
# However, if pre_state was already running, the action may have been a no-op.
if pre_str and any(sig in pre_str for sig in success_signals):
# If the action was a restart, Running both before and after still counts as success.
@@ -533,6 +541,53 @@ def _get_labels(incident: "Incident") -> dict[str, Any]:
return {}
def _extract_container_name(labels: dict[str, Any]) -> str:
"""Resolve Docker container target labels for post-execution SSH sensors."""
for key in ("filter_name", "container_name", "container", "resource", "name"):
value = str(labels.get(key) or "").strip()
if value and "{" not in value and "}" not in value:
return value
return ""
def _is_observe_only_action(action_taken: str) -> bool:
"""Return true when the executed step collected evidence but did not mutate state."""
lowered = (action_taken or "").lower()
observe_tokens = (
"mcp:ssh_diagnose",
"ssh_diagnose",
"docker stats",
"ps aux",
"free -h",
"df -h",
)
mutation_tokens = (
"restart",
"delete",
"rollout",
"scale",
"patch",
"apply",
"prune",
"truncate",
"clear",
)
return any(token in lowered for token in observe_tokens) and not any(
token in lowered for token in mutation_tokens
)
def _docker_state_indicates_running(post_str: str) -> bool:
"""Recognize Docker-specific healthy/running output without matching uptime."""
if not any(token in post_str for token in ("docker ps", "docker inspect", "docker stats")):
return False
return bool(
re.search(r'\bup\s+\d', post_str)
or re.search(r'["\']status["\']\s*:\s*["\']running["\']', post_str)
or re.search(r'["\']running["\']\s*:\s*true', post_str)
)
def _build_prometheus_query(alertname: str, labels: dict[str, Any]) -> str:
"""Build a non-empty PromQL probe for post-execution metric sensors."""

View File

@@ -274,3 +274,45 @@ def test_verification_coverage_payload_skips_when_no_auto_repair():
assert payload["reason"] == "no_auto_repair_executions_24h"
assert payload["evaluable"] is False
assert payload["remediation_queue"]["total"] == 0
def test_verification_coverage_payload_routes_observe_only_playbook_to_ticket():
    """A diagnostic-only auto repair is classified and queued as a playbook ticket."""
    coverage_counts = {
        "total_auto": 1,
        "successful_auto": 1,
        "verified_auto": 1,
        "verified_success": 0,
        "verified_non_success": 1,
        "unverified_auto": 0,
    }
    # Execution succeeded, but the only evidence is a read-only ssh_diagnose.
    observe_only_row = {
        "auto_repair_id": "are-3",
        "incident_id": "INC-3",
        "incident_status": "INVESTIGATING",
        "incident_severity": "P2",
        "alert_category": "infrastructure",
        "alertname": "DockerContainerMemoryLimitPressure",
        "auto_success": True,
        "playbook_id": "PB-3",
        "playbook_name": "Docker pressure diagnostic playbook",
        "triggered_by": "auto_repair",
        "risk_level": "LOW",
        "verification_result": "degraded",
        "auto_error": "",
        "post_state_text": '{"ssh_diagnose": {"command": "docker stats --no-stream api"}}',
        "evidence_summary": "SUCCESS: mcp:ssh_diagnose",
        "auto_created_at": None,
        "verification_collected_at": None,
    }

    payload = _build_verification_coverage_payload(coverage_counts, [], [observe_only_row])

    first_item = payload["recent_non_success"][0]
    assert first_item["failure_class"] == "observe_only_playbook"
    assert first_item["next_step"] == "author_mutating_repair_step"
    assert first_item["remediation_status"] == "needs_playbook_ticket"
    assert first_item["remediation_action"] == "promote_diagnostic_to_repair_playbook"
    assert payload["remediation_queue"]["needs_human"] == 1

View File

@@ -225,10 +225,13 @@ async def test_auto_repair_success_triggers_verify_and_learn(monkeypatch):
assert len(stub_verifier.calls) == 1, "verifier.verify() 應被呼叫一次"
assert stub_verifier.calls[0]["incident_id"] == incident.incident_id
assert stub_verifier.calls[0]["snapshot"] is None
assert stub_verifier.calls[0]["action_taken"].startswith(f"auto_repair:{playbook.playbook_id}")
assert "steps=Step 1:" in stub_verifier.calls[0]["action_taken"]
assert len(stub_learning.verification_calls) == 1, "record_verification_result 應被呼叫一次"
call = stub_learning.verification_calls[0]
assert call["incident_id"] == incident.incident_id
assert call["action_taken"] == stub_verifier.calls[0]["action_taken"]
assert call["verification_result"] == "success"
assert call["matched_playbook_id"] == playbook.playbook_id

View File

@@ -387,6 +387,34 @@ class TestSuggestTools:
assert "prometheus_query" in names
assert "kubectl_describe" not in names
@pytest.mark.asyncio
async def test_docker_container_alert_does_not_treat_container_as_pod_locator(self):
    """A Docker alert carrying a ``container`` label gets no Kubernetes tools."""
    registry = MCPToolRegistry()
    ssh_provider = _StubProvider("ssh_host", ["ssh_diagnose", "ssh_get_container_status"])
    k8s_provider = _StubProvider("kubernetes", ["kubectl_describe", "k8s_get_pod_logs"])
    prometheus_provider = _StubProvider("prometheus", ["prometheus_query"])
    # Registration order is kept: kubernetes, then ssh, then prometheus.
    for stub in (k8s_provider, ssh_provider, prometheus_provider):
        await registry.register_provider(stub)

    docker_labels = {
        "namespace": "default",
        "container": "momo-pro-system",
        "container_name": "momo-pro-system",
        "host": "188",
    }
    suggested = registry.suggest_tools(
        alertname="DockerContainerMemoryLimitPressure",
        incident_labels=docker_labels,
        max_tools=8,
    )

    names = [entry.tool.name for entry in suggested]
    assert "ssh_diagnose" in names
    assert "ssh_get_container_status" in names
    assert "prometheus_query" in names
    assert "kubectl_describe" not in names
    assert "k8s_get_pod_logs" not in names
def test_get_all_tools_returns_all(self):
registry = MCPToolRegistry()
provider = _StubProvider("test", [])

View File

@@ -136,6 +136,23 @@ class TestAssessRecovery:
action = "auto_repair_playbook:PB-TEST mcp:ssh_diagnose docker stats"
assert _assess_recovery(pre, post, action) == "degraded"
def test_diagnosis_only_without_pre_state_is_not_verified_repair(self):
    """A read-only SSH diagnosis with no pre-state must not count as a real repair."""
    diagnose_output = {
        "command": "docker stats --no-stream api",
        "stdout": '"Status": "running"',
    }
    post_state = {"ssh_diagnose": diagnose_output}
    action_taken = "auto_repair:PB-TEST steps=Step 1: docker stats -> SUCCESS: mcp:ssh_diagnose"

    verdict = _assess_recovery(None, post_state, action_taken)

    assert verdict == "degraded"
def test_docker_running_after_restart_is_success(self):
    """After a Docker restart, running state in docker ps/inspect output verifies as success."""
    container_status = {
        "command": "docker ps -a --filter name=api",
        "stdout": "api Up 12 seconds",
    }
    post_state = {"ssh_get_container_status": container_status}
    action_taken = "auto_repair:PB-TEST steps=Step 1: docker restart api -> SUCCESS"

    verdict = _assess_recovery(None, post_state, action_taken)

    assert verdict == "success"
def test_pre_running_post_running_delete_is_success(self):
"""kubectl delete 動作,前後都 Running → success"""
pre = {"status": "Running"}
@@ -417,6 +434,25 @@ class _PrometheusRegistry:
]
class _DockerRegistry:
    """Registry stub that always suggests a single ``ssh_get_container_status`` tool."""

    def __init__(self, provider: _CaptureProvider) -> None:
        # Provider attached to the suggested tool.
        self.provider = provider

    def suggest_tools(self, alertname: str = "", incident_labels: dict | None = None) -> list[RegisteredTool]:
        """Return the one container-status tool; both arguments are ignored."""
        status_tool = MCPTool(
            name="ssh_get_container_status",
            description="",
            input_schema={},
            server_name="capture",
        )
        registered = RegisteredTool(
            tool=status_tool,
            provider=self.provider,
            dimensions=[SensorDimension.D3_METRICS],
        )
        return [registered]
class _DbContext:
async def __aenter__(self) -> object:
return object()
@@ -501,6 +537,23 @@ class TestCollectPostStateAuditContext:
assert 'host="110"' in query
assert 'container_name="momo-scheduler"' in query
@pytest.mark.asyncio
async def test_collect_post_state_sends_docker_container_target(self):
    """Post-state collection passes the container label on as both target parameters."""
    capture = _CaptureProvider()
    verifier = PostExecutionVerifier()
    verifier._registry = _DockerRegistry(capture)

    incident = _stub_incident(alertname="DockerContainerUnhealthy")
    docker_target = {
        "host": "188",
        "container_name": "momo-pro-system",
    }
    incident.signals[0].labels.update(docker_target)

    await verifier._collect_post_state(incident)

    sent = capture.seen_parameters
    assert sent is not None
    assert sent["container_name"] == "momo-pro-system"
    assert sent["filter_name"] == "momo-pro-system"
class TestPrometheusQueryBuilder:
def test_docker_memory_alert_query_is_not_empty(self):