diff --git a/apps/api/src/services/adr100_slo_status_service.py b/apps/api/src/services/adr100_slo_status_service.py
index 9e63e1cf..59fa6d1b 100644
--- a/apps/api/src/services/adr100_slo_status_service.py
+++ b/apps/api/src/services/adr100_slo_status_service.py
@@ -574,6 +574,8 @@ def _classify_non_success_failure(row: dict[str, Any]) -> str:
         return "verifier_target_missing_pod"
     if not bool(row.get("auto_success")):
         return "auto_repair_execution_failed"
+    if "ssh_diagnose" in combined:
+        return "observe_only_playbook"
 
     result = str(row.get("verification_result") or "").lower()
     if result in {"failed", "timeout"}:
@@ -615,6 +617,13 @@ def _remediation_for_failure_class(failure_class: str) -> dict[str, str]:
             "owner": "solver_or_operator",
             "reason": "execution_failed_after_route_normalization",
         }
+    if failure_class == "observe_only_playbook":
+        return {
+            "status": "needs_playbook_ticket",
+            "action": "promote_diagnostic_to_repair_playbook",
+            "owner": "solver_or_operator",
+            "reason": "auto_repair_only_collected_evidence",
+        }
     if failure_class in {"verification_failed", "verification_timeout"}:
         return {
             "status": "manual_review",
@@ -639,6 +648,8 @@ def _next_step_for_failure_class(failure_class: str) -> str:
         return "map_verifier_target"
     if failure_class == "auto_repair_execution_failed":
         return "review_auto_repair_execution"
+    if failure_class == "observe_only_playbook":
+        return "author_mutating_repair_step"
     if failure_class in {"verification_failed", "verification_timeout"}:
         return "escalate_verification_failure"
     return "review_degraded_verification"
diff --git a/apps/api/src/services/auto_repair_service.py b/apps/api/src/services/auto_repair_service.py
index 9a4aac2c..d4ddec4d 100644
--- a/apps/api/src/services/auto_repair_service.py
+++ b/apps/api/src/services/auto_repair_service.py
@@ -149,6 +149,22 @@ _EXTERNAL_SITE_ALERTNAMES = {
 _K3S_NODE_ALERTNAMES = {"K3sNodeDown", "K3sVIPDown"}
 
 
+def _build_verification_action_taken(playbook_id: str, executed_steps: list[str]) -> str:
+    """Build a compact verifier action string with the real executed step shape.
+
+    Blank steps are dropped, whitespace runs inside each step collapse to one
+    space, and the joined step text is capped at 900 characters. Without any
+    usable step the plain ``auto_repair:<playbook_id>`` marker is returned.
+    """
+    base = f"auto_repair:{playbook_id}"
+    # str.split() collapses whitespace runs; no regex is needed for this.
+    steps = [" ".join(step.split()) for step in executed_steps or [] if step.strip()]
+    if not steps:
+        return base
+    joined = " | ".join(steps)
+    return f"{base} steps={joined[:900]}"
+
+
 # =============================================================================
 # Auto Repair Service Interface
 # =============================================================================
@@ -600,7 +616,10 @@ class AutoRepairService:
             from src.services.post_execution_verifier import get_post_execution_verifier
             from src.services.learning_service import get_learning_service
 
-            _action_taken = f"auto_repair:{playbook.playbook_id}"
+            _action_taken = _build_verification_action_taken(
+                playbook.playbook_id,
+                executed_steps,
+            )
             _verifier = get_post_execution_verifier()
             _learning = get_learning_service()
 
diff --git a/apps/api/src/services/mcp_tool_registry.py b/apps/api/src/services/mcp_tool_registry.py
index af19977e..6be48968 100644
--- a/apps/api/src/services/mcp_tool_registry.py
+++ b/apps/api/src/services/mcp_tool_registry.py
@@ -175,10 +175,7 @@ class MCPToolRegistry:
         # workload/node locator prevents host alerts such as HostErrorLogFlood
         # from being misrouted into Kubernetes tools just because an upstream
         # bridge added namespace="infra".
-        has_k8s_locator = any(
-            labels.get(key)
-            for key in ("deployment", "pod", "node", "container")
-        )
+        has_k8s_locator = _has_k8s_locator(alertname, labels)
 
         # 依優先度排序後篩選
         sorted_tools = sorted(self._tools, key=lambda t: t.priority)
@@ -256,6 +253,23 @@ def _select_provider_balanced_tools(
     return selected
 
 
+def _has_k8s_locator(alertname: str, labels: dict[str, Any]) -> bool:
+    """Return true only when labels can identify a Kubernetes workload/node.
+
+    Docker exporters also use a generic ``container`` label. Treating that
+    label as a pod locator made DockerContainer* alerts call Kubernetes pod
+    tools with an empty pod name, which polluted post-repair verification.
+    """
+    if any(labels.get(key) for key in ("deployment", "pod", "node")):
+        return True
+
+    alert = alertname or ""
+    if alert.startswith(("Kube", "Pod", "Deploy", "Node", "Velero", "ArgoCD")):
+        return bool(labels.get("namespace") and labels.get("container"))
+
+    return False
+
+
 # ─────────────────────────────────────────────────────────────────────────────
 # 工具自動分類(根據 tool name 推斷感官維度)
 # ─────────────────────────────────────────────────────────────────────────────
diff --git a/apps/api/src/services/post_execution_verifier.py b/apps/api/src/services/post_execution_verifier.py
index 9a45b1ec..04da2cf4 100644
--- a/apps/api/src/services/post_execution_verifier.py
+++ b/apps/api/src/services/post_execution_verifier.py
@@ -255,6 +255,8 @@ class PostExecutionVerifier:
             "pod_name": labels.get("pod", labels.get("name", "")),
             "deployment": labels.get("deployment", ""),
             "host": labels.get("instance", "").split(":")[0] or labels.get("host", ""),
+            "container_name": _extract_container_name(labels),
+            "filter_name": _extract_container_name(labels),
             "query": _build_prometheus_query(alertname, labels),
         }
 
@@ -495,7 +497,13 @@ def _assess_recovery(
         "'success': true",
         '"success": true',
     ]
-    if any(sig in post_str for sig in success_signals):
+    has_success_signal = any(sig in post_str for sig in success_signals)
+    if not has_success_signal and _docker_state_indicates_running(post_str):
+        has_success_signal = True
+
+    if has_success_signal:
+        if _is_observe_only_action(action_taken):
+            return "degraded"
         # 但如果 pre_state 已經是 running,可能是無效操作
         if pre_str and any(sig in pre_str for sig in success_signals):
             # 如果執行的是 restart,即使 pre/post 都 Running 也算 success
@@ -533,6 +541,60 @@ def _get_labels(incident: "Incident") -> dict[str, Any]:
     return {}
 
 
+def _extract_container_name(labels: dict[str, Any]) -> str:
+    """Resolve Docker container target labels for post-execution SSH sensors."""
+    for key in ("filter_name", "container_name", "container", "resource", "name"):
+        value = str(labels.get(key) or "").strip()
+        if value and "{" not in value and "}" not in value:
+            return value
+    return ""
+
+
+def _is_observe_only_action(action_taken: str) -> bool:
+    """Return true when the executed step collected evidence but did not mutate state."""
+    lowered = (action_taken or "").lower()
+    observe_tokens = (
+        "mcp:ssh_diagnose",
+        "ssh_diagnose",
+        "docker stats",
+        "ps aux",
+        "free -h",
+        "df -h",
+    )
+    mutation_tokens = (
+        "restart",
+        "delete",
+        "rollout",
+        "scale",
+        "patch",
+        "apply",
+        "prune",
+        "truncate",
+        "clear",
+    )
+    return any(token in lowered for token in observe_tokens) and not any(
+        token in lowered for token in mutation_tokens
+    )
+
+
+def _docker_state_indicates_running(post_str: str) -> bool:
+    """Recognize Docker-specific healthy/running output without matching uptime.
+
+    Only text that carries a ``docker ps`` / ``docker inspect`` /
+    ``docker stats`` command is considered, so host ``uptime`` output cannot
+    match. ``docker ps`` prints "Up About a minute" and "Up Less than a
+    second" next to the numeric "Up 12 seconds" form, so all three count.
+    """
+    text = (post_str or "").lower()
+    if not any(token in text for token in ("docker ps", "docker inspect", "docker stats")):
+        return False
+    return bool(
+        re.search(r'\bup\s+(?:\d|about\b|less\s+than\b)', text)
+        or re.search(r'["\']status["\']\s*:\s*["\']running["\']', text)
+        or re.search(r'["\']running["\']\s*:\s*true', text)
+    )
+
+
 def _build_prometheus_query(alertname: str, labels: dict[str, Any]) -> str:
     """Build a non-empty PromQL probe for post-execution metric sensors."""
 
diff --git a/apps/api/tests/test_adr100_slo_status_service.py b/apps/api/tests/test_adr100_slo_status_service.py
index b836163c..5c6a6a67 100644
--- a/apps/api/tests/test_adr100_slo_status_service.py
+++ b/apps/api/tests/test_adr100_slo_status_service.py
@@ -274,3 +274,45 @@ def test_verification_coverage_payload_skips_when_no_auto_repair():
     assert payload["reason"] == "no_auto_repair_executions_24h"
     assert payload["evaluable"] is False
     assert payload["remediation_queue"]["total"] == 0
+
+
+def test_verification_coverage_payload_routes_observe_only_playbook_to_ticket():
+    payload = _build_verification_coverage_payload(
+        {
+            "total_auto": 1,
+            "successful_auto": 1,
+            "verified_auto": 1,
+            "verified_success": 0,
+            "verified_non_success": 1,
+            "unverified_auto": 0,
+        },
+        [],
+        [
+            {
+                "auto_repair_id": "are-3",
+                "incident_id": "INC-3",
+                "incident_status": "INVESTIGATING",
+                "incident_severity": "P2",
+                "alert_category": "infrastructure",
+                "alertname": "DockerContainerMemoryLimitPressure",
+                "auto_success": True,
+                "playbook_id": "PB-3",
+                "playbook_name": "Docker pressure diagnostic playbook",
+                "triggered_by": "auto_repair",
+                "risk_level": "LOW",
+                "verification_result": "degraded",
+                "auto_error": "",
+                "post_state_text": '{"ssh_diagnose": {"command": "docker stats --no-stream api"}}',
+                "evidence_summary": "SUCCESS: mcp:ssh_diagnose",
+                "auto_created_at": None,
+                "verification_collected_at": None,
+            },
+        ],
+    )
+
+    item = payload["recent_non_success"][0]
+    assert item["failure_class"] == "observe_only_playbook"
+    assert item["next_step"] == "author_mutating_repair_step"
+    assert item["remediation_status"] == "needs_playbook_ticket"
+    assert item["remediation_action"] == "promote_diagnostic_to_repair_playbook"
+    assert payload["remediation_queue"]["needs_human"] == 1
diff --git a/apps/api/tests/test_learning_chain_e2e.py b/apps/api/tests/test_learning_chain_e2e.py
index ae350533..06f6cbef 100644
--- a/apps/api/tests/test_learning_chain_e2e.py
+++ b/apps/api/tests/test_learning_chain_e2e.py
@@ -225,10 +225,13 @@ async def test_auto_repair_success_triggers_verify_and_learn(monkeypatch):
     assert len(stub_verifier.calls) == 1, "verifier.verify() 應被呼叫一次"
     assert stub_verifier.calls[0]["incident_id"] == incident.incident_id
     assert stub_verifier.calls[0]["snapshot"] is None
+    assert stub_verifier.calls[0]["action_taken"].startswith(f"auto_repair:{playbook.playbook_id}")
+    assert "steps=Step 1:" in stub_verifier.calls[0]["action_taken"]
 
     assert len(stub_learning.verification_calls) == 1, "record_verification_result 應被呼叫一次"
     call = stub_learning.verification_calls[0]
     assert call["incident_id"] == incident.incident_id
+    assert call["action_taken"] == stub_verifier.calls[0]["action_taken"]
     assert call["verification_result"] == "success"
     assert call["matched_playbook_id"] == playbook.playbook_id
 
diff --git a/apps/api/tests/test_mcp_tool_registry.py b/apps/api/tests/test_mcp_tool_registry.py
index fe5498ee..978eecaa 100644
--- a/apps/api/tests/test_mcp_tool_registry.py
+++ b/apps/api/tests/test_mcp_tool_registry.py
@@ -387,6 +387,34 @@ class TestSuggestTools:
         assert "prometheus_query" in names
         assert "kubectl_describe" not in names
 
+    @pytest.mark.asyncio
+    async def test_docker_container_alert_does_not_treat_container_as_pod_locator(self):
+        registry = MCPToolRegistry()
+        ssh_provider = _StubProvider("ssh_host", ["ssh_diagnose", "ssh_get_container_status"])
+        k8s_provider = _StubProvider("kubernetes", ["kubectl_describe", "k8s_get_pod_logs"])
+        prometheus_provider = _StubProvider("prometheus", ["prometheus_query"])
+        await registry.register_provider(k8s_provider)
+        await registry.register_provider(ssh_provider)
+        await registry.register_provider(prometheus_provider)
+
+        tools = registry.suggest_tools(
+            alertname="DockerContainerMemoryLimitPressure",
+            incident_labels={
+                "namespace": "default",
+                "container": "momo-pro-system",
+                "container_name": "momo-pro-system",
+                "host": "188",
+            },
+            max_tools=8,
+        )
+        names = [reg.tool.name for reg in tools]
+
+        assert "ssh_diagnose" in names
+        assert "ssh_get_container_status" in names
+        assert "prometheus_query" in names
+        assert "kubectl_describe" not in names
+        assert "k8s_get_pod_logs" not in names
+
     def test_get_all_tools_returns_all(self):
         registry = MCPToolRegistry()
         provider = _StubProvider("test", [])
diff --git a/apps/api/tests/test_post_execution_verifier.py b/apps/api/tests/test_post_execution_verifier.py
index 28bdb343..8dd879af 100644
--- a/apps/api/tests/test_post_execution_verifier.py
+++ b/apps/api/tests/test_post_execution_verifier.py
@@ -136,6 +136,34 @@ class TestAssessRecovery:
         action = "auto_repair_playbook:PB-TEST mcp:ssh_diagnose docker stats"
         assert _assess_recovery(pre, post, action) == "degraded"
 
+    def test_diagnosis_only_without_pre_state_is_not_verified_repair(self):
+        """A read-only SSH diagnosis without pre-state must not count as a real repair."""
+        post = {"ssh_diagnose": {"command": "docker stats --no-stream api", "stdout": '"Status": "running"'}}
+        action = "auto_repair:PB-TEST steps=Step 1: docker stats -> SUCCESS: mcp:ssh_diagnose"
+        assert _assess_recovery(None, post, action) == "degraded"
+
+    def test_docker_running_after_restart_is_success(self):
+        """Seeing docker ps/inspect report running after a Docker restart verifies as success."""
+        post = {
+            "ssh_get_container_status": {
+                "command": "docker ps -a --filter name=api",
+                "stdout": "api Up 12 seconds",
+            }
+        }
+        action = "auto_repair:PB-TEST steps=Step 1: docker restart api -> SUCCESS"
+        assert _assess_recovery(None, post, action) == "success"
+
+    def test_docker_up_about_a_minute_after_restart_is_success(self):
+        """docker ps prints "Up About a minute" instead of a number for young containers."""
+        post = {
+            "ssh_get_container_status": {
+                "command": "docker ps -a --filter name=api",
+                "stdout": "api Up About a minute",
+            }
+        }
+        action = "auto_repair:PB-TEST steps=Step 1: docker restart api -> SUCCESS"
+        assert _assess_recovery(None, post, action) == "success"
+
     def test_pre_running_post_running_delete_is_success(self):
         """kubectl delete 動作,前後都 Running → success"""
         pre = {"status": "Running"}
@@ -417,6 +445,25 @@ class _PrometheusRegistry:
         ]
 
 
+class _DockerRegistry:
+    def __init__(self, provider: _CaptureProvider) -> None:
+        self.provider = provider
+
+    def suggest_tools(self, alertname: str = "", incident_labels: dict | None = None) -> list[RegisteredTool]:
+        return [
+            RegisteredTool(
+                tool=MCPTool(
+                    name="ssh_get_container_status",
+                    description="",
+                    input_schema={},
+                    server_name="capture",
+                ),
+                provider=self.provider,
+                dimensions=[SensorDimension.D3_METRICS],
+            )
+        ]
+
+
 class _DbContext:
     async def __aenter__(self) -> object:
         return object()
@@ -501,6 +548,23 @@ class TestCollectPostStateAuditContext:
         assert 'host="110"' in query
         assert 'container_name="momo-scheduler"' in query
 
+    @pytest.mark.asyncio
+    async def test_collect_post_state_sends_docker_container_target(self):
+        provider = _CaptureProvider()
+        verifier = PostExecutionVerifier()
+        verifier._registry = _DockerRegistry(provider)
+        incident = _stub_incident(alertname="DockerContainerUnhealthy")
+        incident.signals[0].labels.update({
+            "host": "188",
+            "container_name": "momo-pro-system",
+        })
+
+        await verifier._collect_post_state(incident)
+
+        assert provider.seen_parameters is not None
+        assert provider.seen_parameters["container_name"] == "momo-pro-system"
+        assert provider.seen_parameters["filter_name"] == "momo-pro-system"
+
 
 class TestPrometheusQueryBuilder:
     def test_docker_memory_alert_query_is_not_empty(self):