diff --git a/apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_read_gateway_2026-05-14.sql b/apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_read_gateway_2026-05-14.sql new file mode 100644 index 00000000..3b71f171 --- /dev/null +++ b/apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_read_gateway_2026-05-14.sql @@ -0,0 +1,166 @@ +-- T23: auto-repair executor read-only MCP Gateway seed +-- 目的:讓 YAML_RULE/PlayBook 的只讀 SSH 診斷步驟經過 AwoooP MCP Gateway。 +-- 邊界:只授權 read scope;write/admin SSH 工具仍必須走 approval_executor + Gate 5。 + +SELECT set_config('app.project_id', 'awoooi', FALSE); + +WITH agent_body AS ( + SELECT jsonb_build_object( + 'schema_version', 'awooop_agent_contract_v1', + 'agent_id', 'auto_repair_executor', + 'display_name', 'Auto Repair Executor', + 'project_id', 'awoooi', + 'purpose', 'Read-only auto-repair diagnostics through AwoooP MCP Gateway', + 'allowed_scopes', jsonb_build_array('read'), + 'forbidden_scopes', jsonb_build_array('write', 'admin'), + 'stage', 't23_auto_repair_diagnostic_gateway' + ) AS body_json +), +inserted_revision AS ( + INSERT INTO awooop_contract_revisions ( + project_id, + contract_family, + contract_id, + version_major, + version_minor, + lifecycle_status, + body_json, + body_hash, + body_schema_version, + publisher_id, + published_at + ) + SELECT + 'awoooi', + 'agent', + 'auto_repair_executor', + 1, + 0, + 'active', + body_json, + encode(digest(body_json::text, 'sha256'), 'hex'), + 'v1.0', + 'migration:t23_auto_repair_executor_read_gateway', + NOW() + FROM agent_body + ON CONFLICT (project_id, contract_family, contract_id, version_major, version_minor) + DO NOTHING + RETURNING revision_id, project_id, contract_family, contract_id +), +chosen_revision AS ( + SELECT revision_id, project_id, contract_family, contract_id + FROM inserted_revision + UNION ALL + SELECT revision_id, project_id, contract_family, contract_id + FROM awooop_contract_revisions + WHERE project_id = 'awoooi' + AND contract_family = 'agent' + AND contract_id = 'auto_repair_executor' + AND version_major = 1 + AND version_minor = 0 + AND lifecycle_status = 'active' +), +upsert_pointer AS ( + INSERT INTO awooop_active_revisions ( + project_id, + contract_family, + contract_id, + active_revision_id, + updated_at + ) + SELECT DISTINCT ON (project_id, contract_family, contract_id) + project_id, + contract_family, + contract_id, + revision_id, + NOW() + FROM chosen_revision + ORDER BY project_id, contract_family, contract_id, revision_id + ON CONFLICT (project_id, contract_family, contract_id) + DO UPDATE SET + active_revision_id = EXCLUDED.active_revision_id, + updated_at = NOW() + RETURNING contract_id +) +SELECT 'auto_repair_executor_active_contracts', count(*) FROM upsert_pointer; + +WITH read_tools(tool_name, description) AS ( + VALUES + ('ssh_diagnose', 'SSH host/container diagnosis read'), + ('ssh_get_top_processes', 'SSH top processes read'), + ('ssh_get_disk_usage', 'SSH disk usage read'), + ('ssh_get_memory_info', 'SSH memory info read'), + ('ssh_get_container_logs', 'SSH container logs read'), + ('ssh_get_container_status', 'SSH container status read'), + ('ssh_get_service_status', 'SSH service status read'), + ('ssh_check_port', 'SSH port check read'), + ('ssh_get_nginx_error_log', 'SSH nginx error log read'), + ('ssh_get_swap_info', 'SSH swap info read') +), +upsert_tools AS ( + INSERT INTO awooop_mcp_tool_registry ( + project_id, + tool_name, + tool_type, + description, + allowed_scopes, + environment_tags, + is_active, + updated_at + ) + SELECT + 'awoooi', + tool_name, + 'mcp_server', + description, + '["read"]'::jsonb, + '{"env": "prod"}'::jsonb, + TRUE, + NOW() + FROM read_tools + ON CONFLICT (project_id, tool_name) + DO UPDATE SET + description = EXCLUDED.description, + allowed_scopes = EXCLUDED.allowed_scopes, + environment_tags = EXCLUDED.environment_tags, + is_active = TRUE, + updated_at = NOW() + RETURNING tool_id, tool_name, allowed_scopes +), +upsert_grants AS ( + INSERT INTO awooop_mcp_grants ( + project_id, + agent_id, + tool_id, + granted_by, + granted_scopes, + expires_at, + is_revoked, + revoked_at, + revoked_by + ) + SELECT + 'awoooi', + 'auto_repair_executor', + tool_id, + 'migration:t23_auto_repair_executor_read_gateway', + allowed_scopes, + NULL, + FALSE, + NULL, + NULL + FROM upsert_tools + ON CONFLICT (project_id, agent_id, tool_id) + DO UPDATE SET + granted_by = EXCLUDED.granted_by, + granted_scopes = EXCLUDED.granted_scopes, + expires_at = NULL, + is_revoked = FALSE, + revoked_at = NULL, + revoked_by = NULL + RETURNING grant_id +) +SELECT + 'auto_repair_executor_read_gateway', + (SELECT count(*) FROM upsert_tools) AS tool_rows, + (SELECT count(*) FROM upsert_grants) AS grant_rows; diff --git a/apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_read_gateway_2026-05-14_down.sql b/apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_read_gateway_2026-05-14_down.sql new file mode 100644 index 00000000..c68604d1 --- /dev/null +++ b/apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_read_gateway_2026-05-14_down.sql @@ -0,0 +1,24 @@ +-- Rollback T23 auto-repair executor read-only MCP Gateway grant. + +SELECT set_config('app.project_id', 'awoooi', FALSE); + +UPDATE awooop_mcp_grants +SET is_revoked = TRUE, + revoked_at = NOW(), + revoked_by = 'rollback:t23_auto_repair_executor_read_gateway' +WHERE project_id = 'awoooi' + AND agent_id = 'auto_repair_executor' + AND granted_by = 'migration:t23_auto_repair_executor_read_gateway'; + +DELETE FROM awooop_active_revisions +WHERE project_id = 'awoooi' + AND contract_family = 'agent' + AND contract_id = 'auto_repair_executor'; + +UPDATE awooop_contract_revisions +SET lifecycle_status = 'retired' +WHERE project_id = 'awoooi' + AND contract_family = 'agent' + AND contract_id = 'auto_repair_executor' + AND publisher_id = 'migration:t23_auto_repair_executor_read_gateway' + AND lifecycle_status = 'active'; diff --git a/apps/api/src/plugins/mcp/providers/ssh_provider.py b/apps/api/src/plugins/mcp/providers/ssh_provider.py index 9ee04e19..07cd4a53 100644 --- a/apps/api/src/plugins/mcp/providers/ssh_provider.py +++ b/apps/api/src/plugins/mcp/providers/ssh_provider.py @@ -65,6 +65,12 @@ DEFAULT_HOST_USERS = { # AI/Web host is operated by the ollama account in the current topology. "192.168.0.188": "ollama", } +SHORT_HOST_MAP = { + "110": "192.168.0.110", + "120": "192.168.0.120", + "121": "192.168.0.121", + "188": "192.168.0.188", +} DIAG_TIMEOUT = 10 # 診斷類超時(秒) OP_TIMEOUT = 60 # 操作類超時(秒) @@ -127,7 +133,9 @@ def _normalize_ssh_host(value: str) -> str: if host.count(":") == 1: maybe_host, maybe_port = host.rsplit(":", 1) if maybe_port.isdigit(): - return maybe_host + host = maybe_host + if host in SHORT_HOST_MAP: + return SHORT_HOST_MAP[host] return host @@ -240,6 +248,10 @@ class SSHProvider(MCPToolProvider): ), input_schema={"type": "object", "properties": { "host": {"type": "string", "description": "Target host IP"}, + "container_name": { + "type": "string", + "description": "Optional Docker container name for container-focused diagnostics", + }, }, "required": ["host"]}, server_name=self.name, ), @@ -542,12 +554,23 @@ class SSHProvider(MCPToolProvider): # 所有接受用戶字串的工具,必須先通過 _validate_param() 白名單驗證 if tool_name == "ssh_diagnose": # 2026-04-27 Claude Sonnet 4.6: 主機告警自動診斷 — 只讀,不修改任何狀態 - return ( + command = ( "echo '=== CPU TOP ===' && ps aux --sort=-%cpu | head -15 && " "echo '=== MEMORY ===' && free -h && " "echo '=== DISK ===' && df -h && " "echo '=== LOAD ===' && uptime" ) + container_name = params.get("container_name") + if container_name: + name = _validate_param("container_name", str(container_name)) + command = ( + f"{command} && " + f"echo '=== DOCKER STATS {name} ===' && " + f"docker stats --no-stream {name} 2>&1 && " + f"echo '=== DOCKER INSPECT {name} ===' && " + f"docker inspect {name} 2>&1 | head -80" + ) + return command if tool_name == "ssh_get_top_processes": return "ps aux --sort=-%cpu | head -15" diff --git a/apps/api/src/services/auto_repair_service.py b/apps/api/src/services/auto_repair_service.py index 0af5da76..2b3bac4a 100644 --- a/apps/api/src/services/auto_repair_service.py +++ b/apps/api/src/services/auto_repair_service.py @@ -22,9 +22,10 @@ Phase 8: 自動化層實作 - P0/P1 嚴重度 Incident 需要人工確認 """ -from dataclasses import dataclass from collections.abc import Callable -from typing import Protocol +from dataclasses import dataclass +import re +from typing import Any, Protocol import structlog @@ -81,6 +82,55 @@ class AutoRepairResult: execution_time_ms: int = 0 +@dataclass(frozen=True) +class _SshMcpRoute: + """Route a legacy SSH playbook command to a governed MCP tool.""" + + tool_name: str + params: dict[str, Any] + + +_SHORT_HOST_MAP: dict[str, str] = { + "110": "192.168.0.110", + "120": "192.168.0.120", + "121": "192.168.0.121", + "188": "192.168.0.188", +} + +_SSH_DIAGNOSTIC_KEYWORDS = ( + "ps aux", + "docker stats", + "docker inspect", + "docker logs", + "docker ps", + "docker top", + "df -h", + "du -", + "free -h", + "journalctl", + "systemctl show", + "tail ", + "top ", + "uptime", +) + +_SSH_WRITE_KEYWORDS = ( + "docker restart", + "docker start", + "docker stop", + "docker rm", + "docker prune", + "systemctl restart", + "systemctl stop", + "systemctl start", + "truncate ", + " rm ", + "rm -", + "certbot renew", + "bash ", +) + + # ============================================================================= # Auto Repair Service Interface # ============================================================================= @@ -918,6 +968,152 @@ class AutoRepairService: # 安全降級:檢查失敗 → 保守拒絕 return False + def _route_legacy_ssh_command_to_mcp( + self, + incident: Incident, + command: str, + ) -> _SshMcpRoute | None: + """Map read-only legacy ``ssh {host} '...'`` steps to MCP Gateway. + + YAML_RULE playbooks predate the URI executor and can contain compound + shell diagnostics. Those commands should not bypass the newer + scheme-based HostRepairAgent or loosen its shell safety guard; read-only + diagnostics are instead routed to the governed SSH MCP provider. + """ + + raw_command = (command or "").strip() + lowered = raw_command.lower() + if not lowered.startswith("ssh "): + return None + + if any(token in lowered for token in _SSH_WRITE_KEYWORDS): + return None + + if not any(token in lowered for token in _SSH_DIAGNOSTIC_KEYWORDS): + return None + + host = self._resolve_ssh_host_for_incident(incident, raw_command) + if not host: + return None + + params: dict[str, Any] = {"host": host} + container_name = self._resolve_container_name_for_incident(incident, raw_command) + if container_name: + params["container_name"] = container_name + + return _SshMcpRoute(tool_name="ssh_diagnose", params=params) + + def _resolve_ssh_host_for_incident(self, incident: Incident, command: str) -> str: + """Resolve ``{host}``, short host labels, and exporter instance ports.""" + + labels = self._incident_labels(incident) + raw_host = "" + match = re.match(r"ssh\s+([^\s'\"]+)", command.strip(), flags=re.IGNORECASE) + if match: + raw_host = match.group(1) + + if not raw_host or "{" in raw_host or "}" in raw_host: + raw_host = ( + str(labels.get("host") or "") + or str(labels.get("instance") or "") + or str(labels.get("node") or "") + or str(labels.get("exported_instance") or "") + ) + + return self._normalize_ssh_host(raw_host) + + @staticmethod + def _normalize_ssh_host(raw_host: str) -> str: + host = (raw_host or "").strip() + if host.startswith("ssh://"): + host = host.removeprefix("ssh://") + if "@" in host: + host = host.rsplit("@", 1)[1] + if host.startswith("[") and "]" in host: + host = host[1:host.index("]")] + if host.count(":") == 1: + maybe_host, maybe_port = host.rsplit(":", 1) + if maybe_port.isdigit(): + host = maybe_host + if host in _SHORT_HOST_MAP: + return _SHORT_HOST_MAP[host] + match = re.fullmatch(r"(?:node-exporter-|host-)?(110|120|121|188)", host) + if match: + return _SHORT_HOST_MAP[match.group(1)] + return host + + def _resolve_container_name_for_incident( + self, + incident: Incident, + command: str, + ) -> str: + labels = self._incident_labels(incident) + for key in ("container_name", "container", "name"): + value = str(labels.get(key) or "").strip() + if value and "{" not in value and "}" not in value: + return value + + match = re.search( + r"docker\s+(?:stats\s+--no-stream|inspect|logs|top|ps\s+-a\s+--filter\s+name=)\s+([a-zA-Z0-9._-]+)", + command, + ) + return match.group(1) if match else "" + + @staticmethod + def _incident_labels(incident: Incident) -> dict[str, Any]: + for signal in incident.signals or []: + labels = getattr(signal, "labels", None) + if labels: + return labels + return {} + + async def _execute_ssh_mcp_route( + self, + incident: Incident, + route: _SshMcpRoute, + ) -> str: + """Execute a routed SSH diagnostic through AwoooP MCP Gateway.""" + + try: + from src.db.base import get_db_context + from src.plugins.mcp.gateway import GatewayContext, McpGateway, McpGatewayError + from src.services.mcp_audit_context import with_mcp_audit_context + + incident_id = incident.incident_id + params = with_mcp_audit_context( + route.params, + session_id=f"incident:{incident_id}:auto_repair_execute", + incident_id=incident_id, + flywheel_node="execute", + agent_role="auto_repair_executor", + ) + async with get_db_context("awoooi") as db: + ctx = GatewayContext( + project_id="awoooi", + agent_id="auto_repair_executor", + tool_name=route.tool_name, + trace_id=incident_id, + is_shadow=False, + environment={"env": "prod"}, + required_scope="read", + ) + result = await McpGateway(db).call(ctx, params) + except McpGatewayError as exc: + return f"FAILED: mcp:{route.tool_name} {exc.error_code}: {exc}" + except Exception as exc: + logger.warning( + "auto_repair_ssh_mcp_route_failed", + incident_id=incident.incident_id, + tool=route.tool_name, + error=str(exc), + ) + return f"FAILED: mcp:{route.tool_name} {exc}" + + if result.success: + preview = str(result.output or "")[:500] + return f"SUCCESS: mcp:{route.tool_name} {preview}".strip() + return f"FAILED: mcp:{route.tool_name} {result.error or 'execution failed'}" + async def _execute_step(self, incident: Incident, step) -> str: """ 執行單一修復步驟 @@ -949,6 +1145,10 @@ class AutoRepairService: # 2026-04-06 Claude Code: Sprint 3 — repair_by_uri (URI scheme 路由) if step.action_type == ActionType.SSH_COMMAND: + route = self._route_legacy_ssh_command_to_mcp(incident, step.command) + if route is not None: + return await self._execute_ssh_mcp_route(incident, route) + from src.services.host_repair_agent import HostRepairAgent agent = HostRepairAgent() approved = not getattr(step, "requires_approval", False) diff --git a/apps/api/src/services/post_execution_verifier.py b/apps/api/src/services/post_execution_verifier.py index 01ebbf6d..5d0eb7d4 100644 --- a/apps/api/src/services/post_execution_verifier.py +++ b/apps/api/src/services/post_execution_verifier.py @@ -34,6 +34,7 @@ MASTER §3.1 L6×D1 from __future__ import annotations import asyncio +import re from typing import TYPE_CHECKING, Any import structlog @@ -236,6 +237,7 @@ class PostExecutionVerifier: "pod_name": labels.get("pod", labels.get("name", "")), "deployment": labels.get("deployment", ""), "host": labels.get("instance", "").split(":")[0] or labels.get("host", ""), + "query": _build_prometheus_query(alertname, labels), } async def _call_one(reg) -> tuple[str, Any]: @@ -501,16 +503,95 @@ def _get_incident_id(incident: "Incident") -> str: def _get_alertname(incident: "Incident") -> str: if incident.signals: - return incident.signals[0].labels.get("alertname", "") + signal = incident.signals[0] + labels = signal.labels or {} + return labels.get("alertname", "") or getattr(signal, "alert_name", "") return "" def _get_labels(incident: "Incident") -> dict[str, Any]: if incident.signals: - return incident.signals[0].labels + return incident.signals[0].labels or {} return {} +def _build_prometheus_query(alertname: str, labels: dict[str, Any]) -> str: + """Build a non-empty PromQL probe for post-execution metric sensors.""" + + alert = (alertname or "").lower() + namespace = _safe_label_value(str(labels.get("namespace") or "awoooi-prod")) + pod_name = _safe_label_value(str(labels.get("pod") or labels.get("name") or "")) + host = _safe_label_value(str(labels.get("host") or _short_instance(labels.get("instance")) or "")) + container = _safe_label_value(str( + labels.get("container_name") + or labels.get("container") + or labels.get("name") + or "" + )) + + if alert.startswith("dockercontainer"): + selector = _selector({ + "host": host, + "container_name": container, + }) + if "memory" in alert: + return ( + f"docker_container_memory_usage_bytes{selector} / " + f"docker_container_memory_limit_bytes{selector}" + ) + if "restart" in alert: + return ( + f"increase(docker_container_inspect_restart_count{selector}[15m]) " + f"or increase(docker_container_restart_count{selector}[15m])" + ) + if "cpu" in alert: + return f"docker_container_cpu_cores{selector}" + return f"docker_container_info{selector}" + + if any(key in alert for key in ("host", "node")): + instance_selector = _selector({"instance": str(labels.get("instance") or "")}) + if "memory" in alert or "oom" in alert: + return f"node_memory_MemAvailable_bytes{instance_selector}" + if "disk" in alert or "storage" in alert: + return f"node_filesystem_avail_bytes{instance_selector}" + if "load" in alert or "cpu" in alert: + return f"node_load5{instance_selector}" + return f"up{instance_selector}" + + pod_filter = f',pod=~"{pod_name}.*"' if pod_name else "" + if any(key in alert for key in ("memory", "mem", "oom")): + return ( + f'avg(container_memory_working_set_bytes{{namespace="{namespace}"{pod_filter}}}) ' + "/ 1048576" + ) + if any(key in alert for key in ("cpu", "load", "throttl")): + return f'avg(rate(container_cpu_usage_seconds_total{{namespace="{namespace}"{pod_filter}}}[5m]))' + if any(key in alert for key in ("crash", "restart", "backoff")): + return f'sum(increase(kube_pod_container_status_restarts_total{{namespace="{namespace}"}}[15m]))' + if any(key in alert for key in ("http", "error", "5xx", "probe", "down", "unhealthy")): + return "1 - avg(probe_success)" + return f'up{{namespace="{namespace}"}}' + + +def _selector(labels: dict[str, str]) -> str: + parts = [f'{key}="{value}"' for key, value in labels.items() if value] + return "{" + ",".join(parts) + "}" if parts else "" + + +def _short_instance(instance: Any) -> str: + raw = str(instance or "") + if raw.count(":") == 1: + return raw.rsplit(":", 1)[0] + return raw + + +def _safe_label_value(value: str) -> str: + cleaned = (value or "").strip() + if re.fullmatch(r"[a-zA-Z0-9_.:-]{1,128}", cleaned): + return cleaned + return "" + + async def _update_snapshot( snapshot: EvidenceSnapshot, post_state: dict[str, Any], diff --git a/apps/api/tests/test_auto_repair_service.py b/apps/api/tests/test_auto_repair_service.py index 0eb29664..0edeaab2 100644 --- a/apps/api/tests/test_auto_repair_service.py +++ b/apps/api/tests/test_auto_repair_service.py @@ -19,6 +19,7 @@ from src.models.playbook import ( RiskLevel, SymptomPattern, ) +from src.plugins.mcp.interfaces import MCPToolResult from src.services.auto_repair_service import AutoRepairService from src.utils.timezone import now_taipei @@ -120,6 +121,14 @@ async def _no_cooldown(*args, **kwargs) -> tuple[bool, str]: return True, "允許自動修復 (test bypass)" +class _DbContext: + async def __aenter__(self) -> object: + return object() + + async def __aexit__(self, *_args: object) -> None: + return None + + class TestAutoRepairService: """Auto Repair Service unit tests""" @@ -415,6 +424,90 @@ class TestAutoRepairService: assert service._should_escalate_failed_verification(incident, playbook) is False + def test_legacy_ssh_diagnostic_routes_to_mcp_gateway(self, service): + """Legacy YAML_RULE SSH diagnostics become governed read-only MCP calls.""" + incident = create_test_incident( + severity=Severity.P2, + alert_category="infrastructure", + alert_name="DockerContainerMemoryLimitPressure", + ) + incident.signals[0].labels.update({ + "host": "110", + "container_name": "sentry-self-hosted-clickhouse-1", + }) + + route = service._route_legacy_ssh_command_to_mcp( + incident, + 'ssh {host} \'echo "=== LOAD ==="; uptime; docker stats --no-stream\'', + ) + + assert route is not None + assert route.tool_name == "ssh_diagnose" + assert route.params == { + "host": "192.168.0.110", + "container_name": "sentry-self-hosted-clickhouse-1", + } + + def test_legacy_ssh_write_action_does_not_route_to_read_mcp(self, service): + """Write operations must stay out of the read-only diagnostic grant.""" + incident = create_test_incident(severity=Severity.P2) + + route = service._route_legacy_ssh_command_to_mcp( + incident, + "ssh {host} 'docker restart minio'", + ) + + assert route is None + + @pytest.mark.asyncio + async def test_execute_legacy_ssh_diagnostic_uses_mcp_gateway( + self, + service, + monkeypatch, + ): + incident = create_test_incident( + severity=Severity.P2, + alert_category="infrastructure", + alert_name="DockerContainerMemoryLimitPressure", + ) + incident.signals[0].labels.update({ + "host": "110", + "container_name": "momo-scheduler", + }) + step = RepairStep( + step_number=1, + action_type=ActionType.SSH_COMMAND, + command='ssh {host} \'echo "=== LOAD ==="; uptime; docker stats --no-stream\'', + risk_level=RiskLevel.LOW, + ) + calls = [] + + class FakeGateway: + def __init__(self, db: object) -> None: + self.db = db + + async def call(self, ctx, params): + calls.append({"ctx": ctx, "params": params, "db": self.db}) + return MCPToolResult( + success=True, + execution_id="gw-ok", + output={"stdout": "=== CPU TOP ==="}, + ) + + monkeypatch.setattr("src.db.base.get_db_context", lambda _project_id: _DbContext()) + monkeypatch.setattr("src.plugins.mcp.gateway.McpGateway", FakeGateway) + + result = await service._execute_step(incident, step) + + assert result.startswith("SUCCESS: mcp:ssh_diagnose") + assert calls + assert calls[0]["ctx"].agent_id == "auto_repair_executor" + assert calls[0]["ctx"].required_scope == "read" + assert calls[0]["ctx"].is_shadow is False + assert calls[0]["params"]["host"] == "192.168.0.110" + assert calls[0]["params"]["container_name"] == "momo-scheduler" + assert calls[0]["params"]["_mcp_audit"]["flywheel_node"] == "execute" + @pytest.mark.asyncio async def test_evaluate_low_risk_allowed(self, service, mock_playbook_service): """Test that LOW risk actions are allowed""" diff --git a/apps/api/tests/test_post_execution_verifier.py b/apps/api/tests/test_post_execution_verifier.py index d3791188..273dc30a 100644 --- a/apps/api/tests/test_post_execution_verifier.py +++ b/apps/api/tests/test_post_execution_verifier.py @@ -384,6 +384,25 @@ class _FakeRegistry: ] +class _PrometheusRegistry: + def __init__(self, provider: _CaptureProvider) -> None: + self.provider = provider + + def suggest_tools(self, alertname: str = "", incident_labels: dict | None = None) -> list[RegisteredTool]: + return [ + RegisteredTool( + tool=MCPTool( + name="prometheus_query", + description="", + input_schema={}, + server_name="capture", + ), + provider=self.provider, + dimensions=[SensorDimension.D3_METRICS], + ) + ] + + class _DbContext: async def __aenter__(self) -> object: return object() @@ -448,3 +467,39 @@ class TestCollectPostStateAuditContext: assert ctx.required_scope == "read" assert calls[0]["parameters"]["_mcp_audit"]["incident_id"] == "INC-TEST" assert calls[0]["parameters"]["_mcp_audit"]["flywheel_node"] == "verify" + + @pytest.mark.asyncio + async def test_collect_post_state_sends_prometheus_query_parameter(self): + provider = _CaptureProvider() + verifier = PostExecutionVerifier() + verifier._registry = _PrometheusRegistry(provider) + incident = _stub_incident(alertname="DockerContainerMemoryLimitPressure") + incident.signals[0].labels.update({ + "host": "110", + "container_name": "momo-scheduler", + }) + + await verifier._collect_post_state(incident) + + assert provider.seen_parameters is not None + query = provider.seen_parameters["query"] + assert "docker_container_memory_usage_bytes" in query + assert 'host="110"' in query + assert 'container_name="momo-scheduler"' in query + + +class TestPrometheusQueryBuilder: + def test_docker_memory_alert_query_is_not_empty(self): + query = pev_module._build_prometheus_query( + "DockerContainerMemoryLimitPressure", + {"host": "110", "container_name": "momo-scheduler"}, + ) + + assert "docker_container_memory_usage_bytes" in query + assert "docker_container_memory_limit_bytes" in query + + def test_canary_alert_uses_generic_non_empty_query(self): + query = pev_module._build_prometheus_query("AwoooPAutoRepairCanaryT16", {}) + + assert query + assert "up" in query diff --git a/apps/api/tests/test_ssh_provider_tools.py b/apps/api/tests/test_ssh_provider_tools.py index e9d14e13..4f7630e9 100644 --- a/apps/api/tests/test_ssh_provider_tools.py +++ b/apps/api/tests/test_ssh_provider_tools.py @@ -21,6 +21,18 @@ async def test_ssh_diagnose_is_registered_read_only_tool(): assert "df -h" in command +def test_ssh_diagnose_can_include_container_read_only_context(): + provider = SSHProvider() + + command = provider._build_command( + "ssh_diagnose", + {"container_name": "sentry-self-hosted-clickhouse-1"}, + ) + + assert "docker stats --no-stream sentry-self-hosted-clickhouse-1" in command + assert "docker inspect sentry-self-hosted-clickhouse-1" in command + + def test_ssh_provider_uses_ollama_user_for_188(): provider = SSHProvider() @@ -32,6 +44,8 @@ def test_ssh_provider_uses_ollama_user_for_188(): "raw,expected", [ ("192.168.0.110:9100", "192.168.0.110"), + ("110:9100", "192.168.0.110"), + ("188", "192.168.0.188"), ("wooo@192.168.0.110", "192.168.0.110"), ("ssh://wooo@192.168.0.110:22", "192.168.0.110"), ("192.168.0.188", "192.168.0.188"),