feat(auto-repair): route ssh diagnostics through mcp gateway

2026-05-14 21:13:05 +08:00
parent 0567135647
commit 813d088339
8 changed files with 662 additions and 6 deletions
--- a/apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_read_gateway_2026-05-14.sql
+++ b/apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_read_gateway_2026-05-14.sql
@@ -0,0 +1,166 @@
+-- T23: auto-repair executor read-only MCP Gateway seed
+-- 目的：讓 YAML_RULE/PlayBook 的只讀 SSH 診斷步驟經過 AwoooP MCP Gateway。
+-- 邊界：只授權 read scope；write/admin SSH 工具仍必須走 approval_executor + Gate 5。
+
+SELECT set_config('app.project_id', 'awoooi', FALSE);
+
+WITH agent_body AS (
+    SELECT jsonb_build_object(
+        'schema_version', 'awooop_agent_contract_v1',
+        'agent_id', 'auto_repair_executor',
+        'display_name', 'Auto Repair Executor',
+        'project_id', 'awoooi',
+        'purpose', 'Read-only auto-repair diagnostics through AwoooP MCP Gateway',
+        'allowed_scopes', jsonb_build_array('read'),
+        'forbidden_scopes', jsonb_build_array('write', 'admin'),
+        'stage', 't23_auto_repair_diagnostic_gateway'
+    ) AS body_json
+),
+inserted_revision AS (
+    INSERT INTO awooop_contract_revisions (
+        project_id,
+        contract_family,
+        contract_id,
+        version_major,
+        version_minor,
+        lifecycle_status,
+        body_json,
+        body_hash,
+        body_schema_version,
+        publisher_id,
+        published_at
+    )
+    SELECT
+        'awoooi',
+        'agent',
+        'auto_repair_executor',
+        1,
+        0,
+        'active',
+        body_json,
+        encode(digest(body_json::text, 'sha256'), 'hex'),
+        'v1.0',
+        'migration:t23_auto_repair_executor_read_gateway',
+        NOW()
+    FROM agent_body
+    ON CONFLICT (project_id, contract_family, contract_id, version_major, version_minor)
+    DO NOTHING
+    RETURNING revision_id, project_id, contract_family, contract_id
+),
+chosen_revision AS (
+    SELECT revision_id, project_id, contract_family, contract_id
+    FROM inserted_revision
+    UNION ALL
+    SELECT revision_id, project_id, contract_family, contract_id
+    FROM awooop_contract_revisions
+    WHERE project_id = 'awoooi'
+      AND contract_family = 'agent'
+      AND contract_id = 'auto_repair_executor'
+      AND version_major = 1
+      AND version_minor = 0
+      AND lifecycle_status = 'active'
+),
+upsert_pointer AS (
+    INSERT INTO awooop_active_revisions (
+        project_id,
+        contract_family,
+        contract_id,
+        active_revision_id,
+        updated_at
+    )
+    SELECT DISTINCT ON (project_id, contract_family, contract_id)
+        project_id,
+        contract_family,
+        contract_id,
+        revision_id,
+        NOW()
+    FROM chosen_revision
+    ORDER BY project_id, contract_family, contract_id, revision_id
+    ON CONFLICT (project_id, contract_family, contract_id)
+    DO UPDATE SET
+        active_revision_id = EXCLUDED.active_revision_id,
+        updated_at = NOW()
+    RETURNING contract_id
+)
+SELECT 'auto_repair_executor_active_contracts', count(*) FROM upsert_pointer;
+
+WITH read_tools(tool_name, description) AS (
+    VALUES
+        ('ssh_diagnose', 'SSH host/container diagnosis read'),
+        ('ssh_get_top_processes', 'SSH top processes read'),
+        ('ssh_get_disk_usage', 'SSH disk usage read'),
+        ('ssh_get_memory_info', 'SSH memory info read'),
+        ('ssh_get_container_logs', 'SSH container logs read'),
+        ('ssh_get_container_status', 'SSH container status read'),
+        ('ssh_get_service_status', 'SSH service status read'),
+        ('ssh_check_port', 'SSH port check read'),
+        ('ssh_get_nginx_error_log', 'SSH nginx error log read'),
+        ('ssh_get_swap_info', 'SSH swap info read')
+),
+upsert_tools AS (
+    INSERT INTO awooop_mcp_tool_registry (
+        project_id,
+        tool_name,
+        tool_type,
+        description,
+        allowed_scopes,
+        environment_tags,
+        is_active,
+        updated_at
+    )
+    SELECT
+        'awoooi',
+        tool_name,
+        'mcp_server',
+        description,
+        '["read"]'::jsonb,
+        '{"env": "prod"}'::jsonb,
+        TRUE,
+        NOW()
+    FROM read_tools
+    ON CONFLICT (project_id, tool_name)
+    DO UPDATE SET
+        description = EXCLUDED.description,
+        allowed_scopes = EXCLUDED.allowed_scopes,
+        environment_tags = EXCLUDED.environment_tags,
+        is_active = TRUE,
+        updated_at = NOW()
+    RETURNING tool_id, tool_name, allowed_scopes
+),
+upsert_grants AS (
+    INSERT INTO awooop_mcp_grants (
+        project_id,
+        agent_id,
+        tool_id,
+        granted_by,
+        granted_scopes,
+        expires_at,
+        is_revoked,
+        revoked_at,
+        revoked_by
+    )
+    SELECT
+        'awoooi',
+        'auto_repair_executor',
+        tool_id,
+        'migration:t23_auto_repair_executor_read_gateway',
+        allowed_scopes,
+        NULL,
+        FALSE,
+        NULL,
+        NULL
+    FROM upsert_tools
+    ON CONFLICT (project_id, agent_id, tool_id)
+    DO UPDATE SET
+        granted_by = EXCLUDED.granted_by,
+        granted_scopes = EXCLUDED.granted_scopes,
+        expires_at = NULL,
+        is_revoked = FALSE,
+        revoked_at = NULL,
+        revoked_by = NULL
+    RETURNING grant_id
+)
+SELECT
+    'auto_repair_executor_read_gateway',
+    (SELECT count(*) FROM upsert_tools) AS tool_rows,
+    (SELECT count(*) FROM upsert_grants) AS grant_rows;
--- a/apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_read_gateway_2026-05-14_down.sql
+++ b/apps/api/migrations/awooop_awoooi_mcp_auto_repair_executor_read_gateway_2026-05-14_down.sql
@@ -0,0 +1,24 @@
+-- Rollback T23 auto-repair executor read-only MCP Gateway grant.
+
+SELECT set_config('app.project_id', 'awoooi', FALSE);
+
+UPDATE awooop_mcp_grants
+SET is_revoked = TRUE,
+    revoked_at = NOW(),
+    revoked_by = 'rollback:t23_auto_repair_executor_read_gateway'
+WHERE project_id = 'awoooi'
+  AND agent_id = 'auto_repair_executor'
+  AND granted_by = 'migration:t23_auto_repair_executor_read_gateway';
+
+DELETE FROM awooop_active_revisions
+WHERE project_id = 'awoooi'
+  AND contract_family = 'agent'
+  AND contract_id = 'auto_repair_executor';
+
+UPDATE awooop_contract_revisions
+SET lifecycle_status = 'retired'
+WHERE project_id = 'awoooi'
+  AND contract_family = 'agent'
+  AND contract_id = 'auto_repair_executor'
+  AND publisher_id = 'migration:t23_auto_repair_executor_read_gateway'
+  AND lifecycle_status = 'active';
--- a/apps/api/src/plugins/mcp/providers/ssh_provider.py
+++ b/apps/api/src/plugins/mcp/providers/ssh_provider.py
@@ -65,6 +65,12 @@ DEFAULT_HOST_USERS = {
    # AI/Web host is operated by the ollama account in the current topology.
    "192.168.0.188": "ollama",
 }
+SHORT_HOST_MAP = {
+    "110": "192.168.0.110",
+    "120": "192.168.0.120",
+    "121": "192.168.0.121",
+    "188": "192.168.0.188",
+}
 DIAG_TIMEOUT = 10   # 診斷類超時（秒）
 OP_TIMEOUT   = 60   # 操作類超時（秒）

@@ -127,7 +133,9 @@ def _normalize_ssh_host(value: str) -> str:
    if host.count(":") == 1:
        maybe_host, maybe_port = host.rsplit(":", 1)
        if maybe_port.isdigit():
-            return maybe_host
+            host = maybe_host
+    if host in SHORT_HOST_MAP:
+        return SHORT_HOST_MAP[host]
    return host


@@ -240,6 +248,10 @@ class SSHProvider(MCPToolProvider):
                ),
                input_schema={"type": "object", "properties": {
                    "host": {"type": "string", "description": "Target host IP"},
+                    "container_name": {
+                        "type": "string",
+                        "description": "Optional Docker container name for container-focused diagnostics",
+                    },
                }, "required": ["host"]},
                server_name=self.name,
            ),
@@ -542,12 +554,23 @@ class SSHProvider(MCPToolProvider):
        # 所有接受用戶字串的工具，必須先通過 _validate_param() 白名單驗證
        if tool_name == "ssh_diagnose":
            # 2026-04-27 Claude Sonnet 4.6: 主機告警自動診斷 — 只讀，不修改任何狀態
-            return (
+            command = (
                "echo '=== CPU TOP ===' && ps aux --sort=-%cpu | head -15 && "
                "echo '=== MEMORY ===' && free -h && "
                "echo '=== DISK ===' && df -h && "
                "echo '=== LOAD ===' && uptime"
            )
+            container_name = params.get("container_name")
+            if container_name:
+                name = _validate_param("container_name", str(container_name))
+                command = (
+                    f"{command} && "
+                    f"echo '=== DOCKER STATS {name} ===' && "
+                    f"docker stats --no-stream {name} 2>&1 && "
+                    f"echo '=== DOCKER INSPECT {name} ===' && "
+                    f"docker inspect {name} 2>&1 | head -80"
+                )
+            return command

        if tool_name == "ssh_get_top_processes":
            return "ps aux --sort=-%cpu | head -15"
--- a/apps/api/src/services/auto_repair_service.py
+++ b/apps/api/src/services/auto_repair_service.py
@@ -22,9 +22,10 @@ Phase 8: 自動化層實作
 - P0/P1 嚴重度 Incident 需要人工確認
 """

-from dataclasses import dataclass
 from collections.abc import Callable
-from typing import Protocol
+from dataclasses import dataclass
+import re
+from typing import Any, Protocol

 import structlog

@@ -81,6 +82,55 @@ class AutoRepairResult:
    execution_time_ms: int = 0


+@dataclass(frozen=True)
+class _SshMcpRoute:
+    """Route a legacy SSH playbook command to a governed MCP tool."""
+
+    tool_name: str
+    params: dict[str, Any]
+
+
+_SHORT_HOST_MAP: dict[str, str] = {
+    "110": "192.168.0.110",
+    "120": "192.168.0.120",
+    "121": "192.168.0.121",
+    "188": "192.168.0.188",
+}
+
+_SSH_DIAGNOSTIC_KEYWORDS = (
+    "ps aux",
+    "docker stats",
+    "docker inspect",
+    "docker logs",
+    "docker ps",
+    "docker top",
+    "df -h",
+    "du -",
+    "free -h",
+    "journalctl",
+    "systemctl show",
+    "tail ",
+    "top ",
+    "uptime",
+)
+
+_SSH_WRITE_KEYWORDS = (
+    "docker restart",
+    "docker start",
+    "docker stop",
+    "docker rm",
+    "docker prune",
+    "systemctl restart",
+    "systemctl stop",
+    "systemctl start",
+    "truncate ",
+    " rm ",
+    "rm -",
+    "certbot renew",
+    "bash ",
+)
+
+
 # =============================================================================
 # Auto Repair Service Interface
 # =============================================================================
@@ -918,6 +968,152 @@ class AutoRepairService:
            # 安全降級：檢查失敗 → 保守拒絕
            return False

+    def _route_legacy_ssh_command_to_mcp(
+        self,
+        incident: Incident,
+        command: str,
+    ) -> _SshMcpRoute | None:
+        """Map read-only legacy ``ssh {host} '...'`` steps to MCP Gateway.
+
+        YAML_RULE playbooks predate the URI executor and can contain compound
+        shell diagnostics.  Those commands should not bypass the newer
+        scheme-based HostRepairAgent or loosen its shell safety guard; read-only
+        diagnostics are instead routed to the governed SSH MCP provider.
+        """
+
+        raw_command = (command or "").strip()
+        lowered = raw_command.lower()
+        if not lowered.startswith("ssh "):
+            return None
+
+        if any(token in lowered for token in _SSH_WRITE_KEYWORDS):
+            return None
+
+        if not any(token in lowered for token in _SSH_DIAGNOSTIC_KEYWORDS):
+            return None
+
+        host = self._resolve_ssh_host_for_incident(incident, raw_command)
+        if not host:
+            return None
+
+        params: dict[str, Any] = {"host": host}
+        container_name = self._resolve_container_name_for_incident(incident, raw_command)
+        if container_name:
+            params["container_name"] = container_name
+
+        return _SshMcpRoute(tool_name="ssh_diagnose", params=params)
+
+    def _resolve_ssh_host_for_incident(self, incident: Incident, command: str) -> str:
+        """Resolve ``{host}``, short host labels, and exporter instance ports."""
+
+        labels = self._incident_labels(incident)
+        raw_host = ""
+        match = re.match(r"ssh\s+([^\s'\"]+)", command.strip(), flags=re.IGNORECASE)
+        if match:
+            raw_host = match.group(1)
+
+        if not raw_host or "{" in raw_host or "}" in raw_host:
+            raw_host = (
+                str(labels.get("host") or "")
+                or str(labels.get("instance") or "")
+                or str(labels.get("node") or "")
+                or str(labels.get("exported_instance") or "")
+            )
+
+        return self._normalize_ssh_host(raw_host)
+
+    @staticmethod
+    def _normalize_ssh_host(raw_host: str) -> str:
+        host = (raw_host or "").strip()
+        if host.startswith("ssh://"):
+            host = host.removeprefix("ssh://")
+        if "@" in host:
+            host = host.rsplit("@", 1)[1]
+        if host.startswith("[") and "]" in host:
+            host = host[1:host.index("]")]
+        if host.count(":") == 1:
+            maybe_host, maybe_port = host.rsplit(":", 1)
+            if maybe_port.isdigit():
+                host = maybe_host
+        if host in _SHORT_HOST_MAP:
+            return _SHORT_HOST_MAP[host]
+        match = re.fullmatch(r"(?:node-exporter-|host-)?(110|120|121|188)", host)
+        if match:
+            return _SHORT_HOST_MAP[match.group(1)]
+        return host
+
+    def _resolve_container_name_for_incident(
+        self,
+        incident: Incident,
+        command: str,
+    ) -> str:
+        labels = self._incident_labels(incident)
+        for key in ("container_name", "container", "name"):
+            value = str(labels.get(key) or "").strip()
+            if value and "{" not in value and "}" not in value:
+                return value
+
+        match = re.search(
+            r"docker\s+(?:stats\s+--no-stream|inspect|logs|top|ps\s+-a\s+--filter\s+name=)\s+([a-zA-Z0-9._-]+)",
+            command,
+        )
+        return match.group(1) if match else ""
+
+    @staticmethod
+    def _incident_labels(incident: Incident) -> dict[str, Any]:
+        for signal in incident.signals or []:
+            labels = getattr(signal, "labels", None)
+            if labels:
+                return labels
+        return {}
+
+    async def _execute_ssh_mcp_route(
+        self,
+        incident: Incident,
+        route: _SshMcpRoute,
+    ) -> str:
+        """Execute a routed SSH diagnostic through AwoooP MCP Gateway."""
+
+        try:
+            from src.db.base import get_db_context
+            from src.plugins.mcp.gateway import GatewayContext, McpGateway, McpGatewayError
+            from src.services.mcp_audit_context import with_mcp_audit_context
+
+            incident_id = incident.incident_id
+            params = with_mcp_audit_context(
+                route.params,
+                session_id=f"incident:{incident_id}:auto_repair_execute",
+                incident_id=incident_id,
+                flywheel_node="execute",
+                agent_role="auto_repair_executor",
+            )
+            async with get_db_context("awoooi") as db:
+                ctx = GatewayContext(
+                    project_id="awoooi",
+                    agent_id="auto_repair_executor",
+                    tool_name=route.tool_name,
+                    trace_id=incident_id,
+                    is_shadow=False,
+                    environment={"env": "prod"},
+                    required_scope="read",
+                )
+                result = await McpGateway(db).call(ctx, params)
+        except McpGatewayError as exc:
+            return f"FAILED: mcp:{route.tool_name} {exc.error_code}: {exc}"
+        except Exception as exc:
+            logger.warning(
+                "auto_repair_ssh_mcp_route_failed",
+                incident_id=incident.incident_id,
+                tool=route.tool_name,
+                error=str(exc),
+            )
+            return f"FAILED: mcp:{route.tool_name} {exc}"
+
+        if result.success:
+            preview = str(result.output or "")[:500]
+            return f"SUCCESS: mcp:{route.tool_name} {preview}".strip()
+        return f"FAILED: mcp:{route.tool_name} {result.error or 'execution failed'}"
+
    async def _execute_step(self, incident: Incident, step) -> str:
        """
        執行單一修復步驟
@@ -949,6 +1145,10 @@ class AutoRepairService:

        # 2026-04-06 Claude Code: Sprint 3 — repair_by_uri (URI scheme 路由)
        if step.action_type == ActionType.SSH_COMMAND:
+            route = self._route_legacy_ssh_command_to_mcp(incident, step.command)
+            if route is not None:
+                return await self._execute_ssh_mcp_route(incident, route)
+
            from src.services.host_repair_agent import HostRepairAgent
            agent = HostRepairAgent()
            approved = not getattr(step, "requires_approval", False)
--- a/apps/api/src/services/post_execution_verifier.py
+++ b/apps/api/src/services/post_execution_verifier.py
@@ -34,6 +34,7 @@ MASTER §3.1 L6×D1
 from __future__ import annotations

 import asyncio
+import re
 from typing import TYPE_CHECKING, Any

 import structlog
@@ -236,6 +237,7 @@ class PostExecutionVerifier:
            "pod_name": labels.get("pod", labels.get("name", "")),
            "deployment": labels.get("deployment", ""),
            "host": labels.get("instance", "").split(":")[0] or labels.get("host", ""),
+            "query": _build_prometheus_query(alertname, labels),
        }

        async def _call_one(reg) -> tuple[str, Any]:
@@ -501,16 +503,95 @@ def _get_incident_id(incident: "Incident") -> str:

 def _get_alertname(incident: "Incident") -> str:
    if incident.signals:
-        return incident.signals[0].labels.get("alertname", "")
+        signal = incident.signals[0]
+        labels = signal.labels or {}
+        return labels.get("alertname", "") or getattr(signal, "alert_name", "")
    return ""


 def _get_labels(incident: "Incident") -> dict[str, Any]:
    if incident.signals:
-        return incident.signals[0].labels
+        return incident.signals[0].labels or {}
    return {}


+def _build_prometheus_query(alertname: str, labels: dict[str, Any]) -> str:
+    """Build a non-empty PromQL probe for post-execution metric sensors."""
+
+    alert = (alertname or "").lower()
+    namespace = _safe_label_value(str(labels.get("namespace") or "awoooi-prod"))
+    pod_name = _safe_label_value(str(labels.get("pod") or labels.get("name") or ""))
+    host = _safe_label_value(str(labels.get("host") or _short_instance(labels.get("instance")) or ""))
+    container = _safe_label_value(str(
+        labels.get("container_name")
+        or labels.get("container")
+        or labels.get("name")
+        or ""
+    ))
+
+    if alert.startswith("dockercontainer"):
+        selector = _selector({
+            "host": host,
+            "container_name": container,
+        })
+        if "memory" in alert:
+            return (
+                f"docker_container_memory_usage_bytes{selector} / "
+                f"docker_container_memory_limit_bytes{selector}"
+            )
+        if "restart" in alert:
+            return (
+                f"increase(docker_container_inspect_restart_count{selector}[15m]) "
+                f"or increase(docker_container_restart_count{selector}[15m])"
+            )
+        if "cpu" in alert:
+            return f"docker_container_cpu_cores{selector}"
+        return f"docker_container_info{selector}"
+
+    if any(key in alert for key in ("host", "node")):
+        instance_selector = _selector({"instance": str(labels.get("instance") or "")})
+        if "memory" in alert or "oom" in alert:
+            return f"node_memory_MemAvailable_bytes{instance_selector}"
+        if "disk" in alert or "storage" in alert:
+            return f"node_filesystem_avail_bytes{instance_selector}"
+        if "load" in alert or "cpu" in alert:
+            return f"node_load5{instance_selector}"
+        return f"up{instance_selector}"
+
+    pod_filter = f',pod=~"{pod_name}.*"' if pod_name else ""
+    if any(key in alert for key in ("memory", "mem", "oom")):
+        return (
+            f'avg(container_memory_working_set_bytes{{namespace="{namespace}"{pod_filter}}}) '
+            "/ 1048576"
+        )
+    if any(key in alert for key in ("cpu", "load", "throttl")):
+        return f'avg(rate(container_cpu_usage_seconds_total{{namespace="{namespace}"{pod_filter}}}[5m]))'
+    if any(key in alert for key in ("crash", "restart", "backoff")):
+        return f'sum(increase(kube_pod_container_status_restarts_total{{namespace="{namespace}"}}[15m]))'
+    if any(key in alert for key in ("http", "error", "5xx", "probe", "down", "unhealthy")):
+        return "1 - avg(probe_success)"
+    return f'up{{namespace="{namespace}"}}'
+
+
+def _selector(labels: dict[str, str]) -> str:
+    parts = [f'{key}="{value}"' for key, value in labels.items() if value]
+    return "{" + ",".join(parts) + "}" if parts else ""
+
+
+def _short_instance(instance: Any) -> str:
+    raw = str(instance or "")
+    if raw.count(":") == 1:
+        return raw.rsplit(":", 1)[0]
+    return raw
+
+
+def _safe_label_value(value: str) -> str:
+    cleaned = (value or "").strip()
+    if re.fullmatch(r"[a-zA-Z0-9_.:-]{1,128}", cleaned):
+        return cleaned
+    return ""
+
+
 async def _update_snapshot(
    snapshot: EvidenceSnapshot,
    post_state: dict[str, Any],
--- a/apps/api/tests/test_auto_repair_service.py
+++ b/apps/api/tests/test_auto_repair_service.py
@@ -19,6 +19,7 @@ from src.models.playbook import (
    RiskLevel,
    SymptomPattern,
 )
+from src.plugins.mcp.interfaces import MCPToolResult
 from src.services.auto_repair_service import AutoRepairService
 from src.utils.timezone import now_taipei

@@ -120,6 +121,14 @@ async def _no_cooldown(*args, **kwargs) -> tuple[bool, str]:
    return True, "允許自動修復 (test bypass)"


+class _DbContext:
+    async def __aenter__(self) -> object:
+        return object()
+
+    async def __aexit__(self, *_args: object) -> None:
+        return None
+
+
 class TestAutoRepairService:
    """Auto Repair Service unit tests"""

@@ -415,6 +424,90 @@ class TestAutoRepairService:

        assert service._should_escalate_failed_verification(incident, playbook) is False

+    def test_legacy_ssh_diagnostic_routes_to_mcp_gateway(self, service):
+        """Legacy YAML_RULE SSH diagnostics become governed read-only MCP calls."""
+        incident = create_test_incident(
+            severity=Severity.P2,
+            alert_category="infrastructure",
+            alert_name="DockerContainerMemoryLimitPressure",
+        )
+        incident.signals[0].labels.update({
+            "host": "110",
+            "container_name": "sentry-self-hosted-clickhouse-1",
+        })
+
+        route = service._route_legacy_ssh_command_to_mcp(
+            incident,
+            'ssh {host} \'echo "=== LOAD ==="; uptime; docker stats --no-stream\'',
+        )
+
+        assert route is not None
+        assert route.tool_name == "ssh_diagnose"
+        assert route.params == {
+            "host": "192.168.0.110",
+            "container_name": "sentry-self-hosted-clickhouse-1",
+        }
+
+    def test_legacy_ssh_write_action_does_not_route_to_read_mcp(self, service):
+        """Write operations must stay out of the read-only diagnostic grant."""
+        incident = create_test_incident(severity=Severity.P2)
+
+        route = service._route_legacy_ssh_command_to_mcp(
+            incident,
+            "ssh {host} 'docker restart minio'",
+        )
+
+        assert route is None
+
+    @pytest.mark.asyncio
+    async def test_execute_legacy_ssh_diagnostic_uses_mcp_gateway(
+        self,
+        service,
+        monkeypatch,
+    ):
+        incident = create_test_incident(
+            severity=Severity.P2,
+            alert_category="infrastructure",
+            alert_name="DockerContainerMemoryLimitPressure",
+        )
+        incident.signals[0].labels.update({
+            "host": "110",
+            "container_name": "momo-scheduler",
+        })
+        step = RepairStep(
+            step_number=1,
+            action_type=ActionType.SSH_COMMAND,
+            command='ssh {host} \'echo "=== LOAD ==="; uptime; docker stats --no-stream\'',
+            risk_level=RiskLevel.LOW,
+        )
+        calls = []
+
+        class FakeGateway:
+            def __init__(self, db: object) -> None:
+                self.db = db
+
+            async def call(self, ctx, params):
+                calls.append({"ctx": ctx, "params": params, "db": self.db})
+                return MCPToolResult(
+                    success=True,
+                    execution_id="gw-ok",
+                    output={"stdout": "=== CPU TOP ==="},
+                )
+
+        monkeypatch.setattr("src.db.base.get_db_context", lambda _project_id: _DbContext())
+        monkeypatch.setattr("src.plugins.mcp.gateway.McpGateway", FakeGateway)
+
+        result = await service._execute_step(incident, step)
+
+        assert result.startswith("SUCCESS: mcp:ssh_diagnose")
+        assert calls
+        assert calls[0]["ctx"].agent_id == "auto_repair_executor"
+        assert calls[0]["ctx"].required_scope == "read"
+        assert calls[0]["ctx"].is_shadow is False
+        assert calls[0]["params"]["host"] == "192.168.0.110"
+        assert calls[0]["params"]["container_name"] == "momo-scheduler"
+        assert calls[0]["params"]["_mcp_audit"]["flywheel_node"] == "execute"
+
    @pytest.mark.asyncio
    async def test_evaluate_low_risk_allowed(self, service, mock_playbook_service):
        """Test that LOW risk actions are allowed"""
--- a/apps/api/tests/test_post_execution_verifier.py
+++ b/apps/api/tests/test_post_execution_verifier.py
@@ -384,6 +384,25 @@ class _FakeRegistry:
        ]


+class _PrometheusRegistry:
+    def __init__(self, provider: _CaptureProvider) -> None:
+        self.provider = provider
+
+    def suggest_tools(self, alertname: str = "", incident_labels: dict | None = None) -> list[RegisteredTool]:
+        return [
+            RegisteredTool(
+                tool=MCPTool(
+                    name="prometheus_query",
+                    description="",
+                    input_schema={},
+                    server_name="capture",
+                ),
+                provider=self.provider,
+                dimensions=[SensorDimension.D3_METRICS],
+            )
+        ]
+
+
 class _DbContext:
    async def __aenter__(self) -> object:
        return object()
@@ -448,3 +467,39 @@ class TestCollectPostStateAuditContext:
        assert ctx.required_scope == "read"
        assert calls[0]["parameters"]["_mcp_audit"]["incident_id"] == "INC-TEST"
        assert calls[0]["parameters"]["_mcp_audit"]["flywheel_node"] == "verify"
+
+    @pytest.mark.asyncio
+    async def test_collect_post_state_sends_prometheus_query_parameter(self):
+        provider = _CaptureProvider()
+        verifier = PostExecutionVerifier()
+        verifier._registry = _PrometheusRegistry(provider)
+        incident = _stub_incident(alertname="DockerContainerMemoryLimitPressure")
+        incident.signals[0].labels.update({
+            "host": "110",
+            "container_name": "momo-scheduler",
+        })
+
+        await verifier._collect_post_state(incident)
+
+        assert provider.seen_parameters is not None
+        query = provider.seen_parameters["query"]
+        assert "docker_container_memory_usage_bytes" in query
+        assert 'host="110"' in query
+        assert 'container_name="momo-scheduler"' in query
+
+
+class TestPrometheusQueryBuilder:
+    def test_docker_memory_alert_query_is_not_empty(self):
+        query = pev_module._build_prometheus_query(
+            "DockerContainerMemoryLimitPressure",
+            {"host": "110", "container_name": "momo-scheduler"},
+        )
+
+        assert "docker_container_memory_usage_bytes" in query
+        assert "docker_container_memory_limit_bytes" in query
+
+    def test_canary_alert_uses_generic_non_empty_query(self):
+        query = pev_module._build_prometheus_query("AwoooPAutoRepairCanaryT16", {})
+
+        assert query
+        assert "up" in query
--- a/apps/api/tests/test_ssh_provider_tools.py
+++ b/apps/api/tests/test_ssh_provider_tools.py
@@ -21,6 +21,18 @@ async def test_ssh_diagnose_is_registered_read_only_tool():
    assert "df -h" in command


+def test_ssh_diagnose_can_include_container_read_only_context():
+    provider = SSHProvider()
+
+    command = provider._build_command(
+        "ssh_diagnose",
+        {"container_name": "sentry-self-hosted-clickhouse-1"},
+    )
+
+    assert "docker stats --no-stream sentry-self-hosted-clickhouse-1" in command
+    assert "docker inspect sentry-self-hosted-clickhouse-1" in command
+
+
 def test_ssh_provider_uses_ollama_user_for_188():
    provider = SSHProvider()

@@ -32,6 +44,8 @@ def test_ssh_provider_uses_ollama_user_for_188():
    "raw,expected",
    [
        ("192.168.0.110:9100", "192.168.0.110"),
+        ("110:9100", "192.168.0.110"),
+        ("188", "192.168.0.188"),
        ("wooo@192.168.0.110", "192.168.0.110"),
        ("ssh://wooo@192.168.0.110:22", "192.168.0.110"),
        ("192.168.0.188", "192.168.0.188"),