Re: [PATCH v7 3/3] vhost: add an RPMsg API

Mathieu Poirier <mathieu.poirier@xxxxxxxxxx> · Thu, 17 Sep 2020 16:01:38 -0600

On Thu, Sep 10, 2020 at 01:13:51PM +0200, Guennadi Liakhovetski wrote:
> Linux supports running the RPMsg protocol over the VirtIO transport
> protocol, but currently there is only support for VirtIO clients and
> no support for VirtIO servers. This patch adds a vhost-based RPMsg
> server implementation, which makes it possible to use RPMsg over
> VirtIO between guest VMs and the host.

I now get the client/server concept you are describing above but that happened
only after a lot of mental gymnastics.  If you drop the whole client/server
concept and concentrate on what this patch does, things will go better.  I would
personally go with what you have in the Kconfig: 

> +	  Vhost RPMsg API allows vhost drivers to communicate with VirtIO
> +	  drivers on guest VMs, using the RPMsg over VirtIO protocol.

It is concise but describes exactly what this patch provide.

> 
> Signed-off-by: Guennadi Liakhovetski <guennadi.liakhovetski@xxxxxxxxxxxxxxx>
> ---
>  drivers/vhost/Kconfig       |   7 +
>  drivers/vhost/Makefile      |   3 +
>  drivers/vhost/rpmsg.c       | 370 ++++++++++++++++++++++++++++++++++++
>  drivers/vhost/vhost_rpmsg.h |  74 ++++++++
>  4 files changed, 454 insertions(+)
>  create mode 100644 drivers/vhost/rpmsg.c
>  create mode 100644 drivers/vhost/vhost_rpmsg.h
> 
> diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
> index 587fbae06182..ee1a19b7ab3d 100644
> --- a/drivers/vhost/Kconfig
> +++ b/drivers/vhost/Kconfig
> @@ -38,6 +38,13 @@ config VHOST_NET
>  	  To compile this driver as a module, choose M here: the module will
>  	  be called vhost_net.
>  
> +config VHOST_RPMSG
> +	tristate
> +	select VHOST
> +	help
> +	  Vhost RPMsg API allows vhost drivers to communicate with VirtIO
> +	  drivers on guest VMs, using the RPMsg over VirtIO protocol.
> +

I suppose you intend this to be selectable from another config option?

>  config VHOST_SCSI
>  	tristate "VHOST_SCSI TCM fabric driver"
>  	depends on TARGET_CORE && EVENTFD
> diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
> index f3e1897cce85..9cf459d59f97 100644
> --- a/drivers/vhost/Makefile
> +++ b/drivers/vhost/Makefile
> @@ -2,6 +2,9 @@
>  obj-$(CONFIG_VHOST_NET) += vhost_net.o
>  vhost_net-y := net.o
>  
> +obj-$(CONFIG_VHOST_RPMSG) += vhost_rpmsg.o
> +vhost_rpmsg-y := rpmsg.o
> +
>  obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o
>  vhost_scsi-y := scsi.o
>  
> diff --git a/drivers/vhost/rpmsg.c b/drivers/vhost/rpmsg.c
> new file mode 100644
> index 000000000000..0ddee5b5f017
> --- /dev/null
> +++ b/drivers/vhost/rpmsg.c
> @@ -0,0 +1,370 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright(c) 2020 Intel Corporation. All rights reserved.
> + *
> + * Author: Guennadi Liakhovetski <guennadi.liakhovetski@xxxxxxxxxxxxxxx>
> + *
> + * Vhost RPMsg VirtIO interface provides a set of functions to be used on the
> + * host side as a counterpart to the guest side RPMsg VirtIO API, provided by
> + * drivers/rpmsg/virtio_rpmsg_bus.c. This API can be used by any vhost driver to
> + * handle RPMsg specific virtqueue processing.
> + * Vhost drivers, using this API will use their own VirtIO device IDs, that
> + * should then also be added to the ID table in virtio_rpmsg_bus.c
> + */
> +
> +#include <linux/compat.h>
> +#include <linux/file.h>
> +#include <linux/miscdevice.h>

As far as I can tell the above two are not needed.

> +#include <linux/module.h>
> +#include <linux/mutex.h>
> +#include <linux/rpmsg/virtio.h>
> +#include <linux/vhost.h>
> +#include <uapi/linux/rpmsg.h>
> +
> +#include "vhost.h"
> +#include "vhost_rpmsg.h"
> +
> +/*
> + * All virtio-rpmsg virtual queue kicks always come with just one buffer -
> + * either input or output, but we can also handle split messages
> + */
> +static int vhost_rpmsg_get_msg(struct vhost_virtqueue *vq, unsigned int *cnt)
> +{
> +	struct vhost_rpmsg *vr = container_of(vq->dev, struct vhost_rpmsg, dev);
> +	unsigned int out, in;
> +	int head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), &out, &in,
> +				     NULL, NULL);
> +	if (head < 0) {
> +		vq_err(vq, "%s(): error %d getting buffer\n",
> +		       __func__, head);
> +		return head;
> +	}
> +
> +	/* Nothing new? */
> +	if (head == vq->num)
> +		return head;
> +
> +	if (vq == &vr->vq[VIRTIO_RPMSG_RESPONSE]) {
> +		if (out) {
> +			vq_err(vq, "%s(): invalid %d output in response queue\n",
> +			       __func__, out);
> +			goto return_buf;
> +		}
> +
> +		*cnt = in;
> +	}
> +
> +	if (vq == &vr->vq[VIRTIO_RPMSG_REQUEST]) {
> +		if (in) {
> +			vq_err(vq, "%s(): invalid %d input in request queue\n",
> +		       __func__, in);
> +			goto return_buf;
> +		}
> +
> +		*cnt = out;
> +	}
> +
> +	return head;
> +
> +return_buf:
> +	vhost_add_used(vq, head, 0);
> +
> +	return -EINVAL;
> +}
> +
> +static const struct vhost_rpmsg_ept *vhost_rpmsg_ept_find(struct vhost_rpmsg *vr, int addr)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; i < vr->n_epts; i++)
> +		if (vr->ept[i].addr == addr)
> +			return vr->ept + i;
> +
> +	return NULL;
> +}
> +
> +/*
> + * if len < 0, then for reading a request, the complete virtual queue buffer
> + * size is prepared, for sending a response, the length in the iterator is used
> + */
> +int vhost_rpmsg_start_lock(struct vhost_rpmsg *vr, struct vhost_rpmsg_iter *iter,
> +			   unsigned int qid, ssize_t len)
> +	__acquires(vq->mutex)
> +{
> +	struct vhost_virtqueue *vq = vr->vq + qid;
> +	unsigned int cnt;
> +	ssize_t ret;
> +	size_t tmp;
> +
> +	if (qid >= VIRTIO_RPMSG_NUM_OF_VQS)
> +		return -EINVAL;
> +
> +	iter->vq = vq;
> +
> +	mutex_lock(&vq->mutex);
> +	vhost_disable_notify(&vr->dev, vq);
> +
> +	iter->head = vhost_rpmsg_get_msg(vq, &cnt);
> +	if (iter->head == vq->num)
> +		iter->head = -EAGAIN;
> +
> +	if (iter->head < 0) {
> +		ret = iter->head;
> +		goto unlock;
> +	}
> +
> +	tmp = iov_length(vq->iov, cnt);
> +	if (tmp < sizeof(iter->rhdr)) {
> +		vq_err(vq, "%s(): size %zu too small\n", __func__, tmp);
> +		ret = -ENOBUFS;
> +		goto return_buf;
> +	}
> +
> +	switch (qid) {
> +	case VIRTIO_RPMSG_REQUEST:
> +		if (len >= 0) {
> +			if (tmp < sizeof(iter->rhdr) + len) {
> +				ret = -ENOBUFS;
> +				goto return_buf;
> +			}
> +
> +			tmp = len + sizeof(iter->rhdr);
> +		}
> +
> +		/* len is now the size of the payload */
> +		iov_iter_init(&iter->iov_iter, WRITE, vq->iov, cnt, tmp);
> +
> +		/* Read the RPMSG header with endpoint addresses */
> +		tmp = copy_from_iter(&iter->rhdr, sizeof(iter->rhdr), &iter->iov_iter);
> +		if (tmp != sizeof(iter->rhdr)) {
> +			vq_err(vq, "%s(): got %zu instead of %zu\n", __func__,
> +			       tmp, sizeof(iter->rhdr));
> +			ret = -EIO;
> +			goto return_buf;
> +		}
> +
> +		iter->ept = vhost_rpmsg_ept_find(vr, vhost32_to_cpu(vq, iter->rhdr.dst));
> +		if (!iter->ept) {
> +			vq_err(vq, "%s(): no endpoint with address %d\n",
> +			       __func__, vhost32_to_cpu(vq, iter->rhdr.dst));
> +			ret = -ENOENT;
> +			goto return_buf;
> +		}
> +
> +		/* Let the endpoint read the payload */
> +		if (iter->ept->read) {
> +			ret = iter->ept->read(vr, iter);
> +			if (ret < 0)
> +				goto return_buf;
> +
> +			iter->rhdr.len = cpu_to_vhost16(vq, ret);
> +		} else {
> +			iter->rhdr.len = 0;
> +		}
> +
> +		/* Prepare for the response phase */
> +		iter->rhdr.dst = iter->rhdr.src;
> +		iter->rhdr.src = cpu_to_vhost32(vq, iter->ept->addr);

I'm a little puzzled here - what will the response look like?  And why is it
prepared here?  From what I can see doing so introduces coupling with function
handle_rpmsg_req_single().  I think confirmation of reception should be handled
by endpoints rather than in the core. 

> +
> +		break;
> +	case VIRTIO_RPMSG_RESPONSE:
> +		if (!iter->ept && iter->rhdr.dst != cpu_to_vhost32(vq, RPMSG_NS_ADDR)) {
> +			/*
> +			 * Usually the iterator is configured when processing a
> +			 * message on the request queue, but it's also possible
> +			 * to send a message on the response queue without a
> +			 * preceding request, in that case the iterator must
> +			 * contain source and destination addresses.
> +			 */
> +			iter->ept = vhost_rpmsg_ept_find(vr, vhost32_to_cpu(vq, iter->rhdr.src));
> +			if (!iter->ept) {
> +				ret = -ENOENT;
> +				goto return_buf;
> +			}
> +		}
> +
> +		if (len >= 0) {
> +			if (tmp < sizeof(iter->rhdr) + len) {
> +				ret = -ENOBUFS;
> +				goto return_buf;
> +			}
> +
> +			iter->rhdr.len = cpu_to_vhost16(vq, len);
> +			tmp = len + sizeof(iter->rhdr);
> +		}
> +
> +		/* len is now the size of the payload */
> +		iov_iter_init(&iter->iov_iter, READ, vq->iov, cnt, tmp);
> +
> +		/* Write the RPMSG header with endpoint addresses */
> +		tmp = copy_to_iter(&iter->rhdr, sizeof(iter->rhdr), &iter->iov_iter);
> +		if (tmp != sizeof(iter->rhdr)) {
> +			ret = -EIO;
> +			goto return_buf;
> +		}
> +
> +		/* Let the endpoint write the payload */

I would specifically mention that namespace payloads are taken care of by
vhost_rpmsg_ns_announce().  That makes it easier for people to connect the dots. 

> +		if (iter->ept && iter->ept->write) {
> +			ret = iter->ept->write(vr, iter);
> +			if (ret < 0)
> +				goto return_buf;
> +		}
> +
> +		break;
> +	}
> +
> +	return 0;
> +
> +return_buf:
> +	vhost_add_used(vq, iter->head, 0);
> +unlock:
> +	vhost_enable_notify(&vr->dev, vq);
> +	mutex_unlock(&vq->mutex);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(vhost_rpmsg_start_lock);
> +
> +size_t vhost_rpmsg_copy(struct vhost_rpmsg *vr, struct vhost_rpmsg_iter *iter,
> +			void *data, size_t size)
> +{
> +	/*
> +	 * We could check for excess data, but copy_{to,from}_iter() don't do
> +	 * that either
> +	 */
> +	if (iter->vq == vr->vq + VIRTIO_RPMSG_RESPONSE)
> +		return copy_to_iter(data, size, &iter->iov_iter);
> +
> +	return copy_from_iter(data, size, &iter->iov_iter);
> +}
> +EXPORT_SYMBOL_GPL(vhost_rpmsg_copy);
> +
> +int vhost_rpmsg_finish_unlock(struct vhost_rpmsg *vr,
> +			      struct vhost_rpmsg_iter *iter)
> +	__releases(vq->mutex)
> +{
> +	if (iter->head >= 0)
> +		vhost_add_used_and_signal(iter->vq->dev, iter->vq, iter->head,
> +					  vhost16_to_cpu(iter->vq, iter->rhdr.len) +
> +					  sizeof(iter->rhdr));
> +
> +	vhost_enable_notify(&vr->dev, iter->vq);
> +	mutex_unlock(&iter->vq->mutex);
> +
> +	return iter->head;
> +}
> +EXPORT_SYMBOL_GPL(vhost_rpmsg_finish_unlock);
> +
> +/*
> + * Return false to terminate the external loop only if we fail to obtain either
> + * a request or a response buffer
> + */
> +static bool handle_rpmsg_req_single(struct vhost_rpmsg *vr,
> +				    struct vhost_virtqueue *vq)
> +{
> +	struct vhost_rpmsg_iter iter;
> +	int ret = vhost_rpmsg_start_lock(vr, &iter, VIRTIO_RPMSG_REQUEST, -EINVAL);
> +	if (!ret)
> +		ret = vhost_rpmsg_finish_unlock(vr, &iter);
> +	if (ret < 0) {
> +		if (ret != -EAGAIN)
> +			vq_err(vq, "%s(): RPMSG processing failed %d\n",
> +			       __func__, ret);
> +		return false;
> +	}
> +
> +	if (!iter.ept->write)
> +		return true;
> +
> +	ret = vhost_rpmsg_start_lock(vr, &iter, VIRTIO_RPMSG_RESPONSE, -EINVAL);
> +	if (!ret)
> +		ret = vhost_rpmsg_finish_unlock(vr, &iter);
> +	if (ret < 0) {
> +		vq_err(vq, "%s(): RPMSG finalising failed %d\n", __func__, ret);
> +		return false;
> +	}

As I said before dealing with the "response" queue here seems to be introducing
coupling with vhost_rpmsg_start_lock()...  Endpoints should be doing that.

> +
> +	return true;
> +}
> +
> +static void handle_rpmsg_req_kick(struct vhost_work *work)
> +{
> +	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
> +						  poll.work);
> +	struct vhost_rpmsg *vr = container_of(vq->dev, struct vhost_rpmsg, dev);
> +
> +	while (handle_rpmsg_req_single(vr, vq))
> +		;
> +}
> +
> +/*
> + * initialise two virtqueues with an array of endpoints,
> + * request and response callbacks
> + */
> +void vhost_rpmsg_init(struct vhost_rpmsg *vr, const struct vhost_rpmsg_ept *ept,
> +		      unsigned int n_epts)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(vr->vq); i++)
> +		vr->vq_p[i] = &vr->vq[i];
> +
> +	/* vq[0]: host -> guest, vq[1]: host <- guest */
> +	vr->vq[VIRTIO_RPMSG_REQUEST].handle_kick = handle_rpmsg_req_kick;
> +	vr->vq[VIRTIO_RPMSG_RESPONSE].handle_kick = NULL;

The comment depicts vq[0] followed by vq[1] but the code initialise vq[1] before
vq[0], which is wildly confusing.  At the very least this should be: 

	vr->vq[VIRTIO_RPMSG_RESPONSE].handle_kick = NULL;
	vr->vq[VIRTIO_RPMSG_REQUEST].handle_kick = handle_rpmsg_req_kick;

And even better:

        /* See configuration of *vq_cbs[] in rpmsg_probe()  */
	vr->vq[VIRTIO_RPMSG_TX].handle_kick = NULL;
	vr->vq[VIRTIO_RPMSG_RX].handle_kick = handle_rpmsg_req_kick;

> +
> +	vr->ept = ept;
> +	vr->n_epts = n_epts;
> +
> +	vhost_dev_init(&vr->dev, vr->vq_p, VIRTIO_RPMSG_NUM_OF_VQS,
> +		       UIO_MAXIOV, 0, 0, true, NULL);
> +}
> +EXPORT_SYMBOL_GPL(vhost_rpmsg_init);
> +
> +void vhost_rpmsg_destroy(struct vhost_rpmsg *vr)
> +{
> +	if (vhost_dev_has_owner(&vr->dev))
> +		vhost_poll_flush(&vr->vq[VIRTIO_RPMSG_REQUEST].poll);
> +
> +	vhost_dev_cleanup(&vr->dev);
> +}
> +EXPORT_SYMBOL_GPL(vhost_rpmsg_destroy);
> +
> +/* send namespace */
> +int vhost_rpmsg_ns_announce(struct vhost_rpmsg *vr, const char *name, unsigned int src)
> +{
> +	struct vhost_virtqueue *vq = &vr->vq[VIRTIO_RPMSG_RESPONSE];
> +	struct vhost_rpmsg_iter iter = {
> +		.rhdr = {
> +			.src = 0,
> +			.dst = cpu_to_vhost32(vq, RPMSG_NS_ADDR),
> +		},
> +	};
> +	struct rpmsg_ns_msg ns = {
> +		.addr = cpu_to_vhost32(vq, src),
> +		.flags = cpu_to_vhost32(vq, RPMSG_NS_CREATE), /* for rpmsg_ns_cb() */
> +	};

Here we have to assume the source can be found in the endpoints registered in
vhost_rpmsg_init().  I would put a check to make sure that is the case and
return an error otherwise. 

> +	int ret = vhost_rpmsg_start_lock(vr, &iter, VIRTIO_RPMSG_RESPONSE, sizeof(ns));
> +
> +	if (ret < 0)
> +		return ret;
> +
> +	strlcpy(ns.name, name, sizeof(ns.name));
> +
> +	ret = vhost_rpmsg_copy(vr, &iter, &ns, sizeof(ns));
> +	if (ret != sizeof(ns))
> +		vq_err(iter.vq, "%s(): added %d instead of %zu bytes\n",
> +		       __func__, ret, sizeof(ns));
> +
> +	ret = vhost_rpmsg_finish_unlock(vr, &iter);
> +	if (ret < 0)
> +		vq_err(iter.vq, "%s(): namespace announcement failed: %d\n",
> +		       __func__, ret);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(vhost_rpmsg_ns_announce);
> +
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR("Intel, Inc.");
> +MODULE_DESCRIPTION("Vhost RPMsg API");
> diff --git a/drivers/vhost/vhost_rpmsg.h b/drivers/vhost/vhost_rpmsg.h
> new file mode 100644
> index 000000000000..c020ea14cd16
> --- /dev/null
> +++ b/drivers/vhost/vhost_rpmsg.h
> @@ -0,0 +1,74 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright(c) 2020 Intel Corporation. All rights reserved.
> + *
> + * Author: Guennadi Liakhovetski <guennadi.liakhovetski@xxxxxxxxxxxxxxx>
> + */
> +
> +#ifndef VHOST_RPMSG_H
> +#define VHOST_RPMSG_H
> +
> +#include <linux/rpmsg/virtio.h>
> +#include <linux/uio.h>
> +
> +#include "vhost.h"
> +
> +/* RPMsg uses two VirtQueues: one for each direction */
> +enum {
> +	VIRTIO_RPMSG_RESPONSE,	/* RPMsg response (host->guest) buffers */
> +	VIRTIO_RPMSG_REQUEST,	/* RPMsg request (guest->host) buffers */

As I said above things would be much clearer if this was VIRTIO_RPMSG_TX and
VIRTIO_RPMSG_RX.

I won't be commenting on the mechanic needed to access and send information on
the virtqueues as it is completely foreign to me.  Other than the above I think
this is going somewhere. 

Thanks,
Mathieu

> +	/* Keep last */
> +	VIRTIO_RPMSG_NUM_OF_VQS,
> +};
> +
> +struct vhost_rpmsg_ept;
> +
> +struct vhost_rpmsg_iter {
> +	struct iov_iter iov_iter;
> +	struct rpmsg_hdr rhdr;
> +	struct vhost_virtqueue *vq;
> +	const struct vhost_rpmsg_ept *ept;
> +	int head;
> +	void *priv;
> +};
> +
> +struct vhost_rpmsg {
> +	struct vhost_dev dev;
> +	struct vhost_virtqueue vq[VIRTIO_RPMSG_NUM_OF_VQS];
> +	struct vhost_virtqueue *vq_p[VIRTIO_RPMSG_NUM_OF_VQS];
> +	const struct vhost_rpmsg_ept *ept;
> +	unsigned int n_epts;
> +};
> +
> +struct vhost_rpmsg_ept {
> +	ssize_t (*read)(struct vhost_rpmsg *, struct vhost_rpmsg_iter *);
> +	ssize_t (*write)(struct vhost_rpmsg *, struct vhost_rpmsg_iter *);
> +	int addr;
> +};
> +
> +static inline size_t vhost_rpmsg_iter_len(const struct vhost_rpmsg_iter *iter)
> +{
> +	return iter->rhdr.len;
> +}
> +
> +#define VHOST_RPMSG_ITER(_vq, _src, _dst) {			\
> +	.rhdr = {						\
> +			.src = cpu_to_vhost32(_vq, _src),	\
> +			.dst = cpu_to_vhost32(_vq, _dst),	\
> +		},						\
> +	}
> +
> +void vhost_rpmsg_init(struct vhost_rpmsg *vr, const struct vhost_rpmsg_ept *ept,
> +		      unsigned int n_epts);
> +void vhost_rpmsg_destroy(struct vhost_rpmsg *vr);
> +int vhost_rpmsg_ns_announce(struct vhost_rpmsg *vr, const char *name,
> +			    unsigned int src);
> +int vhost_rpmsg_start_lock(struct vhost_rpmsg *vr,
> +			   struct vhost_rpmsg_iter *iter,
> +			   unsigned int qid, ssize_t len);
> +size_t vhost_rpmsg_copy(struct vhost_rpmsg *vr, struct vhost_rpmsg_iter *iter,
> +			void *data, size_t size);
> +int vhost_rpmsg_finish_unlock(struct vhost_rpmsg *vr,
> +			      struct vhost_rpmsg_iter *iter);
> +
> +#endif
> -- 
> 2.28.0
>