Re: [PATCH v7 5/9] virtio_ring: Support DMA APIs

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Feb 02, 2016 at 09:46:36PM -0800, Andy Lutomirski wrote:
> virtio_ring currently sends the device (usually a hypervisor)
> physical addresses of its I/O buffers.  This is okay when DMA
> addresses and physical addresses are the same thing, but this isn't
> always the case.  For example, this never works on Xen guests, and
> it is likely to fail if a physical "virtio" device ever ends up
> behind an IOMMU or swiotlb.
> 
> The immediate use case for me is to enable virtio on Xen guests.
> For that to work, we need vring to support DMA address translation
> as well as a corresponding change to virtio_pci or to another
> driver.
> 
> Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxx>
> ---
>  drivers/virtio/Kconfig           |   2 +-
>  drivers/virtio/virtio_ring.c     | 200 ++++++++++++++++++++++++++++++++-------
>  tools/virtio/linux/dma-mapping.h |  17 ++++
>  3 files changed, 183 insertions(+), 36 deletions(-)
>  create mode 100644 tools/virtio/linux/dma-mapping.h
> 
> diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> index cab9f3f63a38..77590320d44c 100644
> --- a/drivers/virtio/Kconfig
> +++ b/drivers/virtio/Kconfig
> @@ -60,7 +60,7 @@ config VIRTIO_INPUT
>  
>   config VIRTIO_MMIO
>  	tristate "Platform bus driver for memory mapped virtio devices"
> -	depends on HAS_IOMEM
> +	depends on HAS_IOMEM && HAS_DMA
>   	select VIRTIO
>   	---help---
>   	 This drivers provides support for memory mapped virtio

What's this chunk doing here btw? Should be part of the mmio patch?

> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index ab0be6c084f6..9abc008ff7ea 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -24,6 +24,7 @@
>  #include <linux/module.h>
>  #include <linux/hrtimer.h>
>  #include <linux/kmemleak.h>
> +#include <linux/dma-mapping.h>
>  
>  #ifdef DEBUG
>  /* For development, we want to crash whenever the ring is screwed. */
> @@ -54,6 +55,11 @@
>  #define END_USE(vq)
>  #endif
>  
> +struct vring_desc_state {
> +	void *data;			/* Data for callback. */
> +	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
> +};
> +
>  struct vring_virtqueue {
>  	struct virtqueue vq;
>  
> @@ -98,8 +104,8 @@ struct vring_virtqueue {
>  	ktime_t last_add_time;
>  #endif
>  
> -	/* Tokens for callbacks. */
> -	void *data[];
> +	/* Per-descriptor state. */
> +	struct vring_desc_state desc_state[];
>  };
>  
>  #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
> @@ -128,6 +134,79 @@ static bool vring_use_dma_api(struct virtio_device *vdev)
>  	return false;
>  }
>  
> +/*
> + * The DMA ops on various arches are rather gnarly right now, and
> + * making all of the arch DMA ops work on the vring device itself
> + * is a mess.  For now, we use the parent device for DMA ops.
> + */
> +struct device *vring_dma_dev(const struct vring_virtqueue *vq)
> +{
> +	return vq->vq.vdev->dev.parent;
> +}
> +
> +/* Map one sg entry. */
> +static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq,
> +				   struct scatterlist *sg,
> +				   enum dma_data_direction direction)
> +{
> +	if (!vring_use_dma_api(vq->vq.vdev))
> +		return (dma_addr_t)sg_phys(sg);
> +
> +	/*
> +	 * We can't use dma_map_sg, because we don't use scatterlists in
> +	 * the way it expects (we don't guarantee that the scatterlist
> +	 * will exist for the lifetime of the mapping).
> +	 */
> +	return dma_map_page(vring_dma_dev(vq),
> +			    sg_page(sg), sg->offset, sg->length,
> +			    direction);
> +}
> +
> +static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
> +				   void *cpu_addr, size_t size,
> +				   enum dma_data_direction direction)
> +{
> +	if (!vring_use_dma_api(vq->vq.vdev))
> +		return (dma_addr_t)virt_to_phys(cpu_addr);
> +
> +	return dma_map_single(vring_dma_dev(vq),
> +			      cpu_addr, size, direction);
> +}
> +
> +static void vring_unmap_one(const struct vring_virtqueue *vq,
> +			    struct vring_desc *desc)
> +{
> +	u16 flags;
> +
> +	if (!vring_use_dma_api(vq->vq.vdev))
> +		return;
> +
> +	flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
> +
> +	if (flags & VRING_DESC_F_INDIRECT) {
> +		dma_unmap_single(vring_dma_dev(vq),
> +				 virtio64_to_cpu(vq->vq.vdev, desc->addr),
> +				 virtio32_to_cpu(vq->vq.vdev, desc->len),
> +				 (flags & VRING_DESC_F_WRITE) ?
> +				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
> +	} else {
> +		dma_unmap_page(vring_dma_dev(vq),
> +			       virtio64_to_cpu(vq->vq.vdev, desc->addr),
> +			       virtio32_to_cpu(vq->vq.vdev, desc->len),
> +			       (flags & VRING_DESC_F_WRITE) ?
> +			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
> +	}
> +}
> +
> +static int vring_mapping_error(const struct vring_virtqueue *vq,
> +			       dma_addr_t addr)
> +{
> +	if (!vring_use_dma_api(vq->vq.vdev))
> +		return 0;
> +
> +	return dma_mapping_error(vring_dma_dev(vq), addr);
> +}
> +
>  static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
>  					 unsigned int total_sg, gfp_t gfp)
>  {
> @@ -161,7 +240,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
>  	struct vring_virtqueue *vq = to_vvq(_vq);
>  	struct scatterlist *sg;
>  	struct vring_desc *desc;
> -	unsigned int i, n, avail, descs_used, uninitialized_var(prev);
> +	unsigned int i, n, avail, descs_used, uninitialized_var(prev), err_idx;
>  	int head;
>  	bool indirect;
>  
> @@ -201,21 +280,15 @@ static inline int virtqueue_add(struct virtqueue *_vq,
>  
>  	if (desc) {
>  		/* Use a single buffer which doesn't continue */
> -		vq->vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT);
> -		vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, virt_to_phys(desc));
> -		/* avoid kmemleak false positive (hidden by virt_to_phys) */
> -		kmemleak_ignore(desc);
> -		vq->vring.desc[head].len = cpu_to_virtio32(_vq->vdev, total_sg * sizeof(struct vring_desc));
> -
> +		indirect = true;
>  		/* Set up rest to use this indirect table. */
>  		i = 0;
>  		descs_used = 1;
> -		indirect = true;
>  	} else {
> +		indirect = false;
>  		desc = vq->vring.desc;
>  		i = head;
>  		descs_used = total_sg;
> -		indirect = false;
>  	}
>  
>  	if (vq->vq.num_free < descs_used) {
> @@ -230,13 +303,14 @@ static inline int virtqueue_add(struct virtqueue *_vq,
>  		return -ENOSPC;
>  	}
>  
> -	/* We're about to use some buffers from the free list. */
> -	vq->vq.num_free -= descs_used;
> -
>  	for (n = 0; n < out_sgs; n++) {
>  		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> +			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE);
> +			if (vring_mapping_error(vq, addr))
> +				goto unmap_release;
> +
>  			desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT);
> -			desc[i].addr = cpu_to_virtio64(_vq->vdev, sg_phys(sg));
> +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
>  			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
>  			prev = i;
>  			i = virtio16_to_cpu(_vq->vdev, desc[i].next);
> @@ -244,8 +318,12 @@ static inline int virtqueue_add(struct virtqueue *_vq,
>  	}
>  	for (; n < (out_sgs + in_sgs); n++) {
>  		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> +			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
> +			if (vring_mapping_error(vq, addr))
> +				goto unmap_release;
> +
>  			desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE);
> -			desc[i].addr = cpu_to_virtio64(_vq->vdev, sg_phys(sg));
> +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
>  			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
>  			prev = i;
>  			i = virtio16_to_cpu(_vq->vdev, desc[i].next);
> @@ -254,14 +332,33 @@ static inline int virtqueue_add(struct virtqueue *_vq,
>  	/* Last one doesn't continue. */
>  	desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
>  
> +	if (indirect) {
> +		/* Now that the indirect table is filled in, map it. */
> +		dma_addr_t addr = vring_map_single(
> +			vq, desc, total_sg * sizeof(struct vring_desc),
> +			DMA_TO_DEVICE);
> +		if (vring_mapping_error(vq, addr))
> +			goto unmap_release;
> +
> +		vq->vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT);
> +		vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);
> +
> +		vq->vring.desc[head].len = cpu_to_virtio32(_vq->vdev, total_sg * sizeof(struct vring_desc));
> +	}
> +
> +	/* We're using some buffers from the free list. */
> +	vq->vq.num_free -= descs_used;
> +
>  	/* Update free pointer */
>  	if (indirect)
>  		vq->free_head = virtio16_to_cpu(_vq->vdev, vq->vring.desc[head].next);
>  	else
>  		vq->free_head = i;
>  
> -	/* Set token. */
> -	vq->data[head] = data;
> +	/* Store token and indirect buffer state. */
> +	vq->desc_state[head].data = data;
> +	if (indirect)
> +		vq->desc_state[head].indir_desc = desc;
>  
>  	/* Put entry in available array (but don't update avail->idx until they
>  	 * do sync). */
> @@ -284,6 +381,24 @@ static inline int virtqueue_add(struct virtqueue *_vq,
>  		virtqueue_kick(_vq);
>  
>  	return 0;
> +
> +unmap_release:
> +	err_idx = i;
> +	i = head;
> +
> +	for (n = 0; n < total_sg; n++) {
> +		if (i == err_idx)
> +			break;
> +		vring_unmap_one(vq, &desc[i]);
> +		i = vq->vring.desc[i].next;
> +	}
> +
> +	vq->vq.num_free += total_sg;
> +
> +	if (indirect)
> +		kfree(desc);
> +
> +	return -EIO;
>  }
>  
>  /**
> @@ -454,27 +569,43 @@ EXPORT_SYMBOL_GPL(virtqueue_kick);
>  
>  static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
>  {
> -	unsigned int i;
> +	unsigned int i, j;
> +	u16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
>  
>  	/* Clear data ptr. */
> -	vq->data[head] = NULL;
> +	vq->desc_state[head].data = NULL;
>  
> -	/* Put back on free list: find end */
> +	/* Put back on free list: unmap first-level descriptors and find end */
>  	i = head;
>  
> -	/* Free the indirect table */
> -	if (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT))
> -		kfree(phys_to_virt(virtio64_to_cpu(vq->vq.vdev, vq->vring.desc[i].addr)));
> -
> -	while (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT)) {
> +	while (vq->vring.desc[i].flags & nextflag) {
> +		vring_unmap_one(vq, &vq->vring.desc[i]);
>  		i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
>  		vq->vq.num_free++;
>  	}
>  
> +	vring_unmap_one(vq, &vq->vring.desc[i]);
>  	vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
>  	vq->free_head = head;
> +
>  	/* Plus final descriptor */
>  	vq->vq.num_free++;
> +
> +	/* Free the indirect table, if any, now that it's unmapped. */
> +	if (vq->desc_state[head].indir_desc) {
> +		struct vring_desc *indir_desc = vq->desc_state[head].indir_desc;
> +		u32 len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len);
> +
> +		BUG_ON(!(vq->vring.desc[head].flags &
> +			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
> +		BUG_ON(len == 0 || len % sizeof(struct vring_desc));
> +
> +		for (j = 0; j < len / sizeof(struct vring_desc); j++)
> +			vring_unmap_one(vq, &indir_desc[j]);
> +
> +		kfree(vq->desc_state[head].indir_desc);
> +		vq->desc_state[head].indir_desc = NULL;
> +	}
>  }
>  
>  static inline bool more_used(const struct vring_virtqueue *vq)
> @@ -529,13 +660,13 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
>  		BAD_RING(vq, "id %u out of range\n", i);
>  		return NULL;
>  	}
> -	if (unlikely(!vq->data[i])) {
> +	if (unlikely(!vq->desc_state[i].data)) {
>  		BAD_RING(vq, "id %u is not a head!\n", i);
>  		return NULL;
>  	}
>  
>  	/* detach_buf clears data, so grab it now. */
> -	ret = vq->data[i];
> +	ret = vq->desc_state[i].data;
>  	detach_buf(vq, i);
>  	vq->last_used_idx++;
>  	/* If we expect an interrupt for the next entry, tell host
> @@ -709,10 +840,10 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
>  	START_USE(vq);
>  
>  	for (i = 0; i < vq->vring.num; i++) {
> -		if (!vq->data[i])
> +		if (!vq->desc_state[i].data)
>  			continue;
>  		/* detach_buf clears data, so grab it now. */
> -		buf = vq->data[i];
> +		buf = vq->desc_state[i].data;
>  		detach_buf(vq, i);
>  		vq->avail_idx_shadow--;
>  		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
> @@ -766,7 +897,8 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
>  		return NULL;
>  	}
>  
> -	vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL);
> +	vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state),
> +		     GFP_KERNEL);
>  	if (!vq)
>  		return NULL;
>  
> @@ -800,11 +932,9 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
>  
>  	/* Put everything in free lists. */
>  	vq->free_head = 0;
> -	for (i = 0; i < num-1; i++) {
> +	for (i = 0; i < num-1; i++)
>  		vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
> -		vq->data[i] = NULL;
> -	}
> -	vq->data[i] = NULL;
> +	memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state));
>  
>  	return &vq->vq;
>  }
> diff --git a/tools/virtio/linux/dma-mapping.h b/tools/virtio/linux/dma-mapping.h
> new file mode 100644
> index 000000000000..4f93af89ae16
> --- /dev/null
> +++ b/tools/virtio/linux/dma-mapping.h
> @@ -0,0 +1,17 @@
> +#ifndef _LINUX_DMA_MAPPING_H
> +#define _LINUX_DMA_MAPPING_H
> +
> +#ifdef CONFIG_HAS_DMA
> +# error Virtio userspace code does not support CONFIG_HAS_DMA
> +#endif
> +
> +#define PCI_DMA_BUS_IS_PHYS 1
> +
> +enum dma_data_direction {
> +	DMA_BIDIRECTIONAL = 0,
> +	DMA_TO_DEVICE = 1,
> +	DMA_FROM_DEVICE = 2,
> +	DMA_NONE = 3,
> +};
> +
> +#endif
> -- 
> 2.5.0
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Kernel Development]     [DCCP]     [Linux ARM Development]     [Linux]     [Photo]     [Yosemite Help]     [Linux ARM Kernel]     [Linux SCSI]     [Linux x86_64]     [Linux Hams]

  Powered by Linux