Re: [PATCH 2/3] virtio_ring: Use DMA APIs

On Wed, Aug 27, 2014 at 09:29:36AM +0200, Christian Borntraeger wrote:
> On 26/08/14 23:17, Andy Lutomirski wrote:
> > virtio_ring currently sends the device (usually a hypervisor)
> > physical addresses of its I/O buffers.  This is okay when DMA
> > addresses and physical addresses are the same thing, but this isn't
> > always the case.  For example, this never works on Xen guests, and
> > it is likely to fail if a physical "virtio" device ever ends up
> > behind an IOMMU or swiotlb.
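For illustration: the core of the change is that, instead of handing the
device a raw physical address, each buffer is mapped through the DMA API so
that any IOMMU or swiotlb translation is applied.  A minimal sketch, lifted
from the patch quoted below ("dev" here stands for the transport's parent
device and "direction" for the transfer direction):

	/* Before: the device sees guest-physical addresses directly. */
	desc[i].addr = sg_phys(sg);

	/* After: map through the DMA API; the returned dma_addr_t may differ
	 * from the physical address when an IOMMU or swiotlb sits in between. */
	desc[i].addr = dma_map_page(dev, sg_page(sg), sg->offset,
				    sg->length, direction);
	if (dma_mapping_error(dev, desc[i].addr))
		goto unmap_free;
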
> > 
> > The immediate use case for me is to enable virtio on Xen guests.
> > For that to work, we need this fix as well as a corresponding
> > fix to virtio_pci or to another driver.
> > 
> > With this patch, virtfs survives kmemleak and CONFIG_DMA_API_DEBUG.
> > virtio-net warns (correctly) about DMA from the stack in
> > virtnet_set_rx_mode.
> > 
> > This breaks s390's defconfig.  The default configuration for s390
> > does virtio through a KVM-specific interface, but that configuration
> > does not support DMA.  I could modify this patch to stub out the DMA
> > API calls if !CONFIG_HAS_DMA, but it seems to me that it would be
> > much nicer to make s390 support DMA unconditionally.
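A minimal sketch of the !CONFIG_HAS_DMA fallback mentioned above (not part
of this patch, purely an illustration): the patch's dma_map_one_sg() helper
could degrade to the old sg_phys() behaviour on platforms without a DMA API.

	static dma_addr_t dma_map_one_sg(struct vring_virtqueue *vq,
					 struct scatterlist *sg,
					 enum dma_data_direction direction)
	{
	#ifdef CONFIG_HAS_DMA
		/* Normal case: let the DMA API translate (IOMMU, swiotlb, ...). */
		return dma_map_page(vq->vq.vdev->dev.parent,
				    sg_page(sg), sg->offset, sg->length,
				    direction);
	#else
		/* No DMA API on this platform: fall back to physical addresses. */
		return sg_phys(sg);
	#endif
	}
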
> 
> s390 has no DMA per se. The newest systems have a PCI-like I/O attach in
> addition to the classic channel I/O, and for that transport we enable the
> DMA code so that some of the existing PCI drivers can be reused (only some,
> because we differ in several aspects from how PCI normally looks). But the
> architecture itself (and the virtio interface) does not provide the DMA
> interface as you know it:

Don't most of those DMA API calls then end up being no-ops? As in, we do
have in the dma-api file the #ifdef case for when a platform does not do
DMA, which ends up with all the functions stubbed out.
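For what it's worth, on a platform where bus addresses simply equal physical
addresses, such a stub is essentially an identity transform, so the
per-descriptor cost of going through the DMA API should be little more than
a function call.  A hypothetical no-op page mapping, for illustration only
(this is not actual s390 code):

	/* Hypothetical: map_page on a platform with no address translation. */
	static dma_addr_t noop_map_page(struct device *dev, struct page *page,
					unsigned long offset, size_t size,
					enum dma_data_direction dir)
	{
		/* bus address == physical address */
		return page_to_phys(page) + offset;
	}
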

> 
> In essence this patch is a no-go for s390.

Is that due to possible compilation breakage?
> 
> I am also asking myself whether this makes virtio-ring more expensive.
> 
> Christian
> 
> 
> 
> > 
> > It's actually unclear to me whether this can be fixed without either
> > s390 arch support or a special case for s390 in virtio_ring: on
> > brief inspection of s390's DMA code, it seems to assume that
> > everything goes through a PCI IOMMU, which is presumably not the
> > case for virtio.
> 
> 
> > 
> > Cc: Cornelia Huck <cornelia.huck@xxxxxxxxxx>
> > Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxxxxxx>
> > ---
> >  drivers/virtio/Kconfig       |   1 +
> >  drivers/virtio/virtio_ring.c | 116 +++++++++++++++++++++++++++++++++++--------
> >  2 files changed, 95 insertions(+), 22 deletions(-)
> > 
> > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > index c6683f2e396c..b6f3acc61153 100644
> > --- a/drivers/virtio/Kconfig
> > +++ b/drivers/virtio/Kconfig
> > @@ -1,5 +1,6 @@
> >  config VIRTIO
> >  	tristate
> > +	depends on HAS_DMA
> >  	---help---
> >  	  This option is selected by any driver which implements the virtio
> >  	  bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_LGUEST,
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index d356a701c9c2..6a78e2fd8e63 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -24,6 +24,7 @@
> >  #include <linux/module.h>
> >  #include <linux/hrtimer.h>
> >  #include <linux/kmemleak.h>
> > +#include <linux/dma-mapping.h>
> > 
> >  #ifdef DEBUG
> >  /* For development, we want to crash whenever the ring is screwed. */
> > @@ -54,6 +55,12 @@
> >  #define END_USE(vq)
> >  #endif
> > 
> > +struct vring_desc_state
> > +{
> > +	void *data;			/* Data for callback. */
> > +	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
> > +};
> > +
> >  struct vring_virtqueue
> >  {
> >  	struct virtqueue vq;
> > @@ -93,12 +100,45 @@ struct vring_virtqueue
> >  	ktime_t last_add_time;
> >  #endif
> > 
> > -	/* Tokens for callbacks. */
> > -	void *data[];
> > +	/* Per-descriptor state. */
> > +	struct vring_desc_state desc_state[];
> >  };
> > 
> >  #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
> > 
> > +/* Helper to map one sg entry, since we can't use dma_map_sg. */
> > +static dma_addr_t dma_map_one_sg(struct vring_virtqueue *vq,
> > +				 struct scatterlist *sg,
> > +				 enum dma_data_direction direction)
> > +{
> > +	return dma_map_page(vq->vq.vdev->dev.parent,
> > +			    sg_page(sg), sg->offset, sg->length, direction);
> > +}
> > +
> > +static void unmap_one(struct vring_virtqueue *vq, struct vring_desc *desc)
> > +{
> > +	if (desc->flags & VRING_DESC_F_INDIRECT) {
> > +		dma_unmap_single(vq->vq.vdev->dev.parent,
> > +				 desc->addr, desc->len,
> > +				 (desc->flags & VRING_DESC_F_WRITE) ?
> > +				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
> > +	} else {
> > +		dma_unmap_page(vq->vq.vdev->dev.parent,
> > +			       desc->addr, desc->len,
> > +			       (desc->flags & VRING_DESC_F_WRITE) ?
> > +			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
> > +	}
> > +}
> > +
> > +static void unmap_indirect(struct vring_virtqueue *vq, struct vring_desc *desc,
> > +			   int total)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; i < total; i++)
> > +		unmap_one(vq, &desc[i]);
> > +}
> > +
> >  /* Set up an indirect table of descriptors and add it to the queue. */
> >  static inline int vring_add_indirect(struct vring_virtqueue *vq,
> >  				     struct scatterlist *sgs[],
> > @@ -132,7 +172,11 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
> >  			if (!sg)
> >  				break;
> >  			desc[i].flags = VRING_DESC_F_NEXT;
> > -			desc[i].addr = sg_phys(sg);
> > +			desc[i].addr =
> > +				dma_map_one_sg(vq, sg, DMA_TO_DEVICE);
> > +			if (dma_mapping_error(vq->vq.vdev->dev.parent,
> > +					      desc[i].addr))
> > +				goto unmap_free;
> >  			desc[i].len = sg->length;
> >  			desc[i].next = i+1;
> >  			i++;
> > @@ -143,7 +187,11 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
> >  			if (!sg)
> >  				break;
> >  			desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
> > -			desc[i].addr = sg_phys(sg);
> > +			desc[i].addr =
> > +				dma_map_one_sg(vq, sg, DMA_FROM_DEVICE);
> > +			if (dma_mapping_error(vq->vq.vdev->dev.parent,
> > +					      desc[i].addr))
> > +				goto unmap_free;
> >  			desc[i].len = sg->length;
> >  			desc[i].next = i+1;
> >  			i++;
> > @@ -161,15 +209,27 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
> >  	/* Use a single buffer which doesn't continue */
> >  	head = vq->free_head;
> >  	vq->vring.desc[head].flags = VRING_DESC_F_INDIRECT;
> > -	vq->vring.desc[head].addr = virt_to_phys(desc);
> > -	/* kmemleak gives a false positive, as it's hidden by virt_to_phys */
> > -	kmemleak_ignore(desc);
> > +	vq->vring.desc[head].addr =
> > +		dma_map_single(vq->vq.vdev->dev.parent,
> > +			       desc, i * sizeof(struct vring_desc),
> > +			       DMA_TO_DEVICE);
> > +	if (dma_mapping_error(vq->vq.vdev->dev.parent,
> > +			      vq->vring.desc[head].addr))
> > +		goto unmap_free;
> >  	vq->vring.desc[head].len = i * sizeof(struct vring_desc);
> > 
> >  	/* Update free pointer */
> >  	vq->free_head = vq->vring.desc[head].next;
> > 
> > +	/* Save the indirect block */
> > +	vq->desc_state[head].indir_desc = desc;
> > +
> >  	return head;
> > +
> > +unmap_free:
> > +	unmap_indirect(vq, desc, i);
> > +	kfree(desc);
> > +	return -ENOMEM;
> >  }
> > 
> >  static inline int virtqueue_add(struct virtqueue *_vq,
> > @@ -244,7 +304,8 @@ static inline int virtqueue_add(struct virtqueue *_vq,
> >  			if (!sg)
> >  				break;
> >  			vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
> > -			vq->vring.desc[i].addr = sg_phys(sg);
> > +			vq->vring.desc[i].addr =
> > +				dma_map_one_sg(vq, sg, DMA_TO_DEVICE);
> >  			vq->vring.desc[i].len = sg->length;
> >  			prev = i;
> >  			i = vq->vring.desc[i].next;
> > @@ -255,7 +316,8 @@ static inline int virtqueue_add(struct virtqueue *_vq,
> >  			if (!sg)
> >  				break;
> >  			vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
> > -			vq->vring.desc[i].addr = sg_phys(sg);
> > +			vq->vring.desc[i].addr =
> > +				dma_map_one_sg(vq, sg, DMA_FROM_DEVICE);
> >  			vq->vring.desc[i].len = sg->length;
> >  			prev = i;
> >  			i = vq->vring.desc[i].next;
> > @@ -269,7 +331,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
> > 
> >  add_head:
> >  	/* Set token. */
> > -	vq->data[head] = data;
> > +	vq->desc_state[head].data = data;
> > 
> >  	/* Put entry in available array (but don't update avail->idx until they
> >  	 * do sync). */
> > @@ -470,22 +532,33 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
> >  	unsigned int i;
> > 
> >  	/* Clear data ptr. */
> > -	vq->data[head] = NULL;
> > +	vq->desc_state[head].data = NULL;
> > 
> >  	/* Put back on free list: find end */
> >  	i = head;
> > 
> >  	/* Free the indirect table */
> > -	if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT)
> > -		kfree(phys_to_virt(vq->vring.desc[i].addr));
> > +	if (vq->desc_state[head].indir_desc) {
> > +		u32 len = vq->vring.desc[i].len;
> > +
> > +		BUG_ON(!(vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT));
> > +		BUG_ON(len == 0 || len % sizeof(struct vring_desc));
> > +		unmap_indirect(vq, vq->desc_state[head].indir_desc,
> > +			       len / sizeof(struct vring_desc));
> > +		kfree(vq->desc_state[head].indir_desc);
> > +		vq->desc_state[head].indir_desc = NULL;
> > +	}
> > 
> >  	while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) {
> > +		unmap_one(vq, &vq->vring.desc[i]);
> >  		i = vq->vring.desc[i].next;
> >  		vq->vq.num_free++;
> >  	}
> > 
> > +	unmap_one(vq, &vq->vring.desc[i]);
> >  	vq->vring.desc[i].next = vq->free_head;
> >  	vq->free_head = head;
> > +
> >  	/* Plus final descriptor */
> >  	vq->vq.num_free++;
> >  }
> > @@ -542,13 +615,13 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
> >  		BAD_RING(vq, "id %u out of range\n", i);
> >  		return NULL;
> >  	}
> > -	if (unlikely(!vq->data[i])) {
> > +	if (unlikely(!vq->desc_state[i].data)) {
> >  		BAD_RING(vq, "id %u is not a head!\n", i);
> >  		return NULL;
> >  	}
> > 
> >  	/* detach_buf clears data, so grab it now. */
> > -	ret = vq->data[i];
> > +	ret = vq->desc_state[i].data;
> >  	detach_buf(vq, i);
> >  	vq->last_used_idx++;
> >  	/* If we expect an interrupt for the next entry, tell host
> > @@ -709,10 +782,10 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
> >  	START_USE(vq);
> > 
> >  	for (i = 0; i < vq->vring.num; i++) {
> > -		if (!vq->data[i])
> > +		if (!vq->desc_state[i].data)
> >  			continue;
> >  		/* detach_buf clears data, so grab it now. */
> > -		buf = vq->data[i];
> > +		buf = vq->desc_state[i].data;
> >  		detach_buf(vq, i);
> >  		vq->vring.avail->idx--;
> >  		END_USE(vq);
> > @@ -765,7 +838,8 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
> >  		return NULL;
> >  	}
> > 
> > -	vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL);
> > +	vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state),
> > +		     GFP_KERNEL);
> >  	if (!vq)
> >  		return NULL;
> > 
> > @@ -795,11 +869,9 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
> > 
> >  	/* Put everything in free lists. */
> >  	vq->free_head = 0;
> > -	for (i = 0; i < num-1; i++) {
> > +	for (i = 0; i < num-1; i++)
> >  		vq->vring.desc[i].next = i+1;
> > -		vq->data[i] = NULL;
> > -	}
> > -	vq->data[i] = NULL;
> > +	memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state));
> > 
> >  	return &vq->vq;
> >  }
> > 
> 
_______________________________________________
Virtualization mailing list
Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx
https://lists.linuxfoundation.org/mailman/listinfo/virtualization



