On Wed, Aug 27, 2014 at 09:29:36AM +0200, Christian Borntraeger wrote:
> On 26/08/14 23:17, Andy Lutomirski wrote:
> > virtio_ring currently sends the device (usually a hypervisor)
> > physical addresses of its I/O buffers.  This is okay when DMA
> > addresses and physical addresses are the same thing, but this isn't
> > always the case.  For example, this never works on Xen guests, and
> > it is likely to fail if a physical "virtio" device ever ends up
> > behind an IOMMU or swiotlb.
> >
> > The immediate use case for me is to enable virtio on Xen guests.
> > For that to work, we need this fix as well as a corresponding
> > fix to virtio_pci or to another driver.
> >
> > With this patch, virtfs survives kmemleak and CONFIG_DMA_API_DEBUG.
> > virtio-net warns (correctly) about DMA from the stack in
> > virtnet_set_rx_mode.
> >
> > This breaks s390's defconfig.  The default configuration for s390
> > does virtio through a KVM-specific interface, but that configuration
> > does not support DMA.  I could modify this patch to stub out the DMA
> > API calls if !CONFIG_HAS_DMA, but it seems to me that it would be
> > much nicer to make s390 support DMA unconditionally.
>
> s390 has no DMA per se.  Newest systems have a PCI-like I/O attach in
> addition to the classic channel I/O, and for that transport alone we
> enable the DMA code, to be able to reuse some of the existing PCI
> drivers (only some, because we differ in some aspects from how PCI
> looks).  But the architecture itself (and the virtio interface) does
> not provide the DMA interface as you know it.

Don't most of those DMA_API calls then end up being nops? As in, we do
have in the dma-api file the #ifdef case for when a platform does not
do DMA, which ends up with all of the functions stubbed out.
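Roughly what I have in mind - a hand-wavy sketch with made-up nommu_*
names, not any particular architecture's implementation - is that on a
platform with no IOMMU and no swiotlb, the map/unmap pair degenerates
to a 1:1 translation and a nop:

        /* Sketch only: DMA address == physical address. */
        static inline dma_addr_t nommu_map_page(struct device *dev,
                                                struct page *page,
                                                unsigned long offset,
                                                size_t size,
                                                enum dma_data_direction dir)
        {
                return page_to_phys(page) + offset;
        }

        static inline void nommu_unmap_page(struct device *dev,
                                            dma_addr_t dma_addr, size_t size,
                                            enum dma_data_direction dir)
        {
                /* Nothing was set up, so there is nothing to undo. */
        }

If the non-PCI transports on s390 grew dma_map_ops along these lines,
the ring changes below ought to compile there and cost next to nothing.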
> In essence this patch is a no-go for s390.

Is that due to possible compilation breakage?

> I am also asking myself if this makes virtio-ring more expensive.
>
> Christian
>
> > It's actually unclear to me whether this can be fixed without either
> > s390 arch support or a special case for s390 in virtio_ring: on
> > brief inspection of s390's DMA code, it seems to assume that
> > everything goes through a PCI IOMMU, which is presumably not the
> > case for virtio.
> >
> > Cc: Cornelia Huck <cornelia.huck@xxxxxxxxxx>
> > Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxxxxxx>
> > ---
> >  drivers/virtio/Kconfig       |   1 +
> >  drivers/virtio/virtio_ring.c | 116 +++++++++++++++++++++++++++++++++++--------
> >  2 files changed, 95 insertions(+), 22 deletions(-)
> >
> > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> > index c6683f2e396c..b6f3acc61153 100644
> > --- a/drivers/virtio/Kconfig
> > +++ b/drivers/virtio/Kconfig
> > @@ -1,5 +1,6 @@
> >  config VIRTIO
> >          tristate
> > +        depends on HAS_DMA
> >          ---help---
> >            This option is selected by any driver which implements the virtio
> >            bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_LGUEST,
> >
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index d356a701c9c2..6a78e2fd8e63 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -24,6 +24,7 @@
> >  #include <linux/module.h>
> >  #include <linux/hrtimer.h>
> >  #include <linux/kmemleak.h>
> > +#include <linux/dma-mapping.h>
> >
> >  #ifdef DEBUG
> >  /* For development, we want to crash whenever the ring is screwed. */
> > @@ -54,6 +55,12 @@
> >  #define END_USE(vq)
> >  #endif
> >
> > +struct vring_desc_state
> > +{
> > +        void *data;                        /* Data for callback. */
> > +        struct vring_desc *indir_desc;        /* Indirect descriptor, if any. */
> > +};
> > +
> >  struct vring_virtqueue
> >  {
> >          struct virtqueue vq;
> > @@ -93,12 +100,45 @@ struct vring_virtqueue
> >          ktime_t last_add_time;
> >  #endif
> >
> > -        /* Tokens for callbacks. */
> > -        void *data[];
> > +        /* Per-descriptor state. */
> > +        struct vring_desc_state desc_state[];
> >  };
> >
> >  #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
> >
> > +/* Helper to map one sg entry, since we can't use dma_map_sg. */
> > +static dma_addr_t dma_map_one_sg(struct vring_virtqueue *vq,
> > +                                 struct scatterlist *sg,
> > +                                 enum dma_data_direction direction)
> > +{
> > +        return dma_map_page(vq->vq.vdev->dev.parent,
> > +                            sg_page(sg), sg->offset, sg->length, direction);
> > +}
> > +
> > +static void unmap_one(struct vring_virtqueue *vq, struct vring_desc *desc)
> > +{
> > +        if (desc->flags & VRING_DESC_F_INDIRECT) {
> > +                dma_unmap_single(vq->vq.vdev->dev.parent,
> > +                                 desc->addr, desc->len,
> > +                                 (desc->flags & VRING_DESC_F_WRITE) ?
> > +                                 DMA_FROM_DEVICE : DMA_TO_DEVICE);
> > +        } else {
> > +                dma_unmap_page(vq->vq.vdev->dev.parent,
> > +                               desc->addr, desc->len,
> > +                               (desc->flags & VRING_DESC_F_WRITE) ?
> > +                               DMA_FROM_DEVICE : DMA_TO_DEVICE);
> > +        }
> > +}
> > +
> > +static void unmap_indirect(struct vring_virtqueue *vq, struct vring_desc *desc,
> > +                           int total)
> > +{
> > +        int i;
> > +
> > +        for (i = 0; i < total; i++)
> > +                unmap_one(vq, &desc[i]);
> > +}
> > +
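A side note on the "since we can't use dma_map_sg" comment above:
dma_map_sg() may coalesce entries behind an IOMMU and has to be
unmapped through the same scatterlist as one unit, while the ring needs
one address per descriptor that detach_buf() can later unmap
individually, long after the scatterlist is gone. A rough model of the
per-entry pattern - dev, ring and the helper name here are illustrative
only, not part of the patch:

        /* Needs <linux/scatterlist.h> and <linux/dma-mapping.h>.
         * Map each sg entry on its own so each can be unmapped alone. */
        static int map_each_entry(struct device *dev, struct scatterlist *sgl,
                                  unsigned int nents, struct vring_desc *ring)
        {
                struct scatterlist *sg;
                unsigned int n;

                for_each_sg(sgl, sg, nents, n) {
                        dma_addr_t addr = dma_map_page(dev, sg_page(sg),
                                                       sg->offset, sg->length,
                                                       DMA_TO_DEVICE);
                        if (dma_mapping_error(dev, addr)) {
                                /* Unwind only the n entries mapped so far. */
                                while (n--)
                                        dma_unmap_page(dev, ring[n].addr,
                                                       ring[n].len,
                                                       DMA_TO_DEVICE);
                                return -ENOMEM;
                        }
                        ring[n].addr = addr;
                        ring[n].len = sg->length;
                }
                return 0;
        }

The unmap_free unwind in vring_add_indirect() below has exactly this
shape.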
> >  /* Set up an indirect table of descriptors and add it to the queue. */
> >  static inline int vring_add_indirect(struct vring_virtqueue *vq,
> >                                       struct scatterlist *sgs[],
> > @@ -132,7 +172,11 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
> >                          if (!sg)
> >                                  break;
> >                          desc[i].flags = VRING_DESC_F_NEXT;
> > -                        desc[i].addr = sg_phys(sg);
> > +                        desc[i].addr =
> > +                                dma_map_one_sg(vq, sg, DMA_TO_DEVICE);
> > +                        if (dma_mapping_error(vq->vq.vdev->dev.parent,
> > +                                              desc[i].addr))
> > +                                goto unmap_free;
> >                          desc[i].len = sg->length;
> >                          desc[i].next = i+1;
> >                          i++;
> > @@ -143,7 +187,11 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
> >                          if (!sg)
> >                                  break;
> >                          desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
> > -                        desc[i].addr = sg_phys(sg);
> > +                        desc[i].addr =
> > +                                dma_map_one_sg(vq, sg, DMA_FROM_DEVICE);
> > +                        if (dma_mapping_error(vq->vq.vdev->dev.parent,
> > +                                              desc[i].addr))
> > +                                goto unmap_free;
> >                          desc[i].len = sg->length;
> >                          desc[i].next = i+1;
> >                          i++;
> > @@ -161,15 +209,27 @@ static inline int vring_add_indirect(struct vring_virtqueue *vq,
> >          /* Use a single buffer which doesn't continue */
> >          head = vq->free_head;
> >          vq->vring.desc[head].flags = VRING_DESC_F_INDIRECT;
> > -        vq->vring.desc[head].addr = virt_to_phys(desc);
> > -        /* kmemleak gives a false positive, as it's hidden by virt_to_phys */
> > -        kmemleak_ignore(desc);
> > +        vq->vring.desc[head].addr =
> > +                dma_map_single(vq->vq.vdev->dev.parent,
> > +                               desc, i * sizeof(struct vring_desc),
> > +                               DMA_TO_DEVICE);
> > +        if (dma_mapping_error(vq->vq.vdev->dev.parent,
> > +                              vq->vring.desc[head].addr))
> > +                goto unmap_free;
> >          vq->vring.desc[head].len = i * sizeof(struct vring_desc);
> >
> >          /* Update free pointer */
> >          vq->free_head = vq->vring.desc[head].next;
> >
> > +        /* Save the indirect block */
> > +        vq->desc_state[head].indir_desc = desc;
> > +
> >          return head;
> > +
> > +unmap_free:
> > +        unmap_indirect(vq, desc, i);
> > +        kfree(desc);
> > +        return -ENOMEM;
> >  }
> >
> >  static inline int virtqueue_add(struct virtqueue *_vq,
> > @@ -244,7 +304,8 @@ static inline int virtqueue_add(struct virtqueue *_vq,
> >                          if (!sg)
> >                                  break;
> >                          vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
> > -                        vq->vring.desc[i].addr = sg_phys(sg);
> > +                        vq->vring.desc[i].addr =
> > +                                dma_map_one_sg(vq, sg, DMA_TO_DEVICE);
> >                          vq->vring.desc[i].len = sg->length;
> >                          prev = i;
> >                          i = vq->vring.desc[i].next;
> > @@ -255,7 +316,8 @@ static inline int virtqueue_add(struct virtqueue *_vq,
> >                          if (!sg)
> >                                  break;
> >                          vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
> > -                        vq->vring.desc[i].addr = sg_phys(sg);
> > +                        vq->vring.desc[i].addr =
> > +                                dma_map_one_sg(vq, sg, DMA_FROM_DEVICE);
> >                          vq->vring.desc[i].len = sg->length;
> >                          prev = i;
> >                          i = vq->vring.desc[i].next;
> > @@ -269,7 +331,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
> >
> >  add_head:
> >          /* Set token. */
> > -        vq->data[head] = data;
> > +        vq->desc_state[head].data = data;
> >
> >          /* Put entry in available array (but don't update avail->idx until they
> >           * do sync). */
> > @@ -470,22 +532,33 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
> >          unsigned int i;
> >
> >          /* Clear data ptr. */
> > -        vq->data[head] = NULL;
> > +        vq->desc_state[head].data = NULL;
> >
> >          /* Put back on free list: find end */
> >          i = head;
> >
> >          /* Free the indirect table */
> > -        if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT)
> > -                kfree(phys_to_virt(vq->vring.desc[i].addr));
> > +        if (vq->desc_state[head].indir_desc) {
> > +                u32 len = vq->vring.desc[i].len;
> > +
> > +                BUG_ON(!(vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT));
> > +                BUG_ON(len == 0 || len % sizeof(struct vring_desc));
> > +                unmap_indirect(vq, vq->desc_state[head].indir_desc,
> > +                               len / sizeof(struct vring_desc));
> > +                kfree(vq->desc_state[head].indir_desc);
> > +                vq->desc_state[head].indir_desc = NULL;
> > +        }
> >
> >          while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) {
> > +                unmap_one(vq, &vq->vring.desc[i]);
> >                  i = vq->vring.desc[i].next;
> >                  vq->vq.num_free++;
> >          }
> >
> > +        unmap_one(vq, &vq->vring.desc[i]);
> >          vq->vring.desc[i].next = vq->free_head;
> >          vq->free_head = head;
> > +
> >          /* Plus final descriptor */
> >          vq->vq.num_free++;
> >  }
> > @@ -542,13 +615,13 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
> >                  BAD_RING(vq, "id %u out of range\n", i);
> >                  return NULL;
> >          }
> > -        if (unlikely(!vq->data[i])) {
> > +        if (unlikely(!vq->desc_state[i].data)) {
> >                  BAD_RING(vq, "id %u is not a head!\n", i);
> >                  return NULL;
> >          }
> >
> >          /* detach_buf clears data, so grab it now. */
> > -        ret = vq->data[i];
> > +        ret = vq->desc_state[i].data;
> >          detach_buf(vq, i);
> >          vq->last_used_idx++;
> >          /* If we expect an interrupt for the next entry, tell host
> > @@ -709,10 +782,10 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
> >          START_USE(vq);
> >
> >          for (i = 0; i < vq->vring.num; i++) {
> > -                if (!vq->data[i])
> > +                if (!vq->desc_state[i].data)
> >                          continue;
> >                  /* detach_buf clears data, so grab it now. */
> > -                buf = vq->data[i];
> > +                buf = vq->desc_state[i].data;
> >                  detach_buf(vq, i);
> >                  vq->vring.avail->idx--;
> >                  END_USE(vq);
> > @@ -765,7 +838,8 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
> >                  return NULL;
> >          }
> >
> > -        vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL);
> > +        vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state),
> > +                     GFP_KERNEL);
> >          if (!vq)
> >                  return NULL;
> >
> > @@ -795,11 +869,9 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
> >
> >          /* Put everything in free lists. */
> >          vq->free_head = 0;
> > -        for (i = 0; i < num-1; i++) {
> > +        for (i = 0; i < num-1; i++)
> >                  vq->vring.desc[i].next = i+1;
> > -                vq->data[i] = NULL;
> > -        }
> > -        vq->data[i] = NULL;
> > +        memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state));
> >
> >          return &vq->vq;
> >  }
>

_______________________________________________
Virtualization mailing list
Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx
https://lists.linuxfoundation.org/mailman/listinfo/virtualization