On Tue, Feb 02, 2016 at 09:46:36PM -0800, Andy Lutomirski wrote: > virtio_ring currently sends the device (usually a hypervisor) > physical addresses of its I/O buffers. This is okay when DMA > addresses and physical addresses are the same thing, but this isn't > always the case. For example, this never works on Xen guests, and > it is likely to fail if a physical "virtio" device ever ends up > behind an IOMMU or swiotlb. > > The immediate use case for me is to enable virtio on Xen guests. > For that to work, we need vring to support DMA address translation > as well as a corresponding change to virtio_pci or to another > driver. > > Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxx> > --- > drivers/virtio/Kconfig | 2 +- > drivers/virtio/virtio_ring.c | 200 ++++++++++++++++++++++++++++++++------- > tools/virtio/linux/dma-mapping.h | 17 ++++ > 3 files changed, 183 insertions(+), 36 deletions(-) > create mode 100644 tools/virtio/linux/dma-mapping.h > > diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig > index cab9f3f63a38..77590320d44c 100644 > --- a/drivers/virtio/Kconfig > +++ b/drivers/virtio/Kconfig > @@ -60,7 +60,7 @@ config VIRTIO_INPUT > > config VIRTIO_MMIO > tristate "Platform bus driver for memory mapped virtio devices" > - depends on HAS_IOMEM > + depends on HAS_IOMEM && HAS_DMA > select VIRTIO > ---help--- > This drivers provides support for memory mapped virtio What's this chunk doing here btw? Should be part of the mmio patch? > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > index ab0be6c084f6..9abc008ff7ea 100644 > --- a/drivers/virtio/virtio_ring.c > +++ b/drivers/virtio/virtio_ring.c > @@ -24,6 +24,7 @@ > #include <linux/module.h> > #include <linux/hrtimer.h> > #include <linux/kmemleak.h> > +#include <linux/dma-mapping.h> > > #ifdef DEBUG > /* For development, we want to crash whenever the ring is screwed. */ > @@ -54,6 +55,11 @@ > #define END_USE(vq) > #endif > > +struct vring_desc_state { > + void *data; /* Data for callback. */ > + struct vring_desc *indir_desc; /* Indirect descriptor, if any. */ > +}; > + > struct vring_virtqueue { > struct virtqueue vq; > > @@ -98,8 +104,8 @@ struct vring_virtqueue { > ktime_t last_add_time; > #endif > > - /* Tokens for callbacks. */ > - void *data[]; > + /* Per-descriptor state. */ > + struct vring_desc_state desc_state[]; > }; > > #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) > @@ -128,6 +134,79 @@ static bool vring_use_dma_api(struct virtio_device *vdev) > return false; > } > > +/* > + * The DMA ops on various arches are rather gnarly right now, and > + * making all of the arch DMA ops work on the vring device itself > + * is a mess. For now, we use the parent device for DMA ops. > + */ > +struct device *vring_dma_dev(const struct vring_virtqueue *vq) > +{ > + return vq->vq.vdev->dev.parent; > +} > + > +/* Map one sg entry. */ > +static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq, > + struct scatterlist *sg, > + enum dma_data_direction direction) > +{ > + if (!vring_use_dma_api(vq->vq.vdev)) > + return (dma_addr_t)sg_phys(sg); > + > + /* > + * We can't use dma_map_sg, because we don't use scatterlists in > + * the way it expects (we don't guarantee that the scatterlist > + * will exist for the lifetime of the mapping). > + */ > + return dma_map_page(vring_dma_dev(vq), > + sg_page(sg), sg->offset, sg->length, > + direction); > +} > + > +static dma_addr_t vring_map_single(const struct vring_virtqueue *vq, > + void *cpu_addr, size_t size, > + enum dma_data_direction direction) > +{ > + if (!vring_use_dma_api(vq->vq.vdev)) > + return (dma_addr_t)virt_to_phys(cpu_addr); > + > + return dma_map_single(vring_dma_dev(vq), > + cpu_addr, size, direction); > +} > + > +static void vring_unmap_one(const struct vring_virtqueue *vq, > + struct vring_desc *desc) > +{ > + u16 flags; > + > + if (!vring_use_dma_api(vq->vq.vdev)) > + return; > + > + flags = virtio16_to_cpu(vq->vq.vdev, desc->flags); > + > + if (flags & VRING_DESC_F_INDIRECT) { > + dma_unmap_single(vring_dma_dev(vq), > + virtio64_to_cpu(vq->vq.vdev, desc->addr), > + virtio32_to_cpu(vq->vq.vdev, desc->len), > + (flags & VRING_DESC_F_WRITE) ? > + DMA_FROM_DEVICE : DMA_TO_DEVICE); > + } else { > + dma_unmap_page(vring_dma_dev(vq), > + virtio64_to_cpu(vq->vq.vdev, desc->addr), > + virtio32_to_cpu(vq->vq.vdev, desc->len), > + (flags & VRING_DESC_F_WRITE) ? > + DMA_FROM_DEVICE : DMA_TO_DEVICE); > + } > +} > + > +static int vring_mapping_error(const struct vring_virtqueue *vq, > + dma_addr_t addr) > +{ > + if (!vring_use_dma_api(vq->vq.vdev)) > + return 0; > + > + return dma_mapping_error(vring_dma_dev(vq), addr); > +} > + > static struct vring_desc *alloc_indirect(struct virtqueue *_vq, > unsigned int total_sg, gfp_t gfp) > { > @@ -161,7 +240,7 @@ static inline int virtqueue_add(struct virtqueue *_vq, > struct vring_virtqueue *vq = to_vvq(_vq); > struct scatterlist *sg; > struct vring_desc *desc; > - unsigned int i, n, avail, descs_used, uninitialized_var(prev); > + unsigned int i, n, avail, descs_used, uninitialized_var(prev), err_idx; > int head; > bool indirect; > > @@ -201,21 +280,15 @@ static inline int virtqueue_add(struct virtqueue *_vq, > > if (desc) { > /* Use a single buffer which doesn't continue */ > - vq->vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT); > - vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, virt_to_phys(desc)); > - /* avoid kmemleak false positive (hidden by virt_to_phys) */ > - kmemleak_ignore(desc); > - vq->vring.desc[head].len = cpu_to_virtio32(_vq->vdev, total_sg * sizeof(struct vring_desc)); > - > + indirect = true; > /* Set up rest to use this indirect table. */ > i = 0; > descs_used = 1; > - indirect = true; > } else { > + indirect = false; > desc = vq->vring.desc; > i = head; > descs_used = total_sg; > - indirect = false; > } > > if (vq->vq.num_free < descs_used) { > @@ -230,13 +303,14 @@ static inline int virtqueue_add(struct virtqueue *_vq, > return -ENOSPC; > } > > - /* We're about to use some buffers from the free list. */ > - vq->vq.num_free -= descs_used; > - > for (n = 0; n < out_sgs; n++) { > for (sg = sgs[n]; sg; sg = sg_next(sg)) { > + dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE); > + if (vring_mapping_error(vq, addr)) > + goto unmap_release; > + > desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT); > - desc[i].addr = cpu_to_virtio64(_vq->vdev, sg_phys(sg)); > + desc[i].addr = cpu_to_virtio64(_vq->vdev, addr); > desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length); > prev = i; > i = virtio16_to_cpu(_vq->vdev, desc[i].next); > @@ -244,8 +318,12 @@ static inline int virtqueue_add(struct virtqueue *_vq, > } > for (; n < (out_sgs + in_sgs); n++) { > for (sg = sgs[n]; sg; sg = sg_next(sg)) { > + dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE); > + if (vring_mapping_error(vq, addr)) > + goto unmap_release; > + > desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE); > - desc[i].addr = cpu_to_virtio64(_vq->vdev, sg_phys(sg)); > + desc[i].addr = cpu_to_virtio64(_vq->vdev, addr); > desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length); > prev = i; > i = virtio16_to_cpu(_vq->vdev, desc[i].next); > @@ -254,14 +332,33 @@ static inline int virtqueue_add(struct virtqueue *_vq, > /* Last one doesn't continue. */ > desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT); > > + if (indirect) { > + /* Now that the indirect table is filled in, map it. */ > + dma_addr_t addr = vring_map_single( > + vq, desc, total_sg * sizeof(struct vring_desc), > + DMA_TO_DEVICE); > + if (vring_mapping_error(vq, addr)) > + goto unmap_release; > + > + vq->vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT); > + vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr); > + > + vq->vring.desc[head].len = cpu_to_virtio32(_vq->vdev, total_sg * sizeof(struct vring_desc)); > + } > + > + /* We're using some buffers from the free list. */ > + vq->vq.num_free -= descs_used; > + > /* Update free pointer */ > if (indirect) > vq->free_head = virtio16_to_cpu(_vq->vdev, vq->vring.desc[head].next); > else > vq->free_head = i; > > - /* Set token. */ > - vq->data[head] = data; > + /* Store token and indirect buffer state. */ > + vq->desc_state[head].data = data; > + if (indirect) > + vq->desc_state[head].indir_desc = desc; > > /* Put entry in available array (but don't update avail->idx until they > * do sync). */ > @@ -284,6 +381,24 @@ static inline int virtqueue_add(struct virtqueue *_vq, > virtqueue_kick(_vq); > > return 0; > + > +unmap_release: > + err_idx = i; > + i = head; > + > + for (n = 0; n < total_sg; n++) { > + if (i == err_idx) > + break; > + vring_unmap_one(vq, &desc[i]); > + i = vq->vring.desc[i].next; > + } > + > + vq->vq.num_free += total_sg; > + > + if (indirect) > + kfree(desc); > + > + return -EIO; > } > > /** > @@ -454,27 +569,43 @@ EXPORT_SYMBOL_GPL(virtqueue_kick); > > static void detach_buf(struct vring_virtqueue *vq, unsigned int head) > { > - unsigned int i; > + unsigned int i, j; > + u16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT); > > /* Clear data ptr. */ > - vq->data[head] = NULL; > + vq->desc_state[head].data = NULL; > > - /* Put back on free list: find end */ > + /* Put back on free list: unmap first-level descriptors and find end */ > i = head; > > - /* Free the indirect table */ > - if (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)) > - kfree(phys_to_virt(virtio64_to_cpu(vq->vq.vdev, vq->vring.desc[i].addr))); > - > - while (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT)) { > + while (vq->vring.desc[i].flags & nextflag) { > + vring_unmap_one(vq, &vq->vring.desc[i]); > i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next); > vq->vq.num_free++; > } > > + vring_unmap_one(vq, &vq->vring.desc[i]); > vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head); > vq->free_head = head; > + > /* Plus final descriptor */ > vq->vq.num_free++; > + > + /* Free the indirect table, if any, now that it's unmapped. */ > + if (vq->desc_state[head].indir_desc) { > + struct vring_desc *indir_desc = vq->desc_state[head].indir_desc; > + u32 len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len); > + > + BUG_ON(!(vq->vring.desc[head].flags & > + cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT))); > + BUG_ON(len == 0 || len % sizeof(struct vring_desc)); > + > + for (j = 0; j < len / sizeof(struct vring_desc); j++) > + vring_unmap_one(vq, &indir_desc[j]); > + > + kfree(vq->desc_state[head].indir_desc); > + vq->desc_state[head].indir_desc = NULL; > + } > } > > static inline bool more_used(const struct vring_virtqueue *vq) > @@ -529,13 +660,13 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) > BAD_RING(vq, "id %u out of range\n", i); > return NULL; > } > - if (unlikely(!vq->data[i])) { > + if (unlikely(!vq->desc_state[i].data)) { > BAD_RING(vq, "id %u is not a head!\n", i); > return NULL; > } > > /* detach_buf clears data, so grab it now. */ > - ret = vq->data[i]; > + ret = vq->desc_state[i].data; > detach_buf(vq, i); > vq->last_used_idx++; > /* If we expect an interrupt for the next entry, tell host > @@ -709,10 +840,10 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq) > START_USE(vq); > > for (i = 0; i < vq->vring.num; i++) { > - if (!vq->data[i]) > + if (!vq->desc_state[i].data) > continue; > /* detach_buf clears data, so grab it now. */ > - buf = vq->data[i]; > + buf = vq->desc_state[i].data; > detach_buf(vq, i); > vq->avail_idx_shadow--; > vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow); > @@ -766,7 +897,8 @@ struct virtqueue *vring_new_virtqueue(unsigned int index, > return NULL; > } > > - vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL); > + vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state), > + GFP_KERNEL); > if (!vq) > return NULL; > > @@ -800,11 +932,9 @@ struct virtqueue *vring_new_virtqueue(unsigned int index, > > /* Put everything in free lists. */ > vq->free_head = 0; > - for (i = 0; i < num-1; i++) { > + for (i = 0; i < num-1; i++) > vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1); > - vq->data[i] = NULL; > - } > - vq->data[i] = NULL; > + memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state)); > > return &vq->vq; > } > diff --git a/tools/virtio/linux/dma-mapping.h b/tools/virtio/linux/dma-mapping.h > new file mode 100644 > index 000000000000..4f93af89ae16 > --- /dev/null > +++ b/tools/virtio/linux/dma-mapping.h > @@ -0,0 +1,17 @@ > +#ifndef _LINUX_DMA_MAPPING_H > +#define _LINUX_DMA_MAPPING_H > + > +#ifdef CONFIG_HAS_DMA > +# error Virtio userspace code does not support CONFIG_HAS_DMA > +#endif > + > +#define PCI_DMA_BUS_IS_PHYS 1 > + > +enum dma_data_direction { > + DMA_BIDIRECTIONAL = 0, > + DMA_TO_DEVICE = 1, > + DMA_FROM_DEVICE = 2, > + DMA_NONE = 3, > +}; > + > +#endif > -- > 2.5.0 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html