On Thu, Mar 25, 2021 at 12:53 PM Jason Wang <jasowang@xxxxxxxxxx> wrote:
>
>
> On 2021/3/24 3:39 PM, Yongji Xie wrote:
> > On Wed, Mar 24, 2021 at 11:54 AM Jason Wang <jasowang@xxxxxxxxxx> wrote:
> >>
> >> On 2021/3/15 1:37 PM, Xie Yongji wrote:
> >>> This implements an MMU-based IOMMU driver to support mapping
> >>> kernel dma buffer into userspace. The basic idea behind it is
> >>> treating MMU (VA->PA) as IOMMU (IOVA->PA). The driver will set
> >>> up MMU mapping instead of IOMMU mapping for the DMA transfer so
> >>> that the userspace process is able to use its virtual address to
> >>> access the dma buffer in kernel.
> >>>
> >>> And to avoid security issue, a bounce-buffering mechanism is
> >>> introduced to prevent userspace accessing the original buffer
> >>> directly.
> >>>
> >>> Signed-off-by: Xie Yongji <xieyongji@xxxxxxxxxxxxx>
> >>> ---
> >>>  drivers/vdpa/vdpa_user/iova_domain.c | 535 +++++++++++++++++++++++++++++++++++
> >>>  drivers/vdpa/vdpa_user/iova_domain.h |  75 +++++
> >>>  2 files changed, 610 insertions(+)
> >>>  create mode 100644 drivers/vdpa/vdpa_user/iova_domain.c
> >>>  create mode 100644 drivers/vdpa/vdpa_user/iova_domain.h
> >>>
> >>> diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
> >>> new file mode 100644
> >>> index 000000000000..83de216b0e51
> >>> --- /dev/null
> >>> +++ b/drivers/vdpa/vdpa_user/iova_domain.c
> >>> @@ -0,0 +1,535 @@
> >>> +// SPDX-License-Identifier: GPL-2.0-only
> >>> +/*
> >>> + * MMU-based IOMMU implementation
> >>> + *
> >>> + * Copyright (C) 2020 Bytedance Inc. and/or its affiliates. All rights reserved.
> >>
> >> 2021 as well.
> >>
> > Sure.
> >
> >>> + *
> >>> + * Author: Xie Yongji <xieyongji@xxxxxxxxxxxxx>
> >>> + *
> >>> + */
> >>> +
> >>> +#include <linux/slab.h>
> >>> +#include <linux/file.h>
> >>> +#include <linux/anon_inodes.h>
> >>> +#include <linux/highmem.h>
> >>> +#include <linux/vmalloc.h>
> >>> +#include <linux/vdpa.h>
> >>> +
> >>> +#include "iova_domain.h"
> >>> +
> >>> +static int vduse_iotlb_add_range(struct vduse_iova_domain *domain,
> >>> +                                 u64 start, u64 last,
> >>> +                                 u64 addr, unsigned int perm,
> >>> +                                 struct file *file, u64 offset)
> >>> +{
> >>> +        struct vdpa_map_file *map_file;
> >>> +        int ret;
> >>> +
> >>> +        map_file = kmalloc(sizeof(*map_file), GFP_ATOMIC);
> >>> +        if (!map_file)
> >>> +                return -ENOMEM;
> >>> +
> >>> +        map_file->file = get_file(file);
> >>> +        map_file->offset = offset;
> >>> +
> >>> +        ret = vhost_iotlb_add_range_ctx(domain->iotlb, start, last,
> >>> +                                        addr, perm, map_file);
> >>> +        if (ret) {
> >>> +                fput(map_file->file);
> >>> +                kfree(map_file);
> >>> +                return ret;
> >>> +        }
> >>> +        return 0;
> >>> +}
> >>> +
> >>> +static void vduse_iotlb_del_range(struct vduse_iova_domain *domain,
> >>> +                                  u64 start, u64 last)
> >>> +{
> >>> +        struct vdpa_map_file *map_file;
> >>> +        struct vhost_iotlb_map *map;
> >>> +
> >>> +        while ((map = vhost_iotlb_itree_first(domain->iotlb, start, last))) {
> >>> +                map_file = (struct vdpa_map_file *)map->opaque;
> >>> +                fput(map_file->file);
> >>> +                kfree(map_file);
> >>> +                vhost_iotlb_map_free(domain->iotlb, map);
> >>> +        }
> >>> +}
> >>> +
> >>> +int vduse_domain_set_map(struct vduse_iova_domain *domain,
> >>> +                         struct vhost_iotlb *iotlb)
> >>> +{
> >>> +        struct vdpa_map_file *map_file;
> >>> +        struct vhost_iotlb_map *map;
> >>> +        u64 start = 0ULL, last = ULLONG_MAX;
> >>> +        int ret;
> >>> +
> >>> +        spin_lock(&domain->iotlb_lock);
> >>> +        vduse_iotlb_del_range(domain, start, last);
> >>> +
> >>> +        for (map = vhost_iotlb_itree_first(iotlb, start, last); map;
> >>> +             map = vhost_iotlb_itree_next(map, start, last)) {
> >>> +                map_file = (struct vdpa_map_file *)map->opaque;
> >>> +                ret = vduse_iotlb_add_range(domain, map->start, map->last,
> >>> +                                            map->addr, map->perm,
> >>> +                                            map_file->file,
> >>> +                                            map_file->offset);
> >>> +                if (ret)
> >>> +                        goto err;
> >>> +        }
> >>> +        spin_unlock(&domain->iotlb_lock);
> >>> +
> >>> +        return 0;
> >>> +err:
> >>> +        vduse_iotlb_del_range(domain, start, last);
> >>> +        spin_unlock(&domain->iotlb_lock);
> >>> +        return ret;
> >>> +}
> >>> +
> >>> +static void vduse_domain_map_bounce_page(struct vduse_iova_domain *domain,
> >>> +                                         u64 iova, u64 size, u64 paddr)
> >>> +{
> >>> +        struct vduse_bounce_map *map;
> >>> +        unsigned int index;
> >>> +        u64 last = iova + size - 1;
> >>> +
> >>> +        while (iova < last) {
> >>> +                map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> >>> +                index = offset_in_page(iova) >> IOVA_ALLOC_ORDER;
> >>> +                map->orig_phys[index] = paddr;
> >>> +                paddr += IOVA_ALLOC_SIZE;
> >>> +                iova += IOVA_ALLOC_SIZE;
> >>> +        }
> >>> +}
> >>> +
> >>> +static void vduse_domain_unmap_bounce_page(struct vduse_iova_domain *domain,
> >>> +                                           u64 iova, u64 size)
> >>> +{
> >>> +        struct vduse_bounce_map *map;
> >>> +        unsigned int index;
> >>> +        u64 last = iova + size - 1;
> >>> +
> >>> +        while (iova < last) {
> >>> +                map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> >>> +                index = offset_in_page(iova) >> IOVA_ALLOC_ORDER;
> >>> +                map->orig_phys[index] = INVALID_PHYS_ADDR;
> >>> +                iova += IOVA_ALLOC_SIZE;
> >>> +        }
> >>> +}
> >>> +
> >>> +static void do_bounce(phys_addr_t orig, void *addr, size_t size,
> >>> +                      enum dma_data_direction dir)
> >>> +{
> >>> +        unsigned long pfn = PFN_DOWN(orig);
> >>> +
> >>> +        if (PageHighMem(pfn_to_page(pfn))) {
> >>> +                unsigned int offset = offset_in_page(orig);
> >>> +                char *buffer;
> >>> +                unsigned int sz = 0;
> >>> +
> >>> +                while (size) {
> >>> +                        sz = min_t(size_t, PAGE_SIZE - offset, size);
> >>> +
> >>> +                        buffer = kmap_atomic(pfn_to_page(pfn));
> >>
> >> So kmap_atomic() can automatically go with the fast path if the page does
> >> not belong to highmem.
> >>
> >> I think we can remove the condition and just use kmap_atomic() for all
> >> the cases here.
> >>
> > Looks good to me.
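
For reference, a minimal sketch of what dropping the PageHighMem() special
case might look like, relying on kmap_atomic() to take the fast path for
lowmem pages (illustrative only, not the posted patch):

static void do_bounce(phys_addr_t orig, void *addr, size_t size,
                      enum dma_data_direction dir)
{
        unsigned long pfn = PFN_DOWN(orig);
        unsigned int offset = offset_in_page(orig);

        while (size) {
                /* Copy at most up to the end of the current page. */
                size_t sz = min_t(size_t, PAGE_SIZE - offset, size);
                /* For lowmem pages this degenerates to page_address(). */
                char *buffer = kmap_atomic(pfn_to_page(pfn));

                if (dir == DMA_TO_DEVICE)
                        memcpy(addr, buffer + offset, sz);
                else
                        memcpy(buffer + offset, addr, sz);
                kunmap_atomic(buffer);

                size -= sz;
                pfn++;
                addr += sz;
                offset = 0;
        }
}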
> >
> >>> +                        if (dir == DMA_TO_DEVICE)
> >>> +                                memcpy(addr, buffer + offset, sz);
> >>> +                        else
> >>> +                                memcpy(buffer + offset, addr, sz);
> >>> +                        kunmap_atomic(buffer);
> >>> +
> >>> +                        size -= sz;
> >>> +                        pfn++;
> >>> +                        addr += sz;
> >>> +                        offset = 0;
> >>> +                }
> >>> +        } else if (dir == DMA_TO_DEVICE) {
> >>> +                memcpy(addr, phys_to_virt(orig), size);
> >>> +        } else {
> >>> +                memcpy(phys_to_virt(orig), addr, size);
> >>> +        }
> >>> +}
> >>> +
> >>> +static void vduse_domain_bounce(struct vduse_iova_domain *domain,
> >>> +                                dma_addr_t iova, size_t size,
> >>> +                                enum dma_data_direction dir)
> >>> +{
> >>> +        struct vduse_bounce_map *map;
> >>> +        unsigned int index, offset;
> >>> +        void *addr;
> >>> +        size_t sz;
> >>> +
> >>> +        while (size) {
> >>> +                map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> >>> +                offset = offset_in_page(iova);
> >>> +                sz = min_t(size_t, IOVA_ALLOC_SIZE, size);
> >>> +                index = offset >> IOVA_ALLOC_ORDER;
> >>> +
> >>> +                if (map->bounce_page &&
> >>> +                    map->orig_phys[index] != INVALID_PHYS_ADDR) {
> >>> +                        addr = page_address(map->bounce_page) + offset;
> >>> +                        do_bounce(map->orig_phys[index], addr, sz, dir);
> >>> +                }
> >>> +                size -= sz;
> >>> +                iova += sz;
> >>> +        }
> >>> +}
> >>> +
> >>> +static struct page *
> >>> +vduse_domain_get_mapping_page(struct vduse_iova_domain *domain, u64 iova)
> >>> +{
> >>> +        u64 start = iova & PAGE_MASK;
> >>> +        u64 last = start + PAGE_SIZE - 1;
> >>> +        struct vhost_iotlb_map *map;
> >>> +        struct page *page = NULL;
> >>> +
> >>> +        spin_lock(&domain->iotlb_lock);
> >>> +        map = vhost_iotlb_itree_first(domain->iotlb, start, last);
> >>> +        if (!map)
> >>> +                goto out;
> >>> +
> >>> +        page = pfn_to_page((map->addr + iova - map->start) >> PAGE_SHIFT);
> >>> +        get_page(page);
> >>> +out:
> >>> +        spin_unlock(&domain->iotlb_lock);
> >>> +
> >>> +        return page;
> >>> +}
> >>> +
> >>> +static struct page *
> >>> +vduse_domain_alloc_bounce_page(struct vduse_iova_domain *domain, u64 iova)
> >>> +{
> >>> +        u64 start = iova & PAGE_MASK;
> >>> +        struct page *page = alloc_page(GFP_KERNEL);
> >>> +        struct vduse_bounce_map *map;
> >>> +
> >>> +        if (!page)
> >>> +                return NULL;
> >>> +
> >>> +        spin_lock(&domain->iotlb_lock);
> >>> +        map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> >>> +        if (map->bounce_page) {
> >>> +                __free_page(page);
> >>> +                goto out;
> >>> +        }
> >>> +        map->bounce_page = page;
> >>> +
> >>> +        /* paired with vduse_domain_map_page() */
> >>> +        smp_mb();
> >>
> >> So this is suspicious. It would be better to explain it, e.g. that we
> >> need to make sure A must be done after B.
> > OK. I see. It's used to protect this pattern:
> >
> >     vduse_domain_alloc_bounce_page:          vduse_domain_map_page:
> >         write map->bounce_page                   write map->orig_phys
> >         mb()                                     mb()
> >         read map->orig_phys                      read map->bounce_page
> >
> > It makes sure there will always be a path that does the bouncing.
>
>
> Ok.
>
>
> >
> >> And it looks to me the iotlb_lock is sufficient to do the synchronization
> >> here. E.g., any reason that you don't take it in
> >> vduse_domain_map_bounce_page()?
> >>
> > Yes, we can. But the performance in multi-queue cases will go down if
> > we use iotlb_lock on this critical path.
> >
> >> And what's more, is there any way to avoid holding the spinlock during
> >> bouncing?
> >>
> > Looks like we can't. In the case that multiple page faults happen on
> > the same page, we should make sure the bouncing is done before any
> > page fault handler returns.
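
Restating the mb() pairing from the diagram above as code, purely for
illustration (the helper names are hypothetical and locking is omitted):

/* Page fault path, simplified from vduse_domain_alloc_bounce_page(). */
static u64 publish_bounce_page(struct vduse_bounce_map *map,
                               struct page *page, unsigned int index)
{
        map->bounce_page = page;        /* A: publish the bounce page    */
        smp_mb();                       /* pairs with the barrier below  */
        return map->orig_phys[index];   /* B: then check the mapping     */
}

/* dma_map path, simplified from vduse_domain_map_page(). */
static struct page *publish_orig_phys(struct vduse_bounce_map *map,
                                      unsigned int index, u64 paddr)
{
        map->orig_phys[index] = paddr;  /* C: publish the original phys  */
        smp_mb();                       /* pairs with the barrier above  */
        return map->bounce_page;        /* D: then check the bounce page */
}

/*
 * With both full barriers in place, B missing C and D missing A cannot
 * both happen, so at least one of the two paths observes both values
 * and performs the bounce.
 */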
>
>
> So it looks to me all that extra complexity comes from the fact that
> the bounce_page and orig_phys are set in different places, so we need to
> do the bouncing in two places.
> I wonder how much we can gain from the "lazy" bouncing in the page fault
> handler. The buffers mapped via dma_ops by the virtio driver are expected
> to be accessed by userspace soon. It looks to me we could do all of that
> during dma_map(), and then things would be greatly simplified.
>

If so, we need to allocate lots of pages from the pool reserved for
atomic memory allocation requests.

Thanks,
Yongji
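
For illustration only, a rough sketch of the "bounce at dma_map() time"
idea discussed above; the helper name is hypothetical, locking and the
multi-page case are omitted, and it mainly shows why the bounce pages
would then have to come from the atomic reserves:

/*
 * Hypothetical sketch, not part of the patch: allocate and fill the
 * bounce page eagerly in the mapping path instead of lazily in the
 * page fault handler. Assumes the range stays within one bounce page.
 */
static int vduse_domain_bounce_eagerly(struct vduse_iova_domain *domain,
                                       dma_addr_t iova, u64 paddr,
                                       size_t size,
                                       enum dma_data_direction dir)
{
        struct vduse_bounce_map *map = &domain->bounce_maps[iova >> PAGE_SHIFT];

        if (!map->bounce_page) {
                /*
                 * dma_map_page() may be called from atomic context, so a
                 * sleeping allocation is not allowed here.
                 */
                struct page *page = alloc_page(GFP_ATOMIC);

                if (!page)
                        return -ENOMEM;
                map->bounce_page = page;
        }

        /* Record the original physical address for this IOVA range. */
        vduse_domain_map_bounce_page(domain, iova, size, paddr);

        /*
         * Copy the original buffer into the bounce page right away so
         * userspace sees valid data as soon as the mapping exists.
         */
        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
                vduse_domain_bounce(domain, iova, size, DMA_TO_DEVICE);

        return 0;
}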