On Wed, 3 Feb 2021 15:47:56 -0500 Daniel Jordan <daniel.m.jordan@xxxxxxxxxx> wrote: > Pinning one 4K page at a time is inefficient, so do it in batches of 512 > instead. This is just an optimization with no functional change > intended, and in particular the driver still calls iommu_map() with the > largest physically contiguous range possible. > > Add two fields in vfio_batch to remember where to start between calls to > vfio_pin_pages_remote(), and use vfio_batch_unpin() to handle remaining > pages in the batch in case of error. > > qemu pins pages for guests around 8% faster on my test system, a > two-node Broadwell server with 128G memory per node. The qemu process > was bound to one node with its allocations constrained there as well. > > base test > guest ---------------- ---------------- > mem (GB) speedup avg sec (std) avg sec (std) > 1 7.4% 0.61 (0.00) 0.56 (0.00) > 2 8.3% 0.93 (0.00) 0.85 (0.00) > 4 8.4% 1.46 (0.00) 1.34 (0.00) > 8 8.6% 2.54 (0.01) 2.32 (0.00) > 16 8.3% 4.66 (0.00) 4.27 (0.01) > 32 8.3% 8.94 (0.01) 8.20 (0.01) > 64 8.2% 17.47 (0.01) 16.04 (0.03) > 120 8.5% 32.45 (0.13) 29.69 (0.01) > > perf diff confirms less time spent in pup. Here are the top ten > functions: > > Baseline Delta Abs Symbol > > 78.63% +6.64% clear_page_erms > 1.50% -1.50% __gup_longterm_locked > 1.27% -0.78% __get_user_pages > +0.76% kvm_zap_rmapp.constprop.0 > 0.54% -0.53% vmacache_find > 0.55% -0.51% get_pfnblock_flags_mask > 0.48% -0.48% __get_user_pages_remote > +0.39% slot_rmap_walk_next > +0.32% vfio_pin_map_dma > +0.26% kvm_handle_hva_range > ... > > Suggested-by: Matthew Wilcox (Oracle) <willy@xxxxxxxxxxxxx> > Signed-off-by: Daniel Jordan <daniel.m.jordan@xxxxxxxxxx> > --- > drivers/vfio/vfio_iommu_type1.c | 133 +++++++++++++++++++++----------- > 1 file changed, 88 insertions(+), 45 deletions(-) > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c > index c26c1a4697e5..ac59bfc4e332 100644 > --- a/drivers/vfio/vfio_iommu_type1.c > +++ b/drivers/vfio/vfio_iommu_type1.c > @@ -101,6 +101,8 @@ struct vfio_batch { > struct page **pages; /* for pin_user_pages_remote */ > struct page *fallback_page; /* if pages alloc fails */ > int capacity; /* length of pages array */ > + int size; /* of batch currently */ > + int offset; /* of next entry in pages */ > }; > > struct vfio_group { > @@ -425,6 +427,9 @@ static int put_pfn(unsigned long pfn, int prot) > > static void vfio_batch_init(struct vfio_batch *batch) > { > + batch->size = 0; > + batch->offset = 0; > + > if (unlikely(disable_hugepages)) > goto fallback; > > @@ -440,6 +445,17 @@ static void vfio_batch_init(struct vfio_batch *batch) > batch->capacity = 1; > } > > +static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma) > +{ > + while (batch->size) { > + unsigned long pfn = page_to_pfn(batch->pages[batch->offset]); > + > + put_pfn(pfn, dma->prot); > + batch->offset++; > + batch->size--; > + } > +} > + > static void vfio_batch_fini(struct vfio_batch *batch) > { > if (batch->capacity == VFIO_BATCH_MAX_CAPACITY) > @@ -526,65 +542,88 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, > long npage, unsigned long *pfn_base, > unsigned long limit, struct vfio_batch *batch) > { > - unsigned long pfn = 0; > + unsigned long pfn; > + struct mm_struct *mm = current->mm; > long ret, pinned = 0, lock_acct = 0; > bool rsvd; > dma_addr_t iova = vaddr - dma->vaddr + dma->iova; > > /* This code path is only user initiated */ > - if (!current->mm) > + if (!mm) > return -ENODEV; > > - ret = vaddr_get_pfns(current->mm, vaddr, 1, dma->prot, pfn_base, > - batch->pages); > - if (ret < 0) > - return ret; > - > - pinned++; > - rsvd = is_invalid_reserved_pfn(*pfn_base); > - > - /* > - * Reserved pages aren't counted against the user, externally pinned > - * pages are already counted against the user. > - */ > - if (!rsvd && !vfio_find_vpfn(dma, iova)) { > - if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) { > - put_pfn(*pfn_base, dma->prot); > - pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, > - limit << PAGE_SHIFT); > - return -ENOMEM; > - } > - lock_acct++; > + if (batch->size) { > + /* Leftover pages in batch from an earlier call. */ > + *pfn_base = page_to_pfn(batch->pages[batch->offset]); > + pfn = *pfn_base; > + rsvd = is_invalid_reserved_pfn(*pfn_base); > + } else { > + *pfn_base = 0; > } > > - if (unlikely(disable_hugepages)) > - goto out; > + while (npage) { > + if (!batch->size) { > + /* Empty batch, so refill it. */ > + long req_pages = min_t(long, npage, batch->capacity); > > - /* Lock all the consecutive pages from pfn_base */ > - for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage; > - pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) { > - ret = vaddr_get_pfns(current->mm, vaddr, 1, dma->prot, &pfn, > - batch->pages); > - if (ret < 0) > - break; > + ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot, > + &pfn, batch->pages); > + if (ret < 0) > + return ret; This might not be the first batch we fill, I think this needs to unwind rather than direct return. Series looks good otherwise. Thanks, Alex > > - if (pfn != *pfn_base + pinned || > - rsvd != is_invalid_reserved_pfn(pfn)) { > - put_pfn(pfn, dma->prot); > - break; > + batch->size = ret; > + batch->offset = 0; > + > + if (!*pfn_base) { > + *pfn_base = pfn; > + rsvd = is_invalid_reserved_pfn(*pfn_base); > + } > } > > - if (!rsvd && !vfio_find_vpfn(dma, iova)) { > - if (!dma->lock_cap && > - current->mm->locked_vm + lock_acct + 1 > limit) { > - put_pfn(pfn, dma->prot); > - pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", > - __func__, limit << PAGE_SHIFT); > - ret = -ENOMEM; > - goto unpin_out; > + /* > + * pfn is preset for the first iteration of this inner loop and > + * updated at the end to handle a VM_PFNMAP pfn. In that case, > + * batch->pages isn't valid (there's no struct page), so allow > + * batch->pages to be touched only when there's more than one > + * pfn to check, which guarantees the pfns are from a > + * !VM_PFNMAP vma. > + */ > + while (true) { > + if (pfn != *pfn_base + pinned || > + rsvd != is_invalid_reserved_pfn(pfn)) > + goto out; > + > + /* > + * Reserved pages aren't counted against the user, > + * externally pinned pages are already counted against > + * the user. > + */ > + if (!rsvd && !vfio_find_vpfn(dma, iova)) { > + if (!dma->lock_cap && > + mm->locked_vm + lock_acct + 1 > limit) { > + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", > + __func__, limit << PAGE_SHIFT); > + ret = -ENOMEM; > + goto unpin_out; > + } > + lock_acct++; > } > - lock_acct++; > + > + pinned++; > + npage--; > + vaddr += PAGE_SIZE; > + iova += PAGE_SIZE; > + batch->offset++; > + batch->size--; > + > + if (!batch->size) > + break; > + > + pfn = page_to_pfn(batch->pages[batch->offset]); > } > + > + if (unlikely(disable_hugepages)) > + break; > } > > out: > @@ -596,6 +635,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, > for (pfn = *pfn_base ; pinned ; pfn++, pinned--) > put_pfn(pfn, dma->prot); > } > + vfio_batch_unpin(batch, dma); > > return ret; > } > @@ -1305,6 +1345,7 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma, > if (ret) { > vfio_unpin_pages_remote(dma, iova + dma->size, pfn, > npage, true); > + vfio_batch_unpin(&batch, dma); > break; > } > > @@ -1546,11 +1587,13 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, > ret = iommu_map(domain->domain, iova, phys, > size, dma->prot | domain->prot); > if (ret) { > - if (!dma->iommu_mapped) > + if (!dma->iommu_mapped) { > vfio_unpin_pages_remote(dma, iova, > phys >> PAGE_SHIFT, > size >> PAGE_SHIFT, > true); > + vfio_batch_unpin(&batch, dma); > + } > goto unwind; > } >