On 1/29/2019 6:58 PM, jglisse@xxxxxxxxxx wrote:
> Convert ODP to use HMM so that we can build on common infrastructure
> for different class of devices that want to mirror a process address
> space into a device. There is no functional changes.

Thanks for sending this patch. I think in general it is a good idea to
use a common infrastructure for ODP. I have a couple of questions
below.

> -static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
> -				const struct mmu_notifier_range *range)
> -{
> -	struct ib_ucontext_per_mm *per_mm =
> -		container_of(mn, struct ib_ucontext_per_mm, mn);
> -
> -	if (unlikely(!per_mm->active))
> -		return;
> -
> -	rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
> -				      range->end,
> -				      invalidate_range_end_trampoline, true, NULL);
>  	up_read(&per_mm->umem_rwsem);
> +	return ret;
>  }

Previously the code held the umem_rwsem between range_start and
range_end calls. I guess that was in order to guarantee that no device
page faults take a reference to the pages being invalidated while the
invalidation is ongoing. I assume this is now handled by hmm instead,
correct?
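(Just to spell out my understanding, and please correct me if I am
wrong: I assume the equivalent serialization now comes from checking
hmm_range_valid() under the driver lock after the fault, along the
lines of the pattern described in Documentation/vm/hmm.rst. A rough
sketch, where driver_lock stands for whatever lock the
sync_cpu_device_pagetables() callback takes (I believe that would be
the umem_mutex here), TIMEOUT_MSEC is just a placeholder, and range,
mm, start, end and page_shift are set up as in this patch:

	ret = hmm_range_register(&range, mm, start, end, page_shift);
	if (ret)
		return ret;
again:
	hmm_range_wait_until_valid(&range, TIMEOUT_MSEC);

	down_read(&mm->mmap_sem);
	ret = hmm_range_fault(&range, true);
	up_read(&mm->mmap_sem);
	if (ret < 0) {
		/* error handling / -EBUSY retry omitted */
		hmm_range_unregister(&range);
		return ret;
	}

	mutex_lock(&driver_lock);
	if (!hmm_range_valid(&range)) {
		/* an invalidation ran concurrently, retry */
		mutex_unlock(&driver_lock);
		goto again;
	}
	/* ... update the device page tables from range.pfns ... */
	mutex_unlock(&driver_lock);
	hmm_range_unregister(&range);

So a concurrent invalidation no longer blocks on umem_rwsem for the
whole range_start/range_end window; it just forces the fault handler
to retry.)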
> +
> +static uint64_t odp_hmm_flags[HMM_PFN_FLAG_MAX] = {
> +	ODP_READ_BIT,	/* HMM_PFN_VALID */
> +	ODP_WRITE_BIT,	/* HMM_PFN_WRITE */
> +	ODP_DEVICE_BIT,	/* HMM_PFN_DEVICE_PRIVATE */

It seems that the mlx5_ib code in this patch currently ignores the
ODP_DEVICE_BIT (e.g., in umem_dma_to_mtt). Is that okay? Or is it
handled implicitly by the HMM_PFN_SPECIAL case?

> @@ -327,9 +287,10 @@ void put_per_mm(struct ib_umem_odp *umem_odp)
>  	up_write(&per_mm->umem_rwsem);
>  
>  	WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root));
> -	mmu_notifier_unregister_no_release(&per_mm->mn, per_mm->mm);
> +	hmm_mirror_unregister(&per_mm->mirror);
>  	put_pid(per_mm->tgid);
> -	mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm);
> +
> +	kfree(per_mm);
>  }

Previously the per_mm struct was released through call_srcu, but now
it is released immediately. Is it safe? I saw that
hmm_mirror_unregister calls mmu_notifier_unregister_no_release, so I
don't understand what prevents concurrently running invalidations from
accessing the released per_mm struct.

> @@ -578,11 +578,27 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
>  
>  next_mr:
>  	size = min_t(size_t, bcnt, ib_umem_end(&odp->umem) - io_virt);
> -
>  	page_shift = mr->umem->page_shift;
>  	page_mask = ~(BIT(page_shift) - 1);
> +	off = (io_virt & (~page_mask));
> +	size += (io_virt & (~page_mask));
> +	io_virt = io_virt & page_mask;
> +	off += (size & (~page_mask));
> +	size = ALIGN(size, 1UL << page_shift);
> +
> +	if (io_virt < ib_umem_start(&odp->umem))
> +		return -EINVAL;
> +
>  	start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
>  
> +	if (odp_mr->per_mm == NULL || odp_mr->per_mm->mm == NULL)
> +		return -ENOENT;
> +
> +	ret = hmm_range_register(&range, odp_mr->per_mm->mm,
> +				 io_virt, io_virt + size, page_shift);
> +	if (ret)
> +		return ret;
> +
>  	if (prefetch && !downgrade && !mr->umem->writable) {
>  		/* prefetch with write-access must
>  		 * be supported by the MR

Isn't there a mistake in the calculation of the variable size? It is
first set to the size of the page fault range, but then you add the
virtual address, so I guess it is actually the range end. Then you
pass io_virt + size to hmm_range_register. Doesn't that double the
size of the range?

> -void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
> -				 u64 bound)
> +void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp,
> +				 u64 virt, u64 bound)
>  {
> +	struct device *device = umem_odp->umem.context->device->dma_device;
>  	struct ib_umem *umem = &umem_odp->umem;
> -	int idx;
> -	u64 addr;
> -	struct ib_device *dev = umem->context->device;
> +	unsigned long idx, page_mask;
> +	struct hmm_range range;
> +	long ret;
> +
> +	if (!umem->npages)
> +		return;
> +
> +	bound = ALIGN(bound, 1UL << umem->page_shift);
> +	page_mask = ~(BIT(umem->page_shift) - 1);
> +	virt &= page_mask;
>  
>  	virt = max_t(u64, virt, ib_umem_start(umem));
>  	bound = min_t(u64, bound, ib_umem_end(umem));
> -	/* Note that during the run of this function, the
> -	 * notifiers_count of the MR is > 0, preventing any racing
> -	 * faults from completion. We might be racing with other
> -	 * invalidations, so we must make sure we free each page only
> -	 * once. */
> +
> +	idx = ((unsigned long)virt - ib_umem_start(umem)) >> PAGE_SHIFT;
> +
> +	range.page_shift = umem->page_shift;
> +	range.pfns = &umem_odp->pfns[idx];
> +	range.pfn_shift = ODP_FLAGS_BITS;
> +	range.values = odp_hmm_values;
> +	range.flags = odp_hmm_flags;
> +	range.start = virt;
> +	range.end = bound;
> +
>  	mutex_lock(&umem_odp->umem_mutex);
> -	for (addr = virt; addr < bound; addr += BIT(umem->page_shift)) {
> -		idx = (addr - ib_umem_start(umem)) >> umem->page_shift;
> -		if (umem_odp->page_list[idx]) {
> -			struct page *page = umem_odp->page_list[idx];
> -			dma_addr_t dma = umem_odp->dma_list[idx];
> -			dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK;
> -
> -			WARN_ON(!dma_addr);
> -
> -			ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE,
> -					  DMA_BIDIRECTIONAL);
> -			if (dma & ODP_WRITE_ALLOWED_BIT) {
> -				struct page *head_page = compound_head(page);
> -				/*
> -				 * set_page_dirty prefers being called with
> -				 * the page lock. However, MMU notifiers are
> -				 * called sometimes with and sometimes without
> -				 * the lock. We rely on the umem_mutex instead
> -				 * to prevent other mmu notifiers from
> -				 * continuing and allowing the page mapping to
> -				 * be removed.
> -				 */
> -				set_page_dirty(head_page);
> -			}
> -			/* on demand pinning support */
> -			if (!umem->context->invalidate_range)
> -				put_page(page);
> -			umem_odp->page_list[idx] = NULL;
> -			umem_odp->dma_list[idx] = 0;
> -			umem->npages--;
> -		}
> -	}
> +	ret = hmm_range_dma_unmap(&range, NULL, device,
> +				  &umem_odp->dma_list[idx], true);
> +	if (ret > 0)
> +		umem->npages -= ret;

Can hmm_range_dma_unmap fail? If it does, do we simply leak the DMA
mappings?

>  	mutex_unlock(&umem_odp->umem_mutex);
>  }

Regards,
Haggai
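P.S. Regarding the hmm_range_dma_unmap() return value above, just to
illustrate what I had in mind (assuming a negative return is actually
possible here, which is exactly my question), I would have expected at
least something like:

	ret = hmm_range_dma_unmap(&range, NULL, device,
				  &umem_odp->dma_list[idx], true);
	if (ret > 0) {
		umem->npages -= ret;
	} else if (ret < 0) {
		/* at least make a potential DMA-mapping leak visible */
		WARN_ONCE(1, "hmm_range_dma_unmap failed: %ld\n", ret);
	}

or some real recovery, if one is possible.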