From: Jason Gunthorpe <jgg@xxxxxxxxxxxx> This is a significant simplification, no extra list is kept per FD, and the interval tree is now shared between all the ucontexts, reducing overhead if there are multiple ucontexts active. Signed-off-by: Jason Gunthorpe <jgg@xxxxxxxxxxxx> --- drivers/infiniband/core/umem_odp.c | 170 ++++++++------------------ drivers/infiniband/core/uverbs_cmd.c | 3 - drivers/infiniband/core/uverbs_main.c | 1 + drivers/infiniband/hw/mlx5/main.c | 5 - include/rdma/ib_umem_odp.h | 10 +- include/rdma/ib_verbs.h | 3 - 6 files changed, 54 insertions(+), 138 deletions(-) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index c6a992392ee2b8..a02e6e3d7b72fb 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -82,7 +82,7 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn, struct rb_node *node; down_read(&per_mm->umem_rwsem); - if (!per_mm->active) + if (!per_mm->mn.users) goto out; for (node = rb_first_cached(&per_mm->umem_tree); node; @@ -132,7 +132,7 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, else if (!down_read_trylock(&per_mm->umem_rwsem)) return -EAGAIN; - if (!per_mm->active) { + if (!per_mm->mn.users) { up_read(&per_mm->umem_rwsem); /* * At this point active is permanently set and visible to this @@ -165,7 +165,7 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, struct ib_ucontext_per_mm *per_mm = container_of(mn, struct ib_ucontext_per_mm, mn); - if (unlikely(!per_mm->active)) + if (unlikely(!per_mm->mn.users)) return; rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start, @@ -174,125 +174,47 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, up_read(&per_mm->umem_rwsem); } -static const struct mmu_notifier_ops ib_umem_notifiers = { - .release = ib_umem_notifier_release, - .invalidate_range_start = ib_umem_notifier_invalidate_range_start, - .invalidate_range_end = ib_umem_notifier_invalidate_range_end, -}; - -static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp) -{ - struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; - - if (umem_odp->is_implicit_odp) - return; - - down_write(&per_mm->umem_rwsem); - interval_tree_remove(&umem_odp->interval_tree, &per_mm->umem_tree); - complete_all(&umem_odp->notifier_completion); - up_write(&per_mm->umem_rwsem); -} - -static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx, - struct mm_struct *mm) +static struct mmu_notifier *ib_umem_alloc_notifier(struct mm_struct *mm) { struct ib_ucontext_per_mm *per_mm; - int ret; per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL); if (!per_mm) return ERR_PTR(-ENOMEM); - per_mm->context = ctx; - per_mm->mm = mm; per_mm->umem_tree = RB_ROOT_CACHED; init_rwsem(&per_mm->umem_rwsem); - per_mm->active = true; + WARN_ON(mm != current->mm); rcu_read_lock(); per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); rcu_read_unlock(); - - WARN_ON(mm != current->mm); - - per_mm->mn.ops = &ib_umem_notifiers; - ret = mmu_notifier_register(&per_mm->mn, per_mm->mm); - if (ret) { - dev_err(&ctx->device->dev, - "Failed to register mmu_notifier %d\n", ret); - goto out_pid; - } - - list_add(&per_mm->ucontext_list, &ctx->per_mm_list); - return per_mm; - -out_pid: - put_pid(per_mm->tgid); - kfree(per_mm); - return ERR_PTR(ret); + return &per_mm->mn; } -static struct ib_ucontext_per_mm *get_per_mm(struct ib_umem_odp *umem_odp) +static void ib_umem_free_notifier(struct mmu_notifier *mn) { - struct ib_ucontext *ctx = umem_odp->umem.context; - struct ib_ucontext_per_mm *per_mm; - - lockdep_assert_held(&ctx->per_mm_list_lock); - - /* - * Generally speaking we expect only one or two per_mm in this list, - * so no reason to optimize this search today. - */ - list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) { - if (per_mm->mm == umem_odp->umem.owning_mm) - return per_mm; - } - - return alloc_per_mm(ctx, umem_odp->umem.owning_mm); -} - -static void free_per_mm(struct rcu_head *rcu) -{ - kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu)); -} - -static void put_per_mm(struct ib_umem_odp *umem_odp) -{ - struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; - struct ib_ucontext *ctx = umem_odp->umem.context; - bool need_free; - - mutex_lock(&ctx->per_mm_list_lock); - umem_odp->per_mm = NULL; - per_mm->odp_mrs_count--; - need_free = per_mm->odp_mrs_count == 0; - if (need_free) - list_del(&per_mm->ucontext_list); - mutex_unlock(&ctx->per_mm_list_lock); - - if (!need_free) - return; - - /* - * NOTE! mmu_notifier_unregister() can happen between a start/end - * callback, resulting in an start/end, and thus an unbalanced - * lock. This doesn't really matter to us since we are about to kfree - * the memory that holds the lock, however LOCKDEP doesn't like this. - */ - down_write(&per_mm->umem_rwsem); - per_mm->active = false; - up_write(&per_mm->umem_rwsem); + struct ib_ucontext_per_mm *per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root)); - mmu_notifier_unregister_no_release(&per_mm->mn, per_mm->mm); + put_pid(per_mm->tgid); - mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm); + kfree(per_mm); } -static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, - struct ib_ucontext_per_mm *per_mm) +static const struct mmu_notifier_ops ib_umem_notifiers = { + .release = ib_umem_notifier_release, + .invalidate_range_start = ib_umem_notifier_invalidate_range_start, + .invalidate_range_end = ib_umem_notifier_invalidate_range_end, + .alloc_notifier = ib_umem_alloc_notifier, + .free_notifier = ib_umem_free_notifier, +}; + +static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp) { - struct ib_ucontext *ctx = umem_odp->umem.context; + struct ib_ucontext_per_mm *per_mm; + struct mmu_notifier *mn; int ret; if (!umem_odp->is_implicit_odp) { @@ -338,18 +260,13 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, } } - mutex_lock(&ctx->per_mm_list_lock); - if (per_mm) { - umem_odp->per_mm = per_mm; - } else { - umem_odp->per_mm = get_per_mm(umem_odp); - if (IS_ERR(umem_odp->per_mm)) { - ret = PTR_ERR(umem_odp->per_mm); - goto out_unlock; - } + mn = mmu_notifier_get(&ib_umem_notifiers, umem_odp->umem.owning_mm); + if (IS_ERR(mn)) { + ret = PTR_ERR(mn); + goto out_dma_list; } - per_mm->odp_mrs_count++; - mutex_unlock(&ctx->per_mm_list_lock); + umem_odp->per_mm = per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); mutex_init(&umem_odp->umem_mutex); init_completion(&umem_odp->notifier_completion); @@ -363,8 +280,7 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, return 0; -out_unlock: - mutex_unlock(&ctx->per_mm_list_lock); +out_dma_list: kvfree(umem_odp->dma_list); out_page_list: kvfree(umem_odp->page_list); @@ -409,7 +325,7 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata, umem_odp->is_implicit_odp = 1; umem_odp->page_shift = PAGE_SHIFT; - ret = ib_init_umem_odp(umem_odp, NULL); + ret = ib_init_umem_odp(umem_odp); if (ret) { kfree(umem_odp); return ERR_PTR(ret); @@ -455,7 +371,7 @@ struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root, umem->owning_mm = root->umem.owning_mm; odp_data->page_shift = PAGE_SHIFT; - ret = ib_init_umem_odp(odp_data, root->per_mm); + ret = ib_init_umem_odp(odp_data); if (ret) { kfree(odp_data); return ERR_PTR(ret); @@ -498,11 +414,13 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) up_read(&mm->mmap_sem); } - return ib_init_umem_odp(umem_odp, NULL); + return ib_init_umem_odp(umem_odp); } void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; + /* * Ensure that no more pages are mapped in the umem. * @@ -512,8 +430,24 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp)); - remove_umem_from_per_mm(umem_odp); - put_per_mm(umem_odp); + down_write(&per_mm->umem_rwsem); + if (!umem_odp->is_implicit_odp) { + interval_tree_remove(&umem_odp->interval_tree, + &per_mm->umem_tree); + complete_all(&umem_odp->notifier_completion); + } + /* + * NOTE! mmu_notifier_unregister() can happen between a start/end + * callback, resulting in an start/end, and thus an unbalanced + * lock. This doesn't really matter to us since we are about to kfree + * the memory that holds the lock, however LOCKDEP doesn't like this. + * Thus we call the mmu_notifier_put under the rwsem and test the + * internal users count to reliably see if we are past this point. + */ + mmu_notifier_put(&umem_odp->per_mm->mn); + up_write(&per_mm->umem_rwsem); + + umem_odp->per_mm = NULL; kvfree(umem_odp->dma_list); kvfree(umem_odp->page_list); } diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 8f4fd4fac1593a..7c10dfe417a446 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -252,9 +252,6 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) ucontext->closing = false; ucontext->cleanup_retryable = false; - mutex_init(&ucontext->per_mm_list_lock); - INIT_LIST_HEAD(&ucontext->per_mm_list); - ret = get_unused_fd_flags(O_CLOEXEC); if (ret < 0) goto err_free; diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 11c13c1381cf5c..e369ac0d6f5159 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -1487,6 +1487,7 @@ static void __exit ib_uverbs_cleanup(void) IB_UVERBS_NUM_FIXED_MINOR); unregister_chrdev_region(dynamic_uverbs_dev, IB_UVERBS_NUM_DYNAMIC_MINOR); + mmu_notifier_synchronize(); } module_init(ib_uverbs_init); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index cdb6bbbaa14fd8..4400ff7457c7c8 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1995,11 +1995,6 @@ static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); struct mlx5_bfreg_info *bfregi; - /* All umem's must be destroyed before destroying the ucontext. */ - mutex_lock(&ibcontext->per_mm_list_lock); - WARN_ON(!list_empty(&ibcontext->per_mm_list)); - mutex_unlock(&ibcontext->per_mm_list_lock); - bfregi = &context->bfregi; mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid); diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 468c9afabbb2fd..5a6c7cd3f33388 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -116,20 +116,12 @@ static inline unsigned long ib_umem_end(struct ib_umem_odp *umem_odp) #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING struct ib_ucontext_per_mm { - struct ib_ucontext *context; - struct mm_struct *mm; + struct mmu_notifier mn; struct pid *tgid; - bool active; struct rb_root_cached umem_tree; /* Protects umem_tree */ struct rw_semaphore umem_rwsem; - - struct mmu_notifier mn; - unsigned int odp_mrs_count; - - struct list_head ucontext_list; - struct rcu_head rcu; }; int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 16196196659a4c..9bd3cde7e8dbe9 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1417,9 +1417,6 @@ struct ib_ucontext { bool cleanup_retryable; - struct mutex per_mm_list_lock; - struct list_head per_mm_list; - struct ib_rdmacg_object cg_obj; /* * Implementation details of the RDMA core, don't use in drivers: -- 2.22.0