From: Jason Gunthorpe <jgg@xxxxxxxxxxxx> This is intrinsically racy and the scheme is simply unnecessary. New MR registration can wait for any on going invalidation to fully complete. CPU0 CPU1 if (atomic_read()) if (atomic_dec_and_test() && !list_empty()) { /* not taken */ } list_add() Putting the new UMEM into some kind of purgatory until another invalidate rolls through.. Instead hold the read side of the umem_rwsem across the pair'd start/end and get rid of the racy 'deferred add' approach. Since all umem's in the rbt are always ready to go, also get rid of the mn_counters_active stuff. Signed-off-by: Jason Gunthorpe <jgg@xxxxxxxxxxxx> Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx> --- drivers/infiniband/core/umem_odp.c | 113 ++++++------------------------------- include/rdma/ib_umem_odp.h | 15 ----- 2 files changed, 18 insertions(+), 110 deletions(-) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 0577f9ff600f..1c0c4a431218 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -80,83 +80,29 @@ INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) { mutex_lock(&umem_odp->umem_mutex); - - /* Only update private counters for this umem if it has them. - * Otherwise skip it. All page faults will be delayed for this umem. */ - if (umem_odp->mn_counters_active) { - int notifiers_count = umem_odp->notifiers_count++; - - if (notifiers_count == 0) - /* Initialize the completion object for waiting on - * notifiers. Since notifier_count is zero, no one - * should be waiting right now. */ - reinit_completion(&umem_odp->notifier_completion); - } + if (umem_odp->notifiers_count++ == 0) + /* + * Initialize the completion object for waiting on + * notifiers. Since notifier_count is zero, no one should be + * waiting right now. + */ + reinit_completion(&umem_odp->notifier_completion); mutex_unlock(&umem_odp->umem_mutex); } static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) { mutex_lock(&umem_odp->umem_mutex); - - /* Only update private counters for this umem if it has them. - * Otherwise skip it. All page faults will be delayed for this umem. */ - if (umem_odp->mn_counters_active) { - /* - * This sequence increase will notify the QP page fault that - * the page that is going to be mapped in the spte could have - * been freed. - */ - ++umem_odp->notifiers_seq; - if (--umem_odp->notifiers_count == 0) - complete_all(&umem_odp->notifier_completion); - } + /* + * This sequence increase will notify the QP page fault that the page + * that is going to be mapped in the spte could have been freed. + */ + ++umem_odp->notifiers_seq; + if (--umem_odp->notifiers_count == 0) + complete_all(&umem_odp->notifier_completion); mutex_unlock(&umem_odp->umem_mutex); } -/* Account for a new mmu notifier in an ib_ucontext. */ -static void -ib_ucontext_notifier_start_account(struct ib_ucontext_per_mm *per_mm) -{ - atomic_inc(&per_mm->notifier_count); -} - -/* Account for a terminating mmu notifier in an ib_ucontext. - * - * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since - * the function takes the semaphore itself. */ -static void ib_ucontext_notifier_end_account(struct ib_ucontext_per_mm *per_mm) -{ - int zero_notifiers = atomic_dec_and_test(&per_mm->notifier_count); - - if (zero_notifiers && - !list_empty(&per_mm->no_private_counters)) { - /* No currently running mmu notifiers. Now is the chance to - * add private accounting to all previously added umems. */ - struct ib_umem_odp *odp_data, *next; - - /* Prevent concurrent mmu notifiers from working on the - * no_private_counters list. */ - down_write(&per_mm->umem_rwsem); - - /* Read the notifier_count again, with the umem_rwsem - * semaphore taken for write. */ - if (!atomic_read(&per_mm->notifier_count)) { - list_for_each_entry_safe(odp_data, next, - &per_mm->no_private_counters, - no_private_counters) { - mutex_lock(&odp_data->umem_mutex); - odp_data->mn_counters_active = true; - list_del(&odp_data->no_private_counters); - complete_all(&odp_data->notifier_completion); - mutex_unlock(&odp_data->umem_mutex); - } - } - - up_write(&per_mm->umem_rwsem); - } -} - static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, u64 start, u64 end, void *cookie) { @@ -186,7 +132,6 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn, if (!per_mm->context->invalidate_range) return; - ib_ucontext_notifier_start_account(per_mm); down_read(&per_mm->umem_rwsem); rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX, @@ -231,14 +176,9 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, else if (!down_read_trylock(&per_mm->umem_rwsem)) return -EAGAIN; - ib_ucontext_notifier_start_account(per_mm); - ret = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, - end, - invalidate_range_start_trampoline, - blockable, NULL); - up_read(&per_mm->umem_rwsem); - - return ret; + return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end, + invalidate_range_start_trampoline, + blockable, NULL); } static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start, @@ -259,17 +199,10 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, if (!per_mm->context->invalidate_range) return; - /* - * TODO: we currently bail out if there is any sleepable work to be done - * in ib_umem_notifier_invalidate_range_start so we shouldn't really block - * here. But this is ugly and fragile. - */ - down_read(&per_mm->umem_rwsem); rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end, invalidate_range_end_trampoline, true, NULL); up_read(&per_mm->umem_rwsem); - ib_ucontext_notifier_end_account(per_mm); } static const struct mmu_notifier_ops ib_umem_notifiers = { @@ -287,12 +220,6 @@ static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp) if (likely(ib_umem_start(umem) != ib_umem_end(umem))) rbt_ib_umem_insert(&umem_odp->interval_tree, &per_mm->umem_tree); - - if (likely(!atomic_read(&per_mm->notifier_count))) - umem_odp->mn_counters_active = true; - else - list_add(&umem_odp->no_private_counters, - &per_mm->no_private_counters); up_write(&per_mm->umem_rwsem); } @@ -305,10 +232,7 @@ static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp) if (likely(ib_umem_start(umem) != ib_umem_end(umem))) rbt_ib_umem_remove(&umem_odp->interval_tree, &per_mm->umem_tree); - if (!umem_odp->mn_counters_active) { - list_del(&umem_odp->no_private_counters); - complete_all(&umem_odp->notifier_completion); - } + complete_all(&umem_odp->notifier_completion); up_write(&per_mm->umem_rwsem); } @@ -327,7 +251,6 @@ static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx, per_mm->mm = mm; per_mm->umem_tree = RB_ROOT_CACHED; init_rwsem(&per_mm->umem_rwsem); - INIT_LIST_HEAD(&per_mm->no_private_counters); rcu_read_lock(); per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 259eb08dfc9e..ce9502545903 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -67,15 +67,9 @@ struct ib_umem_odp { struct mutex umem_mutex; void *private; /* for the HW driver to use. */ - /* When false, use the notifier counter in the ucontext struct. */ - bool mn_counters_active; int notifiers_seq; int notifiers_count; - /* A linked list of umems that don't have private mmu notifier - * counters yet. */ - struct list_head no_private_counters; - /* Tree tracking */ struct umem_odp_node interval_tree; @@ -99,11 +93,8 @@ struct ib_ucontext_per_mm { struct rb_root_cached umem_tree; /* Protects umem_tree */ struct rw_semaphore umem_rwsem; - atomic_t notifier_count; struct mmu_notifier mn; - /* A list of umems that don't have private mmu notifier counters yet. */ - struct list_head no_private_counters; unsigned int odp_mrs_count; struct list_head ucontext_list; @@ -162,12 +153,6 @@ static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, * and the ucontext umem_mutex semaphore locked for read). */ - /* Do not allow page faults while the new ib_umem hasn't seen a state - * with zero notifiers yet, and doesn't have its own valid set of - * private counters. */ - if (!umem_odp->mn_counters_active) - return 1; - if (unlikely(umem_odp->notifiers_count)) return 1; if (umem_odp->notifiers_seq != mmu_seq) -- 2.14.4