On Mon, Dec 16, 2019 at 11:57:32AM -0800, Ralph Campbell wrote:

> mmu_interval_notifier_insert() and mmu_interval_notifier_remove() can't
> be called safely from inside the invalidate() callback. This is fine for
> devices with explicit memory region register and unregister calls but it
> is desirable from a programming model standpoint to not require explicit
> memory region registration. Regions can be registered based on device
> address faults but without a mechanism for updating or removing the mmu
> interval notifiers in response to munmap(), the invalidation callbacks
> will be for regions that are stale or apply to different mmaped regions.
>
> The invalidate() callback provides the necessary information (i.e.,
> the event type MMU_NOTIFY_UNMAP) so add insert, remove, and update
> functions that are safe to call from the invalidate() callback by
> extending the work done in mn_itree_inv_end() to update the interval tree
> when it is not being traversed.
>
> Signed-off-by: Ralph Campbell <rcampbell@xxxxxxxxxx>
>  include/linux/mmu_notifier.h |  15 +++
>  mm/mmu_notifier.c            | 196 ++++++++++++++++++++++++++++++-----
>  2 files changed, 186 insertions(+), 25 deletions(-)
>
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> index 9e6caa8ecd19..55fbefcdc564 100644
> +++ b/include/linux/mmu_notifier.h
> @@ -233,11 +233,18 @@ struct mmu_notifier {
>   * @invalidate: Upon return the caller must stop using any SPTEs within this
>   *              range. This function can sleep. Return false only if sleeping
>   *              was required but mmu_notifier_range_blockable(range) is false.
> + * @release: This function will be called when the mmu_interval_notifier
> + *           is removed from the interval tree. Defining this function also
> + *           allows mmu_interval_notifier_remove() and
> + *           mmu_interval_notifier_update() to be called from the
> + *           invalidate() callback function (i.e., they won't block waiting
> + *           for invalidations to finish.
>   */
>  struct mmu_interval_notifier_ops {
>          bool (*invalidate)(struct mmu_interval_notifier *mni,
>                             const struct mmu_notifier_range *range,
>                             unsigned long cur_seq);
> +        void (*release)(struct mmu_interval_notifier *mni);
>  };
>
>  struct mmu_interval_notifier {
> @@ -246,6 +253,8 @@ struct mmu_interval_notifier {
>          struct mm_struct *mm;
>          struct hlist_node deferred_item;
>          unsigned long invalidate_seq;
> +        unsigned long deferred_start;
> +        unsigned long deferred_last;
>  };
>
>  #ifdef CONFIG_MMU_NOTIFIER
> @@ -299,7 +308,13 @@ int mmu_interval_notifier_insert_locked(
>          struct mmu_interval_notifier *mni, struct mm_struct *mm,
>          unsigned long start, unsigned long length,
>          const struct mmu_interval_notifier_ops *ops);
> +int mmu_interval_notifier_insert_safe(
> +        struct mmu_interval_notifier *mni, struct mm_struct *mm,
> +        unsigned long start, unsigned long length,
> +        const struct mmu_interval_notifier_ops *ops);
>  void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni);
> +int mmu_interval_notifier_update(struct mmu_interval_notifier *mni,
> +        unsigned long start, unsigned long length);
>
>  /**
>   * mmu_interval_set_seq - Save the invalidation sequence
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> index f76ea05b1cb0..c303285750b2 100644
> +++ b/mm/mmu_notifier.c
> @@ -129,6 +129,7 @@ static void mn_itree_inv_end(struct mmu_notifier_mm *mmn_mm)
>  {
>          struct mmu_interval_notifier *mni;
>          struct hlist_node *next;
> +        struct hlist_head removed_list;
>
>          spin_lock(&mmn_mm->lock);
>          if (--mmn_mm->active_invalidate_ranges ||
> @@ -144,21 +145,47 @@ static void mn_itree_inv_end(struct mmu_notifier_mm *mmn_mm)
>           * The inv_end incorporates a deferred mechanism like rtnl_unlock().
>           * Adds and removes are queued until the final inv_end happens then
>           * they are progressed. This arrangement for tree updates is used to
> -         * avoid using a blocking lock during invalidate_range_start.
> +         * avoid using a blocking lock while walking the interval tree.
>           */
> +        INIT_HLIST_HEAD(&removed_list);
>          hlist_for_each_entry_safe(mni, next, &mmn_mm->deferred_list,
>                                    deferred_item) {
> +                hlist_del(&mni->deferred_item);
>                  if (RB_EMPTY_NODE(&mni->interval_tree.rb))
>                          interval_tree_insert(&mni->interval_tree,
>                                               &mmn_mm->itree);
> -                else
> +                else {
>                          interval_tree_remove(&mni->interval_tree,
>                                               &mmn_mm->itree);
> -                hlist_del(&mni->deferred_item);
> +                        if (mni->deferred_last) {
> +                                mni->interval_tree.start = mni->deferred_start;
> +                                mni->interval_tree.last = mni->deferred_last;
> +                                mni->deferred_last = 0;

Technically we can have an interval starting at zero. I'd write it more like

        if (mni->updated_start == mni->updated_end)
                insert
        else
                remove

ie an empty interval can't get a notification so it should be removed from
the tree.

I also like the name 'updated' better than deferred, it is a bit clearer.

Adding release should be its own patch.
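Spelled out, one possible shape for that deferred-list walk is sketched below.
The updated_start/updated_end fields are hypothetical (an exclusive end,
standing in for the patch's deferred_start/deferred_last), and the sketch
follows the stated rationale that an empty interval amounts to a removal; it
is an illustration, not the actual proposed code:

        hlist_for_each_entry_safe(mni, next, &mmn_mm->deferred_list,
                                  deferred_item) {
                hlist_del(&mni->deferred_item);

                /* Drop any stale position in the tree first. */
                if (!RB_EMPTY_NODE(&mni->interval_tree.rb))
                        interval_tree_remove(&mni->interval_tree,
                                             &mmn_mm->itree);

                /* An empty interval can't get a notification: leave it out. */
                if (mni->updated_start == mni->updated_end)
                        continue;

                /* Otherwise (re)insert with the updated bounds. */
                mni->interval_tree.start = mni->updated_start;
                mni->interval_tree.last = mni->updated_end - 1;
                interval_tree_insert(&mni->interval_tree, &mmn_mm->itree);
        }

This collapses the three cases (deferred insert, deferred remove, deferred
update) into one remove-if-present / insert-if-non-empty sequence, with no
zero sentinel needed.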
> @@ -970,14 +999,52 @@ int mmu_interval_notifier_insert_locked(
>  EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);
>
>  /**
> - * mmu_interval_notifier_remove - Remove a interval notifier
> - * @mni: Interval notifier to unregister
> + * mmu_interval_notifier_insert_safe - Insert an interval notifier
> + * @mni: Interval notifier to register
> + * @mm : mm_struct to attach to
> + * @start: Starting virtual address to monitor
> + * @length: Length of the range to monitor
> + * @ops: Interval notifier callback operations
>   *
> - * This function must be paired with mmu_interval_notifier_insert(). It cannot
> - * be called from any ops callback.
> + * Return: -EINVAL if @mm hasn't been initialized for interval notifiers
> + *         by calling mmu_notifier_register(NULL, mm) or
> + *         __mmu_notifier_register(NULL, mm).
>   *
> - * Once this returns ops callbacks are no longer running on other CPUs and
> - * will not be called in future.
> + * This function subscribes the interval notifier for notifications from the
> + * mm. Upon return, the ops related to mmu_interval_notifier will be called
> + * whenever an event that intersects with the given range occurs.
> + *
> + * This function is safe to call from the ops->invalidate() function.
> + * Upon return, the mmu_interval_notifier may not be present in the interval
> + * tree yet. The caller must use the normal interval notifier read flow via
> + * mmu_interval_read_begin() to establish SPTEs for this range.

So why do we need this?

You can't call hmm_range_fault from a notifier. You just can't.

So there should be no reason to create an interval from the notifier; do it
from where you call hmm_range_fault, where it must be safe to obtain the
mmap_sem. (A sketch of that flow is further below.)

> +/**
> + * mmu_interval_notifier_update - Update interval notifier end
> + * @mni: Interval notifier to update
> + * @start: New starting virtual address to monitor
> + * @length: New length of the range to monitor
> + *
> + * This function updates the range being monitored.
> + * If there is no release() function defined, the call will wait for the
> + * update to finish before returning.
> + */
> +int mmu_interval_notifier_update(struct mmu_interval_notifier *mni,
> +        unsigned long start, unsigned long length)
> +{
> +        struct mm_struct *mm = mni->mm;
> +        struct mmu_notifier_mm *mmn_mm = mm->mmu_notifier_mm;
> +        unsigned long seq = 0;
> +        unsigned long last;
> +
> +        if (length == 0 || check_add_overflow(start, length - 1, &last))
> +                return -EOVERFLOW;
> +
> +        spin_lock(&mmn_mm->lock);
> +        if (mn_itree_is_invalidating(mmn_mm)) {
> +                /*
> +                 * Update is being called after insert put this on the
> +                 * deferred list, but before the deferred list was processed.
> +                 */
> +                if (RB_EMPTY_NODE(&mni->interval_tree.rb)) {
> +                        mni->interval_tree.start = start;
> +                        mni->interval_tree.last = last;
> +                } else {
> +                        if (!mni->deferred_last)
> +                                hlist_add_head(&mni->deferred_item,
> +                                               &mmn_mm->deferred_list);
> +                        mni->deferred_start = start;
> +                        mni->deferred_last = last;
> +                }
> +                seq = mmn_mm->invalidate_seq;
> +        } else {
> +                WARN_ON(RB_EMPTY_NODE(&mni->interval_tree.rb));
> +                interval_tree_remove(&mni->interval_tree, &mmn_mm->itree);
> +                mni->interval_tree.start = start;
> +                mni->interval_tree.last = last;
> +                interval_tree_insert(&mni->interval_tree, &mmn_mm->itree);
> +        }
> +        spin_unlock(&mmn_mm->lock);
> +
> +        if (!mni->ops->release && seq) {
> +                /*
> +                 * The possible sleep on progress in the invalidation requires
> +                 * the caller not hold any locks held by invalidation
> +                 * callbacks.
> +                 */
> +                lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
> +                lock_map_release(&__mmu_notifier_invalidate_range_start_map);
>                  wait_event(mmn_mm->wq,
>                             READ_ONCE(mmn_mm->invalidate_seq) != seq);
> +        }
>
> -        /* pairs with mmgrab in mmu_interval_notifier_insert() */
> -        mmdrop(mm);
> +        return 0;
>  }
> -EXPORT_SYMBOL_GPL(mmu_interval_notifier_remove);
> +EXPORT_SYMBOL_GPL(mmu_interval_notifier_update);

An 'update' should probably be the same as an insert: it doesn't necessarily
take effect until mmu_interval_read_begin(), so nothing should be contingent
on release.

As before, I'm not sure what to do with this.
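For reference, a rough sketch of the flow being suggested: the interval
notifier is created from the thread that services the device fault (the same
place hmm_range_fault() runs and mmap_sem can be taken), and the device page
tables are only committed if mmu_interval_read_retry() reports that no
invalidation raced. Everything named dmirror_* here, the drv fields, and
dmirror_mni_ops are made up for illustration; only the mmu_interval_*
calls, down_read()/up_read() and mmap_sem are the real kernel interfaces:

        /* Hypothetical per-device state. */
        struct dmirror {
                struct mm_struct *mm;
                struct mmu_interval_notifier mni;
                bool mni_registered;
                struct mutex lock;      /* serializes against ->invalidate() */
        };

        static int dmirror_fault(struct dmirror *drv, unsigned long start,
                                 unsigned long length)
        {
                unsigned long seq;
                int ret;

                /* Register from the fault path, never from the notifier. */
                if (!drv->mni_registered) {
                        ret = mmu_interval_notifier_insert(&drv->mni, drv->mm,
                                                           start, length,
                                                           &dmirror_mni_ops);
                        if (ret)
                                return ret;
                        drv->mni_registered = true;
                }

                do {
                        seq = mmu_interval_read_begin(&drv->mni);

                        down_read(&drv->mm->mmap_sem);
                        /* e.g. hmm_range_fault() filling a pfn array */
                        ret = dmirror_populate_range(drv, start, length);
                        up_read(&drv->mm->mmap_sem);
                        if (ret)
                                return ret;

                        mutex_lock(&drv->lock);
                        if (mmu_interval_read_retry(&drv->mni, seq)) {
                                /* An invalidation raced; fault again. */
                                mutex_unlock(&drv->lock);
                                continue;
                        }
                        dmirror_program_sptes(drv, start, length);
                        mutex_unlock(&drv->lock);
                        return 0;
                } while (true);
        }

The matching invalidate() callback would take drv->lock, call
mmu_interval_set_seq(), and tear down the affected device mappings; that
side is omitted here.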
We need an in-kernel user for new APIs, and I don't see a reason to make this
more complicated for a test program. The test program should match one of the
existing driver flows, so use the page-table-like scheme from ODP or the
fixed-lifetime scheme from AMDGPU/ODP.

Jason