[PATCH RFC] mm/mmu_notifier: get rid of srcu-based synchronization

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This patch removes srcu-based protection from mmu-notitifier and collects all
synchronization in mmu_notifier_unregister()/mmu_notifier_release().

All mmu notifier methods are called either under mmap_sem or under one of rmap
locks: root anon vma lock or inode mmaping lock. Because they always operates in
particular vma and caller must protect it somehow. Thus we can use mmap_sem and
rmap locks for waiting for all currently running mmu-notifier methods.

This patch adds new helper function: mm_synchronize_all_locks(). This function
acquires and releases all rmap locks one by one, it much faster than sequence
mm_take_all_locks() - mm_drop_all_locks(). mm_synchronize_all_locks() needs only
mmap_sem read-lock, but caller must lock mmap_sem for write at least once to
synchronize with the rest operations which are protected with mmap_sem read-lock.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@xxxxxxxxxx>
Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: linux-mm@xxxxxxxxx
---
 include/linux/mm.h           |    1 
 include/linux/mmu_notifier.h |    1 
 mm/mmap.c                    |   26 ++++++++++++
 mm/mmu_notifier.c            |   91 ++++++++++++++----------------------------
 4 files changed, 57 insertions(+), 62 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 66e2f7c..86cf9f3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1461,6 +1461,7 @@ extern void exit_mmap(struct mm_struct *);
 
 extern int mm_take_all_locks(struct mm_struct *mm);
 extern void mm_drop_all_locks(struct mm_struct *mm);
+extern void mm_synchronize_all_locks(struct mm_struct *mm);
 
 extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
 extern struct file *get_mm_exe_file(struct mm_struct *mm);
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index bc823c4..fa55d89 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -4,7 +4,6 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/mm_types.h>
-#include <linux/srcu.h>
 
 struct mmu_notifier;
 struct mmu_notifier_ops;
diff --git a/mm/mmap.c b/mm/mmap.c
index 0fb6805..89e884e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3043,6 +3043,32 @@ void mm_drop_all_locks(struct mm_struct *mm)
 }
 
 /*
+ * This function waits for all running pte operations which are protected with
+ * rmap locks like try_to_unmap(). To wait for rest operations like page-faults
+ * which runs under mmap_sem caller must lock mmap_sem for write at least once.
+ * This function itself requires only mmap_sem locked for read.
+ */
+void mm_synchronize_all_locks(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+
+	BUG_ON(down_write_trylock(&mm->mmap_sem));
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (vma->anon_vma) {
+			anon_vma_lock_write(vma->anon_vma);
+			anon_vma_unlock_write(vma->anon_vma);
+		}
+		if (vma->vm_file && vma->vm_file->f_mapping) {
+			struct address_space *mapping = vma->vm_file->f_mapping;
+
+			mutex_lock(&mapping->i_mmap_mutex);
+			mutex_unlock(&mapping->i_mmap_mutex);
+		}
+	}
+}
+
+/*
  * initialise the VMA slab
  */
 void __init mmap_init(void)
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8a5ac8c..6b77a9f 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -14,37 +14,31 @@
 #include <linux/export.h>
 #include <linux/mm.h>
 #include <linux/err.h>
-#include <linux/srcu.h>
-#include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 
-/* global SRCU for all MMs */
-static struct srcu_struct srcu;
-
 /*
  * This function can't run concurrently against mmu_notifier_register
  * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
  * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
  * in parallel despite there being no task using this mm any more,
  * through the vmas outside of the exit_mmap context, such as with
- * vmtruncate. This serializes against mmu_notifier_unregister with
- * the mmu_notifier_mm->lock in addition to SRCU and it serializes
- * against the other mmu notifiers with SRCU. struct mmu_notifier_mm
- * can't go away from under us as exit_mmap holds an mm_count pin
- * itself.
+ * unmap_mapping_range(). This serializes against mmu_notifier_unregister()
+ * and other mmu notifiers with the mm->mmap_sem and the mmu_notifier_mm->lock.
+ * struct mmu_notifier_mm can't go away from under us as exit_mmap holds
+ * an mm_count pin itself.
  */
 void __mmu_notifier_release(struct mm_struct *mm)
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
-	int id;
 
 	/*
-	 * SRCU here will block mmu_notifier_unregister until
-	 * ->release returns.
+	 * Block mmu_notifier_unregister() until ->release returns
+	 * and synchronize with concurrent page-faults.
 	 */
-	id = srcu_read_lock(&srcu);
+	down_write(&mm->mmap_sem);
+
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
 		/*
 		 * if ->release runs before mmu_notifier_unregister it
@@ -55,7 +49,6 @@ void __mmu_notifier_release(struct mm_struct *mm)
 		 */
 		if (mn->ops->release)
 			mn->ops->release(mn, mm);
-	srcu_read_unlock(&srcu, id);
 
 	spin_lock(&mm->mmu_notifier_mm->lock);
 	while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -73,15 +66,19 @@ void __mmu_notifier_release(struct mm_struct *mm)
 	spin_unlock(&mm->mmu_notifier_mm->lock);
 
 	/*
-	 * synchronize_srcu here prevents mmu_notifier_release to
+	 * locked mm->mmap_sem here prevents mmu_notifier_release to
 	 * return to exit_mmap (which would proceed freeing all pages
 	 * in the mm) until the ->release method returns, if it was
 	 * invoked by mmu_notifier_unregister.
 	 *
 	 * The mmu_notifier_mm can't go away from under us because one
 	 * mm_count is hold by exit_mmap.
+	 *
+	 * Explicit synchronization with mmu notifier methods isn't
+	 * required because exit_mmap() calls free_pgtables() after us,
+	 * which locks/unlocks all locks like mm_synchronize_all_locks().
 	 */
-	synchronize_srcu(&srcu);
+	up_write(&mm->mmap_sem);
 }
 
 /*
@@ -94,14 +91,12 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
-	int young = 0, id;
+	int young = 0;
 
-	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->clear_flush_young)
 			young |= mn->ops->clear_flush_young(mn, mm, address);
 	}
-	srcu_read_unlock(&srcu, id);
 
 	return young;
 }
@@ -111,9 +106,8 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
-	int young = 0, id;
+	int young = 0;
 
-	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->test_young) {
 			young = mn->ops->test_young(mn, mm, address);
@@ -121,7 +115,6 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
 				break;
 		}
 	}
-	srcu_read_unlock(&srcu, id);
 
 	return young;
 }
@@ -131,14 +124,11 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
-	int id;
 
-	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->change_pte)
 			mn->ops->change_pte(mn, mm, address, pte);
 	}
-	srcu_read_unlock(&srcu, id);
 }
 
 void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -146,14 +136,11 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm,
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
-	int id;
 
-	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->invalidate_page)
 			mn->ops->invalidate_page(mn, mm, address);
 	}
-	srcu_read_unlock(&srcu, id);
 }
 
 void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
@@ -161,14 +148,11 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
-	int id;
 
-	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->invalidate_range_start)
 			mn->ops->invalidate_range_start(mn, mm, start, end);
 	}
-	srcu_read_unlock(&srcu, id);
 }
 
 void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
@@ -176,14 +160,11 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
-	int id;
 
-	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->invalidate_range_end)
 			mn->ops->invalidate_range_end(mn, mm, start, end);
 	}
-	srcu_read_unlock(&srcu, id);
 }
 
 static int do_mmu_notifier_register(struct mmu_notifier *mn,
@@ -195,12 +176,6 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
 
 	BUG_ON(atomic_read(&mm->mm_users) <= 0);
 
-	/*
-	 * Verify that mmu_notifier_init() already run and the global srcu is
-	 * initialized.
-	 */
-	BUG_ON(!srcu.per_cpu_ref);
-
 	ret = -ENOMEM;
 	mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
 	if (unlikely(!mmu_notifier_mm))
@@ -283,8 +258,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm)
 /*
  * This releases the mm_count pin automatically and frees the mm
  * structure if it was the last user of it. It serializes against
- * running mmu notifiers with SRCU and against mmu_notifier_unregister
- * with the unregister lock + SRCU. All sptes must be dropped before
+ * mmu_notifier_unregister() with mmap_sem and agaings mmu notifiers
+ * with the mmap_sem + rmap locks. All sptes must be dropped before
  * calling mmu_notifier_unregister. ->release or any other notifier
  * method may be invoked concurrently with mmu_notifier_unregister,
  * and only after mmu_notifier_unregister returned we're guaranteed
@@ -294,14 +269,12 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
 {
 	BUG_ON(atomic_read(&mm->mm_count) <= 0);
 
-	if (!hlist_unhashed(&mn->hlist)) {
-		/*
-		 * SRCU here will force exit_mmap to wait ->release to finish
-		 * before freeing the pages.
-		 */
-		int id;
+	/*
+	 * Synchronize with concurrent mmu_notifier_release() and page-faults.
+	 */
+	down_write(&mm->mmap_sem);
 
-		id = srcu_read_lock(&srcu);
+	if (!hlist_unhashed(&mn->hlist)) {
 		/*
 		 * exit_mmap will block in mmu_notifier_release to
 		 * guarantee ->release is called before freeing the
@@ -309,28 +282,24 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
 		 */
 		if (mn->ops->release)
 			mn->ops->release(mn, mm);
-		srcu_read_unlock(&srcu, id);
 
 		spin_lock(&mm->mmu_notifier_mm->lock);
 		hlist_del_rcu(&mn->hlist);
 		spin_unlock(&mm->mmu_notifier_mm->lock);
 	}
 
+	downgrade_write(&mm->mmap_sem);
 	/*
-	 * Wait any running method to finish, of course including
-	 * ->release if it was run by mmu_notifier_relase instead of us.
+	 * Wait any running method to finish. All such operations are protected
+	 * either with mm->mmap_sem like handle_pte_fault() or with rmap locks
+	 * like try_to_unmap(). They need one of these locks to protect vma/pte
+	 * area where they operates.
 	 */
-	synchronize_srcu(&srcu);
+	mm_synchronize_all_locks(mm);
+	up_read(&mm->mmap_sem);
 
 	BUG_ON(atomic_read(&mm->mm_count) <= 0);
 
 	mmdrop(mm);
 }
 EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
-
-static int __init mmu_notifier_init(void)
-{
-	return init_srcu_struct(&srcu);
-}
-
-module_init(mmu_notifier_init);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>


[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]