[PATCH/RFC 4/8] numa - Migrate-on-Fault - migrate misplaced pages

Migrate-on-fault - migrate misplaced page

This patch adds a new function migrate_misplaced_page() to mm/migrate.c
[where most of the other page migration functions live] to migrate a
misplaced page to a specified destination node.  This function will be
called from the fault path.  Because we already know the destination
node for the migration, we allocate pages directly rather than rerunning
the policy node computation in alloc_page_vma().
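
For illustration, the allocation step reduces to the following sketch
[GFP details simplified; the real flag setup is in
migrate_misplaced_page() in the diff below]:

	/*
	 * Sketch only:  the destination node 'dest' is already known,
	 * so allocate there directly instead of recomputing the
	 * policy node via alloc_page_vma().
	 */
	if (interleaved)
		newpage = alloc_page_interleave(gfp, 0, dest);
	else
		newpage = alloc_pages_node(dest, gfp, 0);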

We want to ignore the extra page ref count when replacing the
page in its mapping in the fault path.  To accomplish this I've added
a boolean [int] argument "faulting" to the migratepage op functions.
This arg gets passed down to migrate_page_move_mapping():  0 for direct
migration, !0 for migrate-on-fault.
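
Sketched from the hunks below, direct-migration callers pass 0 and the
fault path passes 1:

	/* direct migration, e.g., in move_to_new_page(): */
	rc = mapping->a_ops->migratepage(mapping, newpage, page, 0);

	/* migrate-on-fault, in migrate_misplaced_page(): */
	rc = mapping->a_ops->migratepage(mapping, newpage, page, 1);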

NOTE:  at one point I had convinced myself that ignoring the ref count
in this path was safe.  Since then, a lot of changes have been made and
I've heard it said that raising the ref count disables migration.  This
might require rework--e.g., to account for the extra ref rather than
ignoring refs.

The patch adds the function check_migrate_misplaced_page() to migrate.c
to check whether a page is "misplaced"--i.e.  on a node different
from what the policy for (vma, address) dictates.  This check
involves accessing the vma policy, so we only do this if:
   * migrate-on-fault is enabled for this task [via cpuset ctrl]
   * page has zero mapcount [no pte references]
   * page is not in writeback
   * page is up to date
   * page's mapping has a migratepage a_op [no fallback!]
If these checks are satisfied, the page will be migrated to the
"correct" node, if possible.  If migration fails for any reason,
we just use the original page.
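
Condensed from check_migrate_misplaced_page() in the diff below [the
cpuset enablement check happens in the caller], the per-fault test
amounts to:

	if (page_mapcount(page) || PageWriteback(page) ||
			unlikely(!PageUptodate(page)) ||
			!page_mapping(page)->a_ops->migratepage)
		return page;		/* use the page where it is */

	misplaced = mpol_misplaced(page, vma, address, &polnid);
	if (!misplaced)
		return page;		/* correctly placed */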

Note that when MIGRATE_ON_FAULT is not configured, the
check_migrate_misplaced_page() function becomes a static inline
function that just returns its page argument.

Subsequent patches will hook the fault handlers [anon, and possibly
file and/or shmem] to check_migrate_misplaced_page().
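
For a rough idea of the hookup -- hypothetical here, since the actual
hunks come in the later patches -- a swap-fault hook might look like:

	/*
	 * Hypothetical sketch of the do_swap_page() hook [added by a
	 * later patch in this series]:  the page was just found in the
	 * swap cache and is locked; on return 'page' may be a newly
	 * migrated copy, also locked.
	 */
	if (migrate_on_fault_enabled(current))
		page = check_migrate_misplaced_page(page, vma, address);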

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@xxxxxx>

 fs/nfs/write.c            |    2 
 include/linux/fs.h        |    8 -
 include/linux/gfp.h       |    3 
 include/linux/mempolicy.h |   23 +----
 include/linux/migrate.h   |   22 ++++-
 mm/mempolicy.c            |   21 ++++
 mm/migrate.c              |  202 ++++++++++++++++++++++++++++++++++++++++------
 7 files changed, 230 insertions(+), 51 deletions(-)

Index: linux-2.6.36-mmotm-101103-1217/include/linux/mempolicy.h
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/include/linux/mempolicy.h
+++ linux-2.6.36-mmotm-101103-1217/include/linux/mempolicy.h
@@ -67,6 +67,7 @@ enum mpol_rebind_step {
 #include <linux/nodemask.h>
 #include <linux/pagemap.h>
 #include <linux/shared_policy.h>
+#include <linux/migrate.h>
 
 struct mm_struct;
 
@@ -223,22 +224,7 @@ extern int mpol_to_str(char *buffer, int
 			int no_context);
 #endif
 
-/* Check if a vma is migratable */
-static inline int vma_migratable(struct vm_area_struct *vma)
-{
-	if (vma->vm_flags & (VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
-		return 0;
-	/*
-	 * Migration allocates pages in the highest zone. If we cannot
-	 * do so then migration (at least from node to node) is not
-	 * possible.
-	 */
-	if (vma->vm_file &&
-		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
-								< policy_zone)
-			return 0;
-	return 1;
-}
+extern int vma_migratable(struct vm_area_struct *);
 
 struct seq_file;
 extern int show_numa_map(struct seq_file *, void *);
@@ -248,11 +234,12 @@ extern struct mpol_range *get_numa_subma
 #ifdef CONFIG_MIGRATE_ON_FAULT
 #define MPOL_MIGRATE_NONINTERLEAVED 1
 #define MPOL_MIGRATE_INTERLEAVED 2
-#define misplaced_is_interleaved(pol) (MPOL_MIGRATE_INTERLEAVED - 1)
+#define misplaced_is_interleaved(pol) ((pol) == MPOL_MIGRATE_INTERLEAVED)
 
 extern int mpol_misplaced(struct page *, struct vm_area_struct *,
 		unsigned long, int *);
-#endif
+
+#endif /* CONFIG_MIGRATE_ON_FAULT */
 
 #else
 
Index: linux-2.6.36-mmotm-101103-1217/include/linux/fs.h
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/include/linux/fs.h
+++ linux-2.6.36-mmotm-101103-1217/include/linux/fs.h
@@ -608,8 +608,8 @@ struct address_space_operations {
 	int (*get_xip_mem)(struct address_space *, pgoff_t, int,
 						void **, unsigned long *);
 	/* migrate the contents of a page to the specified target */
-	int (*migratepage) (struct address_space *,
-			struct page *, struct page *);
+	int (*migratepage) (struct address_space *, struct page *,
+			struct page *, int);
 	int (*launder_page) (struct page *);
 	int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
 					unsigned long);
@@ -2451,8 +2451,8 @@ extern int generic_file_fsync(struct fil
 extern int generic_check_addressable(unsigned, u64);
 
 #ifdef CONFIG_MIGRATION
-extern int buffer_migrate_page(struct address_space *,
-				struct page *, struct page *);
+extern int buffer_migrate_page(struct address_space *, struct page *,
+				struct page *, int);
 #else
 #define buffer_migrate_page NULL
 #endif
Index: linux-2.6.36-mmotm-101103-1217/include/linux/gfp.h
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/include/linux/gfp.h
+++ linux-2.6.36-mmotm-101103-1217/include/linux/gfp.h
@@ -329,11 +329,14 @@ extern struct page *alloc_page_vma(gfp_t
 			struct vm_area_struct *vma, unsigned long addr);
 struct mempolicy;
 extern struct page *alloc_page_pol(gfp_t, struct mempolicy *, pgoff_t);
+extern struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
+					unsigned nid);
 #else
 #define alloc_pages(gfp_mask, order) \
 		alloc_pages_node(numa_node_id(), gfp_mask, order)
 #define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0)
 #define alloc_page_pol(gfp_mask, pol, off)  alloc_pages(gfp_mask, 0)
+#define alloc_page_interleave(gfp_mask, order, nid) alloc_pages(gfp_mask, 0)
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
 
Index: linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/mm/mempolicy.c
+++ linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c
@@ -464,6 +464,25 @@ static void gather_stats(struct page *,
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 				unsigned long flags);
 
+/*
+ * Check whether a vma is migratable
+ */
+int vma_migratable(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & (VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
+		return 0;
+	/*
+	 * Migration allocates pages in the highest zone. If we cannot
+	 * do so then migration (at least from node to node) is not
+	 * possible.
+	 */
+	if (vma->vm_file &&
+		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
+								< policy_zone)
+			return 0;
+	return 1;
+}
+
 /* Scan through pages checking if pages follow certain conditions. */
 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, unsigned long end,
@@ -1901,7 +1920,7 @@ out:
 
 /* Allocate a page in interleaved policy.
    Own path because it needs to do special accounting. */
-static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
+struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
 					unsigned nid)
 {
 	struct zonelist *zl;
Index: linux-2.6.36-mmotm-101103-1217/mm/migrate.c
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/mm/migrate.c
+++ linux-2.6.36-mmotm-101103-1217/mm/migrate.c
@@ -216,16 +216,28 @@ out:
 	pte_unmap_unlock(ptep, ptl);
 }
 
-/*
- * Replace the page in the mapping.
+/**
+ * migrate_page_move_mapping - Replace the page in the mapping.
+ * @mapping:  address_space in which to replace the page
+ * @newpage:  the replacement page
+ * @page:     page to be replaced -- key to the slot in the mapping
+ * @faulting: non-zero when [lazily] migrating in the fault path
+ *
+ * For direct migration [!faulting], the number of remaining references
+ * must be:
+ *   1 for anonymous pages without a mapping
+ *   2 for pages with a mapping
+ *   3 for pages with a mapping and PagePrivate set.
  *
- * The number of remaining references must be:
- * 1 for anonymous pages without a mapping
- * 2 for pages with a mapping
- * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
+ * However, if we're in the fault path, we found the page in a cache,
+ * up-to-date with mapcount == 0.  We hold the page locked.  After we
+ * locked the page, another task could have faulted, found the page in
+ * the cache and thus increased the ref.  We want to allow migrate on
+ * fault to proceed in this case, so we ignore the refs when @faulting.
+ * TODO:  is this true?  Can we really ignore ref count in this case?
  */
 static int migrate_page_move_mapping(struct address_space *mapping,
-		struct page *newpage, struct page *page)
+		struct page *newpage, struct page *page, int faulting)
 {
 	int expected_count;
 	void **pslot;
@@ -240,9 +252,17 @@ static int migrate_page_move_mapping(str
 	spin_lock_irq(&mapping->tree_lock);
 
 	pslot = radix_tree_lookup_slot(&mapping->page_tree,
- 					page_index(page));
+					page_index(page));
 
-	expected_count = 2 + page_has_private(page);
+	if (!faulting)
+		expected_count = 2 + !!PagePrivate(page);
+	else
+		expected_count = page_count(page); /* for page_freeze_refs() */
+
+	/*
+	 * There is a window here in which a change in the reference
+	 * count on the page will block migration in the fault path.  So be it.
+	 */
 	if (page_count(page) != expected_count ||
 			(struct page *)radix_tree_deref_slot(pslot) != page) {
 		spin_unlock_irq(&mapping->tree_lock);
@@ -397,8 +417,8 @@ void migrate_page_copy(struct page *newp
  ***********************************************************/
 
 /* Always fail migration. Used for mappings that are not movable */
-int fail_migrate_page(struct address_space *mapping,
-			struct page *newpage, struct page *page)
+int fail_migrate_page(struct address_space *mapping, struct page *newpage,
+			struct page *page, int faulting)
 {
 	return -EIO;
 }
@@ -410,14 +430,14 @@ EXPORT_SYMBOL(fail_migrate_page);
  *
  * Pages are locked upon entry and exit.
  */
-int migrate_page(struct address_space *mapping,
-		struct page *newpage, struct page *page)
+int migrate_page(struct address_space *mapping, struct page *newpage,
+			struct page *page, int faulting)
 {
 	int rc;
 
 	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
 
-	rc = migrate_page_move_mapping(mapping, newpage, page);
+	rc = migrate_page_move_mapping(mapping, newpage, page, faulting);
 
 	if (rc)
 		return rc;
@@ -433,18 +453,18 @@ EXPORT_SYMBOL(migrate_page);
  * if the underlying filesystem guarantees that no other references to "page"
  * exist.
  */
-int buffer_migrate_page(struct address_space *mapping,
-		struct page *newpage, struct page *page)
+int buffer_migrate_page(struct address_space *mapping, struct page *newpage,
+			struct page *page, int faulting)
 {
 	struct buffer_head *bh, *head;
 	int rc;
 
 	if (!page_has_buffers(page))
-		return migrate_page(mapping, newpage, page);
+		return migrate_page(mapping, newpage, page, faulting);
 
 	head = page_buffers(page);
 
-	rc = migrate_page_move_mapping(mapping, newpage, page);
+	rc = migrate_page_move_mapping(mapping, newpage, page, faulting);
 
 	if (rc)
 		return rc;
@@ -545,7 +565,7 @@ static int fallback_migrate_page(struct
 	    !try_to_release_page(page, GFP_KERNEL))
 		return -EAGAIN;
 
-	return migrate_page(mapping, newpage, page);
+	return migrate_page(mapping, newpage, page, 0);
 }
 
 /*
@@ -581,7 +601,7 @@ static int move_to_new_page(struct page
 
 	mapping = page_mapping(page);
 	if (!mapping)
-		rc = migrate_page(mapping, newpage, page);
+		rc = migrate_page(mapping, newpage, page, 0);
 	else if (mapping->a_ops->migratepage)
 		/*
 		 * Most pages have a mapping and most filesystems
@@ -591,7 +611,7 @@ static int move_to_new_page(struct page
 		 * path for page migration.
 		 */
 		rc = mapping->a_ops->migratepage(mapping,
-						newpage, page);
+						newpage, page, 0);
 	else
 		rc = fallback_migrate_page(mapping, newpage, page);
 
@@ -762,7 +782,7 @@ unlock:
  		/*
  		 * A page that has been migrated has all references
  		 * removed and will be freed. A page that has not been
- 		 * migrated will have kepts its references and be
+ 		 * migrated will have kept its references and be
  		 * restored.
  		 */
  		list_del(&page->lru);
@@ -1350,4 +1370,140 @@ int migrate_vmas(struct mm_struct *mm, c
  	}
  	return err;
 }
-#endif
+
+#ifdef CONFIG_MIGRATE_ON_FAULT
+/*
+ * Attempt to migrate a misplaced page to the specified destination
+ * node.  Page is already unmapped, up to date and locked by caller.
+ * Anon pages are in the swap cache.  Page's mapping has a migratepage aop.
+ *
+ * page refs on entry/exit:  cache + fault path [+ bufs]
+ */
+struct page *migrate_misplaced_page(struct page *page,
+				 struct mm_struct *mm,
+				 int dest, int interleaved)
+{
+	struct page *oldpage = page, *newpage;
+	struct address_space *mapping = page_mapping(page);
+	struct mem_cgroup *mcg;
+	gfp_t gfp;
+	int rc = 0;
+	int charge = -ENOMEM;	/* in case alloc_*() fails */
+
+	/* TODO:  explicit assertions during debug/testing; remove later? */
+	VM_BUG_ON(!PageLocked(page));
+	VM_BUG_ON(page_mapcount(page));
+	VM_BUG_ON(PageAnon(page) && !PageSwapCache(page));
+	VM_BUG_ON(!mapping || !mapping->a_ops->migratepage);
+
+	/*
+	 * Remove the old page from the LRU so it can't be found while
+	 * migrating, except through the cache by other faulting tasks,
+	 * which will block behind our page lock.
+	 */
+	if (isolate_lru_page(page))	/* increments page count on success */
+		goto out_nolru;		/* we lost the race */
+
+	/*
+	 * Never wait for allocations just to migrate on fault,
+	 * but don't dip into reserves.
+	 * Also, only accept pages from the specified node --
+	 * no sense replacing one misplaced page with another!
+	 */
+	gfp = mapping_gfp_mask(mapping) & ~__GFP_WAIT;
+	gfp |= __GFP_NOMEMALLOC | GFP_THISNODE;
+
+	if (interleaved)
+		newpage = alloc_page_interleave(gfp, 0, dest);
+	else
+		newpage = alloc_pages_node(dest, gfp, 0);
+
+	if (!newpage)
+		goto out;	/* give up */
+
+	/*
+	 * can't just lock_page() -- "might sleep" in atomic context
+	 */
+	if (!trylock_page(newpage))
+		BUG();		/* new page should be unlocked!!! */
+
+	/* TODO:  are we in the correct state to do this?  When called from
+	 * do_swap_page() we have a pending charge on this page.  Revisit
+	 * when memcontrol settles down. */
+	charge = mem_cgroup_prepare_migration(page, newpage, &mcg);
+	if (charge == -ENOMEM) {
+		rc = charge;
+		goto out;
+	}
+
+	newpage->index = page->index;
+	newpage->mapping = page->mapping;
+	if (PageSwapBacked(page))		/* like move_to_new_page() */
+		SetPageSwapBacked(newpage);
+
+	/*
+	 * migrate a_op transfers cache [+ buf] refs
+	 */
+	rc = mapping->a_ops->migratepage(mapping, newpage,
+						 page, 1);
+	if (!rc) {
+		get_page(newpage);	/* add isolate_lru_page ref */
+		put_page(page);		/* drop isolate_lru_page ref */
+
+		unlock_page(page);
+		put_page(page);		/* drop fault path ref & free */
+
+		page = newpage;
+	}
+
+out:
+	if (!charge)
+		mem_cgroup_end_migration(mcg, oldpage, newpage);
+
+	if (rc) {
+		unlock_page(newpage);
+		__free_page(newpage);
+	}
+
+	putback_lru_page(page);		/* ultimately, drops a page ref */
+
+out_nolru:
+	return page;			/* locked, to complete fault */
+
+}
+
+/*
+ * Called in the fault path, if migrate_on_fault_enabled(current), for a
+ * page found in the cache.  Page is locked and page_mapping(page) != NULL.
+ * We check for page uptodate here because we want to be able to do any
+ * needed migration before grabbing the page table lock.  In the anon fault
+ * path, PageUptodate() isn't checked until after locking the page table.
+ *
+ * For migrate on fault, we only migrate pages whose mapping has a
+ * migratepage op.  The fallback path requires writing out the page and
+ * reading it back in.  That sort of defeats the purpose of
+ * migrate-on-fault [performance].  So, we don't even bother to check
+ * for misplacement unless the op is present.  Of course, this is an extra
+ * check in the fault path for pages we care about :-(
+ */
+struct page *check_migrate_misplaced_page(struct page *page,
+		struct vm_area_struct *vma, unsigned long address)
+{
+	int polnid, misplaced;
+
+	if (page_mapcount(page) || PageWriteback(page) ||
+			unlikely(!PageUptodate(page))  ||
+			!page_mapping(page)->a_ops->migratepage)
+		return page;
+
+	misplaced = mpol_misplaced(page, vma, address, &polnid);
+	if (!misplaced)
+		return page;
+
+	return migrate_misplaced_page(page, vma->vm_mm, polnid,
+			misplaced_is_interleaved(misplaced));
+
+}
+
+#endif /* CONFIG_MIGRATE_ON_FAULT */
+#endif /* CONFIG_NUMA */
Index: linux-2.6.36-mmotm-101103-1217/include/linux/migrate.h
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/include/linux/migrate.h
+++ linux-2.6.36-mmotm-101103-1217/include/linux/migrate.h
@@ -10,15 +10,15 @@ typedef struct page *new_page_t(struct p
 #define PAGE_MIGRATION 1
 
 extern void putback_lru_pages(struct list_head *l);
-extern int migrate_page(struct address_space *,
-			struct page *, struct page *);
+extern int migrate_page(struct address_space *, struct page *,
+				struct page *, int);
 extern int migrate_pages(struct list_head *l, new_page_t x,
 			unsigned long private, int offlining);
 extern int migrate_huge_pages(struct list_head *l, new_page_t x,
 			unsigned long private, int offlining);
 
-extern int fail_migrate_page(struct address_space *,
-			struct page *, struct page *);
+extern int fail_migrate_page(struct address_space *, struct page *,
+				struct page *, int);
 
 extern int migrate_prep(void);
 extern int migrate_prep_local(void);
@@ -28,6 +28,20 @@ extern int migrate_vmas(struct mm_struct
 extern void migrate_page_copy(struct page *newpage, struct page *page);
 extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 				  struct page *newpage, struct page *page);
+
+#ifdef CONFIG_MIGRATE_ON_FAULT
+extern struct page *check_migrate_misplaced_page(struct page *,
+			struct vm_area_struct *, unsigned long);
+extern struct page *migrate_misplaced_page(struct page *, struct mm_struct *,
+			int, int);
+#else
+static inline struct page *check_migrate_misplaced_page(struct page *page,
+			struct vm_area_struct *vma, unsigned long addr)
+{
+	return page;
+}
+
+#endif /* CONFIG_MIGRATE_ON_FAULT */
 #else
 #define PAGE_MIGRATION 0
 
Index: linux-2.6.36-mmotm-101103-1217/fs/nfs/write.c
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/fs/nfs/write.c
+++ linux-2.6.36-mmotm-101103-1217/fs/nfs/write.c
@@ -1562,7 +1562,7 @@ int nfs_migrate_page(struct address_spac
 	if (IS_ERR(req))
 		goto out;
 
-	ret = migrate_page(mapping, newpage, page);
+	ret = migrate_page(mapping, newpage, page, 0);
 	if (!req)
 		goto out;
 	if (ret)