[PATCH/RFC 1/5] numa - migration cache - core implementation

Migration Cache  1/4 - Core Implementation

At the Linux Plumbers Conference, Andi Kleen again encouraged me
to resubmit my automatic page migration patches because he thinks
they will be useful for virtualization.  Later, in the Virtualization
mini-conf, the subject came up during a presentation about adding
NUMA awareness to qemu/kvm.  After the presentation, I discussed
these series with Andrea Arcangeli, and he also encouraged me to
post them.  My position within HP has changed such that I'm not
sure how much time I'll be able to spend on this area, nor whether
I'll have access to the larger NUMA platforms needed to test the
patches thoroughly.  However, here is the last of the 4 series that
comprise my shared policy and lazy/auto-migration enhancements.

I have rebased the patches against a recent mmotm tree.  The
rebased kernel built cleanly, booted, and passed a few ad hoc
tests on x86_64.  I've also made a pass over the patch
descriptions to update them.  If there is sufficient interest in
merging this, I'll do what I can to assist in completing and
testing the series.

Based atop the previously posted:

1) Shared policy cleanup, fixes, mapped file policy
2) Migrate-on-fault a.k.a. Lazy Page Migration facility
3) Auto [as in "self"] migration facility

I'll announce this series and the automatic/lazy migration series
to follow on lkml, linux-mm, ...  However, I'll limit the actual
posting to linux-numa to avoid spamming the other lists.
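
For reviewers who haven't followed the earlier series:  the migration
cache claims one more pseudo swap type, SWP_MIGRATION_CACHE, alongside
SWP_MIGRATION_READ/WRITE and SWP_HWPOISON, so that an unmapped anon
page can be parked in a separate address space keyed by a small id
instead of a real swap slot.  The sketch below is only an illustrative
userspace model of that idea -- the constants and the type/offset
packing are simplified stand-ins, not the kernel's arch-specific pte
encoding -- but it shows how a migration cache entry is told apart
from an ordinary swap entry, which is all the new is_migration_cache()
helper does.

/*
 * Illustrative userspace model only:  simplified packing and stand-in
 * constants, not the kernel's arch-specific swp_entry_t/pte encoding.
 */
#include <stdio.h>

#define TYPE_SHIFT		27	/* stand-in for the 5/27 split */
#define OFFSET_MASK		((1UL << TYPE_SHIFT) - 1)
#define SWP_MIGRATION_CACHE	30UL	/* stand-in pseudo swap "type" */

typedef struct { unsigned long val; } swp_entry_t;

static swp_entry_t swp_entry(unsigned long type, unsigned long offset)
{
	swp_entry_t e = { .val = (type << TYPE_SHIFT) | (offset & OFFSET_MASK) };
	return e;
}

static unsigned long swp_type(swp_entry_t e)	{ return e.val >> TYPE_SHIFT; }
static unsigned long swp_offset(swp_entry_t e)	{ return e.val & OFFSET_MASK; }

/* mirrors the new is_migration_cache() test in swapops.h */
static int is_migration_cache(swp_entry_t e)
{
	return swp_type(e) == SWP_MIGRATION_CACHE;
}

int main(void)
{
	swp_entry_t disk = swp_entry(0, 1234);			/* device 0, slot 1234 */
	swp_entry_t mig = swp_entry(SWP_MIGRATION_CACHE, 42);	/* migcache id 42 */

	printf("disk: type=%lu offset=%lu migcache=%d\n",
	       swp_type(disk), swp_offset(disk), is_migration_cache(disk));
	printf("mig:  type=%lu offset=%lu migcache=%d\n",
	       swp_type(mig), swp_offset(mig), is_migration_cache(mig));
	return 0;
}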

 include/linux/swap.h    |   63 ++++++++--
 include/linux/swapops.h |   23 +++
 mm/Kconfig              |    9 +
 mm/swap_state.c         |  285 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 368 insertions(+), 12 deletions(-)
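
Since the swap_state.c diff below is the bulk of the patch, it may
help to keep the intended reference-count lifecycle in mind while
reading it.  The toy model below is only my reading of the comments
in the patch (names are borrowed; the locking, the idr and the radix
tree are all omitted):  the counter is created at 1 when the page is
added to the cache during unmap, bumped once per additional pte
reference (fork, or the extra ref held over unmap), and the entry is
recycled as soon as the last reference is dropped.

/*
 * Toy, single-threaded model of the migration cache reference
 * counting.  Locking, the idr and the radix tree are omitted;
 * only the counter lifecycle is shown.
 */
#include <stdio.h>
#include <stdlib.h>

struct counter {
	int i;			/* stands in for the kernel's atomic_t */
};

/* add_to_migration_cache():  counter starts at 1 (initial/temp ref) */
static struct counter *model_add_to_cache(void)
{
	struct counter *cnt = malloc(sizeof(*cnt));

	if (!cnt)
		exit(1);
	cnt->i = 1;
	return cnt;
}

/* migration_duplicate():  one more pte now references the entry */
static void model_duplicate(struct counter *cnt)
{
	cnt->i++;
}

/* __migration_remove_reference():  free the entry when the count hits 0 */
static int model_remove_reference(struct counter **cntp)
{
	if (--(*cntp)->i == 0) {
		free(*cntp);
		*cntp = NULL;
		return 1;			/* entry recycled */
	}
	return 0;
}

int main(void)
{
	struct counter *cnt = model_add_to_cache();	/* unmap:  count = 1 */

	model_duplicate(cnt);			/* second pte, e.g. after fork */
	model_remove_reference(&cnt);		/* one pte faults the page back */
	if (model_remove_reference(&cnt))	/* last reference dropped */
		printf("entry freed on last pte reference\n");
	return 0;
}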

Index: linux-2.6.36-mmotm-101103-1217/mm/swap_state.c
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/mm/swap_state.c
+++ linux-2.6.36-mmotm-101103-1217/mm/swap_state.c
@@ -23,6 +23,291 @@
 
 #include <asm/pgtable.h>
 
+#include <linux/idr.h>
+
+#ifdef CONFIG_MIGRATION_CACHE
+/*
+ * Migration Cache:  a pseudo-swap device and separate address space
+ * for anon pages that have been unmapped for "lazy" migration.
+ */
+
+/*
+ * struct counter:  pointers to these are maintained in the
+ * migration_idr space below.
+ * The counter maintains the number of ptes referencing the
+ * migration cache entry ["offset"] specified by the id.
+ */
+struct counter {
+	atomic_t i;	/* references to this migration cache entry */
+};
+
+/*
+ * Migration Cache id space.  Protected by migration_space.tree_lock.
+ * The ids from this space represent the migration cache tags
+ * for radix tree lookups.  Analogous to swap cache offsets.
+ */
+struct idr migration_idr;
+
+static const struct address_space_operations migration_aops = {
+	.writepage	= NULL,
+	.set_page_dirty	= __set_page_dirty_nobuffers,
+	.migratepage	= migrate_page,
+};
+
+static struct backing_dev_info migration_backing_dev_info = {
+	.capabilities   = BDI_CAP_NO_ACCT_DIRTY|BDI_CAP_NO_WRITEBACK,
+};
+
+struct address_space migration_space = {
+	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
+	.tree_lock	= __SPIN_LOCK_UNLOCKED(migration_space.tree_lock),
+	.a_ops		= &migration_aops,
+	.flags		= GFP_HIGHUSER,
+	.i_mmap_nonlinear = LIST_HEAD_INIT(migration_space.i_mmap_nonlinear),
+	.backing_dev_info = &migration_backing_dev_info,
+};
+
+int __init init_migration_cache(void)
+{
+	idr_init(&migration_idr);
+	return 0;
+}
+
+__initcall(init_migration_cache);
+
+/*
+ * is page in migration cache?
+ */
+int page_in_migration_cache(struct page *page)
+{
+	swp_entry_t entry;
+
+	if (!PageSwapCache(page))
+		return 0;
+
+	entry.val = page_private(page);
+	if (swp_type(entry) != SWP_MIGRATION_CACHE)
+		return 0;
+
+	return 1;
+}
+
+struct page *lookup_migration_cache(swp_entry_t entry)
+{
+	/*
+	 * Don't worry about "bogus" migration cache entries here.
+	 * Just return NULL.  swapin_readahead() will ignore migration
+	 * cache entries and read_swap_cache_async() will return NULL
+	 * when it sees one.  If do_swap_page() finds that the pte
+	 * has changed, we probably have a race between 2 tasks sharing
+ * the mm_struct & page table, and the 2nd task will just bail out.
+ * If there is no race [the ptes are the same], the task will receive OOM.
+	 */
+	return find_get_page(&migration_space, entry.val);
+}
+
+/*
+ * migration_duplicate(swp_entry_t entry)
+ * @entry -- a migration cache entry
+ *
+ * Increment the reference count on migration cache @entry when a pte
+ * reference is added:  from try_to_unmap() or when copying ptes on
+ * address space duplication.  Like swap_duplicate().
+ *
+ * Return 0 on success; -ENOENT if @entry is not found in the cache.
+ */
+int migration_duplicate(swp_entry_t entry)
+{
+	struct counter *cnt;
+	int err = 0;
+
+	spin_lock_irq(&migration_space.tree_lock);
+	cnt = idr_find(&migration_idr, (int)swp_offset(entry));
+	if (cnt)
+		atomic_inc(&cnt->i);
+	else
+		err = -ENOENT;
+	spin_unlock_irq(&migration_space.tree_lock);
+	return err;
+}
+
+/*
+ * migration_add_reference_page(page)
+ * Like {swap|migration}_duplicate(), but takes a page.
+ * No-op for non-migration-cache entries.
+ * Used to add an extra ref to a migration cache entry over unmap for lazy
+ * migration so that do_swap_page(), for example, can't rip the entry
+ * out from under us should the reference count go to zero.  Required
+ * because the migration cache does not hold its own ref on the entry;
+ * the entry is recycled as soon as the last pte ref is removed.
+ */
+int migration_add_reference_page(struct page *page)
+{
+	swp_entry_t entry;
+	int ret = 0;
+	entry.val = page_private(page);
+
+	if (PageSwapCache(page) && is_migration_cache(entry))
+		ret = migration_duplicate(entry);
+	return ret;
+}
+
+/*
+ * migration_ref_count(entry):
+ * Number of ptes referencing a migration cache entry/page.
+ * Unlike the swap cache, the migration cache does not include
+ * a count for the cache itself.  The existence of the
+ * counter serves as the cache's reference.
+ */
+int migration_ref_count(swp_entry_t entry)
+{
+	struct counter *cnt;
+	int ref = -1;
+
+	spin_lock_irq(&migration_space.tree_lock);
+	cnt = idr_find(&migration_idr, (int)swp_offset(entry));
+	if (cnt)
+		ref = atomic_read(&cnt->i);
+	spin_unlock_irq(&migration_space.tree_lock);
+	if (ref < 0) {
+		WARN_ONCE(true,
+			"%s: entry 0x%lx ref count < 0: %d\n",
+			__func__, entry.val, ref);
+		/*
+		 * FIXME:  this shouldn't happen, but until I can
+		 * find/fix the race, return > 1 to prevent the
+		 * apparently bogus entry from being reused/freed.
+		 * Will likely result in leaked pages/migcache-entries.
+		 */
+		ref = 2;
+	}
+	return ref;
+}
+
+/*
+ * Unconditionally remove page from migration cache at 'id'
+ * Migration cache radix_tree must be write locked.
+ */
+static void __remove_from_migration_cache(struct page *page, unsigned long id)
+{
+	if (page) {
+		ClearPageSwapCache(page);
+		set_page_private(page, 0);
+	}
+
+	radix_tree_delete(&migration_space.page_tree, id);
+	idr_remove(&migration_idr, (int)id);
+}
+
+/*
+ * Decrement the reference count on a migration cache entry;
+ * free the entry if the count goes to zero.
+ * If @page is not NULL, also release the cache's ref on the page.
+ *
+ * We could hold the tree lock only long enough to look up the id
+ * and then modify/query the counter atomically.  However, we'd
+ * need the lock again to remove the entry from the cache, and most
+ * reference removals do result in removal from the cache, so just
+ * hold the lock across the whole operation.
+ */
+void __migration_remove_reference(struct page *page, swp_entry_t entry)
+{
+	struct counter *cnt;
+	int offset = (int)swp_offset(entry);
+
+	spin_lock_irq(&migration_space.tree_lock);
+	cnt = idr_find(&migration_idr, offset);
+	if (!cnt) {
+		spin_unlock_irq(&migration_space.tree_lock);
+		WARN_ONCE(1, "No migration cache 'id' for entry %d\n", offset);
+		return;
+	}
+
+	VM_BUG_ON(!atomic_read(&cnt->i));
+
+	if (atomic_dec_and_test(&cnt->i)) {
+		__remove_from_migration_cache(page, entry.val);
+		spin_unlock_irq(&migration_space.tree_lock);
+		kfree(cnt);
+		if (page)
+			page_cache_release(page); /* cache's ref */
+	} else
+		spin_unlock_irq(&migration_space.tree_lock);
+}
+
+/*
+ * Remove a reference on a migration cache entry, given a page
+ * in the migration cache.  No-op for other swap cache entries.
+ * Removes the extra ref held over unmap for lazy migration.
+ */
+void migration_remove_reference_page(struct page *page)
+{
+	swp_entry_t entry;
+	entry.val = page_private(page);
+
+	if (PageSwapCache(page) && is_migration_cache(entry))
+		__migration_remove_reference(page, entry);
+}
+
+/*
+ * Remove a reference on a migration cache entry, given the entry itself.
+ */
+void migration_remove_reference_entry(swp_entry_t entry)
+{
+	struct page *page;
+
+	page = find_get_page(&migration_space, entry.val);
+	if (!page)
+		BUG();
+
+	__migration_remove_reference(page, entry);
+	page_cache_release(page);	/* find_get ref */
+}
+
+int add_to_migration_cache(struct page *page, gfp_t gfp_mask)
+{
+	int error, offset;
+	struct counter *counter;
+	swp_entry_t entry;
+
+	VM_BUG_ON(!PageLocked(page));
+	VM_BUG_ON(PageSwapCache(page));
+	VM_BUG_ON(PagePrivate(page));
+
+	if (idr_pre_get(&migration_idr, GFP_ATOMIC) == 0)
+		return -ENOMEM;
+
+	counter = kmalloc(sizeof(struct counter), GFP_KERNEL);
+	if (!counter)
+		return -ENOMEM;
+	atomic_set(&counter->i, 1);		/* initial/temp cache ref */
+
+	error = radix_tree_preload(gfp_mask);
+	if (error) {
+		kfree(counter);	/* don't leak the counter on preload failure */
+		return error;
+	}
+	spin_lock_irq(&migration_space.tree_lock);
+	error = idr_get_new_above(&migration_idr, counter, 1, &offset);
+	BUG_ON(error < 0);	/* idr_pre_get() should prevent this */
+	entry = swp_entry(SWP_MIGRATION_CACHE, offset);
+	error = radix_tree_insert(&migration_space.page_tree, entry.val, page);
+	if (!error) {
+		page_cache_get(page);
+		set_page_private(page, entry.val);
+		SetPageSwapCache(page);
+		SetPageUptodate(page);		/* like add_to_swap() */
+	} else {
+		idr_remove(&migration_idr, offset);	/* don't leak the id */
+		kfree(counter);
+	}
+	spin_unlock_irq(&migration_space.tree_lock);
+	radix_tree_preload_end();
+	return error;
+}
+#endif /* CONFIG_MIGRATION_CACHE */
+
+
 /*
  * swapper_space is a fiction, retained to simplify the path through
  * vmscan's shrink_page_list, to make sync_page look nicer, and to allow
Index: linux-2.6.36-mmotm-101103-1217/include/linux/swapops.h
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/include/linux/swapops.h
+++ linux-2.6.36-mmotm-101103-1217/include/linux/swapops.h
@@ -113,6 +113,23 @@ static inline void make_migration_entry_
 
 extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 					unsigned long address);
+#ifdef CONFIG_MIGRATION_CACHE
+static inline int is_migration_cache(swp_entry_t entry)
+{
+	return unlikely(swp_type(entry) == SWP_MIGRATION_CACHE);
+}
+
+static inline int is_migcache_pte(pte_t pte)
+{
+	unsigned long type;
+	swp_entry_t arch_entry;
+
+	arch_entry = __pte_to_swp_entry(pte);
+	type = __swp_type(arch_entry);
+
+	return type == SWP_MIGRATION_CACHE;
+}
+#endif /* CONFIG_MIGRATION_CACHE */
 #else
 
 #define make_migration_entry(page, write) swp_entry(0, 0)
@@ -131,6 +148,12 @@ static inline int is_write_migration_ent
 
 #endif
 
+#ifndef CONFIG_MIGRATION_CACHE
+
+#define is_migration_cache(entry) (0)
+#define is_migcache_pte(pte)  (0)
+
+#endif /* !CONFIG_MIGRATION_CACHE */
 #ifdef CONFIG_MEMORY_FAILURE
 /*
  * Support for hardware poisoned pages
Index: linux-2.6.36-mmotm-101103-1217/include/linux/swap.h
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/include/linux/swap.h
+++ linux-2.6.36-mmotm-101103-1217/include/linux/swap.h
@@ -31,8 +31,8 @@ static inline int current_is_kswapd(void
  * be swapped to.  The swap type and the offset into that swap type are
  * encoded into pte's and into pgoff_t's in the swapcache.  Using five bits
  * for the type means that the maximum number of swapcache pages is 27 bits
- * on 32-bit-pgoff_t architectures.  And that assumes that the architecture packs
- * the type/offset into the pte as 5/27 as well.
+ * on 32-bit-pgoff_t architectures.  And that assumes that the architecture
+ * packs the type/offset into the pte as 5/27 as well.
  */
 #define MAX_SWAPFILES_SHIFT	5
 
@@ -43,25 +43,35 @@ static inline int current_is_kswapd(void
  */
 
 /*
+ * Handling of hardware poisoned pages with memory corruption.
+ */
+#ifdef CONFIG_MEMORY_FAILURE
+#define SWP_HWPOISON_NUM 1
+#define SWP_HWPOISON		MAX_SWAPFILES
+#else
+#define SWP_HWPOISON_NUM 0
+#endif
+
+/*
  * NUMA node memory migration support
  */
 #ifdef CONFIG_MIGRATION
-#define SWP_MIGRATION_NUM 2
 #define SWP_MIGRATION_READ	(MAX_SWAPFILES + SWP_HWPOISON_NUM)
 #define SWP_MIGRATION_WRITE	(MAX_SWAPFILES + SWP_HWPOISON_NUM + 1)
-#else
-#define SWP_MIGRATION_NUM 0
-#endif
 
+#ifndef CONFIG_MIGRATION_CACHE
 /*
- * Handling of hardware poisoned pages with memory corruption.
+ * Use last two entries for page migration swap entries
  */
-#ifdef CONFIG_MEMORY_FAILURE
-#define SWP_HWPOISON_NUM 1
-#define SWP_HWPOISON		MAX_SWAPFILES
+#define SWP_MIGRATION_NUM 2
 #else
-#define SWP_HWPOISON_NUM 0
-#endif
+#define SWP_MIGRATION_CACHE	(MAX_SWAPFILES + SWP_HWPOISON_NUM + 2)
+#define SWP_MIGRATION_NUM 3
+#endif	/* CONFIG_MIGRATION_CACHE */
+
+#else	/* !CONFIG_MIGRATION */
+#define SWP_MIGRATION_NUM 0
+#endif	/* !CONFIG_MIGRATION */
 
 #define MAX_SWAPFILES \
 	((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
@@ -382,6 +392,20 @@ static inline void mem_cgroup_uncharge_s
 }
 #endif
 
+#ifdef CONFIG_MIGRATION_CACHE
+/*
+ * The Migration Cache:  a pseudo-swap cache for anon pages
+ */
+extern int page_in_migration_cache(struct page *);
+extern void migration_remove_reference_page(struct page *);
+extern void migration_remove_reference_entry(swp_entry_t);
+extern int  migration_duplicate(swp_entry_t);
+extern int  migration_add_reference_page(struct page *);
+extern int migration_ref_count(swp_entry_t);
+extern void __migration_remove_reference(struct page *, swp_entry_t);
+extern struct page *lookup_migration_cache(swp_entry_t);
+#endif	/* CONFIG_MIGRATION_CACHE */
+
 #else /* CONFIG_SWAP */
 
 #define nr_swap_pages				0L
@@ -507,5 +531,20 @@ mem_cgroup_count_swap_user(swp_entry_t e
 #endif
 
 #endif /* CONFIG_SWAP */
+
+#ifndef CONFIG_MIGRATION_CACHE
+static inline void migration_remove_reference_page(struct page *p) { }
+static inline void migration_remove_reference_entry(swp_entry_t e) { }
+static inline int  migration_duplicate(swp_entry_t e) { return 0; }
+static inline int  migration_add_reference_page(struct page *p) { return 0; }
+static inline int  migration_ref_count(swp_entry_t e) { return 0; }
+static inline void __migration_remove_reference(struct page *page,
+	 swp_entry_t entry) { }
+
+static inline struct page *lookup_migration_cache(swp_entry_t entry)
+{
+	return NULL;
+}
+#endif	/* !CONFIG_MIGRATION_CACHE */
 #endif /* __KERNEL__*/
 #endif /* _LINUX_SWAP_H */
Index: linux-2.6.36-mmotm-101103-1217/mm/Kconfig
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/mm/Kconfig
+++ linux-2.6.36-mmotm-101103-1217/mm/Kconfig
@@ -219,6 +219,15 @@ config AUTO_MIGRATION
 	  Allows tasks' private memory to follow that task itself across
 	  inter-node migrations.
 
+config MIGRATION_CACHE
+	bool "Migration cache"
+	depends on MIGRATE_ON_FAULT
+	default y
+	help
+	  A pseudo-swap device for migrating anon pages without the use of
+	  configured swap devices.  The kernel swap infrastructure is still
+	  required.
+
 config PHYS_ADDR_T_64BIT
 	def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
 