On Tue, Feb 05, 2013 at 05:21:52PM +0800, Lin Feng wrote:
>get_user_pages() always tries to allocate pages from movable zone, which is not
> reliable to memory hotremove framework in some case.
>
>This patch introduces a new library function called get_user_pages_non_movable()
> to pin pages only from zone non-movable in memory.
>It's a wrapper of get_user_pages() but it makes sure that all pages come from
>non-movable zone via additional page migration.
>
>Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
>Cc: Mel Gorman <mgorman@xxxxxxx>
>Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
>Cc: Yasuaki Ishimatsu <isimatu.yasuaki@xxxxxxxxxxxxxx>
>Cc: Jeff Moyer <jmoyer@xxxxxxxxxx>
>Cc: Minchan Kim <minchan@xxxxxxxxxx>
>Cc: Zach Brown <zab@xxxxxxxxxx>
>Reviewed-by: Tang Chen <tangchen@xxxxxxxxxxxxxx>
>Reviewed-by: Gu Zheng <guz.fnst@xxxxxxxxxxxxxx>
>Signed-off-by: Lin Feng <linfeng@xxxxxxxxxxxxxx>
>---
> include/linux/mm.h     |    3 ++
> include/linux/mmzone.h |    4 ++
> mm/memory.c            |   83 ++++++++++++++++++++++++++++++++++++++++++++++++
> mm/page_isolation.c    |    5 +++
> 4 files changed, 95 insertions(+), 0 deletions(-)
>
>diff --git a/include/linux/mm.h b/include/linux/mm.h
>index 12f5a09..3ff9eba 100644
>--- a/include/linux/mm.h
>+++ b/include/linux/mm.h
>@@ -1049,6 +1049,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
> 			struct page **pages, struct vm_area_struct **vmas);
> int get_user_pages_fast(unsigned long start, int nr_pages, int write,
> 			struct page **pages);
>+int get_user_pages_non_movable(struct task_struct *tsk, struct mm_struct *mm,
>+			unsigned long start, int nr_pages, int write, int force,
>+			struct page **pages, struct vm_area_struct **vmas);
> struct kvec;
> int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
> 			struct page **pages);
>diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>index e25ab6f..1506351 100644
>--- a/include/linux/mmzone.h
>+++ b/include/linux/mmzone.h
>@@ -841,6 +841,10 @@ static inline int is_normal_idx(enum zone_type idx)
> 	return (idx == ZONE_NORMAL);
> }
> 
>+static inline int zone_is_movable(struct zone *zone)
>+{
>+	return zone_idx(zone) == ZONE_MOVABLE;
>+}
> /**
>  * is_highmem - helper function to quickly check if a struct zone is a
>  * highmem zone or not. This is an attempt to keep references
>diff --git a/mm/memory.c b/mm/memory.c
>index bb1369f..ede53cc 100644
>--- a/mm/memory.c
>+++ b/mm/memory.c
>@@ -58,6 +58,8 @@
> #include <linux/elf.h>
> #include <linux/gfp.h>
> #include <linux/migrate.h>
>+#include <linux/page-isolation.h>
>+#include <linux/mm_inline.h>
> #include <linux/string.h>
> 
> #include <asm/io.h>
>@@ -1995,6 +1997,87 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
> }
> EXPORT_SYMBOL(get_user_pages);
> 
>+#ifdef CONFIG_MEMORY_HOTREMOVE
>+/**
>+ * It's a wrapper of get_user_pages() but it makes sure that all pages come from
>+ * non-movable zone via additional page migration. It's designed for memory
>+ * hotremove framework.
>+ *
>+ * Currently get_user_pages() always tries to allocate pages from movable zone,
>+ * in some case users of get_user_pages() is easy to pin user pages for a long
>+ * time(for now we found that pages pinned as aio ring pages is such case),
>+ * which is fatal for memory hotremove framework.
>+ *
>+ * This function first calls get_user_pages() to get the candidate pages, and
>+ * then check to ensure all pages are from non movable zone. Otherwise migrate

How about "Otherwise migrate candidate pages which have already been isolated
to non movable zone."?

>+ * them to non movable zone, then retry. It will at most retry once.
>+ */
>+int get_user_pages_non_movable(struct task_struct *tsk, struct mm_struct *mm,
>+		unsigned long start, int nr_pages, int write, int force,
>+		struct page **pages, struct vm_area_struct **vmas)
>+{
>+	int ret, i, isolate_err, migrate_pre_flag;
>+	LIST_HEAD(pagelist);
>+
>+retry:
>+	ret = get_user_pages(tsk, mm, start, nr_pages, write, force, pages,
>+			vmas);
>+	if (ret <= 0)
>+		return ret;
>+
>+	isolate_err = 0;
>+	migrate_pre_flag = 0;
>+
>+	for (i = 0; i < ret; i++) {
>+		if (zone_is_movable(page_zone(pages[i]))) {
>+			if (!migrate_pre_flag) {
>+				if (migrate_prep())
>+					goto release_page;
>+				migrate_pre_flag = 1;
>+			}
>+
>+			if (!isolate_lru_page(pages[i])) {
>+				inc_zone_page_state(pages[i], NR_ISOLATED_ANON +
>+						page_is_file_cache(pages[i]));
>+				list_add_tail(&pages[i]->lru, &pagelist);
>+			} else {
>+				isolate_err = 1;
>+				goto release_page;
>+			}
>+		}
>+	}
>+
>+	/* All pages are non movable, we are done :) */
>+	if (i == ret && list_empty(&pagelist))
>+		return ret;
>+
>+release_page:
>+	/* Undo the effects of former get_user_pages(), we won't pin anything */
>+	release_pages(pages, ret, 1);
>+
>+	if (migrate_pre_flag && !isolate_err) {
>+		ret = migrate_pages(&pagelist, alloc_migrate_target, 1,
>+					false, MIGRATE_SYNC, MR_SYSCALL);
>+		/* Steal pages from non-movable zone successfully? */
>+		if (!ret)
>+			goto retry;
>+	}
>+
>+	putback_lru_pages(&pagelist);
>+	/* Migration failed, we pin 0 page, tell caller the truth */
>+	return 0;
>+}
>+#else
>+inline int get_user_pages_non_movable(struct task_struct *tsk, struct mm_struct *mm,
>+		unsigned long start, int nr_pages, int write, int force,
>+		struct page **pages, struct vm_area_struct **vmas)
>+{
>+	return get_user_pages(tsk, mm, start, nr_pages, write, force, pages,
>+			vmas);
>+}
>+#endif
>+EXPORT_SYMBOL(get_user_pages_non_movable);
>+
> /**
>  * get_dump_page() - pin user page in memory while writing it to core dump
>  * @addr: user address
>diff --git a/mm/page_isolation.c b/mm/page_isolation.c
>index 383bdbb..1b7bd17 100644
>--- a/mm/page_isolation.c
>+++ b/mm/page_isolation.c
>@@ -247,6 +247,9 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
> 	return ret ? 0 : -EBUSY;
> }
> 
>+/**
>+ * @private: 0 means page can be alloced from movable zone, otherwise forbidden
>+ */
> struct page *alloc_migrate_target(struct page *page, unsigned long private,
> 				  int **resultp)
> {
>@@ -254,6 +257,8 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private,
> 
> 	if (PageHighMem(page))
> 		gfp_mask |= __GFP_HIGHMEM;
>+	if (unlikely(private != 0))
>+		gfp_mask &= ~__GFP_MOVABLE;
> 
> 	return alloc_page(gfp_mask);
> }
>-- 
>1.7.1
>
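
Also, purely to illustrate the intended call-site change (this is only a
sketch I put together, not part of the patch; MY_NR_PAGES, uaddr and
setup_pinned_ring() are made-up names), a long-lived pinner such as the aio
ring setup would presumably end up doing something like:

	/*
	 * Hypothetical caller: pin a user buffer that may stay pinned for a
	 * long time, so the pages must not be left in ZONE_MOVABLE.
	 */
	static int setup_pinned_ring(unsigned long uaddr, struct page **pages)
	{
		struct mm_struct *mm = current->mm;
		int pinned, i;

		down_read(&mm->mmap_sem);
		pinned = get_user_pages_non_movable(current, mm, uaddr,
						    MY_NR_PAGES, 1, 0,
						    pages, NULL);
		up_read(&mm->mmap_sem);

		if (pinned < 0)
			return pinned;
		if (pinned < MY_NR_PAGES) {
			/*
			 * Per the patch, 0 means migration out of the movable
			 * zone failed and nothing is pinned; a short pin from
			 * the underlying get_user_pages() must be undone here.
			 */
			for (i = 0; i < pinned; i++)
				put_page(pages[i]);
			return -ENOMEM;
		}
		return 0;
	}

The "return 0 when migration fails" convention is what forces the cleanup
above, so it may be worth spelling that out in the kernel-doc as well.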