Hi Andrew, Mel and all,

How about this V3 patch, any comments? (A rough caller sketch is appended
below the quoted patch, in case it helps review.)

thanks,
linfeng

On 02/21/2013 07:01 PM, Lin Feng wrote:
> get_user_pages() always tries to allocate pages from the movable zone, which
> is not reliable for the memory hot-remove framework in some cases.
>
> This patch introduces a new library function called get_user_pages_non_movable()
> to pin pages only from the non-movable zones in memory.
> It is a wrapper of get_user_pages(), but it makes sure that all pages come
> from a non-movable zone via additional page migration. If migration fails,
> it will at least keep the base functionality of get_user_pages().
>
> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> Cc: Mel Gorman <mgorman@xxxxxxx>
> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
> Cc: Yasuaki Ishimatsu <isimatu.yasuaki@xxxxxxxxxxxxxx>
> Cc: Jeff Moyer <jmoyer@xxxxxxxxxx>
> Cc: Minchan Kim <minchan@xxxxxxxxxx>
> Cc: Zach Brown <zab@xxxxxxxxxx>
> Reviewed-by: Tang Chen <tangchen@xxxxxxxxxxxxxx>
> Reviewed-by: Gu Zheng <guz.fnst@xxxxxxxxxxxxxx>
> Signed-off-by: Lin Feng <linfeng@xxxxxxxxxxxxxx>
> ---
>  include/linux/mm.h     |  14 ++++++
>  include/linux/mmzone.h |   4 ++
>  mm/memory.c            | 103 +++++++++++++++++++++++++++++++++++++++++++++++
>  mm/page_isolation.c    |   8 ++++
>  4 files changed, 129 insertions(+), 0 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 5625c1c..737dc39 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1025,6 +1025,20 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>  			struct vm_area_struct **vmas);
>  int get_user_pages_fast(unsigned long start, int nr_pages, int write,
>  			struct page **pages);
> +#ifdef CONFIG_MEMORY_HOTREMOVE
> +int get_user_pages_non_movable(struct task_struct *tsk, struct mm_struct *mm,
> +		unsigned long start, int nr_pages, int write, int force,
> +		struct page **pages, struct vm_area_struct **vmas);
> +#else
> +static inline
> +int get_user_pages_non_movable(struct task_struct *tsk, struct mm_struct *mm,
> +		unsigned long start, int nr_pages, int write, int force,
> +		struct page **pages, struct vm_area_struct **vmas)
> +{
> +	return get_user_pages(tsk, mm, start, nr_pages, write, force, pages,
> +				vmas);
> +}
> +#endif
>  struct kvec;
>  int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
>  			struct page **pages);
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index ab20a60..c31007e 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -851,6 +851,10 @@ static inline int is_normal_idx(enum zone_type idx)
>  	return (idx == ZONE_NORMAL);
>  }
>
> +static inline int zone_is_movable(struct zone *zone)
> +{
> +	return zone_idx(zone) == ZONE_MOVABLE;
> +}
>  /**
>   * is_highmem - helper function to quickly check if a struct zone is a
>   * highmem zone or not.  This is an attempt to keep references
> diff --git a/mm/memory.c b/mm/memory.c
> index 16ca5d0..83db7dd 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -58,6 +58,8 @@
>  #include <linux/elf.h>
>  #include <linux/gfp.h>
>  #include <linux/migrate.h>
> +#include <linux/page-isolation.h>
> +#include <linux/mm_inline.h>
>  #include <linux/string.h>
>
>  #include <asm/io.h>
> @@ -2014,6 +2016,107 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>  }
>  EXPORT_SYMBOL(get_user_pages);
>
> +#ifdef CONFIG_MEMORY_HOTREMOVE
> +/**
> + * It is a wrapper of get_user_pages(), but it makes sure that all pages come
> + * from a non-movable zone via additional page migration.  It is designed for
> + * the memory hot-remove framework.
> + *
> + * Currently get_user_pages() always tries to allocate pages from the movable
> + * zone, and in some cases its callers pin user pages for a long time (for now
> + * we found that pages pinned as aio ring pages are such a case), which is
> + * fatal for the memory hot-remove framework.
> + *
> + * This function first calls get_user_pages() to get the candidate pages, and
> + * then checks that all pages are from a non-movable zone.  Otherwise it
> + * migrates them to a non-movable zone and retries, at most once.  If the
> + * migration fails, it keeps the base functionality of get_user_pages()
> + * and issues a WARN message for the memory hot-remove people.
> + *
> + * Fixme: we don't support the non-movable version of GUP for hugepages yet.
> + */
> +int get_user_pages_non_movable(struct task_struct *tsk, struct mm_struct *mm,
> +		unsigned long start, int nr_pages, int write, int force,
> +		struct page **pages, struct vm_area_struct **vmas)
> +{
> +	int ret, i, tried = 0;
> +	bool isolate_err, migrate_prepped;
> +	LIST_HEAD(pagelist);
> +
> +retry:
> +	BUG_ON(tried == 2);
> +	ret = get_user_pages(tsk, mm, start, nr_pages, write, force, pages,
> +				vmas);
> +	/* No ZONE_MOVABLE populated, all pages are from non-movable zones */
> +	if (movable_zone == ZONE_MOVABLE || ret <= 0)
> +		return ret;
> +
> +	isolate_err = false;
> +	migrate_prepped = false;
> +
> +	for (i = 0; i < ret; i++) {
> +		if (zone_is_movable(page_zone(pages[i]))) {
> +			/* Fixme: improve for hugepage non-movable support */
> +			if (PageHuge(pages[i])) {
> +				WARN_ONCE(1, "Non-movable GUP for hugepages "
> +					"hasn't been implemented yet, it may "
> +					"lead to memory hot-remove failure.\n");
> +				continue;
> +			}
> +
> +			/* A hugepage's or THP's head page covers its tail pages */
> +			if (PageTail(pages[i]) && (page_count(pages[i]) == 1))
> +				continue;
> +
> +			if (!migrate_prepped) {
> +				BUG_ON(migrate_prep());
> +				migrate_prepped = true;
> +			}
> +
> +			/* Fixme: isolate_lru_page() takes the LRU lock every
> +			 * time; batching the lock could avoid potential lock
> +			 * contention problems. -Mel Gorman
> +			 */
> +			if (!isolate_lru_page(pages[i])) {
> +				inc_zone_page_state(pages[i], NR_ISOLATED_ANON +
> +						page_is_file_cache(pages[i]));
> +				list_add(&pages[i]->lru, &pagelist);
> +			} else {
> +				isolate_err = true;
> +				break;
> +			}
> +		}
> +	}
> +
> +	/* All pages are non-movable, we are done :) */
> +	if (i == ret && list_empty(&pagelist))
> +		return ret;
> +
> +	/* Undo the effects of the former get_user_pages(), ready for another try */
> +	release_pages(pages, ret, 1);
> +
> +	if (!isolate_err) {
> +		ret = migrate_pages(&pagelist, alloc_migrate_target, 1,
> +					MIGRATE_SYNC, MR_SYSCALL);
> +		/* Did we migrate all pages into a non-movable zone? */
> +		if (!ret) {
> +			tried++;
> +			goto retry;
> +		}
> +	}
> +
> +	putback_lru_pages(&pagelist);
> +	/* Migration failed; in order to keep at least the base functionality
> +	 * of get_user_pages(), we pin the pages again but issue a WARN to
> +	 * remind the memory hot-remove people, which is a trade-off.
> +	 */
> +	WARN_ONCE(1, "Non-movable zone migration failed, "
> +		"it may lead to memory hot-remove failure.\n");
> +	return get_user_pages(tsk, mm, start, nr_pages, write, force, pages,
> +				vmas);
> +}
> +EXPORT_SYMBOL(get_user_pages_non_movable);
> +#endif
>  /**
>   * get_dump_page() - pin user page in memory while writing it to core dump
>   * @addr: user address
> diff --git a/mm/page_isolation.c b/mm/page_isolation.c
> index 383bdbb..7823ea5 100644
> --- a/mm/page_isolation.c
> +++ b/mm/page_isolation.c
> @@ -247,6 +247,9 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
>  	return ret ? 0 : -EBUSY;
>  }
>
> +/**
> + * @private: 0 means the page can be allocated from the movable zone, otherwise forbidden
> + */
>  struct page *alloc_migrate_target(struct page *page, unsigned long private,
>  				int **resultp)
>  {
> @@ -254,6 +257,11 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private,
>
>  	if (PageHighMem(page))
>  		gfp_mask |= __GFP_HIGHMEM;
> +#if defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_HIGHMEM)
> +	BUILD_BUG_ON(1);
> +#endif
> +	if (unlikely(private != 0))
> +		gfp_mask &= ~__GFP_HIGHMEM;
>
>  	return alloc_page(gfp_mask);
>  }
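
P.S. As mentioned above, here is a rough sketch of how a long-lived pinning
caller would switch over to the new helper. This is only an illustration and
not part of the patch: the pin_user_buffer() helper, its locking and its error
handling are made up for this mail and are not taken from any existing caller
(the aio ring setup would be the real first user).

#include <linux/mm.h>
#include <linux/sched.h>

/*
 * Illustration only: pin a user buffer that stays pinned for a long time,
 * e.g. a ring that lives as long as its context.  The function name is
 * invented for this sketch.
 */
static int pin_user_buffer(unsigned long uaddr, int nr_pages,
			   struct page **pages)
{
	struct mm_struct *mm = current->mm;
	int pinned, i;

	down_read(&mm->mmap_sem);
	/*
	 * Long-lived pins should stay out of ZONE_MOVABLE so that memory
	 * hot-remove is not blocked.  If migration out of the movable zone
	 * fails, get_user_pages_non_movable() falls back to the plain
	 * get_user_pages() behaviour (plus a WARN), so the caller still works.
	 */
	pinned = get_user_pages_non_movable(current, mm, uaddr, nr_pages,
					1 /* write */, 0 /* force */,
					pages, NULL);
	up_read(&mm->mmap_sem);

	if (pinned < nr_pages) {
		/* Drop any partial pin and report failure. */
		for (i = 0; i < pinned; i++)
			put_page(pages[i]);
		return pinned < 0 ? pinned : -EFAULT;
	}
	return 0;
}

Short-lived pins (direct I/O and friends) can keep using get_user_pages()
unchanged; only callers that hold the pages across an unbounded window, like
the aio ring pages mentioned in the changelog, need the _non_movable variant.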