On Wed, Sep 08, 2010 at 10:19:35AM +0900, Naoya Horiguchi wrote:
> This patch extends page migration code to support hugepage migration.
> One of the potential users of this feature is soft offlining, which
> is triggered by corrected memory errors (added by the next patch).
> 
> Todo:
> - there are other users of page migration such as memory policy,
>   memory hotplug and memory compaction.
>   They are not ready for hugepage support for now.
> 
> ChangeLog since v4:
> - define migrate_huge_pages()
> - remove changes on isolation/putback_lru_page()
> 
> ChangeLog since v2:
> - refactor isolate/putback_lru_page() to handle hugepage
> - add comment about race on unmap_and_move_huge_page()
> 
> ChangeLog since v1:
> - divide migration code path for hugepage
> - define routine checking migration swap entry for hugetlb
> - replace "goto" with "if/else" in remove_migration_pte()
> 
> Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
> Signed-off-by: Jun'ichi Nomura <j-nomura@xxxxxxxxxxxxx>
> ---
>  fs/hugetlbfs/inode.c    |   15 +++
>  include/linux/migrate.h |   16 +++
>  mm/hugetlb.c            |   18 ++++-
>  mm/migrate.c            |  232 +++++++++++++++++++++++++++++++++++++++++++----
>  4 files changed, 262 insertions(+), 19 deletions(-)
> 
> diff --git v2.6.36-rc2/fs/hugetlbfs/inode.c v2.6.36-rc2/fs/hugetlbfs/inode.c
> index 6e5bd42..1f7ca50 100644
> --- v2.6.36-rc2/fs/hugetlbfs/inode.c
> +++ v2.6.36-rc2/fs/hugetlbfs/inode.c
> @@ -31,6 +31,7 @@
>  #include <linux/statfs.h>
>  #include <linux/security.h>
>  #include <linux/magic.h>
> +#include <linux/migrate.h>
>  
>  #include <asm/uaccess.h>
>  
> @@ -573,6 +574,19 @@ static int hugetlbfs_set_page_dirty(struct page *page)
>  	return 0;
>  }
>  
> +static int hugetlbfs_migrate_page(struct address_space *mapping,
> +				struct page *newpage, struct page *page)
> +{
> +	int rc;
> +
> +	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
> +	if (rc)
> +		return rc;
> +	migrate_page_copy(newpage, page);
> +
> +	return 0;
> +}
> +
>  static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
>  {
>  	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
> @@ -659,6 +673,7 @@ static const struct address_space_operations hugetlbfs_aops = {
>  	.write_begin	= hugetlbfs_write_begin,
>  	.write_end	= hugetlbfs_write_end,
>  	.set_page_dirty	= hugetlbfs_set_page_dirty,
> +	.migratepage	= hugetlbfs_migrate_page,
>  };
>  
>  
> diff --git v2.6.36-rc2/include/linux/migrate.h v2.6.36-rc2/include/linux/migrate.h
> index 7238231..3c1941e 100644
> --- v2.6.36-rc2/include/linux/migrate.h
> +++ v2.6.36-rc2/include/linux/migrate.h
> @@ -14,6 +14,8 @@ extern int migrate_page(struct address_space *,
>  			struct page *, struct page *);
>  extern int migrate_pages(struct list_head *l, new_page_t x,
>  			unsigned long private, int offlining);
> +extern int migrate_huge_pages(struct list_head *l, new_page_t x,
> +			unsigned long private, int offlining);
>  
>  extern int fail_migrate_page(struct address_space *,
>  			struct page *, struct page *);
> @@ -23,12 +25,17 @@ extern int migrate_prep_local(void);
>  extern int migrate_vmas(struct mm_struct *mm,
>  		const nodemask_t *from, const nodemask_t *to,
>  		unsigned long flags);
> +extern void migrate_page_copy(struct page *newpage, struct page *page);
> +extern int migrate_huge_page_move_mapping(struct address_space *mapping,
> +		struct page *newpage, struct page *page);
>  #else
>  #define PAGE_MIGRATION 0
>  
>  static inline void putback_lru_pages(struct list_head *l) {}
>  static inline int migrate_pages(struct list_head *l, new_page_t x,
>  		unsigned long private, int offlining) { return -ENOSYS; }
> +static inline int migrate_huge_pages(struct list_head *l, new_page_t x,
> +		unsigned long private, int offlining) { return -ENOSYS; }
>  
>  static inline int migrate_prep(void) { return -ENOSYS; }
>  static inline int migrate_prep_local(void) { return -ENOSYS; }
> @@ -40,6 +47,15 @@ static inline int migrate_vmas(struct mm_struct *mm,
>  	return -ENOSYS;
>  }
>  
> +static inline void migrate_page_copy(struct page *newpage,
> +				struct page *page) {}
> +
> +extern int migrate_huge_page_move_mapping(struct address_space *mapping,
> +		struct page *newpage, struct page *page)
> +{
> +	return -ENOSYS;
> +}
> +
>  /* Possible settings for the migrate_page() method in address_operations */
>  #define migrate_page NULL
>  #define fail_migrate_page NULL
> 
> diff --git v2.6.36-rc2/mm/hugetlb.c v2.6.36-rc2/mm/hugetlb.c
> index 351f8d1..55f3e2d 100644
> --- v2.6.36-rc2/mm/hugetlb.c
> +++ v2.6.36-rc2/mm/hugetlb.c
> @@ -2217,6 +2217,19 @@ nomem:
>  	return -ENOMEM;
>  }
>  
> +static int is_hugetlb_entry_migration(pte_t pte)
> +{
> +	swp_entry_t swp;
> +
> +	if (huge_pte_none(pte) || pte_present(pte))
> +		return 0;
> +	swp = pte_to_swp_entry(pte);
> +	if (non_swap_entry(swp) && is_migration_entry(swp)) {
> +		return 1;
> +	} else
> +		return 0;
> +}
> +
>  static int is_hugetlb_entry_hwpoisoned(pte_t pte)
>  {
>  	swp_entry_t swp;
> @@ -2651,7 +2664,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  	ptep = huge_pte_offset(mm, address);
>  	if (ptep) {
>  		entry = huge_ptep_get(ptep);
> -		if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
> +		if (unlikely(is_hugetlb_entry_migration(entry))) {
> +			migration_entry_wait(mm, (pmd_t *)ptep, address);
> +			return 0;
> +		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
>  			return VM_FAULT_HWPOISON;
>  	}
>  
> diff --git v2.6.36-rc2/mm/migrate.c v2.6.36-rc2/mm/migrate.c
> index 38e7cad..55dbc45 100644
> --- v2.6.36-rc2/mm/migrate.c
> +++ v2.6.36-rc2/mm/migrate.c
> @@ -32,6 +32,7 @@
>  #include <linux/security.h>
>  #include <linux/memcontrol.h>
>  #include <linux/syscalls.h>
> +#include <linux/hugetlb.h>
>  #include <linux/gfp.h>
>  
>  #include "internal.h"
> @@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
>  	pte_t *ptep, pte;
>  	spinlock_t *ptl;
>  
> -	pgd = pgd_offset(mm, addr);
> -	if (!pgd_present(*pgd))
> -		goto out;
> +	if (unlikely(PageHuge(new))) {
> +		ptep = huge_pte_offset(mm, addr);
> +		if (!ptep)
> +			goto out;
> +		ptl = &mm->page_table_lock;
> +	} else {
> +		pgd = pgd_offset(mm, addr);
> +		if (!pgd_present(*pgd))
> +			goto out;
>  
> -	pud = pud_offset(pgd, addr);
> -	if (!pud_present(*pud))
> -		goto out;
> +		pud = pud_offset(pgd, addr);
> +		if (!pud_present(*pud))
> +			goto out;

Why are the changes to the rest of the walkers necessary? Instead, why
did you not identify which PTL lock you needed and then goto the point
where spin_lock(ptl) is called? Similar to what page_check_address()
does, for example.
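Something along these lines is what I had in mind, as an untested
sketch only (the "lock" label is invented here for illustration). It
keeps the normal page table walkers at their original indentation and
lets the hugetlb case jump straight to the locking point:

	if (unlikely(PageHuge(new))) {
		ptep = huge_pte_offset(mm, addr);
		if (!ptep)
			goto out;
		/* hugetlb uses the per-mm lock, not a split pte lock */
		ptl = &mm->page_table_lock;
		goto lock;
	}

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd))
		goto out;

	ptep = pte_offset_map(pmd, addr);
	if (!is_swap_pte(*ptep)) {
		pte_unmap(ptep);
		goto out;
	}

	ptl = pte_lockptr(mm, pmd);
lock:
	spin_lock(ptl);
	...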
Otherwise, I did not spot anything that was obviously wrong.

> -	pmd = pmd_offset(pud, addr);
> -	if (!pmd_present(*pmd))
> -		goto out;
> +		pmd = pmd_offset(pud, addr);
> +		if (!pmd_present(*pmd))
> +			goto out;
>  
> -	ptep = pte_offset_map(pmd, addr);
> +		ptep = pte_offset_map(pmd, addr);
>  
> -	if (!is_swap_pte(*ptep)) {
> -		pte_unmap(ptep);
> -		goto out;
> -	}
> +		if (!is_swap_pte(*ptep)) {
> +			pte_unmap(ptep);
> +			goto out;
> +		}
> +
> +		ptl = pte_lockptr(mm, pmd);
> +	}
>  
> -	ptl = pte_lockptr(mm, pmd);
>  	spin_lock(ptl);
>  	pte = *ptep;
>  	if (!is_swap_pte(pte))
> @@ -130,10 +139,17 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
>  	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
>  	if (is_write_migration_entry(entry))
>  		pte = pte_mkwrite(pte);
> +	if (PageHuge(new))
> +		pte = pte_mkhuge(pte);
>  	flush_cache_page(vma, addr, pte_pfn(pte));
>  	set_pte_at(mm, addr, ptep, pte);
>  
> -	if (PageAnon(new))
> +	if (PageHuge(new)) {
> +		if (PageAnon(new))
> +			hugepage_add_anon_rmap(new, vma, addr);
> +		else
> +			page_dup_rmap(new);
> +	} else if (PageAnon(new))
>  		page_add_anon_rmap(new, vma, addr);
>  	else
>  		page_add_file_rmap(new);
> @@ -276,11 +292,59 @@ static int migrate_page_move_mapping(struct address_space *mapping,
>  }
>  
>  /*
> + * The expected number of remaining references is the same as that
> + * of migrate_page_move_mapping().
> + */
> +int migrate_huge_page_move_mapping(struct address_space *mapping,
> +				struct page *newpage, struct page *page)
> +{
> +	int expected_count;
> +	void **pslot;
> +
> +	if (!mapping) {
> +		if (page_count(page) != 1)
> +			return -EAGAIN;
> +		return 0;
> +	}
> +
> +	spin_lock_irq(&mapping->tree_lock);
> +
> +	pslot = radix_tree_lookup_slot(&mapping->page_tree,
> +					page_index(page));
> +
> +	expected_count = 2 + page_has_private(page);
> +	if (page_count(page) != expected_count ||
> +	    (struct page *)radix_tree_deref_slot(pslot) != page) {
> +		spin_unlock_irq(&mapping->tree_lock);
> +		return -EAGAIN;
> +	}
> +
> +	if (!page_freeze_refs(page, expected_count)) {
> +		spin_unlock_irq(&mapping->tree_lock);
> +		return -EAGAIN;
> +	}
> +
> +	get_page(newpage);
> +
> +	radix_tree_replace_slot(pslot, newpage);
> +
> +	page_unfreeze_refs(page, expected_count);
> +
> +	__put_page(page);
> +
> +	spin_unlock_irq(&mapping->tree_lock);
> +	return 0;
> +}
> +
> +/*
>   * Copy the page to its new location
>   */
> -static void migrate_page_copy(struct page *newpage, struct page *page)
> +void migrate_page_copy(struct page *newpage, struct page *page)
>  {
> -	copy_highpage(newpage, page);
> +	if (PageHuge(page))
> +		copy_huge_page(newpage, page);
> +	else
> +		copy_highpage(newpage, page);
>  
>  	if (PageError(page))
>  		SetPageError(newpage);
> @@ -724,6 +788,92 @@ move_newpage:
>  }
>  
>  /*
> + * Counterpart of unmap_and_move_page() for hugepage migration.
> + *
> + * This function doesn't wait for the completion of hugepage I/O
> + * because there is no race between I/O and migration for hugepage.
> + * Note that currently hugepage I/O occurs only in direct I/O
> + * where no lock is held and PG_writeback is irrelevant,
> + * and the writeback status of all subpages is counted in the reference
> + * count of the head page (i.e. if all subpages of a 2MB hugepage are
> + * under direct I/O, the reference of the head page is 512 and a bit more.)
> + * This means that when we try to migrate a hugepage whose subpages are
> + * doing direct I/O, some references remain after try_to_unmap() and
> + * hugepage migration fails without data corruption.
> + *
> + * There is also no race when direct I/O is issued on the page under migration,
> + * because then the pte is replaced with a migration swap entry and the direct
> + * I/O code will wait in the page fault for migration to complete.
> + */
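As an aside, the reference arithmetic described in that comment can be
made concrete with a short fragment (illustrative only, not part of the
patch; it just mirrors the check migrate_huge_page_move_mapping()
performs on the head page):

	/*
	 * Only the isolation ref and the radix tree ref (plus a
	 * private ref, if any) may remain on the head page when no
	 * I/O is in flight.
	 */
	int expected_count = 2 + page_has_private(hpage);

	/*
	 * Every subpage under direct I/O holds an extra reference on
	 * the head page, e.g. up to 512 extra references for a fully
	 * in-flight 2MB hugepage of 4kB subpages, so any in-flight
	 * I/O makes the freeze fail and migration back off with
	 * -EAGAIN instead of corrupting the I/O.
	 */
	if (page_count(hpage) != expected_count)
		return -EAGAIN;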
> +static int unmap_and_move_huge_page(new_page_t get_new_page,
> +				unsigned long private, struct page *hpage,
> +				int force, int offlining)
> +{
> +	int rc = 0;
> +	int *result = NULL;
> +	struct page *new_hpage = get_new_page(hpage, private, &result);
> +	int rcu_locked = 0;
> +	struct anon_vma *anon_vma = NULL;
> +
> +	if (!new_hpage)
> +		return -ENOMEM;
> +
> +	rc = -EAGAIN;
> +
> +	if (!trylock_page(hpage)) {
> +		if (!force)
> +			goto out;
> +		lock_page(hpage);
> +	}
> +
> +	if (PageAnon(hpage)) {
> +		rcu_read_lock();
> +		rcu_locked = 1;
> +
> +		if (page_mapped(hpage)) {
> +			anon_vma = page_anon_vma(hpage);
> +			atomic_inc(&anon_vma->external_refcount);
> +		}
> +	}
> +
> +	try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
> +
> +	if (!page_mapped(hpage))
> +		rc = move_to_new_page(new_hpage, hpage, 1);
> +
> +	if (rc)
> +		remove_migration_ptes(hpage, hpage);
> +
> +	if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
> +					    &anon_vma->lock)) {
> +		int empty = list_empty(&anon_vma->head);
> +		spin_unlock(&anon_vma->lock);
> +		if (empty)
> +			anon_vma_free(anon_vma);
> +	}
> +
> +	if (rcu_locked)
> +		rcu_read_unlock();
> +out:
> +	unlock_page(hpage);
> +
> +	if (rc != -EAGAIN) {
> +		list_del(&hpage->lru);
> +		put_page(hpage);
> +	}
> +
> +	put_page(new_hpage);
> +
> +	if (result) {
> +		if (rc)
> +			*result = rc;
> +		else
> +			*result = page_to_nid(new_hpage);
> +	}
> +	return rc;
> +}
> +
> +/*
>   * migrate_pages
>   *
>   * The function takes one list of pages to migrate and a function
> @@ -788,6 +938,52 @@ out:
>  	return nr_failed + retry;
>  }
>  
> +int migrate_huge_pages(struct list_head *from,
> +		new_page_t get_new_page, unsigned long private, int offlining)
> +{
> +	int retry = 1;
> +	int nr_failed = 0;
> +	int pass = 0;
> +	struct page *page;
> +	struct page *page2;
> +	int rc;
> +
> +	for (pass = 0; pass < 10 && retry; pass++) {
> +		retry = 0;
> +
> +		list_for_each_entry_safe(page, page2, from, lru) {
> +			cond_resched();
> +
> +			rc = unmap_and_move_huge_page(get_new_page,
> +					private, page, pass > 2, offlining);
> +
> +			switch(rc) {
> +			case -ENOMEM:
> +				goto out;
> +			case -EAGAIN:
> +				retry++;
> +				break;
> +			case 0:
> +				break;
> +			default:
> +				/* Permanent failure */
> +				nr_failed++;
> +				break;
> +			}
> +		}
> +	}
> +	rc = 0;
> +out:
> +
> +	list_for_each_entry_safe(page, page2, from, lru)
> +		put_page(page);
> +
> +	if (rc)
> +		return rc;
> +
> +	return nr_failed + retry;
> +}
> +
>  #ifdef CONFIG_NUMA
>  /*
>   * Move a list of individual pages
> -- 
> 1.7.2.2
> 

-- 
Mel Gorman
Part-time PhD Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab