On 02/12/2015 05:18 PM, Kirill A. Shutemov wrote:
> We're going to allow mapping of individual 4k pages of a THP compound
> page. It means we cannot rely on the PageTransHuge() check to decide
> whether to map a small page or the whole THP.
>
> The patch adds a new argument to the rmap functions to indicate whether
> we want to map the whole compound page or only the small page.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
> ---
>  include/linux/rmap.h    | 14 +++++++++++---
>  kernel/events/uprobes.c |  4 ++--
>  mm/huge_memory.c        | 16 ++++++++--------
>  mm/hugetlb.c            |  4 ++--
>  mm/ksm.c                |  4 ++--
>  mm/memory.c             | 14 +++++++-------
>  mm/migrate.c            |  8 ++++----
>  mm/rmap.c               | 43 +++++++++++++++++++++++++++----------------
>  mm/swapfile.c           |  4 ++--
>  9 files changed, 65 insertions(+), 46 deletions(-)
>
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index c4088feac1fc..3bf73620b672 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -168,16 +168,24 @@ static inline void anon_vma_merge(struct vm_area_struct *vma,
>
>  struct anon_vma *page_get_anon_vma(struct page *page);
>
> +/* flags for do_page_add_anon_rmap() */
> +enum {
> +        RMAP_EXCLUSIVE = 1,
> +        RMAP_COMPOUND = 2,
> +};
> +
>  /*
>   * rmap interfaces called when adding or removing pte of page
>   */
>  void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
> -void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
> +void page_add_anon_rmap(struct page *, struct vm_area_struct *,
> +                unsigned long, bool);
>  void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
>                             unsigned long, int);
> -void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
> +void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
> +                unsigned long, bool);
>  void page_add_file_rmap(struct page *);
> -void page_remove_rmap(struct page *);
> +void page_remove_rmap(struct page *, bool);
>
>  void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
>                          unsigned long);
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index cb346f26a22d..5523daf59953 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -183,7 +183,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
>                  goto unlock;
>
>          get_page(kpage);
> -        page_add_new_anon_rmap(kpage, vma, addr);
> +        page_add_new_anon_rmap(kpage, vma, addr, false);
>          mem_cgroup_commit_charge(kpage, memcg, false);
>          lru_cache_add_active_or_unevictable(kpage, vma);
>
> @@ -196,7 +196,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
>          ptep_clear_flush_notify(vma, addr, ptep);
>          set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
>
> -        page_remove_rmap(page);
> +        page_remove_rmap(page, false);
>          if (!page_mapped(page))
>                  try_to_free_swap(page);
>          pte_unmap_unlock(ptep, ptl);
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 5f4c97e1a6da..36637a80669e 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -743,7 +743,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
>                  pmd_t entry;
>                  entry = mk_huge_pmd(page, vma->vm_page_prot);
>                  entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> -                page_add_new_anon_rmap(page, vma, haddr);
> +                page_add_new_anon_rmap(page, vma, haddr, true);
>                  mem_cgroup_commit_charge(page, memcg, false);
>                  lru_cache_add_active_or_unevictable(page, vma);
>                  pgtable_trans_huge_deposit(mm, pmd, pgtable);
> @@ -1034,7 +1034,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
>                  entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>                  memcg = (void *)page_private(pages[i]);
>                  set_page_private(pages[i], 0);
> -                page_add_new_anon_rmap(pages[i], vma, haddr);
> +                page_add_new_anon_rmap(pages[i], vma, haddr, false);
>                  mem_cgroup_commit_charge(pages[i], memcg, false);
>                  lru_cache_add_active_or_unevictable(pages[i], vma);
>                  pte = pte_offset_map(&_pmd, haddr);
> @@ -1046,7 +1046,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
>
>          smp_wmb(); /* make pte visible before pmd */
>          pmd_populate(mm, pmd, pgtable);
> -        page_remove_rmap(page);
> +        page_remove_rmap(page, true);
>          spin_unlock(ptl);
>
>          mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
> @@ -1168,7 +1168,7 @@ alloc:
>                  entry = mk_huge_pmd(new_page, vma->vm_page_prot);
>                  entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>                  pmdp_clear_flush_notify(vma, haddr, pmd);
> -                page_add_new_anon_rmap(new_page, vma, haddr);
> +                page_add_new_anon_rmap(new_page, vma, haddr, true);
>                  mem_cgroup_commit_charge(new_page, memcg, false);
>                  lru_cache_add_active_or_unevictable(new_page, vma);
>                  set_pmd_at(mm, haddr, pmd, entry);
> @@ -1178,7 +1178,7 @@ alloc:
>                          put_huge_zero_page();
>                  } else {
>                          VM_BUG_ON_PAGE(!PageHead(page), page);
> -                        page_remove_rmap(page);
> +                        page_remove_rmap(page, true);
>                          put_page(page);
>                  }
>                  ret |= VM_FAULT_WRITE;
> @@ -1431,7 +1431,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>                  put_huge_zero_page();
>          } else {
>                  page = pmd_page(orig_pmd);
> -                page_remove_rmap(page);
> +                page_remove_rmap(page, true);
>                  VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
>                  add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
>                  VM_BUG_ON_PAGE(!PageHead(page), page);
> @@ -2368,7 +2368,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
>                   * superfluous.
>                   */
>                  pte_clear(vma->vm_mm, address, _pte);
> -                page_remove_rmap(src_page);
> +                page_remove_rmap(src_page, false);
>                  spin_unlock(ptl);
>                  free_page_and_swap_cache(src_page);
>          }
> @@ -2658,7 +2658,7 @@ static void collapse_huge_page(struct mm_struct *mm,
>
>          spin_lock(pmd_ptl);
>          BUG_ON(!pmd_none(*pmd));
> -        page_add_new_anon_rmap(new_page, vma, address);
> +        page_add_new_anon_rmap(new_page, vma, address, true);
>          mem_cgroup_commit_charge(new_page, memcg, false);
>          lru_cache_add_active_or_unevictable(new_page, vma);
>          pgtable_trans_huge_deposit(mm, pmd, pgtable);
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 0a9ac6c26832..ebb7329301c4 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -2688,7 +2688,7 @@ again:
>                  if (huge_pte_dirty(pte))
>                          set_page_dirty(page);
>
> -                page_remove_rmap(page);
> +                page_remove_rmap(page, true);
>                  force_flush = !__tlb_remove_page(tlb, page);
>                  if (force_flush) {
>                          address += sz;
> @@ -2908,7 +2908,7 @@ retry_avoidcopy:
>                  mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
>                  set_huge_pte_at(mm, address, ptep,
>                                  make_huge_pte(vma, new_page, 1));
> -                page_remove_rmap(old_page);
> +                page_remove_rmap(old_page, true);
>                  hugepage_add_new_anon_rmap(new_page, vma, address);
>                  /* Make the old page be freed below */
>                  new_page = old_page;
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 4162dce2eb44..92182eeba87d 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -957,13 +957,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
>          }
>
>          get_page(kpage);
> -        page_add_anon_rmap(kpage, vma, addr);
> +        page_add_anon_rmap(kpage, vma, addr, false);
>
>          flush_cache_page(vma, addr, pte_pfn(*ptep));
>          ptep_clear_flush_notify(vma, addr, ptep);
>          set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
>
> -        page_remove_rmap(page);
> +        page_remove_rmap(page, false);
>          if (!page_mapped(page))
>                  try_to_free_swap(page);
>          put_page(page);
> diff --git a/mm/memory.c b/mm/memory.c
> index 8ae52c918415..5529627d2cd6 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1125,7 +1125,7 @@ again:
>                                  mark_page_accessed(page);
>                                  rss[MM_FILEPAGES]--;
>                          }
> -                        page_remove_rmap(page);
> +                        page_remove_rmap(page, false);
>                          if (unlikely(page_mapcount(page) < 0))
>                                  print_bad_pte(vma, addr, ptent, page);
>                          if (unlikely(!__tlb_remove_page(tlb, page))) {
> @@ -2189,7 +2189,7 @@ gotten:
>                   * thread doing COW.
>                   */
>                  ptep_clear_flush_notify(vma, address, page_table);
> -                page_add_new_anon_rmap(new_page, vma, address);
> +                page_add_new_anon_rmap(new_page, vma, address, false);
>                  mem_cgroup_commit_charge(new_page, memcg, false);
>                  lru_cache_add_active_or_unevictable(new_page, vma);
>                  /*
> @@ -2222,7 +2222,7 @@ gotten:
>                   * mapcount is visible. So transitively, TLBs to
>                   * old page will be flushed before it can be reused.
>                   */
> -                page_remove_rmap(old_page);
> +                page_remove_rmap(old_page, false);
>          }
>
>          /* Free the old page.. */
> @@ -2465,7 +2465,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
>                  pte = maybe_mkwrite(pte_mkdirty(pte), vma);
>                  flags &= ~FAULT_FLAG_WRITE;
>                  ret |= VM_FAULT_WRITE;
> -                exclusive = 1;
> +                exclusive = RMAP_EXCLUSIVE;
>          }
>          flush_icache_page(vma, page);
>          if (pte_swp_soft_dirty(orig_pte))
> @@ -2475,7 +2475,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
>                  do_page_add_anon_rmap(page, vma, address, exclusive);
>                  mem_cgroup_commit_charge(page, memcg, true);
>          } else { /* ksm created a completely new copy */
> -                page_add_new_anon_rmap(page, vma, address);
> +                page_add_new_anon_rmap(page, vma, address, false);
>                  mem_cgroup_commit_charge(page, memcg, false);
>                  lru_cache_add_active_or_unevictable(page, vma);
>          }
> @@ -2613,7 +2613,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
>                  goto release;
>
>          inc_mm_counter_fast(mm, MM_ANONPAGES);
> -        page_add_new_anon_rmap(page, vma, address);
> +        page_add_new_anon_rmap(page, vma, address, false);
>          mem_cgroup_commit_charge(page, memcg, false);
>          lru_cache_add_active_or_unevictable(page, vma);
>  setpte:
> @@ -2701,7 +2701,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
>                  entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>          if (anon) {
>                  inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
> -                page_add_new_anon_rmap(page, vma, address);
> +                page_add_new_anon_rmap(page, vma, address, false);
>          } else {
>                  inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES);
>                  page_add_file_rmap(page);
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 85e042686031..0d2b3110277a 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -166,7 +166,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
>                  else
>                          page_dup_rmap(new);
>          } else if (PageAnon(new))
> -                page_add_anon_rmap(new, vma, addr);
> +                page_add_anon_rmap(new, vma, addr, false);
>          else
>                  page_add_file_rmap(new);
>
> @@ -1803,7 +1803,7 @@ fail_putback:
>           * guarantee the copy is visible before the pagetable update.
>           */
>          flush_cache_range(vma, mmun_start, mmun_end);
> -        page_add_anon_rmap(new_page, vma, mmun_start);
> +        page_add_anon_rmap(new_page, vma, mmun_start, true);
>          pmdp_clear_flush_notify(vma, mmun_start, pmd);
>          set_pmd_at(mm, mmun_start, pmd, entry);
>          flush_tlb_range(vma, mmun_start, mmun_end);
> @@ -1814,13 +1814,13 @@ fail_putback:
>                  flush_tlb_range(vma, mmun_start, mmun_end);
>                  mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
>                  update_mmu_cache_pmd(vma, address, &entry);
> -                page_remove_rmap(new_page);
> +                page_remove_rmap(new_page, true);
>                  goto fail_putback;
>          }
>
>          mem_cgroup_migrate(page, new_page, false);
>
> -        page_remove_rmap(page);
> +        page_remove_rmap(page, true);
>
>          spin_unlock(ptl);
>          mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 47b3ba87c2dd..f67e83be75e4 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1041,9 +1041,9 @@ static void __page_check_anon_rmap(struct page *page,
>   * (but PageKsm is never downgraded to PageAnon).
>   */
>  void page_add_anon_rmap(struct page *page,
> -        struct vm_area_struct *vma, unsigned long address)
> +        struct vm_area_struct *vma, unsigned long address, bool compound)
>  {
> -        do_page_add_anon_rmap(page, vma, address, 0);
> +        do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
>  }
>
>  /*
> @@ -1052,21 +1052,24 @@ void page_add_anon_rmap(struct page *page,
>   * Everybody else should continue to use page_add_anon_rmap above.
>   */
>  void do_page_add_anon_rmap(struct page *page,
> -        struct vm_area_struct *vma, unsigned long address, int exclusive)
> +        struct vm_area_struct *vma, unsigned long address, int flags)
>  {
>          int first = atomic_inc_and_test(&page->_mapcount);
>          if (first) {
> +                bool compound = flags & RMAP_COMPOUND;
> +                int nr = compound ? hpage_nr_pages(page) : 1;

hpage_nr_pages(page) is:

static inline int hpage_nr_pages(struct page *page)
{
        if (unlikely(PageTransHuge(page)))
                return HPAGE_PMD_NR;
        return 1;
}

and later...

>                  /*
>                   * We use the irq-unsafe __{inc|mod}_zone_page_stat because
>                   * these counters are not modified in interrupt context, and
>                   * pte lock(a spinlock) is held, which implies preemption
>                   * disabled.
>                   */
> -                if (PageTransHuge(page))
> +                if (compound) {
> +                        VM_BUG_ON_PAGE(!PageTransHuge(page), page);

this means that we could assume that

        (compound == true) => (PageTransHuge(page) == true)

and simplify the above to:

        int nr = compound ? HPAGE_PMD_NR : 1;

Right? The same thing seems to hold for the two other variants below.

> +                        __inc_zone_page_state(page,
>                                                NR_ANON_TRANSPARENT_HUGEPAGES);
> -                __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
> -                                hpage_nr_pages(page));
> +                }
> +                __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
>          }
>          if (unlikely(PageKsm(page)))
>                  return;
> @@ -1074,7 +1077,8 @@ void do_page_add_anon_rmap(struct page *page,
>          VM_BUG_ON_PAGE(!PageLocked(page), page);
>          /* address might be in next vma when migration races vma_adjust */
>          if (first)
> -                __page_set_anon_rmap(page, vma, address, exclusive);
> +                __page_set_anon_rmap(page, vma, address,
> +                                flags & RMAP_EXCLUSIVE);
>          else
>                  __page_check_anon_rmap(page, vma, address);
>  }
> @@ -1090,15 +1094,18 @@ void do_page_add_anon_rmap(struct page *page,
>   * Page does not have to be locked.
>   */
>  void page_add_new_anon_rmap(struct page *page,
> -        struct vm_area_struct *vma, unsigned long address)
> +        struct vm_area_struct *vma, unsigned long address, bool compound)
>  {
> +        int nr = compound ? hpage_nr_pages(page) : 1;
> +
>          VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
>          SetPageSwapBacked(page);
>          atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
> -        if (PageTransHuge(page))
> +        if (compound) {
> +                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
>                  __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
> -        __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
> -                        hpage_nr_pages(page));
> +        }
> +        __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
>          __page_set_anon_rmap(page, vma, address, 1);
>  }
>
> @@ -1154,9 +1161,12 @@ out:
>   *
>   * The caller needs to hold the pte lock.
>   */
> -void page_remove_rmap(struct page *page)
> +void page_remove_rmap(struct page *page, bool compound)
>  {
> +        int nr = compound ? hpage_nr_pages(page) : 1;
> +
>          if (!PageAnon(page)) {
> +                VM_BUG_ON_PAGE(compound && !PageHuge(page), page);
>                  page_remove_file_rmap(page);
>                  return;
>          }
> @@ -1174,11 +1184,12 @@ void page_remove_rmap(struct page *page)
>           * these counters are not modified in interrupt context, and
>           * pte lock(a spinlock) is held, which implies preemption disabled.
>           */
> -        if (PageTransHuge(page))
> +        if (compound) {
> +                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
>                  __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
> +        }
>
> -        __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
> -                        -hpage_nr_pages(page));
> +        __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr);
>
>          if (unlikely(PageMlocked(page)))
>                  clear_page_mlock(page);
> @@ -1320,7 +1331,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
>                  dec_mm_counter(mm, MM_FILEPAGES);
>
>  discard:
> -        page_remove_rmap(page);
> +        page_remove_rmap(page, false);
>          page_cache_release(page);
>
>  out_unmap:
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 63f55ccb9b26..200298895cee 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -1121,10 +1121,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
>          set_pte_at(vma->vm_mm, addr, pte,
>                     pte_mkold(mk_pte(page, vma->vm_page_prot)));
>          if (page == swapcache) {
> -                page_add_anon_rmap(page, vma, addr);
> +                page_add_anon_rmap(page, vma, addr, false);
>                  mem_cgroup_commit_charge(page, memcg, true);
>          } else { /* ksm created a completely new copy */
> -                page_add_new_anon_rmap(page, vma, addr);
> +                page_add_new_anon_rmap(page, vma, addr, false);
>                  mem_cgroup_commit_charge(page, memcg, false);
>                  lru_cache_add_active_or_unevictable(page, vma);
>          }
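For clarity, this is how the first-map path of do_page_add_anon_rmap() would
read with that simplification applied. This is an untested sketch assembled
only from the hunks above (the elided tail of the function is unchanged); the
other two variants would lose their hpage_nr_pages() calls in the same way:

void do_page_add_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address, int flags)
{
        int first = atomic_inc_and_test(&page->_mapcount);
        if (first) {
                bool compound = flags & RMAP_COMPOUND;
                /*
                 * compound implies PageTransHuge() (the VM_BUG_ON_PAGE()
                 * below enforces it), so HPAGE_PMD_NR can be used directly
                 * instead of re-testing the flag via hpage_nr_pages().
                 */
                int nr = compound ? HPAGE_PMD_NR : 1;

                if (compound) {
                        VM_BUG_ON_PAGE(!PageTransHuge(page), page);
                        __inc_zone_page_state(page,
                                              NR_ANON_TRANSPARENT_HUGEPAGES);
                }
                __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
        }
        ...
}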