(12/21/11 5:23 PM), Naoya Horiguchi wrote:
> Currently, when we check whether we can handle a thp as it is or need
> to split it into regular sized pages, we take the page table lock
> before checking whether the given pmd is mapping a thp at all. Because
> of this, when the pmd is not a "huge pmd" we pay unnecessary
> lock/unlock overhead. To remove it, this patch introduces an optimized
> check function and replaces several similar open-coded sequences with it.
>
> Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
> Cc: David Rientjes <rientjes@xxxxxxxxxx>

ok, this looks like a valuable patch.

> ---
>  fs/proc/task_mmu.c      |   74 ++++++++++------------------
>  include/linux/huge_mm.h |    7 +++
>  mm/huge_memory.c        |  124 ++++++++++++++++++++++------------------
>  mm/mremap.c             |    3 +-
>  4 files changed, 93 insertions(+), 115 deletions(-)
>
> diff --git 3.2-rc5.orig/fs/proc/task_mmu.c 3.2-rc5/fs/proc/task_mmu.c
> index 0df61ab..3b79dd4 100644
> --- 3.2-rc5.orig/fs/proc/task_mmu.c
> +++ 3.2-rc5/fs/proc/task_mmu.c
> @@ -394,20 +394,12 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
>  	pte_t *pte;
>  	spinlock_t *ptl;
>
> -	spin_lock(&walk->mm->page_table_lock);
> -	if (pmd_trans_huge(*pmd)) {
> -		if (pmd_trans_splitting(*pmd)) {
> -			spin_unlock(&walk->mm->page_table_lock);
> -			wait_split_huge_page(vma->anon_vma, pmd);
> -		} else {
> -			smaps_pte_entry(*(pte_t *)pmd, addr,
> -					HPAGE_PMD_SIZE, walk);
> -			spin_unlock(&walk->mm->page_table_lock);
> -			mss->anonymous_thp += HPAGE_PMD_SIZE;
> -			return 0;
> -		}
> -	} else {
> +	if (check_and_wait_split_huge_pmd(pmd, vma)) {
> +		smaps_pte_entry(*(pte_t *)pmd, addr,
> +				HPAGE_PMD_SIZE, walk);
>  		spin_unlock(&walk->mm->page_table_lock);
> +		mss->anonymous_thp += HPAGE_PMD_SIZE;
> +		return 0;
>  	}
>  	/*
>  	 * The mmap_sem held all the way back in m_start() is what
> @@ -689,26 +681,19 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
>  	/* find the first VMA at or above 'addr' */
>  	vma = find_vma(walk->mm, addr);
>
> -	spin_lock(&walk->mm->page_table_lock);
> -	if (pmd_trans_huge(*pmd)) {
> -		if (pmd_trans_splitting(*pmd)) {
> -			spin_unlock(&walk->mm->page_table_lock);
> -			wait_split_huge_page(vma->anon_vma, pmd);
> -		} else {
> -			for (; addr != end; addr += PAGE_SIZE) {
> -				int offset = (addr & ~PAGEMAP_WALK_MASK)
> -						>> PAGE_SHIFT;
> -				pfn = thp_pte_to_pagemap_entry(*(pte_t *)pmd,
> -						offset);
> -				err = add_to_pagemap(addr, pfn, pm);
> -				if (err)
> -					break;
> -			}
> -			spin_unlock(&walk->mm->page_table_lock);
> -			return err;
> +	/* David comment */

This comment doesn't explain anything. Please replace it with one that
actually describes what happens here.
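Maybe something along these lines would help the reader (wording is only
a suggestion, not taken from the patch):

	/*
	 * If the pmd maps a stable thp (check_and_wait_split_huge_pmd()
	 * returns 1 with page_table_lock held), report the whole
	 * HPAGE_PMD_SIZE range from the pmd instead of walking the ptes.
	 */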
> +	if (check_and_wait_split_huge_pmd(pmd, vma)) {
> +		for (; addr != end; addr += PAGE_SIZE) {
> +			int offset = (addr & ~PAGEMAP_WALK_MASK)
> +					>> PAGE_SHIFT;
> +			pfn = thp_pte_to_pagemap_entry(*(pte_t *)pmd,
> +					offset);
> +			err = add_to_pagemap(addr, pfn, pm);
> +			if (err)
> +				break;
>  		}
> -	} else {
>  		spin_unlock(&walk->mm->page_table_lock);
> +		return err;
>  	}
>
>  	for (; addr != end; addr += PAGE_SIZE) {
> @@ -975,24 +960,17 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
>  	pte_t *pte;
>
>  	md = walk->private;
> -	spin_lock(&walk->mm->page_table_lock);
> -	if (pmd_trans_huge(*pmd)) {
> -		if (pmd_trans_splitting(*pmd)) {
> -			spin_unlock(&walk->mm->page_table_lock);
> -			wait_split_huge_page(md->vma->anon_vma, pmd);
> -		} else {
> -			pte_t huge_pte = *(pte_t *)pmd;
> -			struct page *page;
> -
> -			page = can_gather_numa_stats(huge_pte, md->vma, addr);
> -			if (page)
> -				gather_stats(page, md, pte_dirty(huge_pte),
> -					     HPAGE_PMD_SIZE/PAGE_SIZE);
> -			spin_unlock(&walk->mm->page_table_lock);
> -			return 0;
> -		}
> -	} else {
> +
> +	if (check_and_wait_split_huge_pmd(pmd, md->vma)) {
> +		pte_t huge_pte = *(pte_t *)pmd;
> +		struct page *page;
> +
> +		page = can_gather_numa_stats(huge_pte, md->vma, addr);
> +		if (page)
> +			gather_stats(page, md, pte_dirty(huge_pte),
> +				     HPAGE_PMD_SIZE/PAGE_SIZE);
>  		spin_unlock(&walk->mm->page_table_lock);
> +		return 0;
>  	}
>
>  	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
> diff --git 3.2-rc5.orig/include/linux/huge_mm.h 3.2-rc5/include/linux/huge_mm.h
> index a9ace9c..477c8e3 100644
> --- 3.2-rc5.orig/include/linux/huge_mm.h
> +++ 3.2-rc5/include/linux/huge_mm.h
> @@ -113,6 +113,8 @@ extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
>  					 unsigned long start,
>  					 unsigned long end,
>  					 long adjust_next);
> +extern int check_and_wait_split_huge_pmd(pmd_t *pmd,
> +					 struct vm_area_struct *vma);
>  static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
>  					 unsigned long start,
>  					 unsigned long end,
> @@ -176,6 +178,11 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
>  					 long adjust_next)
>  {
>  }
> +static inline int check_and_wait_split_huge_pmd(pmd_t *pmd,
> +						struct vm_area_struct *vma)
> +{
> +	return 0;
> +}
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>
>  #endif /* _LINUX_HUGE_MM_H */
> diff --git 3.2-rc5.orig/mm/huge_memory.c 3.2-rc5/mm/huge_memory.c
> index 36b3d98..b73c744 100644
> --- 3.2-rc5.orig/mm/huge_memory.c
> +++ 3.2-rc5/mm/huge_memory.c
> @@ -1001,29 +1001,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>  {
>  	int ret = 0;
>
> -	spin_lock(&tlb->mm->page_table_lock);
> -	if (likely(pmd_trans_huge(*pmd))) {
> -		if (unlikely(pmd_trans_splitting(*pmd))) {
> -			spin_unlock(&tlb->mm->page_table_lock);
> -			wait_split_huge_page(vma->anon_vma,
> -					     pmd);
> -		} else {
> -			struct page *page;
> -			pgtable_t pgtable;
> -			pgtable = get_pmd_huge_pte(tlb->mm);
> -			page = pmd_page(*pmd);
> -			pmd_clear(pmd);
> -			page_remove_rmap(page);
> -			VM_BUG_ON(page_mapcount(page) < 0);
> -			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
> -			VM_BUG_ON(!PageHead(page));
> -			spin_unlock(&tlb->mm->page_table_lock);
> -			tlb_remove_page(tlb, page);
> -			pte_free(tlb->mm, pgtable);
> -			ret = 1;
> -		}
> -	} else
> +	if (likely(check_and_wait_split_huge_pmd(pmd, vma))) {
> +		struct page *page;
> +		pgtable_t pgtable;
> +		pgtable = get_pmd_huge_pte(tlb->mm);
> +		page = pmd_page(*pmd);
> +		pmd_clear(pmd);
> +		page_remove_rmap(page);
> +		VM_BUG_ON(page_mapcount(page) < 0);
> +		add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
> +		VM_BUG_ON(!PageHead(page));
>  		spin_unlock(&tlb->mm->page_table_lock);
> +		tlb_remove_page(tlb, page);
> +		pte_free(tlb->mm, pgtable);
> +		ret = 1;
> +	}
>
>  	return ret;
>  }
> @@ -1034,21 +1026,14 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>  {
>  	int ret = 0;
>
> -	spin_lock(&vma->vm_mm->page_table_lock);
> -	if (likely(pmd_trans_huge(*pmd))) {
> -		ret = !pmd_trans_splitting(*pmd);
> -		spin_unlock(&vma->vm_mm->page_table_lock);
> -		if (unlikely(!ret))
> -			wait_split_huge_page(vma->anon_vma, pmd);
> -		else {
> -			/*
> -			 * All logical pages in the range are present
> -			 * if backed by a huge page.
> -			 */
> -			memset(vec, 1, (end - addr) >> PAGE_SHIFT);
> -		}
> -	} else
> +	if (likely(check_and_wait_split_huge_pmd(pmd, vma))) {
> +		/*
> +		 * All logical pages in the range are present
> +		 * if backed by a huge page.
> +		 */
>  		spin_unlock(&vma->vm_mm->page_table_lock);
> +		memset(vec, 1, (end - addr) >> PAGE_SHIFT);
> +	}
>
>  	return ret;
>  }
> @@ -1078,21 +1063,12 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
>  		goto out;
>  	}
>
> -	spin_lock(&mm->page_table_lock);
> -	if (likely(pmd_trans_huge(*old_pmd))) {
> -		if (pmd_trans_splitting(*old_pmd)) {
> -			spin_unlock(&mm->page_table_lock);
> -			wait_split_huge_page(vma->anon_vma, old_pmd);
> -			ret = -1;
> -		} else {
> -			pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
> -			VM_BUG_ON(!pmd_none(*new_pmd));
> -			set_pmd_at(mm, new_addr, new_pmd, pmd);
> -			spin_unlock(&mm->page_table_lock);
> -			ret = 1;
> -		}
> -	} else {
> +	if (likely(check_and_wait_split_huge_pmd(old_pmd, vma))) {
> +		pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
> +		VM_BUG_ON(!pmd_none(*new_pmd));
> +		set_pmd_at(mm, new_addr, new_pmd, pmd);
>  		spin_unlock(&mm->page_table_lock);
> +		ret = 1;
>  	}
>  out:
>  	return ret;
> @@ -1104,27 +1080,45 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>  	struct mm_struct *mm = vma->vm_mm;
>  	int ret = 0;
>
> -	spin_lock(&mm->page_table_lock);
> -	if (likely(pmd_trans_huge(*pmd))) {
> -		if (unlikely(pmd_trans_splitting(*pmd))) {
> -			spin_unlock(&mm->page_table_lock);
> -			wait_split_huge_page(vma->anon_vma, pmd);
> -		} else {
> -			pmd_t entry;
> +	if (likely(check_and_wait_split_huge_pmd(pmd, vma))) {
> +		pmd_t entry;
>
> -			entry = pmdp_get_and_clear(mm, addr, pmd);
> -			entry = pmd_modify(entry, newprot);
> -			set_pmd_at(mm, addr, pmd, entry);
> -			spin_unlock(&vma->vm_mm->page_table_lock);
> -			flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
> -			ret = 1;
> -		}
> -	} else
> +		entry = pmdp_get_and_clear(mm, addr, pmd);
> +		entry = pmd_modify(entry, newprot);
> +		set_pmd_at(mm, addr, pmd, entry);
>  		spin_unlock(&vma->vm_mm->page_table_lock);
> +		flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
> +		ret = 1;
> +	}
>
>  	return ret;
>  }
>
> +/*
> + * Returns 1 if a given pmd is mapping a thp and stable (not under splitting).
> + * Returns 0 otherwise. Note that if it returns 1, this routine returns without
> + * unlocking page table locks. So callers must unlock them.
> + */
> +int check_and_wait_split_huge_pmd(pmd_t *pmd, struct vm_area_struct *vma)

We should always avoid a name like "check"; it doesn't explain what the
function does.

> +{

VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem)) here?
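For example, something like this would at least encode the contract in the
name (the identifier below is only a placeholder to show the direction, not
a demand):

	/* Returns 1 with page_table_lock held iff *pmd maps a stable thp. */
	int pmd_trans_huge_stable(pmd_t *pmd, struct vm_area_struct *vma);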
> +	if (!pmd_trans_huge(*pmd))
> +		return 0;
> +
> +	spin_lock(&vma->vm_mm->page_table_lock);
> +	if (likely(pmd_trans_huge(*pmd))) {
> +		if (pmd_trans_splitting(*pmd)) {
> +			spin_unlock(&vma->vm_mm->page_table_lock);
> +			wait_split_huge_page(vma->anon_vma, pmd);
> +		} else {
> +			/* Thp mapped by 'pmd' is stable, so we can
> +			 * handle it as it is. */
> +			return 1;
> +		}
> +	}
> +	spin_unlock(&vma->vm_mm->page_table_lock);
> +	return 0;
> +}
> +
>  pmd_t *page_check_address_pmd(struct page *page,
>  			      struct mm_struct *mm,
>  			      unsigned long address,
> diff --git 3.2-rc5.orig/mm/mremap.c 3.2-rc5/mm/mremap.c
> index d6959cb..d534668 100644
> --- 3.2-rc5.orig/mm/mremap.c
> +++ 3.2-rc5/mm/mremap.c
> @@ -155,9 +155,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
>  			if (err > 0) {
>  				need_flush = true;
>  				continue;
> -			} else if (!err) {
> -				split_huge_page_pmd(vma->vm_mm, old_pmd);
>  			}
> +			split_huge_page_pmd(vma->vm_mm, old_pmd);

unrelated hunk?

>  			VM_BUG_ON(pmd_trans_huge(*old_pmd));
>  		}
>  		if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
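With the rename suggested above, the helper could read roughly like this
(again just a sketch with a made-up name, not something from the patch; the
unlocked pre-check and the "returns with page_table_lock held" contract are
the parts I care about):

	int pmd_trans_huge_stable(pmd_t *pmd, struct vm_area_struct *vma)
	{
		/* Callers are expected to hold mmap_sem (see the comment above). */
		VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));

		/* Cheap check without the lock: most pmds are not huge. */
		if (!pmd_trans_huge(*pmd))
			return 0;

		spin_lock(&vma->vm_mm->page_table_lock);
		if (likely(pmd_trans_huge(*pmd))) {
			if (pmd_trans_splitting(*pmd)) {
				/* Drop the lock and wait for the split to finish. */
				spin_unlock(&vma->vm_mm->page_table_lock);
				wait_split_huge_page(vma->anon_vma, pmd);
				return 0;
			}
			/* Stable thp: return with page_table_lock still held. */
			return 1;
		}
		spin_unlock(&vma->vm_mm->page_table_lock);
		return 0;
	}

Returning with the lock held on success keeps all the existing
spin_unlock(&...->page_table_lock) calls in the converted callers valid,
which is the point of the "callers must unlock" note in the comment.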