On Tue, Mar 15, 2011 at 10:27:50AM +0100, Johannes Weiner wrote:
> On Fri, Mar 11, 2011 at 03:04:10AM +0100, Andrea Arcangeli wrote:
> > @@ -42,7 +42,7 @@ static pmd_t *get_old_pmd(struct mm_stru
> > 
> >  	pmd = pmd_offset(pud, addr);
> >  	split_huge_page_pmd(mm, pmd);
> 
> Wasn't getting rid of this line the sole purpose of the patch? :)

Right, that's a leftover that should have been deleted...

> > +		if (pmd_trans_huge(*old_pmd)) {
> > +			int err = move_huge_pmd(vma, old_addr, new_addr,
> > +						old_end, old_pmd, new_pmd);
> > +			if (err > 0) {
> > +				old_addr += HPAGE_PMD_SIZE;
> > +				new_addr += HPAGE_PMD_SIZE;
> > +				continue;
> > +			}
> > +		}
> > +		/*
> > +		 * split_huge_page_pmd() must run outside the
> > +		 * pmd_trans_huge() block above because that check is
> > +		 * racy. split_huge_page_pmd() will recheck
> > +		 * pmd_trans_huge() but in a non-racy way under the
> > +		 * page_table_lock.
> > +		 */
> > +		split_huge_page_pmd(vma->vm_mm, old_pmd);
> 
> I don't understand what we are racing here against.  If we see a huge
> pmd, it may split.  But we hold mmap_sem in write-mode, I don't see
> how a regular pmd could become huge all of a sudden at this point.

Agreed, in fact it already runs without the lock there too... Does this
look any better?  This also optimizes away the TLB flush for totally
uninitialized areas.

===
Subject: thp: mremap support and TLB optimization

From: Andrea Arcangeli <aarcange@xxxxxxxxxx>

This adds THP support to mremap (and decreases the number of
split_huge_page() calls needed).  It also replaces ptep_clear_flush
with ptep_get_and_clear followed by a single final flush_tlb_range, so
one TLB flush IPI is sent for the whole range instead of one IPI per
page.

Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx>
---
 include/linux/huge_mm.h |    3 +++
 mm/huge_memory.c        |   38 ++++++++++++++++++++++++++++++++++++++
 mm/mremap.c             |   29 +++++++++++++++++++++--------
 3 files changed, 62 insertions(+), 8 deletions(-)

--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -22,6 +22,9 @@ extern int zap_huge_pmd(struct mmu_gathe
 extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			unsigned long addr, unsigned long end,
 			unsigned char *vec);
+extern int move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
+			 unsigned long new_addr, unsigned long old_end,
+			 pmd_t *old_pmd, pmd_t *new_pmd);
 extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			unsigned long addr, pgprot_t newprot);
 
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -41,8 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru
 		return NULL;
 
 	pmd = pmd_offset(pud, addr);
-	split_huge_page_pmd(mm, pmd);
-	if (pmd_none_or_clear_bad(pmd))
+	if (pmd_none(*pmd))
 		return NULL;
 
 	return pmd;
@@ -80,11 +79,7 @@ static void move_ptes(struct vm_area_str
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
 	spinlock_t *old_ptl, *new_ptl;
-	unsigned long old_start;
 
-	old_start = old_addr;
-	mmu_notifier_invalidate_range_start(vma->vm_mm,
-					    old_start, old_end);
 	if (vma->vm_file) {
 		/*
 		 * Subtle point from Rajesh Venkatasubramanian: before
@@ -112,7 +107,7 @@ static void move_ptes(struct vm_area_str
 				   new_pte++, new_addr += PAGE_SIZE) {
 		if (pte_none(*old_pte))
 			continue;
-		pte = ptep_clear_flush(vma, old_addr, old_pte);
+		pte = ptep_get_and_clear(mm, old_addr, old_pte);
 		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
 		set_pte_at(mm, new_addr, new_pte, pte);
 	}
@@ -124,7 +119,6 @@ static void move_ptes(struct vm_area_str
 	pte_unmap_unlock(old_pte - 1, old_ptl);
 	if (mapping)
 		spin_unlock(&mapping->i_mmap_lock);
-	mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
 }
 
 #define LATENCY_LIMIT	(64 * PAGE_SIZE)
@@ -135,10 +129,13 @@ unsigned long move_page_tables(struct vm
 {
 	unsigned long extent, next, old_end;
 	pmd_t *old_pmd, *new_pmd;
+	bool need_flush = false;
 
 	old_end = old_addr + len;
 	flush_cache_range(vma, old_addr, old_end);
 
+	mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end);
+
 	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
 		cond_resched();
 		next = (old_addr + PMD_SIZE) & PMD_MASK;
@@ -151,6 +148,18 @@ unsigned long move_page_tables(struct vm
 		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
 		if (!new_pmd)
 			break;
+		need_flush = true;
+		if (pmd_trans_huge(*old_pmd)) {
+			int err = move_huge_pmd(vma, old_addr, new_addr,
+						old_end, old_pmd, new_pmd);
+			if (err > 0) {
+				old_addr += HPAGE_PMD_SIZE;
+				new_addr += HPAGE_PMD_SIZE;
+				continue;
+			} else if (!err)
+				__split_huge_page_pmd(vma->vm_mm, old_pmd);
+			VM_BUG_ON(pmd_trans_huge(*old_pmd));
+		}
 		next = (new_addr + PMD_SIZE) & PMD_MASK;
 		if (extent > next - new_addr)
 			extent = next - new_addr;
@@ -159,6 +168,10 @@ unsigned long move_page_tables(struct vm
 		move_ptes(vma, old_pmd, old_addr, old_addr + extent,
 				new_vma, new_pmd, new_addr);
 	}
+	if (likely(need_flush))
+		flush_tlb_range(vma, old_end-len, old_addr);
+
+	mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end);
 
 	return len + old_addr - old_end;	/* how much done */
 }
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1048,6 +1048,44 @@ int mincore_huge_pmd(struct vm_area_stru
 	return ret;
 }
 
+int move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
+		  unsigned long new_addr, unsigned long old_end,
+		  pmd_t *old_pmd, pmd_t *new_pmd)
+{
+	int ret = 0;
+	pmd_t pmd;
+
+	struct mm_struct *mm = vma->vm_mm;
+
+	if ((old_addr & ~HPAGE_PMD_MASK) ||
+	    (new_addr & ~HPAGE_PMD_MASK) ||
+	    (old_addr + HPAGE_PMD_SIZE) > old_end)
+		goto out;
+
+	/* if the new area is all for our destination it must be unmapped */
+	VM_BUG_ON(!pmd_none(*new_pmd));
+	/* mostly to remember this locking isn't enough with filebacked vma */
+	VM_BUG_ON(vma->vm_file);
+
+	spin_lock(&mm->page_table_lock);
+	if (likely(pmd_trans_huge(*old_pmd))) {
+		if (pmd_trans_splitting(*old_pmd)) {
+			spin_unlock(&vma->vm_mm->page_table_lock);
+			wait_split_huge_page(vma->anon_vma, old_pmd);
+			ret = -1;
+		} else {
+			pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+			set_pmd_at(mm, new_addr, new_pmd, pmd);
+			spin_unlock(&mm->page_table_lock);
+			ret = 1;
+		}
+	} else
+		spin_unlock(&mm->page_table_lock);
+
+out:
+	return ret;
+}
+
 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, pgprot_t newprot)
 {
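
Not part of the patch, just for reference: a minimal userspace sketch
(assuming x86-64 where HPAGE_PMD_SIZE is 2M, and THP enabled in sysfs;
the map_aligned() helper is made up for the example) of the kind of
mremap call that should now go through move_huge_pmd() and move each
huge pmd whole, with only the single flush_tlb_range at the end:

/*
 * Userspace sketch (not part of the patch): mremap a 2M-aligned,
 * THP-backed anonymous range to a 2M-aligned destination, so both
 * old_addr and new_addr satisfy the HPAGE_PMD_MASK checks.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define HPAGE_SIZE	(2UL << 20)	/* assumed HPAGE_PMD_SIZE */

/* over-map by 2M and round up so the returned address is 2M aligned */
static void *map_aligned(size_t len, int prot)
{
	char *p = mmap(NULL, len + HPAGE_SIZE, prot,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return MAP_FAILED;
	return (void *)(((unsigned long)p + HPAGE_SIZE - 1) &
			~(HPAGE_SIZE - 1));
}

int main(void)
{
	size_t len = 8UL << 20;		/* four huge pmds */
	void *src = map_aligned(len, PROT_READ | PROT_WRITE);
	void *dst = map_aligned(len, PROT_NONE); /* placeholder for MREMAP_FIXED */

	if (src == MAP_FAILED || dst == MAP_FAILED)
		return 1;

	madvise(src, len, MADV_HUGEPAGE);
	memset(src, 1, len);		/* fault in, hopefully as huge pages */

	/* both addresses 2M aligned: each pmd can be moved without splitting */
	if (mremap(src, len, len, MREMAP_MAYMOVE | MREMAP_FIXED,
		   dst) == MAP_FAILED) {
		perror("mremap");
		return 1;
	}
	printf("moved %zu bytes to %p\n", len, dst);
	return 0;
}

Whether the range is really backed by huge pmds still depends on the
transparent_hugepage settings; if it isn't, move_page_tables() just
falls back to move_ptes() as before.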