Hello everyone,

I've been wondering why mremap sends one IPI for each page that it moves. I tried to remove that, so we now send one IPI per vma/syscall instead of one per pte/page.

I also added native THP support: split_huge_page is no longer called unconditionally when both the source and destination alignment allow a pmd_trans_huge to be preserved (mremap extension and truncation already preserved existing hugepages, but the move into a new place didn't yet). If the destination alignment isn't ok, split_huge_page is unavoidable, but that is a userland/hardware limitation, not really something we can optimize further in the kernel.

I have no real numbers yet: volanomark results are mostly unchanged (it's a tiny bit faster, but that may be measurement error, and it doesn't seem to call mremap often enough), though the thp_split number in /proc/vmstat does seem to go down close to zero, so maybe other JIT workloads will benefit. In the meantime I'm posting this for review. I'm not entirely sure this is safe at this point (the TLB part especially). Also note that if any arch needs the TLB flush after ptep_get_and_clear, move_pte can provide it. The huge_memory.c part has no move_pmd equivalent because the only arch that needs move_pte (sparc64) doesn't support THP yet (I have no idea if sparc64 is a candidate for becoming a THP-capable arch; arm/ppc embedded should make it eventually).

I applied this to my aa.git tree and I'm running it on all my systems with no adverse effects for more than a day, so if you want to test it the usual procedure works:

first: git clone git://git.kernel.org/pub/scm/linux/kernel/git/andrea/aa.git
or first: git clone --reference linux-2.6 git://git.kernel.org/pub/scm/linux/kernel/git/andrea/aa.git
later: git fetch; git checkout -f origin/master
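To illustrate the userland side of the alignment requirement, here is a rough sketch of a test program. It is not part of the patch and only a sketch: it assumes x86-64's 2M hugepage size and THP enabled, the helper names (HPAGE_SIZE, align_up, etc.) are made up for the example, and MADV_HUGEPAGE is defined by hand in case older glibc headers lack it. It maps an anonymous region, rounds both source and destination up to 2M boundaries, faults the source in after MADV_HUGEPAGE, then moves it with MREMAP_FIXED. With both addresses 2M aligned the new move_huge_pmd() path can keep the huge pmds; misalign either address and split_huge_page remains unavoidable.

/* hedged userland sketch: exercise the aligned-mremap path */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14	/* not yet in older glibc headers */
#endif

#define HPAGE_SIZE	(2UL * 1024 * 1024)	/* assumes x86-64 2M hugepages */
#define LEN		(8 * HPAGE_SIZE)

static unsigned long align_up(void *p)
{
	return ((unsigned long)p + HPAGE_SIZE - 1) & ~(HPAGE_SIZE - 1);
}

int main(void)
{
	/* over-allocate so we can carve out 2M-aligned src and dst */
	void *src_raw = mmap(NULL, LEN + HPAGE_SIZE, PROT_READ | PROT_WRITE,
			     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	void *dst_raw = mmap(NULL, LEN + HPAGE_SIZE, PROT_NONE,
			     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (src_raw == MAP_FAILED || dst_raw == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	unsigned long src = align_up(src_raw);
	unsigned long dst = align_up(dst_raw);

	madvise((void *)src, LEN, MADV_HUGEPAGE);
	memset((void *)src, 0xaa, LEN);	/* fault in, hopefully as hugepages */

	/*
	 * src and dst are both HPAGE_SIZE aligned, so the huge pmds can be
	 * moved without splitting; watch thp_split in /proc/vmstat to check.
	 * MREMAP_FIXED unmaps the PROT_NONE placeholder at dst for us.
	 */
	void *moved = mremap((void *)src, LEN, LEN,
			     MREMAP_MAYMOVE | MREMAP_FIXED, (void *)dst);
	if (moved == MAP_FAILED) {
		perror("mremap");
		return 1;
	}
	printf("moved %d MB from %#lx to %p\n", (int)(LEN >> 20), src, moved);
	return 0;
}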
===

Subject: thp: mremap support and TLB optimization

From: Andrea Arcangeli <aarcange@xxxxxxxxxx>

This adds THP support to mremap (decreasing the number of split_huge_page() calls). It also replaces ptep_clear_flush with ptep_get_and_clear followed by a single final flush_tlb_range, so one TLB flush IPI is sent per mremap instead of one IPI per page.

Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx>
---
 include/linux/huge_mm.h |    3 +++
 mm/huge_memory.c        |   44 ++++++++++++++++++++++++++++++++++++++++++++
 mm/mremap.c             |   31 ++++++++++++++++++++++++-------
 3 files changed, 71 insertions(+), 7 deletions(-)

--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -22,6 +22,9 @@ extern int zap_huge_pmd(struct mmu_gathe
 extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			unsigned long addr, unsigned long end,
 			unsigned char *vec);
+extern int move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
+			 unsigned long new_addr, unsigned long old_end,
+			 pmd_t *old_pmd, pmd_t *new_pmd);
 extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			unsigned long addr, pgprot_t newprot);
 
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -42,7 +42,7 @@ static pmd_t *get_old_pmd(struct mm_stru
 
 	pmd = pmd_offset(pud, addr);
 	split_huge_page_pmd(mm, pmd);
-	if (pmd_none_or_clear_bad(pmd))
+	if (pmd_none(*pmd))
 		return NULL;
 
 	return pmd;
@@ -80,11 +80,7 @@ static void move_ptes(struct vm_area_str
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
 	spinlock_t *old_ptl, *new_ptl;
-	unsigned long old_start;
 
-	old_start = old_addr;
-	mmu_notifier_invalidate_range_start(vma->vm_mm,
-					    old_start, old_end);
 	if (vma->vm_file) {
 		/*
 		 * Subtle point from Rajesh Venkatasubramanian: before
@@ -112,7 +108,7 @@ static void move_ptes(struct vm_area_str
 				   new_pte++, new_addr += PAGE_SIZE) {
 		if (pte_none(*old_pte))
 			continue;
-		pte = ptep_clear_flush(vma, old_addr, old_pte);
+		pte = ptep_get_and_clear(mm, old_addr, old_pte);
 		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
 		set_pte_at(mm, new_addr, new_pte, pte);
 	}
@@ -124,7 +120,6 @@ static void move_ptes(struct vm_area_str
 	pte_unmap_unlock(old_pte - 1, old_ptl);
 	if (mapping)
 		spin_unlock(&mapping->i_mmap_lock);
-	mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
 }
 
 #define LATENCY_LIMIT	(64 * PAGE_SIZE)
@@ -139,6 +134,8 @@ unsigned long move_page_tables(struct vm
 	old_end = old_addr + len;
 	flush_cache_range(vma, old_addr, old_end);
 
+	mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end);
+
 	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
 		cond_resched();
 		next = (old_addr + PMD_SIZE) & PMD_MASK;
@@ -151,6 +148,23 @@ unsigned long move_page_tables(struct vm
 		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
 		if (!new_pmd)
 			break;
+		if (pmd_trans_huge(*old_pmd)) {
+			int err = move_huge_pmd(vma, old_addr, new_addr,
+						old_end, old_pmd, new_pmd);
+			if (err > 0) {
+				old_addr += HPAGE_PMD_SIZE;
+				new_addr += HPAGE_PMD_SIZE;
+				continue;
+			}
+		}
+		/*
+		 * split_huge_page_pmd() must run outside the
+		 * pmd_trans_huge() block above because that check
+		 * is racy. split_huge_page_pmd() will recheck
+		 * pmd_trans_huge() but in a not racy way under the
+		 * page_table_lock.
+		 */
+		split_huge_page_pmd(vma->vm_mm, old_pmd);
 		next = (new_addr + PMD_SIZE) & PMD_MASK;
 		if (extent > next - new_addr)
 			extent = next - new_addr;
@@ -159,6 +173,9 @@ unsigned long move_page_tables(struct vm
 		move_ptes(vma, old_pmd, old_addr, old_addr + extent,
 				new_vma, new_pmd, new_addr);
 	}
+	flush_tlb_range(vma, old_end-len, old_addr);
+
+	mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end);
 
 	return len + old_addr - old_end;	/* how much done */
 }
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1048,6 +1048,50 @@ int mincore_huge_pmd(struct vm_area_stru
 	return ret;
 }
 
+int move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
+		  unsigned long new_addr, unsigned long old_end,
+		  pmd_t *old_pmd, pmd_t *new_pmd)
+{
+	int ret = 0;
+	pmd_t pmd;
+
+	struct mm_struct *mm = vma->vm_mm;
+
+	if ((old_addr & ~HPAGE_PMD_MASK) ||
+	    (new_addr & ~HPAGE_PMD_MASK) ||
+	    (old_addr + HPAGE_PMD_SIZE) > old_end)
+		goto out;
+
+	/* if the new area is all for our destination it must be unmapped */
+	VM_BUG_ON(!pmd_none(*new_pmd));
+	/* mostly to remember this locking isn't enough with filebacked vma */
+	VM_BUG_ON(vma->vm_file);
+
+	spin_lock(&mm->page_table_lock);
+	if (likely(pmd_trans_huge(*old_pmd))) {
+		if (pmd_trans_splitting(*old_pmd)) {
+			spin_unlock(&vma->vm_mm->page_table_lock);
+			/*
+			 * It's not mandatory to wait here as the
+			 * caller will run split_huge_page_pmd(), but
+			 * this is faster and it avoids the caller
+			 * having to invoke __split_huge_page_pmd()
+			 * (and to take the page_table_lock again).
+			 */
+			wait_split_huge_page(vma->anon_vma, old_pmd);
+		} else {
+			pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+			set_pmd_at(mm, new_addr, new_pmd, pmd);
+			spin_unlock(&mm->page_table_lock);
+			ret = 1;
+		}
+	} else
+		spin_unlock(&mm->page_table_lock);
+
+out:
+	return ret;
+}
+
 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, pgprot_t newprot)
 {