Hello everyone,

I've been wondering why mremap sends one IPI for each page that it moves. I tried to remove that, so we now send one IPI per vma/syscall instead of one per pte/page.

I also added native THP support: split_huge_page is no longer called unconditionally when both the source and destination alignment allow a pmd_trans_huge to be preserved (mremap extension and truncation already preserved existing hugepages, but the move into a new place didn't yet). If the destination alignment isn't ok, split_huge_page is unavoidable, but that is a userland/hardware limitation, not really something we can optimize further in the kernel.

I have no real numbers yet: volanomark results are mostly unchanged (it's a tiny bit faster, but that may be measurement error, and it doesn't seem to call mremap often enough), though the thp_split number in /proc/vmstat does seem to go down close to zero, so maybe other JIT workloads will benefit. In the meantime I'm posting this for review. I'm not entirely sure this is safe at this point (the TLB part especially). Also note that if any arch needs the TLB flush after ptep_get_and_clear, move_pte can provide it. The huge_memory.c part has no move_pmd equivalent because the only arch that needs move_pte (sparc64) doesn't support THP yet (I have no idea if sparc64 is a candidate for becoming a THP-capable arch; arm/ppc embedded should make it eventually).

I applied this to my aa.git tree and I'm running it on all my systems with no adverse effects for more than a day, so if you want to test it the usual procedure works:

first: git clone git://git.kernel.org/pub/scm/linux/kernel/git/andrea/aa.git
or first: git clone --reference linux-2.6 git://git.kernel.org/pub/scm/linux/kernel/git/andrea/aa.git
later: git fetch; git checkout -f origin/master
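To illustrate the userland side of the alignment requirement, here is a rough sketch of a test program. It is not part of the patch and only a sketch: it assumes x86-64's 2M hugepage size and THP enabled, the helper names (HPAGE_SIZE, align_up, etc.) are made up for the example, and MADV_HUGEPAGE is defined by hand in case older glibc headers lack it. It maps an anonymous region, rounds both source and destination up to 2M boundaries, faults the source in after MADV_HUGEPAGE, then moves it with MREMAP_FIXED. With both addresses 2M aligned the new move_huge_pmd() path can keep the huge pmds; misalign either address and split_huge_page remains unavoidable.

/* hedged userland sketch: exercise the aligned-mremap path */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14	/* not yet in older glibc headers */
#endif

#define HPAGE_SIZE	(2UL * 1024 * 1024)	/* assumes x86-64 2M hugepages */
#define LEN		(8 * HPAGE_SIZE)

static unsigned long align_up(void *p)
{
	return ((unsigned long)p + HPAGE_SIZE - 1) & ~(HPAGE_SIZE - 1);
}

int main(void)
{
	/* over-allocate so we can carve out 2M-aligned src and dst */
	void *src_raw = mmap(NULL, LEN + HPAGE_SIZE, PROT_READ | PROT_WRITE,
			     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	void *dst_raw = mmap(NULL, LEN + HPAGE_SIZE, PROT_NONE,
			     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (src_raw == MAP_FAILED || dst_raw == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	unsigned long src = align_up(src_raw);
	unsigned long dst = align_up(dst_raw);

	madvise((void *)src, LEN, MADV_HUGEPAGE);
	memset((void *)src, 0xaa, LEN);	/* fault in, hopefully as hugepages */

	/*
	 * src and dst are both HPAGE_SIZE aligned, so the huge pmds can be
	 * moved without splitting; watch thp_split in /proc/vmstat to check.
	 * MREMAP_FIXED unmaps the PROT_NONE placeholder at dst for us.
	 */
	void *moved = mremap((void *)src, LEN, LEN,
			     MREMAP_MAYMOVE | MREMAP_FIXED, (void *)dst);
	if (moved == MAP_FAILED) {
		perror("mremap");
		return 1;
	}
	printf("moved %d MB from %#lx to %p\n", (int)(LEN >> 20), src, moved);
	return 0;
}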
===

Subject: thp: mremap support and TLB optimization

From: Andrea Arcangeli <aarcange@xxxxxxxxxx>

This adds THP support to mremap (decreasing the number of split_huge_page() calls). It also replaces ptep_clear_flush with ptep_get_and_clear followed by a single final flush_tlb_range, so one TLB flush IPI is sent per mremap instead of one IPI per page.

Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx>
---
 include/linux/huge_mm.h |    3 +++
 mm/huge_memory.c        |   44 ++++++++++++++++++++++++++++++++++++++++++++
 mm/mremap.c             |   31 ++++++++++++++++++++++++-------
 3 files changed, 71 insertions(+), 7 deletions(-)

--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -22,6 +22,9 @@ extern int zap_huge_pmd(struct mmu_gathe
 extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			unsigned long addr, unsigned long end,
 			unsigned char *vec);
+extern int move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
+			 unsigned long new_addr, unsigned long old_end,
+			 pmd_t *old_pmd, pmd_t *new_pmd);
 extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			unsigned long addr, pgprot_t newprot);
 
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -42,7 +42,7 @@ static pmd_t *get_old_pmd(struct mm_stru
 
 	pmd = pmd_offset(pud, addr);
 	split_huge_page_pmd(mm, pmd);
-	if (pmd_none_or_clear_bad(pmd))
+	if (pmd_none(*pmd))
 		return NULL;
 
 	return pmd;
@@ -80,11 +80,7 @@ static void move_ptes(struct vm_area_str
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
 	spinlock_t *old_ptl, *new_ptl;
-	unsigned long old_start;
 
-	old_start = old_addr;
-	mmu_notifier_invalidate_range_start(vma->vm_mm,
-					    old_start, old_end);
 	if (vma->vm_file) {
 		/*
 		 * Subtle point from Rajesh Venkatasubramanian: before
@@ -112,7 +108,7 @@ static void move_ptes(struct vm_area_str
 				   new_pte++, new_addr += PAGE_SIZE) {
 		if (pte_none(*old_pte))
 			continue;
-		pte = ptep_clear_flush(vma, old_addr, old_pte);
+		pte = ptep_get_and_clear(mm, old_addr, old_pte);
 		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
 		set_pte_at(mm, new_addr, new_pte, pte);
 	}
@@ -124,7 +120,6 @@ static void move_ptes(struct vm_area_str
 	pte_unmap_unlock(old_pte - 1, old_ptl);
 	if (mapping)
 		spin_unlock(&mapping->i_mmap_lock);
-	mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
 }
 
 #define LATENCY_LIMIT	(64 * PAGE_SIZE)
@@ -139,6 +134,8 @@ unsigned long move_page_tables(struct vm
 	old_end = old_addr + len;
 	flush_cache_range(vma, old_addr, old_end);
 
+	mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end);
+
 	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
 		cond_resched();
 		next = (old_addr + PMD_SIZE) & PMD_MASK;
@@ -151,6 +148,23 @@ unsigned long move_page_tables(struct vm
 		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
 		if (!new_pmd)
 			break;
+		if (pmd_trans_huge(*old_pmd)) {
+			int err = move_huge_pmd(vma, old_addr, new_addr,
+						old_end, old_pmd, new_pmd);
+			if (err > 0) {
+				old_addr += HPAGE_PMD_SIZE;
+				new_addr += HPAGE_PMD_SIZE;
+				continue;
+			}
+		}
+		/*
+		 * split_huge_page_pmd() must run outside the
+		 * pmd_trans_huge() block above because that check
+		 * is racy. split_huge_page_pmd() will recheck
+		 * pmd_trans_huge() but in a not racy way under the
+		 * page_table_lock.
+		 */
+		split_huge_page_pmd(vma->vm_mm, old_pmd);
 		next = (new_addr + PMD_SIZE) & PMD_MASK;
 		if (extent > next - new_addr)
 			extent = next - new_addr;
@@ -159,6 +173,9 @@ unsigned long move_page_tables(struct vm
 		move_ptes(vma, old_pmd, old_addr, old_addr + extent,
 				new_vma, new_pmd, new_addr);
 	}
+	flush_tlb_range(vma, old_end-len, old_addr);
+
+	mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end);
 
 	return len + old_addr - old_end;	/* how much done */
 }
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1048,6 +1048,50 @@ int mincore_huge_pmd(struct vm_area_stru
 	return ret;
 }
 
+int move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
+		  unsigned long new_addr, unsigned long old_end,
+		  pmd_t *old_pmd, pmd_t *new_pmd)
+{
+	int ret = 0;
+	pmd_t pmd;
+
+	struct mm_struct *mm = vma->vm_mm;
+
+	if ((old_addr & ~HPAGE_PMD_MASK) ||
+	    (new_addr & ~HPAGE_PMD_MASK) ||
+	    (old_addr + HPAGE_PMD_SIZE) > old_end)
+		goto out;
+
+	/* if the new area is all for our destination it must be unmapped */
+	VM_BUG_ON(!pmd_none(*new_pmd));
+	/* mostly to remember this locking isn't enough with filebacked vma */
+	VM_BUG_ON(vma->vm_file);
+
+	spin_lock(&mm->page_table_lock);
+	if (likely(pmd_trans_huge(*old_pmd))) {
+		if (pmd_trans_splitting(*old_pmd)) {
+			spin_unlock(&vma->vm_mm->page_table_lock);
+			/*
+			 * It's not mandatory to wait here as the
+			 * caller will run split_huge_page_pmd(), but
+			 * this is faster and it avoids the caller
+			 * having to invoke __split_huge_page_pmd()
+			 * (and to take the page_table_lock again).
+			 */
+			wait_split_huge_page(vma->anon_vma, old_pmd);
+		} else {
+			pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+			set_pmd_at(mm, new_addr, new_pmd, pmd);
+			spin_unlock(&mm->page_table_lock);
+			ret = 1;
+		}
+	} else
+		spin_unlock(&mm->page_table_lock);
+
+out:
+	return ret;
+}
+
 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, pgprot_t newprot)
 {