- shared-page-table-for-hugetlb-page-v2.patch removed from -mm tree

The patch titled

     shared page table for hugetlb page - v2

has been removed from the -mm tree.  Its filename is

     shared-page-table-for-hugetlb-page-v2.patch

This patch was dropped because it was nacked by the maintainer.

------------------------------------------------------
Subject: shared page table for hugetlb page - v2
From: "Chen, Kenneth W" <kenneth.w.chen@xxxxxxxxx>

Following up on the shared page table work, here is a re-post of shared
page tables for hugetlb memory.  Dave's latest patch restricts page table
sharing to the pmd level in order to simplify some of the complexity for
normal pages, but that simplification cuts out all of the performance
benefit for hugetlb on x86-64 and ia32.

The following patch attempts to bring that optimization back for hugetlb
memory and allows page table sharing at the second level.  It is nicely
self-contained within the hugetlb subsystem.  With no impact on the
generic VM at all, I think this patch is ready for mainline consideration.
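
For illustration, here is a minimal userspace sketch of the kind of mapping
that satisfies the conditions checked by page_table_shareable(): both tasks
map the same hugetlbfs file MAP_SHARED with identical vma start/end and
flags, and the vma fully covers a naturally aligned window the size of one
pmd page's reach (1GB on x86-64 with 2MB huge pages).  The /mnt/huge mount
point and the page geometry are assumptions for the example, not part of
the patch.

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#define PUD_SPAN	(1UL << 30)	/* range covered by one pmd page on x86-64 */
#define MAP_LEN		(2 * PUD_SPAN)	/* big enough to contain an aligned window */

int main(void)
{
	/* assumed hugetlbfs mount; enough huge pages must be reserved */
	int fd = open("/mnt/huge/shared-seg", O_CREAT | O_RDWR, 0600);
	char *p = mmap(NULL, MAP_LEN, PROT_READ | PROT_WRITE,
		       MAP_SHARED, fd, 0);

	if (fd < 0 || p == MAP_FAILED)
		return 1;

	memset(p, 1, MAP_LEN);		/* parent instantiates its page tables first */

	if (fork() == 0) {
		/*
		 * The child inherits an identical vma (same start, end,
		 * flags, offset).  With this patch, the child's huge page
		 * table entries for the aligned 1GB window inside the
		 * mapping can reuse the pmd page the parent already
		 * populated, found through the file's i_mmap tree, instead
		 * of a private copy.
		 */
		memset(p, 2, MAP_LEN);
		_exit(0);
	}
	wait(NULL);
	return 0;
}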

Imprecise RSS accounting is an irritating side effect of page table
sharing.  After consulting with several VM experts, I tried various
methods to solve that problem: (1) iterate through all mm_structs that
share the page table and increment the count in each; (2) keep the RSS
count in the page table structure and sum the counts up at reporting time.
Neither method yielded a satisfactory implementation.

Since process RSS accounting is purely informational, I propose we do not
count hugetlb pages at all.  rlimit has such a field, though there is
absolutely no enforcement on limiting that resource.  Another option is to
account all RSS at hugetlb mmap time, regardless of whether the pages are
faulted or not.  I opt for the simplicity of no accounting at all.
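
As a rough way to observe the no-accounting choice, the sketch below reads
the resident-pages field of /proc/self/statm before and after faulting a
hugetlb mapping; with this patch the delta should stay near zero, because
hugetlb faults no longer bump file_rss.  The /mnt/huge path and the 2MB
huge page size are assumptions for the example.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE	(2UL * 1024 * 1024)	/* assumes 2MB huge pages */

static long rss_pages(void)
{
	long size = 0, resident = 0;
	FILE *f = fopen("/proc/self/statm", "r");

	if (f) {
		fscanf(f, "%ld %ld", &size, &resident);
		fclose(f);
	}
	return resident;	/* second field: resident set size, in pages */
}

int main(void)
{
	int fd = open("/mnt/huge/rss-test", O_CREAT | O_RDWR, 0600);
	char *p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
		       MAP_SHARED, fd, 0);
	long before;

	if (fd < 0 || p == MAP_FAILED)
		return 1;

	before = rss_pages();
	memset(p, 0, HPAGE_SIZE);	/* fault in one huge page */
	printf("rss delta after hugetlb fault: %ld pages\n",
	       rss_pages() - before);
	return 0;
}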

Signed-off-by: Ken Chen <kenneth.w.chen@xxxxxxxxx>
Cc: Dave McCracken <dmccr@xxxxxxxxxx>
Cc: Hugh Dickins <hugh@xxxxxxxxxxx>
Cc: Nick Piggin <nickpiggin@xxxxxxxxxxxx>
Cc: Adam Litke <agl@xxxxxxxxxx>
Cc: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx>
Cc: William Lee Irwin III <wli@xxxxxxxxxxxxxx>
Cc: Andi Kleen <ak@xxxxxx>
DESC
shared-page-table-for-hugetlb-page-v2-tidy
EDESC

Cc: Ken Chen <kenneth.w.chen@xxxxxxxxx>
Cc: Dave McCracken <dmccr@xxxxxxxxxx>
Cc: Hugh Dickins <hugh@xxxxxxxxxxx>
Cc: Nick Piggin <nickpiggin@xxxxxxxxxxxx>
Cc: Adam Litke <agl@xxxxxxxxxx>
Cc: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx>
Cc: William Lee Irwin III <wli@xxxxxxxxxxxxxx>
Cc: Andi Kleen <ak@xxxxxx>
DESC
shared page table for hugetlb page - v2 (comments)
EDESC

Signed-off-by: Ken Chen <kenneth.w.chen@xxxxxxxxx>
Cc: Dave McCracken <dmccr@xxxxxxxxxx>
Cc: Hugh Dickins <hugh@xxxxxxxxxxx>
Cc: Nick Piggin <nickpiggin@xxxxxxxxxxxx>
Cc: Adam Litke <agl@xxxxxxxxxx>
Cc: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx>
Cc: William Lee Irwin III <wli@xxxxxxxxxxxxxx>
Cc: Andi Kleen <ak@xxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxx>
---

 arch/i386/mm/hugetlbpage.c |   95 ++++++++++++++++++++++++++++++++++-
 mm/hugetlb.c               |   14 ++++-
 2 files changed, 105 insertions(+), 4 deletions(-)

diff -puN arch/i386/mm/hugetlbpage.c~shared-page-table-for-hugetlb-page-v2 arch/i386/mm/hugetlbpage.c
--- a/arch/i386/mm/hugetlbpage.c~shared-page-table-for-hugetlb-page-v2
+++ a/arch/i386/mm/hugetlbpage.c
@@ -17,16 +17,109 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
+static int page_table_shareable(struct vm_area_struct *svma,
+			 struct vm_area_struct *vma,
+			 unsigned long addr, unsigned long size)
+{
+	unsigned long base = addr & ~(size - 1);
+	unsigned long end = base + size;
+
+	if (base < vma->vm_start || vma->vm_end < end)
+		return 0;
+
+	if (svma->vm_flags != vma->vm_flags ||
+	    svma->vm_start != vma->vm_start ||
+	    svma->vm_end   != vma->vm_end)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * search for a shareable pmd page for hugetlb.
+ */
+static void pmd_share(struct vm_area_struct *vma, pud_t *pud,
+			unsigned long addr)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct prio_tree_iter iter;
+	struct vm_area_struct *svma;
+	pte_t *spte = NULL;
+
+	if (!(vma->vm_flags & VM_SHARED))
+		return;
+
+	spin_lock(&mapping->i_mmap_lock);
+	vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap,
+			      vma->vm_pgoff, vma->vm_pgoff) {
+		if (svma == vma ||
+		    !page_table_shareable(svma, vma, addr, PUD_SIZE))
+			continue;
+
+		spin_lock(&svma->vm_mm->page_table_lock);
+		spte = huge_pte_offset(svma->vm_mm, addr);
+		/*
+		 * if a valid hugetlb pte is found, take a reference count
+		 * on the pte page.  We can then safely populate it into
+		 * pud at a later point.
+		 */
+		if (spte)
+			get_page(virt_to_page(spte));
+		spin_unlock(&svma->vm_mm->page_table_lock);
+		if (spte)
+			break;
+	}
+	spin_unlock(&mapping->i_mmap_lock);
+
+	if (!spte)
+		return;
+
+	spin_lock(&vma->vm_mm->page_table_lock);
+	if (pud_none(*pud))
+		pud_populate(vma->vm_mm, pud, (pmd_t *)((unsigned long) spte & PAGE_MASK));
+	else
+		put_page(virt_to_page(spte));
+	spin_unlock(&vma->vm_mm->page_table_lock);
+}
+
+/*
+ * unmap huge page backed by shared pte.
+ *
+ * Hugetlb pte page is ref counted at the time of mapping.  If pte is shared
+ * indicated by page_count > 1, unmap is achieved by clearing pud and
+ * decrementing the ref count. If count == 1, the pte page is not shared.
+ *
+ * returns: 1 successfully unmapped a shared pte page
+ *	    0 the underlying pte page is not shared, or it is the last user
+ */
+int huge_pte_put(struct vm_area_struct *vma, unsigned long *addr, pte_t *ptep)
+{
+	pgd_t *pgd = pgd_offset(vma->vm_mm, *addr);
+	pud_t *pud = pud_offset(pgd, *addr);
+
+	if (page_count(virt_to_page(ptep)) <= 1)
+		return 0;
+
+	pud_clear(pud);
+	put_page(virt_to_page(ptep));
+	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+	return 1;
+}
+
 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
+	struct vm_area_struct *vma = find_vma(mm, addr);
 	pgd_t *pgd;
 	pud_t *pud;
 	pte_t *pte = NULL;
 
 	pgd = pgd_offset(mm, addr);
 	pud = pud_alloc(mm, pgd, addr);
-	if (pud)
+	if (pud) {
+		if (pud_none(*pud))
+			pmd_share(vma, pud, addr);
 		pte = (pte_t *) pmd_alloc(mm, pud, addr);
+	}
 	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
 
 	return pte;
diff -puN mm/hugetlb.c~shared-page-table-for-hugetlb-page-v2 mm/hugetlb.c
--- a/mm/hugetlb.c~shared-page-table-for-hugetlb-page-v2
+++ a/mm/hugetlb.c
@@ -344,7 +344,6 @@ int copy_hugetlb_page_range(struct mm_st
 			entry = *src_pte;
 			ptepage = pte_page(entry);
 			get_page(ptepage);
-			add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 		}
 		spin_unlock(&src->page_table_lock);
@@ -356,6 +355,12 @@ nomem:
 	return -ENOMEM;
 }
 
+__attribute__((weak))
+int huge_pte_put(struct vm_area_struct *vma, unsigned long *addr, pte_t *ptep)
+{
+	return 0;
+}
+
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 			  unsigned long end)
 {
@@ -379,13 +384,15 @@ void unmap_hugepage_range(struct vm_area
 		if (!ptep)
 			continue;
 
+		if (huge_pte_put(vma, &address, ptep))
+			continue;
+
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
 		if (pte_none(pte))
 			continue;
 
 		page = pte_page(pte);
 		put_page(page);
-		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
 	}
 
 	spin_unlock(&mm->page_table_lock);
@@ -488,7 +495,6 @@ retry:
 	if (!pte_none(*ptep))
 		goto backout;
 
-	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
 	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
 				&& (vma->vm_flags & VM_SHARED)));
 	set_huge_pte_at(mm, address, ptep, new_pte);
@@ -631,6 +637,8 @@ void hugetlb_change_protection(struct vm
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
 			continue;
+		if (huge_pte_put(vma, &address, ptep))
+			continue;
 		if (!pte_none(*ptep)) {
 			pte = huge_ptep_get_and_clear(mm, address, ptep);
 			pte = pte_mkhuge(pte_modify(pte, newprot));
_

Patches currently in -mm which might be from kenneth.w.chen@xxxxxxxxx are

shared-page-table-for-hugetlb-page-v2.patch
clean-up-unused-kiocb-variables.patch

