[PATCH -mm] mincore: apply page table walker on do_mincore() (Re: [PATCH 00/10] mm: pagewalk: huge page cleanups and VMA passing)

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, Jun 02, 2014 at 02:53:34PM -0700, Dave Hansen wrote:
> On 06/02/2014 02:52 PM, Naoya Horiguchi wrote:
> > What version is this patchset based on?
> > Recently I comprehensively rewrote page table walker (from the same motivation
> > as yours) and the patchset is now in linux-mm. I guess most of your patchset
> > (I've not read them yet) conflict with this patchset.
> > So could you take a look on it?
> 
> It's on top of a version of Linus's from the last week.  I'll take a
> look at how it sits on top of -mm.

I've looked over your series, but unfortunately most of works (patch 1, 2, 3,
5, 6, 7) were already done or we don't have to do it in current linux-mm code.
Problems you try to handle in these patches come from current code's poor
design of page table walker, worst thing is that we do vma-loop inside pgd loop.
I know you tried harder to make things better, but I don't think it make sense
to maintain current bad base code for long.

As for patch 4, yes, we can apply page table walker do_mincore() and I have
a patch applicable onto linux-mm code (attached). And for other potential
page walk users, I'm investigating whether we can really apply page table
walker (some caller doesn't hold mmap_sem, so can't simply apply it.)

And for patch 8, 9, and 10, I don't think it's good idea to add a new callback
which can handle both pmd and pte (because they are essentially differnt thing).
But the underneath idea of doing pmd_trans_huge_lock() in the common code in
walk_single_entry_locked() looks nice to me. So it would be great if we can do
the same thing in walk_pmd_range() (of linux-mm) to reduce code in callbacks.

Thanks,
Naoya Horiguchi
---
>From d73b28adddd43bf29222eda86a851662bee8ef12 Mon Sep 17 00:00:00 2001
Date: Tue, 3 Jun 2014 00:06:26 -0400
Subject: [PATCH -mm] mincore: apply page table walker on do_mincore()

This patch makes do_mincore() use walk_page_vma(), which reduces the lines
of code by using common page table walk code.

Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
---
 mm/mincore.c | 196 +++++++++++++++++++++--------------------------------------
 1 file changed, 70 insertions(+), 126 deletions(-)

diff --git a/mm/mincore.c b/mm/mincore.c
index 725c80961048..f292f5bea8e2 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -19,38 +19,27 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static void mincore_hugetlb_page_range(struct vm_area_struct *vma,
-				unsigned long addr, unsigned long end,
-				unsigned char *vec)
+static int mincore_hugetlb(pte_t *pte, unsigned long addr,
+			unsigned long end, struct mm_walk *walk)
 {
+	int err = 0;
 #ifdef CONFIG_HUGETLB_PAGE
-	struct hstate *h;
+	unsigned char present;
+	unsigned char *vec = walk->private;
 
-	h = hstate_vma(vma);
-	while (1) {
-		unsigned char present;
-		pte_t *ptep;
-		/*
-		 * Huge pages are always in RAM for now, but
-		 * theoretically it needs to be checked.
-		 */
-		ptep = huge_pte_offset(current->mm,
-				       addr & huge_page_mask(h));
-		present = ptep && !huge_pte_none(huge_ptep_get(ptep));
-		while (1) {
-			*vec = present;
-			vec++;
-			addr += PAGE_SIZE;
-			if (addr == end)
-				return;
-			/* check hugepage border */
-			if (!(addr & ~huge_page_mask(h)))
-				break;
-		}
-	}
+	/*
+	 * Huge pages are always in RAM for now, but
+	 * theoretically it needs to be checked.
+	 */
+	present = pte && !huge_pte_none(huge_ptep_get(pte));
+	for (; addr != end; vec++, addr += PAGE_SIZE)
+		*vec = present;
+	cond_resched();
+	walk->private += (end - addr) >> PAGE_SHIFT;
 #else
 	BUG();
 #endif
+	return err;
 }
 
 /*
@@ -94,10 +83,11 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
 	return present;
 }
 
-static void mincore_unmapped_range(struct vm_area_struct *vma,
-				unsigned long addr, unsigned long end,
-				unsigned char *vec)
+static int mincore_hole(unsigned long addr, unsigned long end,
+				struct mm_walk *walk)
 {
+	struct vm_area_struct *vma = walk->vma;
+	unsigned char *vec = walk->private;
 	unsigned long nr = (end - addr) >> PAGE_SHIFT;
 	int i;
 
@@ -111,110 +101,55 @@ static void mincore_unmapped_range(struct vm_area_struct *vma,
 		for (i = 0; i < nr; i++)
 			vec[i] = 0;
 	}
+	walk->private += nr;
+	return 0;
 }
 
-static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-			unsigned long addr, unsigned long end,
-			unsigned char *vec)
+static int mincore_pte(pte_t *pte, unsigned long addr, unsigned long end,
+			struct mm_walk *walk)
 {
-	unsigned long next;
-	spinlock_t *ptl;
-	pte_t *ptep;
-
-	ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
-	do {
-		pte_t pte = *ptep;
-		pgoff_t pgoff;
-
-		next = addr + PAGE_SIZE;
-		if (pte_none(pte))
-			mincore_unmapped_range(vma, addr, next, vec);
-		else if (pte_present(pte))
+	struct vm_area_struct *vma = walk->vma;
+	unsigned char *vec = walk->private;
+	pgoff_t pgoff;
+
+	if (pte_present(*pte))
+		*vec = 1;
+	else if (pte_file(*pte)) {
+		pgoff = pte_to_pgoff(*pte);
+		*vec = mincore_page(vma->vm_file->f_mapping, pgoff);
+	} else { /* pte is a swap entry */
+		swp_entry_t entry = pte_to_swp_entry(*pte);
+
+		if (is_migration_entry(entry)) {
+			/* migration entries are always uptodate */
 			*vec = 1;
-		else if (pte_file(pte)) {
-			pgoff = pte_to_pgoff(pte);
-			*vec = mincore_page(vma->vm_file->f_mapping, pgoff);
-		} else { /* pte is a swap entry */
-			swp_entry_t entry = pte_to_swp_entry(pte);
-
-			if (is_migration_entry(entry)) {
-				/* migration entries are always uptodate */
-				*vec = 1;
-			} else {
+		} else {
 #ifdef CONFIG_SWAP
-				pgoff = entry.val;
-				*vec = mincore_page(swap_address_space(entry),
-					pgoff);
+			pgoff = entry.val;
+			*vec = mincore_page(swap_address_space(entry),
+					    pgoff);
 #else
-				WARN_ON(1);
-				*vec = 1;
+			WARN_ON(1);
+			*vec = 1;
 #endif
-			}
 		}
-		vec++;
-	} while (ptep++, addr = next, addr != end);
-	pte_unmap_unlock(ptep - 1, ptl);
-}
-
-static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
-			unsigned long addr, unsigned long end,
-			unsigned char *vec)
-{
-	unsigned long next;
-	pmd_t *pmd;
-
-	pmd = pmd_offset(pud, addr);
-	do {
-		next = pmd_addr_end(addr, end);
-		if (pmd_trans_huge(*pmd)) {
-			if (mincore_huge_pmd(vma, pmd, addr, next, vec)) {
-				vec += (next - addr) >> PAGE_SHIFT;
-				continue;
-			}
-			/* fall through */
-		}
-		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
-			mincore_unmapped_range(vma, addr, next, vec);
-		else
-			mincore_pte_range(vma, pmd, addr, next, vec);
-		vec += (next - addr) >> PAGE_SHIFT;
-	} while (pmd++, addr = next, addr != end);
-}
-
-static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
-			unsigned long addr, unsigned long end,
-			unsigned char *vec)
-{
-	unsigned long next;
-	pud_t *pud;
-
-	pud = pud_offset(pgd, addr);
-	do {
-		next = pud_addr_end(addr, end);
-		if (pud_none_or_clear_bad(pud))
-			mincore_unmapped_range(vma, addr, next, vec);
-		else
-			mincore_pmd_range(vma, pud, addr, next, vec);
-		vec += (next - addr) >> PAGE_SHIFT;
-	} while (pud++, addr = next, addr != end);
+	}
+	walk->private++;
+	return 0;
 }
 
-static void mincore_page_range(struct vm_area_struct *vma,
-			unsigned long addr, unsigned long end,
-			unsigned char *vec)
+static int mincore_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
+			struct mm_walk *walk)
 {
-	unsigned long next;
-	pgd_t *pgd;
+	struct vm_area_struct *vma = walk->vma;
+	unsigned char *vec = walk->private;
 
-	pgd = pgd_offset(vma->vm_mm, addr);
-	do {
-		next = pgd_addr_end(addr, end);
-		if (pgd_none_or_clear_bad(pgd))
-			mincore_unmapped_range(vma, addr, next, vec);
-		else
-			mincore_pud_range(vma, pgd, addr, next, vec);
-		vec += (next - addr) >> PAGE_SHIFT;
-	} while (pgd++, addr = next, addr != end);
+	if (mincore_huge_pmd(vma, pmd, addr, end, vec)) {
+		walk->private += (end - addr) >> PAGE_SHIFT;
+		/* don't call micore_pte() */
+		walk->skip = 1;
+	}
+	return 0;
 }
 
 /*
@@ -226,6 +161,7 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
 {
 	struct vm_area_struct *vma;
 	unsigned long end;
+	int err;
 
 	vma = find_vma(current->mm, addr);
 	if (!vma || addr < vma->vm_start)
@@ -233,12 +169,20 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
 
 	end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
 
-	if (is_vm_hugetlb_page(vma))
-		mincore_hugetlb_page_range(vma, addr, end, vec);
+	struct mm_walk mincore_walk = {
+		.pmd_entry = mincore_pmd,
+		.pte_entry = mincore_pte,
+		.pte_hole = mincore_hole,
+		.hugetlb_entry = mincore_hugetlb,
+		.mm = vma->vm_mm,
+		.vma = vma,
+		.private = vec,
+	};
+	err = walk_page_vma(vma, &mincore_walk);
+	if (err < 0)
+		return err;
 	else
-		mincore_page_range(vma, addr, end, vec);
-
-	return (end - addr) >> PAGE_SHIFT;
+		return (end - addr) >> PAGE_SHIFT;
 }
 
 /*
-- 
1.9.3

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]