[PATCH v3 2/6] mm/hugetlb: take page table lock in follow_huge_(addr|pmd|pud)()

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



We have a race condition between move_pages() and freeing hugepages,
where move_pages() calls follow_page(FOLL_GET) for hugepages internally
and tries to get its refcount without preventing concurrent freeing.
This race crashes the kernel, so this patch fixes it by moving FOLL_GET
code for hugepages into follow_huge_pmd() with taking the page table lock.

This patch also adds the similar locking to follow_huge_(addr|pud)
for consistency.

Here is the reproducer:

  $ cat movepages.c
  #include <stdio.h>
  #include <stdlib.h>
  #include <numaif.h>

  #define ADDR_INPUT      0x700000000000UL
  #define HPS             0x200000
  #define PS              0x1000

  int main(int argc, char *argv[]) {
          int i;
          int nr_hp = strtol(argv[1], NULL, 0);
          int nr_p  = nr_hp * HPS / PS;
          int ret;
          void **addrs;
          int *status;
          int *nodes;
          pid_t pid;

          pid = strtol(argv[2], NULL, 0);
          addrs  = malloc(sizeof(char *) * nr_p + 1);
          status = malloc(sizeof(char *) * nr_p + 1);
          nodes  = malloc(sizeof(char *) * nr_p + 1);

          while (1) {
                  for (i = 0; i < nr_p; i++) {
                          addrs[i] = (void *)ADDR_INPUT + i * PS;
                          nodes[i] = 1;
                          status[i] = 0;
                  }
                  ret = numa_move_pages(pid, nr_p, addrs, nodes, status,
                                        MPOL_MF_MOVE_ALL);
                  if (ret == -1)
                          err("move_pages");

                  for (i = 0; i < nr_p; i++) {
                          addrs[i] = (void *)ADDR_INPUT + i * PS;
                          nodes[i] = 0;
                          status[i] = 0;
                  }
                  ret = numa_move_pages(pid, nr_p, addrs, nodes, status,
                                        MPOL_MF_MOVE_ALL);
                  if (ret == -1)
                          err("move_pages");
          }
          return 0;
  }

  $ cat hugepage.c
  #include <stdio.h>
  #include <sys/mman.h>
  #include <string.h>

  #define ADDR_INPUT      0x700000000000UL
  #define HPS             0x200000

  int main(int argc, char *argv[]) {
          int nr_hp = strtol(argv[1], NULL, 0);
          char *p;

          while (1) {
                  p = mmap((void *)ADDR_INPUT, nr_hp * HPS, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
                  if (p != (void *)ADDR_INPUT) {
                          perror("mmap");
                          break;
                  }
                  memset(p, 0, nr_hp * HPS);
                  munmap(p, nr_hp * HPS);
          }
  }

  $ sysctl vm.nr_hugepages=40
  $ ./hugepage 10 &
  $ ./movepages 10 $(pgrep -f hugepage)

Note for stable inclusion:
  This patch fixes e632a938d914 ("mm: migrate: add hugepage migration code
  to move_pages()"), so is applicable to -stable kernels which includes it.

ChangeLog v3:
- remove unnecessary if (page) check
- check (pmd|pud)_huge again after holding ptl
- do the same change also on follow_huge_pud()
- take page table lock also in follow_huge_addr()

ChangeLog v2:
- introduce follow_huge_pmd_lock() to do locking in arch-independent code.

Reported-by: Hugh Dickins <hughd@xxxxxxxxxx>
Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
Cc: <stable@xxxxxxxxxxxxxxx>  # [3.12+]
---
 arch/ia64/mm/hugetlbpage.c    |  9 +++++++--
 arch/metag/mm/hugetlbpage.c   |  4 ++--
 arch/powerpc/mm/hugetlbpage.c | 22 +++++++++++-----------
 include/linux/hugetlb.h       | 12 ++++++------
 mm/gup.c                      | 25 ++++---------------------
 mm/hugetlb.c                  | 43 +++++++++++++++++++++++++++++++------------
 6 files changed, 61 insertions(+), 54 deletions(-)

diff --git mmotm-2014-08-25-16-52.orig/arch/ia64/mm/hugetlbpage.c mmotm-2014-08-25-16-52/arch/ia64/mm/hugetlbpage.c
index 52b7604b5215..6170381bf074 100644
--- mmotm-2014-08-25-16-52.orig/arch/ia64/mm/hugetlbpage.c
+++ mmotm-2014-08-25-16-52/arch/ia64/mm/hugetlbpage.c
@@ -91,17 +91,22 @@ int prepare_hugepage_range(struct file *file,
 
 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int write)
 {
-	struct page *page;
+	struct page *page = NULL;
 	pte_t *ptep;
+	spinlock_t *ptl;
 
 	if (REGION_NUMBER(addr) != RGN_HPAGE)
 		return ERR_PTR(-EINVAL);
 
 	ptep = huge_pte_offset(mm, addr);
+	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
 	if (!ptep || pte_none(*ptep))
-		return NULL;
+		goto out;
+
 	page = pte_page(*ptep);
 	page += ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
+out:
+	spin_unlock(ptl);
 	return page;
 }
 int pmd_huge(pmd_t pmd)
diff --git mmotm-2014-08-25-16-52.orig/arch/metag/mm/hugetlbpage.c mmotm-2014-08-25-16-52/arch/metag/mm/hugetlbpage.c
index 745081427659..5e96ef096df9 100644
--- mmotm-2014-08-25-16-52.orig/arch/metag/mm/hugetlbpage.c
+++ mmotm-2014-08-25-16-52/arch/metag/mm/hugetlbpage.c
@@ -104,8 +104,8 @@ int pud_huge(pud_t pud)
 	return 0;
 }
 
-struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-			     pmd_t *pmd, int write)
+struct page *follow_huge_pmd(struct vm_area_struct *vma, unsigned long address,
+			     pmd_t *pmd, int flags)
 {
 	return NULL;
 }
diff --git mmotm-2014-08-25-16-52.orig/arch/powerpc/mm/hugetlbpage.c mmotm-2014-08-25-16-52/arch/powerpc/mm/hugetlbpage.c
index 9517a93a315c..1d8854a56309 100644
--- mmotm-2014-08-25-16-52.orig/arch/powerpc/mm/hugetlbpage.c
+++ mmotm-2014-08-25-16-52/arch/powerpc/mm/hugetlbpage.c
@@ -677,38 +677,38 @@ struct page *
 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 {
 	pte_t *ptep;
-	struct page *page;
+	struct page *page = ERR_PTR(-EINVAL);
 	unsigned shift;
 	unsigned long mask;
+	spinlock_t *ptl;
 	/*
 	 * Transparent hugepages are handled by generic code. We can skip them
 	 * here.
 	 */
 	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
-
+	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
 	/* Verify it is a huge page else bail. */
 	if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep))
-		return ERR_PTR(-EINVAL);
+		goto out;
 
 	mask = (1UL << shift) - 1;
-	page = pte_page(*ptep);
-	if (page)
-		page += (address & mask) / PAGE_SIZE;
-
+	page = pte_page(*ptep) + ((address & mask) >> PAGE_SHIFT);
+out:
+	spin_unlock(ptl);
 	return page;
 }
 
 struct page *
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-		pmd_t *pmd, int write)
+follow_huge_pmd(struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd, int flags)
 {
 	BUG();
 	return NULL;
 }
 
 struct page *
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
-		pmd_t *pmd, int write)
+follow_huge_pud(struct vm_area_struct *vma, unsigned long address,
+		pud_t *pud, int flags)
 {
 	BUG();
 	return NULL;
diff --git mmotm-2014-08-25-16-52.orig/include/linux/hugetlb.h mmotm-2014-08-25-16-52/include/linux/hugetlb.h
index 6e6d338641fe..b3200fce07aa 100644
--- mmotm-2014-08-25-16-52.orig/include/linux/hugetlb.h
+++ mmotm-2014-08-25-16-52/include/linux/hugetlb.h
@@ -98,10 +98,10 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr);
 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
 			      int write);
-struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-				pmd_t *pmd, int write);
-struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
-				pud_t *pud, int write);
+struct page *follow_huge_pmd(struct vm_area_struct *vma, unsigned long address,
+				pmd_t *pmd, int flags);
+struct page *follow_huge_pud(struct vm_area_struct *vma, unsigned long address,
+				pud_t *pud, int flags);
 int pmd_huge(pmd_t pmd);
 int pud_huge(pud_t pmd);
 unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
@@ -133,8 +133,8 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
 static inline void hugetlb_show_meminfo(void)
 {
 }
-#define follow_huge_pmd(mm, addr, pmd, write)	NULL
-#define follow_huge_pud(mm, addr, pud, write)	NULL
+#define follow_huge_pmd(vma, addr, pmd, flags)	NULL
+#define follow_huge_pud(vma, addr, pud, flags)	NULL
 #define prepare_hugepage_range(file, addr, len)	(-EINVAL)
 #define pmd_huge(x)	0
 #define pud_huge(x)	0
diff --git mmotm-2014-08-25-16-52.orig/mm/gup.c mmotm-2014-08-25-16-52/mm/gup.c
index 91d044b1600d..597a5e92e265 100644
--- mmotm-2014-08-25-16-52.orig/mm/gup.c
+++ mmotm-2014-08-25-16-52/mm/gup.c
@@ -162,33 +162,16 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
 	pud = pud_offset(pgd, address);
 	if (pud_none(*pud))
 		return no_page_table(vma, flags);
-	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
-		if (flags & FOLL_GET)
-			return NULL;
-		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
-		return page;
-	}
+	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB)
+		return follow_huge_pud(vma, address, pud, flags);
 	if (unlikely(pud_bad(*pud)))
 		return no_page_table(vma, flags);
 
 	pmd = pmd_offset(pud, address);
 	if (pmd_none(*pmd))
 		return no_page_table(vma, flags);
-	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
-		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
-		if (flags & FOLL_GET) {
-			/*
-			 * Refcount on tail pages are not well-defined and
-			 * shouldn't be taken. The caller should handle a NULL
-			 * return when trying to follow tail pages.
-			 */
-			if (PageHead(page))
-				get_page(page);
-			else
-				page = NULL;
-		}
-		return page;
-	}
+	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB)
+		return follow_huge_pmd(vma, address, pmd, flags);
 	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
 		return no_page_table(vma, flags);
 	if (pmd_trans_huge(*pmd)) {
diff --git mmotm-2014-08-25-16-52.orig/mm/hugetlb.c mmotm-2014-08-25-16-52/mm/hugetlb.c
index 022767506c7b..c5345c5edb50 100644
--- mmotm-2014-08-25-16-52.orig/mm/hugetlb.c
+++ mmotm-2014-08-25-16-52/mm/hugetlb.c
@@ -3667,26 +3667,45 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address,
 }
 
 struct page * __weak
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-		pmd_t *pmd, int write)
+follow_huge_pmd(struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd, int flags)
 {
-	struct page *page;
+	struct page *page = NULL;
+	spinlock_t *ptl;
 
-	page = pte_page(*(pte_t *)pmd);
-	if (page)
-		page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
+	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
+
+	if (!pmd_huge(*pmd))
+		goto out;
+
+	page = pte_page(*(pte_t *)pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
+
+	if (flags & FOLL_GET)
+		if (!get_page_unless_zero(page))
+			page = NULL;
+out:
+	spin_unlock(ptl);
 	return page;
 }
 
 struct page * __weak
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
-		pud_t *pud, int write)
+follow_huge_pud(struct vm_area_struct *vma, unsigned long address,
+		pud_t *pud, int flags)
 {
-	struct page *page;
+	struct page *page = NULL;
+	spinlock_t *ptl;
 
-	page = pte_page(*(pte_t *)pud);
-	if (page)
-		page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
+	if (flags & FOLL_GET)
+		return NULL;
+
+	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pud);
+
+	if (!pud_huge(*pud))
+		goto out;
+
+	page = pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
+out:
+	spin_unlock(ptl);
 	return page;
 }
 
-- 
1.9.3

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]