On Tue, May 28, 2013 at 03:52:50PM -0400, Naoya Horiguchi wrote:
>Currently all of the page table handling in the hugetlbfs code is done
>under mm->page_table_lock. This is not optimal because there can be lock
>contention between unrelated components using this lock.
>
>This patch makes hugepage support the split page table lock, so that
>we use page->ptl of the leaf node of the page table tree, which is the
>pte for normal pages but can be the pmd and/or pud for hugepages on
>some architectures.
>
Reviewed-by: Wanpeng Li <liwanp@xxxxxxxxxxxxxxxxxx>
>Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
>---
> arch/x86/mm/hugetlbpage.c |  6 ++--
> include/linux/hugetlb.h   | 18 ++++++++++
> mm/hugetlb.c              | 84 ++++++++++++++++++++++++++++-------------------
> 3 files changed, 73 insertions(+), 35 deletions(-)
>
>diff --git v3.10-rc3.orig/arch/x86/mm/hugetlbpage.c v3.10-rc3/arch/x86/mm/hugetlbpage.c
>index ae1aa71..0e4a396 100644
>--- v3.10-rc3.orig/arch/x86/mm/hugetlbpage.c
>+++ v3.10-rc3/arch/x86/mm/hugetlbpage.c
>@@ -75,6 +75,7 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
> 	unsigned long saddr;
> 	pte_t *spte = NULL;
> 	pte_t *pte;
>+	spinlock_t *ptl;
> 
> 	if (!vma_shareable(vma, addr))
> 		return (pte_t *)pmd_alloc(mm, pud, addr);
>@@ -89,6 +90,7 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
> 		spte = huge_pte_offset(svma->vm_mm, saddr);
> 		if (spte) {
> 			get_page(virt_to_page(spte));
>+			ptl = huge_pte_lockptr(mm, spte);
> 			break;
> 		}
> 	}
>@@ -97,12 +99,12 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
> 	if (!spte)
> 		goto out;
> 
>-	spin_lock(&mm->page_table_lock);
>+	spin_lock(ptl);
> 	if (pud_none(*pud))
> 		pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK));
> 	else
> 		put_page(virt_to_page(spte));
>-	spin_unlock(&mm->page_table_lock);
>+	spin_unlock(ptl);
> out:
> 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
> 	mutex_unlock(&mapping->i_mmap_mutex);
>diff --git v3.10-rc3.orig/include/linux/hugetlb.h v3.10-rc3/include/linux/hugetlb.h
>index a639c87..40f3215 100644
>--- v3.10-rc3.orig/include/linux/hugetlb.h
>+++ v3.10-rc3/include/linux/hugetlb.h
>@@ -32,6 +32,24 @@ void hugepage_put_subpool(struct hugepage_subpool *spool);
> 
> int PageHuge(struct page *page);
> 
>+#if USE_SPLIT_PTLOCKS
>+#define huge_pte_lockptr(mm, ptep)	({__pte_lockptr(virt_to_page(ptep)); })
>+#else	/* !USE_SPLIT_PTLOCKS */
>+#define huge_pte_lockptr(mm, ptep)	({&(mm)->page_table_lock; })
>+#endif	/* USE_SPLIT_PTLOCKS */
>+
>+#define huge_pte_offset_lock(mm, address, ptlp)		\
>+({							\
>+	pte_t *__pte = huge_pte_offset(mm, address);	\
>+	spinlock_t *__ptl = NULL;			\
>+	if (__pte) {					\
>+		__ptl = huge_pte_lockptr(mm, __pte);	\
>+		*(ptlp) = __ptl;			\
>+		spin_lock(__ptl);			\
>+	}						\
>+	__pte;						\
>+})
>+
> void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
> int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
> int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
>diff --git v3.10-rc3.orig/mm/hugetlb.c v3.10-rc3/mm/hugetlb.c
>index 463fb5e..8e1af32 100644
>--- v3.10-rc3.orig/mm/hugetlb.c
>+++ v3.10-rc3/mm/hugetlb.c
>@@ -2325,6 +2325,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> 	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
> 
> 	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
>+		spinlock_t *srcptl, *dstptl;
> 		src_pte = huge_pte_offset(src, addr);
> 		if (!src_pte)
> 			continue;
>@@ -2336,8 +2337,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> 		if (dst_pte == src_pte)
> 			continue;
> 
>-		spin_lock(&dst->page_table_lock);
>-		spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
>+		dstptl = huge_pte_lockptr(dst, dst_pte);
>+		srcptl = huge_pte_lockptr(src, src_pte);
>+		spin_lock(dstptl);
>+		spin_lock_nested(srcptl, SINGLE_DEPTH_NESTING);
> 		if (!huge_pte_none(huge_ptep_get(src_pte))) {
> 			if (cow)
> 				huge_ptep_set_wrprotect(src, addr, src_pte);
>@@ -2347,8 +2350,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> 			page_dup_rmap(ptepage);
> 			set_huge_pte_at(dst, addr, dst_pte, entry);
> 		}
>-		spin_unlock(&src->page_table_lock);
>-		spin_unlock(&dst->page_table_lock);
>+		spin_unlock(srcptl);
>+		spin_unlock(dstptl);
> 	}
> 	return 0;
> 
>@@ -2391,6 +2394,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
> 	unsigned long address;
> 	pte_t *ptep;
> 	pte_t pte;
>+	spinlock_t *ptl;
> 	struct page *page;
> 	struct hstate *h = hstate_vma(vma);
> 	unsigned long sz = huge_page_size(h);
>@@ -2404,25 +2408,24 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
> 	tlb_start_vma(tlb, vma);
> 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
> again:
>-	spin_lock(&mm->page_table_lock);
> 	for (address = start; address < end; address += sz) {
>-		ptep = huge_pte_offset(mm, address);
>+		ptep = huge_pte_offset_lock(mm, address, &ptl);
> 		if (!ptep)
> 			continue;
> 
> 		if (huge_pmd_unshare(mm, &address, ptep))
>-			continue;
>+			goto unlock;
> 
> 		pte = huge_ptep_get(ptep);
> 		if (huge_pte_none(pte))
>-			continue;
>+			goto unlock;
> 
> 		/*
> 		 * HWPoisoned hugepage is already unmapped and dropped reference
> 		 */
> 		if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
> 			huge_pte_clear(mm, address, ptep);
>-			continue;
>+			goto unlock;
> 		}
> 
> 		page = pte_page(pte);
>@@ -2433,7 +2436,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
> 		 */
> 		if (ref_page) {
> 			if (page != ref_page)
>-				continue;
>+				goto unlock;
> 
> 			/*
> 			 * Mark the VMA as having unmapped its page so that
>@@ -2450,13 +2453,18 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
> 
> 		page_remove_rmap(page);
> 		force_flush = !__tlb_remove_page(tlb, page);
>-		if (force_flush)
>+		if (force_flush) {
>+			spin_unlock(ptl);
> 			break;
>+		}
> 		/* Bail out after unmapping reference page if supplied */
>-		if (ref_page)
>+		if (ref_page) {
>+			spin_unlock(ptl);
> 			break;
>+		}
>+unlock:
>+		spin_unlock(ptl);
> 	}
>-	spin_unlock(&mm->page_table_lock);
> 	/*
> 	 * mmu_gather ran out of room to batch pages, we break out of
> 	 * the PTE lock to avoid doing the potential expensive TLB invalidate
>@@ -2570,6 +2578,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> 	int outside_reserve = 0;
> 	unsigned long mmun_start;	/* For mmu_notifiers */
> 	unsigned long mmun_end;		/* For mmu_notifiers */
>+	spinlock_t *ptl = huge_pte_lockptr(mm, ptep);
> 
> 	old_page = pte_page(pte);
> 
>@@ -2601,7 +2610,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> 	page_cache_get(old_page);
> 
> 	/* Drop page_table_lock as buddy allocator may be called */
>-	spin_unlock(&mm->page_table_lock);
>+	spin_unlock(ptl);
> 	new_page = alloc_huge_page(vma, address, outside_reserve);
> 
> 	if (IS_ERR(new_page)) {
>@@ -2619,7 +2628,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> 			BUG_ON(huge_pte_none(pte));
> 			if (unmap_ref_private(mm, vma, old_page, address)) {
> 				BUG_ON(huge_pte_none(pte));
>-				spin_lock(&mm->page_table_lock);
>+				spin_lock(ptl);
> 				ptep = huge_pte_offset(mm, address & huge_page_mask(h));
> 				if (likely(pte_same(huge_ptep_get(ptep), pte)))
> 					goto retry_avoidcopy;
>@@ -2633,7 +2642,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> 		}
> 
> 		/* Caller expects lock to be held */
>-		spin_lock(&mm->page_table_lock);
>+		spin_lock(ptl);
> 		if (err == -ENOMEM)
> 			return VM_FAULT_OOM;
> 		else
>@@ -2648,7 +2657,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> 		page_cache_release(new_page);
> 		page_cache_release(old_page);
> 		/* Caller expects lock to be held */
>-		spin_lock(&mm->page_table_lock);
>+		spin_lock(ptl);
> 		return VM_FAULT_OOM;
> 	}
> 
>@@ -2663,7 +2672,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> 	 * Retake the page_table_lock to check for racing updates
> 	 * before the page tables are altered
> 	 */
>-	spin_lock(&mm->page_table_lock);
>+	spin_lock(ptl);
> 	ptep = huge_pte_offset(mm, address & huge_page_mask(h));
> 	if (likely(pte_same(huge_ptep_get(ptep), pte))) {
> 		/* Break COW */
>@@ -2675,10 +2684,10 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> 		/* Make the old page be freed below */
> 		new_page = old_page;
> 	}
>-	spin_unlock(&mm->page_table_lock);
>+	spin_unlock(ptl);
> 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
> 	/* Caller expects lock to be held */
>-	spin_lock(&mm->page_table_lock);
>+	spin_lock(ptl);
> 	page_cache_release(new_page);
> 	page_cache_release(old_page);
> 	return 0;
>@@ -2728,6 +2737,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
> 	struct page *page;
> 	struct address_space *mapping;
> 	pte_t new_pte;
>+	spinlock_t *ptl;
> 
> 	/*
> 	 * Currently, we are forced to kill the process in the event the
>@@ -2813,7 +2823,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
> 		goto backout_unlocked;
> 	}
> 
>-	spin_lock(&mm->page_table_lock);
>+	ptl = huge_pte_lockptr(mm, ptep);
>+	spin_lock(ptl);
> 	size = i_size_read(mapping->host) >> huge_page_shift(h);
> 	if (idx >= size)
> 		goto backout;
>@@ -2835,13 +2846,13 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
> 		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
> 	}
> 
>-	spin_unlock(&mm->page_table_lock);
>+	spin_unlock(ptl);
> 	unlock_page(page);
> out:
> 	return ret;
> 
> backout:
>-	spin_unlock(&mm->page_table_lock);
>+	spin_unlock(ptl);
> backout_unlocked:
> 	unlock_page(page);
> 	put_page(page);
>@@ -2853,6 +2864,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> {
> 	pte_t *ptep;
> 	pte_t entry;
>+	spinlock_t *ptl;
> 	int ret;
> 	struct page *page = NULL;
> 	struct page *pagecache_page = NULL;
>@@ -2921,7 +2933,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> 	if (page != pagecache_page)
> 		lock_page(page);
> 
>-	spin_lock(&mm->page_table_lock);
>+	ptl = huge_pte_lockptr(mm, ptep);
>+	spin_lock(ptl);
> 	/* Check for a racing update before calling hugetlb_cow */
> 	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
> 		goto out_page_table_lock;
>@@ -2941,7 +2954,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> 		update_mmu_cache(vma, address, ptep);
> 
> out_page_table_lock:
>-	spin_unlock(&mm->page_table_lock);
>+	spin_unlock(ptl);
> 
> 	if (pagecache_page) {
> 		unlock_page(pagecache_page);
>@@ -2976,9 +2989,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
> 	unsigned long remainder = *nr_pages;
> 	struct hstate *h = hstate_vma(vma);
> 
>-	spin_lock(&mm->page_table_lock);
> 	while (vaddr < vma->vm_end && remainder) {
> 		pte_t *pte;
>+		spinlock_t *ptl = NULL;
> 		int absent;
> 		struct page *page;
> 
>@@ -2986,8 +2999,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
> 		 * Some archs (sparc64, sh*) have multiple pte_ts to
> 		 * each hugepage. We have to make sure we get the
> 		 * first, for the page indexing below to work.
>+		 *
>+		 * Note that page table lock is not held when pte is null.
> 		 */
>-		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
>+		pte = huge_pte_offset_lock(mm, vaddr & huge_page_mask(h), &ptl);
> 		absent = !pte || huge_pte_none(huge_ptep_get(pte));
> 
> 		/*
>@@ -2999,6 +3014,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
> 		 */
> 		if (absent && (flags & FOLL_DUMP) &&
> 		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
>+			if (pte)
>+				spin_unlock(ptl);
> 			remainder = 0;
> 			break;
> 		}
>@@ -3018,10 +3035,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
> 		      !huge_pte_write(huge_ptep_get(pte)))) {
> 			int ret;
> 
>-			spin_unlock(&mm->page_table_lock);
>+			if (pte)
>+				spin_unlock(ptl);
> 			ret = hugetlb_fault(mm, vma, vaddr,
> 				(flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
>-			spin_lock(&mm->page_table_lock);
> 			if (!(ret & VM_FAULT_ERROR))
> 				continue;
> 
>@@ -3052,8 +3069,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
> 			 */
> 			goto same_page;
> 		}
>+		spin_unlock(ptl);
> 	}
>-	spin_unlock(&mm->page_table_lock);
> 	*nr_pages = remainder;
> 	*position = vaddr;
> 
>@@ -3074,13 +3091,14 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
> 	flush_cache_range(vma, address, end);
> 
> 	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
>-	spin_lock(&mm->page_table_lock);
> 	for (; address < end; address += huge_page_size(h)) {
>-		ptep = huge_pte_offset(mm, address);
>+		spinlock_t *ptl;
>+		ptep = huge_pte_offset_lock(mm, address, &ptl);
> 		if (!ptep)
> 			continue;
> 		if (huge_pmd_unshare(mm, &address, ptep)) {
> 			pages++;
>+			spin_unlock(ptl);
> 			continue;
> 		}
> 		if (!huge_pte_none(huge_ptep_get(ptep))) {
>@@ -3090,8 +3108,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
> 			set_huge_pte_at(mm, address, ptep, pte);
> 			pages++;
> 		}
>+		spin_unlock(ptl);
> 	}
>-	spin_unlock(&mm->page_table_lock);
> 	/*
> 	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
> 	 * may have cleared our pud entry and done put_page on the page table:
>-- 
>1.7.11.7
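
For other reviewers, the calling convention the new helpers give us ends up
looking roughly like the sketch below. This is not part of the patch;
count_present_hugepages() is a made-up example, modeled on the loops in
__unmap_hugepage_range() and hugetlb_change_protection() above.
huge_pte_offset_lock() returns with the pte lock held whenever it finds a
pte, and the caller drops it with spin_unlock(); with !USE_SPLIT_PTLOCKS
the same calls simply fall back to &mm->page_table_lock.

#include <linux/mm.h>
#include <linux/hugetlb.h>

/*
 * Illustrative only, not in the tree: walk a hugetlb vma and count the
 * populated hugepages, taking the per-leaf page table lock for each
 * entry instead of mm->page_table_lock.
 */
static unsigned long count_present_hugepages(struct vm_area_struct *vma)
{
	struct mm_struct *mm = vma->vm_mm;
	struct hstate *h = hstate_vma(vma);
	unsigned long sz = huge_page_size(h);
	unsigned long addr, present = 0;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
		spinlock_t *ptl;
		/* returns with *ptl held iff a pte was found */
		pte_t *ptep = huge_pte_offset_lock(mm, addr, &ptl);

		if (!ptep)
			continue;
		if (!huge_pte_none(huge_ptep_get(ptep)))
			present++;
		spin_unlock(ptl);
	}
	return present;
}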