Currently, all page table handling by the hugetlbfs code is done under
mm->page_table_lock. This is not optimal because it can cause lock
contention between unrelated components using this lock. This patch makes
hugepage support the split page table lock, so that we use the page->ptl
of the leaf node of the page table tree, which is the pte for normal pages
but can be the pmd and/or pud for hugepages on some architectures.

Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
---
 arch/x86/mm/hugetlbpage.c |  6 ++--
 include/linux/hugetlb.h   | 18 ++++++++++
 mm/hugetlb.c              | 84 ++++++++++++++++++++++++++++-------------------
 3 files changed, 73 insertions(+), 35 deletions(-)

diff --git v3.10-rc3.orig/arch/x86/mm/hugetlbpage.c v3.10-rc3/arch/x86/mm/hugetlbpage.c
index ae1aa71..0e4a396 100644
--- v3.10-rc3.orig/arch/x86/mm/hugetlbpage.c
+++ v3.10-rc3/arch/x86/mm/hugetlbpage.c
@@ -75,6 +75,7 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	unsigned long saddr;
 	pte_t *spte = NULL;
 	pte_t *pte;
+	spinlock_t *ptl;

 	if (!vma_shareable(vma, addr))
 		return (pte_t *)pmd_alloc(mm, pud, addr);
@@ -89,6 +90,7 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 			spte = huge_pte_offset(svma->vm_mm, saddr);
 			if (spte) {
 				get_page(virt_to_page(spte));
+				ptl = huge_pte_lockptr(mm, spte);
 				break;
 			}
 		}
@@ -97,12 +99,12 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	if (!spte)
 		goto out;

-	spin_lock(&mm->page_table_lock);
+	spin_lock(ptl);
 	if (pud_none(*pud))
 		pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK));
 	else
 		put_page(virt_to_page(spte));
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 out:
 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
 	mutex_unlock(&mapping->i_mmap_mutex);
diff --git v3.10-rc3.orig/include/linux/hugetlb.h v3.10-rc3/include/linux/hugetlb.h
index a639c87..40f3215 100644
--- v3.10-rc3.orig/include/linux/hugetlb.h
+++ v3.10-rc3/include/linux/hugetlb.h
@@ -32,6 +32,24 @@ void hugepage_put_subpool(struct hugepage_subpool *spool);

 int PageHuge(struct page *page);

+#if USE_SPLIT_PTLOCKS
+#define huge_pte_lockptr(mm, ptep)	({__pte_lockptr(virt_to_page(ptep)); })
+#else	/* !USE_SPLIT_PTLOCKS */
+#define huge_pte_lockptr(mm, ptep)	({&(mm)->page_table_lock; })
+#endif	/* USE_SPLIT_PTLOCKS */
+
+#define huge_pte_offset_lock(mm, address, ptlp)		\
+({								\
+	pte_t *__pte = huge_pte_offset(mm, address);		\
+	spinlock_t *__ptl = NULL;				\
+	if (__pte) {						\
+		__ptl = huge_pte_lockptr(mm, __pte);		\
+		*(ptlp) = __ptl;				\
+		spin_lock(__ptl);				\
+	}							\
+	__pte;							\
+})
+
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
 int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
 int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
diff --git v3.10-rc3.orig/mm/hugetlb.c v3.10-rc3/mm/hugetlb.c
index 463fb5e..8e1af32 100644
--- v3.10-rc3.orig/mm/hugetlb.c
+++ v3.10-rc3/mm/hugetlb.c
@@ -2325,6 +2325,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

 	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
+		spinlock_t *srcptl, *dstptl;
 		src_pte = huge_pte_offset(src, addr);
 		if (!src_pte)
 			continue;
@@ -2336,8 +2337,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		if (dst_pte == src_pte)
 			continue;

-		spin_lock(&dst->page_table_lock);
-		spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
+		dstptl = huge_pte_lockptr(dst, dst_pte);
+		srcptl = huge_pte_lockptr(src, src_pte);
+		spin_lock(dstptl);
+		spin_lock_nested(srcptl, SINGLE_DEPTH_NESTING);
 		if (!huge_pte_none(huge_ptep_get(src_pte))) {
 			if (cow)
 				huge_ptep_set_wrprotect(src, addr, src_pte);
@@ -2347,8 +2350,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			page_dup_rmap(ptepage);
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 		}
-		spin_unlock(&src->page_table_lock);
-		spin_unlock(&dst->page_table_lock);
+		spin_unlock(srcptl);
+		spin_unlock(dstptl);
 	}

 	return 0;
@@ -2391,6 +2394,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	unsigned long address;
 	pte_t *ptep;
 	pte_t pte;
+	spinlock_t *ptl;
 	struct page *page;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
@@ -2404,25 +2408,24 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	tlb_start_vma(tlb, vma);
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 again:
-	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += sz) {
-		ptep = huge_pte_offset(mm, address);
+		ptep = huge_pte_offset_lock(mm, address, &ptl);
 		if (!ptep)
 			continue;

 		if (huge_pmd_unshare(mm, &address, ptep))
-			continue;
+			goto unlock;

 		pte = huge_ptep_get(ptep);
 		if (huge_pte_none(pte))
-			continue;
+			goto unlock;

 		/*
 		 * HWPoisoned hugepage is already unmapped and dropped reference
 		 */
 		if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
 			huge_pte_clear(mm, address, ptep);
-			continue;
+			goto unlock;
 		}

 		page = pte_page(pte);
@@ -2433,7 +2436,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		 */
 		if (ref_page) {
 			if (page != ref_page)
-				continue;
+				goto unlock;

 			/*
 			 * Mark the VMA as having unmapped its page so that
@@ -2450,13 +2453,18 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		page_remove_rmap(page);
 		force_flush = !__tlb_remove_page(tlb, page);
-		if (force_flush)
+		if (force_flush) {
+			spin_unlock(ptl);
 			break;
+		}
 		/* Bail out after unmapping reference page if supplied */
-		if (ref_page)
+		if (ref_page) {
+			spin_unlock(ptl);
 			break;
+		}
+unlock:
+		spin_unlock(ptl);
 	}
-	spin_unlock(&mm->page_table_lock);
 	/*
 	 * mmu_gather ran out of room to batch pages, we break out of
 	 * the PTE lock to avoid doing the potential expensive TLB invalidate
@@ -2570,6 +2578,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	int outside_reserve = 0;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;	/* For mmu_notifiers */
+	spinlock_t *ptl = huge_pte_lockptr(mm, ptep);

 	old_page = pte_page(pte);

@@ -2601,7 +2610,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	page_cache_get(old_page);

 	/* Drop page_table_lock as buddy allocator may be called */
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 	new_page = alloc_huge_page(vma, address, outside_reserve);

 	if (IS_ERR(new_page)) {
@@ -2619,7 +2628,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 			BUG_ON(huge_pte_none(pte));
 			if (unmap_ref_private(mm, vma, old_page, address)) {
 				BUG_ON(huge_pte_none(pte));
-				spin_lock(&mm->page_table_lock);
+				spin_lock(ptl);
 				ptep = huge_pte_offset(mm, address & huge_page_mask(h));
 				if (likely(pte_same(huge_ptep_get(ptep), pte)))
 					goto retry_avoidcopy;
@@ -2633,7 +2642,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 		}

 		/* Caller expects lock to be held */
-		spin_lock(&mm->page_table_lock);
+		spin_lock(ptl);
 		if (err == -ENOMEM)
 			return VM_FAULT_OOM;
 		else
@@ -2648,7 +2657,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 		page_cache_release(new_page);
 		page_cache_release(old_page);

 		/* Caller expects lock to be held */
-		spin_lock(&mm->page_table_lock);
+		spin_lock(ptl);
 		return VM_FAULT_OOM;
 	}
@@ -2663,7 +2672,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * Retake the page_table_lock to check for racing updates
 	 * before the page tables are altered
 	 */
-	spin_lock(&mm->page_table_lock);
+	spin_lock(ptl);
 	ptep = huge_pte_offset(mm, address & huge_page_mask(h));
 	if (likely(pte_same(huge_ptep_get(ptep), pte))) {
 		/* Break COW */
@@ -2675,10 +2684,10 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* Make the old page be freed below */
 		new_page = old_page;
 	}
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	/* Caller expects lock to be held */
-	spin_lock(&mm->page_table_lock);
+	spin_lock(ptl);
 	page_cache_release(new_page);
 	page_cache_release(old_page);
 	return 0;
@@ -2728,6 +2737,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *page;
 	struct address_space *mapping;
 	pte_t new_pte;
+	spinlock_t *ptl;

 	/*
 	 * Currently, we are forced to kill the process in the event the
@@ -2813,7 +2823,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			goto backout_unlocked;
 	}

-	spin_lock(&mm->page_table_lock);
+	ptl = huge_pte_lockptr(mm, ptep);
+	spin_lock(ptl);
 	size = i_size_read(mapping->host) >> huge_page_shift(h);
 	if (idx >= size)
 		goto backout;
@@ -2835,13 +2846,13 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
 	}

-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 	unlock_page(page);
 out:
 	return ret;

 backout:
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 backout_unlocked:
 	unlock_page(page);
 	put_page(page);
@@ -2853,6 +2864,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	pte_t *ptep;
 	pte_t entry;
+	spinlock_t *ptl;
 	int ret;
 	struct page *page = NULL;
 	struct page *pagecache_page = NULL;
@@ -2921,7 +2933,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (page != pagecache_page)
 		lock_page(page);

-	spin_lock(&mm->page_table_lock);
+	ptl = huge_pte_lockptr(mm, ptep);
+	spin_lock(ptl);
 	/* Check for a racing update before calling hugetlb_cow */
 	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
 		goto out_page_table_lock;
@@ -2941,7 +2954,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		update_mmu_cache(vma, address, ptep);

 out_page_table_lock:
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);

 	if (pagecache_page) {
 		unlock_page(pagecache_page);
@@ -2976,9 +2989,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long remainder = *nr_pages;
 	struct hstate *h = hstate_vma(vma);

-	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;
+		spinlock_t *ptl = NULL;
 		int absent;
 		struct page *page;

@@ -2986,8 +2999,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * Some archs (sparc64, sh*) have multiple pte_ts to
 		 * each hugepage. We have to make sure we get the
 		 * first, for the page indexing below to work.
+		 *
+		 * Note that page table lock is not held when pte is null.
 		 */
-		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
+		pte = huge_pte_offset_lock(mm, vaddr & huge_page_mask(h), &ptl);
 		absent = !pte || huge_pte_none(huge_ptep_get(pte));

 		/*
@@ -2999,6 +3014,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 */
 		if (absent && (flags & FOLL_DUMP) &&
 		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
+			if (pte)
+				spin_unlock(ptl);
 			remainder = 0;
 			break;
 		}
@@ -3018,10 +3035,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		     !huge_pte_write(huge_ptep_get(pte)))) {
 			int ret;

-			spin_unlock(&mm->page_table_lock);
+			if (pte)
+				spin_unlock(ptl);
 			ret = hugetlb_fault(mm, vma, vaddr,
 				(flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
-			spin_lock(&mm->page_table_lock);

 			if (!(ret & VM_FAULT_ERROR))
 				continue;
@@ -3052,8 +3069,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			 */
 			goto same_page;
 		}
+		spin_unlock(ptl);
 	}
-	spin_unlock(&mm->page_table_lock);
 	*nr_pages = remainder;
 	*position = vaddr;

@@ -3074,13 +3091,14 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	flush_cache_range(vma, address, end);

 	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
-	spin_lock(&mm->page_table_lock);
 	for (; address < end; address += huge_page_size(h)) {
-		ptep = huge_pte_offset(mm, address);
+		spinlock_t *ptl;
+		ptep = huge_pte_offset_lock(mm, address, &ptl);
 		if (!ptep)
 			continue;
 		if (huge_pmd_unshare(mm, &address, ptep)) {
 			pages++;
+			spin_unlock(ptl);
 			continue;
 		}
 		if (!huge_pte_none(huge_ptep_get(ptep))) {
@@ -3090,8 +3108,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 			set_huge_pte_at(mm, address, ptep, pte);
 			pages++;
 		}
+		spin_unlock(ptl);
 	}
-	spin_unlock(&mm->page_table_lock);
 	/*
 	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
 	 * may have cleared our pud entry and done put_page on the page table:
-- 
1.7.11.7
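
For reference, here is a minimal sketch of the locking pattern the patch
converts callers to. The function name and body below are illustrative
only and are not part of the patch; just huge_pte_offset_lock(),
huge_pte_lockptr() and huge_ptep_get() are the real interfaces used above:

/*
 * Illustrative only: look up a hugepage pte and read it under its page
 * table lock. With USE_SPLIT_PTLOCKS, huge_pte_lockptr() resolves to the
 * per-page ptl of the page table page holding the entry; otherwise it
 * falls back to mm->page_table_lock, as defined in linux/hugetlb.h above.
 */
static pte_t get_huge_pte_locked(struct mm_struct *mm, unsigned long addr)
{
	spinlock_t *ptl;
	pte_t entry = __pte(0);
	pte_t *ptep;

	/* Returns NULL without taking any lock when no pte is mapped. */
	ptep = huge_pte_offset_lock(mm, addr, &ptl);
	if (!ptep)
		return entry;

	entry = huge_ptep_get(ptep);	/* read under the per-page ptl */
	spin_unlock(ptl);		/* pairs with the lock taken above */
	return entry;
}

The same lookup-lock-unlock shape is what __unmap_hugepage_range() and
hugetlb_change_protection() use in the hunks above.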