On Fri, Aug 09, 2013 at 06:26:37PM +0900, Joonsoo Kim wrote:
> If parallel fault occur, we can fail to allocate a hugepage,
> because many threads dequeue a hugepage to handle a fault of same address.
> This makes reserved pool shortage just for a little while and this cause
> faulting thread who can get hugepages to get a SIGBUS signal.
>
> To solve this problem, we already have a nice solution, that is,
> a hugetlb_instantiation_mutex. This blocks other threads to dive into
> a fault handler. This solve the problem clearly, but it introduce
> performance degradation, because it serialize all fault handling.
>
> Now, I try to remove a hugetlb_instantiation_mutex to get rid of
> performance degradation. For achieving it, at first, we should ensure that
> no one get a SIGBUS if there are enough hugepages.
>
> For this purpose, if we fail to allocate a new hugepage when there is
> concurrent user, we return just 0, instead of VM_FAULT_SIGBUS. With this,
> these threads defer to get a SIGBUS signal until there is no
> concurrent user, and so, we can ensure that no one get a SIGBUS if there
> are enough hugepages.
>
> Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@xxxxxxx>
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index e29e28f..981c539 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -242,6 +242,7 @@ struct hstate {
>  	int next_nid_to_free;
>  	unsigned int order;
>  	unsigned long mask;
> +	unsigned long nr_dequeue_users;
>  	unsigned long max_huge_pages;
>  	unsigned long nr_huge_pages;
>  	unsigned long free_huge_pages;
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 8743e5c..0501fe5 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -561,6 +561,7 @@ retry_cpuset:
>  		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
>  			page = dequeue_huge_page_node(h, zone_to_nid(zone));
>  			if (page) {
> +				h->nr_dequeue_users++;

So, nr_dequeue_users doesn't seem to be incremented in the
alloc_huge_page_node() path.  I'm not sure exactly where that's used,
so I'm not sure if it's a problem.

>  				if (!use_reserve)
>  					break;
>
> @@ -577,6 +578,16 @@ retry_cpuset:
>  	return page;
>  }
>
> +static void commit_dequeued_huge_page(struct hstate *h, bool do_dequeue)
> +{
> +	if (!do_dequeue)
> +		return;

Seems like it would be easier to do this test in the callers, but I
doubt it matters much.

> +	spin_lock(&hugetlb_lock);
> +	h->nr_dequeue_users--;
> +	spin_unlock(&hugetlb_lock);
> +}
> +
>  static void update_and_free_page(struct hstate *h, struct page *page)
>  {
>  	int i;
> @@ -1110,7 +1121,9 @@ static void vma_commit_reservation(struct hstate *h,
>  }
>
>  static struct page *alloc_huge_page(struct vm_area_struct *vma,
> -				    unsigned long addr, int use_reserve)
> +				    unsigned long addr, int use_reserve,
> +				    unsigned long *nr_dequeue_users,
> +				    bool *do_dequeue)
>  {
>  	struct hugepage_subpool *spool = subpool_vma(vma);
>  	struct hstate *h = hstate_vma(vma);
> @@ -1138,8 +1151,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
>  		return ERR_PTR(-ENOSPC);
>  	}
>  	spin_lock(&hugetlb_lock);
> +	*do_dequeue = true;
>  	page = dequeue_huge_page_vma(h, vma, addr, use_reserve);
>  	if (!page) {
> +		*nr_dequeue_users = h->nr_dequeue_users;

So, the nr_dequeue_users parameter is only initialized if !page here.
It's not obvious to me that the callers only use it in that case.
> +		*do_dequeue = false;
>  		spin_unlock(&hugetlb_lock);
>  		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
>  		if (!page) {

I think the counter also needs to be incremented in the case where we
call alloc_buddy_huge_page() from alloc_huge_page().  Even though it's
new, it gets added to the hugepage pool at this point and could still
be a contended page for the last allocation, unless I'm missing
something.

> @@ -1894,6 +1910,7 @@ void __init hugetlb_add_hstate(unsigned order)
>  	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
>  	h->nr_huge_pages = 0;
>  	h->free_huge_pages = 0;
> +	h->nr_dequeue_users = 0;
>  	for (i = 0; i < MAX_NUMNODES; ++i)
>  		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
>  	INIT_LIST_HEAD(&h->hugepage_activelist);
> @@ -2500,6 +2517,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
>  	int outside_reserve = 0;
>  	long chg;
>  	bool use_reserve = false;
> +	unsigned long nr_dequeue_users = 0;
> +	bool do_dequeue = false;
>  	int ret = 0;
>  	unsigned long mmun_start;	/* For mmu_notifiers */
>  	unsigned long mmun_end;		/* For mmu_notifiers */
> @@ -2551,11 +2570,17 @@ retry_avoidcopy:
>  		use_reserve = !chg;
>  	}
>
> -	new_page = alloc_huge_page(vma, address, use_reserve);
> +	new_page = alloc_huge_page(vma, address, use_reserve,
> +				   &nr_dequeue_users, &do_dequeue);
>
>  	if (IS_ERR(new_page)) {
>  		page_cache_release(old_page);
>
> +		if (nr_dequeue_users) {
> +			ret = 0;
> +			goto out_lock;
> +		}
> +
>  		/*
>  		 * If a process owning a MAP_PRIVATE mapping fails to COW,
>  		 * it is due to references held by a child and an insufficient
> @@ -2580,6 +2605,9 @@ retry_avoidcopy:
>  			WARN_ON_ONCE(1);
>  		}
>
> +		if (use_reserve)
> +			WARN_ON_ONCE(1);
> +
>  		ret = VM_FAULT_SIGBUS;
>  		goto out_lock;
>  	}
> @@ -2614,6 +2642,7 @@ retry_avoidcopy:
>  	page_cache_release(new_page);
>  out_old_page:
>  	page_cache_release(old_page);
> +	commit_dequeued_huge_page(h, do_dequeue);
>  out_lock:
>  	/* Caller expects lock to be held */
>  	spin_lock(&mm->page_table_lock);
> @@ -2666,6 +2695,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  	pte_t new_pte;
>  	long chg;
>  	bool use_reserve;
> +	unsigned long nr_dequeue_users = 0;
> +	bool do_dequeue = false;
>
>  	/*
>  	 * Currently, we are forced to kill the process in the event the
> @@ -2699,9 +2730,17 @@ retry:
>  		}
>  		use_reserve = !chg;
>
> -		page = alloc_huge_page(vma, address, use_reserve);
> +		page = alloc_huge_page(vma, address, use_reserve,
> +					&nr_dequeue_users, &do_dequeue);
>  		if (IS_ERR(page)) {
> -			ret = VM_FAULT_SIGBUS;
> +			if (nr_dequeue_users)
> +				ret = 0;
> +			else {
> +				if (use_reserve)
> +					WARN_ON_ONCE(1);
> +
> +				ret = VM_FAULT_SIGBUS;
> +			}
>  			goto out;
>  		}
>  		clear_huge_page(page, address, pages_per_huge_page(h));
> @@ -2714,22 +2753,24 @@ retry:
>  			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
>  			if (err) {
>  				put_page(page);
> +				commit_dequeued_huge_page(h, do_dequeue);
>  				if (err == -EEXIST)
>  					goto retry;
>  				goto out;
>  			}
>  			ClearPagePrivate(page);
> +			commit_dequeued_huge_page(h, do_dequeue);
>
>  			spin_lock(&inode->i_lock);
>  			inode->i_blocks += blocks_per_huge_page(h);
>  			spin_unlock(&inode->i_lock);
>  		} else {
>  			lock_page(page);
> +			anon_rmap = 1;
>  			if (unlikely(anon_vma_prepare(vma))) {
>  				ret = VM_FAULT_OOM;
>  				goto backout_unlocked;
>  			}
> -			anon_rmap = 1;
>  		}
>  	} else {
>  		/*
> @@ -2783,6 +2824,8 @@ retry:
>  	spin_unlock(&mm->page_table_lock);
>  	unlock_page(page);
>  out:
> +	if (anon_rmap)
> +		commit_dequeued_huge_page(h, do_dequeue);
>  	return ret;
>
>  backout:

Otherwise I think it looks good.
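For what it's worth, here is how I read the intended protocol overall,
boiled down to a self-contained userspace sketch.  pool_alloc(),
pool_commit(), the pthread mutex and the counters are stand-ins I made
up for dequeue_huge_page_vma(), commit_dequeued_huge_page() and
hugetlb_lock, so treat it as an illustration of the scheme rather than
the patch itself:

/*
 * Userspace sketch of the "defer SIGBUS while dequeues are in flight"
 * idea.  Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long free_pages = 1;	/* pages left in the pool */
static unsigned long nr_dequeue_users;	/* dequeues not yet committed */

/* Take a page; on failure report whether another dequeue is in flight. */
static bool pool_alloc(bool *concurrent)
{
	bool got = false;

	pthread_mutex_lock(&pool_lock);
	if (free_pages) {
		free_pages--;
		nr_dequeue_users++;	/* analogue of h->nr_dequeue_users++ */
		got = true;
	} else {
		*concurrent = nr_dequeue_users != 0;
	}
	pthread_mutex_unlock(&pool_lock);
	return got;
}

/* The dequeued page is now visible; analogue of commit_dequeued_huge_page(). */
static void pool_commit(void)
{
	pthread_mutex_lock(&pool_lock);
	nr_dequeue_users--;
	pthread_mutex_unlock(&pool_lock);
}

/* Returns 1 on success, 0 for "retry the fault", -1 for the SIGBUS case. */
static int fault_handler(void)
{
	bool concurrent = false;

	if (!pool_alloc(&concurrent))
		return concurrent ? 0 : -1;
	/* ... install the page into the page tables here ... */
	pool_commit();
	return 1;
}

int main(void)
{
	printf("first fault:  %d\n", fault_handler());	/* 1: got a page */
	printf("second fault: %d\n", fault_handler());	/* -1: pool empty, nobody mid-dequeue */
	return 0;
}

The property the patch seems to rely on is that a failed allocation only
turns into VM_FAULT_SIGBUS once nr_dequeue_users has drained back to
zero, i.e. once no concurrent fault could still resolve ours or give the
page back.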
--
David Gibson                    | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
                                | _way_ _around_!
http://www.ozlabs.org/~dgibson