From: Cliff Wickman <cpw@xxxxxxx>

v2: diffed against linux-next

I am seeing list corruption from within gather_surplus_pages()
(mm/hugetlb.c).  The corruption occurs in a RHEL6 kernel under heavy
load, and appears to happen because this function drops hugetlb_lock:
the list_add() in gather_surplus_pages() needs to be protected by the
lock.  (I don't have a similar test case for a linux-next kernel.)

I have CONFIG_DEBUG_LIST=y, and am running an MPI application with 64
threads plus a library that creates a large heap of hugetlbfs pages
for it.  The patch below fixes the problem.

The gist of the patch is that gather_surplus_pages() no longer has to
drop the lock, because alloc_buddy_huge_page() is told whether the
lock is already held (a userspace sketch of this pattern follows the
diffstat).

Signed-off-by: Cliff Wickman <cpw@xxxxxxx>
---
 mm/hugetlb.c |   29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)
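As a reading aid, here is a minimal userspace sketch of the
conditional-locking pattern the patch introduces, with a pthread mutex
standing in for hugetlb_lock.  The names (pool_lock, alloc_one,
gather) are illustrative stand-ins, not kernel identifiers; the
authoritative code is the diff below.

/*
 * Sketch of the already_locked pattern, modeled in userspace.
 * Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static int nr_pages;			/* guarded by pool_lock */

/* Analogue of alloc_buddy_huge_page(h, nid, already_locked). */
static int *alloc_one(int already_locked)
{
	int *page;

	if (!already_locked)
		pthread_mutex_lock(&pool_lock);
	nr_pages++;			/* counters updated under the lock */
	pthread_mutex_unlock(&pool_lock);

	/* The allocation may sleep, so it runs unlocked. */
	page = malloc(sizeof(*page));

	pthread_mutex_lock(&pool_lock);	/* retake for the cleanup path */
	if (!page)
		nr_pages--;
	if (!already_locked)
		pthread_mutex_unlock(&pool_lock);
	return page;			/* lock state matches entry state */
}

/* Analogue of gather_surplus_pages(): entered and exited locked. */
static void gather(int needed, int **list)
{
	pthread_mutex_lock(&pool_lock);
	for (int i = 0; i < needed; i++) {
		/*
		 * The lock is held here, so pass already_locked = 1;
		 * the subsequent list insertion stays protected.
		 */
		list[i] = alloc_one(1);
	}
	pthread_mutex_unlock(&pool_lock);
}

int main(void)
{
	int *list[4];

	gather(4, list);
	printf("nr_pages = %d\n", nr_pages);
	for (int i = 0; i < 4; i++)
		free(list[i]);
	return 0;
}

Note that, as in the patch, the lock is still dropped around the
allocation itself, which may sleep; what already_locked guarantees is
that the function returns with the lock in the same state as on
entry, so the caller's subsequent list_add() stays protected.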
Index: linux/mm/hugetlb.c
===================================================================
--- linux.orig/mm/hugetlb.c
+++ linux/mm/hugetlb.c
@@ -838,7 +838,9 @@ static int free_pool_huge_page(struct hs
 	return ret;
 }
 
-static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
+/* already_locked means the caller has already locked hugetlb_lock */
+static struct page *alloc_buddy_huge_page(struct hstate *h, int nid,
+	int already_locked)
 {
 	struct page *page;
 	unsigned int r_nid;
@@ -869,15 +871,19 @@ static struct page *alloc_buddy_huge_pag
 	 * the node values until we've gotten the hugepage and only the
 	 * per-node value is checked there.
 	 */
-	spin_lock(&hugetlb_lock);
+	if (!already_locked)
+		spin_lock(&hugetlb_lock);
+
 	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
-		spin_unlock(&hugetlb_lock);
+		if (!already_locked)
+			spin_unlock(&hugetlb_lock);
 		return NULL;
 	} else {
 		h->nr_huge_pages++;
 		h->surplus_huge_pages++;
 	}
 	spin_unlock(&hugetlb_lock);
+	/* page allocation may sleep, so the lock must be unlocked */
 
 	if (nid == NUMA_NO_NODE)
 		page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
@@ -910,7 +916,8 @@ static struct page *alloc_buddy_huge_pag
 		h->surplus_huge_pages--;
 		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
 	}
-	spin_unlock(&hugetlb_lock);
+	if (!already_locked)
+		spin_unlock(&hugetlb_lock);
 
 	return page;
 }
@@ -929,7 +936,7 @@ struct page *alloc_huge_page_node(struct
 	spin_unlock(&hugetlb_lock);
 
 	if (!page)
-		page = alloc_buddy_huge_page(h, nid);
+		page = alloc_buddy_huge_page(h, nid, 0);
 
 	return page;
 }
@@ -937,6 +944,7 @@ struct page *alloc_huge_page_node(struct
 /*
  * Increase the hugetlb pool such that it can accommodate a reservation
  * of size 'delta'.
+ * This is entered and exited with hugetlb_lock locked.
  */
 static int gather_surplus_pages(struct hstate *h, int delta)
 {
@@ -957,9 +965,8 @@ static int gather_surplus_pages(struct h
 
 	ret = -ENOMEM;
 retry:
-	spin_unlock(&hugetlb_lock);
 	for (i = 0; i < needed; i++) {
-		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+		page = alloc_buddy_huge_page(h, NUMA_NO_NODE, 1);
 		if (!page) {
 			alloc_ok = false;
 			break;
@@ -969,10 +976,9 @@ retry:
 	allocated += i;
 
 	/*
-	 * After retaking hugetlb_lock, we need to recalculate 'needed'
+	 * With hugetlb_lock still locked, we need to recalculate 'needed'
 	 * because either resv_huge_pages or free_huge_pages may have changed.
 	 */
-	spin_lock(&hugetlb_lock);
 	needed = (h->resv_huge_pages + delta) -
 			(h->free_huge_pages + allocated);
 	if (needed > 0) {
@@ -1010,15 +1016,12 @@ retry:
 		enqueue_huge_page(h, page);
 	}
 free:
-	spin_unlock(&hugetlb_lock);
-
 	/* Free unnecessary surplus pages to the buddy allocator */
 	if (!list_empty(&surplus_list)) {
 		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 			put_page(page);
 		}
 	}
-	spin_lock(&hugetlb_lock);
 
 	return ret;
 }
@@ -1151,7 +1154,7 @@ static struct page *alloc_huge_page(stru
 		spin_unlock(&hugetlb_lock);
 	} else {
 		spin_unlock(&hugetlb_lock);
-		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+		page = alloc_buddy_huge_page(h, NUMA_NO_NODE, 0);
 		if (!page) {
 			hugetlb_cgroup_uncharge_cgroup(idx,
 				       pages_per_huge_page(h),
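For reference on how the corruption manifests: with
CONFIG_DEBUG_LIST=y the kernel validates neighbor pointers in
list_add() before linking (lib/list_debug.c).  A rough userspace model
of that check, with illustrative names only, is:

#include <stdbool.h>
#include <stdio.h>

struct list_head {
	struct list_head *prev, *next;
};

/*
 * Rough model of the CONFIG_DEBUG_LIST check: if a racing writer
 * re-links the neighbors while the lock is not held, prev/next no
 * longer agree and the insert is refused with a "list_add corruption"
 * complaint -- the class of splat this patch makes go away.
 */
static bool checked_list_add(struct list_head *new,
			     struct list_head *prev,
			     struct list_head *next)
{
	if (next->prev != prev || prev->next != next) {
		fprintf(stderr, "list_add corruption\n");
		return false;
	}
	new->next = next;
	new->prev = prev;
	next->prev = new;
	prev->next = new;
	return true;
}

int main(void)
{
	struct list_head head = { &head, &head };
	struct list_head a, b;

	checked_list_add(&a, &head, head.next);	/* normal insert: ok */
	head.next = &b;		/* simulate an unlocked, racing writer */
	checked_list_add(&b, &head, &a);	/* detected and refused */
	return 0;
}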