When testing gigantic pages whose order is too large for the buddy
allocator, the libhugetlbfs test case "counters.sh" fails.

counters.sh is just a wrapper around counters.c; you can find both at:
  https://github.com/libhugetlbfs/libhugetlbfs/blob/master/tests/counters.c
  https://github.com/libhugetlbfs/libhugetlbfs/blob/master/tests/counters.sh

Please see the error log below:
............................................
........
quota.sh (32M: 64):    PASS
counters.sh (32M: 64): FAIL    mmap failed: Invalid argument
**********     TEST SUMMARY
*                      32M
*                      32-bit 64-bit
*     Total testcases:     0     87
*             Skipped:     0      0
*                PASS:     0     86
*                FAIL:     0      1
*    Killed by signal:     0      0
*   Bad configuration:     0      0
*       Expected FAIL:     0      0
*     Unexpected PASS:     0      0
* Strange test result:     0      0
**********
............................................

The failure is caused by:
 1) The kernel fails to allocate a gigantic page for the surplus case,
    so gather_surplus_pages() ends up returning NULL.
 2) The condition checks for "over-commit" are wrong.

This patch does the following:
 1) Change the condition checks in:
        return_unused_surplus_pages()
        nr_overcommit_hugepages_store()
        hugetlb_overcommit_handler()
 2) Introduce two helper functions: huge_nodemask() and
    __hugetlb_alloc_gigantic_page(). Please see the descriptions in
    the two functions.
 3) Use __hugetlb_alloc_gigantic_page() to allocate the gigantic page
    in __alloc_huge_page().

With this patch applied, gather_surplus_pages() can return a gigantic
page for the surplus case, and counters.sh passes for the gigantic
page size.
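For reference, the failing scenario can also be exercised without the
whole libhugetlbfs suite. The program below is only a rough, untested
sketch (it is not counters.c); it assumes a 32MB gigantic page size
with nr_hugepages set to 0 and nr_overcommit_hugepages set to a
non-zero value, so the mapping can only be backed by a surplus
gigantic page:

/*
 * Rough sketch, not counters.c: force a surplus gigantic page
 * allocation. Assumes a 32MB gigantic page size, nr_hugepages == 0
 * and nr_overcommit_hugepages > 0 for the 32768kB hstate.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MAP_HUGE_SHIFT
#define MAP_HUGE_SHIFT  26
#endif
#define MAP_HUGE_32MB   (25 << MAP_HUGE_SHIFT)  /* log2(32M) == 25 */
#define LEN             (32UL << 20)

int main(void)
{
        /*
         * With no pre-allocated huge pages, the reservation for this
         * mapping has to be satisfied by a surplus gigantic page, so
         * gather_surplus_pages() must allocate one.
         */
        void *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
                       MAP_HUGE_32MB, -1, 0);
        if (p == MAP_FAILED) {
                /* Without this patch, the surplus allocation fails here */
                perror("mmap failed");
                return 1;
        }

        memset(p, 0, LEN);      /* touch the page so it is really used */
        munmap(p, LEN);
        return 0;
}

This is the same surplus/over-commit path that counters.sh exercises.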
Signed-off-by: Huang Shijie <shijie.huang@xxxxxxx>
---
 include/linux/mempolicy.h |  8 +++++
 mm/hugetlb.c              | 77 +++++++++++++++++++++++++++++++++++++++++++----
 mm/mempolicy.c            | 44 +++++++++++++++++++++++++++
 3 files changed, 123 insertions(+), 6 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 5f4d828..6539fbb 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -146,6 +146,8 @@ extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
 				enum mpol_rebind_step step);
 extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
 
+extern bool huge_nodemask(struct vm_area_struct *vma,
+				unsigned long addr, nodemask_t *mask);
 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 				unsigned long addr, gfp_t gfp_flags,
 				struct mempolicy **mpol, nodemask_t **nodemask);
@@ -269,6 +271,12 @@ static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 {
 }
 
+static inline bool huge_nodemask(struct vm_area_struct *vma,
+				unsigned long addr, nodemask_t *mask)
+{
+	return false;
+}
+
 static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 				unsigned long addr, gfp_t gfp_flags,
 				struct mempolicy **mpol, nodemask_t **nodemask)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1395bef..04440b8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1506,6 +1506,69 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
 
 /*
  * There are 3 ways this can get called:
+ *
+ * 1. When NUMA is not enabled, use alloc_gigantic_page() to get
+ *    the gigantic page.
+ *
+ * 2. NUMA is enabled, but the vma is NULL.
+ *    Initialize the @mask, and use alloc_fresh_gigantic_page() to get
+ *    the gigantic page.
+ *
+ * 3. NUMA is enabled, and the vma is valid.
+ *    Use the @vma's memory policy.
+ *    Get @mask by huge_nodemask(), and use alloc_fresh_gigantic_page()
+ *    to get the gigantic page.
+ */
+static struct page *__hugetlb_alloc_gigantic_page(struct hstate *h,
+		struct vm_area_struct *vma, unsigned long addr, int nid)
+{
+	NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL | __GFP_NORETRY);
+	struct page *page = NULL;
+
+	/* Not NUMA */
+	if (!IS_ENABLED(CONFIG_NUMA)) {
+		if (nid == NUMA_NO_NODE)
+			nid = numa_mem_id();
+
+		page = alloc_gigantic_page(nid, huge_page_order(h));
+		if (page)
+			prep_compound_gigantic_page(page, huge_page_order(h));
+		goto got_page;
+	}
+
+	/* NUMA && !vma */
+	if (!vma) {
+		/* First, check the mask */
+		if (!mask) {
+			mask = &node_states[N_MEMORY];
+		} else {
+			if (nid == NUMA_NO_NODE) {
+				if (!init_nodemask_of_mempolicy(mask)) {
+					NODEMASK_FREE(mask);
+					mask = &node_states[N_MEMORY];
+				}
+			} else {
+				init_nodemask_of_node(mask, nid);
+			}
+		}
+
+		page = alloc_fresh_gigantic_page(h, mask, false);
+		goto got_page;
+	}
+
+	/* NUMA && vma */
+	if (mask && huge_nodemask(vma, addr, mask))
+		page = alloc_fresh_gigantic_page(h, mask, false);
+
+got_page:
+	if (mask != &node_states[N_MEMORY])
+		NODEMASK_FREE(mask);
+
+	return page;
+}
+
+/*
+ * There are 3 ways this can get called:
  * 1. With vma+addr: we use the VMA's memory policy
  * 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge
  *    page from any node, and let the buddy allocator itself figure
@@ -1584,7 +1647,7 @@ static struct page *__alloc_huge_page(struct hstate *h,
 	struct page *page;
 	unsigned int r_nid;
 
-	if (hstate_is_gigantic(h))
+	if (hstate_is_gigantic(h) && !gigantic_page_supported())
 		return NULL;
 
 	/*
@@ -1629,7 +1692,10 @@ static struct page *__alloc_huge_page(struct hstate *h,
 	}
 	spin_unlock(&hugetlb_lock);
 
-	page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
+	if (hstate_is_gigantic(h))
+		page = __hugetlb_alloc_gigantic_page(h, vma, addr, nid);
+	else
+		page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
 
 	spin_lock(&hugetlb_lock);
 	if (page) {
@@ -1796,8 +1862,7 @@ static void return_unused_surplus_pages(struct hstate *h,
 	/* Uncommit the reservation */
 	h->resv_huge_pages -= unused_resv_pages;
 
-	/* Cannot return gigantic pages currently */
-	if (hstate_is_gigantic(h))
+	if (hstate_is_gigantic(h) && !gigantic_page_supported())
 		return;
 
 	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
@@ -2514,7 +2579,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
 	unsigned long input;
 	struct hstate *h = kobj_to_hstate(kobj, NULL);
 
-	if (hstate_is_gigantic(h))
+	if (hstate_is_gigantic(h) && !gigantic_page_supported())
 		return -EINVAL;
 
 	err = kstrtoul(buf, 10, &input);
@@ -2966,7 +3031,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
 
 	tmp = h->nr_overcommit_huge_pages;
 
-	if (write && hstate_is_gigantic(h))
+	if (write && hstate_is_gigantic(h) && !gigantic_page_supported())
 		return -EINVAL;
 
 	table->data = &tmp;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 6d3639e..3550a29 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1800,6 +1800,50 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 
 #ifdef CONFIG_HUGETLBFS
 /*
+ * huge_nodemask(@vma, @addr, @mask)
+ * @vma: virtual memory area whose policy is sought
+ * @addr: address in @vma
+ * @mask: should be a valid nodemask pointer, not NULL
+ *
+ * Return true if we can succeed in extracting the policy nodemask
+ * for 'bind' or 'interleave' policy into the argument @mask, or
+ * initializing the argument @mask to contain the single node for
+ * 'preferred' or 'local' policy.
+ */
+bool huge_nodemask(struct vm_area_struct *vma, unsigned long addr,
+		nodemask_t *mask)
+{
+	struct mempolicy *mpol;
+	bool ret = true;
+	int nid;
+
+	mpol = get_vma_policy(vma, addr);
+
+	switch (mpol->mode) {
+	case MPOL_PREFERRED:
+		if (mpol->flags & MPOL_F_LOCAL)
+			nid = numa_node_id();
+		else
+			nid = mpol->v.preferred_node;
+		init_nodemask_of_node(mask, nid);
+		break;
+
+	case MPOL_BIND:
+		/* Fall through */
+	case MPOL_INTERLEAVE:
+		*mask = mpol->v.nodes;
+		break;
+
+	default:
+		ret = false;
+		break;
+	}
+	mpol_cond_put(mpol);
+
+	return ret;
+}
+
+/*
  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
  * @vma: virtual memory area whose policy is sought
  * @addr: address in @vma for shared policy lookup and interleave policy
-- 
2.5.5