The patch titled Subject: hugetlb: force allocating surplus hugepages on mempolicy allowed nodes has been added to the -mm mm-unstable branch. Its filename is hugetlb-force-allocating-surplus-hugepages-on-mempolicy-allowed-nodes.patch This patch will shortly appear at https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/hugetlb-force-allocating-surplus-hugepages-on-mempolicy-allowed-nodes.patch This patch will later appear in the mm-unstable branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next via the mm-everything branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm and is updated there every 2-3 working days ------------------------------------------------------ From: Aristeu Rozanski <aris@xxxxxxxxxx> Subject: hugetlb: force allocating surplus hugepages on mempolicy allowed nodes Date: Fri, 21 Jun 2024 15:00:50 -0400 allowed_mems_nr() only accounts for the number of free hugepages in the nodes the current process belongs to. In case one or more of the requested surplus hugepages are allocated in a different node, the whole allocation will fail due allowed_mems_nr() returning a lower value. So allocate surplus hugepages in one of the nodes the current process belongs to. Easy way to reproduce this issue is to use a 2+ NUMA nodes system: # echo 0 >/proc/sys/vm/nr_hugepages # echo 1 >/proc/sys/vm/nr_overcommit_hugepages # numactl -m0 ./tools/testing/selftests/mm/map_hugetlb 2 will eventually fail when the hugepage ends up allocated in a different node. Link: https://lkml.kernel.org/r/20240621190050.mhxwb65zn37doegp@xxxxxxxxxx Signed-off-by: Aristeu Rozanski <aris@xxxxxxxxxx> Cc: Muchun Song <muchun.song@xxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- mm/hugetlb.c | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) --- a/mm/hugetlb.c~hugetlb-force-allocating-surplus-hugepages-on-mempolicy-allowed-nodes +++ a/mm/hugetlb.c @@ -2620,6 +2620,23 @@ struct folio *alloc_hugetlb_folio_nodema return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); } +static nodemask_t *policy_mbind_nodemask(gfp_t gfp) +{ +#ifdef CONFIG_NUMA + struct mempolicy *mpol = get_task_policy(current); + + /* + * Only enforce MPOL_BIND policy which overlaps with cpuset policy + * (from policy_nodemask) specifically for hugetlb case + */ + if (mpol->mode == MPOL_BIND && + (apply_policy_zone(mpol, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&mpol->nodes))) + return &mpol->nodes; +#endif + return NULL; +} + /* * Increase the hugetlb pool such that it can accommodate a reservation * of size 'delta'. @@ -2633,6 +2650,8 @@ static int gather_surplus_pages(struct h long i; long needed, allocated; bool alloc_ok = true; + int node; + nodemask_t *mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h)); lockdep_assert_held(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - h->free_huge_pages; @@ -2647,8 +2666,14 @@ static int gather_surplus_pages(struct h retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { - folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), - NUMA_NO_NODE, NULL); + for_each_node_mask(node, cpuset_current_mems_allowed) { + if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) { + folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), + node, NULL); + if (folio) + break; + } + } if (!folio) { alloc_ok = false; break; @@ -4878,23 +4903,6 @@ static int __init default_hugepagesz_set } __setup("default_hugepagesz=", default_hugepagesz_setup); -static nodemask_t *policy_mbind_nodemask(gfp_t gfp) -{ -#ifdef CONFIG_NUMA - struct mempolicy *mpol = get_task_policy(current); - - /* - * Only enforce MPOL_BIND policy which overlaps with cpuset policy - * (from policy_nodemask) specifically for hugetlb case - */ - if (mpol->mode == MPOL_BIND && - (apply_policy_zone(mpol, gfp_zone(gfp)) && - cpuset_nodemask_valid_mems_allowed(&mpol->nodes))) - return &mpol->nodes; -#endif - return NULL; -} - static unsigned int allowed_mems_nr(struct hstate *h) { int node; _ Patches currently in -mm which might be from aris@xxxxxxxxxx are hugetlb-force-allocating-surplus-hugepages-on-mempolicy-allowed-nodes.patch