The function alloc_pages_exact_node() was introduced in 6484eb3e2a81 ("page allocator: do not check NUMA node ID when the caller knows the node is valid") as an optimized variant of alloc_pages_node() that doesn't allow the node id to be -1. Unfortunately, the name can easily suggest that the allocation is restricted to the given node and fails otherwise. In truth, the node is only preferred, unless __GFP_THISNODE is passed among the gfp flags.

The misleading name has led to mistakes in the past; see 5265047ac301 ("mm, thp: really limit transparent hugepage allocation to local node") and b360edb43f8e ("mm, mempolicy: migrate_to_node should only migrate to node").

To prevent further mistakes, and to provide a convenience function for allocations truly restricted to a node, this patch makes alloc_pages_exact_node() pass __GFP_THISNODE to that effect. The previous implementation of alloc_pages_exact_node() is copied as __alloc_pages_node(), whose leading underscores imply it's an optimized variant of alloc_pages_node() not intended for general usage. All three functions are documented with comments.

Existing callers of alloc_pages_exact_node() are adjusted as follows:

- those that explicitly pass __GFP_THISNODE keep calling alloc_pages_exact_node(), but the flag is removed from the call
- others are converted to call __alloc_pages_node(); some may still pass __GFP_THISNODE if they serve as wrappers that get gfp_flags from higher layers

The one exception is sba_alloc_coherent(), which open-codes the check for nid == -1, so it is converted to use alloc_pages_node() instead. This means it no longer performs some VM_BUG_ON checks, but otherwise the whole patch makes no functional changes.

Signed-off-by: Vlastimil Babka <vbabka@xxxxxxx>
---
I've dropped the non-mm people from CC for now and marked this as RFC until we agree on the API.
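To make the intended division of labor concrete, here is a minimal caller-side sketch of how the three variants behave after this patch. The example_allocs() function and its my_node parameter are hypothetical, invented purely for illustration; only the three allocator calls themselves are real API:

/* Hypothetical caller, for illustration only -- not part of the patch. */
static void example_allocs(int my_node)
{
	struct page *page;

	/* Prefer my_node but allow fallback; my_node may be NUMA_NO_NODE. */
	page = alloc_pages_node(my_node, GFP_KERNEL, 0);
	if (page)
		__free_pages(page, 0);

	/*
	 * Same preference semantics, minus the NUMA_NO_NODE check;
	 * my_node must be a valid online node here.
	 */
	page = __alloc_pages_node(my_node, GFP_KERNEL, 0);
	if (page)
		__free_pages(page, 0);

	/*
	 * Truly restricted: __GFP_THISNODE is now added internally, so
	 * the allocation fails rather than falling back to another node.
	 */
	page = alloc_pages_exact_node(my_node, GFP_KERNEL, 0);
	if (page)
		__free_pages(page, 0);
}

Note that callers which previously passed __GFP_THISNODE explicitly now just call alloc_pages_exact_node() without the flag, as the hunks below show.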
 arch/ia64/hp/common/sba_iommu.c   |  6 +-----
 arch/ia64/kernel/uncached.c       |  2 +-
 arch/ia64/sn/pci/pci_dma.c        |  2 +-
 arch/powerpc/platforms/cell/ras.c |  2 +-
 arch/x86/kvm/vmx.c                |  2 +-
 drivers/misc/sgi-xp/xpc_uv.c      |  2 +-
 include/linux/gfp.h               | 23 +++++++++++++++++++++++
 kernel/profile.c                  |  8 ++++----
 mm/filemap.c                      |  2 +-
 mm/hugetlb.c                      |  7 +++----
 mm/memory-failure.c               |  2 +-
 mm/mempolicy.c                    |  6 ++----
 mm/migrate.c                      |  6 ++----
 mm/slab.c                         |  2 +-
 mm/slob.c                         |  2 +-
 mm/slub.c                         |  2 +-
 16 files changed, 45 insertions(+), 31 deletions(-)

diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index 344387a..a6d6190 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -1140,13 +1140,9 @@ sba_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
 
 #ifdef CONFIG_NUMA
 	{
-		int node = ioc->node;
 		struct page *page;
 
-		if (node == NUMA_NO_NODE)
-			node = numa_node_id();
-
-		page = alloc_pages_exact_node(node, flags, get_order(size));
+		page = alloc_pages_node(ioc->node, flags, get_order(size));
 		if (unlikely(!page))
 			return NULL;
 
diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c
index 20e8a9b..b187c87 100644
--- a/arch/ia64/kernel/uncached.c
+++ b/arch/ia64/kernel/uncached.c
@@ -98,7 +98,7 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid)
 	/* attempt to allocate a granule's worth of cached memory pages */
 
 	page = alloc_pages_exact_node(nid,
-				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
+				GFP_KERNEL | __GFP_ZERO,
 				IA64_GRANULE_SHIFT-PAGE_SHIFT);
 	if (!page) {
 		mutex_unlock(&uc_pool->add_chunk_mutex);
diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c
index d0853e8..8f59907 100644
--- a/arch/ia64/sn/pci/pci_dma.c
+++ b/arch/ia64/sn/pci/pci_dma.c
@@ -92,7 +92,7 @@ static void *sn_dma_alloc_coherent(struct device *dev, size_t size,
 	 */
 	node = pcibus_to_node(pdev->bus);
 	if (likely(node >=0)) {
-		struct page *p = alloc_pages_exact_node(node,
+		struct page *p = __alloc_pages_node(node,
 						flags, get_order(size));
 
 		if (likely(p))
diff --git a/arch/powerpc/platforms/cell/ras.c b/arch/powerpc/platforms/cell/ras.c
index e865d74..ff5ae13 100644
--- a/arch/powerpc/platforms/cell/ras.c
+++ b/arch/powerpc/platforms/cell/ras.c
@@ -124,7 +124,7 @@ static int __init cbe_ptcal_enable_on_node(int nid, int order)
 	area->nid = nid;
 	area->order = order;
 	area->pages = alloc_pages_exact_node(area->nid,
-						GFP_KERNEL|__GFP_THISNODE,
+						GFP_KERNEL,
 						area->order);
 
 	if (!area->pages) {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2d73807..8c7f3b0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3158,7 +3158,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
 	struct page *pages;
 	struct vmcs *vmcs;
 
-	pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
+	pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
 	if (!pages)
 		return NULL;
 	vmcs = page_address(pages);
diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c
index 95c8944..a4758cd 100644
--- a/drivers/misc/sgi-xp/xpc_uv.c
+++ b/drivers/misc/sgi-xp/xpc_uv.c
@@ -240,7 +240,7 @@ xpc_create_gru_mq_uv(unsigned int mq_size, int cpu, char *irq_name,
 
 	nid = cpu_to_node(cpu);
 	page = alloc_pages_exact_node(nid,
-				      GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
+				      GFP_KERNEL | __GFP_ZERO,
 				      pg_order);
 	if (page == NULL) {
 		dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to alloc %d "
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 15928f0..c50848e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -300,6 +300,22 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
 	return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
 }
 
+/*
+ * An optimized version of alloc_pages_node(), to be only used in places where
+ * the overhead of the check for nid == -1 could matter.
+ */
+static inline struct page *
+__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
+{
+	VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid));
+
+	return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
+}
+
+/*
+ * Allocate pages, preferring the node given as nid. When nid equals -1,
+ * prefer the current CPU's node.
+ */
 static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 						unsigned int order)
 {
@@ -310,11 +326,18 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 	return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
 }
 
+/*
+ * Allocate pages, restricting the allocation to the node given as nid. The
+ * node must be valid and online. This is achieved by adding __GFP_THISNODE
+ * to gfp_mask.
+ */
 static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask,
 						unsigned int order)
 {
 	VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid));
 
+	gfp_mask |= __GFP_THISNODE;
+
 	return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
 }
 
diff --git a/kernel/profile.c b/kernel/profile.c
index a7bcd28..30a9404 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -339,7 +339,7 @@ static int profile_cpu_callback(struct notifier_block *info,
 		node = cpu_to_mem(cpu);
 		per_cpu(cpu_profile_flip, cpu) = 0;
 		if (!per_cpu(cpu_profile_hits, cpu)[1]) {
-			page = alloc_pages_exact_node(node,
+			page = __alloc_pages_node(node,
 					GFP_KERNEL | __GFP_ZERO,
 					0);
 			if (!page)
@@ -347,7 +347,7 @@ static int profile_cpu_callback(struct notifier_block *info,
 			per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
 		}
 		if (!per_cpu(cpu_profile_hits, cpu)[0]) {
-			page = alloc_pages_exact_node(node,
+			page = __alloc_pages_node(node,
 					GFP_KERNEL | __GFP_ZERO,
 					0);
 			if (!page)
@@ -544,14 +544,14 @@ static int create_hash_tables(void)
 		struct page *page;
 
 		page = alloc_pages_exact_node(node,
-				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
+				GFP_KERNEL | __GFP_ZERO,
 				0);
 		if (!page)
 			goto out_cleanup;
 		per_cpu(cpu_profile_hits, cpu)[1]
 				= (struct profile_hit *)page_address(page);
 		page = alloc_pages_exact_node(node,
-				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
+				GFP_KERNEL | __GFP_ZERO,
 				0);
 		if (!page)
 			goto out_cleanup;
diff --git a/mm/filemap.c b/mm/filemap.c
index 6bf5e42..5a7d4e2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -648,7 +648,7 @@ struct page *__page_cache_alloc(gfp_t gfp)
 		do {
 			cpuset_mems_cookie = read_mems_allowed_begin();
 			n = cpuset_mem_spread_node();
-			page = alloc_pages_exact_node(n, gfp, 0);
+			page = __alloc_pages_node(n, gfp, 0);
 		} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
 
 		return page;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 271e443..156d8d7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1089,8 +1089,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 	struct page *page;
 
 	page = alloc_pages_exact_node(nid,
-		htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
-		__GFP_REPEAT|__GFP_NOWARN,
+		htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN,
 		huge_page_order(h));
 	if (page) {
 		if (arch_prepare_hugepage(page)) {
@@ -1251,8 +1250,8 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
 				   huge_page_order(h));
 	else
 		page = alloc_pages_exact_node(nid,
-			htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
-			__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
+			htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|
+			__GFP_NOWARN, huge_page_order(h));
 
 	if (page && arch_prepare_hugepage(page)) {
 		__free_pages(page, huge_page_order(h));
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 501820c..b783bc5 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1503,7 +1503,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x)
 		return alloc_huge_page_node(page_hstate(compound_head(p)),
 						   nid);
 	else
-		return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
+		return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0);
 }
 
 /*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 7477432..4547960 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -945,8 +945,7 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x
 		return alloc_huge_page_node(page_hstate(compound_head(page)),
 					node);
 	else
-		return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE |
-						    __GFP_THISNODE, 0);
+		return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
 }
 
 /*
@@ -1986,8 +1985,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 		nmask = policy_nodemask(gfp, pol);
 		if (!nmask || node_isset(node, *nmask)) {
 			mpol_cond_put(pol);
-			page = alloc_pages_exact_node(node,
-						gfp | __GFP_THISNODE, order);
+			page = alloc_pages_exact_node(node, gfp, order);
 			goto out;
 		}
 	}
diff --git a/mm/migrate.c b/mm/migrate.c
index f53838f..d139222 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1554,10 +1554,8 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
 	struct page *newpage;
 
 	newpage = alloc_pages_exact_node(nid,
-					 (GFP_HIGHUSER_MOVABLE |
-					  __GFP_THISNODE | __GFP_NOMEMALLOC |
-					  __GFP_NORETRY | __GFP_NOWARN) &
-					 ~GFP_IOFS, 0);
+					 (GFP_HIGHUSER_MOVABLE | __GFP_NOMEMALLOC |
+					  __GFP_NORETRY | __GFP_NOWARN) & ~GFP_IOFS, 0);
 
 	return newpage;
 }
diff --git a/mm/slab.c b/mm/slab.c
index 7eb38dd..5f49e63 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1594,7 +1594,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
 	if (memcg_charge_slab(cachep, flags, cachep->gfporder))
 		return NULL;
 
-	page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
+	page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
 	if (!page) {
 		memcg_uncharge_slab(cachep, cachep->gfporder);
 		slab_out_of_memory(cachep, flags, nodeid);
diff --git a/mm/slob.c b/mm/slob.c
index 4765f65..10d8e02 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -193,7 +193,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
 
 #ifdef CONFIG_NUMA
 	if (node != NUMA_NO_NODE)
-		page = alloc_pages_exact_node(node, gfp, order);
+		page = __alloc_pages_node(node, gfp, order);
 	else
 #endif
 		page = alloc_pages(gfp, order);
diff --git a/mm/slub.c b/mm/slub.c
index 54c0876..0486343 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1323,7 +1323,7 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
 	if (node == NUMA_NO_NODE)
 		page = alloc_pages(flags, order);
 	else
-		page = alloc_pages_exact_node(node, flags, order);
+		page = __alloc_pages_node(node, flags, order);
 
 	if (!page)
 		memcg_uncharge_slab(s, order);
-- 
2.4.6