The patch titled Subject: x86, numa: always initialize all possible nodes has been added to the -mm tree. Its filename is x86-numa-always-initialize-all-possible-nodes.patch This patch should soon appear at http://ozlabs.org/~akpm/mmots/broken-out/x86-numa-always-initialize-all-possible-nodes.patch and later at http://ozlabs.org/~akpm/mmotm/broken-out/x86-numa-always-initialize-all-possible-nodes.patch Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next and is updated there every 3-4 working days ------------------------------------------------------ From: Michal Hocko <mhocko@xxxxxxxx> Subject: x86, numa: always initialize all possible nodes Pingfan Liu has reported the following splat [ 5.772742] BUG: unable to handle kernel paging request at 0000000000002088 [ 5.773618] PGD 0 P4D 0 [ 5.773618] Oops: 0000 [#1] SMP NOPTI [ 5.773618] CPU: 2 PID: 1 Comm: swapper/0 Not tainted 4.20.0-rc1+ #3 [ 5.773618] Hardware name: Dell Inc. 
PowerEdge R7425/02MJ3T, BIOS 1.4.3 06/29/2018 [ 5.773618] RIP: 0010:__alloc_pages_nodemask+0xe2/0x2a0 [ 5.773618] Code: 00 00 44 89 ea 80 ca 80 41 83 f8 01 44 0f 44 ea 89 da c1 ea 08 83 e2 01 88 54 24 20 48 8b 54 24 08 48 85 d2 0f 85 46 01 00 00 <3b> 77 08 0f 82 3d 01 00 00 48 89 f8 44 89 ea 48 89 e1 44 89 e6 89 [ 5.773618] RSP: 0018:ffffaa600005fb20 EFLAGS: 00010246 [ 5.773618] RAX: 0000000000000000 RBX: 00000000006012c0 RCX: 0000000000000000 [ 5.773618] RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000002080 [ 5.773618] RBP: 00000000006012c0 R08: 0000000000000000 R09: 0000000000000002 [ 5.773618] R10: 00000000006080c0 R11: 0000000000000002 R12: 0000000000000000 [ 5.773618] R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000002 [ 5.773618] FS: 0000000000000000(0000) GS:ffff8c69afe00000(0000) knlGS:0000000000000000 [ 5.773618] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 5.773618] CR2: 0000000000002088 CR3: 000000087e00a000 CR4: 00000000003406e0 [ 5.773618] Call Trace: [ 5.773618] new_slab+0xa9/0x570 [ 5.773618] ___slab_alloc+0x375/0x540 [ 5.773618] ? pinctrl_bind_pins+0x2b/0x2a0 [ 5.773618] __slab_alloc+0x1c/0x38 [ 5.773618] __kmalloc_node_track_caller+0xc8/0x270 [ 5.773618] ? pinctrl_bind_pins+0x2b/0x2a0 [ 5.773618] devm_kmalloc+0x28/0x60 [ 5.773618] pinctrl_bind_pins+0x2b/0x2a0 [ 5.773618] really_probe+0x73/0x420 [ 5.773618] driver_probe_device+0x115/0x130 [ 5.773618] __driver_attach+0x103/0x110 [ 5.773618] ? driver_probe_device+0x130/0x130 [ 5.773618] bus_for_each_dev+0x67/0xc0 [ 5.773618] ? klist_add_tail+0x3b/0x70 [ 5.773618] bus_add_driver+0x41/0x260 [ 5.773618] ? pcie_port_setup+0x4d/0x4d [ 5.773618] driver_register+0x5b/0xe0 [ 5.773618] ? pcie_port_setup+0x4d/0x4d [ 5.773618] do_one_initcall+0x4e/0x1d4 [ 5.773618] ? init_setup+0x25/0x28 [ 5.773618] kernel_init_freeable+0x1c1/0x26e [ 5.773618] ? loglevel+0x5b/0x5b [ 5.773618] ? 
rest_init+0xb0/0xb0 [ 5.773618] kernel_init+0xa/0x110 [ 5.773618] ret_from_fork+0x22/0x40 [ 5.773618] Modules linked in: [ 5.773618] CR2: 0000000000002088 [ 5.773618] ---[ end trace 1030c9120a03d081 ]--- with his AMD machine with the following topology NUMA node0 CPU(s): 0,8,16,24 NUMA node1 CPU(s): 2,10,18,26 NUMA node2 CPU(s): 4,12,20,28 NUMA node3 CPU(s): 6,14,22,30 NUMA node4 CPU(s): 1,9,17,25 NUMA node5 CPU(s): 3,11,19,27 NUMA node6 CPU(s): 5,13,21,29 NUMA node7 CPU(s): 7,15,23,31 [ 0.007418] Early memory node ranges [ 0.007419] node 1: [mem 0x0000000000001000-0x000000000008efff] [ 0.007420] node 1: [mem 0x0000000000090000-0x000000000009ffff] [ 0.007422] node 1: [mem 0x0000000000100000-0x000000005c3d6fff] [ 0.007422] node 1: [mem 0x00000000643df000-0x0000000068ff7fff] [ 0.007423] node 1: [mem 0x000000006c528000-0x000000006fffffff] [ 0.007424] node 1: [mem 0x0000000100000000-0x000000047fffffff] [ 0.007425] node 5: [mem 0x0000000480000000-0x000000087effffff] and nr_cpus set to 4. The underlying reason is that the device is bound to node 2 which doesn't have any memory and init_cpu_to_node only initializes memory-less nodes for possible cpus which nr_cpus restricts. This in turn means that proper zonelists are not allocated and the page allocator blows up. Fix the issue by reworking how x86 initializes the memory-less nodes. The current implementation is hacked into the workflow and it doesn't allow any flexibility. There is init_memory_less_node called for each offline node that has a CPU as already mentioned above. This will make sure that we will have a new online node without any memory. Much later on we build a zone list for this node and things seem to work, except they do not (e.g. due to nr_cpus). Not to mention that it doesn't really make much sense to consider an empty node as online because we just consider this node whenever we want to iterate nodes to use and an empty node is obviously not the best candidate. This is all just too fragile. 
The new code relies on the arch specific initialization to allocate all possible NUMA nodes (including memory less) - numa_register_memblks in this case. Generic code then initializes both zonelists (__build_all_zonelists) and allocator internals (free_area_init_nodes) for all non-null pgdats rather than online ones. For the x86 specific part also do not make new node online in alloc_node_data because this is too early to know that. numa_register_memblks knows that a node has some memory so it can make the node online appropriately. init_memory_less_node hack can be safely removed altogether now. Link: http://lkml.kernel.org/r/20190212095343.23315-2-mhocko@xxxxxxxxxx Signed-off-by: Michal Hocko <mhocko@xxxxxxxx> Reported-by: Pingfan Liu <kernelfans@xxxxxxxxx> Tested-by: Pingfan Liu <kernelfans@xxxxxxxxx> Reviewed-by: Mike Rapoport <rppt@xxxxxxxxxxxxx> Reviewed-by: Dave Hansen <dave.hansen@xxxxxxxxx> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx> Cc: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx> Cc: Michael Ellerman <mpe@xxxxxxxxxxxxxx> Cc: Tony Luck <tony.luck@xxxxxxxxx> Cc: Ingo Molnar <mingo@xxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- arch/x86/mm/numa.c | 27 +++------------------------ mm/page_alloc.c | 15 +++++++++------ 2 files changed, 12 insertions(+), 30 deletions(-) --- a/arch/x86/mm/numa.c~x86-numa-always-initialize-all-possible-nodes +++ a/arch/x86/mm/numa.c @@ -212,8 +212,6 @@ static void __init alloc_node_data(int n node_data[nid] = nd; memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); - - node_set_online(nid); } /** @@ -566,7 +564,7 @@ static int __init numa_register_memblks( return -EINVAL; /* Finally register nodes. 
*/ - for_each_node_mask(nid, node_possible_map) { + for_each_node_mask(nid, numa_nodes_parsed) { u64 start = PFN_PHYS(max_pfn); u64 end = 0; @@ -577,9 +575,6 @@ static int __init numa_register_memblks( end = max(mi->blk[i].end, end); } - if (start >= end) - continue; - /* * Don't confuse VM with a node that doesn't have the * minimum amount of memory: @@ -588,6 +583,8 @@ static int __init numa_register_memblks( continue; alloc_node_data(nid); + if (end) + node_set_online(nid); } /* Dump memblock with node info and return. */ @@ -717,21 +714,6 @@ void __init x86_numa_init(void) numa_init(dummy_numa_init); } -static void __init init_memory_less_node(int nid) -{ - unsigned long zones_size[MAX_NR_ZONES] = {0}; - unsigned long zholes_size[MAX_NR_ZONES] = {0}; - - /* Allocate and initialize node data. Memory-less node is now online.*/ - alloc_node_data(nid); - free_area_init_node(nid, zones_size, 0, zholes_size); - - /* - * All zonelists will be built later in start_kernel() after per cpu - * areas are initialized. - */ -} - /* * Setup early cpu_to_node. 
* @@ -759,9 +741,6 @@ void __init init_cpu_to_node(void) if (node == NUMA_NO_NODE) continue; - if (!node_online(node)) - init_memory_less_node(node); - numa_set_node(cpu, node); } } --- a/mm/page_alloc.c~x86-numa-always-initialize-all-possible-nodes +++ a/mm/page_alloc.c @@ -5657,10 +5657,11 @@ static void __build_all_zonelists(void * if (self && !node_online(self->node_id)) { build_zonelists(self); } else { - for_each_online_node(nid) { + for_each_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); - build_zonelists(pgdat); + if (pgdat) + build_zonelists(pgdat); } #ifdef CONFIG_HAVE_MEMORYLESS_NODES @@ -6943,10 +6944,8 @@ static unsigned long __init find_min_pfn for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) min_pfn = min(min_pfn, start_pfn); - if (min_pfn == ULONG_MAX) { - pr_warn("Could not find start_pfn for node %d\n", nid); + if (min_pfn == ULONG_MAX) return 0; - } return min_pfn; } @@ -7290,8 +7289,12 @@ void __init free_area_init_nodes(unsigne mminit_verify_pageflags_layout(); setup_nr_node_ids(); zero_resv_unavail(); - for_each_online_node(nid) { + for_each_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); + + if (!pgdat) + continue; + free_area_init_node(nid, NULL, find_min_pfn_for_node(nid), NULL); _ Patches currently in -mm which might be from mhocko@xxxxxxxx are mm-memory_hotplug-cleanup-memory-offline-path.patch mm-memory_hotplug-provide-a-more-generic-restrictions-for-memory-hotplug.patch x86-numa-always-initialize-all-possible-nodes.patch mm-be-more-verbose-about-zonelist-initialization.patch