On Tue, Feb 26, 2019 at 7:58 PM Mike Rapoport <rppt@xxxxxxxxxxxxx> wrote:
>
> On Sun, Feb 24, 2019 at 08:34:05PM +0800, Pingfan Liu wrote:
> > There are NUMA machines with memory-less nodes. When allocating memory
> > for a memory-less node, the memblock allocator falls back to 'Node 0'
> > without fully utilizing the nearest node. This hurts performance,
> > especially for the per-cpu sections. Suppress this defect by building
> > the full node fallback info for the memblock allocator, like what we
> > have done for the page allocator.
>
> Is it really necessary to build full node fallback info for memblock and
> then rebuild it again for the page allocator?
>
Do you mean building the full node fallback info once, and sharing it
between the memblock and page allocators? If so, then node online/offline
is the corner case that blocks this design.

> I think it should be possible to split parts of build_all_zonelists_init()
> that do not touch per-cpu areas into a separate function and call that
> function after topology detection. Then it would be possible to use
> local_memory_node() when calling memblock.
>
Yes, this is one way, but it may require more code changes. I will try it.
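I guess it would look roughly like the following (a completely untested
sketch, just to check that I understand your idea; build_zonelists_early()
and alloc_node_data_early() are made-up names, and build_zonelists() is
currently static in mm/page_alloc.c, so it would have to be exposed or this
loop would have to live there):

/*
 * Build only the zonelists, skipping the per-cpu pageset setup that
 * build_all_zonelists_init() also does.  Meant to be called right after
 * NUMA topology detection.
 */
void __init build_zonelists_early(void)
{
	int nid;

	for_each_online_node(nid)
		build_zonelists(NODE_DATA(nid));
}

/*
 * Then a boot-time allocation for a possibly memory-less node can be
 * redirected to the nearest node that actually has memory:
 */
void * __init alloc_node_data_early(phys_addr_t size, phys_addr_t align,
				    int nid)
{
	return memblock_alloc_try_nid(size, align, 0,
				      MEMBLOCK_ALLOC_ACCESSIBLE,
				      local_memory_node(nid));
}

The tricky part is probably making sure that each node's pgdat and the node
distance information are already set up when the zonelists are built that
early.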
Thank you for your suggestion.

Best regards,
Pingfan

> > Signed-off-by: Pingfan Liu <kernelfans@xxxxxxxxx>
> > CC: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> > CC: Ingo Molnar <mingo@xxxxxxxxxx>
> > CC: Borislav Petkov <bp@xxxxxxxxx>
> > CC: "H. Peter Anvin" <hpa@xxxxxxxxx>
> > CC: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> > CC: Vlastimil Babka <vbabka@xxxxxxx>
> > CC: Mike Rapoport <rppt@xxxxxxxxxxxxxxxxxx>
> > CC: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> > CC: Mel Gorman <mgorman@xxxxxxx>
> > CC: Joonsoo Kim <iamjoonsoo.kim@xxxxxxx>
> > CC: Andy Lutomirski <luto@xxxxxxxxxx>
> > CC: Andi Kleen <ak@xxxxxxxxxxxxxxx>
> > CC: Petr Tesarik <ptesarik@xxxxxxx>
> > CC: Michal Hocko <mhocko@xxxxxxxx>
> > CC: Stephen Rothwell <sfr@xxxxxxxxxxxxxxxx>
> > CC: Jonathan Corbet <corbet@xxxxxxx>
> > CC: Nicholas Piggin <npiggin@xxxxxxxxx>
> > CC: Daniel Vacek <neelx@xxxxxxxxxx>
> > CC: linux-kernel@xxxxxxxxxxxxxxx
> > ---
> >  include/linux/memblock.h |  3 +++
> >  mm/memblock.c            | 68 ++++++++++++++++++++++++++++++++++++++++++++----
> >  2 files changed, 66 insertions(+), 5 deletions(-)
> >
> > diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> > index 64c41cf..ee999c5 100644
> > --- a/include/linux/memblock.h
> > +++ b/include/linux/memblock.h
> > @@ -342,6 +342,9 @@ void *memblock_alloc_try_nid_nopanic(phys_addr_t size, phys_addr_t align,
> >  void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align,
> >  			     phys_addr_t min_addr, phys_addr_t max_addr,
> >  			     int nid);
> > +extern int build_node_order(int *node_order_array, int sz,
> > +			    int local_node, nodemask_t *used_mask);
> > +void memblock_build_node_order(void);
> >
> >  static inline void * __init memblock_alloc(phys_addr_t size, phys_addr_t align)
> >  {
> > diff --git a/mm/memblock.c b/mm/memblock.c
> > index 022d4cb..cf78850 100644
> > --- a/mm/memblock.c
> > +++ b/mm/memblock.c
> > @@ -1338,6 +1338,47 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali
> >  	return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
> >  }
> >
> > +static int **node_fallback __initdata;
> > +
> > +/*
> > + * build_node_order() relies on cpumask_of_node(), hence the arch should set
> > + * up the cpumask before calling this function.
> > + */
> > +void __init memblock_build_node_order(void)
> > +{
> > +	int nid, i;
> > +	nodemask_t used_mask;
> > +
> > +	node_fallback = memblock_alloc(MAX_NUMNODES * sizeof(int *),
> > +		sizeof(int *));
> > +	for_each_online_node(nid) {
> > +		node_fallback[nid] = memblock_alloc(
> > +			num_online_nodes() * sizeof(int), sizeof(int));
> > +		for (i = 0; i < num_online_nodes(); i++)
> > +			node_fallback[nid][i] = NUMA_NO_NODE;
> > +	}
> > +
> > +	for_each_online_node(nid) {
> > +		nodes_clear(used_mask);
> > +		node_set(nid, used_mask);
> > +		build_node_order(node_fallback[nid], num_online_nodes(),
> > +			nid, &used_mask);
> > +	}
> > +}
> > +
> > +static void __init memblock_free_node_order(void)
> > +{
> > +	int nid;
> > +
> > +	if (!node_fallback)
> > +		return;
> > +	for_each_online_node(nid)
> > +		memblock_free(__pa(node_fallback[nid]),
> > +			num_online_nodes() * sizeof(int));
> > +	memblock_free(__pa(node_fallback), MAX_NUMNODES * sizeof(int *));
> > +	node_fallback = NULL;
> > +}
> > +
> >  /**
> >   * memblock_alloc_internal - allocate boot memory block
> >   * @size: size of memory block to be allocated in bytes
> > @@ -1370,6 +1411,7 @@ static void * __init memblock_alloc_internal(
> >  {
> >  	phys_addr_t alloc;
> >  	void *ptr;
> > +	int node;
> >  	enum memblock_flags flags = choose_memblock_flags();
> >
> >  	if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
> > @@ -1397,11 +1439,26 @@ static void * __init memblock_alloc_internal(
> >  		goto done;
> >
> >  	if (nid != NUMA_NO_NODE) {
> > -		alloc = memblock_find_in_range_node(size, align, min_addr,
> > -						    max_addr, NUMA_NO_NODE,
> > -						    flags);
> > -		if (alloc && !memblock_reserve(alloc, size))
> > -			goto done;
> > +		if (!node_fallback) {
> > +			alloc = memblock_find_in_range_node(size, align,
> > +					min_addr, max_addr,
> > +					NUMA_NO_NODE, flags);
> > +			if (alloc && !memblock_reserve(alloc, size))
> > +				goto done;
> > +		} else {
> > +			int i;
> > +			for (i = 0; i < num_online_nodes(); i++) {
> > +				node = node_fallback[nid][i];
> > +				/* fallback list has all memory nodes */
> > +				if (node == NUMA_NO_NODE)
> > +					break;
> > +				alloc = memblock_find_in_range_node(size,
> > +						align, min_addr, max_addr,
> > +						node, flags);
> > +				if (alloc && !memblock_reserve(alloc, size))
> > +					goto done;
> > +			}
> > +		}
> >  	}
> >
> >  	if (min_addr) {
> > @@ -1969,6 +2026,7 @@ unsigned long __init memblock_free_all(void)
> >
> >  	reset_all_zones_managed_pages();
> >
> > +	memblock_free_node_order();
> >  	pages = free_low_memory_core_early();
> >  	totalram_pages_add(pages);
> >
> > --
> > 2.7.4
> >
> >
> --
> Sincerely yours,
> Mike.
>