Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Sun, Mar 10, 2013 at 01:55:10PM +0800, Hillf Danton wrote:
> On Thu, Mar 7, 2013 at 5:50 AM, Cliff Wickman <cpw@xxxxxxx> wrote:
> > From: Cliff Wickman <cpw@xxxxxxx>
> >
> > Allocating a large number of 1GB hugetlbfs pages at boot takes a
> > very long time.
> >
> > Large system sites would at times like to allocate a very large amount of
> > memory as 1GB pages.  They would put this on the kernel boot line:
> >    default_hugepagesz=1G hugepagesz=1G hugepages=4096
> > [Dynamic allocation of 1G pages is not an option, as zone pages only go
> >  up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]
> >
> > Each page is zeroed as it is allocated, and all allocation is done by
> > cpu 0, as this path is early in boot:
> >       start_kernel
> >         kernel_init
> >           do_pre_smp_initcalls
> >             hugetlb_init
> >               hugetlb_init_hstates
> >                 hugetlb_hstate_alloc_pages
> >
> > Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
> > on large numa systems).
> > This estimate is approximate (it depends on core frequency & number of hops
> > to remote memory) but should be within a factor of 2 on most systems.
> > A benchmark attempting to reserve a TB for 1GB pages would thus require
> > ~1000 seconds of boot time just for this allocation.  32TB would take 8 hours.
> >
> > I propose passing a flag to the early allocator to indicate that no zeroing
> > of a page should be done.  The 'no zeroing' flag would have to be passed
> > down this code path:
> >
> 
> FYI: huge pages are cleared just after they are allocated, for instance,
> by clear_huge_page() in hugetlb_no_page()
> 
> Hillf

Yes, I should have added that comment to the changelog.  And because
this is true there is no need to clear a huge page at boot time.

-Cliff
> >   hugetlb_hstate_alloc_pages
> >     alloc_bootmem_huge_page
> >       __alloc_bootmem_node_nopanic NO_ZERO  (nobootmem.c)
> >         __alloc_memory_core_early  NO_ZERO
> >           if (!(flags & NO_ZERO))
> >             memset(ptr, 0, size);
> >
> > Or this path if CONFIG_NO_BOOTMEM is not set:
> >
> >   hugetlb_hstate_alloc_pages
> >     alloc_bootmem_huge_page
> >       __alloc_bootmem_node_nopanic  NO_ZERO  (bootmem.c)
> >         alloc_bootmem_core          NO_ZERO
> >           if (!(flags & NO_ZERO))
> >             memset(region, 0, size);
> >         __alloc_bootmem_nopanic     NO_ZERO
> >           ___alloc_bootmem_nopanic  NO_ZERO
> >             alloc_bootmem_core      NO_ZERO
> >               if (!(flags & NO_ZERO))
> >                 memset(region, 0, size);
> >
> > Signed-off-by: Cliff Wickman <cpw@xxxxxxx>
> >
> > ---
> >  arch/x86/kernel/setup_percpu.c |    4 ++--
> >  include/linux/bootmem.h        |   23 ++++++++++++++++-------
> >  mm/bootmem.c                   |   12 +++++++-----
> >  mm/hugetlb.c                   |    3 ++-
> >  mm/nobootmem.c                 |   41 +++++++++++++++++++++++------------------
> >  mm/page_cgroup.c               |    2 +-
> >  mm/sparse.c                    |    2 +-
> >  7 files changed, 52 insertions(+), 35 deletions(-)
> >
> > Index: linux/include/linux/bootmem.h
> > ===================================================================
> > --- linux.orig/include/linux/bootmem.h
> > +++ linux/include/linux/bootmem.h
> > @@ -8,6 +8,11 @@
> >  #include <asm/dma.h>
> >
> >  /*
> > + * allocation flags
> > + */
> > +#define NO_ZERO                0x00000001
> > +
> > +/*
> >   *  simple boot-time physical memory area allocator.
> >   */
> >
> > @@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo
> >                              unsigned long goal);
> >  extern void *__alloc_bootmem_nopanic(unsigned long size,
> >                                      unsigned long align,
> > -                                    unsigned long goal);
> > +                                    unsigned long goal,
> > +                                    u32 flags);
> >  extern void *__alloc_bootmem_node(pg_data_t *pgdat,
> >                                   unsigned long size,
> >                                   unsigned long align,
> > @@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_
> >  extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> >                                   unsigned long size,
> >                                   unsigned long align,
> > -                                 unsigned long goal);
> > +                                 unsigned long goal,
> > +                                 u32 flags);
> >  void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> >                                   unsigned long size,
> >                                   unsigned long align,
> >                                   unsigned long goal,
> > -                                 unsigned long limit);
> > +                                 unsigned long limit,
> > +                                 u32 flags);
> >  extern void *__alloc_bootmem_low(unsigned long size,
> >                                  unsigned long align,
> >                                  unsigned long goal);
> > @@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg
> >  #define alloc_bootmem_align(x, align) \
> >         __alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
> >  #define alloc_bootmem_nopanic(x) \
> > -       __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> > +       __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, 0)
> >  #define alloc_bootmem_pages(x) \
> >         __alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> >  #define alloc_bootmem_pages_nopanic(x) \
> > -       __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> > +       __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
> >  #define alloc_bootmem_node(pgdat, x) \
> >         __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> >  #define alloc_bootmem_node_nopanic(pgdat, x) \
> > -       __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> > +       __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, \
> > +                                    BOOTMEM_LOW_LIMIT, 0)
> >  #define alloc_bootmem_pages_node(pgdat, x) \
> >         __alloc_bootmem_node(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> >  #define alloc_bootmem_pages_node_nopanic(pgdat, x) \
> > -       __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> > +       __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
> >
> >  #define alloc_bootmem_low(x) \
> >         __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
> > Index: linux/arch/x86/kernel/setup_percpu.c
> > ===================================================================
> > --- linux.orig/arch/x86/kernel/setup_percpu.c
> > +++ linux/arch/x86/kernel/setup_percpu.c
> > @@ -104,14 +104,14 @@ static void * __init pcpu_alloc_bootmem(
> >         void *ptr;
> >
> >         if (!node_online(node) || !NODE_DATA(node)) {
> > -               ptr = __alloc_bootmem_nopanic(size, align, goal);
> > +               ptr = __alloc_bootmem_nopanic(size, align, goal, 0);
> >                 pr_info("cpu %d has no node %d or node-local memory\n",
> >                         cpu, node);
> >                 pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
> >                          cpu, size, __pa(ptr));
> >         } else {
> >                 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
> > -                                                  size, align, goal);
> > +                                                  size, align, goal, 0);
> >                 pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
> >                          cpu, size, node, __pa(ptr));
> >         }
> > Index: linux/mm/nobootmem.c
> > ===================================================================
> > --- linux.orig/mm/nobootmem.c
> > +++ linux/mm/nobootmem.c
> > @@ -33,7 +33,7 @@ unsigned long min_low_pfn;
> >  unsigned long max_pfn;
> >
> >  static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
> > -                                       u64 goal, u64 limit)
> > +                                       u64 goal, u64 limit, u32 flags)
> >  {
> >         void *ptr;
> >         u64 addr;
> > @@ -46,7 +46,8 @@ static void * __init __alloc_memory_core
> >                 return NULL;
> >
> >         ptr = phys_to_virt(addr);
> > -       memset(ptr, 0, size);
> > +       if (!(flags & NO_ZERO))
> > +               memset(ptr, 0, size);
> >         memblock_reserve(addr, size);
> >         /*
> >          * The min_count is set to 0 so that bootmem allocated blocks
> > @@ -208,7 +209,8 @@ void __init free_bootmem(unsigned long a
> >  static void * __init ___alloc_bootmem_nopanic(unsigned long size,
> >                                         unsigned long align,
> >                                         unsigned long goal,
> > -                                       unsigned long limit)
> > +                                       unsigned long limit,
> > +                                       u32 flags)
> >  {
> >         void *ptr;
> >
> > @@ -217,7 +219,8 @@ static void * __init ___alloc_bootmem_no
> >
> >  restart:
> >
> > -       ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
> > +       ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal,
> > +                                       limit, 0);
> >
> >         if (ptr)
> >                 return ptr;
> > @@ -244,17 +247,17 @@ restart:
> >   * Returns NULL on failure.
> >   */
> >  void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
> > -                                       unsigned long goal)
> > +                                       unsigned long goal, u32 flags)
> >  {
> >         unsigned long limit = -1UL;
> >
> > -       return ___alloc_bootmem_nopanic(size, align, goal, limit);
> > +       return ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
> >  }
> >
> >  static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
> > -                                       unsigned long goal, unsigned long limit)
> > +                       unsigned long goal, unsigned long limit, u32 flags)
> >  {
> > -       void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
> > +       void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
> >
> >         if (mem)
> >                 return mem;
> > @@ -284,25 +287,26 @@ void * __init __alloc_bootmem(unsigned l
> >  {
> >         unsigned long limit = -1UL;
> >
> > -       return ___alloc_bootmem(size, align, goal, limit);
> > +       return ___alloc_bootmem(size, align, goal, limit, 0);
> >  }
> >
> >  void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> >                                                    unsigned long size,
> >                                                    unsigned long align,
> >                                                    unsigned long goal,
> > -                                                  unsigned long limit)
> > +                                                  unsigned long limit,
> > +                                                  u32 flags)
> >  {
> >         void *ptr;
> >
> >  again:
> >         ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
> > -                                       goal, limit);
> > +                                       goal, limit, flags);
> >         if (ptr)
> >                 return ptr;
> >
> >         ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
> > -                                       goal, limit);
> > +                                       goal, limit, flags);
> >         if (ptr)
> >                 return ptr;
> >
> > @@ -315,12 +319,13 @@ again:
> >  }
> >
> >  void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
> > -                                  unsigned long align, unsigned long goal)
> > +                       unsigned long align, unsigned long goal, u32 flags)
> >  {
> >         if (WARN_ON_ONCE(slab_is_available()))
> >                 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
> >
> > -       return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> > +       return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
> > +                       0, flags);
> >  }
> >
> >  void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> > @@ -329,7 +334,7 @@ void * __init ___alloc_bootmem_node(pg_d
> >  {
> >         void *ptr;
> >
> > -       ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
> > +       ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 0);
> >         if (ptr)
> >                 return ptr;
> >
> > @@ -354,7 +359,7 @@ void * __init ___alloc_bootmem_node(pg_d
> >   * The function panics if the request can not be satisfied.
> >   */
> >  void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> > -                                  unsigned long align, unsigned long goal)
> > +                       unsigned long align, unsigned long goal)
> >  {
> >         if (WARN_ON_ONCE(slab_is_available()))
> >                 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
> > @@ -388,7 +393,7 @@ void * __init __alloc_bootmem_node_high(
> >  void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
> >                                   unsigned long goal)
> >  {
> > -       return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
> > +       return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT, 0);
> >  }
> >
> >  void * __init __alloc_bootmem_low_nopanic(unsigned long size,
> > @@ -396,7 +401,7 @@ void * __init __alloc_bootmem_low_nopani
> >                                           unsigned long goal)
> >  {
> >         return ___alloc_bootmem_nopanic(size, align, goal,
> > -                                       ARCH_LOW_ADDRESS_LIMIT);
> > +                                       ARCH_LOW_ADDRESS_LIMIT, 0);
> >  }
> >
> >  /**
> > Index: linux/mm/sparse.c
> > ===================================================================
> > --- linux.orig/mm/sparse.c
> > +++ linux/mm/sparse.c
> > @@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section
> >         nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
> >  again:
> >         p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
> > -                                         SMP_CACHE_BYTES, goal, limit);
> > +                                         SMP_CACHE_BYTES, goal, limit, 0);
> >         if (!p && limit) {
> >                 limit = 0;
> >                 goto again;
> > Index: linux/mm/hugetlb.c
> > ===================================================================
> > --- linux.orig/mm/hugetlb.c
> > +++ linux/mm/hugetlb.c
> > @@ -1188,7 +1188,8 @@ int __weak alloc_bootmem_huge_page(struc
> >                 addr = __alloc_bootmem_node_nopanic(
> >                                 NODE_DATA(hstate_next_node_to_alloc(h,
> >                                                 &node_states[N_MEMORY])),
> > -                               huge_page_size(h), huge_page_size(h), 0);
> > +                               huge_page_size(h), huge_page_size(h),
> > +                               0, NO_ZERO);
> >
> >                 if (addr) {
> >                         /*
> > Index: linux/mm/bootmem.c
> > ===================================================================
> > --- linux.orig/mm/bootmem.c
> > +++ linux/mm/bootmem.c
> > @@ -660,7 +660,7 @@ restart:
> >   * Returns NULL on failure.
> >   */
> >  void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
> > -                                       unsigned long goal)
> > +                                       unsigned long goal, u32 flags)
> >  {
> >         unsigned long limit = 0;
> >
> > @@ -705,7 +705,8 @@ void * __init __alloc_bootmem(unsigned l
> >
> >  void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> >                                 unsigned long size, unsigned long align,
> > -                               unsigned long goal, unsigned long limit)
> > +                               unsigned long goal, unsigned long limit,
> > +                               u32 flags)
> >  {
> >         void *ptr;
> >
> > @@ -734,12 +735,13 @@ again:
> >  }
> >
> >  void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
> > -                                  unsigned long align, unsigned long goal)
> > +                       unsigned long align, unsigned long goal, u32 flags)
> >  {
> >         if (WARN_ON_ONCE(slab_is_available()))
> >                 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
> >
> > -       return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> > +       return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
> > +                                            0, flags);
> >  }
> >
> >  void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> > @@ -748,7 +750,7 @@ void * __init ___alloc_bootmem_node(pg_d
> >  {
> >         void *ptr;
> >
> > -       ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> > +       ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
> >         if (ptr)
> >                 return ptr;
> >
> > Index: linux/mm/page_cgroup.c
> > ===================================================================
> > --- linux.orig/mm/page_cgroup.c
> > +++ linux/mm/page_cgroup.c
> > @@ -55,7 +55,7 @@ static int __init alloc_node_page_cgroup
> >         table_size = sizeof(struct page_cgroup) * nr_pages;
> >
> >         base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> > -                       table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> > +                       table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 0);
> >         if (!base)
> >                 return -ENOMEM;
> >         NODE_DATA(nid)->node_page_cgroup = base;
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to majordomo@xxxxxxxxxxxxxxx
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at  http://www.tux.org/lkml/
> >
> >

-- 
Cliff Wickman
SGI
cpw@xxxxxxx
(651) 683-3824

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@xxxxxxxxx">email@xxxxxxxxx</a>


[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Announce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]