[PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v3

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Reserving a large number of 1GB hugetlbfs pages at boot takes a very
long time due to the pages being memset to 0 during the reservation.
This is unneeded as the pages will be zeroed by clear_huge_page() when
being allocated by the user.

Large system sites would at times like to allocate a very large amount
of memory as 1GB pages.  They would put this on the kernel boot line:
   default_hugepagesz=1G hugepagesz=1G hugepages=4096
[Dynamic allocation of 1G pages is not an option, as zone pages only go
 up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]

Each page is zeroed as it is allocated, and all allocation is done by
cpu 0, as this path is early in boot:
      start_kernel
        kernel_init
          do_pre_smp_initcalls
            hugetlb_init
              hugetlb_init_hstates
                hugetlb_hstate_alloc_pages

Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
on large numa systems).  This estimate is approximate (it depends on
core frequency & number of hops to remote memory) but should be within
a factor of 2 on most systems.  A benchmark attempting to reserve 1TB
for 1GB pages would thus require ~1000 seconds of boot time just for
this allocating.  32TB would take 8 hours.  With this change, the 1TB
benchmark took just 4.5 seconds.

Signed-off-by: Robin Holt <holt@xxxxxxx>
Cc: Cliff Whickman <cpw@xxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxx>
Cc: Tejun Heo <tj@xxxxxxxxxx>
Cc: Yinghai Lu <yinghai@xxxxxxxxxx>
Cc: lkml <linux-kernel@xxxxxxxxxxxxxxx>
Cc: Linux mm <linux-mm@xxxxxxxxx>
Cc: x86 Maintainers <x86@xxxxxxxxxx>

---

Changes since -v2
 - Fixed the CONFIG_NO_BOOTMEM=n case so the pages were actually not zeroed.
   These changes have been compiled for ia64, but not tested.

 include/linux/bootmem.h |  8 +++++++-
 mm/bootmem.c            | 51 ++++++++++++++++++++++++++++++++++---------------
 mm/hugetlb.c            |  2 +-
 mm/nobootmem.c          | 37 +++++++++++++++++++++++++----------
 mm/sparse.c             |  2 +-
 5 files changed, 72 insertions(+), 28 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index cdc3bab..04563fc 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -92,11 +92,17 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
 				  unsigned long goal);
+extern void *__alloc_bootmem_node_nopanic_notzeroed(
+				  pg_data_t *pgdat,
+				  unsigned long size,
+				  unsigned long align,
+				  unsigned long goal);
 void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
 				  unsigned long goal,
-				  unsigned long limit);
+				  unsigned long limit,
+				  int zeroed);
 extern void *__alloc_bootmem_low(unsigned long size,
 				 unsigned long align,
 				 unsigned long goal);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2b0bcb0..350e0ab 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -497,7 +497,8 @@ static unsigned long __init align_off(struct bootmem_data *bdata,
 
 static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata,
 					unsigned long size, unsigned long align,
-					unsigned long goal, unsigned long limit)
+					unsigned long goal, unsigned long limit,
+					int zeroed)
 {
 	unsigned long fallback = 0;
 	unsigned long min, max, start, sidx, midx, step;
@@ -584,7 +585,8 @@ find_block:
 
 		region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
 				start_off);
-		memset(region, 0, size);
+		if (zeroed)
+			memset(region, 0, size);
 		/*
 		 * The min_count is set to 0 so that bootmem allocated blocks
 		 * are never reported as leaks.
@@ -605,13 +607,18 @@ find_block:
 static void * __init alloc_bootmem_core(unsigned long size,
 					unsigned long align,
 					unsigned long goal,
-					unsigned long limit)
+					unsigned long limit,
+					int zeroed)
 {
 	bootmem_data_t *bdata;
 	void *region;
 
-	if (WARN_ON_ONCE(slab_is_available()))
-		return kzalloc(size, GFP_NOWAIT);
+	if (WARN_ON_ONCE(slab_is_available())) {
+		if (zeroed)
+			return kzalloc(size, GFP_NOWAIT);
+		else
+			return kmalloc(size, GFP_NOWAIT);
+	}
 
 	list_for_each_entry(bdata, &bdata_list, list) {
 		if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
@@ -619,7 +626,7 @@ static void * __init alloc_bootmem_core(unsigned long size,
 		if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
 			break;
 
-		region = alloc_bootmem_bdata(bdata, size, align, goal, limit);
+		region = alloc_bootmem_bdata(bdata, size, align, goal, limit, zeroed);
 		if (region)
 			return region;
 	}
@@ -635,7 +642,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
 	void *ptr;
 
 restart:
-	ptr = alloc_bootmem_core(size, align, goal, limit);
+	ptr = alloc_bootmem_core(size, align, goal, limit, 1);
 	if (ptr)
 		return ptr;
 	if (goal) {
@@ -705,23 +712,28 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
 
 void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				unsigned long size, unsigned long align,
-				unsigned long goal, unsigned long limit)
+				unsigned long goal, unsigned long limit,
+				int zeroed)
 {
 	void *ptr;
 
-	if (WARN_ON_ONCE(slab_is_available()))
-		return kzalloc(size, GFP_NOWAIT);
+	if (WARN_ON_ONCE(slab_is_available())) {
+		if (zeroed)
+			return kzalloc(size, GFP_NOWAIT);
+		else
+			return kmalloc(size, GFP_NOWAIT);
+	}
 again:
 
 	/* do not panic in alloc_bootmem_bdata() */
 	if (limit && goal + size > limit)
 		limit = 0;
 
-	ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
+	ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit, zeroed);
 	if (ptr)
 		return ptr;
 
-	ptr = alloc_bootmem_core(size, align, goal, limit);
+	ptr = alloc_bootmem_core(size, align, goal, limit, zeroed);
 	if (ptr)
 		return ptr;
 
@@ -733,13 +745,22 @@ again:
 	return NULL;
 }
 
+void * __init __alloc_bootmem_node_nopanic_notzeroed(pg_data_t *pgdat, unsigned long size,
+				   unsigned long align, unsigned long goal)
+{
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kmalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
+}
+
 void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
 				   unsigned long align, unsigned long goal)
 {
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 1);
 }
 
 void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
@@ -748,7 +769,7 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 {
 	void *ptr;
 
-	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 1);
 	if (ptr)
 		return ptr;
 
@@ -800,7 +821,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
 
 		new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
 		ptr = alloc_bootmem_bdata(pgdat->bdata, size, align,
-						 new_goal, 0);
+						 new_goal, 0, 1);
 		if (ptr)
 			return ptr;
 	}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ca9a7c6..7683f6a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1185,7 +1185,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
 	while (nr_nodes) {
 		void *addr;
 
-		addr = __alloc_bootmem_node_nopanic(
+		addr = __alloc_bootmem_node_nopanic_notzeroed(
 				NODE_DATA(hstate_next_node_to_alloc(h,
 						&node_states[N_MEMORY])),
 				huge_page_size(h), huge_page_size(h), 0);
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 5e07d36..342511b 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -32,8 +32,8 @@ unsigned long max_low_pfn;
 unsigned long min_low_pfn;
 unsigned long max_pfn;
 
-static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
-					u64 goal, u64 limit)
+static void * __init ___alloc_memory_core_early(int nid, u64 size, u64 align,
+					u64 goal, u64 limit, int zeroed)
 {
 	void *ptr;
 	u64 addr;
@@ -46,7 +46,8 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
 		return NULL;
 
 	ptr = phys_to_virt(addr);
-	memset(ptr, 0, size);
+	if (zeroed)
+		memset(ptr, 0, size);
 	memblock_reserve(addr, size);
 	/*
 	 * The min_count is set to 0 so that bootmem allocated blocks
@@ -56,6 +57,12 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
 	return ptr;
 }
 
+static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
+					u64 goal, u64 limit)
+{
+	return ___alloc_memory_core_early(nid, size, align, goal, limit, 1);
+}
+
 /*
  * free_bootmem_late - free bootmem pages directly to page allocator
  * @addr: starting address of the range
@@ -291,18 +298,19 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 						   unsigned long size,
 						   unsigned long align,
 						   unsigned long goal,
-						   unsigned long limit)
+						   unsigned long limit,
+						   int zeroed)
 {
 	void *ptr;
 
 again:
-	ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
-					goal, limit);
+	ptr = ___alloc_memory_core_early(pgdat->node_id, size, align,
+					goal, limit, zeroed);
 	if (ptr)
 		return ptr;
 
-	ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
-					goal, limit);
+	ptr = ___alloc_memory_core_early(MAX_NUMNODES, size, align,
+					goal, limit, zeroed);
 	if (ptr)
 		return ptr;
 
@@ -314,13 +322,22 @@ again:
 	return NULL;
 }
 
+void * __init __alloc_bootmem_node_nopanic_notzeroed(pg_data_t *pgdat, unsigned long size,
+				   unsigned long align, unsigned long goal)
+{
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kmalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
+}
+
 void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
 				   unsigned long align, unsigned long goal)
 {
 	if (WARN_ON_ONCE(slab_is_available()))
 		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
 
-	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+	return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 1);
 }
 
 void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
@@ -329,7 +346,7 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 {
 	void *ptr;
 
-	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
+	ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 1);
 	if (ptr)
 		return ptr;
 
diff --git a/mm/sparse.c b/mm/sparse.c
index 7ca6dc8..8a1c5ad 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
 	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
 again:
 	p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
-					  SMP_CACHE_BYTES, goal, limit);
+					  SMP_CACHE_BYTES, goal, limit, 1);
 	if (!p && limit) {
 		limit = 0;
 		goto again;
-- 
1.8.1.2

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]