[RFC PATCH] mm: support CONFIG_ZONE_DEVICE + CONFIG_ZONE_DMA

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



It appears devices requiring ZONE_DMA are still prevalent (see link
below).  For this reason the proposal to require turning off ZONE_DMA to
enable ZONE_DEVICE is untenable in the short term.  We want a single
kernel image to be able to support legacy devices as well as next
generation persistent memory platforms.

Towards this end, alias ZONE_DMA and ZONE_DEVICE to work around needing
to maintain a unique zone number for ZONE_DEVICE.  Record the geometry
of ZONE_DMA at init (->init_spanned_pages) and use that information in
is_zone_device_page() to differentiate pages allocated via
devm_memremap_pages() vs true ZONE_DMA pages.  Otherwise, use the
simpler definition of is_zone_device_page() when ZONE_DMA is turned off.

Note that this also teaches the memory hot remove path that the zone may
not have sections for all pfn spans (->zone_dyn_start_pfn).

A user visible implication of this change is potentially an unexpectedly
high "spanned" value in /proc/zoneinfo for the DMA zone.

Cc: H. Peter Anvin <hpa@xxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxx>
Cc: Jerome Glisse <j.glisse@xxxxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxx>
Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=110931
Fixes: 033fbae988fc ("mm: ZONE_DEVICE for "device memory"")
Reported-by: Sudip Mukherjee <sudipm.mukherjee@xxxxxxxxx>
Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
---
 include/linux/mm.h     |   46 ++++++++++++++++++++++++++++++++--------------
 include/linux/mmzone.h |   24 ++++++++++++++++++++----
 mm/Kconfig             |    1 -
 mm/memory_hotplug.c    |   15 +++++++++++----
 mm/page_alloc.c        |    9 ++++++---
 5 files changed, 69 insertions(+), 26 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f1cd22f2df1a..b4bccd3d3c41 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -664,12 +664,44 @@ static inline enum zone_type page_zonenum(const struct page *page)
 	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
 }
 
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+extern int page_to_nid(const struct page *page);
+#else
+static inline int page_to_nid(const struct page *page)
+{
+	return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
+}
+#endif
+
+static inline struct zone *page_zone(const struct page *page)
+{
+	return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
+}
+
 #ifdef CONFIG_ZONE_DEVICE
 void get_zone_device_page(struct page *page);
 void put_zone_device_page(struct page *page);
 static inline bool is_zone_device_page(const struct page *page)
 {
+#ifndef CONFIG_ZONE_DMA
 	return page_zonenum(page) == ZONE_DEVICE;
+#else /* ZONE_DEVICE == ZONE_DMA */
+	struct zone *zone;
+
+	if (page_zonenum(page) != ZONE_DEVICE)
+		return false;
+
+	/*
+	 * If ZONE_DEVICE is aliased with ZONE_DMA we need to check
+	 * whether this was a dynamically allocated page from
+	 * devm_memremap_pages() by checking against the size of
+	 * ZONE_DMA at boot.
+	 */
+	zone = page_zone(page);
+	if (page_to_pfn(page) <= zone_end_pfn_boot(zone))
+		return false;
+	return true;
+#endif
 }
 #else
 static inline void get_zone_device_page(struct page *page)
@@ -735,15 +767,6 @@ static inline int zone_to_nid(struct zone *zone)
 #endif
 }
 
-#ifdef NODE_NOT_IN_PAGE_FLAGS
-extern int page_to_nid(const struct page *page);
-#else
-static inline int page_to_nid(const struct page *page)
-{
-	return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
-}
-#endif
-
 #ifdef CONFIG_NUMA_BALANCING
 static inline int cpu_pid_to_cpupid(int cpu, int pid)
 {
@@ -857,11 +880,6 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
-static inline struct zone *page_zone(const struct page *page)
-{
-	return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
-}
-
 #ifdef SECTION_IN_PAGE_FLAGS
 static inline void set_page_section(struct page *page, unsigned long section)
 {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 33bb1b19273e..a0ef09b7f893 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -288,6 +288,13 @@ enum zone_type {
 	 */
 	ZONE_DMA,
 #endif
+#ifdef CONFIG_ZONE_DEVICE
+#ifndef CONFIG_ZONE_DMA
+	ZONE_DEVICE,
+#else
+	ZONE_DEVICE = ZONE_DMA,
+#endif
+#endif
 #ifdef CONFIG_ZONE_DMA32
 	/*
 	 * x86_64 needs two ZONE_DMAs because it supports devices that are
@@ -314,11 +321,7 @@ enum zone_type {
 	ZONE_HIGHMEM,
 #endif
 	ZONE_MOVABLE,
-#ifdef CONFIG_ZONE_DEVICE
-	ZONE_DEVICE,
-#endif
 	__MAX_NR_ZONES
-
 };
 
 #ifndef __GENERATING_BOUNDS_H
@@ -379,12 +382,19 @@ struct zone {
 
 	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
 	unsigned long		zone_start_pfn;
+	/* first dynamically added pfn of the zone */
+	unsigned long		zone_dyn_start_pfn;
 
 	/*
 	 * spanned_pages is the total pages spanned by the zone, including
 	 * holes, which is calculated as:
 	 * 	spanned_pages = zone_end_pfn - zone_start_pfn;
 	 *
+	 * init_spanned_pages is the boot/init time total pages spanned
+	 * by the zone for differentiating statically assigned vs
+	 * dynamically hot added memory to a zone.
+	 * 	init_spanned_pages = init_zone_end_pfn - zone_start_pfn;
+	 *
 	 * present_pages is physical pages existing within the zone, which
 	 * is calculated as:
 	 *	present_pages = spanned_pages - absent_pages(pages in holes);
@@ -423,6 +433,7 @@ struct zone {
 	 */
 	unsigned long		managed_pages;
 	unsigned long		spanned_pages;
+	unsigned long		init_spanned_pages;
 	unsigned long		present_pages;
 
 	const char		*name;
@@ -546,6 +557,11 @@ static inline unsigned long zone_end_pfn(const struct zone *zone)
 	return zone->zone_start_pfn + zone->spanned_pages;
 }
 
+static inline unsigned long zone_end_pfn_boot(const struct zone *zone)
+{
+	return zone->zone_start_pfn + zone->init_spanned_pages;
+}
+
 static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
 {
 	return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
diff --git a/mm/Kconfig b/mm/Kconfig
index 97a4e06b15c0..08a92a9c8fbd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -652,7 +652,6 @@ config IDLE_PAGE_TRACKING
 config ZONE_DEVICE
 	bool "Device memory (pmem, etc...) hotplug support" if EXPERT
 	default !ZONE_DMA
-	depends on !ZONE_DMA
 	depends on MEMORY_HOTPLUG
 	depends on MEMORY_HOTREMOVE
 	depends on X86_64 #arch_add_memory() comprehends device memory
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 4af58a3a8ffa..c3f0ff45bd47 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -300,6 +300,8 @@ static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn,
 
 	zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
 				zone->zone_start_pfn;
+	if (!zone->zone_dyn_start_pfn || start_pfn < zone->zone_dyn_start_pfn)
+		zone->zone_dyn_start_pfn = start_pfn;
 
 	zone_span_writeunlock(zone);
 }
@@ -601,8 +603,9 @@ static int find_biggest_section_pfn(int nid, struct zone *zone,
 static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
 			     unsigned long end_pfn)
 {
-	unsigned long zone_start_pfn = zone->zone_start_pfn;
+	unsigned long zone_start_pfn = zone->zone_dyn_start_pfn;
 	unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
+	bool dyn_zone = zone->zone_start_pfn == zone_start_pfn;
 	unsigned long zone_end_pfn = z;
 	unsigned long pfn;
 	struct mem_section *ms;
@@ -619,7 +622,9 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
 		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
 						zone_end_pfn);
 		if (pfn) {
-			zone->zone_start_pfn = pfn;
+			if (dyn_zone)
+				zone->zone_start_pfn = pfn;
+			zone->zone_dyn_start_pfn = pfn;
 			zone->spanned_pages = zone_end_pfn - pfn;
 		}
 	} else if (zone_end_pfn == end_pfn) {
@@ -661,8 +666,10 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
 	}
 
 	/* The zone has no valid section */
-	zone->zone_start_pfn = 0;
-	zone->spanned_pages = 0;
+	if (dyn_zone)
+		zone->zone_start_pfn = 0;
+	zone->zone_dyn_start_pfn = 0;
+	zone->spanned_pages = zone->init_spanned_pages;
 	zone_span_writeunlock(zone);
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 63358d9f9aa9..2d8b1d602ff3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -209,6 +209,10 @@ EXPORT_SYMBOL(totalram_pages);
 static char * const zone_names[MAX_NR_ZONES] = {
 #ifdef CONFIG_ZONE_DMA
 	 "DMA",
+#else
+#ifdef CONFIG_ZONE_DEVICE
+	 "Device",
+#endif
 #endif
 #ifdef CONFIG_ZONE_DMA32
 	 "DMA32",
@@ -218,9 +222,6 @@ static char * const zone_names[MAX_NR_ZONES] = {
 	 "HighMem",
 #endif
 	 "Movable",
-#ifdef CONFIG_ZONE_DEVICE
-	 "Device",
-#endif
 };
 
 compound_page_dtor * const compound_page_dtors[] = {
@@ -5082,6 +5083,8 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
 						  node_start_pfn, node_end_pfn,
 						  zholes_size);
 		zone->spanned_pages = size;
+		zone->init_spanned_pages = size;
+		zone->zone_dyn_start_pfn = 0;
 		zone->present_pages = real_size;
 
 		totalpages += size;

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>



[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]