Re: [RFC PATCH] mm: support CONFIG_ZONE_DEVICE + CONFIG_ZONE_DMA

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 26.1.2016 1:06, Dan Williams wrote:
> It appears devices requiring ZONE_DMA are still prevalent (see link
> below).  For this reason the proposal to require turning off ZONE_DMA to
> enable ZONE_DEVICE is untenable in the short term.  We want a single
> kernel image to be able to support legacy devices as well as next
> generation persistent memory platforms.
> 
> Towards this end, alias ZONE_DMA and ZONE_DEVICE to work around needing
> to maintain a unique zone number for ZONE_DEVICE.  Record the geometry
> of ZONE_DMA at init (->init_spanned_pages) and use that information in
> is_zone_device_page() to differentiate pages allocated via
> devm_memremap_pages() vs true ZONE_DMA pages.  Otherwise, use the
> simpler definition of is_zone_device_page() when ZONE_DMA is turned off.
> 
> Note that this also teaches the memory hot remove path that the zone may
> not have sections for all pfn spans (->zone_dyn_start_pfn).
> 
> A user visible implication of this change is potentially an unexpectedly
> high "spanned" value in /proc/zoneinfo for the DMA zone.

[+CC Joonsoo, Laura]

Sounds like quite a hack :( Would it be possible to extend the bits encoding
the zone? Potentially, ZONE_CMA could be added one day...

> Cc: H. Peter Anvin <hpa@xxxxxxxxx>
> Cc: Ingo Molnar <mingo@xxxxxxxxxx>
> Cc: Rik van Riel <riel@xxxxxxxxxx>
> Cc: Mel Gorman <mgorman@xxxxxxx>
> Cc: Jerome Glisse <j.glisse@xxxxxxxxx>
> Cc: Christoph Hellwig <hch@xxxxxx>
> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=110931
> Fixes: 033fbae988fc ("mm: ZONE_DEVICE for "device memory"")
> Reported-by: Sudip Mukherjee <sudipm.mukherjee@xxxxxxxxx>
> Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
> ---
>  include/linux/mm.h     |   46 ++++++++++++++++++++++++++++++++--------------
>  include/linux/mmzone.h |   24 ++++++++++++++++++++----
>  mm/Kconfig             |    1 -
>  mm/memory_hotplug.c    |   15 +++++++++++----
>  mm/page_alloc.c        |    9 ++++++---
>  5 files changed, 69 insertions(+), 26 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index f1cd22f2df1a..b4bccd3d3c41 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -664,12 +664,44 @@ static inline enum zone_type page_zonenum(const struct page *page)
>  	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
>  }
>  
> +#ifdef NODE_NOT_IN_PAGE_FLAGS
> +extern int page_to_nid(const struct page *page);
> +#else
> +static inline int page_to_nid(const struct page *page)
> +{
> +	return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
> +}
> +#endif
> +
> +static inline struct zone *page_zone(const struct page *page)
> +{
> +	return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
> +}
> +
>  #ifdef CONFIG_ZONE_DEVICE
>  void get_zone_device_page(struct page *page);
>  void put_zone_device_page(struct page *page);
>  static inline bool is_zone_device_page(const struct page *page)
>  {
> +#ifndef CONFIG_ZONE_DMA
>  	return page_zonenum(page) == ZONE_DEVICE;
> +#else /* ZONE_DEVICE == ZONE_DMA */
> +	struct zone *zone;
> +
> +	if (page_zonenum(page) != ZONE_DEVICE)
> +		return false;
> +
> +	/*
> +	 * If ZONE_DEVICE is aliased with ZONE_DMA we need to check
> +	 * whether this was a dynamically allocated page from
> +	 * devm_memremap_pages() by checking against the size of
> +	 * ZONE_DMA at boot.
> +	 */
> +	zone = page_zone(page);
> +	if (page_to_pfn(page) <= zone_end_pfn_boot(zone))
> +		return false;
> +	return true;
> +#endif
>  }
>  #else
>  static inline void get_zone_device_page(struct page *page)
> @@ -735,15 +767,6 @@ static inline int zone_to_nid(struct zone *zone)
>  #endif
>  }
>  
> -#ifdef NODE_NOT_IN_PAGE_FLAGS
> -extern int page_to_nid(const struct page *page);
> -#else
> -static inline int page_to_nid(const struct page *page)
> -{
> -	return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
> -}
> -#endif
> -
>  #ifdef CONFIG_NUMA_BALANCING
>  static inline int cpu_pid_to_cpupid(int cpu, int pid)
>  {
> @@ -857,11 +880,6 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
>  }
>  #endif /* CONFIG_NUMA_BALANCING */
>  
> -static inline struct zone *page_zone(const struct page *page)
> -{
> -	return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
> -}
> -
>  #ifdef SECTION_IN_PAGE_FLAGS
>  static inline void set_page_section(struct page *page, unsigned long section)
>  {
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 33bb1b19273e..a0ef09b7f893 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -288,6 +288,13 @@ enum zone_type {
>  	 */
>  	ZONE_DMA,
>  #endif
> +#ifdef CONFIG_ZONE_DEVICE
> +#ifndef CONFIG_ZONE_DMA
> +	ZONE_DEVICE,
> +#else
> +	ZONE_DEVICE = ZONE_DMA,
> +#endif
> +#endif
>  #ifdef CONFIG_ZONE_DMA32
>  	/*
>  	 * x86_64 needs two ZONE_DMAs because it supports devices that are
> @@ -314,11 +321,7 @@ enum zone_type {
>  	ZONE_HIGHMEM,
>  #endif
>  	ZONE_MOVABLE,
> -#ifdef CONFIG_ZONE_DEVICE
> -	ZONE_DEVICE,
> -#endif
>  	__MAX_NR_ZONES
> -
>  };
>  
>  #ifndef __GENERATING_BOUNDS_H
> @@ -379,12 +382,19 @@ struct zone {
>  
>  	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
>  	unsigned long		zone_start_pfn;
> +	/* first dynamically added pfn of the zone */
> +	unsigned long		zone_dyn_start_pfn;
>  
>  	/*
>  	 * spanned_pages is the total pages spanned by the zone, including
>  	 * holes, which is calculated as:
>  	 * 	spanned_pages = zone_end_pfn - zone_start_pfn;
>  	 *
> +	 * init_spanned_pages is the boot/init time total pages spanned
> +	 * by the zone for differentiating statically assigned vs
> +	 * dynamically hot added memory to a zone.
> +	 * 	init_spanned_pages = init_zone_end_pfn - zone_start_pfn;
> +	 *
>  	 * present_pages is physical pages existing within the zone, which
>  	 * is calculated as:
>  	 *	present_pages = spanned_pages - absent_pages(pages in holes);
> @@ -423,6 +433,7 @@ struct zone {
>  	 */
>  	unsigned long		managed_pages;
>  	unsigned long		spanned_pages;
> +	unsigned long		init_spanned_pages;
>  	unsigned long		present_pages;
>  
>  	const char		*name;
> @@ -546,6 +557,11 @@ static inline unsigned long zone_end_pfn(const struct zone *zone)
>  	return zone->zone_start_pfn + zone->spanned_pages;
>  }
>  
> +static inline unsigned long zone_end_pfn_boot(const struct zone *zone)
> +{
> +	return zone->zone_start_pfn + zone->init_spanned_pages;
> +}
> +
>  static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
>  {
>  	return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 97a4e06b15c0..08a92a9c8fbd 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -652,7 +652,6 @@ config IDLE_PAGE_TRACKING
>  config ZONE_DEVICE
>  	bool "Device memory (pmem, etc...) hotplug support" if EXPERT
>  	default !ZONE_DMA
> -	depends on !ZONE_DMA
>  	depends on MEMORY_HOTPLUG
>  	depends on MEMORY_HOTREMOVE
>  	depends on X86_64 #arch_add_memory() comprehends device memory
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 4af58a3a8ffa..c3f0ff45bd47 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -300,6 +300,8 @@ static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn,
>  
>  	zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
>  				zone->zone_start_pfn;
> +	if (!zone->zone_dyn_start_pfn || start_pfn < zone->zone_dyn_start_pfn)
> +		zone->zone_dyn_start_pfn = start_pfn;
>  
>  	zone_span_writeunlock(zone);
>  }
> @@ -601,8 +603,9 @@ static int find_biggest_section_pfn(int nid, struct zone *zone,
>  static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
>  			     unsigned long end_pfn)
>  {
> -	unsigned long zone_start_pfn = zone->zone_start_pfn;
> +	unsigned long zone_start_pfn = zone->zone_dyn_start_pfn;
>  	unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
> +	bool dyn_zone = zone->zone_start_pfn == zone_start_pfn;
>  	unsigned long zone_end_pfn = z;
>  	unsigned long pfn;
>  	struct mem_section *ms;
> @@ -619,7 +622,9 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
>  		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
>  						zone_end_pfn);
>  		if (pfn) {
> -			zone->zone_start_pfn = pfn;
> +			if (dyn_zone)
> +				zone->zone_start_pfn = pfn;
> +			zone->zone_dyn_start_pfn = pfn;
>  			zone->spanned_pages = zone_end_pfn - pfn;
>  		}
>  	} else if (zone_end_pfn == end_pfn) {
> @@ -661,8 +666,10 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
>  	}
>  
>  	/* The zone has no valid section */
> -	zone->zone_start_pfn = 0;
> -	zone->spanned_pages = 0;
> +	if (dyn_zone)
> +		zone->zone_start_pfn = 0;
> +	zone->zone_dyn_start_pfn = 0;
> +	zone->spanned_pages = zone->init_spanned_pages;
>  	zone_span_writeunlock(zone);
>  }
>  
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 63358d9f9aa9..2d8b1d602ff3 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -209,6 +209,10 @@ EXPORT_SYMBOL(totalram_pages);
>  static char * const zone_names[MAX_NR_ZONES] = {
>  #ifdef CONFIG_ZONE_DMA
>  	 "DMA",
> +#else
> +#ifdef CONFIG_ZONE_DEVICE
> +	 "Device",
> +#endif
>  #endif
>  #ifdef CONFIG_ZONE_DMA32
>  	 "DMA32",
> @@ -218,9 +222,6 @@ static char * const zone_names[MAX_NR_ZONES] = {
>  	 "HighMem",
>  #endif
>  	 "Movable",
> -#ifdef CONFIG_ZONE_DEVICE
> -	 "Device",
> -#endif
>  };
>  
>  compound_page_dtor * const compound_page_dtors[] = {
> @@ -5082,6 +5083,8 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
>  						  node_start_pfn, node_end_pfn,
>  						  zholes_size);
>  		zone->spanned_pages = size;
> +		zone->init_spanned_pages = size;
> +		zone->zone_dyn_start_pfn = 0;
>  		zone->present_pages = real_size;
>  
>  		totalpages += size;
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>



[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Announce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]