On 26.1.2016 1:06, Dan Williams wrote:
> It appears devices requiring ZONE_DMA are still prevalent (see link
> below).  For this reason the proposal to require turning off ZONE_DMA
> to enable ZONE_DEVICE is untenable in the short term.  We want a
> single kernel image to be able to support legacy devices as well as
> next generation persistent memory platforms.
>
> Towards this end, alias ZONE_DMA and ZONE_DEVICE to work around
> needing to maintain a unique zone number for ZONE_DEVICE.  Record the
> geometry of ZONE_DMA at init (->init_spanned_pages) and use that
> information in is_zone_device_page() to differentiate pages allocated
> via devm_memremap_pages() vs true ZONE_DMA pages.  Otherwise, use the
> simpler definition of is_zone_device_page() when ZONE_DMA is turned
> off.
>
> Note that this also teaches the memory hot remove path that the zone
> may not have sections for all pfn spans (->zone_dyn_start_pfn).
>
> A user visible implication of this change is potentially an
> unexpectedly high "spanned" value in /proc/zoneinfo for the DMA zone.

[+CC Joonsoo, Laura]

Sounds like quite a hack :( Would it be possible to instead extend the
number of page->flags bits used to encode the zone? Potentially,
ZONE_CMA could also be added one day... There's a rough sketch of what
I mean below the quoted patch.

> Cc: H. Peter Anvin <hpa@xxxxxxxxx>
> Cc: Ingo Molnar <mingo@xxxxxxxxxx>
> Cc: Rik van Riel <riel@xxxxxxxxxx>
> Cc: Mel Gorman <mgorman@xxxxxxx>
> Cc: Jerome Glisse <j.glisse@xxxxxxxxx>
> Cc: Christoph Hellwig <hch@xxxxxx>
> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=110931
> Fixes: 033fbae988fc ("mm: ZONE_DEVICE for "device memory"")
> Reported-by: Sudip Mukherjee <sudipm.mukherjee@xxxxxxxxx>
> Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
> ---
>  include/linux/mm.h     | 46 ++++++++++++++++++++++++++++++++--------------
>  include/linux/mmzone.h | 24 ++++++++++++++++++++----
>  mm/Kconfig             |  1 -
>  mm/memory_hotplug.c    | 15 +++++++++++----
>  mm/page_alloc.c        |  9 ++++++---
>  5 files changed, 69 insertions(+), 26 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index f1cd22f2df1a..b4bccd3d3c41 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -664,12 +664,44 @@ static inline enum zone_type page_zonenum(const struct page *page)
>  	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
>  }
>
> +#ifdef NODE_NOT_IN_PAGE_FLAGS
> +extern int page_to_nid(const struct page *page);
> +#else
> +static inline int page_to_nid(const struct page *page)
> +{
> +	return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
> +}
> +#endif
> +
> +static inline struct zone *page_zone(const struct page *page)
> +{
> +	return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
> +}
> +
>  #ifdef CONFIG_ZONE_DEVICE
>  void get_zone_device_page(struct page *page);
>  void put_zone_device_page(struct page *page);
>  static inline bool is_zone_device_page(const struct page *page)
>  {
> +#ifndef CONFIG_ZONE_DMA
>  	return page_zonenum(page) == ZONE_DEVICE;
> +#else /* ZONE_DEVICE == ZONE_DMA */
> +	struct zone *zone;
> +
> +	if (page_zonenum(page) != ZONE_DEVICE)
> +		return false;
> +
> +	/*
> +	 * If ZONE_DEVICE is aliased with ZONE_DMA we need to check
> +	 * whether this was a dynamically allocated page from
> +	 * devm_memremap_pages() by checking against the size of
> +	 * ZONE_DMA at boot.
> +	 */
> +	zone = page_zone(page);
> +	if (page_to_pfn(page) <= zone_end_pfn_boot(zone))
> +		return false;
> +	return true;
> +#endif
>  }
>  #else
>  static inline void get_zone_device_page(struct page *page)
> @@ -735,15 +767,6 @@ static inline int zone_to_nid(struct zone *zone)
>  #endif
>  }
>
> -#ifdef NODE_NOT_IN_PAGE_FLAGS
> -extern int page_to_nid(const struct page *page);
> -#else
> -static inline int page_to_nid(const struct page *page)
> -{
> -	return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
> -}
> -#endif
> -
>  #ifdef CONFIG_NUMA_BALANCING
>  static inline int cpu_pid_to_cpupid(int cpu, int pid)
>  {
> @@ -857,11 +880,6 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
>  }
>  #endif /* CONFIG_NUMA_BALANCING */
>
> -static inline struct zone *page_zone(const struct page *page)
> -{
> -	return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
> -}
> -
>  #ifdef SECTION_IN_PAGE_FLAGS
>  static inline void set_page_section(struct page *page, unsigned long section)
>  {
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 33bb1b19273e..a0ef09b7f893 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -288,6 +288,13 @@ enum zone_type {
>  	 */
>  	ZONE_DMA,
>  #endif
> +#ifdef CONFIG_ZONE_DEVICE
> +#ifndef CONFIG_ZONE_DMA
> +	ZONE_DEVICE,
> +#else
> +	ZONE_DEVICE = ZONE_DMA,
> +#endif
> +#endif
>  #ifdef CONFIG_ZONE_DMA32
>  	/*
>  	 * x86_64 needs two ZONE_DMAs because it supports devices that are
> @@ -314,11 +321,7 @@ enum zone_type {
>  	ZONE_HIGHMEM,
>  #endif
>  	ZONE_MOVABLE,
> -#ifdef CONFIG_ZONE_DEVICE
> -	ZONE_DEVICE,
> -#endif
>  	__MAX_NR_ZONES
> -
>  };
>
>  #ifndef __GENERATING_BOUNDS_H
> @@ -379,12 +382,19 @@ struct zone {
>
>  	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
>  	unsigned long		zone_start_pfn;
> +	/* first dynamically added pfn of the zone */
> +	unsigned long		zone_dyn_start_pfn;
>
>  	/*
>  	 * spanned_pages is the total pages spanned by the zone, including
>  	 * holes, which is calculated as:
>  	 *	spanned_pages = zone_end_pfn - zone_start_pfn;
>  	 *
> +	 * init_spanned_pages is the boot/init time total pages spanned
> +	 * by the zone for differentiating statically assigned vs
> +	 * dynamically hot added memory to a zone.
> +	 * init_spanned_pages = init_zone_end_pfn - zone_start_pfn;
> +	 *
>  	 * present_pages is physical pages existing within the zone, which
>  	 * is calculated as:
>  	 *	present_pages = spanned_pages - absent_pages(pages in holes);
> @@ -423,6 +433,7 @@ struct zone {
>  	 */
>  	unsigned long		managed_pages;
>  	unsigned long		spanned_pages;
> +	unsigned long		init_spanned_pages;
>  	unsigned long		present_pages;
>
>  	const char		*name;
> @@ -546,6 +557,11 @@ static inline unsigned long zone_end_pfn(const struct zone *zone)
>  	return zone->zone_start_pfn + zone->spanned_pages;
>  }
>
> +static inline unsigned long zone_end_pfn_boot(const struct zone *zone)
> +{
> +	return zone->zone_start_pfn + zone->init_spanned_pages;
> +}
> +
>  static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
>  {
>  	return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 97a4e06b15c0..08a92a9c8fbd 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -652,7 +652,6 @@ config IDLE_PAGE_TRACKING
>  config ZONE_DEVICE
>  	bool "Device memory (pmem, etc...) hotplug support" if EXPERT
>  	default !ZONE_DMA
> -	depends on !ZONE_DMA
>  	depends on MEMORY_HOTPLUG
>  	depends on MEMORY_HOTREMOVE
>  	depends on X86_64 #arch_add_memory() comprehends device memory
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 4af58a3a8ffa..c3f0ff45bd47 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -300,6 +300,8 @@ static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn,
>
>  	zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
>  				zone->zone_start_pfn;
> +	if (!zone->zone_dyn_start_pfn || start_pfn < zone->zone_dyn_start_pfn)
> +		zone->zone_dyn_start_pfn = start_pfn;
>
>  	zone_span_writeunlock(zone);
>  }
> @@ -601,8 +603,9 @@ static int find_biggest_section_pfn(int nid, struct zone *zone,
>  static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
>  			     unsigned long end_pfn)
>  {
> -	unsigned long zone_start_pfn = zone->zone_start_pfn;
> +	unsigned long zone_start_pfn = zone->zone_dyn_start_pfn;
>  	unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
> +	bool dyn_zone = zone->zone_start_pfn == zone_start_pfn;
>  	unsigned long zone_end_pfn = z;
>  	unsigned long pfn;
>  	struct mem_section *ms;
> @@ -619,7 +622,9 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
>  		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
>  						zone_end_pfn);
>  		if (pfn) {
> -			zone->zone_start_pfn = pfn;
> +			if (dyn_zone)
> +				zone->zone_start_pfn = pfn;
> +			zone->zone_dyn_start_pfn = pfn;
>  			zone->spanned_pages = zone_end_pfn - pfn;
>  		}
>  	} else if (zone_end_pfn == end_pfn) {
> @@ -661,8 +666,10 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
>  	}
>
>  	/* The zone has no valid section */
> -	zone->zone_start_pfn = 0;
> -	zone->spanned_pages = 0;
> +	if (dyn_zone)
> +		zone->zone_start_pfn = 0;
> +	zone->zone_dyn_start_pfn = 0;
> +	zone->spanned_pages = zone->init_spanned_pages;
>  	zone_span_writeunlock(zone);
>  }
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 63358d9f9aa9..2d8b1d602ff3 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -209,6 +209,10 @@ EXPORT_SYMBOL(totalram_pages);
>  static char * const zone_names[MAX_NR_ZONES] = {
>  #ifdef CONFIG_ZONE_DMA
>  	 "DMA",
> +#else
> +#ifdef CONFIG_ZONE_DEVICE
> +	 "Device",
> +#endif
>  #endif
>  #ifdef CONFIG_ZONE_DMA32
>  	 "DMA32",
> @@ -218,9 +222,6 @@ static char * const zone_names[MAX_NR_ZONES] = {
>  	 "HighMem",
>  #endif
>  	 "Movable",
> -#ifdef CONFIG_ZONE_DEVICE
> -	 "Device",
> -#endif
>  };
>
>  compound_page_dtor * const compound_page_dtors[] = {
> @@ -5082,6 +5083,8 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
>  						  node_start_pfn, node_end_pfn,
>  						  zholes_size);
>  		zone->spanned_pages = size;
> +		zone->init_spanned_pages = size;
> +		zone->zone_dyn_start_pfn = 0;
>  		zone->present_pages = real_size;
>
>  		totalpages += size;