The patch titled Subject: mm, memory_hotplug: move zone/pages handling to offline stage has been removed from the -mm tree. Its filename was mm-memory_hotplug-move-zone-pages-handling-to-offline-stage.patch This patch was dropped because an updated version will be merged ------------------------------------------------------ From: Oscar Salvador <osalvador@xxxxxxxx> Subject: mm, memory_hotplug: move zone/pages handling to offline stage The current implementation accesses pages during the hot-remove stage in order to get the zone linked to this memory range. We use that zone to a) check whether the zone is ZONE_DEVICE and b) shrink the zone's spanned pages. Accessing pages during this stage is problematic, as we might be accessing pages that were never initialized if the memory was not onlined before being removed. The only reason to check for ZONE_DEVICE in __remove_pages is to bypass the call to release_mem_region_adjustable(), since these regions are removed with devm_release_mem_region. With patch#2, this is no longer a problem, so we can safely call release_mem_region_adjustable(). release_mem_region_adjustable() will spot that the region we are trying to remove was acquired by means of devm_request_mem_region, and will back off safely. This allows us to remove all zone-related operations from the hot-remove stage. Because of this, the zone's spanned pages are now shrunk during the offlining stage in shrink_zone(). It would have been great to also decrease the spanned pages for the node there, but we need them in try_offline_node(). So we still decrease spanned pages for the node in the hot-remove stage. The only particularity is that find_smallest_section_pfn/find_biggest_section_pfn, when called from shrink_zone_span, will now check for online sections instead of valid sections. To make this work with devm/HMM code, we need to call online_mem_sections when adding memory and offline_mem_sections when releasing it in that code path. 
Link: http://lkml.kernel.org/r/20181127162005.15833-4-osalvador@xxxxxxx Signed-off-by: Oscar Salvador <osalvador@xxxxxxx> Cc: Dan Williams <dan.j.williams@xxxxxxxxx> Cc: David Hildenbrand <david@xxxxxxxxxx> Cc: Jerome Glisse <jglisse@xxxxxxxxxx> Cc: Jonathan Cameron <Jonathan.Cameron@xxxxxxxxxx> Cc: Michal Hocko <mhocko@xxxxxxxx> Cc: Pavel Tatashin <pasha.tatashin@xxxxxxxxxx> Cc: "Rafael J. Wysocki" <rafael@xxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- arch/powerpc/mm/mem.c | 11 --- arch/sh/mm/init.c | 4 - arch/x86/mm/init_32.c | 3 arch/x86/mm/init_64.c | 8 -- include/linux/memory_hotplug.h | 8 +- kernel/memremap.c | 14 +++- mm/memory_hotplug.c | 95 +++++++++++++++++-------------- mm/sparse.c | 4 - 8 files changed, 76 insertions(+), 71 deletions(-) --- a/arch/powerpc/mm/mem.c~mm-memory_hotplug-move-zone-pages-handling-to-offline-stage +++ a/arch/powerpc/mm/mem.c @@ -144,18 +144,9 @@ int __meminit arch_remove_memory(int nid { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct page *page; int ret; - /* - * If we have an altmap then we need to skip over any reserved PFNs - * when querying the zone. 
- */ - page = pfn_to_page(start_pfn); - if (altmap) - page += vmem_altmap_offset(altmap); - - ret = __remove_pages(page_zone(page), start_pfn, nr_pages, altmap); + ret = remove_sections(nid, start_pfn, nr_pages, altmap); if (ret) return ret; --- a/arch/sh/mm/init.c~mm-memory_hotplug-move-zone-pages-handling-to-offline-stage +++ a/arch/sh/mm/init.c @@ -447,11 +447,9 @@ int arch_remove_memory(int nid, u64 star { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; - struct zone *zone; int ret; - zone = page_zone(pfn_to_page(start_pfn)); - ret = __remove_pages(zone, start_pfn, nr_pages, altmap); + ret = remove_sections(nid, start_pfn, nr_pages, altmap); if (unlikely(ret)) pr_warn("%s: Failed, __remove_pages() == %d\n", __func__, ret); --- a/arch/x86/mm/init_32.c~mm-memory_hotplug-move-zone-pages-handling-to-offline-stage +++ a/arch/x86/mm/init_32.c @@ -866,8 +866,7 @@ int arch_remove_memory(int nid, u64 star unsigned long nr_pages = size >> PAGE_SHIFT; struct zone *zone; - zone = page_zone(pfn_to_page(start_pfn)); - return __remove_pages(zone, start_pfn, nr_pages, altmap); + return remove_sections(nid, start_pfn, nr_pages, altmap); } #endif #endif --- a/arch/x86/mm/init_64.c~mm-memory_hotplug-move-zone-pages-handling-to-offline-stage +++ a/arch/x86/mm/init_64.c @@ -1152,15 +1152,9 @@ int __ref arch_remove_memory(int nid, u6 { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct page *page = pfn_to_page(start_pfn); - struct zone *zone; int ret; - /* With altmap the first mapped page is offset from @start */ - if (altmap) - page += vmem_altmap_offset(altmap); - zone = page_zone(page); - ret = __remove_pages(zone, start_pfn, nr_pages, altmap); + ret = remove_sections(nid, start_pfn, nr_pages, altmap); WARN_ON_ONCE(ret); kernel_physical_mapping_remove(start, start + size); --- a/include/linux/memory_hotplug.h~mm-memory_hotplug-move-zone-pages-handling-to-offline-stage +++ 
a/include/linux/memory_hotplug.h @@ -109,8 +109,10 @@ static inline bool movable_node_is_enabl #ifdef CONFIG_MEMORY_HOTREMOVE extern int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap); -extern int __remove_pages(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap); +extern int remove_sections(int nid, unsigned long start_pfn, + unsigned long nr_pages, struct vmem_altmap *altmap); +extern void shrink_zone(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn, unsigned long offlined_pages); #endif /* CONFIG_MEMORY_HOTREMOVE */ /* reasonably generic interface to expand the physical pages */ @@ -335,7 +337,7 @@ extern int offline_pages(unsigned long s extern bool is_memblock_offlined(struct memory_block *mem); extern int sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn, struct vmem_altmap *altmap); -extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, +extern void sparse_remove_one_section(int nid, struct mem_section *ms, unsigned long map_offset, struct vmem_altmap *altmap); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); --- a/kernel/memremap.c~mm-memory_hotplug-move-zone-pages-handling-to-offline-stage +++ a/kernel/memremap.c @@ -87,6 +87,7 @@ static void devm_memremap_pages_release( struct resource *res = &pgmap->res; resource_size_t align_start, align_size; unsigned long pfn; + unsigned long nr_pages; int nid; pgmap->kill(pgmap->ref); @@ -101,10 +102,14 @@ static void devm_memremap_pages_release( nid = page_to_nid(pfn_to_page(align_start >> PAGE_SHIFT)); mem_hotplug_begin(); + + pfn = align_start >> PAGE_SHIFT; + nr_pages = align_size >> PAGE_SHIFT; + offline_mem_sections(pfn, pfn + nr_pages); + shrink_zone(page_zone(pfn_to_page(pfn)), pfn, pfn + nr_pages, nr_pages); + if (pgmap->type == MEMORY_DEVICE_PRIVATE) { - pfn = align_start >> PAGE_SHIFT; - 
__remove_pages(page_zone(pfn_to_page(pfn)), pfn, - align_size >> PAGE_SHIFT, NULL); + remove_sections(nid, pfn, nr_pages, NULL); } else { arch_remove_memory(nid, align_start, align_size, pgmap->altmap_valid ? &pgmap->altmap : NULL); @@ -224,7 +229,10 @@ void *devm_memremap_pages(struct device if (!error) { struct zone *zone; + unsigned long pfn = align_start >> PAGE_SHIFT; + unsigned long nr_pages = align_size >> PAGE_SHIFT; + online_mem_sections(pfn, pfn + nr_pages); zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT, align_size >> PAGE_SHIFT, altmap); --- a/mm/memory_hotplug.c~mm-memory_hotplug-move-zone-pages-handling-to-offline-stage +++ a/mm/memory_hotplug.c @@ -314,6 +314,17 @@ out: } #ifdef CONFIG_MEMORY_HOTREMOVE +static bool is_section_ok(struct mem_section *ms, bool zone) +{ + /* + * We cannot shrink pgdat's spanned because we use them + * in try_offline_node to check if all sections were removed. + */ + if (zone) + return online_section(ms); + else + return valid_section(ms); +} /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, @@ -324,7 +335,7 @@ static unsigned long find_smallest_secti for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) { ms = __pfn_to_section(start_pfn); - if (unlikely(!valid_section(ms))) + if (!is_section_ok(ms, !!zone)) continue; if (unlikely(pfn_to_nid(start_pfn) != nid)) @@ -352,7 +363,7 @@ static unsigned long find_biggest_sectio for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) { ms = __pfn_to_section(pfn); - if (unlikely(!valid_section(ms))) + if (!is_section_ok(ms, !!zone)) continue; if (unlikely(pfn_to_nid(pfn) != nid)) @@ -414,7 +425,7 @@ static void shrink_zone_span(struct zone for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) { ms = __pfn_to_section(pfn); - if (unlikely(!valid_section(ms))) + if (unlikely(!online_section(ms))) continue; if 
(page_zone(pfn_to_page(pfn)) != zone) @@ -501,23 +512,33 @@ static void shrink_pgdat_span(struct pgl pgdat->node_spanned_pages = 0; } -static void __remove_zone(struct zone *zone, unsigned long start_pfn) +void shrink_zone(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn, unsigned long offlined_pages) { - struct pglist_data *pgdat = zone->zone_pgdat; int nr_pages = PAGES_PER_SECTION; + unsigned long pfn; + + clear_zone_contiguous(zone); + for (pfn = start_pfn; pfn < end_pfn; pfn += nr_pages) + shrink_zone_span(zone, pfn, pfn + nr_pages); + set_zone_contiguous(zone); +} + +static void shrink_pgdat(int nid, unsigned long sect_nr) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + int nr_pages = PAGES_PER_SECTION; + unsigned long pfn = section_nr_to_pfn((unsigned long)sect_nr); unsigned long flags; - pgdat_resize_lock(zone->zone_pgdat, &flags); - shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); - shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages); - pgdat_resize_unlock(zone->zone_pgdat, &flags); + pgdat_resize_lock(pgdat, &flags); + shrink_pgdat_span(pgdat, pfn, pfn + nr_pages); + pgdat_resize_unlock(pgdat, &flags); } -static int __remove_section(struct zone *zone, struct mem_section *ms, +static int __remove_section(int nid, struct mem_section *ms, unsigned long map_offset, struct vmem_altmap *altmap) { - unsigned long start_pfn; - int scn_nr; int ret = -EINVAL; if (!valid_section(ms)) @@ -527,17 +548,15 @@ static int __remove_section(struct zone if (ret) return ret; - scn_nr = __section_nr(ms); - start_pfn = section_nr_to_pfn((unsigned long)scn_nr); - __remove_zone(zone, start_pfn); + shrink_pgdat(nid, __section_nr(ms)); - sparse_remove_one_section(zone, ms, map_offset, altmap); + sparse_remove_one_section(nid, ms, map_offset, altmap); return 0; } /** - * __remove_pages() - remove sections of pages from a zone - * @zone: zone from which pages need to be removed + * __remove_pages() - remove sections of pages from a nid + * @nid: nid 
from which pages belong to * @phys_start_pfn: starting pageframe (must be aligned to start of a section) * @nr_pages: number of pages to remove (must be multiple of section size) * @altmap: alternative device page map or %NULL if default memmap is used @@ -547,35 +566,28 @@ static int __remove_section(struct zone * sure that pages are marked reserved and zones are adjust properly by * calling offline_pages(). */ -int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, +int remove_sections(int nid, unsigned long phys_start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { unsigned long i; unsigned long map_offset = 0; int sections_to_remove, ret = 0; + resource_size_t start, size; - /* In the ZONE_DEVICE case device driver owns the memory region */ - if (is_dev_zone(zone)) { - if (altmap) - map_offset = vmem_altmap_offset(altmap); - } else { - resource_size_t start, size; - - start = phys_start_pfn << PAGE_SHIFT; - size = nr_pages * PAGE_SIZE; + start = phys_start_pfn << PAGE_SHIFT; + size = nr_pages * PAGE_SIZE; - ret = release_mem_region_adjustable(&iomem_resource, start, - size); - if (ret) { - resource_size_t endres = start + size - 1; + if (altmap) + map_offset = vmem_altmap_offset(altmap); - pr_warn("Unable to release resource <%pa-%pa> (%d)\n", - &start, &endres, ret); - } + ret = release_mem_region_adjustable(&iomem_resource, start, + size); + if (ret) { + resource_size_t endres = start + size - 1; + pr_warn("Unable to release resource <%pa-%pa> (%d)\n", + &start, &endres, ret); } - clear_zone_contiguous(zone); - /* * We can only remove entire sections */ @@ -587,15 +599,13 @@ int __remove_pages(struct zone *zone, un unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; cond_resched(); - ret = __remove_section(zone, __pfn_to_section(pfn), map_offset, - altmap); + ret = __remove_section(nid, __pfn_to_section(pfn), map_offset, + altmap); map_offset = 0; if (ret) break; } - set_zone_contiguous(zone); - return ret; } #endif /* 
CONFIG_MEMORY_HOTREMOVE */ @@ -1635,11 +1645,14 @@ repeat: /* reset pagetype flags and makes migrate type to be MOVABLE */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); /* removal success */ + + /* Shrink zone's managed,spanned and zone/pgdat's present pages */ adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages); zone->present_pages -= offlined_pages; pgdat_resize_lock(zone->zone_pgdat, &flags); zone->zone_pgdat->node_present_pages -= offlined_pages; + shrink_zone(zone, valid_start, valid_end, offlined_pages); pgdat_resize_unlock(zone->zone_pgdat, &flags); init_per_zone_wmark_min(); --- a/mm/sparse.c~mm-memory_hotplug-move-zone-pages-handling-to-offline-stage +++ a/mm/sparse.c @@ -790,12 +790,12 @@ static void free_section_usemap(struct p free_map_bootmem(memmap); } -void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, +void sparse_remove_one_section(int nid, struct mem_section *ms, unsigned long map_offset, struct vmem_altmap *altmap) { struct page *memmap = NULL; unsigned long *usemap = NULL, flags; - struct pglist_data *pgdat = zone->zone_pgdat; + struct pglist_data *pgdat = NODE_DATA(nid); pgdat_resize_lock(pgdat, &flags); if (ms->section_mem_map) { _ Patches currently in -mm which might be from osalvador@xxxxxxxx are mm-memory_hotplug-add-nid-parameter-to-arch_remove_memory.patch mm-memory-hotplug-rework-unregister_mem_sect_under_nodes.patch mm-memory_hotplug-refactor-shrink_zone-pgdat_span.patch