From: Oscar Salvador <osalvador@xxxxxxx> Currently, we decrement zone/node spanned_pages when we __remove__ the memory. This is not really great. Since incrementing of spanned pages is done in the online_pages() path, decrementing spanned pages should be moved to the offline_pages() path. This, besides making the core more consistent, helps in fixing the bug explained in the cover letter, since from now on, we will not touch any pages in the remove_memory path. This patch does all the work to accomplish that. It removes __remove_zone() and creates __shrink_pages(), which does pretty much the same. It also changes find_smallest/biggest_section_pfn to check for !online_section(ms) instead of !valid_section(ms), as we offline the section in the call chain __offline_pages() -> offline_isolated_pages() -> offline_isolated_pages_cb() -> __offline_isolated_pages() -> offline_mem_sections(), right before shrinking the pages. Signed-off-by: Oscar Salvador <osalvador@xxxxxxx> --- arch/ia64/mm/init.c | 4 +-- arch/powerpc/mm/mem.c | 11 +------ arch/sh/mm/init.c | 4 +-- arch/x86/mm/init_32.c | 4 +-- arch/x86/mm/init_64.c | 8 +----- include/linux/memory_hotplug.h | 6 ++-- kernel/memremap.c | 5 ++++ mm/hmm.c | 2 +- mm/memory_hotplug.c | 65 +++++++++++++++++++++--------------------- mm/sparse.c | 4 +-- 10 files changed, 50 insertions(+), 63 deletions(-) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index bc5e15045cee..0029c4d3f574 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -664,11 +664,9 @@ int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct zone *zone; int ret; - zone = page_zone(pfn_to_page(start_pfn)); - ret = __remove_pages(zone, start_pfn, nr_pages, altmap); + ret = __remove_pages(nid, start_pfn, nr_pages, altmap); if (ret) pr_warn("%s: Problem encountered in __remove_pages() as" " ret=%d\n", __func__, ret); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 
9e17d57a9948..12879191bc6b 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -143,18 +143,9 @@ int __meminit arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altma { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct page *page; int ret; - /* - * If we have an altmap then we need to skip over any reserved PFNs - * when querying the zone. - */ - page = pfn_to_page(start_pfn); - if (altmap) - page += vmem_altmap_offset(altmap); - - ret = __remove_pages(page_zone(page), start_pfn, nr_pages, altmap); + ret = __remove_pages(nid, start_pfn, nr_pages, altmap); if (ret) return ret; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 55c740ab861b..4f183857b522 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -458,11 +458,9 @@ int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; - struct zone *zone; int ret; - zone = page_zone(pfn_to_page(start_pfn)); - ret = __remove_pages(zone, start_pfn, nr_pages, altmap); + ret = __remove_pages(nid, start_pfn, nr_pages, altmap); if (unlikely(ret)) pr_warn("%s: Failed, __remove_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9fa503f2f56c..88909e88c873 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -865,10 +865,8 @@ int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct zone *zone; - zone = page_zone(pfn_to_page(start_pfn)); - return __remove_pages(zone, start_pfn, nr_pages, altmap); + return __remove_pages(nid, start_pfn, nr_pages, altmap); } #endif #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 26728df07072..1aad5ea2e274 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1152,15 +1152,9 @@ int 
__ref arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *a { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct page *page = pfn_to_page(start_pfn); - struct zone *zone; int ret; - /* With altmap the first mapped page is offset from @start */ - if (altmap) - page += vmem_altmap_offset(altmap); - zone = page_zone(page); - ret = __remove_pages(zone, start_pfn, nr_pages, altmap); + ret = __remove_pages(nid, start_pfn, nr_pages, altmap); WARN_ON_ONCE(ret); kernel_physical_mapping_remove(start, start + size); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index c68b03fd87bd..eff460ba3ab1 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -109,8 +109,10 @@ static inline bool movable_node_is_enabled(void) #ifdef CONFIG_MEMORY_HOTREMOVE extern int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap); -extern int __remove_pages(struct zone *zone, unsigned long start_pfn, +extern int __remove_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); +extern void shrink_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages); #endif /* CONFIG_MEMORY_HOTREMOVE */ /* reasonably generic interface to expand the physical pages */ @@ -333,7 +335,7 @@ extern bool is_memblock_offlined(struct memory_block *mem); extern void remove_memory(int nid, u64 start, u64 size); extern int sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn, struct vmem_altmap *altmap); -extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, +extern void sparse_remove_one_section(int nid, struct mem_section *ms, unsigned long map_offset, struct vmem_altmap *altmap); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); diff --git a/kernel/memremap.c b/kernel/memremap.c index 7a832b844f24..2057af5c1c4f 100644 --- 
a/kernel/memremap.c +++ b/kernel/memremap.c @@ -121,6 +121,7 @@ static void devm_memremap_pages_release(void *data) struct resource *res = &pgmap->res; resource_size_t align_start, align_size; unsigned long pfn; + struct page *page; int nid; for_each_device_pfn(pfn, pgmap) @@ -138,6 +139,10 @@ static void devm_memremap_pages_release(void *data) nid = dev_to_node(dev); mem_hotplug_begin(); + page = pfn_to_page(pfn); + if (pgmap->altmap_valid) + page += vmem_altmap_offset(&pgmap->altmap); + shrink_pages(page_zone(page), pfn, align_size >> PAGE_SHIFT); arch_remove_memory(nid, align_start, align_size, pgmap->altmap_valid ? &pgmap->altmap : NULL); kasan_remove_zero_shadow(__va(align_start), align_size); diff --git a/mm/hmm.c b/mm/hmm.c index 21787e480b4a..b2f2dcadb5fb 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1012,7 +1012,7 @@ static void hmm_devmem_release(struct device *dev, void *data) mem_hotplug_begin(); if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) - __remove_pages(zone, start_pfn, npages, NULL); + __remove_pages(nid, start_pfn, npages, NULL); else arch_remove_memory(nid, start_pfn << PAGE_SHIFT, npages << PAGE_SHIFT, NULL); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9bd629944c91..e33555651e46 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -320,12 +320,10 @@ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { - struct mem_section *ms; - for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) { - ms = __pfn_to_section(start_pfn); + struct mem_section *ms = __pfn_to_section(start_pfn); - if (unlikely(!valid_section(ms))) + if (unlikely(!online_section(ms))) continue; if (unlikely(pfn_to_nid(start_pfn) != nid)) @@ -345,15 +343,14 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { - struct mem_section *ms; unsigned long pfn; /* pfn is the end pfn of a memory section. 
*/ pfn = end_pfn - 1; for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) { - ms = __pfn_to_section(pfn); + struct mem_section *ms = __pfn_to_section(pfn); - if (unlikely(!valid_section(ms))) + if (unlikely(!online_section(ms))) continue; if (unlikely(pfn_to_nid(pfn) != nid)) @@ -415,7 +412,7 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) { ms = __pfn_to_section(pfn); - if (unlikely(!valid_section(ms))) + if (unlikely(!online_section(ms))) continue; if (page_zone(pfn_to_page(pfn)) != zone) @@ -483,7 +480,7 @@ static void shrink_pgdat_span(struct pglist_data *pgdat, for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) { ms = __pfn_to_section(pfn); - if (unlikely(!valid_section(ms))) + if (unlikely(!online_section(ms))) continue; if (pfn_to_nid(pfn) != nid) @@ -502,19 +499,32 @@ static void shrink_pgdat_span(struct pglist_data *pgdat, pgdat->node_spanned_pages = 0; } -static void __remove_zone(struct zone *zone, unsigned long start_pfn) +static void __shrink_pages(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn, + unsigned long offlined_pages) { struct pglist_data *pgdat = zone->zone_pgdat; int nr_pages = PAGES_PER_SECTION; unsigned long flags; + unsigned long pfn; - pgdat_resize_lock(zone->zone_pgdat, &flags); - shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); - shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages); - pgdat_resize_unlock(zone->zone_pgdat, &flags); + clear_zone_contiguous(zone); + pgdat_resize_lock(pgdat, &flags); + for(pfn = start_pfn; pfn < end_pfn; pfn += nr_pages) { + shrink_zone_span(zone, pfn, pfn + nr_pages); + shrink_pgdat_span(pgdat, pfn, pfn + nr_pages); + } + pgdat->node_present_pages -= offlined_pages; + pgdat_resize_unlock(pgdat, &flags); + set_zone_contiguous(zone); } -static int __remove_section(struct zone *zone, struct mem_section *ms, +void shrink_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) 
+{ + __shrink_pages(zone, start_pfn, start_pfn + nr_pages, nr_pages); +} + +static int __remove_section(int nid, struct mem_section *ms, unsigned long map_offset, struct vmem_altmap *altmap) { unsigned long start_pfn; @@ -530,15 +540,14 @@ static int __remove_section(struct zone *zone, struct mem_section *ms, scn_nr = __section_nr(ms); start_pfn = section_nr_to_pfn((unsigned long)scn_nr); - __remove_zone(zone, start_pfn); - sparse_remove_one_section(zone, ms, map_offset, altmap); + sparse_remove_one_section(nid, ms, map_offset, altmap); return 0; } /** * __remove_pages() - remove sections of pages from a zone - * @zone: zone from which pages need to be removed + * @nid: node which pages belong to * @phys_start_pfn: starting pageframe (must be aligned to start of a section) * @nr_pages: number of pages to remove (must be multiple of section size) * @altmap: alternative device page map or %NULL if default memmap is used @@ -548,7 +557,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms, * sure that pages are marked reserved and zones are adjust properly by * calling offline_pages(). 
*/ -int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, +int __remove_pages(int nid, unsigned long phys_start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { unsigned long i; @@ -556,10 +565,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, int sections_to_remove, ret = 0; /* In the ZONE_DEVICE case device driver owns the memory region */ - if (is_dev_zone(zone)) { - if (altmap) - map_offset = vmem_altmap_offset(altmap); - } else { + if (altmap) + map_offset = vmem_altmap_offset(altmap); + else { resource_size_t start, size; start = phys_start_pfn << PAGE_SHIFT; @@ -575,8 +583,6 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, } } - clear_zone_contiguous(zone); - /* * We can only remove entire sections */ @@ -587,15 +593,13 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, for (i = 0; i < sections_to_remove; i++) { unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; - ret = __remove_section(zone, __pfn_to_section(pfn), map_offset, + ret = __remove_section(nid, __pfn_to_section(pfn), map_offset, altmap); map_offset = 0; if (ret) break; } - set_zone_contiguous(zone); - return ret; } #endif /* CONFIG_MEMORY_HOTREMOVE */ @@ -1595,7 +1599,6 @@ static int __ref __offline_pages(unsigned long start_pfn, unsigned long pfn, nr_pages; long offlined_pages; int ret, node; - unsigned long flags; unsigned long valid_start, valid_end; struct zone *zone; struct memory_notify arg; @@ -1667,9 +1670,7 @@ static int __ref __offline_pages(unsigned long start_pfn, adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages); zone->present_pages -= offlined_pages; - pgdat_resize_lock(zone->zone_pgdat, &flags); - zone->zone_pgdat->node_present_pages -= offlined_pages; - pgdat_resize_unlock(zone->zone_pgdat, &flags); + __shrink_pages(zone, valid_start, valid_end, offlined_pages); init_per_zone_wmark_min(); diff --git a/mm/sparse.c b/mm/sparse.c index 10b07eea9a6e..016020bd20b5 100644 --- 
a/mm/sparse.c +++ b/mm/sparse.c @@ -766,12 +766,12 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap, free_map_bootmem(memmap); } -void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, +void sparse_remove_one_section(int nid, struct mem_section *ms, unsigned long map_offset, struct vmem_altmap *altmap) { struct page *memmap = NULL; unsigned long *usemap = NULL, flags; - struct pglist_data *pgdat = zone->zone_pgdat; + struct pglist_data *pgdat = NODE_DATA(nid); pgdat_resize_lock(pgdat, &flags); if (ms->section_mem_map) { -- 2.13.6