On Wed 24-03-21 13:03:29, Michal Hocko wrote:
> On Wed 24-03-21 11:12:59, Oscar Salvador wrote:
[...]
> > I kind of understand being reluctant to use the vmemmap_pages
> > terminology here, but unfortunately we need to know about it.
> > We could rename nr_vmemmap_pages to offset_buddy_pages or something
> > like that.
>
> I am not convinced. It seems you are just trying to graft the new
> functionality in. But I still believe that {on,off}lining shouldn't care
> about where their vmemmaps come from at all. It should be the
> responsibility of the code which reserves that space to compensate for
> the accounting. Otherwise we will end up with hard-to-maintain code,
> because expectations would be spread over way too many places, not to
> mention the different pfns the code would have to care about.

The below is a quick hack on top of this patch to illustrate my
thinking. I have dug all the vmemmap pieces out of the {on,off}lining
paths and hooked all the accounting in at the point where the space is
reserved. This just compiles (I haven't given it a deeper look), so
there are likely some minor problems, but I didn't run into any major
obstacles or hacks that would have to be introduced into the code. The
separation seems to be possible, and the diffstat looks promising as
well. Am I missing something fundamental here?
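
One remark to make the hack easier to follow: the only state
reserve_vmemmap_space() below has to publish is the vmem_altmap
(base_pfn and free on the add path, alloc on the remove path), because
that is what the sparse vmemmap allocator consumes when it places the
memmap inside the hot-added range itself. Roughly, simplified from the
helpers in mm/sparse-vmemmap.c (for reference only, not part of the
patch):

	static unsigned long vmem_altmap_next_pfn(struct vmem_altmap *altmap)
	{
		/* first pfn of the reserved space that is still unused */
		return altmap->base_pfn + altmap->reserve + altmap->alloc
			+ altmap->align;
	}

	static unsigned long vmem_altmap_nr_free(struct vmem_altmap *altmap)
	{
		unsigned long allocated = altmap->alloc + altmap->align;

		/* pfns of the reserved space that are still available */
		return altmap->free > allocated ? altmap->free - allocated : 0;
	}

All the remaining accounting (present pages, zone association) stays
local to the reservation resp. removal side.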
---
 drivers/base/memory.c          |   8 +--
 include/linux/memory_hotplug.h |   6 +-
 mm/memory_hotplug.c            | 151 ++++++++++++++++++++---------------
 3 files changed, 80 insertions(+), 85 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 5ea2b3fbce02..9697acfe96eb 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -181,15 +181,15 @@ memory_block_action(unsigned long start_section_nr, unsigned long action,
 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 	int ret;
 
-	start_pfn = section_nr_to_pfn(start_section_nr);
+	start_pfn = section_nr_to_pfn(start_section_nr) + nr_vmemmap_pages;
+	nr_pages -= nr_vmemmap_pages;
 
 	switch (action) {
 	case MEM_ONLINE:
-		ret = online_pages(start_pfn, nr_pages, nr_vmemmap_pages,
-				   online_type, nid);
+		ret = online_pages(start_pfn, nr_pages, online_type, nid);
 		break;
 	case MEM_OFFLINE:
-		ret = offline_pages(start_pfn, nr_pages, nr_vmemmap_pages);
+		ret = offline_pages(start_pfn, nr_pages);
 		break;
 	default:
 		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index a85d4b7d15c2..673d2d4a8443 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -109,8 +109,7 @@ extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
 extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
 /* VM interface that may be used by firmware interface */
 extern int online_pages(unsigned long pfn, unsigned long nr_pages,
-			unsigned long nr_vmemmap_pages, int online_type,
-			int nid);
+			int online_type, int nid);
 extern struct zone *test_pages_in_a_zone(unsigned long start_pfn,
 					 unsigned long end_pfn);
 extern void __offline_isolated_pages(unsigned long start_pfn,
@@ -317,8 +316,7 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
 extern void try_offline_node(int nid);
-extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
-			 unsigned long nr_vmemmap_pages);
+extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
 extern int remove_memory(int nid, u64 start, u64 size);
 extern void __remove_memory(int nid, u64 start, u64 size);
 extern int offline_and_remove_memory(int nid, u64 start, u64 size);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0c3a98cb8cde..754026a9164d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -844,30 +844,19 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
 }
 
 int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
-		       unsigned long nr_vmemmap_pages, int online_type, int nid)
+		       int online_type, int nid)
 {
-	unsigned long flags, buddy_start_pfn, buddy_nr_pages;
+	unsigned long flags;
 	struct zone *zone;
 	int need_zonelists_rebuild = 0;
 	int ret;
 	struct memory_notify arg;
 
-	/* We can only online full sections (e.g., SECTION_IS_ONLINE) */
-	if (WARN_ON_ONCE(!nr_pages ||
-			 !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION)))
-		return -EINVAL;
-
-	buddy_start_pfn = pfn + nr_vmemmap_pages;
-	buddy_nr_pages = nr_pages - nr_vmemmap_pages;
-
 	mem_hotplug_begin();
 
 	/* associate pfn range with the zone */
 	zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
-	if (nr_vmemmap_pages)
-		move_pfn_range_to_zone(zone, pfn, nr_vmemmap_pages, NULL,
-				       MIGRATE_UNMOVABLE);
-	move_pfn_range_to_zone(zone, buddy_start_pfn, buddy_nr_pages, NULL,
+	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL,
 			       MIGRATE_ISOLATE);
 
 	arg.start_pfn = pfn;
@@ -884,7 +873,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 	 * onlining, such that undo_isolate_page_range() works correctly.
 	 */
 	spin_lock_irqsave(&zone->lock, flags);
-	zone->nr_isolate_pageblock += buddy_nr_pages / pageblock_nr_pages;
+	zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
 	spin_unlock_irqrestore(&zone->lock, flags);
 
 	/*
@@ -897,7 +886,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 		setup_zone_pageset(zone);
 	}
 
-	online_pages_range(pfn, nr_pages, buddy_start_pfn);
+	online_pages_range(pfn, nr_pages, pfn);
 	zone->present_pages += nr_pages;
 
 	pgdat_resize_lock(zone->zone_pgdat, &flags);
@@ -910,9 +899,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 	zone_pcp_update(zone);
 
 	/* Basic onlining is complete, allow allocation of onlined pages. */
-	undo_isolate_page_range(buddy_start_pfn,
-				buddy_start_pfn + buddy_nr_pages,
-				MIGRATE_MOVABLE);
+	undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
 
 	/*
 	 * Freshly onlined pages aren't shuffled (e.g., all pages are placed to
@@ -1126,6 +1113,59 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
 	       IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
 }
 
+static void reserve_vmemmap_space(int nid, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap)
+{
+	struct zone *zone = &NODE_DATA(nid)->node_zones[ZONE_NORMAL];
+
+	altmap->free = nr_pages;
+	altmap->base_pfn = pfn;
+
+	/* initialize struct pages and account for this space */
+	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
+}
+
+static void unaccount_vmemmap_space(int nid, unsigned long start_pfn, unsigned long nr_pages)
+{
+	struct zone *zone = &NODE_DATA(nid)->node_zones[ZONE_NORMAL];
+	unsigned long flags;
+
+	adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
+	zone->present_pages -= nr_pages;
+
+	pgdat_resize_lock(zone->zone_pgdat, &flags);
+	zone->zone_pgdat->node_present_pages -= nr_pages;
+	pgdat_resize_unlock(zone->zone_pgdat, &flags);
+
+	remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
+}
+
+static int remove_memory_block_cb(struct memory_block *mem, void *arg)
+{
+	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+	struct vmem_altmap mhp_altmap = {};
+	struct vmem_altmap *altmap = NULL;
+	u64 start = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
+	u64 size = memory_block_size_bytes();
+
+	if (!mem->nr_vmemmap_pages) {
+		arch_remove_memory(mem->nid, start, size, NULL);
+		return 0;
+	}
+
+	/*
+	 * Let remove_pmd_table->free_hugepage_table
+	 * do the right thing if we used vmem_altmap
+	 * when hot-adding the range.
+	 */
+	mhp_altmap.alloc = nr_vmemmap_pages;
+	altmap = &mhp_altmap;
+
+	unaccount_vmemmap_space(mem->nid, PHYS_PFN(start), nr_vmemmap_pages);
+	arch_remove_memory(mem->nid, start, size, altmap);
+
+	return 0;
+}
+
 /*
  * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
  * and online/offline operations (triggered e.g. by sysfs).
@@ -1170,8 +1210,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 			ret = -EINVAL;
 			goto error;
 		}
-		mhp_altmap.free = PHYS_PFN(size);
-		mhp_altmap.base_pfn = PHYS_PFN(start);
+		reserve_vmemmap_space(nid, PHYS_PFN(start), PHYS_PFN(size), &mhp_altmap);
 		params.altmap = &mhp_altmap;
 	}
 
@@ -1639,25 +1678,16 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
 	return 0;
 }
 
-int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
-			unsigned long nr_vmemmap_pages)
+int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
 	const unsigned long end_pfn = start_pfn + nr_pages;
-	unsigned long pfn, buddy_start_pfn, buddy_nr_pages, system_ram_pages = 0;
+	unsigned long pfn, system_ram_pages = 0;
 	unsigned long flags;
 	struct zone *zone;
 	struct memory_notify arg;
 	int ret, node;
 	char *reason;
 
-	/* We can only offline full sections (e.g., SECTION_IS_ONLINE) */
-	if (WARN_ON_ONCE(!nr_pages ||
-			 !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION)))
-		return -EINVAL;
-
-	buddy_start_pfn = start_pfn + nr_vmemmap_pages;
-	buddy_nr_pages = nr_pages - nr_vmemmap_pages;
-
 	mem_hotplug_begin();
 
 	/*
@@ -1693,7 +1723,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 	zone_pcp_disable(zone);
 
 	/* set above range as isolated */
-	ret = start_isolate_page_range(buddy_start_pfn, end_pfn,
+	ret = start_isolate_page_range(start_pfn, end_pfn,
 				       MIGRATE_MOVABLE,
 				       MEMORY_OFFLINE | REPORT_FAILURE);
 	if (ret) {
@@ -1713,7 +1743,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 	}
 
 	do {
-		pfn = buddy_start_pfn;
+		pfn = start_pfn;
 		do {
 			if (signal_pending(current)) {
 				ret = -EINTR;
@@ -1744,18 +1774,18 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 		 * offlining actually in order to make hugetlbfs's object
 		 * counting consistent.
 		 */
-		ret = dissolve_free_huge_pages(buddy_start_pfn, end_pfn);
+		ret = dissolve_free_huge_pages(start_pfn, end_pfn);
 		if (ret) {
 			reason = "failure to dissolve huge pages";
 			goto failed_removal_isolated;
 		}
 
-		ret = test_pages_isolated(buddy_start_pfn, end_pfn, MEMORY_OFFLINE);
+		ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
 	} while (ret);
 
 	/* Mark all sections offline and remove free pages from the buddy. */
-	__offline_isolated_pages(start_pfn, end_pfn, buddy_start_pfn);
+	__offline_isolated_pages(start_pfn, end_pfn, start_pfn);
 	pr_debug("Offlined Pages %ld\n", nr_pages);
 
 	/*
@@ -1764,13 +1794,13 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 	 * of isolated pageblocks, memory onlining will properly revert this.
 	 */
 	spin_lock_irqsave(&zone->lock, flags);
-	zone->nr_isolate_pageblock -= buddy_nr_pages / pageblock_nr_pages;
+	zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
 	spin_unlock_irqrestore(&zone->lock, flags);
 
 	zone_pcp_enable(zone);
 
 	/* removal success */
-	adjust_managed_page_count(pfn_to_page(start_pfn), -buddy_nr_pages);
+	adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
 	zone->present_pages -= nr_pages;
 
 	pgdat_resize_lock(zone->zone_pgdat, &flags);
@@ -1799,7 +1829,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 	return 0;
 
 failed_removal_isolated:
-	undo_isolate_page_range(buddy_start_pfn, end_pfn, MIGRATE_MOVABLE);
+	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
 	memory_notify(MEM_CANCEL_OFFLINE, &arg);
 failed_removal_pcplists_disabled:
 	zone_pcp_enable(zone);
@@ -1830,14 +1860,6 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
 	return 0;
 }
 
-static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
-{
-	/*
-	 * If not set, continue with the next block.
-	 */
-	return mem->nr_vmemmap_pages;
-}
-
 static int check_cpu_on_node(pg_data_t *pgdat)
 {
 	int cpu;
@@ -1912,9 +1934,6 @@ EXPORT_SYMBOL(try_offline_node);
 static int __ref try_remove_memory(int nid, u64 start, u64 size)
 {
 	int rc = 0;
-	struct vmem_altmap mhp_altmap = {};
-	struct vmem_altmap *altmap = NULL;
-	unsigned long nr_vmemmap_pages = 0;
 
 	BUG_ON(check_hotplug_memory_range(start, size));
 
@@ -1927,31 +1946,6 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
 	if (rc)
 		return rc;
 
-	/*
-	 * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
-	 * the same granularity it was added - a single memory block.
-	 */
-	if (memmap_on_memory) {
-		nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
-						      get_nr_vmemmap_pages_cb);
-		if (nr_vmemmap_pages) {
-			if (size != memory_block_size_bytes()) {
-				pr_warn("Refuse to remove %#llx - %#llx,"
-					"wrong granularity\n",
-					start, start + size);
-				return -EINVAL;
-			}
-
-			/*
-			 * Let remove_pmd_table->free_hugepage_table
-			 * do the right thing if we used vmem_altmap
-			 * when hot-adding the range.
-			 */
-			mhp_altmap.alloc = nr_vmemmap_pages;
-			altmap = &mhp_altmap;
-		}
-	}
-
 	/* remove memmap entry */
 	firmware_map_remove(start, start + size, "System RAM");
 
@@ -1963,7 +1957,10 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
 
 	mem_hotplug_begin();
 
-	arch_remove_memory(nid, start, size, altmap);
+	if (!memmap_on_memory)
+		arch_remove_memory(nid, start, size, NULL);
+	else
+		walk_memory_blocks(start, size, NULL, remove_memory_block_cb);
 
 	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
 		memblock_free(start, size);
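
For completeness, this is the round trip the above is designed around,
as seen from a hotplug user. An illustrative sketch only: the
MHP_MEMMAP_ON_MEMORY flag is the one introduced earlier in this series,
nid and start are made up and error handling is elided:

	int nid = 0;			/* made-up node */
	u64 start = 0x100000000ULL;	/* made-up physical address */
	u64 size = memory_block_size_bytes();
	int rc;

	/* hot add a single block which hosts its own memmap... */
	if (mhp_supports_memmap_on_memory(size))
		rc = add_memory(nid, start, size, MHP_MEMMAP_ON_MEMORY);

	/* ... and tear the very same block down again */
	rc = offline_and_remove_memory(nid, start, size);

Note also that the previous "same granularity" restriction on the
removal side simply disappears: try_remove_memory() now walks the range
per memory block and remove_memory_block_cb() picks up whatever altmap
was recorded for each block at add time.

-- 
Michal Hocko
SUSE Labs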