On Thu 30-03-17 13:54:53, Michal Hocko wrote: [...] > -static int __meminit __add_section(int nid, struct zone *zone, > - unsigned long phys_start_pfn) > +static int __meminit __add_section(int nid, unsigned long phys_start_pfn) > { > int ret; > + int i; > > if (pfn_valid(phys_start_pfn)) > return -EEXIST; > > - ret = sparse_add_one_section(zone, phys_start_pfn); > - > + ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn); > if (ret < 0) > return ret; > > - ret = __add_zone(zone, phys_start_pfn); > + /* > + * Make all the pages reserved so that nobody will stumble over half > + * initialized state. > + */ > + for (i = 0; i < PAGES_PER_SECTION; i++) { > + unsigned long pfn = phys_start_pfn + i; > + if (!pfn_valid(pfn)) > + continue; > > - if (ret < 0) > - return ret; > + SetPageReserved(pfn_to_page(phys_start_pfn + i)); > + } > > return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); I have just realized one more dependency on the zone initialization. register_new_memory relies on is_zone_device_section to rule out memblock-specific operations including sysfs infrastructure. I have come up with the following to handle this. --- From 377eb5691706de179fbf570915c1cf365d3253c0 Mon Sep 17 00:00:00 2001 From: Michal Hocko <mhocko@xxxxxxxx> Date: Thu, 6 Apr 2017 09:50:35 +0200 Subject: [PATCH] mm, memory_hotplug: get rid of is_zone_device_section Device memory hotplug hooks into regular memory hotplug only halfway. It needs memory sections to track struct pages but there is no need/desire to associate those sections with memory blocks and export them to userspace via sysfs because they cannot be onlined anyway. This is currently expressed by the for_device argument to arch_add_memory, which then makes sure to associate the given memory range with ZONE_DEVICE. register_new_memory then relies on is_zone_device_section to distinguish special memory hotplug from the regular one. 
While this works now, later patches in this series want to move __add_zone outside of the arch_add_memory path, so we have to come up with something else. Add want_memblock down the __add_pages path and use it to control whether the section->memblock association should be done. arch_add_memory then just trivially wants a memblock for everything but for_device hotplug. remove_memory_section doesn't need is_zone_device_section either. We can simply skip all the memblock-specific cleanup if there is no memblock for the given section. Cc: Dan Williams <dan.j.williams@xxxxxxxxx> Signed-off-by: Michal Hocko <mhocko@xxxxxxxx> --- arch/ia64/mm/init.c | 2 +- arch/powerpc/mm/mem.c | 2 +- arch/s390/mm/init.c | 2 +- arch/sh/mm/init.c | 2 +- arch/x86/mm/init_32.c | 2 +- arch/x86/mm/init_64.c | 2 +- drivers/base/memory.c | 22 ++++++++-------------- include/linux/memory_hotplug.h | 2 +- mm/memory_hotplug.c | 11 +++++++---- 9 files changed, 22 insertions(+), 25 deletions(-) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 06cdaef54b2e..62085fd902e6 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -657,7 +657,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) zone = pgdat->node_zones + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); - ret = __add_pages(nid, zone, start_pfn, nr_pages); + ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 5f844337de21..ea3e09a62f38 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -149,7 +149,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) zone = pgdata->node_zones + zone_for_memory(nid, start, size, 0, for_device); - return __add_pages(nid, zone, start_pfn, nr_pages); + return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c 
b/arch/s390/mm/init.c index bf5b8a0c4ff7..5c84346e5211 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -182,7 +182,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) continue; nr_pages = (start_pfn + size_pages > zone_end_pfn) ? zone_end_pfn - start_pfn : size_pages; - rc = __add_pages(nid, zone, start_pfn, nr_pages); + rc = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); if (rc) break; start_pfn += nr_pages; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 75491862d900..a9d57f75ae8c 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -498,7 +498,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) ret = __add_pages(nid, pgdat->node_zones + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device), - start_pfn, nr_pages); + start_pfn, nr_pages, !for_device); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 928cfde76232..bc5530cc0746 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -824,7 +824,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, zone, start_pfn, nr_pages); + return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7eef17239378..39cfaee93975 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -652,7 +652,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) init_memory_mapping(start, start + size); - ret = __add_pages(nid, zone, start_pfn, nr_pages); + ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ diff --git a/drivers/base/memory.c b/drivers/base/memory.c index cc4f1d0cbffe..89c15e942852 100644 --- 
a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -685,14 +685,6 @@ static int add_memory_block(int base_section_nr) return 0; } -static bool is_zone_device_section(struct mem_section *ms) -{ - struct page *page; - - page = sparse_decode_mem_map(ms->section_mem_map, __section_nr(ms)); - return is_zone_device_page(page); -} - /* * need an interface for the VM to add new memory regions, * but without onlining it. @@ -702,9 +694,6 @@ int register_new_memory(int nid, struct mem_section *section) int ret = 0; struct memory_block *mem; - if (is_zone_device_section(section)) - return 0; - mutex_lock(&mem_sysfs_mutex); mem = find_memory_block(section); @@ -741,11 +730,16 @@ static int remove_memory_section(unsigned long node_id, { struct memory_block *mem; - if (is_zone_device_section(section)) - return 0; - mutex_lock(&mem_sysfs_mutex); + + /* + * Some users of the memory hotplug do not want/need memblock to + * track all sections. Skip over those. + */ mem = find_memory_block(section); + if (!mem) + return 0; + unregister_mem_sect_under_nodes(mem, __section_nr(section)); mem->section_count--; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 134a2f69c21a..3c8cf86201c3 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -111,7 +111,7 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn, /* reasonably generic interface to expand the physical pages in a zone */ extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages); + unsigned long nr_pages, bool want_memblock); #ifdef CONFIG_NUMA extern int memory_add_physaddr_to_nid(u64 start); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 28b16a831d40..5a8924a41a3b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -489,7 +489,7 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) } static int __meminit __add_section(int nid, struct zone *zone, - 
unsigned long phys_start_pfn) + unsigned long phys_start_pfn, bool want_memblock) { int ret; @@ -506,7 +506,10 @@ static int __meminit __add_section(int nid, struct zone *zone, if (ret < 0) return ret; - return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); + if (want_memblock) + ret = register_new_memory(nid, __pfn_to_section(phys_start_pfn)); + + return ret; } /* @@ -516,7 +519,7 @@ static int __meminit __add_section(int nid, struct zone *zone, * add the new pages. */ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, - unsigned long nr_pages) + unsigned long nr_pages, bool want_memblock) { unsigned long i; int err = 0; @@ -544,7 +547,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, } for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, zone, section_nr_to_pfn(i)); + err = __add_section(nid, zone, section_nr_to_pfn(i), want_memblock); /* * EEXIST is finally dealt with by ioresource collision -- 2.11.0 -- Michal Hocko SUSE Labs -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>