From: Zi Yan <ziy@xxxxxxxxxx> To enable subsection memory online/offline, we need to remove the assumption that the memory_block size is greater than or equal to the section size. The following changes are made: 1. Use a (start_pfn, nr_pages) pair to specify memory_block size instead of start_section_nr. 2. Calculate the memory_block id using phys / memory_block_size_bytes() instead of the section number. The minimum memory_block size is now set to the smaller of 128MB (the old x86_64 section size) and the section size. Signed-off-by: Zi Yan <ziy@xxxxxxxxxx> --- drivers/base/memory.c | 176 ++++++++++++++++++++--------------------- drivers/base/node.c | 2 +- include/linux/memory.h | 8 +- mm/memory_hotplug.c | 6 +- 4 files changed, 98 insertions(+), 94 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index b31b3af5c490..141431eb64a4 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -50,19 +50,15 @@ int mhp_online_type_from_str(const char *str) static int sections_per_block; -static inline unsigned long memory_block_id(unsigned long section_nr) +static inline unsigned long phys_to_block_id(unsigned long phys) { - return section_nr / sections_per_block; + return phys / memory_block_size_bytes(); } static inline unsigned long pfn_to_block_id(unsigned long pfn) { - return memory_block_id(pfn_to_section_nr(pfn)); -} - -static inline unsigned long phys_to_block_id(unsigned long phys) -{ - return pfn_to_block_id(PFN_DOWN(phys)); + /* calculate using memory_block_size_bytes() */ + return phys_to_block_id(PFN_PHYS(pfn)); } static int memory_subsys_online(struct device *dev); @@ -118,7 +114,7 @@ static ssize_t phys_index_show(struct device *dev, struct memory_block *mem = to_memory_block(dev); unsigned long phys_index; - phys_index = mem->start_section_nr / sections_per_block; + phys_index = pfn_to_section_nr(mem->start_pfn); return sysfs_emit(buf, "%08lx\n", phys_index); } @@ -171,8 +167,8 @@ int memory_notify(unsigned long val, void *v) static int 
memory_block_online(struct memory_block *mem) { - unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); - unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; + unsigned long start_pfn = mem->start_pfn; + unsigned long nr_pages = mem->nr_pages; unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages; struct zone *zone; int ret; @@ -212,8 +208,8 @@ static int memory_block_online(struct memory_block *mem) static int memory_block_offline(struct memory_block *mem) { - unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); - unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; + unsigned long start_pfn = mem->start_pfn; + unsigned long nr_pages = mem->nr_pages; unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages; struct zone *zone; int ret; @@ -260,7 +256,7 @@ memory_block_action(struct memory_block *mem, unsigned long action) break; default: WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: " - "%ld\n", __func__, mem->start_section_nr, action, action); + "%ld\n", __func__, mem->start_pfn, mem->nr_pages, action); ret = -EINVAL; } @@ -366,7 +362,7 @@ static ssize_t phys_device_show(struct device *dev, struct device_attribute *attr, char *buf) { struct memory_block *mem = to_memory_block(dev); - unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); + unsigned long start_pfn = mem->start_pfn; return sysfs_emit(buf, "%d\n", arch_get_memory_phys_device(start_pfn)); @@ -390,8 +386,8 @@ static ssize_t valid_zones_show(struct device *dev, struct device_attribute *attr, char *buf) { struct memory_block *mem = to_memory_block(dev); - unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); - unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; + unsigned long start_pfn = mem->start_pfn; + unsigned long nr_pages = mem->nr_pages; struct zone *default_zone; int len = 0; int nid; @@ -575,16 +571,6 @@ static struct memory_block *find_memory_block_by_id(unsigned long block_id) return mem; } -/* - 
* Called under device_hotplug_lock. - */ -struct memory_block *find_memory_block(struct mem_section *section) -{ - unsigned long block_id = memory_block_id(__section_nr(section)); - - return find_memory_block_by_id(block_id); -} - static struct attribute *memory_memblk_attrs[] = { &dev_attr_phys_index.attr, &dev_attr_state.attr, @@ -614,7 +600,7 @@ int register_memory(struct memory_block *memory) int ret; memory->dev.bus = &memory_subsys; - memory->dev.id = memory->start_section_nr / sections_per_block; + memory->dev.id = memory->start_pfn / (memory_block_size_bytes() >> PAGE_SHIFT); memory->dev.release = memory_block_release; memory->dev.groups = memory_memblk_attr_groups; memory->dev.offline = memory->state == MEM_OFFLINE; @@ -633,57 +619,89 @@ int register_memory(struct memory_block *memory) return ret; } -static int init_memory_block(unsigned long block_id, unsigned long state, +static void unregister_memory(struct memory_block *memory) +{ + if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys)) + return; + + WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL); + + /* drop the ref. 
we got via find_memory_block() */ + put_device(&memory->dev); + device_unregister(&memory->dev); +} + +static int init_memory_blocks(unsigned long start_pfn, unsigned long num_pages, unsigned long state, unsigned long nr_vmemmap_pages) { struct memory_block *mem; int ret = 0; + unsigned long block_nr_pages = memory_block_size_bytes() / PAGE_SIZE; + unsigned long block_start_pfn; - mem = find_memory_block_by_id(block_id); - if (mem) { - put_device(&mem->dev); - return -EEXIST; - } - mem = kzalloc(sizeof(*mem), GFP_KERNEL); - if (!mem) - return -ENOMEM; - - mem->start_section_nr = block_id * sections_per_block; - mem->state = state; - mem->nid = NUMA_NO_NODE; - mem->nr_vmemmap_pages = nr_vmemmap_pages; + for (block_start_pfn = start_pfn; num_pages != 0; block_start_pfn += block_nr_pages) { + unsigned long block_id = pfn_to_block_id(block_start_pfn); - ret = register_memory(mem); - - return ret; + mem = find_memory_block_by_id(block_id); + if (mem) { + put_device(&mem->dev); + return -EEXIST; + } + mem = kzalloc(sizeof(*mem), GFP_KERNEL); + if (!mem) + return -ENOMEM; + + mem->start_pfn = block_start_pfn; + mem->nr_pages = min(num_pages, block_nr_pages); + mem->state = state; + mem->nid = NUMA_NO_NODE; + mem->nr_vmemmap_pages = nr_vmemmap_pages; + + ret = register_memory(mem); + + if (ret) { + unsigned long unregister_block_pfn; + + for (unregister_block_pfn = start_pfn; + unregister_block_pfn < block_start_pfn; + unregister_block_pfn -= block_nr_pages) { + block_id = pfn_to_block_id(unregister_block_pfn); + mem = find_memory_block_by_id(block_id); + if (WARN_ON_ONCE(!mem)) + continue; + unregister_memory(mem); + } + return -EINVAL; + } + if (num_pages > block_nr_pages) + num_pages -= block_nr_pages; + else + num_pages = 0; + } + return 0; } -static int add_memory_block(unsigned long base_section_nr) +static void add_whole_section_memory_block(unsigned long base_section_nr) { - int section_count = 0; - unsigned long nr; + int ret; + unsigned long start_pfn = 
section_nr_to_pfn(base_section_nr); + unsigned long nr_pages = 0; + struct mem_section *ms = __nr_to_section(base_section_nr); - for (nr = base_section_nr; nr < base_section_nr + sections_per_block; - nr++) - if (present_section_nr(nr)) - section_count++; + if (bitmap_full(ms->usage->subsection_map, SUBSECTIONS_PER_SECTION)) + nr_pages = PAGES_PER_SECTION; + else + nr_pages = PAGES_PER_SUBSECTION * + bitmap_weight(ms->usage->subsection_map, SUBSECTIONS_PER_SECTION); - if (section_count == 0) - return 0; - return init_memory_block(memory_block_id(base_section_nr), - MEM_ONLINE, 0); -} -static void unregister_memory(struct memory_block *memory) -{ - if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys)) + if (!nr_pages) return; - WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL); - - /* drop the ref. we got via find_memory_block() */ - put_device(&memory->dev); - device_unregister(&memory->dev); + ret = init_memory_blocks(start_pfn, nr_pages, MEM_ONLINE, 0); + if (ret) + panic("%s() failed to add memory block: %d\n", __func__, + ret); } /* @@ -696,31 +714,16 @@ static void unregister_memory(struct memory_block *memory) int create_memory_block_devices(unsigned long start, unsigned long size, unsigned long vmemmap_pages) { - const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); - unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); - struct memory_block *mem; - unsigned long block_id; + unsigned long start_pfn = PFN_DOWN(start); + unsigned long end_pfn = PFN_DOWN(start + size); int ret = 0; if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) || !IS_ALIGNED(size, memory_block_size_bytes()))) return -EINVAL; - for (block_id = start_block_id; block_id != end_block_id; block_id++) { - ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages); - if (ret) - break; - } - if (ret) { - end_block_id = block_id; - for (block_id = start_block_id; block_id != end_block_id; - block_id++) { - mem = 
find_memory_block_by_id(block_id); - if (WARN_ON_ONCE(!mem)) - continue; - unregister_memory(mem); - } - } + ret = init_memory_blocks(start_pfn, end_pfn - start_pfn, MEM_OFFLINE, vmemmap_pages); + return ret; } @@ -807,10 +810,7 @@ void __init memory_dev_init(void) */ for (nr = 0; nr <= __highest_present_section_nr; nr += sections_per_block) { - ret = add_memory_block(nr); - if (ret) - panic("%s() failed to add memory block: %d\n", __func__, - ret); + add_whole_section_memory_block(nr); } } diff --git a/drivers/base/node.c b/drivers/base/node.c index 2c36f61d30bc..76d67b8ddf1b 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -809,7 +809,7 @@ static int register_mem_block_under_node_early(struct memory_block *mem_blk, void *arg) { unsigned long memory_block_pfns = memory_block_size_bytes() / PAGE_SIZE; - unsigned long start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); + unsigned long start_pfn = mem_blk->start_pfn; unsigned long end_pfn = start_pfn + memory_block_pfns - 1; int nid = *(int *)arg; unsigned long pfn; diff --git a/include/linux/memory.h b/include/linux/memory.h index 97e92e8b556a..e9590c7c6a9e 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -21,10 +21,15 @@ #include <linux/mutex.h> #include <linux/notifier.h> +#if SECTION_SIZE_BITS > 27 /* 128MB */ +#define MIN_MEMORY_BLOCK_SIZE (1UL << 27) +#else #define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS) +#endif struct memory_block { - unsigned long start_section_nr; + unsigned long start_pfn; + unsigned long nr_pages; unsigned long state; /* serialized by the dev->lock */ int online_type; /* for passing data to online routine */ int nid; /* NID for this memory block */ @@ -90,7 +95,6 @@ int create_memory_block_devices(unsigned long start, unsigned long size, void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); -extern struct memory_block 
*find_memory_block(struct mem_section *); typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, walk_memory_blocks_func_t func); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 70620d0dd923..6e93b0ecc5cb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1872,8 +1872,8 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) if (unlikely(ret)) { phys_addr_t beginpa, endpa; - beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); - endpa = beginpa + memory_block_size_bytes() - 1; + beginpa = PFN_PHYS(mem->start_pfn); + endpa = beginpa + mem->nr_pages * PAGE_SIZE - 1; pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", &beginpa, &endpa); @@ -2079,7 +2079,7 @@ static int try_offline_memory_block(struct memory_block *mem, void *arg) * with multiple zones within one memory block will be rejected * by offlining code ... so we don't care about that. */ - page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr)); + page = pfn_to_online_page(mem->start_pfn); if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE) online_type = MMOP_ONLINE_MOVABLE; -- 2.30.2