On Fri, Jul 23, 2021 at 02:52:04PM +0200, David Hildenbrand wrote: > In our "auto-movable" memory onlining policy, we want to make decisions > across memory blocks of a single memory device. Examples of memory devices > include ACPI memory devices (in the simplest case a single DIMM) and > virtio-mem. For now, we don't have a connection between a single memory > block device and the real memory device. Each memory device consists of > 1..X memory block devices. > > Let's logically group memory blocks belonging to the same memory device > in "memory groups". Memory groups can span multiple physical ranges and a > memory group itself does not contain any information regarding physical > ranges, only properties (e.g., "max_pages") necessary for improved memory > onlining. > > Introduce two memory group types: > > 1) Static memory group: E.g., a single ACPI memory device, consisting of > 1..X memory resources. A memory group consists of 1..Y memory blocks. > The whole group is added/removed in one go. If any part cannot get > offlined, the whole group cannot be removed. > > 2) Dynamic memory group: E.g., a single virtio-mem device. Memory is > dynamically added/removed in a fixed granularity, called a "unit", > consisting of 1..X memory blocks. A unit is added/removed in one go. > If any part of a unit cannot get offlined, the whole unit cannot be > removed. > > In case of 1) we usually want either all memory managed by ZONE_MOVABLE > or none. In case of 2) we usually want to have as many units as possible > managed by ZONE_MOVABLE. We want a single unit to be of the same type. > > For now, memory groups are an internal concept that is not exposed to > user space; we might want to change that in the future, though. > > add_memory() users can specify a mgid instead of a nid when passing > the MHP_NID_IS_MGID flag. > > Signed-off-by: David Hildenbrand <david@xxxxxxxxxx> > --- > drivers/base/memory.c | 102 +++++++++++++++++++++++++++++++-- > include/linux/memory.h | 46 ++++++++++++++- > include/linux/memory_hotplug.h | 6 +- > mm/memory_hotplug.c | 11 +++- > 4 files changed, 158 insertions(+), 7 deletions(-) > > diff --git a/drivers/base/memory.c b/drivers/base/memory.c > index 86ec2dc82fc2..42109e7fb0b5 100644 > --- a/drivers/base/memory.c > +++ b/drivers/base/memory.c > @@ -82,6 +82,11 @@ static struct bus_type memory_subsys = { > */ > static DEFINE_XARRAY(memory_blocks); > > +/* > + * Memory groups, indexed by memory group identification (mgid). > + */ > +static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC); > + > static BLOCKING_NOTIFIER_HEAD(memory_chain); > > int register_memory_notifier(struct notifier_block *nb) > @@ -634,7 +639,8 @@ int register_memory(struct memory_block *memory) > } > > static int init_memory_block(unsigned long block_id, unsigned long state, > - unsigned long nr_vmemmap_pages) > + unsigned long nr_vmemmap_pages, > + struct memory_group *group) > { > struct memory_block *mem; > int ret = 0; > @@ -653,6 +659,11 @@ static int init_memory_block(unsigned long block_id, unsigned long state, > mem->nid = NUMA_NO_NODE; > mem->nr_vmemmap_pages = nr_vmemmap_pages; > > + if (group) { > + mem->group = group; > + refcount_inc(&group->refcount); > + } > + > ret = register_memory(mem); > > return ret; > @@ -671,7 +682,7 @@ static int add_memory_block(unsigned long base_section_nr) > if (section_count == 0) > return 0; > return init_memory_block(memory_block_id(base_section_nr), > - MEM_ONLINE, 0); > + MEM_ONLINE, 0, NULL); > } > > static void unregister_memory(struct memory_block *memory) > @@ -681,6 +692,11 @@ static void unregister_memory(struct memory_block *memory) > > WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL); > > + if (memory->group) { > + refcount_dec(&memory->group->refcount); > + memory->group = NULL; Who freed the memory for the group? > + } > + > /* drop the ref. we got via find_memory_block() */ > put_device(&memory->dev); > device_unregister(&memory->dev); > @@ -694,7 +710,8 @@ static void unregister_memory(struct memory_block *memory) > * Called under device_hotplug_lock. > */ > int create_memory_block_devices(unsigned long start, unsigned long size, > - unsigned long vmemmap_pages) > + unsigned long vmemmap_pages, > + struct memory_group *group) > { > const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); > unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); > @@ -707,7 +724,8 @@ int create_memory_block_devices(unsigned long start, unsigned long size, > return -EINVAL; > > for (block_id = start_block_id; block_id != end_block_id; block_id++) { > - ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages); > + ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages, > + group); > if (ret) > break; > } > @@ -891,3 +909,79 @@ int for_each_memory_block(void *arg, walk_memory_blocks_func_t func) > return bus_for_each_dev(&memory_subsys, NULL, &cb_data, > for_each_memory_block_cb); > } > + > +static int register_memory_group(struct memory_group group) > +{ > + struct memory_group *new_group; > + uint32_t mgid; > + int ret; > + > + if (!node_possible(group.nid)) > + return -EINVAL; > + > + new_group = kzalloc(sizeof(group), GFP_KERNEL); > + if (!new_group) > + return -ENOMEM; > + *new_group = group; You burried a memcpy here, why? Please be explicit as this is now a dynamic structure. > + refcount_set(&new_group->refcount, 1); Why not just use a kref? You seem to be treating it as a kref would work, right? > + > + ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b, > + GFP_KERNEL); > + if (ret) > + kfree(new_group); > + return ret ? ret : mgid; I hate ?: please spell this out: if (ret) return ret; return mgid; There, more obvious and you can read it in 10 years when you have to go fix it up... > +} > + > +int register_static_memory_group(int nid, unsigned long max_pages) > +{ > + struct memory_group group = { > + .nid = nid, > + .s = { > + .max_pages = max_pages, > + }, > + }; > + > + if (!max_pages) > + return -EINVAL; > + return register_memory_group(group); > +} > +EXPORT_SYMBOL_GPL(register_static_memory_group); Let's make our global namespace a bit nicer: memory_group_register_static() memory_group_register_dynamic() and so on. Use prefixes please, not suffixes. > + > +int register_dynamic_memory_group(int nid, unsigned long unit_pages) > +{ > + struct memory_group group = { > + .nid = nid, > + .is_dynamic = true, > + .d = { > + .unit_pages = unit_pages, > + }, > + }; > + > + if (!unit_pages || !is_power_of_2(unit_pages) || > + unit_pages < PHYS_PFN(memory_block_size_bytes())) > + return -EINVAL; > + return register_memory_group(group); > +} > +EXPORT_SYMBOL_GPL(register_dynamic_memory_group); > + > +int unregister_memory_group(int mgid) > +{ > + struct memory_group *group; > + > + if (mgid < 0) > + return -EINVAL; > + > + group = xa_load(&memory_groups, mgid); > + if (!group || refcount_read(&group->refcount) > 1) > + return -EINVAL; > + > + xa_erase(&memory_groups, mgid); > + kfree(group); > + return 0; > +} > +EXPORT_SYMBOL_GPL(unregister_memory_group); memory_group_unregister() > + > +struct memory_group *get_memory_group(int mgid) > +{ > + return xa_load(&memory_groups, mgid); > +} Global function? > diff --git a/include/linux/memory.h b/include/linux/memory.h > index 97e92e8b556a..6e20a6174fe5 100644 > --- a/include/linux/memory.h > +++ b/include/linux/memory.h > @@ -23,6 +23,42 @@ > > #define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS) > > +struct memory_group { > + /* Nid the whole group belongs to. */ > + int nid; What is a "nid"? > + /* References from memory blocks + 1. */ Blank line above this? And put the structure comments in proper kernel doc so that others can read them and we can verify it is correct over time. > + refcount_t refcount; > + /* > + * Memory group type: static vs. dynamic. > + * > + * Static: All memory in the group belongs to a single unit, such as, > + * a DIMM. All memory belonging to the group will be added in > + * one go and removed in one go -- it's static. > + * > + * Dynamic: Memory within the group is added/removed dynamically in > + * units of the specified granularity of at least one memory block. > + */ > + bool is_dynamic; > + > + union { > + struct { > + /* > + * Maximum number of pages we'll have in this static > + * memory group. > + */ > + unsigned long max_pages; > + } s; > + struct { > + /* > + * Unit in pages in which memory is added/removed in > + * this dynamic memory group. This granularity defines > + * the alignment of a unit in physical address space. > + */ > + unsigned long unit_pages; > + } d; so is_dynamic determines which to use here? Please be explicit. thanks, greg k-h