Re: [PATCH v2 3/9] drivers/base/memory: introduce "memory groups" to logically group memory blocks

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Jul 23, 2021 at 02:52:04PM +0200, David Hildenbrand wrote:
> In our "auto-movable" memory onlining policy, we want to make decisions
> across memory blocks of a single memory device. Examples of memory devices
> include ACPI memory devices (in the simplest case a single DIMM) and
> virtio-mem. For now, we don't have a connection between a single memory
> block device and the real memory device. Each memory device consists of
> 1..X memory block devices.
> 
> Let's logically group memory blocks belonging to the same memory device
> in "memory groups". Memory groups can span multiple physical ranges and a
> memory group itself does not contain any information regarding physical
> ranges, only properties (e.g., "max_pages") necessary for improved memory
> onlining.
> 
> Introduce two memory group types:
> 
> 1) Static memory group: E.g., a single ACPI memory device, consisting of
>    1..X memory resources. A memory group consists of 1..Y memory blocks.
>    The whole group is added/removed in one go. If any part cannot get
>    offlined, the whole group cannot be removed.
> 
> 2) Dynamic memory group: E.g., a single virtio-mem device. Memory is
>    dynamically added/removed in a fixed granularity, called a "unit",
>    consisting of 1..X memory blocks. A unit is added/removed in one go.
>    If any part of a unit cannot get offlined, the whole unit cannot be
>    removed.
> 
> In case of 1) we usually want either all memory managed by ZONE_MOVABLE
> or none. In case of 2) we usually want to have as many units as possible
> managed by ZONE_MOVABLE. We want a single unit to be of the same type.
> 
> For now, memory groups are an internal concept that is not exposed to
> user space; we might want to change that in the future, though.
> 
> add_memory() users can specify a mgid instead of a nid when passing
> the MHP_NID_IS_MGID flag.
> 
> Signed-off-by: David Hildenbrand <david@xxxxxxxxxx>
> ---
>  drivers/base/memory.c          | 102 +++++++++++++++++++++++++++++++--
>  include/linux/memory.h         |  46 ++++++++++++++-
>  include/linux/memory_hotplug.h |   6 +-
>  mm/memory_hotplug.c            |  11 +++-
>  4 files changed, 158 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index 86ec2dc82fc2..42109e7fb0b5 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -82,6 +82,11 @@ static struct bus_type memory_subsys = {
>   */
>  static DEFINE_XARRAY(memory_blocks);
>  
> +/*
> + * Memory groups, indexed by memory group identification (mgid).
> + */
> +static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);
> +
>  static BLOCKING_NOTIFIER_HEAD(memory_chain);
>  
>  int register_memory_notifier(struct notifier_block *nb)
> @@ -634,7 +639,8 @@ int register_memory(struct memory_block *memory)
>  }
>  
>  static int init_memory_block(unsigned long block_id, unsigned long state,
> -			     unsigned long nr_vmemmap_pages)
> +			     unsigned long nr_vmemmap_pages,
> +			     struct memory_group *group)
>  {
>  	struct memory_block *mem;
>  	int ret = 0;
> @@ -653,6 +659,11 @@ static int init_memory_block(unsigned long block_id, unsigned long state,
>  	mem->nid = NUMA_NO_NODE;
>  	mem->nr_vmemmap_pages = nr_vmemmap_pages;
>  
> +	if (group) {
> +		mem->group = group;
> +		refcount_inc(&group->refcount);
> +	}
> +
>  	ret = register_memory(mem);
>  
>  	return ret;
> @@ -671,7 +682,7 @@ static int add_memory_block(unsigned long base_section_nr)
>  	if (section_count == 0)
>  		return 0;
>  	return init_memory_block(memory_block_id(base_section_nr),
> -				 MEM_ONLINE, 0);
> +				 MEM_ONLINE, 0,  NULL);
>  }
>  
>  static void unregister_memory(struct memory_block *memory)
> @@ -681,6 +692,11 @@ static void unregister_memory(struct memory_block *memory)
>  
>  	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);
>  
> +	if (memory->group) {
> +		refcount_dec(&memory->group->refcount);
> +		memory->group = NULL;

Who freed the memory for the group?

> +	}
> +
>  	/* drop the ref. we got via find_memory_block() */
>  	put_device(&memory->dev);
>  	device_unregister(&memory->dev);
> @@ -694,7 +710,8 @@ static void unregister_memory(struct memory_block *memory)
>   * Called under device_hotplug_lock.
>   */
>  int create_memory_block_devices(unsigned long start, unsigned long size,
> -				unsigned long vmemmap_pages)
> +				unsigned long vmemmap_pages,
> +				struct memory_group *group)
>  {
>  	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
>  	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
> @@ -707,7 +724,8 @@ int create_memory_block_devices(unsigned long start, unsigned long size,
>  		return -EINVAL;
>  
>  	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
> -		ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages);
> +		ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages,
> +					group);
>  		if (ret)
>  			break;
>  	}
> @@ -891,3 +909,79 @@ int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
>  	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
>  				for_each_memory_block_cb);
>  }
> +
> +static int register_memory_group(struct memory_group group)
> +{
> +	struct memory_group *new_group;
> +	uint32_t mgid;
> +	int ret;
> +
> +	if (!node_possible(group.nid))
> +		return -EINVAL;
> +
> +	new_group = kzalloc(sizeof(group), GFP_KERNEL);
> +	if (!new_group)
> +		return -ENOMEM;
> +	*new_group = group;

You burried a memcpy here, why?  Please be explicit as this is now a
dynamic structure.

> +	refcount_set(&new_group->refcount, 1);

Why not just use a kref?  You seem to be treating it as a kref would
work, right?

> +
> +	ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
> +		       GFP_KERNEL);
> +	if (ret)
> +		kfree(new_group);
> +	return ret ? ret : mgid;

I hate ?: please spell this out:
	if (ret)
		return ret;
	return mgid;

There, more obvious and you can read it in 10 years when you have to go
fix it up...



> +}
> +
> +int register_static_memory_group(int nid, unsigned long max_pages)
> +{
> +	struct memory_group group = {
> +		.nid = nid,
> +		.s = {
> +			.max_pages = max_pages,
> +		},
> +	};
> +
> +	if (!max_pages)
> +		return -EINVAL;
> +	return register_memory_group(group);
> +}
> +EXPORT_SYMBOL_GPL(register_static_memory_group);

Let's make our global namespace a bit nicer:
	memory_group_register_static()
	memory_group_register_dynamic()

and so on.  Use prefixes please, not suffixes.


> +
> +int register_dynamic_memory_group(int nid, unsigned long unit_pages)
> +{
> +	struct memory_group group = {
> +		.nid = nid,
> +		.is_dynamic = true,
> +		.d = {
> +			.unit_pages = unit_pages,
> +		},
> +	};
> +
> +	if (!unit_pages || !is_power_of_2(unit_pages) ||
> +	    unit_pages < PHYS_PFN(memory_block_size_bytes()))
> +		return -EINVAL;
> +	return register_memory_group(group);
> +}
> +EXPORT_SYMBOL_GPL(register_dynamic_memory_group);
> +
> +int unregister_memory_group(int mgid)
> +{
> +	struct memory_group *group;
> +
> +	if (mgid < 0)
> +		return -EINVAL;
> +
> +	group = xa_load(&memory_groups, mgid);
> +	if (!group || refcount_read(&group->refcount) > 1)
> +		return -EINVAL;
> +
> +	xa_erase(&memory_groups, mgid);
> +	kfree(group);
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(unregister_memory_group);

memory_group_unregister()


> +
> +struct memory_group *get_memory_group(int mgid)
> +{
> +	return xa_load(&memory_groups, mgid);
> +}

Global function?


> diff --git a/include/linux/memory.h b/include/linux/memory.h
> index 97e92e8b556a..6e20a6174fe5 100644
> --- a/include/linux/memory.h
> +++ b/include/linux/memory.h
> @@ -23,6 +23,42 @@
>  
>  #define MIN_MEMORY_BLOCK_SIZE     (1UL << SECTION_SIZE_BITS)
>  
> +struct memory_group {
> +	/* Nid the whole group belongs to. */
> +	int nid;

What is a "nid"?

> +	/* References from memory blocks + 1. */

Blank line above this?

And put the structure comments in proper kernel doc so that others can
read them and we can verify it is correct over time.

> +	refcount_t refcount;
> +	/*
> +	 * Memory group type: static vs. dynamic.
> +	 *
> +	 * Static: All memory in the group belongs to a single unit, such as,
> +	 * a DIMM. All memory belonging to the group will be added in
> +	 * one go and removed in one go -- it's static.
> +	 *
> +	 * Dynamic: Memory within the group is added/removed dynamically in
> +	 * units of the specified granularity of at least one memory block.
> +	 */
> +	bool is_dynamic;
> +
> +	union {
> +		struct {
> +			/*
> +			 * Maximum number of pages we'll have in this static
> +			 * memory group.
> +			 */
> +			unsigned long max_pages;
> +		} s;
> +		struct {
> +			/*
> +			 * Unit in pages in which memory is added/removed in
> +			 * this dynamic memory group. This granularity defines
> +			 * the alignment of a unit in physical address space.
> +			 */
> +			unsigned long unit_pages;
> +		} d;

so is_dynamic determines which to use here?  Please be explicit.


thanks,

greg k-h




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux