On Fri, 27 May 2022 17:55:22 +0530
"Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxx> wrote:

> From: Jagdish Gediya <jvgediya@xxxxxxxxxxxxx>
>
> In the current kernel, memory tiers are defined implicitly via a
> demotion path relationship between NUMA nodes, which is created
> during kernel initialization and updated when a NUMA node is
> hot-added or hot-removed. The current implementation puts all
> nodes with CPUs into the top tier, and builds the tier hierarchy
> tier-by-tier by establishing the per-node demotion targets based
> on the distances between nodes.
>
> This current memory tier kernel interface needs to be improved for
> several important use cases:
>
> The current tier initialization code always initializes each
> memory-only NUMA node into a lower tier. But a memory-only NUMA
> node may have a high performance memory device (e.g. a DRAM device
> attached via CXL.mem or a DRAM-backed memory-only node on a
> virtual machine) and should be put into a higher tier.
>
> The current tier hierarchy always puts CPU nodes into the top
> tier. But on a system with HBM or GPU devices, the memory-only
> NUMA nodes mapping these devices should be in the top tier, and
> DRAM nodes with CPUs are better placed in the next lower tier.
>
> With the current kernel, a higher-tier node can only be demoted to
> selected nodes on the next lower tier as defined by the demotion
> path, not to any other node from any lower tier. This strict,
> hard-coded demotion order does not work in all use cases (e.g.
> some use cases may want to allow cross-socket demotion to another
> node in the same demotion tier as a fallback when the preferred
> demotion node is out of space). This demotion order is also
> inconsistent with the page allocation fallback order when all the
> nodes in a higher tier are out of space: the page allocation can
> fall back to any node from any lower tier, whereas the demotion
> order doesn't allow that.
>
> The current kernel also doesn't provide any interfaces for
> userspace to learn about the memory tier hierarchy in order to
> optimize its memory allocations.
>
> This patch series addresses the above by defining memory tiers
> explicitly.
>
> This patch adds the below sysfs interface, which is read-only and
> can be used to read the nodes available in a specific tier:
>
> /sys/devices/system/memtier/memtierN/nodelist
>
> Tier 0 is the highest tier, while tier MAX_MEMORY_TIERS - 1 is the
> lowest tier. The absolute value of a tier id number has no
> specific meaning; what matters is the relative order of the tier
> id numbers.
>
> All the tiered memory code is guarded by CONFIG_TIERED_MEMORY.
> The default number of memory tiers is MAX_MEMORY_TIERS (3). All
> nodes are by default assigned to DEFAULT_MEMORY_TIER (1).
>
> The default memory tier can be read from:
> /sys/devices/system/memtier/default_tier
>
> The maximum number of memory tiers can be read from:
> /sys/devices/system/memtier/max_tiers
>
> This patch implements the RFC spec sent by Wei Xu <weixugc@xxxxxxxxxx> at [1].
>
> [1] https://lore.kernel.org/linux-mm/CAAPL-u-DGLcKRVDnChN9ZhxPkfxQvz9Sb93kVoX_4J2oiJSkUw@xxxxxxxxxxxxxx/
>
> Signed-off-by: Jagdish Gediya <jvgediya@xxxxxxxxxxxxx>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx>

Hi Aneesh,

Superficial review only for this first pass. We'll give it a spin next
week and come back with anything more substantial.
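From the description above, I'd expect the interface with the defaults
to look roughly like the following (not tried here yet; the nodelist
shown is just an illustrative two-node example and will of course
depend on the machine):

  $ cat /sys/devices/system/memtier/max_tiers
  3
  $ cat /sys/devices/system/memtier/default_tier
  1
  $ cat /sys/devices/system/memtier/memtier1/nodelist
  0-1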
Thanks,

Jonathan

> ---
>  include/linux/migrate.h |  38 ++++++++----
>  mm/Kconfig              |  11 ++++
>  mm/migrate.c            | 134 ++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 170 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> index 90e75d5a54d6..0ec653623565 100644
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -47,17 +47,8 @@ void folio_migrate_copy(struct folio *newfolio, struct folio *folio);
>  int folio_migrate_mapping(struct address_space *mapping,
>  		struct folio *newfolio, struct folio *folio, int extra_count);
>
> -extern bool numa_demotion_enabled;
> -extern void migrate_on_reclaim_init(void);
> -#ifdef CONFIG_HOTPLUG_CPU
> -extern void set_migration_target_nodes(void);
> -#else
> -static inline void set_migration_target_nodes(void) {}
> -#endif
>  #else
>
> -static inline void set_migration_target_nodes(void) {}
> -
>  static inline void putback_movable_pages(struct list_head *l) {}
>  static inline int migrate_pages(struct list_head *l, new_page_t new,
>  		free_page_t free, unsigned long private, enum migrate_mode mode,
> @@ -82,7 +73,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
>  	return -ENOSYS;
>  }
>
> -#define numa_demotion_enabled false
>  #endif /* CONFIG_MIGRATION */
>
>  #ifdef CONFIG_COMPACTION
> @@ -172,15 +162,37 @@ struct migrate_vma {
>  int migrate_vma_setup(struct migrate_vma *args);
>  void migrate_vma_pages(struct migrate_vma *migrate);
>  void migrate_vma_finalize(struct migrate_vma *migrate);
> -int next_demotion_node(int node);
> +#endif /* CONFIG_MIGRATION */
> +
> +#ifdef CONFIG_TIERED_MEMORY
> +
> +extern bool numa_demotion_enabled;
> +#define DEFAULT_MEMORY_TIER 1
> +
> +enum memory_tier_type {
> +	MEMORY_TIER_HBM_GPU,
> +	MEMORY_TIER_DRAM,
> +	MEMORY_TIER_PMEM,
> +	MAX_MEMORY_TIERS

Not used yet. Introduce it in the patch that makes use of it.

> +};
>
> -#else /* CONFIG_MIGRATION disabled: */
> +int next_demotion_node(int node);
>
> +extern void migrate_on_reclaim_init(void);
> +#ifdef CONFIG_HOTPLUG_CPU
> +extern void set_migration_target_nodes(void);
> +#else
> +static inline void set_migration_target_nodes(void) {}
> +#endif
> +#else

Worth noting what this else is for as we have a lot of them about!
Comments as used elsewhere in this file for #else will help.

> +#define numa_demotion_enabled false
>  static inline int next_demotion_node(int node)
>  {
>  	return NUMA_NO_NODE;
>  }
>
> -#endif /* CONFIG_MIGRATION */
> +static inline void set_migration_target_nodes(void) {}
> +static inline void migrate_on_reclaim_init(void) {}
> +#endif /* CONFIG_TIERED_MEMORY */
>
>  #endif /* _LINUX_MIGRATE_H */
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 034d87953600..7bfbddef46ed 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -258,6 +258,17 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION
>  config ARCH_ENABLE_THP_MIGRATION
>  	bool
>
> +config TIERED_MEMORY
> +	bool "Support for explicit memory tiers"
> +	def_bool y

New options defaulting to y are generally hard to argue for.

> +	depends on MIGRATION && NUMA
> +	help
> +	  Support to split nodes into memory tiers explicitly and
> +	  to demote pages on reclaim to lower tiers. This option
> +	  also exposes sysfs interface to read nodes available in
> +	  specific tier and to move specific node among different
> +	  possible tiers.
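On the def_bool comment above: I'd simply drop the default and leave
the option to be selected explicitly, i.e. something like (untested
sketch, only the default dropped):

config TIERED_MEMORY
	bool "Support for explicit memory tiers"
	depends on MIGRATION && NUMA
	help
	  Support to split nodes into memory tiers explicitly and
	  to demote pages on reclaim to lower tiers.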
> +
>  config HUGETLB_PAGE_SIZE_VARIABLE
>  	def_bool n
>  	help
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 6c31ee1e1c9b..f28ee93fb017 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -2118,6 +2118,113 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
>  #endif /* CONFIG_NUMA_BALANCING */
>  #endif /* CONFIG_NUMA */
>
> +#ifdef CONFIG_TIERED_MEMORY

I wonder if it makes sense to put this in a separate file given it's
reasonably separable and that file is big enough already...

> +
> +struct memory_tier {
> +	struct device dev;
> +	nodemask_t nodelist;
> +};
> +
> +#define to_memory_tier(device) container_of(device, struct memory_tier, dev)
> +
> +static struct bus_type memory_tier_subsys = {
> +	.name = "memtier",
> +	.dev_name = "memtier",
> +};
> +
> +static struct memory_tier *memory_tiers[MAX_MEMORY_TIERS];
> +
> +static ssize_t nodelist_show(struct device *dev,
> +			     struct device_attribute *attr, char *buf)
> +{
> +	int tier = dev->id;

	struct memory_tier *tier = memory_tiers[dev->id];

as the local variable will give more readable code, I think.

> +
> +	return sysfs_emit(buf, "%*pbl\n",
> +			  nodemask_pr_args(&memory_tiers[tier]->nodelist));
> +
> +}
> +static DEVICE_ATTR_RO(nodelist);
> +
> +static struct attribute *memory_tier_dev_attrs[] = {
> +	&dev_attr_nodelist.attr,
> +	NULL
> +};
> +
> +static const struct attribute_group memory_tier_dev_group = {
> +	.attrs = memory_tier_dev_attrs,
> +};
> +
> +static const struct attribute_group *memory_tier_dev_groups[] = {
> +	&memory_tier_dev_group,
> +	NULL
> +};
> +
> +static void memory_tier_device_release(struct device *dev)
> +{
> +	struct memory_tier *tier = to_memory_tier(dev);
> +
> +	kfree(tier);
> +}
> +
> +static int register_memory_tier(int tier)
> +{
> +	int error;
> +

I would add a sanity check that the tier is not already present.
Trivial check and might help debugging...

> +	memory_tiers[tier] = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);

Prefer sizeof(*memory_tiers[tier]) to avoid the need to check the
type matches.

> +	if (!memory_tiers[tier])
> +		return -ENOMEM;
> +
> +	memory_tiers[tier]->dev.id = tier;

This would all be simpler to read if you use a local variable:

	struct memory_tier *tier = kzalloc(sizeof(*tier), GFP_KERNEL);

and only assign it to memory_tiers[tier] on successful
device_register() (see the sketch at the end of this mail).

> +	memory_tiers[tier]->dev.bus = &memory_tier_subsys;
> +	memory_tiers[tier]->dev.release = memory_tier_device_release;
> +	memory_tiers[tier]->dev.groups = memory_tier_dev_groups;
> +	error = device_register(&memory_tiers[tier]->dev);
> +
> +	if (error) {
> +		put_device(&memory_tiers[tier]->dev);
> +		memory_tiers[tier] = NULL;
> +	}
> +
> +	return error;
> +}
> +
> +static void unregister_memory_tier(int tier)
> +{
> +	device_unregister(&memory_tiers[tier]->dev);
> +	memory_tiers[tier] = NULL;
> +}
> +
> +static ssize_t
> +max_tiers_show(struct device *dev, struct device_attribute *attr, char *buf)
> +{
> +	return sysfs_emit(buf, "%d\n", MAX_MEMORY_TIERS);
> +}
> +
> +static DEVICE_ATTR_RO(max_tiers);
> +
> +static ssize_t
> +default_tier_show(struct device *dev, struct device_attribute *attr, char *buf)
> +{
> +	return sysfs_emit(buf, "%d\n", DEFAULT_MEMORY_TIER);
> +}

Common convention for these is don't leave a blank line. Helps
associate the single function with the ATTR definition.
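i.e. just (same code as above, only the blank line dropped):

static ssize_t
default_tier_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", DEFAULT_MEMORY_TIER);
}
static DEVICE_ATTR_RO(default_tier);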
> +
> +static DEVICE_ATTR_RO(default_tier);
> +
> +static struct attribute *memoty_tier_attrs[] = {

memory

> +	&dev_attr_max_tiers.attr,
> +	&dev_attr_default_tier.attr,
> +	NULL
> +};
> +
> +static const struct attribute_group memory_tier_attr_group = {
> +	.attrs = memoty_tier_attrs,
> +};
> +
> +static const struct attribute_group *memory_tier_attr_groups[] = {
> +	&memory_tier_attr_group,
> +	NULL,
> +};
> +
>  /*
>   * node_demotion[] example:
>   *
> @@ -2569,3 +2676,30 @@ static int __init numa_init_sysfs(void)
>  }
>  subsys_initcall(numa_init_sysfs);
>  #endif

You've caught up some other stuff in your CONFIG_TIERED_MEMORY ifdef.

> +
> +static int __init memory_tier_init(void)
> +{
> +	int ret;
> +
> +	ret = subsys_system_register(&memory_tier_subsys, memory_tier_attr_groups);
> +	if (ret)
> +		panic("%s() failed to register subsystem: %d\n", __func__, ret);
> +
> +	/*
> +	 * Register only default memory tier to hide all empty
> +	 * memory tier from sysfs.
> +	 */
> +	ret = register_memory_tier(DEFAULT_MEMORY_TIER);
> +	if (ret)
> +		panic("%s() failed to register memory tier: %d\n", __func__, ret);
> +
> +	/*
> +	 * CPU only nodes are not part of memoty tiers.
> +	 */

memory, plus single-line comment syntax is probably better.

> +	memory_tiers[DEFAULT_MEMORY_TIER]->nodelist = node_states[N_MEMORY];
> +
> +	return 0;
> +}
> +subsys_initcall(memory_tier_init);
> +
> +#endif /* CONFIG_TIERED_MEMORY */
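Putting the register_memory_tier() comments above together, I was
thinking of something along these lines (untested sketch only, to
illustrate the local variable plus sanity check; I've used memtier for
the local so it doesn't clash with the tier index parameter):

static int register_memory_tier(int tier)
{
	struct memory_tier *memtier;
	int error;

	/* Sanity check: this tier should not already be registered. */
	if (WARN_ON(memory_tiers[tier]))
		return -EEXIST;

	memtier = kzalloc(sizeof(*memtier), GFP_KERNEL);
	if (!memtier)
		return -ENOMEM;

	memtier->dev.id = tier;
	memtier->dev.bus = &memory_tier_subsys;
	memtier->dev.release = memory_tier_device_release;
	memtier->dev.groups = memory_tier_dev_groups;

	error = device_register(&memtier->dev);
	if (error) {
		put_device(&memtier->dev);
		return error;
	}

	/* Only publish the tier once the device is fully registered. */
	memory_tiers[tier] = memtier;
	return 0;
}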