"Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxx> writes: > "Huang, Ying" <ying.huang@xxxxxxxxx> writes: > >> "Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxx> writes: >> >>> By default, all nodes are assigned to the default memory tier which >>> is the memory tier designated for nodes with DRAM >>> >>> Set dax kmem device node's tier to slower memory tier by assigning >>> abstract distance to MEMTIER_ADISTANCE_PMEM. PMEM tier >>> appears below the default memory tier in demotion order. >>> >>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx> >>> --- >>> drivers/dax/kmem.c | 9 +++++++++ >>> include/linux/memory-tiers.h | 19 ++++++++++++++++++- >>> mm/memory-tiers.c | 28 ++++++++++++++++------------ >>> 3 files changed, 43 insertions(+), 13 deletions(-) >>> >>> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c >>> index a37622060fff..6b0d5de9a3e9 100644 >>> --- a/drivers/dax/kmem.c >>> +++ b/drivers/dax/kmem.c >>> @@ -11,6 +11,7 @@ >>> #include <linux/fs.h> >>> #include <linux/mm.h> >>> #include <linux/mman.h> >>> +#include <linux/memory-tiers.h> >>> #include "dax-private.h" >>> #include "bus.h" >>> >>> @@ -41,6 +42,12 @@ struct dax_kmem_data { >>> struct resource *res[]; >>> }; >>> >>> +static struct memory_dev_type default_pmem_type = { >> >> Why is this named as default_pmem_type? We will not change the memory >> type of a node usually. >> > > Any other suggestion? pmem_dev_type? Or dax_pmem_type? DAX is used to enumerate the memory device. > >>> + .adistance = MEMTIER_ADISTANCE_PMEM, >>> + .tier_sibiling = LIST_HEAD_INIT(default_pmem_type.tier_sibiling), >>> + .nodes = NODE_MASK_NONE, >>> +}; >>> + >>> static int dev_dax_kmem_probe(struct dev_dax *dev_dax) >>> { >>> struct device *dev = &dev_dax->dev; >>> @@ -62,6 +69,8 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax) >>> return -EINVAL; >>> } >>> >>> + init_node_memory_type(numa_node, &default_pmem_type); >>> + >> >> The memory hot-add below may fail. 
So the error handling needs to be >> added. >> >> And, it appears that the memory type and memory tier of a node may be >> fully initialized here before NUMA hot-adding started. So I suggest to >> set node_memory_types[] here only. And set memory_dev_type->nodes in >> node hot-add callback. I think that is the proper place to complete >> the initialization. >> >> And, in theory dax/kmem.c can be unloaded. So we need to clear >> node_memory_types[] for nodes somewhere. >> > > I guess by module exit we can be sure that all the memory managed > by dax/kmem is hotplugged out. How about something like below? Because we set node_memory_types[] in dev_dax_kmem_probe(), it's natural to clear it in dev_dax_kmem_remove(). Best Regards, Huang, Ying > diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c > index 6b0d5de9a3e9..eb4e158012a9 100644 > --- a/drivers/dax/kmem.c > +++ b/drivers/dax/kmem.c > @@ -248,6 +248,7 @@ static void __exit dax_kmem_exit(void) > dax_driver_unregister(&device_dax_kmem_driver); > if (!any_hotremove_failed) > kfree_const(kmem_name); > + unregister_memory_type(&default_pmem_type); > } > > MODULE_AUTHOR("Intel Corporation"); > diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h > index fc6b7a14da51..8355baf5b8b4 100644 > --- a/include/linux/memory-tiers.h > +++ b/include/linux/memory-tiers.h > @@ -31,6 +31,7 @@ struct memory_dev_type { > #ifdef CONFIG_NUMA > extern bool numa_demotion_enabled; > void init_node_memory_type(int node, struct memory_dev_type *default_type); > +void unregister_memory_type(struct memory_dev_type *memtype); > #ifdef CONFIG_MIGRATION > int next_demotion_node(int node); > void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets); > @@ -57,6 +58,10 @@ static inline bool node_is_toptier(int node) > #define numa_demotion_enabled false > static inline void init_node_memory_type(int node, struct memory_dev_type *default_type) > { > +} > + > +static inline void unregister_memory_type(struct 
memory_dev_type *memtype) > +{ > > } > > diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c > index 064e0f932795..4d29ebd4c4f3 100644 > --- a/mm/memory-tiers.c > +++ b/mm/memory-tiers.c > @@ -500,6 +500,28 @@ void init_node_memory_type(int node, struct memory_dev_type *default_type) > mutex_unlock(&memory_tier_lock); > } > > +void unregister_memory_type(struct memory_dev_type *memtype) > +{ > + int node; > + struct memory_tier *memtier = memtype->memtier; > + > + mutex_lock(&memory_tier_lock); > + for(node = 0; node < MAX_NUMNODES; node++) { > + if (node_memory_types[node] == memtype) { > + if (!nodes_empty(memtype->nodes)) > + WARN_ON(1); > + node_memory_types[node] = NULL; > + } > + } > + > + list_del(&memtype->tier_sibiling); > + memtype->memtier = NULL; > + if (list_empty(&memtier->memory_types)) > + destroy_memory_tier(memtier); > + > + mutex_unlock(&memory_tier_lock); > +} > + > void update_node_adistance(int node, struct memory_dev_type *memtype) > { > pg_data_t *pgdat;