"Huang, Ying" <ying.huang@xxxxxxxxx> writes: > "Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxx> writes: > >> By default, all nodes are assigned to the default memory tier which >> is the memory tier designated for nodes with DRAM >> >> Set dax kmem device node's tier to a slower memory tier by assigning >> abstract distance to MEMTIER_ADISTANCE_PMEM. The PMEM tier >> appears below the default memory tier in demotion order. >> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx> >> --- >> drivers/dax/kmem.c | 9 +++++++++ >> include/linux/memory-tiers.h | 19 ++++++++++++++++++- >> mm/memory-tiers.c | 28 ++++++++++++++++------------ >> 3 files changed, 43 insertions(+), 13 deletions(-) >> >> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c >> index a37622060fff..6b0d5de9a3e9 100644 >> --- a/drivers/dax/kmem.c >> +++ b/drivers/dax/kmem.c >> @@ -11,6 +11,7 @@ >> #include <linux/fs.h> >> #include <linux/mm.h> >> #include <linux/mman.h> >> +#include <linux/memory-tiers.h> >> #include "dax-private.h" >> #include "bus.h" >> >> @@ -41,6 +42,12 @@ struct dax_kmem_data { >> struct resource *res[]; >> }; >> >> +static struct memory_dev_type default_pmem_type = { > > Why is this named default_pmem_type? We usually will not change the memory > type of a node. > Any other suggestion? pmem_dev_type? >> + .adistance = MEMTIER_ADISTANCE_PMEM, >> + .tier_sibiling = LIST_HEAD_INIT(default_pmem_type.tier_sibiling), >> + .nodes = NODE_MASK_NONE, >> +}; >> + >> static int dev_dax_kmem_probe(struct dev_dax *dev_dax) >> { >> struct device *dev = &dev_dax->dev; >> @@ -62,6 +69,8 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax) >> return -EINVAL; >> } >> >> + init_node_memory_type(numa_node, &default_pmem_type); >> + > > The memory hot-add below may fail, so error handling needs to be > added. > > And, it appears that the memory type and memory tier of a node may be > fully initialized here before NUMA hot-adding has started. 
So I suggest > setting node_memory_types[] here only. And set memory_dev_type->nodes in > the node hot-add callback. I think that is the proper place to complete > the initialization. > > And, in theory dax/kmem.c can be unloaded. So we need to clear > node_memory_types[] for nodes somewhere. > I guess by module exit we can be sure that all the memory managed by dax/kmem is hotplugged out. How about something like the following? diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c index 6b0d5de9a3e9..eb4e158012a9 100644 --- a/drivers/dax/kmem.c +++ b/drivers/dax/kmem.c @@ -248,6 +248,7 @@ static void __exit dax_kmem_exit(void) dax_driver_unregister(&device_dax_kmem_driver); if (!any_hotremove_failed) kfree_const(kmem_name); + unregister_memory_type(&default_pmem_type); } MODULE_AUTHOR("Intel Corporation"); diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index fc6b7a14da51..8355baf5b8b4 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -31,6 +31,7 @@ struct memory_dev_type { #ifdef CONFIG_NUMA extern bool numa_demotion_enabled; void init_node_memory_type(int node, struct memory_dev_type *default_type); +void unregister_memory_type(struct memory_dev_type *memtype); #ifdef CONFIG_MIGRATION int next_demotion_node(int node); void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets); @@ -57,6 +58,10 @@ static inline bool node_is_toptier(int node) #define numa_demotion_enabled false static inline void init_node_memory_type(int node, struct memory_dev_type *default_type) { +} + +static inline void unregister_memory_type(struct memory_dev_type *memtype) +{ } diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 064e0f932795..4d29ebd4c4f3 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -500,6 +500,28 @@ void init_node_memory_type(int node, struct memory_dev_type *default_type) mutex_unlock(&memory_tier_lock); } +void unregister_memory_type(struct memory_dev_type *memtype) +{ + int node; + struct memory_tier 
*memtier = memtype->memtier; + + mutex_lock(&memory_tier_lock); + for(node = 0; node < MAX_NUMNODES; node++) { + if (node_memory_types[node] == memtype) { + if (!nodes_empty(memtype->nodes)) + WARN_ON(1); + node_memory_types[node] = NULL; + } + } + + list_del(&memtype->tier_sibiling); + memtype->memtier = NULL; + if (list_empty(&memtier->memory_types)) + destroy_memory_tier(memtier); + + mutex_unlock(&memory_tier_lock); +} + void update_node_adistance(int node, struct memory_dev_type *memtype) { pg_data_t *pgdat;