With multiple dax devices having the same node affinity, the kernel wrongly assigns the default_dram memory type to some of the devices after a memory hotplug operation. Fix this by clearing node_memory_types only when the last dax device mapped to the node is removed, instead of on every dax device removal. The current kernel clears node_memory_type on successful removal of a dax device. But multiple dax devices can have the same node affinity, so clearing node_memory_type on the first removal results in the other dax devices on that node being assigned the default dram type when they are brought online. Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx> --- mm/memory-tiers.c | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index ba844fe9cc8c..c4bd6d052a33 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -27,9 +27,14 @@ struct demotion_nodes { nodemask_t preferred; }; +struct node_memory_type_map { + struct memory_dev_type *memtype; + int map_count; +}; + static DEFINE_MUTEX(memory_tier_lock); static LIST_HEAD(memory_tiers); -static struct memory_dev_type *node_memory_types[MAX_NUMNODES]; +static struct node_memory_type_map node_memory_types[MAX_NUMNODES]; static struct memory_dev_type *default_dram_type; #ifdef CONFIG_MIGRATION static int top_tier_adistance; @@ -386,9 +391,19 @@ static inline void establish_demotion_targets(void) {} static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype) { - if (!node_memory_types[node]) { - node_memory_types[node] = memtype; - kref_get(&memtype->kref); + if (!node_memory_types[node].memtype) + node_memory_types[node].memtype = memtype; + /* + * for each device getting added in the same NUMA node + * with this specific memtype, bump the map count. We + * Only take memtype device reference once, so that + * changing a node memtype can be done by droping the + * only reference count taken here. 
+ */ + + if (node_memory_types[node].memtype == memtype) { + if (!node_memory_types[node].map_count++) + kref_get(&memtype->kref); } } @@ -406,7 +421,7 @@ static struct memory_tier *set_node_memory_tier(int node) __init_node_memory_type(node, default_dram_type); - memtype = node_memory_types[node]; + memtype = node_memory_types[node].memtype; node_set(node, memtype->nodes); memtier = find_create_memory_tier(memtype); if (!IS_ERR(memtier)) @@ -448,7 +463,7 @@ static bool clear_node_memory_tier(int node) rcu_assign_pointer(pgdat->memtier, NULL); synchronize_rcu(); - memtype = node_memory_types[node]; + memtype = node_memory_types[node].memtype; node_clear(node, memtype->nodes); if (nodes_empty(memtype->nodes)) { list_del_init(&memtype->tier_sibiling); @@ -502,8 +517,14 @@ EXPORT_SYMBOL_GPL(init_node_memory_type); void clear_node_memory_type(int node, struct memory_dev_type *memtype) { mutex_lock(&memory_tier_lock); - if (node_memory_types[node] == memtype) { - node_memory_types[node] = NULL; + if (node_memory_types[node].memtype == memtype) + node_memory_types[node].map_count--; + /* + * If we umapped all the attached devices to this node, + * clear the node memory type. + */ + if (!node_memory_types[node].map_count) { + node_memory_types[node].memtype = NULL; kref_put(&memtype->kref, release_memtype); } mutex_unlock(&memory_tier_lock); -- 2.37.2