"Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxx> writes: > Also update different helpes to use NODE_DATA()->memtier. Since > node specific memtier can change based on the reassignment of > NUMA node to a different memory tiers, accessing NODE_DATA()->memtier > needs to happen under an rcu read lock or memory_tier_lock. > > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx> > --- > include/linux/mmzone.h | 3 ++ > mm/memory-tiers.c | 64 +++++++++++++++++++++++++++++++----------- > 2 files changed, 50 insertions(+), 17 deletions(-) > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index aab70355d64f..353812495a70 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -928,6 +928,9 @@ typedef struct pglist_data { > /* Per-node vmstats */ > struct per_cpu_nodestat __percpu *per_cpu_nodestats; > atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; > +#ifdef CONFIG_NUMA > + struct memory_tier __rcu *memtier; > +#endif > } pg_data_t; > > #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) > diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c > index e951f54ce56c..bab4700bf58d 100644 > --- a/mm/memory-tiers.c > +++ b/mm/memory-tiers.c > @@ -7,6 +7,7 @@ > #include <linux/moduleparam.h> > #include <linux/memory.h> > #include <linux/random.h> > +#include <linux/rcupdate.h> > #include <linux/memory-tiers.h> > > #include "internal.h" > @@ -124,18 +125,23 @@ static struct memory_tier *register_memory_tier(unsigned int tier) > static void unregister_memory_tier(struct memory_tier *memtier) > { > list_del(&memtier->list); > - kfree(memtier); > + kfree_rcu(memtier); > } > > static struct memory_tier *__node_get_memory_tier(int node) > { > - struct memory_tier *memtier; > + pg_data_t *pgdat; > > - list_for_each_entry(memtier, &memory_tiers, list) { > - if (node_isset(node, memtier->nodelist)) > - return memtier; > - } > - return NULL; > + pgdat = NODE_DATA(node); > + if (!pgdat) > + return NULL; > + /* > + * Since we hold memory_tier_lock, we can avoid > + * RCU read locks when accessing the details. No > + * parallel updates are possible here. > + */ > + return rcu_dereference_check(pgdat->memtier, > + lockdep_is_held(&memory_tier_lock)); > } > > static struct memory_tier *__get_memory_tier_from_id(int id) > @@ -149,6 +155,33 @@ static struct memory_tier *__get_memory_tier_from_id(int id) > return NULL; > } > > +/* > + * Called with memory_tier_lock. Hence the device references cannot > + * be dropped during this function. > + */ > +static void memtier_node_set(int node, struct memory_tier *memtier) > +{ > + pg_data_t *pgdat; > + struct memory_tier *current_memtier; > + > + pgdat = NODE_DATA(node); > + if (!pgdat) > + return; > + /* > + * Make sure we mark the memtier NULL before we assign the new memory tier > + * to the NUMA node. This make sure that anybody looking at NODE_DATA > + * finds a NULL memtier or the one which is still valid. > + */ > + current_memtier = rcu_dereference_check(pgdat->memtier, > + lockdep_is_held(&memory_tier_lock)); > + rcu_assign_pointer(pgdat->memtier, NULL); > + if (current_memtier) > + node_clear(node, current_memtier->nodelist); It seems odd to me that you would update the current memtier prior to the synchronize_rcu(). I suppose it's really memory_tier_lock that protects the details like ->nodelist, but is there any reason not do the update after anyway? > + synchronize_rcu(); > + node_set(node, memtier->nodelist); > + rcu_assign_pointer(pgdat->memtier, memtier); > +} > + > static int __node_create_and_set_memory_tier(int node, int tier) > { > int ret = 0; > @@ -162,7 +195,7 @@ static int __node_create_and_set_memory_tier(int node, int tier) > goto out; > } > } > - node_set(node, memtier->nodelist); > + memtier_node_set(node, memtier); > out: > return ret; > } > @@ -184,14 +217,7 @@ int node_create_and_set_memory_tier(int node, int tier) > if (current_tier->id == tier) > goto out; > > - node_clear(node, current_tier->nodelist); > - > ret = __node_create_and_set_memory_tier(node, tier); > - if (ret) { > - /* reset it back to older tier */ > - node_set(node, current_tier->nodelist); > - goto out; > - } > if (nodes_empty(current_tier->nodelist)) > unregister_memory_tier(current_tier); > > @@ -213,7 +239,7 @@ static int __node_set_memory_tier(int node, int tier) > ret = -EINVAL; > goto out; > } > - node_set(node, memtier->nodelist); > + memtier_node_set(node, memtier); > out: > return ret; > } > @@ -428,6 +454,7 @@ static void __init migrate_on_reclaim_init(void) > > static int __init memory_tier_init(void) > { > + int node; > struct memory_tier *memtier; > > /* > @@ -444,7 +471,10 @@ static int __init memory_tier_init(void) > __func__, PTR_ERR(memtier)); > > /* CPU only nodes are not part of memory tiers. */ > - memtier->nodelist = node_states[N_MEMORY]; > + for_each_node_state(node, N_MEMORY) { > + rcu_assign_pointer(NODE_DATA(node)->memtier, memtier); > + node_set(node, memtier->nodelist); Similar comment here - the order seems opposite to what I'd expect. Shouldn't memtier->nodelist be fully initialised prior to making it visible with rcu_assign_pointer()? > + } > mutex_unlock(&memory_tier_lock); > > migrate_on_reclaim_init();