Ying Huang <ying.huang@xxxxxxxxx> writes: .... > > > >> > > is this good (not tested)? >> > > /* >> > > * build the allowed promotion mask. Promotion is allowed >> > > * from higher memory tier to lower memory tier only if >> > > * lower memory tier doesn't include compute. We want to >> > > * skip promotion from a memory tier, if any node which is >> > > * part of that memory tier have CPUs. Once we detect such >> > > * a memory tier, we consider that tier as top tier from >> > > * which promotion is not allowed. >> > > */ >> > > list_for_each_entry_reverse(memtier, &memory_tiers, list) { >> > > nodes_and(allowed, node_state[N_CPU], memtier->nodelist); >> > > if (nodes_empty(allowed)) >> > > nodes_or(promotion_mask, promotion_mask, allowed); >> > > else >> > > break; >> > > } >> > > >> > > and then >> > > >> > > static inline bool node_is_toptier(int node) >> > > { >> > > >> > > return !node_isset(node, promotion_mask); >> > > } >> > > >> > >> > This should work. But it appears unnatural. So, I don't think we >> > should avoid to add more and more node masks to mitigate the design >> > decision that we cannot access memory tier information directly. All >> > these becomes simple and natural, if we can access memory tier >> > information directly. >> > >> >> how do you derive whether node is toptier details if we have memtier >> details in pgdat? > > pgdat -> memory tier -> rank > > Then we can compare this rank with the fast memory rank. The fast > memory rank can be calculated dynamically at appropriate places. This is what I am testing now. We still need to closely audit the lock-free access to NODE_DATA()->memtier. For v6 I will keep this as a separate patch, and once we all agree that it is safe, I will fold it back. 
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index a388a806b61a..3e733de1a8a0 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -17,7 +17,6 @@ #define MAX_MEMORY_TIERS (MAX_STATIC_MEMORY_TIERS + 2) extern bool numa_demotion_enabled; -extern nodemask_t promotion_mask; int node_create_and_set_memory_tier(int node, int tier); int next_demotion_node(int node); int node_set_memory_tier(int node, int tier); @@ -25,15 +24,7 @@ int node_get_memory_tier_id(int node); int node_reset_memory_tier(int node, int tier); void node_remove_from_memory_tier(int node); void node_get_allowed_targets(int node, nodemask_t *targets); - -/* - * By default all nodes are top tiper. As we create new memory tiers - * we below top tiers we add them to NON_TOP_TIER state. - */ -static inline bool node_is_toptier(int node) -{ - return !node_isset(node, promotion_mask); -} +bool node_is_toptier(int node); #else #define numa_demotion_enabled false diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index aab70355d64f..c4fcfd2b9980 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -928,6 +928,9 @@ typedef struct pglist_data { /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; +#ifdef CONFIG_TIERED_MEMORY + struct memory_tier *memtier; +#endif } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 29a038bb38b0..31ef0fab5f19 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -7,6 +7,7 @@ #include <linux/random.h> #include <linux/memory.h> #include <linux/idr.h> +#include <linux/rcupdate.h> #include "internal.h" @@ -26,7 +27,7 @@ struct demotion_nodes { static void establish_migration_targets(void); static DEFINE_MUTEX(memory_tier_lock); static LIST_HEAD(memory_tiers); -nodemask_t promotion_mask; +static int top_tier_rank; /* * node_demotion[] examples: * @@ 
-135,7 +136,7 @@ static void memory_tier_device_release(struct device *dev) if (tier->dev.id >= MAX_STATIC_MEMORY_TIERS) ida_free(&memtier_dev_id, tier->dev.id); - kfree(tier); + kfree_rcu(tier); } /* @@ -233,6 +234,70 @@ static struct memory_tier *__get_memory_tier_from_id(int id) return NULL; } +/* + * Called with memory_tier_lock. Hence the device references cannot + * be dropped during this function. + */ +static void memtier_node_clear(int node, struct memory_tier *memtier) +{ + pg_data_t *pgdat; + + pgdat = NODE_DATA(node); + if (!pgdat) + return; + + rcu_assign_pointer(pgdat->memtier, NULL); + /* + * Make sure read side see the NULL value before we clear the node + * from the nodelist. + */ + synchronize_rcu(); + node_clear(node, memtier->nodelist); +} + +static void memtier_node_set(int node, struct memory_tier *memtier) +{ + pg_data_t *pgdat; + + pgdat = NODE_DATA(node); + if (!pgdat) + return; + /* + * Make sure we mark the memtier NULL before we assign the new memory tier + * to the NUMA node. This make sure that anybody looking at NODE_DATA + * finds a NULL memtier or the one which is still valid. 
+ */ + rcu_assign_pointer(pgdat->memtier, NULL); + synchronize_rcu(); + node_set(node, memtier->nodelist); + rcu_assign_pointer(pgdat->memtier, memtier); +} + +bool node_is_toptier(int node) +{ + bool toptier; + pg_data_t *pgdat; + struct memory_tier *memtier; + + pgdat = NODE_DATA(node); + if (!pgdat) + return false; + + rcu_read_lock(); + memtier = rcu_dereference(pgdat->memtier); + if (!memtier) { + toptier = true; + goto out; + } + if (memtier->rank >= top_tier_rank) + toptier = true; + else + toptier = false; +out: + rcu_read_unlock(); + return toptier; +} + static int __node_create_and_set_memory_tier(int node, int tier) { int ret = 0; @@ -253,7 +318,7 @@ static int __node_create_and_set_memory_tier(int node, int tier) goto out; } } - node_set(node, memtier->nodelist); + memtier_node_set(node, memtier); out: return ret; } @@ -275,12 +340,12 @@ int node_create_and_set_memory_tier(int node, int tier) if (current_tier->dev.id == tier) goto out; - node_clear(node, current_tier->nodelist); + memtier_node_clear(node, current_tier); ret = __node_create_and_set_memory_tier(node, tier); if (ret) { /* reset it back to older tier */ - node_set(node, current_tier->nodelist); + memtier_node_set(node, current_tier); goto out; } @@ -305,7 +370,7 @@ static int __node_set_memory_tier(int node, int tier) ret = -EINVAL; goto out; } - node_set(node, memtier->nodelist); + memtier_node_set(node, memtier); out: return ret; } @@ -374,12 +439,12 @@ int node_reset_memory_tier(int node, int tier) if (current_tier->dev.id == tier) goto out; - node_clear(node, current_tier->nodelist); + memtier_node_clear(node, current_tier); ret = __node_set_memory_tier(node, tier); if (ret) { /* reset it back to older tier */ - node_set(node, current_tier->nodelist); + memtier_node_set(node, current_tier); goto out; } @@ -407,7 +472,7 @@ void node_remove_from_memory_tier(int node) * empty then unregister it to make it invisible * in sysfs. 
*/ - node_clear(node, memtier->nodelist); + memtier_node_clear(node, memtier); if (nodes_empty(memtier->nodelist)) unregister_memory_tier(memtier); @@ -570,15 +635,13 @@ static void establish_migration_targets(void) * a memory tier, we consider that tier as top tiper from * which promotion is not allowed. */ - promotion_mask = NODE_MASK_NONE; list_for_each_entry_reverse(memtier, &memory_tiers, list) { nodes_and(allowed, node_states[N_CPU], memtier->nodelist); - if (nodes_empty(allowed)) - nodes_or(promotion_mask, promotion_mask, memtier->nodelist); - else + if (!nodes_empty(allowed)) { + top_tier_rank = memtier->rank; break; + } } - pr_emerg("top tier rank is %d\n", top_tier_rank); allowed = NODE_MASK_NONE; /* @@ -748,7 +811,7 @@ static const struct attribute_group *memory_tier_attr_groups[] = { static int __init memory_tier_init(void) { - int ret; + int ret, node; struct memory_tier *memtier; ret = subsys_system_register(&memory_tier_subsys, memory_tier_attr_groups); @@ -766,7 +829,13 @@ static int __init memory_tier_init(void) panic("%s() failed to register memory tier: %d\n", __func__, ret); /* CPU only nodes are not part of memory tiers. */ - memtier->nodelist = node_states[N_MEMORY]; + for_each_node_state(node, N_MEMORY) { + /* + * Should be safe to do this early in the boot. + */ + NODE_DATA(node)->memtier = memtier; + node_set(node, memtier->nodelist); + } migrate_on_reclaim_init(); return 0;