On 7/25/22 2:24 PM, Huang, Ying wrote: > "Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxx> writes: > >> With memory tiers support we can have memory only NUMA nodes >> in the top tier from which we want to avoid promotion tracking NUMA >> faults. Update node_is_toptier to work with memory tiers. >> All NUMA nodes are by default top tier nodes. With lower memory >> tiers added we consider all memory tiers above a memory tier having >> CPU NUMA nodes as a top memory tier >> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx> >> --- >> include/linux/memory-tiers.h | 11 +++++++++ >> include/linux/node.h | 5 ----- >> mm/huge_memory.c | 1 + >> mm/memory-tiers.c | 43 ++++++++++++++++++++++++++++++++++++ >> mm/migrate.c | 1 + >> mm/mprotect.c | 1 + >> 6 files changed, 57 insertions(+), 5 deletions(-) >> >> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h >> index 0e58588fa066..085dd815bf73 100644 >> --- a/include/linux/memory-tiers.h >> +++ b/include/linux/memory-tiers.h >> @@ -20,6 +20,7 @@ extern bool numa_demotion_enabled; >> #ifdef CONFIG_MIGRATION >> int next_demotion_node(int node); >> void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets); >> +bool node_is_toptier(int node); >> #else >> static inline int next_demotion_node(int node) >> { >> @@ -30,6 +31,11 @@ static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *target >> { >> *targets = NODE_MASK_NONE; >> } >> + >> +static inline bool node_is_toptier(int node) >> +{ >> + return true; >> +} >> #endif >> >> #else >> @@ -44,5 +50,10 @@ static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *target >> { >> *targets = NODE_MASK_NONE; >> } >> + >> +static inline bool node_is_toptier(int node) >> +{ >> + return true; >> +} >> #endif /* CONFIG_NUMA */ >> #endif /* _LINUX_MEMORY_TIERS_H */ >> diff --git a/include/linux/node.h b/include/linux/node.h >> index a2a16d4104fd..d0432db18094 100644 >> --- a/include/linux/node.h >> +++ 
b/include/linux/node.h >> @@ -191,9 +191,4 @@ static inline void register_hugetlbfs_with_node(node_registration_func_t reg, >> >> #define to_node(device) container_of(device, struct node, dev) >> >> -static inline bool node_is_toptier(int node) >> -{ >> - return node_state(node, N_CPU); >> -} >> - >> #endif /* _LINUX_NODE_H_ */ >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >> index 834f288b3769..8405662646e9 100644 >> --- a/mm/huge_memory.c >> +++ b/mm/huge_memory.c >> @@ -35,6 +35,7 @@ >> #include <linux/numa.h> >> #include <linux/page_owner.h> >> #include <linux/sched/sysctl.h> >> +#include <linux/memory-tiers.h> >> >> #include <asm/tlb.h> >> #include <asm/pgalloc.h> >> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c >> index 4a96e4213d66..f0515bfd4051 100644 >> --- a/mm/memory-tiers.c >> +++ b/mm/memory-tiers.c >> @@ -13,6 +13,7 @@ >> >> struct memory_tier { >> struct list_head list; >> + int id; >> int perf_level; >> nodemask_t nodelist; >> nodemask_t lower_tier_mask; >> @@ -26,6 +27,7 @@ static LIST_HEAD(memory_tiers); >> static DEFINE_MUTEX(memory_tier_lock); >> >> #ifdef CONFIG_MIGRATION >> +static int top_tier_id; >> /* >> * node_demotion[] examples: >> * >> @@ -129,6 +131,7 @@ static struct memory_tier *find_create_memory_tier(unsigned int perf_level) >> if (!new_memtier) >> return ERR_PTR(-ENOMEM); >> >> + new_memtier->id = perf_level >> MEMTIER_CHUNK_BITS; >> new_memtier->perf_level = perf_level; >> if (found_slot) >> list_add_tail(&new_memtier->list, ent); >> @@ -154,6 +157,31 @@ static struct memory_tier *__node_get_memory_tier(int node) >> } >> >> #ifdef CONFIG_MIGRATION >> +bool node_is_toptier(int node) >> +{ >> + bool toptier; >> + pg_data_t *pgdat; >> + struct memory_tier *memtier; >> + >> + pgdat = NODE_DATA(node); >> + if (!pgdat) >> + return false; >> + >> + rcu_read_lock(); >> + memtier = rcu_dereference(pgdat->memtier); >> + if (!memtier) { >> + toptier = true; >> + goto out; >> + } >> + if (memtier->id >= top_tier_id) >> + toptier = 
true; >> + else >> + toptier = false; >> +out: >> + rcu_read_unlock(); >> + return toptier; >> +} >> + >> void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets) >> { >> struct memory_tier *memtier; >> @@ -304,6 +332,21 @@ static void establish_migration_targets(void) >> } >> } while (1); >> } >> + /* >> + * Promotion is allowed from a memory tier to a higher >> + * memory tier only if the memory tier doesn't include >> + * compute. We want to skip promotion from a memory tier, >> + * if any node that is part of the memory tier has CPUs. >> + * Once we detect such a memory tier, we consider that tier >> + * as the top tier from which promotion is not allowed. >> + */ >> + list_for_each_entry_reverse(memtier, &memory_tiers, list) { >> + nodes_and(used, node_states[N_CPU], memtier->nodelist); >> + if (!nodes_empty(used)) { >> + top_tier_id = memtier->id; > > I don't think we need to introduce memory tier ID for this. We can add > a top_tier_perf_level, set it here and use it in node_is_toptier(). > Sure. Will switch to that in the next iteration. -aneesh