On 7/29/22 12:11 PM, Aneesh Kumar K V wrote: > On 7/29/22 12:09 PM, Huang, Ying wrote: >> "Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxx> writes: >> >>> With memory tiers support we can have memory only NUMA nodes >>> in the top tier from which we want to avoid promotion tracking NUMA >>> faults. Update node_is_toptier to work with memory tiers. >>> All NUMA nodes are by default top tier nodes. With lower memory >>> tiers added we consider all memory tiers above a memory tier having >>> CPU NUMA nodes as a top memory tier >>> >>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx> >>> --- >>> include/linux/memory-tiers.h | 11 ++++++++++ >>> include/linux/node.h | 5 ----- >>> mm/huge_memory.c | 1 + >>> mm/memory-tiers.c | 42 ++++++++++++++++++++++++++++++++++++ >>> mm/migrate.c | 1 + >>> mm/mprotect.c | 1 + >>> 6 files changed, 56 insertions(+), 5 deletions(-) >>> >>> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h >>> index f8dbeda617a7..bc9fb9d39b2c 100644 >>> --- a/include/linux/memory-tiers.h >>> +++ b/include/linux/memory-tiers.h >>> @@ -35,6 +35,7 @@ struct memory_dev_type *init_node_memory_type(int node, struct memory_dev_type * >>> #ifdef CONFIG_MIGRATION >>> int next_demotion_node(int node); >>> void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets); >>> +bool node_is_toptier(int node); >>> #else >>> static inline int next_demotion_node(int node) >>> { >>> @@ -45,6 +46,11 @@ static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *target >>> { >>> *targets = NODE_MASK_NONE; >>> } >>> + >>> +static inline bool node_is_toptier(int node) >>> +{ >>> + return true; >>> +} >>> #endif >>> >>> #else >>> @@ -64,5 +70,10 @@ static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *target >>> { >>> *targets = NODE_MASK_NONE; >>> } >>> + >>> +static inline bool node_is_toptier(int node) >>> +{ >>> + return true; >>> +} >>> #endif /* CONFIG_NUMA */ >>> #endif /* _LINUX_MEMORY_TIERS_H */ >>> 
diff --git a/include/linux/node.h b/include/linux/node.h >>> index 40d641a8bfb0..9ec680dd607f 100644 >>> --- a/include/linux/node.h >>> +++ b/include/linux/node.h >>> @@ -185,9 +185,4 @@ static inline void register_hugetlbfs_with_node(node_registration_func_t reg, >>> >>> #define to_node(device) container_of(device, struct node, dev) >>> >>> -static inline bool node_is_toptier(int node) >>> -{ >>> - return node_state(node, N_CPU); >>> -} >>> - >>> #endif /* _LINUX_NODE_H_ */ >>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >>> index 834f288b3769..8405662646e9 100644 >>> --- a/mm/huge_memory.c >>> +++ b/mm/huge_memory.c >>> @@ -35,6 +35,7 @@ >>> #include <linux/numa.h> >>> #include <linux/page_owner.h> >>> #include <linux/sched/sysctl.h> >>> +#include <linux/memory-tiers.h> >>> >>> #include <asm/tlb.h> >>> #include <asm/pgalloc.h> >>> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c >>> index 84e2be31a853..36d87dc422ab 100644 >>> --- a/mm/memory-tiers.c >>> +++ b/mm/memory-tiers.c >>> @@ -30,6 +30,7 @@ static DEFINE_MUTEX(memory_tier_lock); >>> static LIST_HEAD(memory_tiers); >>> struct memory_dev_type *node_memory_types[MAX_NUMNODES]; >>> #ifdef CONFIG_MIGRATION >>> +static int top_tier_adistance; >>> /* >>> * node_demotion[] examples: >>> * >>> @@ -159,6 +160,31 @@ static struct memory_tier *__node_get_memory_tier(int node) >>> } >>> >>> #ifdef CONFIG_MIGRATION >>> +bool node_is_toptier(int node) >>> +{ >>> + bool toptier; >>> + pg_data_t *pgdat; >>> + struct memory_tier *memtier; >>> + >>> + pgdat = NODE_DATA(node); >>> + if (!pgdat) >>> + return false; >>> + >>> + rcu_read_lock(); >>> + memtier = rcu_dereference(pgdat->memtier); >>> + if (!memtier) { >>> + toptier = true; >>> + goto out; >>> + } >>> + if (memtier->adistance_start >= top_tier_adistance) >>> + toptier = true; >>> + else >>> + toptier = false; >>> +out: >>> + rcu_read_unlock(); >>> + return toptier; >>> +} >>> + >>> void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets) >>> { 
>>> struct memory_tier *memtier; >>> @@ -315,6 +341,22 @@ static void establish_demotion_targets(void) >>> } >>> } while (1); >>> } >>> + /* >>> + * Promotion is allowed from a memory tier to higher >>> + * memory tier only if the memory tier doesn't include >>> + * compute. We want to skip promotion from a memory tier, >>> + * if any node that is part of the memory tier have CPUs. >>> + * Once we detect such a memory tier, we consider that tier >>> + * as top tiper from which promotion on is not allowed. >>> + */ >>> + list_for_each_entry(memtier, &memory_tiers, list) { >>> + tier_nodes = get_memtier_nodemask(memtier); >>> + nodes_and(tier_nodes, node_states[N_CPU], tier_nodes); >>> + if (!nodes_empty(tier_nodes)) { >>> + top_tier_adistance = memtier->adistance_start; >> >> IMHO, this should be, >> >> top_tier_adistance = memtier->adistance_start + MEMTIER_CHUNK_SIZE; >> > > Good catch. Will update. BTW, I have already sent v12 of the patchset to the list. > > Checking this again, we consider a node to be top tier if its memtier's abstract distance satisfies the condition below: if (memtier->adistance_start <= top_tier_adistance) toptier = true; With that we should be good with the current code. But I agree with you that top_tier_adistance should cover the full range of the top memory tier. -aneesh