From: Jagdish Gediya <jvgediya@xxxxxxxxxxxxx> This patch switches the demotion target building logic to use memory tiers instead of NUMA distance. All N_MEMORY NUMA nodes will be placed in the default tier 1 and additional memory tiers will be added by drivers like dax kmem. This patch builds the demotion target for a NUMA node by looking at all memory tiers below the tier to which the NUMA node belongs. The closest node in the immediately following memory tier is used as a demotion target. Since we are now only building demotion targets for N_MEMORY NUMA nodes, the CPU hotplug calls are removed in this patch. Signed-off-by: Jagdish Gediya <jvgediya@xxxxxxxxxxxxx> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx> --- include/linux/migrate.h | 8 - mm/migrate.c | 460 +++++++++++++++------------------------- mm/vmstat.c | 5 - 3 files changed, 172 insertions(+), 301 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index d37d1d5dee82..cbef71a499c1 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -177,12 +177,6 @@ enum memory_tier_type { }; int next_demotion_node(int node); -extern void migrate_on_reclaim_init(void); -#ifdef CONFIG_HOTPLUG_CPU -extern void set_migration_target_nodes(void); -#else -static inline void set_migration_target_nodes(void) {} -#endif int node_get_memory_tier(int node); int node_set_memory_tier(int node, int tier); int node_reset_memory_tier(int node, int tier); @@ -193,8 +187,6 @@ static inline int next_demotion_node(int node) return NUMA_NO_NODE; } -static inline void set_migration_target_nodes(void) {} -static inline void migrate_on_reclaim_init(void) {} #endif /* CONFIG_TIERED_MEMORY */ #endif /* _LINUX_MIGRATE_H */ diff --git a/mm/migrate.c b/mm/migrate.c index 304559ba3372..d819a64db5b1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2125,6 +2125,10 @@ struct memory_tier { nodemask_t nodelist; }; +struct demotion_nodes { + nodemask_t preferred; +}; + #define to_memory_tier(device) 
container_of(device, struct memory_tier, dev) static struct bus_type memory_tier_subsys = { @@ -2132,9 +2136,73 @@ static struct bus_type memory_tier_subsys = { .dev_name = "memtier", }; +static void establish_migration_targets(void); + DEFINE_MUTEX(memory_tier_lock); static struct memory_tier *memory_tiers[MAX_MEMORY_TIERS]; +/* + * node_demotion[] examples: + * + * Example 1: + * + * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes. + * + * node distances: + * node 0 1 2 3 + * 0 10 20 30 40 + * 1 20 10 40 30 + * 2 30 40 10 40 + * 3 40 30 40 10 + * + * memory_tiers[0] = <empty> + * memory_tiers[1] = 0-1 + * memory_tiers[2] = 2-3 + * + * node_demotion[0].preferred = 2 + * node_demotion[1].preferred = 3 + * node_demotion[2].preferred = <empty> + * node_demotion[3].preferred = <empty> + * + * Example 2: + * + * Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node. + * + * node distances: + * node 0 1 2 + * 0 10 20 30 + * 1 20 10 30 + * 2 30 30 10 + * + * memory_tiers[0] = <empty> + * memory_tiers[1] = 0-2 + * memory_tiers[2] = <empty> + * + * node_demotion[0].preferred = <empty> + * node_demotion[1].preferred = <empty> + * node_demotion[2].preferred = <empty> + * + * Example 3: + * + * Node 0 is CPU + DRAM nodes, Node 1 is HBM node, node 2 is PMEM node. 
+ * + * node distances: + * node 0 1 2 + * 0 10 20 30 + * 1 20 10 40 + * 2 30 40 10 + * + * memory_tiers[0] = 1 + * memory_tiers[1] = 0 + * memory_tiers[2] = 2 + * + * node_demotion[0].preferred = 2 + * node_demotion[1].preferred = 0 + * node_demotion[2].preferred = <empty> + * + */ +static struct demotion_nodes *node_demotion __read_mostly; + static ssize_t nodelist_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -2238,6 +2306,28 @@ static int __node_get_memory_tier(int node) return -1; } +static void node_remove_from_memory_tier(int node) +{ + int tier; + + mutex_lock(&memory_tier_lock); + + tier = __node_get_memory_tier(node); + + /* + * Remove node from tier, if tier becomes + * empty then unregister it to make it invisible + * in sysfs. + */ + node_clear(node, memory_tiers[tier]->nodelist); + if (nodes_empty(memory_tiers[tier]->nodelist)) + unregister_memory_tier(tier); + + establish_migration_targets(); + + mutex_unlock(&memory_tier_lock); +} + int node_get_memory_tier(int node) { int tier; @@ -2271,6 +2361,7 @@ int __node_set_memory_tier(int node, int tier) } node_set(node, memory_tiers[tier]->nodelist); + establish_migration_targets(); out: return ret; @@ -2328,75 +2419,6 @@ int node_set_memory_tier(int node, int tier) return ret; } -/* - * node_demotion[] example: - * - * Consider a system with two sockets. Each socket has - * three classes of memory attached: fast, medium and slow. - * Each memory class is placed in its own NUMA node. The - * CPUs are placed in the node with the "fast" memory. The - * 6 NUMA nodes (0-5) might be split among the sockets like - * this: - * - * Socket A: 0, 1, 2 - * Socket B: 3, 4, 5 - * - * When Node 0 fills up, its memory should be migrated to - * Node 1. When Node 1 fills up, it should be migrated to - * Node 2. 
The migration path start on the nodes with the - * processors (since allocations default to this node) and - * fast memory, progress through medium and end with the - * slow memory: - * - * 0 -> 1 -> 2 -> stop - * 3 -> 4 -> 5 -> stop - * - * This is represented in the node_demotion[] like this: - * - * { nr=1, nodes[0]=1 }, // Node 0 migrates to 1 - * { nr=1, nodes[0]=2 }, // Node 1 migrates to 2 - * { nr=0, nodes[0]=-1 }, // Node 2 does not migrate - * { nr=1, nodes[0]=4 }, // Node 3 migrates to 4 - * { nr=1, nodes[0]=5 }, // Node 4 migrates to 5 - * { nr=0, nodes[0]=-1 }, // Node 5 does not migrate - * - * Moreover some systems may have multiple slow memory nodes. - * Suppose a system has one socket with 3 memory nodes, node 0 - * is fast memory type, and node 1/2 both are slow memory - * type, and the distance between fast memory node and slow - * memory node is same. So the migration path should be: - * - * 0 -> 1/2 -> stop - * - * This is represented in the node_demotion[] like this: - * { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2 - * { nr=0, nodes[0]=-1, }, // Node 1 dose not migrate - * { nr=0, nodes[0]=-1, }, // Node 2 does not migrate - */ - -/* - * Writes to this array occur without locking. Cycles are - * not allowed: Node X demotes to Y which demotes to X... - * - * If multiple reads are performed, a single rcu_read_lock() - * must be held over all reads to ensure that no cycles are - * observed. 
- */ -#define DEFAULT_DEMOTION_TARGET_NODES 15 - -#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES -#define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1) -#else -#define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES -#endif - -struct demotion_nodes { - unsigned short nr; - short nodes[DEMOTION_TARGET_NODES]; -}; - -static struct demotion_nodes *node_demotion __read_mostly; - /** * next_demotion_node() - Get the next node in the demotion path * @node: The starting node to lookup the next node @@ -2409,8 +2431,7 @@ static struct demotion_nodes *node_demotion __read_mostly; int next_demotion_node(int node) { struct demotion_nodes *nd; - unsigned short target_nr, index; - int target; + int target, nnodes, i; if (!node_demotion) return NUMA_NO_NODE; @@ -2419,61 +2440,46 @@ int next_demotion_node(int node) /* * node_demotion[] is updated without excluding this - * function from running. RCU doesn't provide any - * compiler barriers, so the READ_ONCE() is required - * to avoid compiler reordering or read merging. + * function from running. * * Make sure to use RCU over entire code blocks if * node_demotion[] reads need to be consistent. */ rcu_read_lock(); - target_nr = READ_ONCE(nd->nr); - switch (target_nr) { - case 0: - target = NUMA_NO_NODE; - goto out; - case 1: - index = 0; - break; - default: - /* - * If there are multiple target nodes, just select one - * target node randomly. - * - * In addition, we can also use round-robin to select - * target node, but we should introduce another variable - * for node_demotion[] to record last selected target node, - * that may cause cache ping-pong due to the changing of - * last target node. Or introducing per-cpu data to avoid - * caching issue, which seems more complicated. So selecting - * target node randomly seems better until now. 
- */ - index = get_random_int() % target_nr; - break; - } + nnodes = nodes_weight(nd->preferred); + if (!nnodes) + return NUMA_NO_NODE; - target = READ_ONCE(nd->nodes[index]); + /* + * If there are multiple target nodes, just select one + * target node randomly. + * + * In addition, we can also use round-robin to select + * target node, but we should introduce another variable + * for node_demotion[] to record last selected target node, + * that may cause cache ping-pong due to the changing of + * last target node. Or introducing per-cpu data to avoid + * caching issue, which seems more complicated. So selecting + * target node randomly seems better until now. + */ + nnodes = get_random_int() % nnodes; + target = first_node(nd->preferred); + for (i = 0; i < nnodes; i++) + target = next_node(target, nd->preferred); -out: rcu_read_unlock(); + return target; } -#if defined(CONFIG_HOTPLUG_CPU) /* Disable reclaim-based migration. */ static void __disable_all_migrate_targets(void) { - int node, i; + int node; - if (!node_demotion) - return; - - for_each_online_node(node) { - node_demotion[node].nr = 0; - for (i = 0; i < DEMOTION_TARGET_NODES; i++) - node_demotion[node].nodes[i] = NUMA_NO_NODE; - } + for_each_node_mask(node, node_states[N_MEMORY]) + node_demotion[node].preferred = NODE_MASK_NONE; } static void disable_all_migrate_targets(void) @@ -2485,173 +2491,70 @@ static void disable_all_migrate_targets(void) * Readers will see either a combination of before+disable * state or disable+after. They will never see before and * after state together. - * - * The before+after state together might have cycles and - * could cause readers to do things like loop until this - * function finishes. This ensures they can only see a - * single "bad" read and would, for instance, only loop - * once. */ synchronize_rcu(); } /* - * Find an automatic demotion target for 'node'. - * Failing here is OK. It might just indicate - * being at the end of a chain. 
- */ -static int establish_migrate_target(int node, nodemask_t *used, - int best_distance) +* Find an automatic demotion target for all memory +* nodes. Failing here is OK. It might just indicate +* being at the end of a chain. +*/ +static void establish_migration_targets(void) { - int migration_target, index, val; struct demotion_nodes *nd; + int tier, target = NUMA_NO_NODE, node; + int distance, best_distance; + nodemask_t used; if (!node_demotion) - return NUMA_NO_NODE; - - nd = &node_demotion[node]; - - migration_target = find_next_best_node(node, used); - if (migration_target == NUMA_NO_NODE) - return NUMA_NO_NODE; - - /* - * If the node has been set a migration target node before, - * which means it's the best distance between them. Still - * check if this node can be demoted to other target nodes - * if they have a same best distance. - */ - if (best_distance != -1) { - val = node_distance(node, migration_target); - if (val > best_distance) - goto out_clear; - } - - index = nd->nr; - if (WARN_ONCE(index >= DEMOTION_TARGET_NODES, - "Exceeds maximum demotion target nodes\n")) - goto out_clear; - - nd->nodes[index] = migration_target; - nd->nr++; + return; - return migration_target; -out_clear: - node_clear(migration_target, *used); - return NUMA_NO_NODE; -} + disable_all_migrate_targets(); -/* - * When memory fills up on a node, memory contents can be - * automatically migrated to another node instead of - * discarded at reclaim. - * - * Establish a "migration path" which will start at nodes - * with CPUs and will follow the priorities used to build the - * page allocator zonelists. - * - * The difference here is that cycles must be avoided. If - * node0 migrates to node1, then neither node1, nor anything - * node1 migrates to can migrate to node0. Also one node can - * be migrated to multiple nodes if the target nodes all have - * a same best-distance against the source node. - * - * This function can run simultaneously with readers of - * node_demotion[]. 
However, it can not run simultaneously - * with itself. Exclusion is provided by memory hotplug events - * being single-threaded. - */ -static void __set_migration_target_nodes(void) -{ - nodemask_t next_pass = NODE_MASK_NONE; - nodemask_t this_pass = NODE_MASK_NONE; - nodemask_t used_targets = NODE_MASK_NONE; - int node, best_distance; + for_each_node_mask(node, node_states[N_MEMORY]) { + best_distance = -1; + nd = &node_demotion[node]; - /* - * Avoid any oddities like cycles that could occur - * from changes in the topology. This will leave - * a momentary gap when migration is disabled. - */ - disable_all_migrate_targets(); + tier = __node_get_memory_tier(node); + /* + * Find next tier to demote. + */ + while (++tier < MAX_MEMORY_TIERS) { + if (memory_tiers[tier]) + break; + } - /* - * Allocations go close to CPUs, first. Assume that - * the migration path starts at the nodes with CPUs. - */ - next_pass = node_states[N_CPU]; -again: - this_pass = next_pass; - next_pass = NODE_MASK_NONE; - /* - * To avoid cycles in the migration "graph", ensure - * that migration sources are not future targets by - * setting them in 'used_targets'. Do this only - * once per pass so that multiple source nodes can - * share a target node. - * - * 'used_targets' will become unavailable in future - * passes. This limits some opportunities for - * multiple source nodes to share a destination. - */ - nodes_or(used_targets, used_targets, this_pass); + if (tier >= MAX_MEMORY_TIERS) + continue; - for_each_node_mask(node, this_pass) { - best_distance = -1; + nodes_andnot(used, node_states[N_MEMORY], memory_tiers[tier]->nodelist); /* - * Try to set up the migration path for the node, and the target - * migration nodes can be multiple, so doing a loop to find all - * the target nodes if they all have a best node distance. + * Find all the nodes in the memory tier node list of same best distance. + * add add them to the preferred mask. 
We randomly select between nodes + * in the preferred mask when allocating pages during demotion. */ do { - int target_node = - establish_migrate_target(node, &used_targets, - best_distance); - - if (target_node == NUMA_NO_NODE) + target = find_next_best_node(node, &used); + if (target == NUMA_NO_NODE) break; - if (best_distance == -1) - best_distance = node_distance(node, target_node); - - /* - * Visit targets from this pass in the next pass. - * Eventually, every node will have been part of - * a pass, and will become set in 'used_targets'. - */ - node_set(target_node, next_pass); + distance = node_distance(node, target); + if (distance == best_distance || best_distance == -1) { + best_distance = distance; + node_set(target, nd->preferred); + } else { + break; + } } while (1); } - /* - * 'next_pass' contains nodes which became migration - * targets in this pass. Make additional passes until - * no more migrations targets are available. - */ - if (!nodes_empty(next_pass)) - goto again; } /* - * For callers that do not hold get_online_mems() already. - */ -void set_migration_target_nodes(void) -{ - get_online_mems(); - __set_migration_target_nodes(); - put_online_mems(); -} - -/* - * This leaves migrate-on-reclaim transiently disabled between - * the MEM_GOING_OFFLINE and MEM_OFFLINE events. This runs - * whether reclaim-based migration is enabled or not, which - * ensures that the user can turn reclaim-based migration at - * any time without needing to recalculate migration targets. - * - * These callbacks already hold get_online_mems(). That is why - * __set_migration_target_nodes() can be used as opposed to - * set_migration_target_nodes(). + * This runs whether reclaim-based migration is enabled or not, + * which ensures that the user can turn reclaim-based migration + * at any time without needing to recalculate migration targets. 
*/ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self, unsigned long action, void *_arg) @@ -2660,64 +2563,44 @@ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self, /* * Only update the node migration order when a node is - * changing status, like online->offline. This avoids - * the overhead of synchronize_rcu() in most cases. + * changing status, like online->offline. */ if (arg->status_change_nid < 0) return notifier_from_errno(0); switch (action) { - case MEM_GOING_OFFLINE: - /* - * Make sure there are not transient states where - * an offline node is a migration target. This - * will leave migration disabled until the offline - * completes and the MEM_OFFLINE case below runs. - */ - disable_all_migrate_targets(); - break; case MEM_OFFLINE: - case MEM_ONLINE: /* - * Recalculate the target nodes once the node - * reaches its final state (online or offline). + * In case we are moving out of N_MEMORY. Keep the node + * in the memory tier so that when we bring memory online, + * they appear in the right memory tier. We still need + * to rebuild the demotion order. */ - __set_migration_target_nodes(); + mutex_lock(&memory_tier_lock); + establish_migration_targets(); + mutex_unlock(&memory_tier_lock); break; - case MEM_CANCEL_OFFLINE: + case MEM_ONLINE: /* - * MEM_GOING_OFFLINE disabled all the migration - * targets. Reenable them. + * We ignore the error here, if the node already have the tier + * registered, we will continue to use that for the new memory + * we are adding here. 
*/ - __set_migration_target_nodes(); - break; - case MEM_GOING_ONLINE: - case MEM_CANCEL_ONLINE: + node_set_memory_tier(arg->status_change_nid, DEFAULT_MEMORY_TIER); break; } return notifier_from_errno(0); } -void __init migrate_on_reclaim_init(void) +static void __init migrate_on_reclaim_init(void) { - node_demotion = kmalloc_array(nr_node_ids, - sizeof(struct demotion_nodes), - GFP_KERNEL); + node_demotion = kcalloc(MAX_NUMNODES, sizeof(struct demotion_nodes), + GFP_KERNEL); WARN_ON(!node_demotion); hotplug_memory_notifier(migrate_on_reclaim_callback, 100); - /* - * At this point, all numa nodes with memory/CPus have their state - * properly set, so we can build the demotion order now. - * Let us hold the cpu_hotplug lock just, as we could possibily have - * CPU hotplug events during boot. - */ - cpus_read_lock(); - set_migration_target_nodes(); - cpus_read_unlock(); } -#endif /* CONFIG_HOTPLUG_CPU */ bool numa_demotion_enabled = false; @@ -2800,6 +2683,7 @@ static int __init memory_tier_init(void) * CPU only nodes are not part of memoty tiers. */ memory_tiers[DEFAULT_MEMORY_TIER]->nodelist = node_states[N_MEMORY]; + migrate_on_reclaim_init(); return 0; } diff --git a/mm/vmstat.c b/mm/vmstat.c index b75b1a64b54c..7815d21345a4 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -2053,7 +2053,6 @@ static int vmstat_cpu_online(unsigned int cpu) if (!node_state(cpu_to_node(cpu), N_CPU)) { node_set_state(cpu_to_node(cpu), N_CPU); - set_migration_target_nodes(); } return 0; @@ -2078,7 +2077,6 @@ static int vmstat_cpu_dead(unsigned int cpu) return 0; node_clear_state(node, N_CPU); - set_migration_target_nodes(); return 0; } @@ -2111,9 +2109,6 @@ void __init init_mm_internals(void) start_shepherd_timer(); #endif -#if defined(CONFIG_MIGRATION) && defined(CONFIG_HOTPLUG_CPU) - migrate_on_reclaim_init(); -#endif #ifdef CONFIG_PROC_FS proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op); proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op); -- 2.36.1