From: Srinivasulu Thanneeru <sthanneeru@xxxxxxxxxx>

The existing interleave policy spreads pages evenly across the set of
specified nodes, i.e. 1:1 interleave. Upcoming tiered memory systems
have CPU-less memory nodes with differing peak bandwidth and latency
characteristics. On such systems we want to use the additional
bandwidth provided by low-tier memory for bandwidth-intensive
applications, but the default 1:1 interleave can lead to suboptimal
bandwidth distribution.

Introduce an interleave policy for multiple tiers that is based on
interleave weights: pages are assigned from the nodes of a tier in
proportion to that tier's weight. For instance, with weights of
50:30:20 for tiers 0, 1 and 3, traffic is split 50%/30%/20% across the
three tiers.

Signed-off-by: Srinivasulu Thanneeru <sthanneeru@xxxxxxxxxx>
Co-authored-by: Ravi Jonnalagadda <ravis.opensrc@xxxxxxxxxx>
---
 include/linux/memory-tiers.h |  25 +++++++-
 include/linux/sched.h        |   2 +
 mm/memory-tiers.c            |  31 ++--------
 mm/mempolicy.c               | 107 +++++++++++++++++++++++++++++++++--
 4 files changed, 132 insertions(+), 33 deletions(-)
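
As an illustration (not part of the patch): a tier's weight is divided
evenly across the tier's nodes that appear in the policy nodemask, the
first weight % nnodes nodes in the mask absorb the remainder, and a
node's weight never drops below 1 so it still receives pages. The
user-space sketch below is hypothetical and only mirrors the arithmetic
of node_interleave_weight() added to mm/mempolicy.c further down:

#include <stdio.h>

/*
 * Split a tier's interleave weight across nnodes nodes; 'idx' is the
 * node's position within the tier/policy nodemask.
 */
static unsigned short node_weight(unsigned short tier_weight, int nnodes,
				  int idx)
{
	unsigned short avrg = tier_weight / nnodes;

	if (!avrg)
		return 1;		/* minimum weight of 1 */
	if (idx < tier_weight % nnodes)
		avrg++;			/* distribute the remainder */
	return avrg;
}

int main(void)
{
	int idx;

	/* A tier with weight 50 spread across 3 nodes -> 17, 17, 16. */
	for (idx = 0; idx < 3; idx++)
		printf("node %d: weight %d\n", idx, node_weight(50, 3, idx));
	return 0;
}
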
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index c62d286749d0..74be39cb56c4 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_MEMORY_TIERS_H
 #define _LINUX_MEMORY_TIERS_H
 
+#include <linux/device.h>
 #include <linux/types.h>
 #include <linux/nodemask.h>
 #include <linux/kref.h>
@@ -21,7 +22,27 @@
 
 #define MAX_TIER_INTERLEAVE_WEIGHT 100
 
-struct memory_tier;
+struct memory_tier {
+	/* hierarchy of memory tiers */
+	struct list_head list;
+	/* list of all memory types part of this tier */
+	struct list_head memory_types;
+	/*
+	 * By default all tiers will have weight as 1, which means they
+	 * follow default standard allocation.
+	 */
+	unsigned short interleave_weight;
+	/*
+	 * start value of abstract distance. memory tier maps
+	 * an abstract distance range,
+	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
+	 */
+	int adistance_start;
+	struct device dev;
+	/* All the nodes that are part of all the lower memory tiers. */
+	nodemask_t lower_tier_mask;
+};
+
 struct memory_dev_type {
 	/* list of memory types that are part of same tier as this type */
 	struct list_head tier_sibiling;
@@ -38,6 +59,8 @@ struct memory_dev_type *alloc_memory_type(int adistance);
 void put_memory_type(struct memory_dev_type *memtype);
 void init_node_memory_type(int node, struct memory_dev_type *default_type);
 void clear_node_memory_type(int node, struct memory_dev_type *memtype);
+struct memory_tier *node_get_memory_tier(int node);
+nodemask_t get_memtier_nodemask(struct memory_tier *memtier);
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..07ea837c3afb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1252,7 +1252,9 @@ struct task_struct {
 	/* Protected by alloc_lock: */
 	struct mempolicy		*mempolicy;
 	short				il_prev;
+	unsigned short			il_count;
 	short				pref_node_fork;
+	unsigned int			current_node;
 #endif
 #ifdef CONFIG_NUMA_BALANCING
 	int				numa_scan_seq;
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 7e06c9e0fa41..5e2ddc9f994a 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -8,27 +8,6 @@
 
 #include "internal.h"
 
-struct memory_tier {
-	/* hierarchy of memory tiers */
-	struct list_head list;
-	/* list of all memory types part of this tier */
-	struct list_head memory_types;
-	/*
-	 * By default all tiers will have weight as 1, which means they
-	 * follow default standard allocation.
-	 */
-	unsigned short interleave_weight;
-	/*
-	 * start value of abstract distance. memory tier maps
-	 * an abstract distance range,
-	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
-	 */
-	int adistance_start;
-	struct device dev;
-	/* All the nodes that are part of all the lower memory tiers. */
-	nodemask_t lower_tier_mask;
-};
-
 struct demotion_nodes {
 	nodemask_t preferred;
 };
@@ -115,7 +94,7 @@ static inline struct memory_tier *to_memory_tier(struct device *device)
 	return container_of(device, struct memory_tier, dev);
 }
 
-static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
+nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
 {
 	nodemask_t nodes = NODE_MASK_NONE;
 	struct memory_dev_type *memtype;
@@ -264,7 +243,7 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
 	return memtier;
 }
 
-static struct memory_tier *__node_get_memory_tier(int node)
+struct memory_tier *node_get_memory_tier(int node)
 {
 	pg_data_t *pgdat;
 
@@ -380,7 +359,7 @@ static void disable_all_demotion_targets(void)
 		 * We are holding memory_tier_lock, it is safe
 		 * to access pgda->memtier.
 		 */
-		memtier = __node_get_memory_tier(node);
+		memtier = node_get_memory_tier(node);
 		if (memtier)
 			memtier->lower_tier_mask = NODE_MASK_NONE;
 	}
@@ -417,7 +396,7 @@ static void establish_demotion_targets(void)
 		best_distance = -1;
 		nd = &node_demotion[node];
 
-		memtier = __node_get_memory_tier(node);
+		memtier = node_get_memory_tier(node);
 		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
 			continue;
 		/*
@@ -562,7 +541,7 @@ static bool clear_node_memory_tier(int node)
 	 * This also enables us to free the destroyed memory tier
 	 * with kfree instead of kfree_rcu
 	 */
-	memtier = __node_get_memory_tier(node);
+	memtier = node_get_memory_tier(node);
 	if (memtier) {
 		struct memory_dev_type *memtype;
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 42b5567e3773..4f80c6ee1176 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -100,6 +100,8 @@
 #include <linux/ctype.h>
 #include <linux/mm_inline.h>
 #include <linux/mmu_notifier.h>
+#include <linux/memory-tiers.h>
+#include <linux/nodemask.h>
 #include <linux/printk.h>
 #include <linux/swapops.h>
 
@@ -882,8 +884,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 	task_lock(current);
 	old = current->mempolicy;
 	current->mempolicy = new;
-	if (new && new->mode == MPOL_INTERLEAVE)
+	if (new && new->mode == MPOL_INTERLEAVE) {
 		current->il_prev = MAX_NUMNODES-1;
+		current->il_count = 0;
+		current->current_node = MAX_NUMNODES;
+	}
 	task_unlock(current);
 	mpol_put(old);
 	ret = 0;
@@ -1899,13 +1904,76 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 	return nd;
 }
 
+/* Return interleave weight of node from tier's weight */
+static unsigned short node_interleave_weight(int nid, nodemask_t pol_nodemask)
+{
+	struct memory_tier *memtier;
+	nodemask_t tier_nodes, tier_and_pol;
+	unsigned short avrg_weight = 0;
+	int node, nnodes, reminder;
+
+	memtier = node_get_memory_tier(nid);
+
+	if (!memtier)
+		return 0;
+
+	tier_nodes = get_memtier_nodemask(memtier);
+	nodes_and(tier_and_pol, tier_nodes, pol_nodemask);
+	nnodes = nodes_weight(tier_and_pol);
+	if (!nnodes)
+		return 0;
+
+	avrg_weight = memtier->interleave_weight / nnodes;
+	/* Set minimum weight of node as 1 so that at least one page
+	 * is allocated.
+	 */
+	if (!avrg_weight)
+		return 1;
+
+	reminder = memtier->interleave_weight % nnodes;
+	if (reminder) {
+		for_each_node_mask(node, tier_and_pol) {
+			/* Increment target node's weight by 1, if it falls
+			 * within remaining weightage 'reminder'.
+			 */
+			if (node == nid) {
+				if (reminder > 0)
+					avrg_weight = avrg_weight + 1;
+				break;
+			}
+			reminder--;
+		}
+	}
+	return avrg_weight;
+}
+
 /* Do dynamic interleaving for a process */
 static unsigned interleave_nodes(struct mempolicy *policy)
 {
 	unsigned next;
 	struct task_struct *me = current;
+	unsigned short node_weight = 0;
 
-	next = next_node_in(me->il_prev, policy->nodes);
+	/* select current node or next node from nodelist based on
+	 * available tier interleave weight.
+	 */
+	if (me->current_node == MAX_NUMNODES)
+		next = next_node_in(me->il_prev, policy->nodes);
+	else
+		next = me->current_node;
+	node_weight = node_interleave_weight(next, policy->nodes);
+	if (!node_weight)
+		goto set_il_prev;
+	if (me->il_count < node_weight) {
+		me->il_count++;
+		me->current_node = next;
+		if (me->il_count == node_weight) {
+			me->current_node = MAX_NUMNODES;
+			me->il_count = 0;
+		}
+	}
+
+set_il_prev:
 	if (next < MAX_NUMNODES)
 		me->il_prev = next;
 	return next;
@@ -1966,9 +2034,10 @@ unsigned int mempolicy_slab_node(void)
 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
 {
 	nodemask_t nodemask = pol->nodes;
-	unsigned int target, nnodes;
-	int i;
-	int nid;
+	unsigned int target, nnodes, vnnodes = 0;
+	unsigned short node_weight = 0;
+	int nid, vtarget, i;
+
 	/*
 	 * The barrier will stabilize the nodemask in a register or on
 	 * the stack so that it will stop changing under the code.
@@ -1981,7 +2050,33 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
 	nnodes = nodes_weight(nodemask);
 	if (!nnodes)
 		return numa_node_id();
-	target = (unsigned int)n % nnodes;
+
+	/*
+	 * Calculate the virtual target for @n in a nodelist that is scaled
+	 * with interleave weights....
+	 */
+	for_each_node_mask(nid, nodemask) {
+		node_weight = node_interleave_weight(nid, nodemask);
+		if (!node_weight)
+			continue;
+		vnnodes += node_weight;
+	}
+	if (!vnnodes)
+		return numa_node_id();
+	vtarget = (int)((unsigned int)n % vnnodes);
+
+	/* ...then map it back to the physical nodelist */
+	target = 0;
+	for_each_node_mask(nid, nodemask) {
+		node_weight = node_interleave_weight(nid, nodemask);
+		if (!node_weight)
+			continue;
+		vtarget -= node_weight;
+		if (vtarget < 0)
+			break;
+		target++;
+	}
+
 	nid = first_node(nodemask);
 	for (i = 0; i < target; i++)
 		nid = next_node(nid, nodemask);
-- 
2.39.3
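
A quick user-space way to sanity-check the virtual-target mapping that
offset_il_node() performs above (not part of the patch; the per-node
weights are passed in directly here, whereas the kernel derives them
from each node's memory tier via node_interleave_weight()):

#include <stdio.h>

/* Map interleave index n to a node position, weighted round-robin. */
static int weighted_node(unsigned long n, const unsigned short *weights,
			 int nnodes)
{
	unsigned int vnnodes = 0;
	int node, target = 0;
	int vtarget;

	/* Scale the nodelist by the per-node weights... */
	for (node = 0; node < nnodes; node++)
		vnnodes += weights[node];
	vtarget = (int)(n % vnnodes);

	/* ...then map the virtual target back to a physical position. */
	for (node = 0; node < nnodes; node++) {
		vtarget -= weights[node];
		if (vtarget < 0)
			break;
		target++;
	}
	return target;
}

int main(void)
{
	/* Tier weights 50:30:20, one node per tier. */
	const unsigned short weights[] = { 50, 30, 20 };
	int hits[3] = { 0 };
	unsigned long n;

	for (n = 0; n < 100; n++)
		hits[weighted_node(n, weights, 3)]++;

	/* Prints "pages per node: 50 30 20". */
	printf("pages per node: %d %d %d\n", hits[0], hits[1], hits[2]);
	return 0;
}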