[PATCH 2/2] mm: mempolicy: Interleave policy for tiered memory nodes

From: Srinivasulu Thanneeru <sthanneeru@xxxxxxxxxx>

The existing interleave policy spreads pages evenly across a set of
specified nodes, i.e. 1:1 interleave. Upcoming tiered memory systems
have CPU-less memory nodes with different peak bandwidth and latency
characteristics. In such systems, we want to use the additional
bandwidth provided by low-tier memory for bandwidth-intensive
applications. However, the default 1:1 interleave can lead to a
suboptimal bandwidth distribution.

Introduce a tier-aware interleave policy based on per-tier interleave
weights: pages are allocated from the nodes of each tier in proportion
to that tier's weight.

For instance, with weights of 50:30:20 for tiers 0, 1, and 3, traffic
is split 50%/30%/20% across the three tiers.
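
To make the arithmetic concrete, here is a minimal userspace sketch
(not part of the patch) that mirrors how node_interleave_weight()
below splits a tier's weight across the policy nodes in that tier; the
per-tier node counts and the helper name node_share() are invented for
the example:

#include <stdio.h>

/*
 * Split tier_weight across the nnodes policy nodes of one tier.  The
 * node at position idx (in nodemask order) gets the base share, plus
 * one extra page while a remainder is left to hand out, and never
 * less than one page.
 */
static unsigned node_share(unsigned tier_weight, int nnodes, int idx)
{
	unsigned base = tier_weight / nnodes;
	int remainder = tier_weight % nnodes;

	if (!base)
		return 1;	/* minimum of one page per node */
	return base + (idx < remainder ? 1 : 0);
}

int main(void)
{
	/* Three tiers with weights 50:30:20 and 2, 4 and 1 policy nodes. */
	unsigned weight[] = { 50, 30, 20 };
	int nnodes[] = { 2, 4, 1 };

	for (int t = 0; t < 3; t++)
		for (int n = 0; n < nnodes[t]; n++)
			printf("tier %d, node %d: %u pages per round\n",
			       t, n, node_share(weight[t], nnodes[t], n));
	return 0;
}

With those numbers, the two nodes of the first tier get 25 pages each
per round, the four nodes of the second tier get 8/8/7/7 (the
remainder of 30 % 4 is handed out one page at a time to the leading
nodes), and the single node of the third tier gets all 20.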

Signed-off-by: Srinivasulu Thanneeru <sthanneeru@xxxxxxxxxx>
Co-authored-by: Ravi Jonnalagadda <ravis.opensrc@xxxxxxxxxx>
---
 include/linux/memory-tiers.h |  25 +++++++-
 include/linux/sched.h        |   2 +
 mm/memory-tiers.c            |  31 ++--------
 mm/mempolicy.c               | 107 +++++++++++++++++++++++++++++++++--
 4 files changed, 132 insertions(+), 33 deletions(-)
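
Usage note (illustrative only, not part of the change): tasks opt in
through the existing MPOL_INTERLEAVE mempolicy API and the per-tier
weights are applied inside the kernel, so existing interleaved
workloads pick up the weighted behaviour. A minimal sketch using the
libnuma wrapper (build with -lnuma; the node numbers are arbitrary):

#include <numaif.h>	/* set_mempolicy(), MPOL_INTERLEAVE */
#include <stdio.h>

int main(void)
{
	/* Interleave this task's allocations across nodes 0, 1 and 2. */
	unsigned long nodemask = (1UL << 0) | (1UL << 1) | (1UL << 2);

	if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, 8 * sizeof(nodemask)))
		perror("set_mempolicy");
	return 0;
}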

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index c62d286749d0..74be39cb56c4 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_MEMORY_TIERS_H
 #define _LINUX_MEMORY_TIERS_H
 
+#include <linux/device.h>
 #include <linux/types.h>
 #include <linux/nodemask.h>
 #include <linux/kref.h>
@@ -21,7 +22,27 @@
 
 #define MAX_TIER_INTERLEAVE_WEIGHT 100
 
-struct memory_tier;
+struct memory_tier {
+	/* hierarchy of memory tiers */
+	struct list_head list;
+	/* list of all memory types part of this tier */
+	struct list_head memory_types;
+	/*
+	 * By default all tiers will have weight as 1, which means they
+	 * follow default standard allocation.
+	 */
+	unsigned short interleave_weight;
+	/*
+	 * start value of abstract distance. memory tier maps
+	 * an abstract distance  range,
+	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
+	 */
+	int adistance_start;
+	struct device dev;
+	/* All the nodes that are part of all the lower memory tiers. */
+	nodemask_t lower_tier_mask;
+};
+
 struct memory_dev_type {
 	/* list of memory types that are part of same tier as this type */
 	struct list_head tier_sibiling;
@@ -38,6 +59,8 @@ struct memory_dev_type *alloc_memory_type(int adistance);
 void put_memory_type(struct memory_dev_type *memtype);
 void init_node_memory_type(int node, struct memory_dev_type *default_type);
 void clear_node_memory_type(int node, struct memory_dev_type *memtype);
+struct memory_tier *node_get_memory_tier(int node);
+nodemask_t get_memtier_nodemask(struct memory_tier *memtier);
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..07ea837c3afb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1252,7 +1252,9 @@ struct task_struct {
 	/* Protected by alloc_lock: */
 	struct mempolicy		*mempolicy;
 	short				il_prev;
+	unsigned short			il_count;
 	short				pref_node_fork;
+	unsigned int			current_node;
 #endif
 #ifdef CONFIG_NUMA_BALANCING
 	int				numa_scan_seq;
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 7e06c9e0fa41..5e2ddc9f994a 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -8,27 +8,6 @@
 
 #include "internal.h"
 
-struct memory_tier {
-	/* hierarchy of memory tiers */
-	struct list_head list;
-	/* list of all memory types part of this tier */
-	struct list_head memory_types;
-	/*
-	 * By default all tiers will have weight as 1, which means they
-	 * follow default standard allocation.
-	 */
-	unsigned short interleave_weight;
-	/*
-	 * start value of abstract distance. memory tier maps
-	 * an abstract distance  range,
-	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
-	 */
-	int adistance_start;
-	struct device dev;
-	/* All the nodes that are part of all the lower memory tiers. */
-	nodemask_t lower_tier_mask;
-};
-
 struct demotion_nodes {
 	nodemask_t preferred;
 };
@@ -115,7 +94,7 @@ static inline struct memory_tier *to_memory_tier(struct device *device)
 	return container_of(device, struct memory_tier, dev);
 }
 
-static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
+nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
 {
 	nodemask_t nodes = NODE_MASK_NONE;
 	struct memory_dev_type *memtype;
@@ -264,7 +243,7 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
 	return memtier;
 }
 
-static struct memory_tier *__node_get_memory_tier(int node)
+struct memory_tier *node_get_memory_tier(int node)
 {
 	pg_data_t *pgdat;
 
@@ -380,7 +359,7 @@ static void disable_all_demotion_targets(void)
 		 * We are holding memory_tier_lock, it is safe
 		 * to access pgda->memtier.
 		 */
-		memtier = __node_get_memory_tier(node);
+		memtier = node_get_memory_tier(node);
 		if (memtier)
 			memtier->lower_tier_mask = NODE_MASK_NONE;
 	}
@@ -417,7 +396,7 @@ static void establish_demotion_targets(void)
 		best_distance = -1;
 		nd = &node_demotion[node];
 
-		memtier = __node_get_memory_tier(node);
+		memtier = node_get_memory_tier(node);
 		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
 			continue;
 		/*
@@ -562,7 +541,7 @@ static bool clear_node_memory_tier(int node)
 	 * This also enables us to free the destroyed memory tier
 	 * with kfree instead of kfree_rcu
 	 */
-	memtier = __node_get_memory_tier(node);
+	memtier = node_get_memory_tier(node);
 	if (memtier) {
 		struct memory_dev_type *memtype;
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 42b5567e3773..4f80c6ee1176 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -100,6 +100,8 @@
 #include <linux/ctype.h>
 #include <linux/mm_inline.h>
 #include <linux/mmu_notifier.h>
+#include <linux/memory-tiers.h>
+#include <linux/nodemask.h>
 #include <linux/printk.h>
 #include <linux/swapops.h>
 
@@ -882,8 +884,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 
 	old = current->mempolicy;
 	current->mempolicy = new;
-	if (new && new->mode == MPOL_INTERLEAVE)
+	if (new && new->mode == MPOL_INTERLEAVE) {
 		current->il_prev = MAX_NUMNODES-1;
+		current->il_count = 0;
+		current->current_node = MAX_NUMNODES;
+	}
 	task_unlock(current);
 	mpol_put(old);
 	ret = 0;
@@ -1899,13 +1904,76 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 	return nd;
 }
 
+/* Return interleave weight of node from tier's weight */
+static unsigned short node_interleave_weight(int nid, nodemask_t pol_nodemask)
+{
+	struct memory_tier *memtier;
+	nodemask_t tier_nodes, tier_and_pol;
+	unsigned short avrg_weight = 0;
+	int node, nnodes, remainder;
+
+	memtier = node_get_memory_tier(nid);
+
+	if (!memtier)
+		return 0;
+
+	tier_nodes = get_memtier_nodemask(memtier);
+	nodes_and(tier_and_pol, tier_nodes, pol_nodemask);
+	nnodes = nodes_weight(tier_and_pol);
+	if (!nnodes)
+		return 0;
+
+	avrg_weight = memtier->interleave_weight / nnodes;
+	/* Set the minimum node weight to 1 so that at least one page
+	 * is allocated from it.
+	 */
+	if (!avrg_weight)
+		return 1;
+
+	remainder = memtier->interleave_weight % nnodes;
+	if (remainder) {
+		for_each_node_mask(node, tier_and_pol) {
+			/* Increment the target node's weight by 1 if it
+			 * falls within the remaining weight 'remainder'.
+			 */
+			if (node == nid) {
+				if (remainder > 0)
+					avrg_weight = avrg_weight + 1;
+				break;
+			}
+			remainder--;
+		}
+	}
+	return avrg_weight;
+}
+
 /* Do dynamic interleaving for a process */
 static unsigned interleave_nodes(struct mempolicy *policy)
 {
 	unsigned next;
 	struct task_struct *me = current;
+	unsigned short node_weight = 0;
 
-	next = next_node_in(me->il_prev, policy->nodes);
+	/* Select the current node or the next node from the nodelist,
+	 * based on the available tier interleave weight.
+	 */
+	if (me->current_node == MAX_NUMNODES)
+		next = next_node_in(me->il_prev, policy->nodes);
+	else
+		next = me->current_node;
+	node_weight = node_interleave_weight(next, policy->nodes);
+	if (!node_weight)
+		goto set_il_prev;
+	if (me->il_count < node_weight) {
+		me->il_count++;
+		me->current_node = next;
+		if (me->il_count == node_weight) {
+			me->current_node = MAX_NUMNODES;
+			me->il_count = 0;
+		}
+	}
+
+set_il_prev:
 	if (next < MAX_NUMNODES)
 		me->il_prev = next;
 	return next;
@@ -1966,9 +2034,10 @@ unsigned int mempolicy_slab_node(void)
 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
 {
 	nodemask_t nodemask = pol->nodes;
-	unsigned int target, nnodes;
-	int i;
-	int nid;
+	unsigned int target, nnodes, vnnodes = 0;
+	unsigned short node_weight = 0;
+	int nid, vtarget, i;
+
 	/*
 	 * The barrier will stabilize the nodemask in a register or on
 	 * the stack so that it will stop changing under the code.
@@ -1981,7 +2050,33 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
 	nnodes = nodes_weight(nodemask);
 	if (!nnodes)
 		return numa_node_id();
-	target = (unsigned int)n % nnodes;
+
+	/*
+	 * Calculate the virtual target for @n in a nodelist that is scaled
+	 * with the interleave weights...
+	 */
+	for_each_node_mask(nid, nodemask) {
+		node_weight = node_interleave_weight(nid, nodemask);
+		if (!node_weight)
+			continue;
+		vnnodes += node_weight;
+	}
+	if (!vnnodes)
+		return numa_node_id();
+	vtarget = (int)((unsigned int)n % vnnodes);
+
+	/* ...then map it back to the physical nodelist */
+	target = 0;
+	for_each_node_mask(nid, nodemask) {
+		node_weight = node_interleave_weight(nid, nodemask);
+		if (!node_weight)
+			continue;
+		vtarget -= node_weight;
+		if (vtarget < 0)
+			break;
+		target++;
+	}
+
 	nid = first_node(nodemask);
 	for (i = 0; i < target; i++)
 		nid = next_node(nid, nodemask);
-- 
2.39.3



