[RFC PATCH v4 2/3] mm/mempolicy: implement weighted interleave

Gregory Price <gourry.memverge@xxxxxxxxx> · Wed, 8 Nov 2023 19:25:16 -0500

Implements interleave weighting for bandwidth optimization.

The mempolicy MPOL_INTERLEAVE utilizes the node weights to implement
weighted interleave.

There are 3 integration points:

interleave_nodes:
  Counts the number of allocations as they occur, and applies the
  weight for the current node.  When the weight reaches 0, switch
  to the next node.

offset_il_node:
  Gets the total weight of the nodemask as well as each individual
  node weight, then calculates the node based on the given index n.

bulk_array_interleave:
  Gets the total weight of the nodemask as well as each individual
  node weight, then calculates the number of "interleave rounds" as
  well as any delta ("partial round").  Calculates the number of
  pages for each node and allocates them.

  If a node was scheduled for interleave via interleave_nodes, the
  current weight (pol->cur_weight) will be allocated first, before
  the remaining bulk calculation is done. This simplifies the
  calculation at the cost of an additional allocation call.

The functions mempolicy_get_il_weight and mempolicy_get_il_weights
were added so that should mempolicy be extended in the future to
include local mempolicy weights, there is a clear integration point.

Signed-off-by: Gregory Price <gregory.price@xxxxxxxxxxxx>
---
 include/linux/mempolicy.h |   3 +
 mm/mempolicy.c            | 153 +++++++++++++++++++++++++++++++-------
 2 files changed, 128 insertions(+), 28 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index d232de7cdc56..b1ca63077fc4 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -48,6 +48,9 @@ struct mempolicy {
 	nodemask_t nodes;	/* interleave/bind/perfer */
 	int home_node;		/* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */
 
+	unsigned char cur_weight;  /* weight of current il node */
+	unsigned char il_weights[MAX_NUMNODES]; /* used during allocation */
+
 	union {
 		nodemask_t cpuset_mems_allowed;	/* relative to these nodes */
 		nodemask_t user_nodemask;	/* nodemask passed by user */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 29ebf1e7898c..231b9bbd391a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -300,6 +300,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 	policy->mode = mode;
 	policy->flags = flags;
 	policy->home_node = NUMA_NO_NODE;
+	policy->cur_weight = 0;
 
 	return policy;
 }
@@ -334,6 +335,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 		tmp = *nodes;
 
 	pol->nodes = tmp;
+	pol->cur_weight = 0;
 }
 
 static void mpol_rebind_preferred(struct mempolicy *pol,
@@ -881,8 +883,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 
 	old = current->mempolicy;
 	current->mempolicy = new;
-	if (new && new->mode == MPOL_INTERLEAVE)
+	if (new && new->mode == MPOL_INTERLEAVE) {
 		current->il_prev = MAX_NUMNODES-1;
+		new->cur_weight = 0;
+	}
 	task_unlock(current);
 	mpol_put(old);
 	ret = 0;
@@ -1900,15 +1904,50 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 	return nd;
 }
 
+static unsigned char mempolicy_get_il_weight(struct mempolicy *policy,
+					     unsigned int nid)
+{
+	int weight = mem_cgroup_get_il_weight(nid);
+
+	return weight ? weight : 1;
+}
+
+static unsigned int mempolicy_get_il_weights(struct mempolicy *policy,
+					     nodemask_t *nodes,
+					     unsigned char *weights)
+{
+	unsigned int total = 0;
+	unsigned int nid;
+
+	total = mem_cgroup_get_il_weights(nodes, weights);
+	if (total)
+		return total;
+
+	for_each_node_mask(nid, *nodes) {
+		weights[nid] = 1;
+		total += 1;
+	}
+	return total;
+}
+
 /* Do dynamic interleaving for a process */
 static unsigned interleave_nodes(struct mempolicy *policy)
 {
 	unsigned next;
+	unsigned char next_weight;
 	struct task_struct *me = current;
 
 	next = next_node_in(me->il_prev, policy->nodes);
-	if (next < MAX_NUMNODES)
+	if (!policy->cur_weight) {
+		/* If the node is set, at least 1 allocation is required */
+		next_weight = mempolicy_get_il_weight(policy, next);
+		policy->cur_weight = next_weight ? next_weight : 1;
+	}
+
+	policy->cur_weight--;
+	if (next < MAX_NUMNODES && !policy->cur_weight)
 		me->il_prev = next;
+
 	return next;
 }
 
@@ -1967,8 +2006,8 @@ unsigned int mempolicy_slab_node(void)
 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
 {
 	nodemask_t nodemask = pol->nodes;
-	unsigned int target, nnodes;
-	int i;
+	unsigned int target, nnodes, il_weight;
+	unsigned char weight;
 	int nid;
 	/*
 	 * The barrier will stabilize the nodemask in a register or on
@@ -1982,10 +2021,18 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
 	nnodes = nodes_weight(nodemask);
 	if (!nnodes)
 		return numa_node_id();
-	target = (unsigned int)n % nnodes;
+
+	il_weight = mempolicy_get_il_weights(pol, &nodemask, pol->il_weights);
+	target = (unsigned int)n % il_weight;
 	nid = first_node(nodemask);
-	for (i = 0; i < target; i++)
-		nid = next_node(nid, nodemask);
+
+	while (target) {
+		weight = pol->il_weights[nid];
+		if (target < weight)
+			break;
+		target -= weight;
+		nid = next_node_in(nid, nodemask);
+	}
 	return nid;
 }
 
@@ -2319,32 +2366,82 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
 		struct mempolicy *pol, unsigned long nr_pages,
 		struct page **page_array)
 {
-	int nodes;
-	unsigned long nr_pages_per_node;
-	int delta;
-	int i;
-	unsigned long nr_allocated;
+	struct task_struct *me = current;
 	unsigned long total_allocated = 0;
+	unsigned long nr_allocated;
+	unsigned long rounds;
+	unsigned long node_pages, delta;
+	unsigned char weight;
+	unsigned long il_weight;
+	unsigned long req_pages = nr_pages;
+	int nnodes, node, prev_node;
+	int i;
 
-	nodes = nodes_weight(pol->nodes);
-	nr_pages_per_node = nr_pages / nodes;
-	delta = nr_pages - nodes * nr_pages_per_node;
-
-	for (i = 0; i < nodes; i++) {
-		if (delta) {
-			nr_allocated = __alloc_pages_bulk(gfp,
-					interleave_nodes(pol), NULL,
-					nr_pages_per_node + 1, NULL,
-					page_array);
-			delta--;
-		} else {
-			nr_allocated = __alloc_pages_bulk(gfp,
-					interleave_nodes(pol), NULL,
-					nr_pages_per_node, NULL, page_array);
+	prev_node = me->il_prev;
+	nnodes = nodes_weight(pol->nodes);
+	/* Continue allocating from most recent node */
+	if (pol->cur_weight) {
+		node = next_node_in(prev_node, pol->nodes);
+		node_pages = pol->cur_weight;
+		if (node_pages > nr_pages)
+			node_pages = nr_pages;
+		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+						  NULL, page_array);
+		page_array += nr_allocated;
+		total_allocated += nr_allocated;
+		/* if that's all the pages, no need to interleave */
+		if (req_pages <= pol->cur_weight) {
+			pol->cur_weight -= req_pages;
+			return total_allocated;
 		}
-
+		/* Otherwise we adjust req_pages down, and continue from there */
+		req_pages -= pol->cur_weight;
+		pol->cur_weight = 0;
+		prev_node = node;
+	}
+
+	il_weight = mempolicy_get_il_weights(pol, &pol->nodes,
+					     pol->il_weights);
+	rounds = req_pages / il_weight;
+	delta = req_pages % il_weight;
+	for (i = 0; i < nnodes; i++) {
+		node = next_node_in(prev_node, pol->nodes);
+		weight = pol->il_weights[node];
+		node_pages = weight * rounds;
+		if (delta > weight) {
+			node_pages += weight;
+			delta -= weight;
+		} else if (delta) {
+			node_pages += delta;
+			delta = 0;
+		}
+		/* The number of requested pages may not hit every node */
+		if (!node_pages)
+			break;
+		/* If an over-allocation would occur, floor it */
+		if (node_pages + total_allocated > nr_pages) {
+			node_pages = nr_pages - total_allocated;
+			delta = 0;
+		}
+		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+						  NULL, page_array);
 		page_array += nr_allocated;
 		total_allocated += nr_allocated;
+		prev_node = node;
+	}
+
+	/*
+	 * Finally, we need to update me->il_prev and pol->cur_weight
+	 * If the last node allocated on has un-used weight, apply
+	 * the remainder as the cur_weight, otherwise proceed to next node
+	 */
+	if (node_pages) {
+		me->il_prev = prev_node;
+		node_pages %= weight;
+		pol->cur_weight = weight - node_pages;
+	} else {
+		me->il_prev = node;
+		pol->cur_weight = 0;
 	}
 
 	return total_allocated;
-- 
2.39.1