[PATCH v3 4/4] mm/mempolicy: change cur_il_weight to atomic and carry the node with it

Gregory Price <gourry.memverge@xxxxxxxxx> · Thu, 25 Jan 2024 13:43:45 -0500

In the prior patch, we carry only the current weight for a weighted
interleave round with us across calls through the allocator path.

node = next_node_in(current->il_prev, pol->nodemask)
pol->cur_il_weight <--- this weight applies to the above node

This separation of data can cause a race condition.

If a cgroup-initiated task migration or mems_allowed change occurs
from outside the context of the task, this can cause the weight to
become stale, meaning we may end using that weight to allocate
memory on the wrong node.

Example:
  1) task A sets (cur_il_weight = 8) and (current->il_prev) to
     node0. node1 is the next set bit in pol->nodemask
  2) rebind event occurs, removing node1 from the nodemask.
     node2 is now the next set bit in pol->nodemask
     cur_il_weight is now stale.
  3) allocation occurs, next_node_in(il_prev, nodes) returns
     node2. cur_il_weight is now applied to the wrong node.

The upper level allocator logic must still enforce mems_allowed,
so this isn't dangerous, but it is innaccurate.

Just clearing the weight is insufficient, as it creates two more
race conditions.  The root of the issue is the separation of weight
and node data between nodemask and cur_il_weight.

To solve this, update cur_il_weight to be an atomic_t, and place the
node that the weight applies to in the upper bits of the field:

atomic_t cur_il_weight
	node bits 32:8
	weight bits 7:0

Now retrieving or clearing the active interleave node and weight
is a single atomic operation, and we are not dependent on the
potentially changing state of (pol->nodemask) to determine what
node the weight applies to.

Two special observations:
- if the weight is non-zero, cur_il_weight must *always* have a
  valid node number, e.g. it cannot be NUMA_NO_NODE (-1).
  This is because we steal the top byte for the weight.

- MAX_NUMNODES is presently limited to 1024 or less on every
  architecture. This would permanently limit MAX_NUMNODES to
  an absolute maximum of (1 << 24) to avoid overflows.

Per some reading and discussion, it appears that max nodes is
limited to 1024 so that zone type still fits in page flags, so
this method seemed preferable compared to the alternatives of
trying to make all or part of mempolicy RCU protected (which
may not be possible, since it is often referenced during code
chunks which call operations that may sleep).

Signed-off-by: Gregory Price <gregory.price@xxxxxxxxxxxx>
---
 include/linux/mempolicy.h |  2 +-
 mm/mempolicy.c            | 93 +++++++++++++++++++++++++--------------
 2 files changed, 61 insertions(+), 34 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index c644d7bbd396..8108fc6e96ca 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -56,7 +56,7 @@ struct mempolicy {
 	} w;
 
 	/* Weighted interleave settings */
-	u8 cur_il_weight;
+	atomic_t cur_il_weight;
 };
 
 /*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5a517511658e..41b5fef0a6f5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -321,7 +321,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 	policy->mode = mode;
 	policy->flags = flags;
 	policy->home_node = NUMA_NO_NODE;
-	policy->cur_il_weight = 0;
+	atomic_set(&policy->cur_il_weight, 0);
 
 	return policy;
 }
@@ -356,6 +356,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 		tmp = *nodes;
 
 	pol->nodes = tmp;
+	atomic_set(&pol->cur_il_weight, 0);
 }
 
 static void mpol_rebind_preferred(struct mempolicy *pol,
@@ -973,8 +974,10 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 			*policy = next_node_in(current->il_prev, pol->nodes);
 		} else if (pol == current->mempolicy &&
 				(pol->mode == MPOL_WEIGHTED_INTERLEAVE)) {
-			if (pol->cur_il_weight)
-				*policy = current->il_prev;
+			int cweight = atomic_read(&pol->cur_il_weight);
+
+			if (cweight & 0xFF)
+				*policy = cweight >> 8;
 			else
 				*policy = next_node_in(current->il_prev,
 						       pol->nodes);
@@ -1864,36 +1867,48 @@ static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
 	unsigned int node, next;
 	struct task_struct *me = current;
 	u8 __rcu *table;
+	int cur_weight;
 	u8 weight;
 
-	node = next_node_in(me->il_prev, policy->nodes);
-	if (node == MAX_NUMNODES)
-		return node;
+	cur_weight = atomic_read(&policy->cur_il_weight);
+	node = cur_weight >> 8;
+	weight = cur_weight & 0xff;
 
-	/* on first alloc after setting mempolicy, acquire first weight */
-	if (unlikely(!policy->cur_il_weight)) {
+	/* If nodemask was rebound, just fetch the next node */
+	if (!weight || !node_isset(node, policy->nodes)) {
+		node = next_node_in(me->il_prev, policy->nodes);
+		/* can only happen if nodemask has become invalid */
+		if (node == MAX_NUMNODES)
+			return node;
 		rcu_read_lock();
 		table = rcu_dereference(iw_table);
 		/* detect system-default values */
 		weight = table ? table[node] : 1;
-		policy->cur_il_weight = weight ? weight : 1;
+		weight = weight ? weight : 1;
 		rcu_read_unlock();
 	}
 
 	/* account for this allocation call */
-	policy->cur_il_weight--;
+	weight--;
 
 	/* if now at 0, move to next node and set up that node's weight */
-	if (unlikely(!policy->cur_il_weight)) {
+	if (unlikely(!weight)) {
 		me->il_prev = node;
 		next = next_node_in(node, policy->nodes);
-		rcu_read_lock();
-		table = rcu_dereference(iw_table);
-		/* detect system-default values */
-		weight = table ? table[next] : 1;
-		policy->cur_il_weight = weight ? weight : 1;
-		rcu_read_unlock();
-	}
+		if (next != MAX_NUMNODES) {
+			rcu_read_lock();
+			table = rcu_dereference(iw_table);
+			/* detect system-default values */
+			weight = table ? table[next] : 1;
+			weight = weight ? weight : 1;
+			rcu_read_unlock();
+			cur_weight = (next << 8) | weight;
+		} else /* policy->nodes became invalid */
+			cur_weight = 0;
+	} else
+		cur_weight = (node << 8) | weight;
+
+	atomic_set(&policy->cur_il_weight, cur_weight);
 	return node;
 }
 
@@ -2385,6 +2400,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
 	nodemask_t nodes;
 	int nnodes, node, resume_node, next_node;
 	int prev_node = me->il_prev;
+	int cur_node_and_weight = atomic_read(&pol->cur_il_weight);
 	int i;
 
 	if (!nr_pages)
@@ -2394,10 +2410,11 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
 	if (!nnodes)
 		return 0;
 
+	node = cur_node_and_weight >> 8;
+	weight = cur_node_and_weight & 0xff;
 	/* Continue allocating from most recent node and adjust the nr_pages */
-	if (pol->cur_il_weight) {
-		node = next_node_in(prev_node, nodes);
-		node_pages = pol->cur_il_weight;
+	if (weight && node_isset(node, nodes)) {
+		node_pages = weight;
 		if (node_pages > rem_pages)
 			node_pages = rem_pages;
 		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
@@ -2408,27 +2425,36 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
 		 * if that's all the pages, no need to interleave, otherwise
 		 * we need to set up the next interleave node/weight correctly.
 		 */
-		if (rem_pages < pol->cur_il_weight) {
+		if (rem_pages < weight) {
 			/* stay on current node, adjust cur_il_weight */
-			pol->cur_il_weight -= rem_pages;
+			weight -= rem_pages;
+			atomic_set(&pol->cur_il_weight, ((node << 8) | weight));
 			return total_allocated;
-		} else if (rem_pages == pol->cur_il_weight) {
+		} else if (rem_pages == weight) {
 			/* move to next node / weight */
 			me->il_prev = node;
 			next_node = next_node_in(node, nodes);
-			rcu_read_lock();
-			table = rcu_dereference(iw_table);
-			weight = table ? table[next_node] : 1;
-			/* detect system-default usage */
-			pol->cur_il_weight = weight ? weight : 1;
-			rcu_read_unlock();
+			if (next_node == MAX_NUMNODES) {
+				next_node = 0;
+				weight = 0;
+			} else {
+				rcu_read_lock();
+				table = rcu_dereference(iw_table);
+				weight = table ? table[next_node] : 1;
+				/* detect system-default usage */
+				weight = weight ? weight : 1;
+				rcu_read_unlock();
+			}
+			atomic_set(&pol->cur_il_weight,
+				   ((next_node << 8) | weight));
 			return total_allocated;
 		}
 		/* Otherwise we adjust nr_pages down, and continue from there */
-		rem_pages -= pol->cur_il_weight;
-		pol->cur_il_weight = 0;
+		rem_pages -= weight;
 		prev_node = node;
 	}
+	/* clear cur_il_weight in case of an allocation failure */
+	atomic_set(&pol->cur_il_weight, 0);
 
 	/* create a local copy of node weights to operate on outside rcu */
 	weights = kmalloc(nr_node_ids, GFP_KERNEL);
@@ -2513,7 +2539,8 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
 	}
 	/* resume allocating from the calculated node and weight */
 	me->il_prev = resume_node;
-	pol->cur_il_weight = resume_weight;
+	resume_node = next_node_in(resume_node, nodes);
+	atomic_set(&pol->cur_il_weight, ((resume_node << 8) | resume_weight));
 	kfree(weights);
 	return total_allocated;
 }
-- 
2.39.1