On Wed, Jan 31, 2024 at 05:19:51PM +0800, Huang, Ying wrote:
> Gregory Price <gregory.price@xxxxxxxxxxxx> writes:
>
> >
> > I think this is handled already? It is definitely an explicit race
> > condition that is documented elsewhere:
> >
> > /*
> >  * mpol_rebind_policy - Migrate a policy to a different set of nodes
> >  *
> >  * Per-vma policies are protected by mmap_lock. Allocations using per-task
> >  * policies are protected by task->mems_allowed_seq to prevent a premature
> >  * OOM/allocation failure due to parallel nodemask modification.
> >  */
>
> Thanks for pointing this out!
>
> If we use task->mems_allowed_seq reader side in
> weighted_interleave_nodes() we can guarantee the consistency of
> policy->nodes. That may be not deserved, because it's not a big deal to
> allocate 1 page in a wrong node.
>
> It makes more sense to do that in
> alloc_pages_bulk_array_weighted_interleave(), because a lot of pages may
> be allocated there.
>

To save a version re-spin if there are issues, here are the 3 diffs I
have left.  If you are good with these changes, I'll squash the first 2
into the third commit, keep the last one as a separate commit (it
changes the interleave_nodes() logic too), and submit v5 w/ your
Reviewed-by tag on all of them.

Fix one (pedantic?) warning from syzbot:
----------------------------------------
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b1437396c357..dfd097009606 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2391,7 +2391,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
 	unsigned long nr_allocated = 0;
 	unsigned long rounds;
 	unsigned long node_pages, delta;
-	u8 __rcu *table, *weights, weight;
+	u8 __rcu *table, __rcu *weights, weight;
 	unsigned int weight_total = 0;
 	unsigned long rem_pages = nr_pages;
 	nodemask_t nodes;

Simplifying resume_node/weight logic:
-------------------------------------
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2c1aef8eab70..b0ca9bcdd64c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2405,15 +2405,9 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
 		page_array += nr_allocated;
 		total_allocated += nr_allocated;
 		/* if that's all the pages, no need to interleave */
-		if (rem_pages < weight) {
-			/* stay on current node, adjust il_weight */
+		if (rem_pages <= weight) {
 			me->il_weight -= rem_pages;
 			return total_allocated;
-		} else if (rem_pages == weight) {
-			/* move to next node / weight */
-			me->il_prev = next_node_in(node, nodes);
-			me->il_weight = get_il_weight(me->il_prev);
-			return total_allocated;
 		}
 		/* Otherwise we adjust remaining pages, continue from there */
 		rem_pages -= weight;
@@ -2460,17 +2454,10 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
 			node_pages += weight;
 			delta -= weight;
 		} else if (delta) {
+			/* when delta is depleted, resume from that node */
 			node_pages += delta;
-			/* delta may deplete on a boundary or w/ a remainder */
-			if (delta == weight) {
-				/* boundary: resume from next node/weight */
-				resume_node = next_node_in(node, nodes);
-				resume_weight = weights[resume_node];
-			} else {
-				/* remainder: resume this node w/ remainder */
-				resume_node = node;
-				resume_weight = weight - delta;
-			}
+			resume_node = node;
+			resume_weight = weight - delta;
 			delta = 0;
 		}
 		/* node_pages can be 0 if an allocation fails and rounds == 0 */
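For context on why both "== weight" branches can simply go away: a
remaining weight of 0 already means "advance to the next node and reload
its weight on the next pass" (weighted_interleave_nodes() checks
!current->il_weight), so leaving il_weight/resume_weight at 0 is
equivalent to eagerly moving on.  A minimal standalone sketch of that
invariant, using stand-in names (pick_node(), modulo wraparound) rather
than the kernel helpers, and assuming nonzero weights:

/* weighted round-robin step: a depleted weight advances on the next call */
static unsigned int pick_node(unsigned int *cur, unsigned int *left,
			      const unsigned char *weights,
			      unsigned int nnodes)
{
	if (*left == 0) {			/* weight depleted: move on */
		*cur = (*cur + 1) % nnodes;	/* stand-in for next_node_in() */
		*left = weights[*cur];		/* reload the new node's weight */
	}
	(*left)--;				/* consume one unit of weight */
	return *cur;
}

Whether the caller decrements the weight to exactly 0 and returns, or
eagerly advances and reloads, the node sequence produced by subsequent
calls is the same.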

task->mems_allowed_seq protection (added as 4th patch)
------------------------------------------------------
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b0ca9bcdd64c..b1437396c357 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1879,10 +1879,15 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
 static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
 {
 	unsigned int node = current->il_prev;
+	unsigned int cpuset_mems_cookie;
 
+retry:
+	/* to prevent miscount use tsk->mems_allowed_seq to detect rebind */
+	cpuset_mems_cookie = read_mems_allowed_begin();
 	if (!current->il_weight || !node_isset(node, policy->nodes)) {
 		node = next_node_in(node, policy->nodes);
-		/* can only happen if nodemask is being rebound */
+		if (read_mems_allowed_retry(cpuset_mems_cookie))
+			goto retry;
 		if (node == MAX_NUMNODES)
 			return node;
 		current->il_prev = node;
@@ -1896,10 +1901,17 @@ static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
 static unsigned int interleave_nodes(struct mempolicy *policy)
 {
 	unsigned int nid;
+	unsigned int cpuset_mems_cookie;
+
+	/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
+	do {
+		cpuset_mems_cookie = read_mems_allowed_begin();
+		nid = next_node_in(current->il_prev, policy->nodes);
+	} while (read_mems_allowed_retry(cpuset_mems_cookie));
 
-	nid = next_node_in(current->il_prev, policy->nodes);
 	if (nid < MAX_NUMNODES)
 		current->il_prev = nid;
+
 	return nid;
 }
@@ -2374,6 +2386,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
 				struct page **page_array)
 {
 	struct task_struct *me = current;
+	unsigned int cpuset_mems_cookie;
 	unsigned long total_allocated = 0;
 	unsigned long nr_allocated = 0;
 	unsigned long rounds;
@@ -2388,10 +2401,17 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
 	int prev_node;
 	int i;
+
 	if (!nr_pages)
 		return 0;
 
-	nnodes = read_once_policy_nodemask(pol, &nodes);
+	/* read the nodes onto the stack, retry if done during rebind */
+	do {
+		cpuset_mems_cookie = read_mems_allowed_begin();
+		nnodes = read_once_policy_nodemask(pol, &nodes);
+	} while (read_mems_allowed_retry(cpuset_mems_cookie));
+
+	/* if the nodemask has become invalid, we cannot do anything */
 	if (!nnodes)
 		return 0;
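For reference, the protection above is the usual seqcount
snapshot-and-retry pattern: copy the nodemask onto the stack inside a
read section and retry the copy if a cpuset rebind bumped
task->mems_allowed_seq in the meantime.  A rough self-contained sketch
of that pattern, using a plain seqcount_t and a hypothetical
snapshot_nodes() helper rather than the real read_mems_allowed_begin()/
read_mems_allowed_retry() + read_once_policy_nodemask() combination
used in the diff:

#include <linux/nodemask.h>
#include <linux/seqlock.h>

/* copy a nodemask that a writer may rebind under @seq onto the caller's stack */
static void snapshot_nodes(nodemask_t *dst, const nodemask_t *src,
			   seqcount_t *seq)
{
	unsigned int start;

	do {
		start = read_seqcount_begin(seq);	/* open read section */
		*dst = *src;				/* copy the mask */
	} while (read_seqcount_retry(seq, start));	/* writer raced: retry */
}

The bulk path benefits the most, since a stale mask there can misplace
many pages at once; the per-page paths use the same cookie mainly to
avoid the miscount noted in the comments.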