Gregory Price <gregory.price@xxxxxxxxxxxx> writes:

> On Wed, Jan 31, 2024 at 05:19:51PM +0800, Huang, Ying wrote:
>> Gregory Price <gregory.price@xxxxxxxxxxxx> writes:
>>
>> >
>> > I think this is handled already?  It is definitely an explicit race
>> > condition that is documented elsewhere:
>> >
>> > /*
>> >  * mpol_rebind_policy - Migrate a policy to a different set of nodes
>> >  *
>> >  * Per-vma policies are protected by mmap_lock. Allocations using per-task
>> >  * policies are protected by task->mems_allowed_seq to prevent a premature
>> >  * OOM/allocation failure due to parallel nodemask modification.
>> >  */
>>
>> Thanks for pointing this out!
>>
>> If we use task->mems_allowed_seq reader side in
>> weighted_interleave_nodes() we can guarantee the consistency of
>> policy->nodes.  That may be not deserved, because it's not a big deal to
>> allocate 1 page in a wrong node.
>>
>> It makes more sense to do that in
>> alloc_pages_bulk_array_weighted_interleave(), because a lot of pages may
>> be allocated there.
>>
>
> To save the versioning if there are issues, here are the 3 diffs that
> I have left.  If you are good with these changes, I'll squash the first
> 2 into the third commit, keep the last one as a separate commit (it
> changes the interleave_nodes() logic too), and submit v5 w/ your
> reviewed tag on all of them.
>
>
> Fix one (pedantic?) warning from syzbot:
> ----------------------------------------
>
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index b1437396c357..dfd097009606 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -2391,7 +2391,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
>         unsigned long nr_allocated = 0;
>         unsigned long rounds;
>         unsigned long node_pages, delta;
> -       u8 __rcu *table, *weights, weight;
> +       u8 __rcu *table, __rcu *weights, weight;

The __rcu usage can be checked with `sparse` directly.  For example,

make C=1 mm/mempolicy.o

More details can be found in

https://www.kernel.org/doc/html/latest/dev-tools/sparse.html

Per my understanding, we shouldn't use "__rcu" here.  Please search
"__rcu" in the following document.

https://www.kernel.org/doc/html/latest/RCU/checklist.html

>         unsigned int weight_total = 0;
>         unsigned long rem_pages = nr_pages;
>         nodemask_t nodes;
>
>
>
> Simplifying resume_node/weight logic:
> -------------------------------------
>
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 2c1aef8eab70..b0ca9bcdd64c 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -2405,15 +2405,9 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
>                 page_array += nr_allocated;
>                 total_allocated += nr_allocated;
>                 /* if that's all the pages, no need to interleave */
> -               if (rem_pages < weight) {
> -                       /* stay on current node, adjust il_weight */
> +               if (rem_pages <= weight) {
>                         me->il_weight -= rem_pages;
>                         return total_allocated;
> -               } else if (rem_pages == weight) {
> -                       /* move to next node / weight */
> -                       me->il_prev = next_node_in(node, nodes);
> -                       me->il_weight = get_il_weight(me->il_prev);
> -                       return total_allocated;
>                 }
>                 /* Otherwise we adjust remaining pages, continue from there */
>                 rem_pages -= weight;
> @@ -2460,17 +2454,10 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
>                         node_pages += weight;
>                         delta -= weight;
>                 } else if (delta) {
> +                       /* when delta is deleted, resume from that node */
                                           ~~~~~~~
                                           depleted?
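
IIUC, with this simplification the delta == weight boundary case now
resumes on the same node with resume_weight == 0, and the next
allocation will move on via the !il_weight check in
weighted_interleave_nodes() (quoted in the third diff below).  Below is
a minimal, untested user space model of that reasoning.  The node list,
the weights, and the next_node()/next_alloc_node() helpers are made up
for illustration; they only mimic what I understand next_node_in() and
weighted_interleave_nodes() to do.

#include <stdio.h>

#define NR_NODES 3

static const unsigned char node_weights[NR_NODES] = { 4, 2, 1 };

/* stand-in for next_node_in(): wrap around the toy node list */
static int next_node(int node)
{
        return (node + 1) % NR_NODES;
}

/*
 * Stand-in for weighted_interleave_nodes(): stay on il_prev while
 * il_weight is non-zero, otherwise advance to the next node.
 */
static int next_alloc_node(int il_prev, unsigned char il_weight)
{
        if (!il_weight)
                il_prev = next_node(il_prev);
        return il_prev;
}

int main(void)
{
        int node = 1;
        unsigned char weight = node_weights[node];
        unsigned char delta = weight;   /* the delta == weight boundary case */

        /* old code: resume from the next node with its full weight */
        int old_node = next_node(node);
        unsigned char old_weight = node_weights[old_node];

        /* new code: resume from this node with weight - delta == 0 left */
        int new_node = node;
        unsigned char new_weight = weight - delta;

        printf("old resume -> next allocation on node %d\n",
               next_alloc_node(old_node, old_weight));
        printf("new resume -> next allocation on node %d\n",
               next_alloc_node(new_node, new_weight));
        return 0;
}

Both variants pick the same node for the next allocation, so dropping
the special case looks correct to me.
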
>                         node_pages += delta;
> -                       /* delta may deplete on a boundary or w/ a remainder */
> -                       if (delta == weight) {
> -                               /* boundary: resume from next node/weight */
> -                               resume_node = next_node_in(node, nodes);
> -                               resume_weight = weights[resume_node];
> -                       } else {
> -                               /* remainder: resume this node w/ remainder */
> -                               resume_node = node;
> -                               resume_weight = weight - delta;
> -                       }
> +                       resume_node = node;
> +                       resume_weight = weight - delta;
>                         delta = 0;
>                 }
>                 /* node_pages can be 0 if an allocation fails and rounds == 0 */
>
>
>
>
> task->mems_allowed_seq protection (added as 4th patch)
> ------------------------------------------------------
>
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index b0ca9bcdd64c..b1437396c357 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -1879,10 +1879,15 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
>  static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
>  {
>         unsigned int node = current->il_prev;
> +       unsigned int cpuset_mems_cookie;
>
> +retry:
> +       /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */
> +       cpuset_mems_cookie = read_mems_allowed_begin();
>         if (!current->il_weight || !node_isset(node, policy->nodes)) {
>                 node = next_node_in(node, policy->nodes);

node will be changed in the loop.  So we need to change the logic here.

> -               /* can only happen if nodemask is being rebound */
> +               if (read_mems_allowed_retry(cpuset_mems_cookie))
> +                       goto retry;
>                 if (node == MAX_NUMNODES)
>                         return node;
>                 current->il_prev = node;
> @@ -1896,10 +1901,17 @@ static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
>  static unsigned int interleave_nodes(struct mempolicy *policy)
>  {
>         unsigned int nid;
> +       unsigned int cpuset_mems_cookie;
> +
> +       /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
> +       do {
> +               cpuset_mems_cookie = read_mems_allowed_begin();
> +               nid = next_node_in(current->il_prev, policy->nodes);
> +       } while (read_mems_allowed_retry(cpuset_mems_cookie));
>
> -       nid = next_node_in(current->il_prev, policy->nodes);
>         if (nid < MAX_NUMNODES)
>                 current->il_prev = nid;
> +
>         return nid;
>  }
>
> @@ -2374,6 +2386,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
>                                 struct page **page_array)
>  {
>         struct task_struct *me = current;
> +       unsigned int cpuset_mems_cookie;
>         unsigned long total_allocated = 0;
>         unsigned long nr_allocated = 0;
>         unsigned long rounds;
> @@ -2388,10 +2401,17 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
>         int prev_node;
>         int i;
>
> +

Change by accident?

>         if (!nr_pages)
>                 return 0;
>
> -       nnodes = read_once_policy_nodemask(pol, &nodes);
> +       /* read the nodes onto the stack, retry if done during rebind */
> +       do {
> +               cpuset_mems_cookie = read_mems_allowed_begin();
> +               nnodes = read_once_policy_nodemask(pol, &nodes);
> +       } while (read_mems_allowed_retry(cpuset_mems_cookie));
> +
> +       /* if the nodemask has become invalid, we cannot do anything */
>         if (!nnodes)
>                 return 0;

--
Best Regards,
Huang, Ying