On Thu, Jan 18, 2024 at 11:05:52AM +0800, Huang, Ying wrote:
> Gregory Price <gourry.memverge@xxxxxxxxx> writes:
>
> > +static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
> > +                struct mempolicy *pol, unsigned long nr_pages,
> > +                struct page **page_array)
> > +{
> > +        struct task_struct *me = current;
> > +        unsigned long total_allocated = 0;
> > +        unsigned long nr_allocated;
> > +        unsigned long rounds;
> > +        unsigned long node_pages, delta;
> > +        u8 weight;
> > +        struct iw_table __rcu *table;
> > +        u8 *weights;
> > +        unsigned int weight_total = 0;
> > +        unsigned long rem_pages = nr_pages;
> > +        nodemask_t nodes;
> > +        int nnodes, node, weight_nodes;
> > +        int prev_node = NUMA_NO_NODE;
> > +        int i;
> > +
> > +        nnodes = read_once_policy_nodemask(pol, &nodes);
> > +        if (!nnodes)
> > +                return 0;
> > +
> > +        /* Continue allocating from most recent node and adjust the nr_pages */
> > +        if (pol->wil.cur_weight) {
> > +                node = next_node_in(me->il_prev, nodes);
> > +                node_pages = pol->wil.cur_weight;
> > +                if (node_pages > rem_pages)
> > +                        node_pages = rem_pages;
> > +                nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
> > +                                                  NULL, page_array);
> > +                page_array += nr_allocated;
> > +                total_allocated += nr_allocated;
> > +                /* if that's all the pages, no need to interleave */
> > +                if (rem_pages <= pol->wil.cur_weight) {
> > +                        pol->wil.cur_weight -= rem_pages;
> > +                        return total_allocated;
> > +                }
> > +                /* Otherwise we adjust nr_pages down, and continue from there */
> > +                rem_pages -= pol->wil.cur_weight;
> > +                pol->wil.cur_weight = 0;
> > +                prev_node = node;
> > +        }
> > +
> > +        /* fetch the weights for this operation and calculate total weight */
> > +        weights = kmalloc(nnodes, GFP_KERNEL);
> > +        if (!weights)
> > +                return total_allocated;
> > +
> > +        rcu_read_lock();
> > +        table = rcu_dereference(iw_table);
> > +        weight_nodes = 0;
> > +        for_each_node_mask(node, nodes) {
> > +                weights[weight_nodes++] = table->weights[node];
> > +                weight_total += table->weights[node];
> > +        }
> > +        rcu_read_unlock();
> > +
> > +        if (!weight_total) {
> > +                kfree(weights);
> > +                return total_allocated;
> > +        }
> > +
> > +        /* Now we can continue allocating as if from 0 instead of an offset */
> > +        rounds = rem_pages / weight_total;
> > +        delta = rem_pages % weight_total;
> > +        for (i = 0; i < nnodes; i++) {
> > +                node = next_node_in(prev_node, nodes);
> > +                weight = weights[i];
> > +                node_pages = weight * rounds;
> > +                if (delta) {
> > +                        if (delta > weight) {
> > +                                node_pages += weight;
> > +                                delta -= weight;
> > +                        } else {
> > +                                node_pages += delta;
> > +                                delta = 0;
> > +                        }
> > +                }
> > +                nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
> > +                                                  NULL, page_array);
> > +                page_array += nr_allocated;
> > +                total_allocated += nr_allocated;
> > +                if (total_allocated == nr_pages)
> > +                        break;
> > +                prev_node = node;
> > +        }
> > +
> > +        /*
> > +         * Finally, we need to update me->il_prev and pol->wil.cur_weight
> > +         * if there were overflow pages, but not equivalent to the node
> > +         * weight, set the cur_weight to node_weight - delta and the
> > +         * me->il_prev to the previous node. Otherwise if it was perfect
> > +         * we can simply set il_prev to node and cur_weight to 0
> > +         */
> > +        if (node_pages) {
> > +                me->il_prev = prev_node;
> > +                node_pages %= weight;
> > +                pol->wil.cur_weight = weight - node_pages;
> > +        } else {
> > +                me->il_prev = node;
> > +                pol->wil.cur_weight = 0;
> > +        }
>
> It appears that we should set me->il_prev and pol->wil.cur_weight when
> delta becomes 0?
> That is, following allocation should start from there?

So the observation is that when delta reaches 0, we know what the prior
node should be.  The only corner case is when delta is 0 on entry to the
loop, in which case the current prev_node is already the correct
prev_node.

Eyeballing it, this seems correct, but I'll do some additional
validation tomorrow.  That should clean up the last block a bit.
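Something like this, maybe (untested sketch; resume_node and
resume_weight are new locals just for illustration, and this assumes the
resume path at the top keeps allocating cur_weight pages on
next_node_in(il_prev)):

        /* resume_node starts as prev_node to cover delta == 0 on entry */
        int resume_node = prev_node;
        u8 resume_weight = 0;

        for (i = 0; i < nnodes; i++) {
                node = next_node_in(prev_node, nodes);
                weight = weights[i];
                node_pages = weight * rounds;
                if (delta >= weight) {
                        node_pages += weight;
                        delta -= weight;
                        if (!delta) {
                                /* delta consumed this node's full weight:
                                 * resume interleave on the node after it */
                                resume_node = node;
                                resume_weight = 0;
                        }
                } else if (delta) {
                        /* delta ran out mid-node: resume on this node with
                         * the unspent remainder of its weight */
                        node_pages += delta;
                        resume_node = prev_node;
                        resume_weight = weight - delta;
                        delta = 0;
                }
                nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
                                                  NULL, page_array);
                page_array += nr_allocated;
                total_allocated += nr_allocated;
                if (total_allocated == nr_pages)
                        break;
                prev_node = node;
        }

        /* the trailing node_pages/weight recomputation goes away */
        me->il_prev = resume_node;
        pol->wil.cur_weight = resume_weight;

Thanks!
~Gregory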