Re: [EXT] [RFC PATCH v3 4/4] mm/mempolicy: modify interleave mempolicy to use node weights

Srinivasulu Thanneeru <sthanneeru.opensrc@xxxxxxxxxx> · Tue, 31 Oct 2023 23:53:15 +0530

On 10/31/2023 6:08 AM, Gregory Price wrote:


The node subsystem implements interleave weighting for the purpose
of bandwidth optimization.  Each node may have different weights in
relation to each compute node ("access node").

The MPOL_INTERLEAVE mempolicy uses these node weights to implement
weighted interleave.  Because every node's weight defaults to 1, the
original interleave behavior is retained unless a weight is changed.

Examples

Weight settings:
echo 4 > node0/access0/il_weight
echo 1 > node0/access1/il_weight

echo 3 > node1/access0/il_weight
echo 2 > node1/access1/il_weight

Results:

Task A:
    cpunode:  0
    nodemask: [0,1]
    weights:  [4,3]
    allocation result: [0,0,0,0,1,1,1 repeat]

Task B:
    cpunode:  1
    nodemask: [0,1]
    weights:  [1,2]
    allocation result: [0,1,1 repeat]
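    Note: weights are relative to the access node (the task's cpunode),
    which is why Task A and Task B see different weights for the same
    nodemask.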

Signed-off-by: Gregory Price <gregory.price@xxxxxxxxxxxx>
Thank you Gregory for the collaboration.
Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@xxxxxxxxxx>
---
  include/linux/mempolicy.h |   4 ++
  mm/mempolicy.c            | 138 +++++++++++++++++++++++++++++---------
  2 files changed, 112 insertions(+), 30 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index d232de7cdc56..240468b669fd 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -48,6 +48,10 @@ struct mempolicy {
         nodemask_t nodes;       /* interleave/bind/perfer */
         int home_node;          /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */

+       /* weighted interleave settings */
+       unsigned char cur_weight;
+       unsigned char il_weights[MAX_NUMNODES];
+
         union {
                 nodemask_t cpuset_mems_allowed; /* relative to these nodes */
                 nodemask_t user_nodemask;       /* nodemask passed by user */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 29ebf1e7898c..d62e942a13bd 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -102,6 +102,7 @@
  #include <linux/mmu_notifier.h>
  #include <linux/printk.h>
  #include <linux/swapops.h>
+#include <linux/memory-tiers.h>

  #include <asm/tlbflush.h>
  #include <asm/tlb.h>
@@ -300,6 +301,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
         policy->mode = mode;
         policy->flags = flags;
         policy->home_node = NUMA_NO_NODE;
+       policy->cur_weight = 0;

         return policy;
  }
@@ -334,6 +336,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
                 tmp = *nodes;

         pol->nodes = tmp;
+       pol->cur_weight = 0;
  }

  static void mpol_rebind_preferred(struct mempolicy *pol,
@@ -881,8 +884,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,

         old = current->mempolicy;
         current->mempolicy = new;
-       if (new && new->mode == MPOL_INTERLEAVE)
+       if (new && new->mode == MPOL_INTERLEAVE) {
                 current->il_prev = MAX_NUMNODES-1;
+               new->cur_weight = 0;
+       }
+
         task_unlock(current);
         mpol_put(old);
         ret = 0;
@@ -1903,12 +1909,21 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
  /* Do dynamic interleaving for a process */
  static unsigned interleave_nodes(struct mempolicy *policy)
  {
-       unsigned next;
+       unsigned int next;
+       unsigned char next_weight;
         struct task_struct *me = current;

         next = next_node_in(me->il_prev, policy->nodes);
-       if (next < MAX_NUMNODES)
+       if (!policy->cur_weight) {
+               /* If the node is set, at least 1 allocation is required */
+               next_weight = node_get_il_weight(next, numa_node_id());
+               policy->cur_weight = next_weight ? next_weight : 1;
+       }
+
+       policy->cur_weight--;
+       if (next < MAX_NUMNODES && !policy->cur_weight)
                 me->il_prev = next;
+
         return next;
  }

@@ -1967,25 +1982,37 @@ unsigned int mempolicy_slab_node(void)
  static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
  {
         nodemask_t nodemask = pol->nodes;
-       unsigned int target, nnodes;
-       int i;
+       unsigned int target, nnodes, il_weight;
+       unsigned char weight;
         int nid;
+       int cur_node = numa_node_id();
+
         /*
          * The barrier will stabilize the nodemask in a register or on
          * the stack so that it will stop changing under the code.
          *
          * Between first_node() and next_node(), pol->nodes could be changed
          * by other threads. So we put pol->nodes in a local stack.
+        *
+        * Additionally, keep cur_node on the stack in case the task migrates.
          */
         barrier();

         nnodes = nodes_weight(nodemask);
         if (!nnodes)
-               return numa_node_id();
-       target = (unsigned int)n % nnodes;
+               return cur_node;
+
+       il_weight = nodes_get_il_weights(cur_node, &nodemask, pol->il_weights);
+       target = (unsigned int)n % il_weight;
         nid = first_node(nodemask);
-       for (i = 0; i < target; i++)
-               nid = next_node(nid, nodemask);
+       while (target) {
+               weight = pol->il_weights[nid];
+               if (target < weight)
+                       break;
+               target -= weight;
+               nid = next_node_in(nid, nodemask);
+       }
+
         return nid;
  }

@@ -2319,32 +2346,83 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
                 struct mempolicy *pol, unsigned long nr_pages,
                 struct page **page_array)
  {
-       int nodes;
-       unsigned long nr_pages_per_node;
-       int delta;
-       int i;
-       unsigned long nr_allocated;
+       struct task_struct *me = current;
         unsigned long total_allocated = 0;
+       unsigned long nr_allocated;
+       unsigned long rounds;
+       unsigned long node_pages, delta;
+       unsigned char weight;
+       unsigned long il_weight;
+       unsigned long req_pages = nr_pages;
+       int nnodes, node, prev_node;
+       int cur_node = numa_node_id();
+       int i;

-       nodes = nodes_weight(pol->nodes);
-       nr_pages_per_node = nr_pages / nodes;
-       delta = nr_pages - nodes * nr_pages_per_node;
-
-       for (i = 0; i < nodes; i++) {
-               if (delta) {
-                       nr_allocated = __alloc_pages_bulk(gfp,
-                                       interleave_nodes(pol), NULL,
-                                       nr_pages_per_node + 1, NULL,
-                                       page_array);
-                       delta--;
-               } else {
-                       nr_allocated = __alloc_pages_bulk(gfp,
-                                       interleave_nodes(pol), NULL,
-                                       nr_pages_per_node, NULL, page_array);
+       prev_node = me->il_prev;
+       nnodes = nodes_weight(pol->nodes);
+       /* Continue allocating from most recent node */
+       if (pol->cur_weight) {
+               node = next_node_in(prev_node, pol->nodes);
+               node_pages = pol->cur_weight;
+               if (node_pages > nr_pages)
+                       node_pages = nr_pages;
+               nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+                                                 NULL, page_array);
+               page_array += nr_allocated;
+               total_allocated += nr_allocated;
+               /* if that's all the pages, no need to interleave */
+               if (req_pages <= pol->cur_weight) {
+                       pol->cur_weight -= req_pages;
+                       return total_allocated;
                 }
-
+               /* Otherwise we adjust req_pages down, and continue from there */
+               req_pages -= pol->cur_weight;
+               pol->cur_weight = 0;
+               prev_node = node;
+       }
+
+       il_weight = nodes_get_il_weights(cur_node, &pol->nodes,
+                                        pol->il_weights);
+       rounds = req_pages / il_weight;
+       delta = req_pages % il_weight;
+       for (i = 0; i < nnodes; i++) {
+               node = next_node_in(prev_node, pol->nodes);
+               weight = pol->il_weights[node];
+               node_pages = weight * rounds;
+               if (delta > weight) {
+                       node_pages += weight;
+                       delta -= weight;
+               } else if (delta) {
+                       node_pages += delta;
+                       delta = 0;
+               }
+               /* The number of requested pages may not hit every node */
+               if (!node_pages)
+                       break;
+               /* If an over-allocation would occur, floor it */
+               if (node_pages + total_allocated > nr_pages) {
+                       node_pages = nr_pages - total_allocated;
+                       delta = 0;
+               }
+               nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+                                                 NULL, page_array);
                 page_array += nr_allocated;
                 total_allocated += nr_allocated;
+               prev_node = node;
+       }
+
+       /*
+        * Finally, we need to update me->il_prev and pol->cur_weight
+        * If the last node allocated on has un-used weight, apply
+        * the remainder as the cur_weight, otherwise proceed to next node
+        */
+       if (node_pages) {
+               me->il_prev = prev_node;
+               node_pages %= weight;
+               pol->cur_weight = weight - node_pages;
+       } else {
+               me->il_prev = node;
+               pol->cur_weight = 0;
         }

         return total_allocated;
--
2.39.1