[PATCH] Custom interleaved allocation

Vasileios Karakasis <bkk@xxxxxxxxxxxxxxxxx> · Wed, 21 Nov 2012 20:54:56 +0200

Hi,

I'm submitting a patch that I found quite useful in my case. It supports
a user-specified custom interleaved allocation scheme. More
specifically, apart from the required allocation size, the user
specifies an arbitrary set of partitions and the set of nodes, where
each partition must lie on. The interleaved allocator will allocate the
required region with a single mmap() call and will then bind the
different partitions on the specified nodes, taking care of the correct
alignment of the partitions and the possible merging of very small ones.

The key motivation behind this patch is the seamless conversion of
existing multithreaded shared-memory-based code to NUMA-aware. For
example, suppose you have a multithreaded code acting on a large dataset
that you split using your own static load balancing scheme. Converting
this code to NUMA-aware using only numa_alloc_onnode() calls would
require considerable refactoring, since you need to alter your algorithm
toward a more distributed logic. The allocation scheme I'm submitting
leaves intact the original shared-memory logic by applying only a
user-guided interleaving of the memory pages, based on the algorithm's
needs.

The key functions are the `alloc_interleaved()', which is responsible
for the allocation and node binding, and `fix_interleaving()', which
fixes the user-specified partitions.

I'm also submitting a set of test/check routines. If you think this
patch is relevant, I can integrate it to libnuma.

Regards,

V.K.

[PATCH follows]

diff -urN numactl-2.0.8-orig/Makefile numactl-2.0.8/Makefile

--- numactl-2.0.8-orig/Makefile    2012-10-11 23:52:24.000000000 +0300
+++ numactl-2.0.8/Makefile    2012-11-21 20:41:39.000000000 +0200
@@ -32,10 +32,11 @@
           test/mbind_mig_pages test/migrate_pages \
           migratepages migspeed migspeed.o libnuma.a \
           test/move_pages test/realloc_test sysfs.o affinity.o \
-          test/node-parse rtnetlink.o test/A numastat
+          test/node-parse rtnetlink.o test/A numastat \
+          custom_interleaved.o custom_interleaved
 SOURCES := bitops.c libnuma.c distance.c memhog.c numactl.c numademo.c \
     numamon.c shm.c stream_lib.c stream_main.c syscall.c util.c mt.c \
-    clearcache.c test/*.c affinity.c sysfs.c rtnetlink.c
+    clearcache.c test/*.c affinity.c sysfs.c rtnetlink.c
custom_interleaved.c
 
 ifeq ($(strip $(PREFIX)),)
 prefix := /usr
@@ -49,7 +50,7 @@
      test/tshared stream test/mynode test/pagesize test/ftok
test/prefered \
      test/randmap test/nodemap test/distance test/tbitmap test/move_pages \
      test/mbind_mig_pages test/migrate_pages test/realloc_test libnuma.a \
-     test/node-parse numastat
+     test/node-parse numastat custom_interleaved
 
 numactl: numactl.o util.o shm.o bitops.o libnuma.so
 
@@ -103,6 +104,8 @@
     $(AR) rc $@ $^
     $(RANLIB) $@
 
+custom_interleaved: LDLIBS += libnuma.a
+
 distance.o : CFLAGS += -fPIC
 
 syscall.o : CFLAGS += -fPIC
diff -urN numactl-2.0.8-orig/custom_interleaved.c
numactl-2.0.8/custom_interleaved.c
--- numactl-2.0.8-orig/custom_interleaved.c    1970-01-01
02:00:00.000000000 +0200
+++ numactl-2.0.8/custom_interleaved.c    2012-11-21 19:48:57.000000000
+0200
@@ -0,0 +1,333 @@
+/*
+ * User-defined custom interleaving of memory pages.
+ *
+ * Copyright (C) 2011-2012, Vasileios Karakasis
+ * Copyright (C) 2011-2012, Theodoros Gkountouvas
+ */
+#include <numa.h>
+#include <numaif.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <assert.h>
+
+#define ALIGN_ADDR(addr, bound) (void *)((unsigned long) addr & ~(bound-1))
+
+static int get_best_node(const size_t *node_scores, int nr_nodes)
+{
+    int j;
+    int ret = 0;
+    size_t max_score = node_scores[0];
+    for (j = 1; j < nr_nodes; j++)
+        if (node_scores[j] > max_score) {
+            max_score = node_scores[j];
+            ret = j;
+        }
+
+    return ret;
+}
+
+static void fix_interleaving(size_t nr_parts, size_t *parts, int *nodes)
+{
+    int i;
+    int nr_nodes = numa_num_configured_nodes();
+    int pagesize = numa_pagesize();
+    size_t *node_scores = calloc(nr_nodes, sizeof(*node_scores));
+    if (!node_scores) {
+        perror("malloc");
+        exit(1);
+    }
+
+    // merge small partitions
+    size_t base_part = 0;
+    node_scores[nodes[0]] = parts[0];
+    for (i = 1; i < nr_parts; i++) {
+        size_t part_score = parts[i];
+        if (parts[base_part] < pagesize) {
+            // merge with base partition
+            parts[base_part] += parts[i];
+            parts[i] = 0;
+        } else {
+            // assign partition to the node with the highest score
+            nodes[base_part] = get_best_node(node_scores, nr_nodes);
+           
+            // reset the scores
+            memset(node_scores, 0, nr_nodes*sizeof(*node_scores));
+            base_part = i;
+        }
+
+        node_scores[nodes[i]] += part_score;
+    }
+
+    // fix the last merger
+    nodes[base_part] = get_best_node(node_scores, nr_nodes);
+
+    // adjust partition size to multiples of system's page size
+    size_t rem = 0;
+    ssize_t size_adjust = 0;   // can be negative
+    for (i = 0; i < nr_parts; i++) {
+        if (!parts[i])
+            continue;
+
+        parts[i] += size_adjust;   // adjustment from previous partition
+        rem = parts[i] % pagesize;
+
+        // find next valid partition
+        size_t next_part = i+1;
+        while (next_part != nr_parts && !parts[next_part])
+            next_part++;
+
+        //
+        // If at last partition, always execute the else path.
+        // Otherwise, take the remainder of the page, if the largest
part is
+        // in the current partition, assuring, however, that the next
+        // partition will have at least one page.
+        //
+        if (next_part != nr_parts &&
+            ((rem < pagesize / 2) ||
+             (parts[next_part] + rem < 2*pagesize))) {
+            parts[i] -= rem;
+            size_adjust = rem;
+        } else {
+            parts[i] += pagesize - rem;
+            size_adjust = rem - pagesize;
+        }
+    }
+
+    free(node_scores);
+}
+
+void *alloc_interleaved(size_t size, size_t *parts, size_t nr_parts,
+                        int *nodes)
+{
+    int i;
+    void *ret = NULL;
+
+    ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS |
MAP_PRIVATE,
+               0, 0);
+    if (ret == (void *) -1) {
+        ret = NULL;
+        goto exit;
+    }
+
+    struct bitmask *nodemask = numa_allocate_nodemask();
+
+    /*
+     * Fix the interleaving and bind parts to specific nodes
+     */
+    fix_interleaving(nr_parts, parts, nodes);
+
+    void *curr_part = ret;
+    for (i = 0; i < nr_parts; i++) {
+        if (!parts[i])
+            continue;
+
+        printf("node = %d, part = %zd\n", nodes[i], parts[i]);
+        numa_bitmask_setbit(nodemask, nodes[i]);
+        if (mbind(curr_part, parts[i], MPOL_BIND,
+                  nodemask->maskp, nodemask->size, 0) < 0) {
+            perror("mbind");
+            exit(1);
+        }
+        curr_part += parts[i];
+
+        /* Clear the mask for the next round */
+        numa_bitmask_clearbit(nodemask, nodes[i]);
+    }
+
+    numa_bitmask_free(nodemask);
+exit:
+    return ret;
+}
+
+int check_region(void *addr, size_t size, int node)
+{
+    void *misplaced_start;
+    int pagesize;
+    size_t i;
+    int misplaced_node;
+    int err = 0;
+
+    pagesize = numa_pagesize();
+    misplaced_start = NULL;
+    misplaced_node = -1;
+    void *aligned_addr = ALIGN_ADDR(addr, pagesize);
+    for (i = 0; i < size; i += pagesize) {
+        int page_node;
+        if (get_mempolicy(&page_node, 0, 0, aligned_addr + i,
+                          MPOL_F_ADDR | MPOL_F_NODE) < 0) {
+            perror("get_mempolicy()");
+            exit(1);
+        }
+
+        if (page_node != node) {
+            // Start of a misplaced region
+            if (!misplaced_start)
+                misplaced_start = aligned_addr + i;
+            misplaced_node = page_node;
+            err = 1;
+        } else {
+            if (misplaced_start) {
+                // End of a misplaced region
+                assert(misplaced_node != -1);
+                size_t misplaced_size = (aligned_addr + i -
misplaced_start);
+                fprintf(stderr, "Region [%p,%p) (%zd bytes) is misplaced "
+                        "(lies on node %d but it should be on node %d)\n",
+                        misplaced_start, aligned_addr + i, misplaced_size,
+                        misplaced_node, node);
+                misplaced_start = NULL;
+                misplaced_node = -1;
+            }
+        }
+    }
+
+    if (misplaced_start) {
+        // Last misplaced region
+        assert(misplaced_node != -1);
+        size_t misplaced_size = (aligned_addr + i - misplaced_start);
+        fprintf(stderr, "Region [%p,%p) (%zd bytes) is misplaced "
+                "(lies on node %d but it should be on node %d)\n",
+                misplaced_start, aligned_addr + i, misplaced_size,
+                misplaced_node, node);
+    }
+
+    return err;
+}
+
+int check_interleaved(void *addr, const size_t *parts, size_t nr_parts,
+                      const int *nodes)
+{
+    assert(addr && "addr is NULL");
+    assert(parts && "parts is NULL");
+
+    size_t i = 0;
+    int err = 0;
+    for (i = 0; i < nr_parts; i++) {
+        if (check_region(addr, parts[i], nodes[i]))
+            err = 1;
+
+        addr += parts[i];
+    }
+
+    return err;
+}
+
+void print_alloc_status(const char *data_descr, int err)
+{
+    printf("allocation check for %s... %s\n", data_descr,
+           (err) ? "FAILED (see above for more info)" : "DONE");
+}
+
+static size_t test_region_size(const size_t *parts, size_t nr_parts)
+{
+    int i;
+    size_t ret;
+
+    ret = 0;
+    for (i = 0; i < nr_parts; i++)
+        ret += parts[i];
+
+    return ret;
+}
+
+static void test_touch_region(void *addr, size_t size)
+{
+    size_t i;
+    size_t pagesize = numa_pagesize();
+    for (i = 0; i < size; i += pagesize)
+        *((char *) (addr + i)) = 1;
+}
+
+static void test_print_parts(const char *descr,
+                             const size_t *parts, size_t nr_parts,
+                             const int *nodes)
+{
+    size_t i;
+
+    printf("%s", descr);
+    for (i = 0; i < nr_parts; i++) {
+        printf("parts[%zd] = %zd, nodes[%zd] = %d\n",
+               i, parts[i], i, nodes[i]);
+    }
+}
+
+static void test_interleaving(size_t *parts, size_t nr_parts,
+                              int *nodes)
+{
+    size_t region_size = test_region_size(parts, nr_parts);
+
+    test_print_parts("Orig. partitions:\n", parts, nr_parts, nodes);
+    void *region = alloc_interleaved(region_size,
+                                     parts, nr_parts, nodes);
+
+    if (!region) {
+        perror("alloc_interleaved");
+        exit(1);
+    }
+
+    test_print_parts("New partitions:\n", parts, nr_parts, nodes);
+    test_touch_region(region, region_size);
+   
+    /* Check interleaving */
+    int err = check_interleaved(region, parts, nr_parts, nodes);
+    print_alloc_status("test data", err);
+    numa_free(region, region_size);
+}
+
+#define NR_PARTS    ((size_t) 4)
+
+size_t test_parts[NR_PARTS];
+int test_nodes[NR_PARTS];
+
+int main(void)
+{
+    /* All small regions */
+    test_parts[0] = 100;
+    test_parts[1] = 200;
+    test_parts[2] = 300;
+    test_parts[3] = 400;
+    test_nodes[0] = 0;
+    test_nodes[1] = 0;
+    test_nodes[2] = 1;
+    test_nodes[3] = 1;
+    printf("\n*** Testing very small partitions ***\n");
+    test_interleaving(test_parts, NR_PARTS, test_nodes);
+
+    /* Some small regions */
+    test_parts[0] = 1000;
+    test_parts[1] = 2000;
+    test_parts[2] = 3000;
+    test_parts[3] = 4000;
+    test_nodes[0] = 0;
+    test_nodes[1] = 0;
+    test_nodes[2] = 1;
+    test_nodes[3] = 1;
+    printf("\n*** Testing small partitions ***\n");
+    test_interleaving(test_parts, NR_PARTS, test_nodes);
+
+    /* Some large regions */
+    test_parts[0] = 10000;
+    test_parts[1] = 20000;
+    test_parts[2] = 30000;
+    test_parts[3] = 40000;
+    test_nodes[0] = 0;
+    test_nodes[1] = 0;
+    test_nodes[2] = 1;
+    test_nodes[3] = 1;
+    printf("\n*** Testing large partitions ***\n");
+    test_interleaving(test_parts, NR_PARTS, test_nodes);
+
+    /* Mixture of small and large regions */
+    test_parts[0] = 1000;
+    test_parts[1] = 4000;
+    test_parts[2] = 30000;
+    test_parts[3] = 40000;
+    test_nodes[0] = 0;
+    test_nodes[1] = 0;
+    test_nodes[2] = 1;
+    test_nodes[3] = 1;
+    printf("\n*** Testing small/large partitions ***\n");
+    test_interleaving(test_parts, NR_PARTS, test_nodes);
+
+    return 0;
+}
Binary files numactl-2.0.8-orig/test/move_pages and
numactl-2.0.8/test/move_pages differ
--
To unsubscribe from this list: send the line "unsubscribe linux-numa" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html