Hi, I'm submitting a patch that I found quite useful in my case. It supports a user-specified custom interleaved allocation scheme. More specifically, apart from the required allocation size, the user specifies an arbitrary set of partitions and the set of nodes, where each partition must lie on. The interleaved allocator will allocate the required region with a single mmap() call and will then bind the different partitions on the specified nodes, taking care of the correct alignment of the partitions and the possible merging of very small ones. The key motivation behind this patch is the seamless conversion of existing multithreaded shared-memory-based code to NUMA-aware. For example, suppose you have a multithreaded code acting on a large dataset that you split using your own static load balancing scheme. Converting this code to NUMA-aware using only numa_alloc_onnode() calls would require considerable refactoring, since you need to alter your algorithm toward a more distributed logic. The allocation scheme I'm submitting leaves intact the original shared-memory logic by applying only a user-guided interleaving of the memory pages, based on the algorithm's needs. The key functions are the `alloc_interleaved()', which is responsible for the allocation and node binding, and `fix_interleaving()', which fixes the user-specified partitions. I'm also submitting a set of test/check routines. If you think this patch is relevant, I can integrate it to libnuma. Regards, V.K. [PATCH follows] diff -urN numactl-2.0.8-orig/Makefile numactl-2.0.8/Makefile --- numactl-2.0.8-orig/Makefile 2012-10-11 23:52:24.000000000 +0300 +++ numactl-2.0.8/Makefile 2012-11-21 20:41:39.000000000 +0200 @@ -32,10 +32,11 @@ test/mbind_mig_pages test/migrate_pages \ migratepages migspeed migspeed.o libnuma.a \ test/move_pages test/realloc_test sysfs.o affinity.o \ - test/node-parse rtnetlink.o test/A numastat + test/node-parse rtnetlink.o test/A numastat \ + custom_interleaved.o custom_interleaved SOURCES := bitops.c libnuma.c distance.c memhog.c numactl.c numademo.c \ numamon.c shm.c stream_lib.c stream_main.c syscall.c util.c mt.c \ - clearcache.c test/*.c affinity.c sysfs.c rtnetlink.c + clearcache.c test/*.c affinity.c sysfs.c rtnetlink.c custom_interleaved.c ifeq ($(strip $(PREFIX)),) prefix := /usr @@ -49,7 +50,7 @@ test/tshared stream test/mynode test/pagesize test/ftok test/prefered \ test/randmap test/nodemap test/distance test/tbitmap test/move_pages \ test/mbind_mig_pages test/migrate_pages test/realloc_test libnuma.a \ - test/node-parse numastat + test/node-parse numastat custom_interleaved numactl: numactl.o util.o shm.o bitops.o libnuma.so @@ -103,6 +104,8 @@ $(AR) rc $@ $^ $(RANLIB) $@ +custom_interleaved: LDLIBS += libnuma.a + distance.o : CFLAGS += -fPIC syscall.o : CFLAGS += -fPIC diff -urN numactl-2.0.8-orig/custom_interleaved.c numactl-2.0.8/custom_interleaved.c --- numactl-2.0.8-orig/custom_interleaved.c 1970-01-01 02:00:00.000000000 +0200 +++ numactl-2.0.8/custom_interleaved.c 2012-11-21 19:48:57.000000000 +0200 @@ -0,0 +1,333 @@ +/* + * User-defined custom interleaving of memory pages. + * + * Copyright (C) 2011-2012, Vasileios Karakasis + * Copyright (C) 2011-2012, Theodoros Gkountouvas + */ +#include <numa.h> +#include <numaif.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <assert.h> + +#define ALIGN_ADDR(addr, bound) (void *)((unsigned long) addr & ~(bound-1)) + +static int get_best_node(const size_t *node_scores, int nr_nodes) +{ + int j; + int ret = 0; + size_t max_score = node_scores[0]; + for (j = 1; j < nr_nodes; j++) + if (node_scores[j] > max_score) { + max_score = node_scores[j]; + ret = j; + } + + return ret; +} + +static void fix_interleaving(size_t nr_parts, size_t *parts, int *nodes) +{ + int i; + int nr_nodes = numa_num_configured_nodes(); + int pagesize = numa_pagesize(); + size_t *node_scores = calloc(nr_nodes, sizeof(*node_scores)); + if (!node_scores) { + perror("malloc"); + exit(1); + } + + // merge small partitions + size_t base_part = 0; + node_scores[nodes[0]] = parts[0]; + for (i = 1; i < nr_parts; i++) { + size_t part_score = parts[i]; + if (parts[base_part] < pagesize) { + // merge with base partition + parts[base_part] += parts[i]; + parts[i] = 0; + } else { + // assign partition to the node with the highest score + nodes[base_part] = get_best_node(node_scores, nr_nodes); + + // reset the scores + memset(node_scores, 0, nr_nodes*sizeof(*node_scores)); + base_part = i; + } + + node_scores[nodes[i]] += part_score; + } + + // fix the last merger + nodes[base_part] = get_best_node(node_scores, nr_nodes); + + // adjust partition size to multiples of system's page size + size_t rem = 0; + ssize_t size_adjust = 0; // can be negative + for (i = 0; i < nr_parts; i++) { + if (!parts[i]) + continue; + + parts[i] += size_adjust; // adjustment from previous partition + rem = parts[i] % pagesize; + + // find next valid partition + size_t next_part = i+1; + while (next_part != nr_parts && !parts[next_part]) + next_part++; + + // + // If at last partition, always execute the else path. + // Otherwise, take the remainder of the page, if the largest part is + // in the current partition, assuring, however, that the next + // partition will have at least one page. + // + if (next_part != nr_parts && + ((rem < pagesize / 2) || + (parts[next_part] + rem < 2*pagesize))) { + parts[i] -= rem; + size_adjust = rem; + } else { + parts[i] += pagesize - rem; + size_adjust = rem - pagesize; + } + } + + free(node_scores); +} + +void *alloc_interleaved(size_t size, size_t *parts, size_t nr_parts, + int *nodes) +{ + int i; + void *ret = NULL; + + ret = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, + 0, 0); + if (ret == (void *) -1) { + ret = NULL; + goto exit; + } + + struct bitmask *nodemask = numa_allocate_nodemask(); + + /* + * Fix the interleaving and bind parts to specific nodes + */ + fix_interleaving(nr_parts, parts, nodes); + + void *curr_part = ret; + for (i = 0; i < nr_parts; i++) { + if (!parts[i]) + continue; + + printf("node = %d, part = %zd\n", nodes[i], parts[i]); + numa_bitmask_setbit(nodemask, nodes[i]); + if (mbind(curr_part, parts[i], MPOL_BIND, + nodemask->maskp, nodemask->size, 0) < 0) { + perror("mbind"); + exit(1); + } + curr_part += parts[i]; + + /* Clear the mask for the next round */ + numa_bitmask_clearbit(nodemask, nodes[i]); + } + + numa_bitmask_free(nodemask); +exit: + return ret; +} + +int check_region(void *addr, size_t size, int node) +{ + void *misplaced_start; + int pagesize; + size_t i; + int misplaced_node; + int err = 0; + + pagesize = numa_pagesize(); + misplaced_start = NULL; + misplaced_node = -1; + void *aligned_addr = ALIGN_ADDR(addr, pagesize); + for (i = 0; i < size; i += pagesize) { + int page_node; + if (get_mempolicy(&page_node, 0, 0, aligned_addr + i, + MPOL_F_ADDR | MPOL_F_NODE) < 0) { + perror("get_mempolicy()"); + exit(1); + } + + if (page_node != node) { + // Start of a misplaced region + if (!misplaced_start) + misplaced_start = aligned_addr + i; + misplaced_node = page_node; + err = 1; + } else { + if (misplaced_start) { + // End of a misplaced region + assert(misplaced_node != -1); + size_t misplaced_size = (aligned_addr + i - misplaced_start); + fprintf(stderr, "Region [%p,%p) (%zd bytes) is misplaced " + "(lies on node %d but it should be on node %d)\n", + misplaced_start, aligned_addr + i, misplaced_size, + misplaced_node, node); + misplaced_start = NULL; + misplaced_node = -1; + } + } + } + + if (misplaced_start) { + // Last misplaced region + assert(misplaced_node != -1); + size_t misplaced_size = (aligned_addr + i - misplaced_start); + fprintf(stderr, "Region [%p,%p) (%zd bytes) is misplaced " + "(lies on node %d but it should be on node %d)\n", + misplaced_start, aligned_addr + i, misplaced_size, + misplaced_node, node); + } + + return err; +} + +int check_interleaved(void *addr, const size_t *parts, size_t nr_parts, + const int *nodes) +{ + assert(addr && "addr is NULL"); + assert(parts && "parts is NULL"); + + size_t i = 0; + int err = 0; + for (i = 0; i < nr_parts; i++) { + if (check_region(addr, parts[i], nodes[i])) + err = 1; + + addr += parts[i]; + } + + return err; +} + +void print_alloc_status(const char *data_descr, int err) +{ + printf("allocation check for %s... %s\n", data_descr, + (err) ? "FAILED (see above for more info)" : "DONE"); +} + +static size_t test_region_size(const size_t *parts, size_t nr_parts) +{ + int i; + size_t ret; + + ret = 0; + for (i = 0; i < nr_parts; i++) + ret += parts[i]; + + return ret; +} + +static void test_touch_region(void *addr, size_t size) +{ + size_t i; + size_t pagesize = numa_pagesize(); + for (i = 0; i < size; i += pagesize) + *((char *) (addr + i)) = 1; +} + +static void test_print_parts(const char *descr, + const size_t *parts, size_t nr_parts, + const int *nodes) +{ + size_t i; + + printf("%s", descr); + for (i = 0; i < nr_parts; i++) { + printf("parts[%zd] = %zd, nodes[%zd] = %d\n", + i, parts[i], i, nodes[i]); + } +} + +static void test_interleaving(size_t *parts, size_t nr_parts, + int *nodes) +{ + size_t region_size = test_region_size(parts, nr_parts); + + test_print_parts("Orig. partitions:\n", parts, nr_parts, nodes); + void *region = alloc_interleaved(region_size, + parts, nr_parts, nodes); + + if (!region) { + perror("alloc_interleaved"); + exit(1); + } + + test_print_parts("New partitions:\n", parts, nr_parts, nodes); + test_touch_region(region, region_size); + + /* Check interleaving */ + int err = check_interleaved(region, parts, nr_parts, nodes); + print_alloc_status("test data", err); + numa_free(region, region_size); +} + +#define NR_PARTS ((size_t) 4) + +size_t test_parts[NR_PARTS]; +int test_nodes[NR_PARTS]; + +int main(void) +{ + /* All small regions */ + test_parts[0] = 100; + test_parts[1] = 200; + test_parts[2] = 300; + test_parts[3] = 400; + test_nodes[0] = 0; + test_nodes[1] = 0; + test_nodes[2] = 1; + test_nodes[3] = 1; + printf("\n*** Testing very small partitions ***\n"); + test_interleaving(test_parts, NR_PARTS, test_nodes); + + /* Some small regions */ + test_parts[0] = 1000; + test_parts[1] = 2000; + test_parts[2] = 3000; + test_parts[3] = 4000; + test_nodes[0] = 0; + test_nodes[1] = 0; + test_nodes[2] = 1; + test_nodes[3] = 1; + printf("\n*** Testing small partitions ***\n"); + test_interleaving(test_parts, NR_PARTS, test_nodes); + + /* Some large regions */ + test_parts[0] = 10000; + test_parts[1] = 20000; + test_parts[2] = 30000; + test_parts[3] = 40000; + test_nodes[0] = 0; + test_nodes[1] = 0; + test_nodes[2] = 1; + test_nodes[3] = 1; + printf("\n*** Testing large partitions ***\n"); + test_interleaving(test_parts, NR_PARTS, test_nodes); + + /* Mixture of small and large regions */ + test_parts[0] = 1000; + test_parts[1] = 4000; + test_parts[2] = 30000; + test_parts[3] = 40000; + test_nodes[0] = 0; + test_nodes[1] = 0; + test_nodes[2] = 1; + test_nodes[3] = 1; + printf("\n*** Testing small/large partitions ***\n"); + test_interleaving(test_parts, NR_PARTS, test_nodes); + + return 0; +} Binary files numactl-2.0.8-orig/test/move_pages and numactl-2.0.8/test/move_pages differ -- To unsubscribe from this list: send the line "unsubscribe linux-numa" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html