Hello everyone,

First, thanks for the work done on libnuma, it really helps when working with this whole system!

I've been trying to use numa_alloc_interleaved for some internal tests, and I wanted to check that each page of one big buffer was indeed interleaved across the different NUMA nodes. You can find attached a patch against 2.0.8-rc4 that adds a "tinterleave" test case, which you can run like this:

$ test/tinterleave $((4096*4))

where 4096 is the page size in bytes (so each buffer is four pages, i.e. 16 KB). What it basically does is create three 16 KB buffers with numa_alloc_interleaved and check that all their pages are distributed across the NUMA nodes.

What's interesting is that the results differ depending on whether each buffer is accessed right after being allocated (accesses done with memset), or only after all three allocations have been made. Here is an example to illustrate this. (Note: the tests below were run on a machine with two NUMA nodes, using numactl 2.0.8-rc4.)

First case:

size_t sbuf = sizeof(uint8_t)*n;
uint8_t* v1 = numa_alloc_interleaved(sbuf);
memset(v1, 0xAA, sbuf);
uint8_t* v2 = numa_alloc_interleaved(sbuf);
memset(v2, 0xAA, sbuf);
uint8_t* res = numa_alloc_interleaved(sbuf);
memset(res, 0xAA, sbuf);

and here is the distribution of the pages of the three buffers:

Nodes for v1: 0x7f923d289000
Node 1 from 0 to 4096
Node 0 from 4096 to 8192
Node 1 from 8192 to 12288
Node 0 from 12288 to 16384
Nodes for v2: 0x7f923d285000
Node 1 from 0 to 16384
Nodes for res: 0x7f923d281000
Node 1 from 0 to 4096
Node 0 from 4096 to 8192
Node 1 from 8192 to 12288
Node 0 from 12288 to 16384

Thus, only v1 and res are spread across the NUMA nodes; v2 sits entirely on node 1.

Second case:

v1 = numa_alloc_interleaved(sbuf);
v2 = numa_alloc_interleaved(sbuf);
res = numa_alloc_interleaved(sbuf);
memset(v1, 0xAA, sbuf);
memset(v2, 0xAA, sbuf);
memset(res, 0xAA, sbuf);

and here is the distribution of the pages of the three buffers:

Nodes for v1: 0x7f923d289000
Node 1 from 0 to 16384
Nodes for v2: 0x7f923d285000
Node 1 from 0 to 16384
Nodes for res: 0x7f923d281000
Node 1 from 0 to 4096
Node 0 from 4096 to 8192
Node 1 from 8192 to 12288
Node 0 from 12288 to 16384

Here, only 'res' is actually spread across the two nodes.

Moreover, I've tried the tshared test case, which fails:

$ ./tshared | head
offset 4096 node 0 expected 1
offset 12288 node 0 expected 1
offset 20480 node 0 expected 1
offset 28672 node 0 expected 1
offset 36864 node 0 expected 1
offset 45056 node 0 expected 1
[...]

All pages end up on node 0. If I modify tshared so that the pages are touched in the main process rather than in the child processes, it works (which is actually equivalent to my test case with a single buffer); a simplified standalone sketch of that single-process variant is appended after the patch.

I don't know if this is a known issue or if I'm doing something wrong (like misusing get_mempolicy to obtain a page's node ID), so any advice is welcome :)

Thanks for any help!

Regards,

--
Adrien.
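P.S. In case I'm indeed misusing get_mempolicy: I believe move_pages(2) can be used as an alternative way to query page placement. When its 'nodes' argument is NULL, move_pages() moves nothing and instead fills 'status' with the node each page currently resides on (or a negative errno, e.g. -ENOENT for pages that haven't been faulted in yet). Here is a minimal, untested sketch of that approach (the helper name dump_nodes_move_pages is just mine):

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <numaif.h>

static void dump_nodes_move_pages(unsigned char* buf, size_t buf_size)
{
	size_t pagesz = getpagesize();
	unsigned long count = buf_size / pagesz;
	void** pages = malloc(count * sizeof(*pages));
	int* status = malloc(count * sizeof(*status));
	unsigned long i;

	for (i = 0; i < count; i++)
		pages[i] = buf + i * pagesz;

	/* pid 0 = current process; NULL nodes = query only, no migration */
	if (move_pages(0, count, pages, NULL, status, 0) == 0) {
		for (i = 0; i < count; i++)
			printf("page %lu: node %d\n", i, status[i]);
	} else {
		perror("move_pages");
	}

	free(pages);
	free(status);
}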
diff -r -N -u numactl-2.0.8-rc4/Makefile numactl-2.0.8-rc4-own/Makefile
--- numactl-2.0.8-rc4/Makefile	2012-02-16 16:08:50.000000000 +0100
+++ numactl-2.0.8-rc4-own/Makefile	2012-05-10 12:38:17.909201309 +0200
@@ -32,7 +32,7 @@
 	test/mbind_mig_pages test/migrate_pages \
 	migratepages migspeed migspeed.o libnuma.a \
 	test/move_pages test/realloc_test sysfs.o affinity.o \
-	test/node-parse rtnetlink.o test/A
+	test/node-parse test/tinterleave test/tinterleave.o rtnetlink.o test/A
 SOURCES := bitops.c libnuma.c distance.c memhog.c numactl.c numademo.c \
 	numamon.c shm.c stream_lib.c stream_main.c syscall.c util.c mt.c \
 	clearcache.c test/*.c affinity.c sysfs.c rtnetlink.c
@@ -44,7 +44,7 @@
 all: numactl migratepages migspeed libnuma.so numademo numamon memhog \
 	test/tshared stream test/mynode test/pagesize test/ftok test/prefered \
 	test/randmap test/nodemap test/distance test/tbitmap test/move_pages \
-	test/mbind_mig_pages test/migrate_pages test/realloc_test libnuma.a \
+	test/mbind_mig_pages test/migrate_pages test/realloc_test test/tinterleave libnuma.a \
 	test/node-parse
 
 numactl: numactl.o util.o shm.o bitops.o libnuma.so
@@ -135,6 +135,8 @@
 
 test/node-parse: test/node-parse.c libnuma.so util.o
 
+test/tinterleave: test/tinterleave.c libnuma.so
+
 .PHONY: install all clean html depend
 
 MANPAGES := numa.3 numactl.8 numastat.8 migratepages.8 migspeed.8
diff -r -N -u numactl-2.0.8-rc4/test/printcpu numactl-2.0.8-rc4-own/test/printcpu
--- numactl-2.0.8-rc4/test/printcpu	2012-02-16 16:08:51.000000000 +0100
+++ numactl-2.0.8-rc4-own/test/printcpu	2012-05-10 12:02:07.257152257 +0200
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 #print cpu it is running on
 declare -a arr
 arr=( $(< /proc/self/stat) )
diff -r -N -u numactl-2.0.8-rc4/test/regress2 numactl-2.0.8-rc4-own/test/regress2
--- numactl-2.0.8-rc4/test/regress2	2012-02-16 16:08:51.000000000 +0100
+++ numactl-2.0.8-rc4-own/test/regress2	2012-05-10 12:11:24.449164848 +0200
@@ -23,4 +23,5 @@
 T ./checkaffinity
 T ./checktopology
 T ./tbitmap
+T ./tshared
 #T ./randmap
diff -r -N -u numactl-2.0.8-rc4/test/tinterleave.c numactl-2.0.8-rc4-own/test/tinterleave.c
--- numactl-2.0.8-rc4/test/tinterleave.c	1970-01-01 01:00:00.000000000 +0100
+++ numactl-2.0.8-rc4-own/test/tinterleave.c	2012-05-10 14:06:00.377320225 +0200
@@ -0,0 +1,101 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <sched.h>
+#include <numa.h>
+#include <numaif.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+/* Return the NUMA node backing the page that contains addr. */
+int get_node_of_mem(void* addr)
+{
+	int node = -1;
+	get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE|MPOL_F_ADDR);
+	return node;
+}
+
+/* Print the contiguous runs of pages of buf that live on the same node. */
+bool dump_page_nodes_of_mem(uint8_t* buf, size_t buf_size)
+{
+	int node = get_node_of_mem(buf);
+	int new_node = -1;
+	size_t ifrom = 0;
+	size_t i;
+	size_t pagesz = getpagesize();
+	for (i = pagesz; i < buf_size; i += pagesz) {
+		new_node = get_node_of_mem(buf + i);
+		if (new_node != node) {
+			printf("Node %d from %zu to %zu\n", node, ifrom, i);
+			node = new_node;
+			ifrom = i;
+		}
+	}
+	/* Print the last run (also covers buffers of a single page). */
+	printf("Node %d from %zu to %zu\n", node, ifrom, buf_size);
+	return true;
+}
+
+/* Dump the page placement of the three buffers, then free them. */
+void dump_bufs(uint8_t* v1, uint8_t* v2, uint8_t* res, size_t sbuf)
+{
+	printf("Nodes for v1: %p\n", v1);
+	dump_page_nodes_of_mem(v1, sbuf);
+	printf("Nodes for v2: %p\n", v2);
+	dump_page_nodes_of_mem(v2, sbuf);
+	printf("Nodes for res: %p\n", res);
+	dump_page_nodes_of_mem(res, sbuf);
+
+	numa_free(v1, sbuf);
+	numa_free(v2, sbuf);
+	numa_free(res, sbuf);
+}
+
+void test(size_t n)
+{
+	const size_t sbuf = n*sizeof(uint8_t);
+
+	printf("Test with memset just after the allocations:\n\n");
+
+	uint8_t* v1 = numa_alloc_interleaved(sbuf);
+	memset(v1, 0xAA, sbuf);
+	uint8_t* v2 = numa_alloc_interleaved(sbuf);
+	memset(v2, 0xAA, sbuf);
+	uint8_t* res = numa_alloc_interleaved(sbuf);
+	memset(res, 0xAA, sbuf);
+
+	dump_bufs(v1, v2, res, sbuf);
+
+	printf("\nTest with allocations grouped:\n\n");
+
+	v1 = numa_alloc_interleaved(sbuf);
+	v2 = numa_alloc_interleaved(sbuf);
+	res = numa_alloc_interleaved(sbuf);
+	memset(v1, 0xAA, sbuf);
+	memset(v2, 0xAA, sbuf);
+	memset(res, 0xAA, sbuf);
+
+	dump_bufs(v1, v2, res, sbuf);
+}
+
+int main(int argc, char** argv)
+{
+	if (argc < 2) {
+		fprintf(stderr, "Usage: %s <buffer size in bytes>\n", argv[0]);
+		return 1;
+	}
+
+	if (numa_available() < 0) {
+		printf("no NUMA API available\n");
+		return 1;
+	}
+
+	size_t n = atoll(argv[1]);
+
+	test(n);
+
+	return 0;
+}
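For completeness, here is roughly the single-process variant of tshared mentioned above, reduced to a standalone program. This is NOT the actual tshared code, just a simplified reconstruction of what I tried: interleave a MAP_SHARED mapping with mbind(), touch every page from the parent (no fork()), then print where each page landed.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <numa.h>
#include <numaif.h>

int main(void)
{
	size_t pagesz, sz, off;
	char* buf;
	struct bitmask* allowed;

	if (numa_available() < 0) {
		printf("no NUMA API available\n");
		return 1;
	}

	pagesz = getpagesize();
	sz = 4 * pagesz;

	/* shared anonymous mapping, since tshared uses shared memory */
	buf = mmap(NULL, sz, PROT_READ|PROT_WRITE,
	           MAP_SHARED|MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* interleave the mapping across all allowed nodes */
	allowed = numa_get_mems_allowed();
	if (mbind(buf, sz, MPOL_INTERLEAVE, allowed->maskp,
	          allowed->size + 1, 0) < 0) {
		perror("mbind");
		return 1;
	}

	/* touch every page from this process, no fork() involved */
	memset(buf, 0xAA, sz);

	for (off = 0; off < sz; off += pagesz) {
		int node = -1;
		get_mempolicy(&node, NULL, 0, buf + off,
		              MPOL_F_NODE|MPOL_F_ADDR);
		printf("offset %zu node %d\n", off, node);
	}
	return 0;
}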