Issues with numa_alloc_interleaved

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hello everyone,

First, thanks for the work done on libnuma — it really helps when working
with this whole system!

I've been trying to use numa_alloc_interleaved for some internal tests,
and I wanted to check that each page of one big buffer was indeed
interleaved across all the different NUMA nodes.
You can find attached a patch against 2.0.8-rc4 that adds the
"tinterleave" test-case, that you can use as this:

$ test/tinterleave $((4096*4))

where 4096 is the page size in bytes.

What it basically does is create three buffers of 16kB using
numa_alloc_interleaved, and check that all the pages are distributed
across the NUMA nodes.
What's interesting is that the results differ depending on whether the
buffers are accessed just after being allocated (accesses done using
memset), or whether they are accessed after all three allocations have been done.

Here is an example to illustrate this:

(Note that the tests here are made with a machine with two NUMA nodes,
using numactl 2.0.8-rc4).

First case:

size_t sbuf = sizeof(uint8_t)*n;
uint8_t* v1 = numa_alloc_interleaved(sbuf);
memset(v1, 0xAA, sbuf);
uint8_t* v2 = numa_alloc_interleaved(sbuf);
memset(v2, 0xAA, sbuf);
uint8_t* res = numa_alloc_interleaved(sbuf);
memset(res, 0xAA, sbuf);

and here is the repartition of the pages of all three buffers:

Nodes for v1: 0x7f923d289000
Node 1 from 0 to 4096
Node 0 from 4096 to 8192
Node 1 from 8192 to 12288
Node 0 from 12288 to 16384
Nodes for v2: 0x7f923d285000
Node 1 from 0 to 16384
Nodes for res: 0x7f923d281000
Node 1 from 0 to 4096
Node 0 from 4096 to 8192
Node 1 from 8192 to 12288
Node 0 from 12288 to 16384

Thus, only v1 and res are spread across the NUMA nodes.

In the second case:

v1 = numa_alloc_interleaved(sbuf);
v2 = numa_alloc_interleaved(sbuf);
res = numa_alloc_interleaved(sbuf);
memset(v1, 0xAA, sbuf);
memset(v2, 0xAA, sbuf);
memset(res, 0xAA, sbuf);

and here is the repartition of the pages of all three buffers:

Nodes for v1: 0x7f923d289000
Node 1 from 0 to 16384
Nodes for v2: 0x7f923d285000
Node 1 from 0 to 16384
Nodes for res: 0x7f923d281000
Node 1 from 0 to 4096
Node 0 from 4096 to 8192
Node 1 from 8192 to 12288
Node 0 from 12288 to 16384

Here, only 'res' is indeed spread across the two nodes.

Moreover, I've tried the tshared test case, which fails:

$ ./tshared |head
offset 4096 node 0 expected 1
offset 12288 node 0 expected 1
offset 20480 node 0 expected 1
offset 28672 node 0 expected 1
offset 36864 node 0 expected 1
offset 45056 node 0 expected 1
[...]

All pages are on node 0. If I modify tshared so that the page accesses
aren't done in the child processes but in the main process, it works (which
is actually equivalent to my test case with only one buffer).

I don't know if this is a known issue or if I'm doing something wrong
(like using get_mempolicy to get the memory's node ID), so I'm asking for
any advice :)

Thanks for any help!

Regards,

-- 
Adrien.
diff -r -N -u numactl-2.0.8-rc4/Makefile numactl-2.0.8-rc4-own/Makefile
--- numactl-2.0.8-rc4/Makefile	2012-02-16 16:08:50.000000000 +0100
+++ numactl-2.0.8-rc4-own/Makefile	2012-05-10 12:38:17.909201309 +0200
@@ -32,7 +32,7 @@
 	      test/mbind_mig_pages test/migrate_pages \
 	      migratepages migspeed migspeed.o libnuma.a \
 	      test/move_pages test/realloc_test sysfs.o affinity.o \
-	      test/node-parse rtnetlink.o test/A
+	      test/node-parse test/tinterleave test/tinterleave.o rtnetlink.o test/A
 SOURCES := bitops.c libnuma.c distance.c memhog.c numactl.c numademo.c \
 	numamon.c shm.c stream_lib.c stream_main.c syscall.c util.c mt.c \
 	clearcache.c test/*.c affinity.c sysfs.c rtnetlink.c
@@ -44,7 +44,7 @@
 all: numactl migratepages migspeed libnuma.so numademo numamon memhog \
      test/tshared stream test/mynode test/pagesize test/ftok test/prefered \
      test/randmap test/nodemap test/distance test/tbitmap test/move_pages \
-     test/mbind_mig_pages test/migrate_pages test/realloc_test libnuma.a \
+     test/mbind_mig_pages test/migrate_pages test/realloc_test test/tinterleave libnuma.a \
      test/node-parse
 
 numactl: numactl.o util.o shm.o bitops.o libnuma.so
@@ -135,6 +135,8 @@
 
 test/node-parse: test/node-parse.c libnuma.so util.o
 
+test/tinterleave: test/tinterleave.c libnuma.so
+
 .PHONY: install all clean html depend
 
 MANPAGES := numa.3 numactl.8 numastat.8 migratepages.8 migspeed.8
Binary files numactl-2.0.8-rc4/.numademo.c.swp and numactl-2.0.8-rc4-own/.numademo.c.swp differ
Binary files numactl-2.0.8-rc4/test/move_pages and numactl-2.0.8-rc4-own/test/move_pages differ
diff -r -N -u numactl-2.0.8-rc4/test/printcpu numactl-2.0.8-rc4-own/test/printcpu
--- numactl-2.0.8-rc4/test/printcpu	2012-02-16 16:08:51.000000000 +0100
+++ numactl-2.0.8-rc4-own/test/printcpu	2012-05-10 12:02:07.257152257 +0200
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 #print cpu it is running on
 declare -a arr
 arr=( $(< /proc/self/stat) )
diff -r -N -u numactl-2.0.8-rc4/test/regress2 numactl-2.0.8-rc4-own/test/regress2
--- numactl-2.0.8-rc4/test/regress2	2012-02-16 16:08:51.000000000 +0100
+++ numactl-2.0.8-rc4-own/test/regress2	2012-05-10 12:11:24.449164848 +0200
@@ -23,4 +23,5 @@
 T ./checkaffinity
 T ./checktopology
 T ./tbitmap
+T ./tshared
 #T ./randmap
diff -r -N -u numactl-2.0.8-rc4/test/tinterleave.c numactl-2.0.8-rc4-own/test/tinterleave.c
--- numactl-2.0.8-rc4/test/tinterleave.c	1970-01-01 01:00:00.000000000 +0100
+++ numactl-2.0.8-rc4-own/test/tinterleave.c	2012-05-10 14:06:00.377320225 +0200
@@ -0,0 +1,98 @@
#include <errno.h>
#include <sched.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#include <numa.h>
#include <numaif.h>
+
+int get_node_of_mem(void* addr)
+{
+	int node = -1;
+	get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE|MPOL_F_ADDR);
+	return node;
+}
+
+bool dump_page_nodes_of_mem(uint8_t* buf, size_t buf_size)
+{
+	int node = get_node_of_mem(buf);
+	int new_node = -1;
+	size_t ifrom = 0;
+	size_t i;
+	size_t pagesz = getpagesize();
+	for (i = pagesz; i < buf_size; i += pagesz) {
+		new_node = get_node_of_mem(buf + i);
+		if (new_node != node) {
+			printf("Node %d from %lu to %lu\n", node, ifrom, i);
+			node = new_node;
+			ifrom = i;
+		}
+	}
+	if (new_node == node) {
+		printf("Node %d from %lu to %lu\n", node, ifrom, buf_size);
+	}
+	return true;
+}
+
+void dump_bufs(uint8_t* v1, uint8_t* v2, uint8_t* res, size_t sbuf)
+{
+	printf("Nodes for v1: %p\n", v1);
+	dump_page_nodes_of_mem(v1, sbuf);
+	printf("Nodes for v2: %p\n", v2);
+	dump_page_nodes_of_mem(v2, sbuf);
+	printf("Nodes for res: %p\n", res);
+	dump_page_nodes_of_mem(res, sbuf);
+
+	numa_free(v1, sbuf);
+	numa_free(v2, sbuf);
+	numa_free(res, sbuf);
+}
+
+void test(size_t n)
+{
+	const size_t sbuf = n*sizeof(uint8_t);
+
+	printf("Test with memset just after the allocations:\n\n");
+
+	uint8_t* v1 = numa_alloc_interleaved(sbuf);
+	memset(v1, 0xAA, sbuf);
+	uint8_t* v2 = numa_alloc_interleaved(sbuf);
+	memset(v2, 0xAA, sbuf);
+	uint8_t* res = numa_alloc_interleaved(sbuf);
+	memset(res, 0xAA, sbuf);
+
+	dump_bufs(v1, v2, res, sbuf);
+	
+	printf("\nTest with allocations grouped:\n\n");
+
+	v1 = numa_alloc_interleaved(sbuf);
+	v2 = numa_alloc_interleaved(sbuf);
+	res = numa_alloc_interleaved(sbuf);
+	memset(v1, 0xAA, sbuf);
+	memset(v2, 0xAA, sbuf);
+	memset(res, 0xAA, sbuf);
+
+	dump_bufs(v1, v2, res, sbuf);
+}
+
+int main(int argc, char** argv)
+{
+	if (argc < 2) {
+		fprintf(stderr, "Usage: %s n\n", argv[0]);
+		return 1;
+	}
+
+	if (numa_available() < 0) {
+		printf("no NUMA API available\n"); 
+		return 1;
+	}
+
+	size_t n = atoll(argv[1]);
+
+	test(n);
+	
+	return 0;
+}

[Index of Archives]     [Linux Kernel]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux SCSI]     [Devices]

  Powered by Linux