[PATCH 5/8] Add IO affinity support to libnuma

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Andi Kleen <ak@xxxxxxxxxxxxxxx>

This adds new higher level node specifiers to libnuma/numactl.
You can specify IO devices and libnuma automatically resolves
the underlying device if the kernel knows about it.

For example:

netdev:eth0		node of net device eth0
file:/foo/bar 		node of block device /foo/bar is on
ip:hostname		node of net device the route to hostname is pointing to
block:sda3		node of block device sda3 (as known by the kernel)
pci:0:0.1		node of pci device 0:0.1 ([seg:]bus:dev[.func])

There are obvious limits to this: in some cases there's no unique node
(like device on RAID). Various IO layers do not export the necessary
information to map back to a PCI device. And then on many common systems
the kernel doesn't actually know the node for a PCI device because of
ACPI limitations (but this will hopefully improve).

Still with these caveats it's a useful extension.
---
 CHANGES           |    1 +
 Makefile          |   25 +++-
 affinity.c        |  343 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 affinity.h        |    6 +
 libnuma.c         |   22 +++-
 numactl.8         |   48 ++++++--
 numactl.c         |    6 +
 numaint.h         |   19 +++
 rtnetlink.c       |   90 ++++++++++++++
 rtnetlink.h       |    5 +
 sysfs.c           |   69 +++++++++++
 sysfs.h           |    4 +
 test/node-parse.c |   26 ++++
 test/regress-io   |   46 +++++++
 15 files changed, 685 insertions(+), 25 deletions(-)
 create mode 100644 affinity.c
 create mode 100644 affinity.h
 create mode 100644 rtnetlink.c
 create mode 100644 rtnetlink.h
 create mode 100644 sysfs.c
 create mode 100644 sysfs.h
 delete mode 100755 test/move_pages
 create mode 100644 test/node-parse.c
 create mode 100755 test/regress-io

diff --git a/CHANGES b/CHANGES
index d63aa12..d91a4d0 100644
--- a/CHANGES
+++ b/CHANGES
@@ -359,3 +359,4 @@ newer:
 
 
 - Add "same" nodemask alias to numactl (Andi Kleen)
+- Add IO affinity support (Andi Kleen)
diff --git a/Makefile b/Makefile
index a462528..b9a60c8 100755
--- a/Makefile
+++ b/Makefile
@@ -31,10 +31,11 @@ CLEANFILES := numactl.o libnuma.o numactl numademo numademo.o distance.o \
 	      test/after test/before threadtest test_move_pages \
 	      test/mbind_mig_pages test/migrate_pages \
 	      migratepages migspeed migspeed.o libnuma.a \
-	      test/move_pages test/realloc_test
+	      test/move_pages test/realloc_test sysfs.o affinity.o \
+	      test/node-parse rtnetlink.o
 SOURCES := bitops.c libnuma.c distance.c memhog.c numactl.c numademo.c \
 	numamon.c shm.c stream_lib.c stream_main.c syscall.c util.c mt.c \
-	clearcache.c test/*.c
+	clearcache.c test/*.c affinity.c sysfs.c rtnetlink.c
 
 prefix := /usr
 libdir := ${prefix}/$(shell ./getlibdir)
@@ -43,7 +44,8 @@ docdir := ${prefix}/share/doc
 all: numactl migratepages migspeed libnuma.so numademo numamon memhog \
      test/tshared stream test/mynode test/pagesize test/ftok test/prefered \
      test/randmap test/nodemap test/distance test/tbitmap test/move_pages \
-     test/mbind_mig_pages test/migrate_pages test/realloc_test libnuma.a
+     test/mbind_mig_pages test/migrate_pages test/realloc_test libnuma.a \
+     test/node-parse
 
 numactl: numactl.o util.o shm.o bitops.o libnuma.so
 
@@ -81,7 +83,7 @@ stream_main.o: stream_main.c
 
 libnuma.so.1: versions.ldscript
 
-libnuma.so.1: libnuma.o syscall.o distance.o
+libnuma.so.1: libnuma.o syscall.o distance.o affinity.o sysfs.o rtnetlink.o
 	${CC} ${LDFLAGS} -shared -Wl,-soname=libnuma.so.1 -Wl,--version-script,versions.ldscript -Wl,-init,numa_init -Wl,-fini,numa_fini -o libnuma.so.1 $(filter-out versions.ldscript,$^)
 
 libnuma.so: libnuma.so.1
@@ -91,7 +93,7 @@ libnuma.o : CFLAGS += -fPIC
 
 AR ?= ar
 RANLIB ?= ranlib
-libnuma.a: libnuma.o syscall.o distance.o
+libnuma.a: libnuma.o syscall.o distance.o sysfs.o affinity.o rtnetlink.o
 	$(AR) rc $@ $^
 	$(RANLIB) $@
 
@@ -99,6 +101,12 @@ distance.o : CFLAGS += -fPIC
 
 syscall.o : CFLAGS += -fPIC
 
+affinity.o : CFLAGS += -fPIC
+
+sysfs.o : CFLAGS += -fPIC
+
+rtnetlink.o : CFLAGS += -fPIC
+
 test/tshared: test/tshared.o libnuma.so
 
 test/mynode: test/mynode.o libnuma.so
@@ -125,6 +133,8 @@ test/migrate_pages: test/migrate_pages.c libnuma.so
 
 test/realloc_test: test/realloc_test.c libnuma.so
 
+test/node-parse: test/node-parse.c libnuma.so util.o
+
 .PHONY: install all clean html depend
 
 MANPAGES := numa.3 numactl.8 numastat.8 migratepages.8 migspeed.8
@@ -190,4 +200,7 @@ regress1:
 regress2:
 	cd test ; ./regress2
 
-test: all regress1 regress2 test_numademo
+regress3:
+	cd test ; ./regress-io
+
+test: all regress1 regress2 test_numademo regress3
diff --git a/affinity.c b/affinity.c
new file mode 100644
index 0000000..cb59646
--- /dev/null
+++ b/affinity.c
@@ -0,0 +1,343 @@
+/* Support for specifying IO affinity by various means.
+   Copyright 2010 Intel Corporation
+   Author: Andi Kleen
+
+   libnuma is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; version
+   2.1.
+
+   libnuma is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should find a copy of v2.1 of the GNU Lesser General Public License
+   somewhere on your Linux system; if not, write to the Free Software 
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Notebook:
+   - Separate real errors from no NUMA with fallback
+   - Infiniband 
+   - FCoE?
+   - Support for other special IO devices
+   - Specifying cpu subsets inside the IO node?
+   - Handle multiple IO nodes (needs kernel changes) 
+   - Better support for multi-path IO?
+ */
+#define _GNU_SOURCE 1
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <netdb.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <net/if.h>
+#include <dirent.h>
+#include <linux/rtnetlink.h>
+#include <linux/netlink.h>
+#include <sys/types.h>
+#include <ctype.h>
+#include <assert.h>
+#include <regex.h>
+#include "numa.h"
+#include "numaint.h"
+#include "sysfs.h"
+#include "affinity.h"
+#include "rtnetlink.h"
+
+/* Return 1 when S contains a '/' or a '.', otherwise 0.
+   Used to reject device names before they are pasted into a sysfs path. */
+static int badchar(char *s)
+{
+	return strpbrk(s, "/.") != NULL;
+}
+
+/* Warn that the node mask of device DEV could not be determined.
+   CLS is the device class used in the message; it may be NULL or empty
+   when no class name applies.
+   RET is the failing status: -2 selects the "kernel does not know"
+   warning, any other value the "cannot read" warning.
+   Always returns -1 so that callers can return the result directly. */
+static int node_parse_failure(int ret, char *cls, char *dev)
+{
+	char *sep;
+
+	if (!cls)
+		cls = "";
+	/* Only put a blank in front of the class name when there is one,
+	   so an empty class does not leave a double space in the text. */
+	sep = *cls ? " " : "";
+	if (ret == -2)
+		numa_warn(W_node_parse1,
+			  "Kernel does not know node mask for%s%s device `%s'",
+			  sep, cls, dev);
+	else
+		numa_warn(W_node_parse2,
+			  "Cannot read node mask for%s%s device `%s'",
+			  sep, cls, dev);
+	return -1;
+}
+
+/* Generic sysfs class lookup.
+   Sets in MASK the node(s) of device DEV of sysfs class CLS
+   (/sys/class/CLS/DEV). Leading white space in DEV is skipped and
+   names containing '/' or '.' are rejected.
+   Returns 0 on success, -1 after issuing a warning on failure. */
+static int affinity_class(struct bitmask *mask, char *cls, char *dev)
+{
+	int ret;
+	ssize_t len;
+	char path[1024];
+	char *fn = NULL;
+
+	/* The ctype functions need an unsigned char value. */
+	while (isspace((unsigned char)*dev))
+		dev++;
+	if (badchar(dev)) {
+		numa_warn(W_badchar, "Illegal characters in `%s' specification",
+			  dev);
+		return -1;
+	}
+
+	/* Somewhat hackish: extract device from symlink path.
+	   Better would be a direct backlink. This knows slightly too
+	   much about the actual sysfs layout. */
+	if (asprintf(&fn, "/sys/class/%s/%s", cls, dev) < 0)
+		fn = NULL;	/* contents are undefined after a failure */
+	/* readlink() does not NUL terminate the result: leave room for
+	   the terminator that is added below. */
+	len = fn ? readlink(fn, path, sizeof(path) - 1) : -1;
+	free(fn);
+	if (len > 0) {
+		regex_t re;
+		regmatch_t match[2];
+
+		path[len] = 0;
+		ret = regcomp(&re, "(/devices/pci[0-9a-fA-F:/]+\\.[0-9]+)/",
+			      REG_EXTENDED);
+		if (ret == 0) {
+			ret = regexec(&re, path, 2, match, 0);
+			regfree(&re);
+		}
+		if (ret == 0) {
+			/* Keep only the PCI device directory captured by
+			   the first group; it starts with a '/'. */
+			char *p = path + match[1].rm_so;
+
+			path[match[1].rm_eo] = 0;
+			ret = sysfs_node_read(mask, "/sys%s/numa_node", p);
+			if (ret < 0)
+				return node_parse_failure(ret, NULL, p);
+			return ret;
+		}
+	}
+
+	/* No PCI device found in the link target: try the device link. */
+	ret = sysfs_node_read(mask, "/sys/class/%s/%s/device/numa_node",
+			      cls, dev);
+	if (ret < 0)
+		return node_parse_failure(ret, cls, dev);
+	return 0;
+}
+
+
+/* Turn file (or device node) into class name.
+   FILE is stat()ed: for a character or block device node the device
+   itself is looked up, for anything else the device the file is on.
+   The entry with the same major:minor number is searched in
+   /sys/class/<class> and then resolved with affinity_class().
+   The CLS argument is ignored (the class is derived from FILE); it is
+   only there to match the handler signature.
+   Returns 0 on success, -1 after issuing a warning on failure. */
+static int affinity_file(struct bitmask *mask, char *cls, char *file)
+{
+	struct stat st;
+	DIR *dir;
+	struct dirent *dep;
+	char *fn;
+	dev_t d;
+
+	if (stat(file, &st) < 0) {
+		numa_warn(W_blockdev1, "Cannot stat file %s", file);
+		return -1;
+	}
+	cls = "block";
+	d = st.st_dev;
+	if (S_ISCHR(st.st_mode)) {
+		/* Better choice than misc? Most likely misc will not work
+		   anyways unless the kernel is fixed. */
+		cls = "misc";
+		d = st.st_rdev;
+	} else if (S_ISBLK(st.st_mode))
+		d = st.st_rdev;
+
+	if (asprintf(&fn, "/sys/class/%s", cls) < 0)
+		fn = NULL;	/* contents are undefined after a failure */
+	dir = fn ? opendir(fn) : NULL;
+	free(fn);
+	if (!dir) {
+		numa_warn(W_blockdev2, "Cannot enumerate %s devices in sysfs",
+			  cls);
+		return -1;
+	}
+	/* The directory stream is private to this call, so plain
+	   readdir() is sufficient (readdir_r() is deprecated). */
+	while ((dep = readdir(dir)) != NULL) {
+		char *name = dep->d_name;
+		char *dev;
+		unsigned maj = 0, min = 0;
+		int n = -1;
+
+		if (*name == '.')
+			continue;
+		/* Read the dev file from the class that is enumerated,
+		   which is not necessarily "block". */
+		if (asprintf(&fn, "/sys/class/%s/%s/dev", cls, name) < 0)
+			break;
+		dev = sysfs_read(fn);
+		free(fn);
+		if (dev) {
+			n = sscanf(dev, "%u:%u", &maj, &min);
+			free(dev);
+		}
+		if (n != 2) {
+			numa_warn(W_blockdev3, "Cannot parse sysfs device %s",
+				  name);
+			continue;
+		}
+
+		if (major(d) != maj || minor(d) != min)
+			continue;
+
+		/* name points into the directory stream: resolve it
+		   before the stream is closed. */
+		n = affinity_class(mask, cls, name);
+		closedir(dir);
+		return n;
+	}
+	closedir(dir);
+	/* Report the device that was searched for, not the last one read. */
+	numa_warn(W_blockdev5, "Cannot find block device %x:%x in sysfs for `%s'",
+		  (unsigned)major(d), (unsigned)minor(d), file);
+	return -1;
+}
+
+/* Look up interface of route using rtnetlink.
+   DST is the destination address; its address family is used as the
+   routing family of the query.
+   On success the interface index found in the RTA_OIF attribute of the
+   reply is stored in *IIFP and 0 is returned.
+   Returns -1 after issuing a warning on failure. */
+static int find_route(struct sockaddr *dst, int *iifp)
+{
+	struct rtattr *rta;
+	const int hdrlen = NLMSG_LENGTH(sizeof(struct rtmsg));
+	/* Request message; rtnetlink_request() receives the reply into
+	   the same storage, buf provides the room for attributes. */
+	struct {
+		struct nlmsghdr msg;
+		struct rtmsg rt;
+		char buf[256];
+	} req = {
+		.msg = {
+			.nlmsg_len = hdrlen,
+			.nlmsg_type = RTM_GETROUTE,
+			.nlmsg_flags = NLM_F_REQUEST,
+		},
+		.rt = {
+			.rtm_family = dst->sa_family,
+		},
+	};
+	struct sockaddr_nl adr = {
+		.nl_family = AF_NETLINK,
+	};
+
+	/* Append the destination address as RTA_DST attribute. */
+	if (rta_put_address(&req.msg, RTA_DST, dst) < 0) {
+		numa_warn(W_netlink1, "Cannot handle network family %x",
+			  dst->sa_family);
+		return -1;
+	}
+
+	if (rtnetlink_request(&req.msg, sizeof req, &adr) < 0) {
+		numa_warn(W_netlink2, "Cannot request rtnetlink route: %s",
+			  strerror(errno));
+		return -1;
+	}
+
+	/* Fish the interface out of the netlink soup. */
+	rta = NULL;
+	while ((rta = rta_get(&req.msg, rta, hdrlen)) != NULL) {
+		if (rta->rta_type == RTA_OIF) {
+			/* copy instead of dereferencing a cast pointer */
+			memcpy(iifp, RTA_DATA(rta), sizeof(int));
+			return 0;
+		}
+	}
+
+	numa_warn(W_netlink3, "rtnetlink query did not return interface");
+	return -1;
+}
+
+/* Translate interface index IIF into an interface name with the
+   SIOCGIFNAME ioctl on a temporary socket; the result is left in IFR.
+   Returns the result of the ioctl, or -1 when no socket could be
+   created. */
+static int iif_to_name(int iif, struct ifreq *ifr)
+{
+	int rc;
+	int fd;
+
+	fd = socket(PF_INET, SOCK_DGRAM, 0);
+	if (fd >= 0) {
+		ifr->ifr_ifindex = iif;
+		rc = ioctl(fd, SIOCGIFNAME, ifr);
+		close(fd);
+		return rc;
+	}
+	return -1;
+}
+
+/* Resolve a host name or IP address to the nodes of the network device
+   the route to it goes through.
+   This generally only attempts to handle simple cases:
+   no multi-path, no bonding etc. In these cases only
+   the first interface or none is chosen.
+   Returns the result of the class lookup, or -1 after a warning. */
+static int affinity_ip(struct bitmask *mask, char *cls, char *id)
+{
+	struct addrinfo *res;
+	struct ifreq req;
+	int oif;
+	int err;
+	int found = 0;
+
+	err = getaddrinfo(id, NULL, NULL, &res);
+	if (err != 0) {
+		numa_warn(W_net1, "Cannot resolve %s: %s",
+			  id, gai_strerror(err));
+		return -1;
+	}
+
+	/* Only the first address returned by the resolver is looked at. */
+	if (find_route(res->ai_addr, &oif) >= 0) {
+		if (iif_to_name(oif, &req) < 0)
+			numa_warn(W_net2, "Cannot resolve network interface %d", oif);
+		else
+			found = 1;
+	}
+	freeaddrinfo(res);
+
+	if (!found)
+		return -1;
+	return affinity_class(mask, "net", req.ifr_name);
+}
+
+/* Look up affinity for a PCI device.
+   ID is [seg:]bus:dev[.func] in hexadecimal; segment and function
+   default to 0 when they are left out.
+   CLS is only used for the text of the warning.
+   Returns 0 on success, -1 after issuing a warning on failure. */
+static int affinity_pci(struct bitmask *mask, char *cls, char *id)
+{
+	unsigned seg, bus, dev, func;
+	int n, ret;
+
+	/* Func is optional. */
+	if ((n = sscanf(id, "%x:%x:%x.%x",&seg,&bus,&dev,&func)) == 4 || n == 3) {
+		if (n == 3)
+			func = 0;
+	}
+	/* Segment is optional too */
+	else if ((n = sscanf(id, "%x:%x.%x",&bus,&dev,&func)) == 3 || n == 2) {
+		seg = 0;
+		if (n == 2)
+			func = 0;
+	} else {
+		numa_warn(W_pci1, "Cannot parse PCI device `%s'", id);
+		return -1;
+	}
+	/* Use the flat device list of the PCI bus. The /sys/devices tree
+	   nests devices below their bridges, so a path built from the
+	   device address alone only exists for devices on a root bus. */
+	ret = sysfs_node_read(mask,
+			"/sys/bus/pci/devices/%04x:%02x:%02x.%x/numa_node",
+			      seg, bus, dev, func);
+	if (ret < 0)
+		return node_parse_failure(ret, cls, id);
+	return 0;
+}
+
+/* Table of the supported "prefix:" node specifiers.
+   first is the first character of name and allows a cheap pre-check,
+   cls is handed to the handler together with the text that follows
+   the prefix. The table is terminated by an all-zero entry. */
+static struct handler {
+	char first;
+	char *name;
+	char *cls;
+	int (*handler)(struct bitmask *mask, char *cls, char *desc);
+} handlers[] = {
+	{ .first = 'n', .name = "netdev:", .cls = "net",   .handler = affinity_class },
+	{ .first = 'i', .name = "ip:",     .cls = NULL,    .handler = affinity_ip    },
+	{ .first = 'f', .name = "file:",   .cls = NULL,    .handler = affinity_file  },
+	{ .first = 'b', .name = "block:",  .cls = "block", .handler = affinity_class },
+	{ .first = 'p', .name = "pci:",    .cls = NULL,    .handler = affinity_pci   },
+	{ .first = 0 }
+};
+
+/* Resolve the "prefix:value" IO device specifier ID and set the nodes
+   of the device in MASK.
+   Returns the result of the handler for the prefix, or NO_IO_AFFINITY
+   when ID does not start with a known prefix. A handler result of -2
+   ("kernel does not know the node") is reported and returned as -1. */
+hidden int resolve_affinity(char *id, struct bitmask *mask)
+{
+	struct handler *h;
+
+	for (h = &handlers[0]; h->first; h++) {
+		size_t len;
+		int ret;
+
+		if (id[0] != h->first)
+			continue;
+		len = strlen(h->name);
+		if (strncmp(id, h->name, len) != 0)
+			continue;
+		ret = h->handler(mask, h->cls, id + len);
+		if (ret == -2) {
+			numa_warn(W_nonode, "Kernel does not know node for %s",
+				  id + len);
+			/* -2 is also the value of NO_IO_AFFINITY: do not
+			   let the caller take this failure for "no prefix". */
+			ret = -1;
+		}
+		return ret;
+	}
+	return NO_IO_AFFINITY;
+}
+
diff --git a/affinity.h b/affinity.h
new file mode 100644
index 0000000..4863d8f
--- /dev/null
+++ b/affinity.h
@@ -0,0 +1,6 @@
+/* Returned by resolve_affinity() when the string does not start with
+   one of the IO affinity prefixes. */
+enum {
+	NO_IO_AFFINITY = -2
+};
+
+/* Resolve the IO device specifier ID and set its node(s) in MASK. */
+int resolve_affinity(char *id, struct bitmask *mask);
+
diff --git a/libnuma.c b/libnuma.c
index b4773a2..663046b 100755
--- a/libnuma.c
+++ b/libnuma.c
@@ -35,6 +35,7 @@
 #include "numaif.h"
 #include "numaint.h"
 #include "util.h"
+#include "affinity.h"
 
 #define WEAK __attribute__((weak))
 
@@ -1776,12 +1777,23 @@ numa_parse_nodestring(char *s)
 		s++;
 	}
 	do {
-		int i;
 		unsigned long arg;
-		if (!strcmp(s,"all")) {
-			copy_bitmask_to_bitmask(numa_all_nodes_ptr, mask);
-			s+=4;
-			break;
+		int i;
+		if (isalpha(*s)) { 
+			int n;
+			if (!strcmp(s,"all")) {
+				copy_bitmask_to_bitmask(numa_all_nodes_ptr, 
+							mask);
+				s+=4;
+				break;
+			}
+			n = resolve_affinity(s, mask);
+			if (n != NO_IO_AFFINITY) {
+				if (n < 0)
+					goto err;
+				s += strlen(s) + 1;
+				break;
+			}
 		}
 		arg = get_nr(s, &end, numa_all_nodes_ptr, relative);
 		if (end == s) {
diff --git a/numactl.8 b/numactl.8
index 684e80a..989910c 100644
--- a/numactl.8
+++ b/numactl.8
@@ -73,20 +73,10 @@ memory policy
 runs processes with a specific NUMA scheduling or memory placement policy.
 The policy is set for command and inherited by all of its children.
 In addition it can set persistent policy for shared memory segments or files.
-.TP
+.PP
 Use -- before command if using command options that could be confused
 with numactl options.
-.TP
-Policy settings are:
-.TP
-.B \-\-interleave=nodes, \-i nodes
-Set a memory interleave policy. Memory will be allocated using round robin
-on 
-.I nodes.
-When memory cannot be allocated on the current interleave target fall back
-to other nodes.
-Multiple nodes may be specified on --interleave, --membind and --cpunodebind.
-You may specify "all", which means all nodes in the current cpuset.
+.PP
 .I nodes
 may be specified as N,N,N or  N-N or N,N-N or  N-N,N-N and so forth.
 Relative
@@ -98,6 +88,32 @@ A !N-N notation indicates the inverse of N-N, in other words all nodes
 except N-N.  If used with + notation, specify !+N-N. When 
 .I same 
 is specified the previous nodemask specified on the command line is used.
+all means all nodes in the current cpuset.
+.PP
+Instead of a number a node can also be:
+.TS
+tab(|);
+l l.
+netdev:DEV|The node connected to network device DEV.
+file:PATH |The node of the block device that PATH is on.
+ip:HOST   |The node of the network device the route to HOST uses.
+block:PATH|The node of block device PATH.
+pci:[seg:]bus:dev[.func]|The node of a PCI device.
+.TE
+
+Note that block resolves the kernel block device names only;
+for udev names in /dev use 
+.I file:
+.TP
+Policy settings are:
+.TP
+.B \-\-interleave=nodes, \-i nodes
+Set a memory interleave policy. Memory will be allocated using round robin
+on 
+.I nodes.
+When memory cannot be allocated on the current interleave target fall back
+to other nodes.
+Multiple nodes may be specified on --interleave, --membind and --cpunodebind.
 .TP
 .B \-\-membind=nodes, \-m nodes
 Only allocate memory from nodes.  Allocation will fail when there
@@ -249,13 +265,17 @@ Run myapplic on cpus 0-4 and 8-12 of the current cpuset.
 numactl \-\-interleave=all bigdatabase arguments
 Run big database with its memory interleaved on all CPUs.
 
-numactl \-\-cpubind=0 \-\-membind=0,1 process
+numactl \-\-cpunodebind=0 \-\-membind=0,1 process
 Run process on node 0 with memory allocated on node 0 and 1.
 
-numactl \-\-cpubind=0 \-\-membind=0,1 -- process -l
+numactl \-\-cpunodebind=0 \-\-membind=0,1 -- process -l
 Run process as above, but with an option (-l) that would be confused with
 a numactl option.
 
+numactl \-\-nodebind=netdev:eth0 \-\-membind=netdev:eth0 network-server
+Run network-server on the node of network device eth0 with its memory
+also in the same node.
+
 numactl \-\-preferred=1 numactl \-\-show
 Set preferred node 1 and show the resulting state.
 
diff --git a/numactl.c b/numactl.c
index d1be03c..047a6d0 100755
--- a/numactl.c
+++ b/numactl.c
@@ -73,6 +73,12 @@ void usage(void)
 		"\n"
 		"memory policy is --interleave, --preferred, --membind, --localalloc\n"
 		"nodes is a comma delimited list of node numbers or A-B ranges or all.\n"
+		"Instead of a number a node can also be:\n"
+		"  netdev:DEV the node connected to network device DEV\n"
+		"  file:PATH  the node the block device of path is connected to\n"
+		"  ip:HOST    the node of the network device host routes through\n"
+		"  block:PATH the node of block device path\n"
+		"  pci:[seg:]bus:dev[:func] The node of a PCI device\n"
 		"cpus is a comma delimited list of cpu numbers or A-B ranges or all\n"
 		"all ranges can be inverted with !\n"
 		"all numbers and ranges can be made cpuset-relative with +\n"
diff --git a/numaint.h b/numaint.h
index 059a871..16f4878 100755
--- a/numaint.h
+++ b/numaint.h
@@ -16,6 +16,7 @@ extern int numa_sched_getaffinity_v2_int(pid_t pid, struct bitmask *mask);
 #define CPU_LONGS(x) (CPU_BYTES(x) / sizeof(long))
 
 #define make_internal_alias(x) extern __typeof (x) x##_int __attribute((alias(#x), visibility("hidden")))
+#define hidden __attribute__((visibility("hidden")))
 
 enum numa_warn {
 	W_nosysfs,
@@ -29,6 +30,24 @@ enum numa_warn {
 	W_memory,
 	W_cpuparse,
 	W_nodeparse,
+	W_blockdev1,
+	W_blockdev2,
+	W_blockdev3,
+	W_blockdev4,
+	W_blockdev5,
+	W_netlink1,
+	W_netlink2,
+	W_netlink3,
+	W_net1,
+	W_net2,
+	W_class1,
+	W_class2,
+	W_pci1,
+	W_pci2,
+	W_node_parse1,
+	W_node_parse2,
+	W_nonode,
+	W_badchar,
 };
 
 #define howmany(x,y) (((x)+((y)-1))/(y))
diff --git a/rtnetlink.c b/rtnetlink.c
new file mode 100644
index 0000000..e4cdbf1
--- /dev/null
+++ b/rtnetlink.c
@@ -0,0 +1,90 @@
+/* Simple LGPLed rtnetlink library */
+#include <sys/socket.h>
+#include <linux/rtnetlink.h>
+#include <linux/netlink.h>
+#include <netinet/in.h>
+#include <errno.h>
+#include <unistd.h>
+#define hidden __attribute__((visibility("hidden")))
+#include "rtnetlink.h"
+
+
+/* Append an attribute of type TYPE with room for LEN bytes of payload
+   to message M and add its aligned size to nlmsg_len.
+   Returns a pointer to the payload, which is not initialized.
+   No bounds are checked: the buffer behind M must be large enough. */
+hidden void *rta_put(struct nlmsghdr *m, int type, int len)
+{
+	unsigned used = NLMSG_ALIGN(m->nlmsg_len);
+	int total = RTA_LENGTH(len);
+	struct rtattr *attr = (struct rtattr *)((char *)m + used);
+
+	attr->rta_len = total;
+	attr->rta_type = type;
+	m->nlmsg_len = used + RTA_ALIGN(total);
+	return RTA_DATA(attr);
+}
+
+/* Iterate over the attributes of message M.
+   With P == NULL the first attribute, located OFFSET bytes (aligned)
+   into the message, is returned; otherwise the attribute following P.
+   Returns NULL when no further complete attribute fits into the
+   nlmsg_len bytes of the message. M is not modified. */
+hidden struct rtattr *rta_get(struct nlmsghdr *m, struct rtattr *p, int offset)
+{
+	char *end = (char *)m + m->nlmsg_len;
+	struct rtattr *rta;
+	int left;
+
+	if (p)
+		rta = (struct rtattr *)((char *)p + RTA_ALIGN(p->rta_len));
+	else
+		rta = (struct rtattr *)((char *)m + NLMSG_ALIGN(offset));
+
+	/* Compute the remaining length from the position instead of
+	   using RTA_NEXT(), which would decrement nlmsg_len in place.
+	   The first attribute is validated as well. */
+	left = end - (char *)rta;
+	if (!RTA_OK(rta, left))
+		return NULL;
+	return rta;
+}
+
+/* Append the address of ADR as an attribute of type TYPE to MSG.
+   IPv4 and IPv6 socket addresses are supported.
+   Returns 0 on success, -1 for any other address family. */
+hidden int
+rta_put_address(struct nlmsghdr *msg, int type, struct sockaddr *adr)
+{
+	if (adr->sa_family == AF_INET) {
+		struct sockaddr_in *sin = (struct sockaddr_in *)adr;
+		struct in_addr *dst = rta_put(msg, type, 4);
+
+		*dst = sin->sin_addr;
+		return 0;
+	}
+	if (adr->sa_family == AF_INET6) {
+		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)adr;
+		struct in6_addr *dst6 = rta_put(msg, type, 16);
+
+		*dst6 = sin6->sin6_addr;
+		return 0;
+	}
+	return -1;
+}
+
+/* Send request MSG over a temporary rtnetlink socket and receive the
+   reply into the same buffer, which is BUFLEN bytes long.
+   ADR is the destination address; it is overwritten with the address
+   the reply came from.
+   Returns 0 on success. Returns -1 with errno set on failure; this
+   includes an error reply from the kernel and a reply that is too
+   short or did not fit into the buffer (EMSGSIZE). */
+hidden int 
+rtnetlink_request(struct nlmsghdr *msg, int buflen, struct sockaddr_nl *adr)
+{
+	int rsk;
+	int n;
+	int e;
+
+	/* Use a private socket to avoid having to keep state
+	   for a sequence number. */
+	rsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	if (rsk < 0)
+		return -1;
+	n = sendto(rsk, msg, msg->nlmsg_len, 0, (struct sockaddr *)adr, 
+		   sizeof(struct sockaddr_nl));
+	if (n >= 0) { 
+		socklen_t adrlen = sizeof(struct sockaddr_nl);
+		n = recvfrom(rsk, msg, buflen, 0, (struct sockaddr *)adr, 
+			     &adrlen);
+	}
+	/* close() may change errno: keep the one of sendto/recvfrom. */
+	e = errno;
+	close(rsk);
+	errno = e;
+	if (n < 0)
+		return -1;
+	/* Callers walk the reply using nlmsg_len. Reject a reply that is
+	   too short for a header or that claims more data than was
+	   received, e.g. because it was truncated to BUFLEN. */
+	if (!NLMSG_OK(msg, n)) {
+		errno = EMSGSIZE;
+		return -1;
+	}
+	/* Assume we only get a single reply back. This is (hopefully?) 
+	   safe because it's a single use socket. */
+	if (msg->nlmsg_type == NLMSG_ERROR) {
+		struct nlmsgerr *err = NLMSG_DATA(msg);
+		errno = -err->error;
+		return -1;
+	}
+	return 0;
+}
diff --git a/rtnetlink.h b/rtnetlink.h
new file mode 100644
index 0000000..f73d909
--- /dev/null
+++ b/rtnetlink.h
@@ -0,0 +1,5 @@
+hidden int
+rta_put_address(struct nlmsghdr *msg, int type, struct sockaddr *adr);
+hidden struct rtattr *rta_get(struct nlmsghdr *m, struct rtattr *p, int offset);
+hidden void *rta_put(struct nlmsghdr *m, int type, int len);
+hidden int rtnetlink_request(struct nlmsghdr *msg, int buflen, struct sockaddr_nl *adr);
diff --git a/sysfs.c b/sysfs.c
new file mode 100644
index 0000000..7da792b
--- /dev/null
+++ b/sysfs.c
@@ -0,0 +1,69 @@
+/* Utility functions for reading sysfs values */
+#define _GNU_SOURCE 1
+#include <stdio.h>
+#include <sys/fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <ctype.h>
+#include "numa.h"
+#include "numaint.h"
+
+#define SYSFS_BLOCK 4096
+
+/* Read the start of file NAME (at most SYSFS_BLOCK - 1 bytes).
+   Returns a NUL terminated buffer that the caller has to free(), or
+   NULL when the file cannot be opened or read, is empty, or when no
+   memory is available. */
+hidden char *sysfs_read(char *name)
+{
+	char *buf;
+	int n;
+	int fd;
+
+	fd = open(name, O_RDONLY);
+	if (fd < 0)
+		return NULL;
+	buf = malloc(SYSFS_BLOCK);
+	if (!buf) {
+		/* do not leak the descriptor */
+		close(fd);
+		return NULL;
+	}
+	n = read(fd, buf, SYSFS_BLOCK - 1);
+	close(fd);
+	if (n <= 0) {
+		free(buf);
+		return NULL;
+	}
+	buf[n] = 0;
+	return buf;
+}
+
+/* Read a list of node numbers from the sysfs file whose name is built
+   from the printf style format FMT and set these nodes in MASK.
+   Returns 0 on success, -2 when the file holds a negative node number
+   and -1 on any other failure (file not readable, no number found,
+   node number not below numa_num_task_nodes()).
+   Nodes parsed before a failure remain set in MASK. */
+hidden int sysfs_node_read(struct bitmask *mask, char *fmt, ...)
+{
+	int n;
+	va_list ap;
+	char *p, *fn, *m, *end;
+	long num;
+	int ret = 0;
+
+	va_start(ap, fmt);
+	n = vasprintf(&fn, fmt, ap);
+	va_end(ap);
+	if (n < 0)
+		return -1;
+	p = sysfs_read(fn);
+	free(fn);
+	if (!p)
+		return -1;
+
+	/* Failures leave the loop with break so that the buffer is
+	   freed on every path. */
+	m = p;
+	do {
+		num = strtol(m, &end, 0);
+		if (m == end) {
+			ret = -1;
+			break;
+		}
+		if (num < 0) {
+			ret = -2;
+			break;
+		}
+		if (num >= numa_num_task_nodes()) {
+			ret = -1;
+			break;
+		}
+		numa_bitmask_setbit(mask, num);
+
+		/* Continuation not supported by kernel yet. */
+		m = end;
+		/* The ctype functions need an unsigned char value. */
+		while (isspace((unsigned char)*m) || *m == ',')
+			m++;
+	} while (isdigit((unsigned char)*m));
+	free(p);
+	return ret;
+}
diff --git a/sysfs.h b/sysfs.h
new file mode 100644
index 0000000..7a13b85
--- /dev/null
+++ b/sysfs.h
@@ -0,0 +1,4 @@
+struct bitmask;
+hidden char *sysfs_read(char *name);
+hidden int sysfs_node_read(struct bitmask *mask, char *fmt, ...);
+
diff --git a/test/node-parse.c b/test/node-parse.c
new file mode 100644
index 0000000..b7a6542
--- /dev/null
+++ b/test/node-parse.c
@@ -0,0 +1,26 @@
+/* Test wrapper for the nodemask parser */
+#include <stdio.h>
+#include <stdlib.h>	/* exit() */
+#include "numa.h"
+#include "util.h"
+
+/* For util.c. Fixme.
+   This test has no usage text to print, so just terminate with a
+   failure status. */
+void usage(void)
+{
+	exit(1);
+}
+
+/* Parse every command line argument as a node string and print the
+   resulting mask. The exit status is 1 when at least one argument
+   could not be converted, otherwise 0. */
+int main(int ac, char **av)
+{
+	int failed = 0;
+	char **arg;
+
+	for (arg = av + 1; *arg; arg++) {
+		struct bitmask *nodes = numa_parse_nodestring(*arg);
+
+		if (nodes) {
+			printmask("result", nodes);
+			numa_bitmask_free(nodes);
+		} else {
+			printf("Failed to convert `%s'\n", *arg);
+			failed |= 1;
+		}
+	}
+	return failed;
+}
diff --git a/test/regress-io b/test/regress-io
new file mode 100755
index 0000000..f967073
--- /dev/null
+++ b/test/regress-io
@@ -0,0 +1,46 @@
+#!/bin/bash
+# test IO affinity parsing
+# tests may fail depending on machine setup
+
+E=0
+
+# Run a command that is expected to succeed; note a failure otherwise.
+check() { 
+	echo "testing $*"
+	if "$@" ; then
+		true
+	else
+		echo failed
+		E=1
+	fi
+	
+}
+
+# Run a command that is expected to fail; note a failure if it succeeds.
+fail() { 
+	echo "testing failure of $*"
+	if "$@" ; then
+		echo failed
+		E=1
+	else
+		true
+	fi
+}
+
+# Use the library and the tools of the build tree above this directory.
+BASE=$(cd .. && pwd)
+export LD_LIBRARY_PATH=$BASE
+export PATH=$BASE:$PATH
+
+check ./node-parse file:. 
+check ./node-parse ip:8.8.8.8
+fail ./node-parse ip:127.0.0.1  
+
+# cut leaves a blank in front of the interface name.
+IF=$(ip link ls | grep eth | cut -d: -f2 | head -1)
+check ./node-parse "netdev:$IF"
+fail ./node-parse netdev:lo
+# Device of the root file system
+DEV=$(df | awk '/\/$/ { print $1 }')
+check ./node-parse "file:$DEV"
+check ./node-parse "block:$(basename "$DEV")"
+check ./node-parse pci:0:0.0
+
+if [ "$E" = 0 ] ; then echo SUCCESS ; else echo FAILURE ; fi
+
+exit $E
-- 
1.7.5.4

--
To unsubscribe from this list: send the line "unsubscribe linux-numa" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Kernel]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux SCSI]     [Devices]

  Powered by Linux