Author: Tim Pepper <lnxninja@xxxxxxxxxxxxxxxxxx> Date: Tue Sep 21 16:53:01 2010 -0700 numactl --hardware should handle sparse node numbering Previously this had been enabled, but the patches by Amit Arora in 2009 were partly undone by Thomas Renninger in 2010, because these past patches appear to have only considered parts of the problem. Reverting the patches fixed Thomas's problem but reintroduced Amit's. It is possible to have sparse node numbering as well as nodes with no memory, nodes with no cpus and nodes with neither cpus nor memory. All of these should be handled. The existing node bitmasks and code also have conflated those possibilities with policy (i.e.: numa_all_nodes_ptr contains nodes which have memory and from which the calling process may allocate memory). For this reason a new node bitmask, numa_nodes_ptr, is added and populated so that 'numactl --hardware' can truly inventory the available hardware as per the man page. This may or may not be correct in that the word "available" may or may not have been intended to include policy and actual usability of the nodes. But it seems that the command is meant to show a lower level inventory, in which case it seems reasonable to truly print out all the nodes which kernelspace has exposed to userspace. 
Signed-off-by: Tim Pepper <lnxninja@xxxxxxxxxxxxxxxxxx> Cc: Thomas Renninger <trenn@xxxxxxx> Cc: Anton Blanchard <anton@xxxxxxxxx> Cc: Amit Arora <aarora@xxxxxxxxxxxxxxxxxx> --- distance.c | 26 ++++++++++++++++---------- libnuma.c | 9 +++++++-- numa.h | 3 +++ numactl.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++---- versions.ldscript | 1 + 5 files changed, 74 insertions(+), 16 deletions(-) diff --git a/distance.c b/distance.c index 2aad2bb..2b48f97 100755 --- a/distance.c +++ b/distance.c @@ -26,14 +26,21 @@ static int distance_numnodes; static int *distance_table; -static void parse_numbers(char *s, int *iptr, int n) +static void parse_numbers(char *s, int *iptr) { int i, d, j; char *end; - for (i = 0, j = 0; i < n; i++, j++) { + int maxnode = numa_max_node(); + int numnodes = 0; + + for (i = 0; i <= maxnode; i++) + if (numa_bitmask_isbitset(numa_nodes_ptr, i)) + numnodes++; + + for (i = 0, j = 0; i <= maxnode; i++, j++) { d = strtoul(s, &end, 0); /* Skip unavailable nodes */ - while (j<n && !numa_bitmask_isbitset(numa_all_nodes_ptr, j)) + while (j<=maxnode && !numa_bitmask_isbitset(numa_nodes_ptr, j)) j++; *(iptr+j) = d; if (s == end) @@ -47,10 +54,10 @@ static int read_distance_table(void) int nd, len; char *line = NULL; size_t linelen = 0; - int numnodes = 0; + int maxnode = numa_max_node() + 1; int *table = NULL; int err = -1; - + for (nd = 0;; nd++) { char fn[100]; FILE *dfh; @@ -59,7 +66,7 @@ static int read_distance_table(void) if (!dfh) { if (errno == ENOENT && nd > 0) err = 0; - if (!err && nd<=numa_max_node()) + if (!err && nd<maxnode) continue; else break; @@ -70,15 +77,14 @@ static int read_distance_table(void) break; if (!table) { - numnodes = numa_num_configured_nodes(); - table = calloc(numnodes * numnodes, sizeof(int)); + table = calloc(maxnode * maxnode, sizeof(int)); if (!table) { errno = ENOMEM; break; } } - parse_numbers(line, table + nd * numnodes, numnodes); + parse_numbers(line, table + nd * maxnode); } free(line); if (err) { @@ 
-96,7 +102,7 @@ static int read_distance_table(void) free(table); return 0; } - distance_numnodes = numnodes; + distance_numnodes = maxnode; distance_table = table; return 0; } diff --git a/libnuma.c b/libnuma.c index 641ad26..d40835d 100644 --- a/libnuma.c +++ b/libnuma.c @@ -51,6 +51,7 @@ struct bitmask *numa_all_cpus_ptr = NULL; of numa_no_nodes and numa_all_nodes, but the loader does not correctly handle versioning of BSS versus small data items */ +struct bitmask *numa_nodes_ptr = NULL; static struct bitmask *numa_memnode_ptr = NULL; static unsigned long *node_cpu_mask_v1[NUMA_NUM_NODES]; struct bitmask **node_cpu_mask_v2; @@ -105,6 +106,8 @@ numa_fini(void) numa_bitmask_free(numa_no_nodes_ptr); if (numa_memnode_ptr) numa_bitmask_free(numa_memnode_ptr); + if (numa_nodes_ptr) + numa_bitmask_free(numa_nodes_ptr); } /* @@ -292,8 +295,8 @@ int numa_pagesize(void) make_internal_alias(numa_pagesize); /* - * Find nodes with memory (numa_memnode_ptr) and the highest numbered - * existing node (maxconfigurednode). + * Find nodes (numa_nodes_ptr), nodes with memory (numa_memnode_ptr) + * and the highest numbered existing node (maxconfigurednode). */ static void set_configured_nodes(void) @@ -303,6 +306,7 @@ set_configured_nodes(void) long long freep; numa_memnode_ptr = numa_allocate_nodemask(); + numa_nodes_ptr = numa_allocate_nodemask(); d = opendir("/sys/devices/system/node"); if (!d) { @@ -313,6 +317,7 @@ set_configured_nodes(void) if (strncmp(de->d_name, "node", 4)) continue; nd = strtoul(de->d_name+4, NULL, 0); + numa_bitmask_setbit(numa_nodes_ptr, nd); if (numa_node_size64(nd, &freep) > 0) numa_bitmask_setbit(numa_memnode_ptr, nd); if (maxconfigurednode < nd) diff --git a/numa.h b/numa.h index 989f4d7..9a6a644 100755 --- a/numa.h +++ b/numa.h @@ -150,6 +150,9 @@ int numa_pagesize(void); Only valid after numa_available. 
*/ extern struct bitmask *numa_all_nodes_ptr; +/* Set with all nodes the kernel has exposed to userspace */ +extern struct bitmask *numa_nodes_ptr; + /* For source compatibility */ extern nodemask_t numa_all_nodes; diff --git a/numactl.c b/numactl.c index ce3a482..2e21ae8 100755 --- a/numactl.c +++ b/numactl.c @@ -188,12 +188,17 @@ static void print_distances(int maxnode) printf("node distances:\n"); printf("node "); for (i = 0; i <= maxnode; i++) - printf("% 3d ", i); + if (numa_bitmask_isbitset(numa_nodes_ptr, i)) + printf("% 3d ", i); printf("\n"); for (i = 0; i <= maxnode; i++) { + if (!numa_bitmask_isbitset(numa_nodes_ptr, i)) + continue; printf("% 3d: ", i); for (k = 0; k <= maxnode; k++) - printf("% 3d ", numa_distance(i,k)); + if (numa_bitmask_isbitset(numa_nodes_ptr, i) && + numa_bitmask_isbitset(numa_nodes_ptr, k)) + printf("% 3d ", numa_distance(i,k)); printf("\n"); } } @@ -216,14 +221,52 @@ void print_node_cpus(int node) void hardware(void) { int i; + int numnodes=0; + int prevnode=-1; + int skip=0; int maxnode = numa_max_node(); - printf("available: %d nodes (0-%d)\n", 1+maxnode, maxnode); + for (i=0; i<=maxnode; i++) + if (numa_bitmask_isbitset(numa_nodes_ptr, i)) + numnodes++; + printf("available: %d nodes (", numnodes); + for (i=0; i<=maxnode; i++) { + if (numa_bitmask_isbitset(numa_nodes_ptr, i)) { + if (prevnode == -1) { + printf("%d", i); + prevnode=i; + continue; + } + + if (i > prevnode + 1) { + if (skip) { + printf("%d", prevnode); + skip=0; + } + printf(",%d", i); + prevnode=i; + continue; + } + + if (i == prevnode + 1) { + if (!skip) { + printf("-"); + skip=1; + } + prevnode=i; + } + + if ((i == maxnode) && skip) + printf("%d", prevnode); + } + } + printf(")\n"); + for (i = 0; i <= maxnode; i++) { char buf[64]; long long fr; unsigned long long sz = numa_node_size64(i, &fr); - if (!numa_bitmask_isbitset(numa_all_nodes_ptr, i)) + if (!numa_bitmask_isbitset(numa_nodes_ptr, i)) continue; printf("node %d cpus:", i); diff --git a/versions.ldscript 
b/versions.ldscript index c2c88b6..e3389e0 100755 --- a/versions.ldscript +++ b/versions.ldscript @@ -119,6 +119,7 @@ libnuma_1.2 { numa_node_size; numa_node_to_cpus; numa_node_of_cpu; + numa_nodes_ptr; numa_num_configured_cpus; numa_num_configured_nodes; numa_num_possible_nodes; -- To unsubscribe from this list: send the line "unsubscribe linux-numa" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html