The patch titled configurable node sizes has been removed from the -mm tree. Its filename was 2-configurable-node-sizes.patch This patch was dropped because it is obsolete ------------------------------------------------------ Subject: configurable node sizes From: David Rientjes <rientjes@xxxxxxxxxx> Allows a more elaborate command line to be specified to numa=fake so that nodes can be different sizes. The command line is specified as: numa=fake=<cmdline> If a number, fakes <cmdline> nodes and ignores NUMA setup of the actual machine. Otherwise, system memory is configured depending on the sizes and coefficients listed. For example: numa=fake=2*512,1024,4*256,*128 gives two 512M nodes, a 1024M node, four 256M nodes, and the rest split into 128M chunks. If the last character of <cmdline> is a *, the remaining memory is divided up equally among its coefficient: numa=fake=2*512,2* gives two 512M nodes and the rest split into two nodes. If the last character is a comma, the remaining system memory is not allocated to an additional node. Useful in combination with CPUsets. Signed-off-by: David Rientjes <rientjes@xxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxx> --- Documentation/x86_64/boot-options.txt | 14 + arch/x86_64/mm/numa.c | 234 ++++++++++++++++++------ 2 files changed, 195 insertions(+), 53 deletions(-) diff -puN Documentation/x86_64/boot-options.txt~2-configurable-node-sizes Documentation/x86_64/boot-options.txt --- a/Documentation/x86_64/boot-options.txt~2-configurable-node-sizes +++ a/Documentation/x86_64/boot-options.txt @@ -149,7 +149,19 @@ NUMA numa=noacpi Don't parse the SRAT table for NUMA setup - numa=fake=X Fake X nodes and ignore NUMA setup of the actual machine. + numa=fake=<cmdline> + If a number, fakes <cmdline> nodes and ignores NUMA setup of + the actual machine. Otherwise, system memory is configured + depending on the sizes and coefficients listed. 
For example: + numa=fake=2*512,1024,4*256,*128 + gives two 512 MB nodes, a 1024 MB node, four 256 MB nodes, and + the rest split into 128 MB chunks. If the last character of + <cmdline> is a *, the remaining memory is divided up equally + among its coefficient: + numa=fake=2*512,2* + gives two 512 MB nodes and the rest split into two nodes. If + the last character is a comma, the remaining system memory is + not allocated to an additional node. numa=hotadd=percent Only allow hotadd memory to preallocate page structures upto diff -puN arch/x86_64/mm/numa.c~2-configurable-node-sizes arch/x86_64/mm/numa.c --- a/arch/x86_64/mm/numa.c~2-configurable-node-sizes +++ a/arch/x86_64/mm/numa.c @@ -180,74 +180,208 @@ void __init numa_init_array(void) } +/* NUMA emulation */ #ifdef CONFIG_NUMA_EMU -int numa_fake __initdata = 0; +#define E820_ADDR_HOLE_SIZE(start, end) \ + (e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) << \ + PAGE_SHIFT) +char *cmdline __initdata; -/* Numa emulation */ -static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn) +/* + * Sets up nodeid to range from addr to addr + sz. If the end boundary is + * greater than max_addr, then max_addr is used instead. The return value is 0 + * if there is additional memory left for allocation past addr and -1 otherwise. + * addr is adjusted to be at the end of node. 
+ */ +static int setup_node_range(int nodeid, struct bootnode *nodes, u64 *addr, + u64 sz, u64 max_addr) { - int i, big; - struct bootnode nodes[numa_fake]; - u64 addr = start_pfn << PAGE_SHIFT; - u64 sz = ((end_pfn - start_pfn - e820_hole_size(start_pfn, end_pfn)) << - PAGE_SHIFT) / numa_fake; + int ret = 0; + nodes[nodeid].start = *addr; + *addr += sz; + if (*addr >= max_addr) { + *addr = max_addr; + ret = -1; + } + nodes[nodeid].end = *addr; + node_set_online(nodeid); + printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nodeid, + nodes[nodeid].start, nodes[nodeid].end, + (nodes[nodeid].end - nodes[nodeid].start) >> 20); + return ret; +} + +/* + * Splits num_nodes nodes up equally starting at node_start. The return value + * is the number of nodes split up and addr is adjusted to be at the end of the + * last node allocated. + */ +static int split_nodes_equal(struct bootnode *nodes, u64 *addr, u64 max_addr, + int node_start, int num_nodes) +{ + unsigned int big; + u64 sz; + int i; + if (num_nodes <= 0) + return -1; + if (num_nodes > MAX_NUMNODES) + num_nodes = MAX_NUMNODES; + sz = (max_addr - *addr - E820_ADDR_HOLE_SIZE(*addr, max_addr)) / + num_nodes; /* * Calculate the number of big nodes that can be allocated as a result - * of consolidating our masked leftovers. + * of consolidating the leftovers. */ - big = ((sz & ~NODE_HASH_MASK) * numa_fake) / NODE_MIN_SIZE; + big = ((sz & ~NODE_HASH_MASK) * num_nodes) / NODE_MIN_SIZE; - /* Round down to nearest 4MB for hash function */ + /* Round down to the nearest 4 MB for hash function */ sz &= NODE_HASH_MASK; - if (sz == 0) { - printk(KERN_ERR "Not enough memory allotted for each node. " - "Numa emulation disabled.\n"); + if (!sz) { + printk(KERN_ERR "Not enough memory for each node. 
" + "NUMA emulation disabled.\n"); return -1; } - memset(&nodes,0,sizeof(nodes)); - for (i = 0; i < numa_fake; i++) { - nodes[i].start = addr; - addr += sz; + for (i = node_start; i < num_nodes + node_start; i++) { + u64 end = *addr + sz; if (i < big) - addr += NODE_MIN_SIZE; + end += NODE_MIN_SIZE; /* - * The final node cannot be over-allocated so give it the - * remaining memory available and ignore asymmetry. - * - * Other nodes receive roughly the same amount of avaiable - * pages. + * The final node can have the remaining system RAM. Other + * nodes receive roughly the same amount of available pages. */ - if (i == numa_fake - 1) - addr = end_pfn << PAGE_SHIFT; + if (i == num_nodes + node_start - 1) + end = max_addr; else - while (addr - nodes[i].start - - (e820_hole_size(nodes[i].start >> PAGE_SHIFT, - addr >> PAGE_SHIFT) << - PAGE_SHIFT) < sz) - addr += NODE_MIN_SIZE; - nodes[i].end = addr; - printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", - i, - nodes[i].start, nodes[i].end, - (nodes[i].end - nodes[i].start) >> 20); - node_set_online(i); - } - populate_physnode_map(nodes, numa_fake); - for_each_online_node(i) - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - numa_init_array(); - return 0; + while (end - *addr - + E820_ADDR_HOLE_SIZE(*addr, end) < sz) + end += NODE_MIN_SIZE; + if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0) + break; + } + return i - node_start + 1; +} + +/* + * Splits the remaining system RAM into chunks of size. The remaining memory is + * always assigned to a final node and can be asymmetric. Returns the number of + * nodes split. + */ +static int split_nodes_size(struct bootnode *nodes, u64 *addr, u64 max_addr, + int node_start, u64 sz) +{ + int i = node_start; + sz = (sz << 20) & NODE_HASH_MASK; + while (!setup_node_range(i++, nodes, addr, sz, max_addr)) + ; + return i - node_start; } -#endif + +/* + * Sets up the system RAM area from start_pfn to end_pfn according to the + * numa=fake command line. 
+ */ +static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn) +{ + struct bootnode nodes[MAX_NUMNODES]; + u64 addr = start_pfn << PAGE_SHIFT; + u64 max_addr = end_pfn << PAGE_SHIFT; + u64 sz; + int num_nodes; + int coeff_flag; + int coeff = -1; + int num; + int i; + + memset(&nodes, 0, sizeof(nodes)); + /* + * If the numa=fake command line is just a single number N, split the + * system RAM into N fake nodes. + */ + if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { + num_nodes = split_nodes_equal(nodes, &addr, max_addr, 0, + simple_strtol(cmdline, NULL, 0)); + if (num_nodes < 0) + return num_nodes; + goto out; + } + + /* Parse the command line */ + for (coeff_flag = num = num_nodes = 0; ; cmdline++) { + if (*cmdline && isdigit(*cmdline)) { + num = num * 10 + *cmdline - '0'; + continue; + } else if (*cmdline == '*') { + if (num > 0) + coeff = num; + coeff_flag = 1; + } else if (*cmdline == ',' || !*cmdline) { + if (!coeff_flag) + coeff = 1; + /* + * Round down to the nearest 4 MB for hash function. + * Command line coefficients are in megabytes. 
+ */ + sz = ((u64)num << 20) & NODE_HASH_MASK; + if (sz) + for (i = 0; i < coeff; i++, num_nodes++) + if (setup_node_range(num_nodes, nodes, + &addr, sz, max_addr) < 0) + goto done; + if (!*cmdline) + break; + coeff = -1; + coeff_flag = 0; + } + num = 0; + } +done: + if (!num_nodes) + return -1; + /* Fill remaining system RAM */ + if (addr < max_addr) { + if (coeff_flag && coeff < 0) { + /* Split remaining nodes into num-sized chunks */ + num_nodes += split_nodes_size(nodes, &addr, max_addr, + num_nodes, num); + goto out; + } + switch (*(cmdline - 1)) { + case '*': + /* Split remaining nodes into coeff chunks */ + if (coeff <= 0) + break; + num_nodes += split_nodes_equal(nodes, &addr, max_addr, + num_nodes, coeff); + break; + case ',': + /* Do not allocate remaining system RAM */ + break; + default: + /* Give one final node */ + setup_node_range(num_nodes, nodes, &addr, + max_addr - addr, max_addr); + num_nodes++; + } + } +out: + populate_physnode_map(nodes, num_nodes); + for_each_online_node(i) + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + numa_init_array(); + return 0; +} + +#undef E820_ADDR_HOLE_SIZE +#endif /* CONFIG_NUMA_EMU */ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) { int i; #ifdef CONFIG_NUMA_EMU - if (numa_fake && !numa_emulation(start_pfn, end_pfn)) + if (cmdline && !numa_emulation(start_pfn, end_pfn)) return; #endif @@ -267,7 +401,7 @@ void __init numa_initmem_init(unsigned l printk(KERN_INFO "Faking a node at %016lx-%016lx\n", start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); - /* setup dummy node covering all memory */ + /* setup dummy node covering all memory */ nodes_clear(node_online_map); node_set_online(0); for (i = 0; i < NR_CPUS; i++) @@ -322,17 +456,13 @@ void __init paging_init(void) } } -/* [numa=off] */ __init int numa_setup(char *opt) { if (!strncmp(opt,"off",3)) numa_off = 1; #ifdef CONFIG_NUMA_EMU - if(!strncmp(opt, "fake=", 5)) { - numa_fake = simple_strtoul(opt+5,NULL,0); ; - if (numa_fake 
>= MAX_NUMNODES) - numa_fake = MAX_NUMNODES; - } + if(!strncmp(opt, "fake=", 5)) + cmdline = opt + 5; #endif #ifdef CONFIG_ACPI_NUMA if (!strncmp(opt,"noacpi",6)) _ Patches currently in -mm which might be from rientjes@xxxxxxxxxx are - To unsubscribe from this list: send the line "unsubscribe mm-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html