The patch titled "x86_64: fix fake numa" has been added to the -mm tree. Its filename is x86_64-fix-fake-numa.patch. See http://www.zip.com.au/~akpm/linux/patches/stuff/added-to-mm.txt to find out what to do about this. ------------------------------------------------------ Subject: x86_64: fix fake numa From: David Rientjes <rientjes@xxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxx> --- arch/x86_64/kernel/setup.c | 7 - arch/x86_64/mm/k8topology.c | 12 - arch/x86_64/mm/numa.c | 228 +++++++++++++++++++++++++--------- arch/x86_64/mm/srat.c | 17 -- include/asm-x86_64/numa.h | 3 5 files changed, 174 insertions(+), 93 deletions(-) diff -puN arch/x86_64/kernel/setup.c~x86_64-fix-fake-numa arch/x86_64/kernel/setup.c --- a/arch/x86_64/kernel/setup.c~x86_64-fix-fake-numa +++ a/arch/x86_64/kernel/setup.c @@ -416,13 +416,6 @@ void __init setup_arch(char **cmdline_p) max_pfn = end_pfn; high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1; -#ifdef CONFIG_ACPI_NUMA - /* - * Parse SRAT to discover nodes. - */ - acpi_numa_init(); -#endif - #ifdef CONFIG_NUMA numa_initmem_init(0, end_pfn); #else diff -puN arch/x86_64/mm/k8topology.c~x86_64-fix-fake-numa arch/x86_64/mm/k8topology.c --- a/arch/x86_64/mm/k8topology.c~x86_64-fix-fake-numa +++ a/arch/x86_64/mm/k8topology.c @@ -43,7 +43,6 @@ static __init int find_northbridge(void) int __init k8_scan_nodes(unsigned long start, unsigned long end) { unsigned long prevbase; - struct bootnode nodes[8]; int nodeid, i, nb; unsigned char nodeids[8]; int found = 0; @@ -65,7 +64,6 @@ int __init k8_scan_nodes(unsigned long s printk(KERN_INFO "Number of nodes %d\n", numnodes); - memset(&nodes,0,sizeof(nodes)); prevbase = 0; for (i = 0; i < 8; i++) { unsigned long base,limit; @@ -155,22 +153,14 @@ int __init k8_scan_nodes(unsigned long s if (!found) return -1; - memnode_shift = compute_hash_shift(nodes, 8); - if (memnode_shift < 0) { - printk(KERN_ERR "No NUMA node hash function found. 
Contact maintainer\n"); - return -1; - } - printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); - for (i = 0; i < 8; i++) { if (nodes[i].start != nodes[i].end) { nodeid = nodeids[i]; apicid_to_node[nodeid << dualcore] = i; apicid_to_node[(nodeid << dualcore) + dualcore] = i; - setup_node_bootmem(i, nodes[i].start, nodes[i].end); + node_set_online(i); } } - numa_init_array(); return 0; } diff -puN arch/x86_64/mm/numa.c~x86_64-fix-fake-numa arch/x86_64/mm/numa.c --- a/arch/x86_64/mm/numa.c~x86_64-fix-fake-numa +++ a/arch/x86_64/mm/numa.c @@ -1,4 +1,4 @@ -/* +/* * Generic VM initialization for x86-64 NUMA setups. * Copyright 2002,2003 Andi Kleen, SuSE Labs. */ @@ -11,6 +11,7 @@ #include <linux/ctype.h> #include <linux/module.h> #include <linux/nodemask.h> +#include <linux/acpi.h> #include <asm/e820.h> #include <asm/proto.h> @@ -22,6 +23,7 @@ #define Dprintk(x...) #endif +struct bootnode nodes[MAX_NUMNODES] __initdata; struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; bootmem_data_t plat_node_bdata[MAX_NUMNODES]; @@ -46,7 +48,7 @@ int numa_off __initdata; * -1 if node overlap or lost ram (shift too big) */ static int __init -populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) +populate_memnodemap(const struct bootnode *nodes, int shift) { int i; int res = -1; @@ -55,7 +57,7 @@ populate_memnodemap(const struct bootnod if (shift >= 64) return -1; memset(memnodemap, 0xff, sizeof(memnodemap)); - for (i = 0; i < numnodes; i++) { + for_each_online_node(i) { addr = nodes[i].start; end = nodes[i].end; if (addr >= end) @@ -73,17 +75,17 @@ populate_memnodemap(const struct bootnod return res; } -int __init compute_hash_shift(struct bootnode *nodes, int numnodes) +int __init compute_hash_shift(struct bootnode *nodes) { int shift = 20; - while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0) + while (populate_memnodemap(nodes, shift + 1) >= 0) shift++; printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift); - if 
(populate_memnodemap(nodes, numnodes, shift) != 1) { + if (populate_memnodemap(nodes, shift) != 1) { printk(KERN_INFO "Your memory is not aligned you need to rebuild your kernel " "with a bigger NODEMAPSIZE shift=%d\n", @@ -225,81 +227,189 @@ void __init numa_init_array(void) int numa_fake __initdata = 0; /* Numa emulation */ -static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) +static unsigned long __init numa_emu_node_size(unsigned long real_size, + int last) { - int i; - struct bootnode nodes[MAX_NUMNODES]; - unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake; - - /* Kludge needed for the hash function */ - if (hweight64(sz) > 1) { - unsigned long x = 1; - while ((x << 1) < sz) - x <<= 1; - if (x < sz/2) - printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n"); - sz = x; - } - - memset(&nodes,0,sizeof(nodes)); - for (i = 0; i < numa_fake; i++) { - nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz; - if (i == numa_fake-1) - sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start; - nodes[i].end = nodes[i].start + sz; - printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", - i, - nodes[i].start, nodes[i].end, - (nodes[i].end - nodes[i].start) >> 20); - node_set_online(i); - } - memnode_shift = compute_hash_shift(nodes, numa_fake); - if (memnode_shift < 0) { - memnode_shift = 0; - printk(KERN_ERR "No NUMA hash function found. 
Emulation disabled.\n"); - return -1; - } - for_each_online_node(i) - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - numa_init_array(); - return 0; + unsigned long node_size; + + if (!numa_fake) + return 0; + + node_size = real_size / numa_fake; + + if (!node_size) + return 0; + + node_size = roundup_pow_of_two(node_size); + + if (last) + node_size = real_size - (node_size * (numa_fake - 1)); + + return node_size; } -#endif -void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) -{ - int i; +static int __init numa_emu_new(int nid, + unsigned long real_start, unsigned long real_end, + unsigned long *emu_start, unsigned long *emu_end) +{ + unsigned long node_size; -#ifdef CONFIG_NUMA_EMU - if (numa_fake && !numa_emulation(start_pfn, end_pfn)) - return; + if (nid >= numa_fake) + return -1; + + node_size = numa_emu_node_size(real_end - real_start, 0); + + if (!node_size) + return -1; + + *emu_start = real_start + (nid * node_size); + + /* give node the remaining memory */ + if (nid == (numa_fake - 1)) { + node_size = numa_emu_node_size(real_end - real_start, 1); + + if (!node_size) + return -1; + } + + *emu_end = *emu_start + node_size; + + printk("configuring fake node %u: pfn %lu - %lu\n", + nid, *emu_start, *emu_end); + + return 0; +} + +static int __init numa_emu_shrink(int nid, int new_nodes, + unsigned long real_start, + unsigned long real_end, + unsigned long *emu_start, + unsigned long *emu_end) +{ + unsigned long node_size; + + if (numa_fake != (new_nodes + 1)) + return -1; + + node_size = numa_emu_node_size(real_end - real_start, 0); + + if (!node_size) + return -1; + + *emu_start = real_start; + *emu_end = real_start + node_size; + + printk("configuring real node %u: pfn %lu - %lu\n", + nid, *emu_start, *emu_end); + printk("NUMA emulation, adding %u emulated node(s) to node %u\n", + new_nodes, nid); + + return 0; +} + +void __init numa_emu_setup_nid(int real_nid) +{ + unsigned long start_pfn, end_pfn; + int real_max = 1; + int 
nid, new_nodes = 0; + + if (real_nid >= real_max) + return; + + /* setup emulated nodes */ + for (nid = real_nid + real_max; nid < MAX_NUMNODES; nid += real_max) { + if (numa_emu_new(nid, nodes[real_nid].start >> PAGE_SHIFT, + nodes[real_nid].end >> PAGE_SHIFT, + &start_pfn, &end_pfn)) + break; + + nodes[nid].start = start_pfn << PAGE_SHIFT; + nodes[nid].end = end_pfn << PAGE_SHIFT; + new_nodes++; + } + + if (!new_nodes) + return; + + /* shrink real node */ + if (numa_emu_shrink(real_nid, new_nodes, + nodes[real_nid].start >> PAGE_SHIFT, + nodes[real_nid].end >> PAGE_SHIFT, + &start_pfn, &end_pfn)) + return; + + nodes[real_nid].start = start_pfn << PAGE_SHIFT; + nodes[real_nid].end = end_pfn << PAGE_SHIFT; + + for (nid = real_nid + real_max; nid < MAX_NUMNODES; nid += real_max) { + node_set_online(nid); + + if (!--new_nodes) + break; + } +} #endif +static void __init numa_initmem_doinit(unsigned long start_pfn, + unsigned long end_pfn) +{ #ifdef CONFIG_ACPI_NUMA + nodes_clear(node_online_map); + memset(&nodes, 0, sizeof(nodes)); + + /* + * Parse SRAT to discover nodes. + */ + acpi_numa_init(); + if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT)) return; #endif #ifdef CONFIG_K8_NUMA + nodes_clear(node_online_map); + memset(&nodes, 0, sizeof(nodes)); + if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT)) return; #endif + + nodes_clear(node_online_map); + memset(&nodes, 0, sizeof(nodes)); + printk(KERN_INFO "%s\n", numa_off ? 
"NUMA turned off" : "No NUMA configuration found"); - printk(KERN_INFO "Faking a node at %016lx-%016lx\n", + printk(KERN_INFO "Single node at %016lx-%016lx\n", start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); /* setup dummy node covering all memory */ - memnode_shift = 63; - memnodemap[0] = 0; - nodes_clear(node_online_map); + node_set_online(0); - for (i = 0; i < NR_CPUS; i++) - numa_set_node(i, 0); - node_to_cpumask[0] = cpumask_of_cpu(0); - setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); + nodes[0].start = start_pfn << PAGE_SHIFT; + nodes[0].end = end_pfn << PAGE_SHIFT; +} + +void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) +{ + int i; + + numa_initmem_doinit(start_pfn, end_pfn); + + for_each_online_node(i) + numa_emu_setup_nid(i); + + memnode_shift = compute_hash_shift(nodes); + if (memnode_shift < 0) { + printk(KERN_ERR "No NUMA hash function found. Contact maintainer\n"); + return; + } + printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); + + for_each_online_node(i) + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + numa_init_array(); } __cpuinit void numa_add_cpu(int cpu) @@ -355,7 +465,7 @@ static __init int numa_setup(char *opt) if (!strncmp(opt,"off",3)) numa_off = 1; #ifdef CONFIG_NUMA_EMU - if(!strncmp(opt, "fake=", 5)) { + if(!strncmp(opt, "fake=", 5) && (*(opt + 5))) { numa_fake = simple_strtoul(opt+5,NULL,0); ; if (numa_fake >= MAX_NUMNODES) numa_fake = MAX_NUMNODES; diff -puN arch/x86_64/mm/srat.c~x86_64-fix-fake-numa arch/x86_64/mm/srat.c --- a/arch/x86_64/mm/srat.c~x86_64-fix-fake-numa +++ a/arch/x86_64/mm/srat.c @@ -32,7 +32,6 @@ int acpi_numa __initdata; static struct acpi_table_slit *acpi_slit; static nodemask_t nodes_parsed __initdata; -static struct bootnode nodes[MAX_NUMNODES] __initdata; static struct bootnode nodes_add[MAX_NUMNODES] __initdata; static int found_add_area __initdata; int hotadd_percent __initdata = 0; @@ -384,7 +383,8 @@ int __init 
acpi_scan_nodes(unsigned long if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) { unparse_node(i); node_set_offline(i); - } + } else + node_set_online(i); } if (acpi_numa <= 0) @@ -395,20 +395,8 @@ int __init acpi_scan_nodes(unsigned long return -1; } - memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES); - if (memnode_shift < 0) { - printk(KERN_ERR - "SRAT: No NUMA node hash function found. Contact maintainer\n"); - bad_srat(); - return -1; - } - /* Finally register nodes */ for_each_node_mask(i, nodes_parsed) - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - /* Try again in case setup_node_bootmem missed one due - to missing bootmem */ - for_each_node_mask(i, nodes_parsed) if (!node_online(i)) setup_node_bootmem(i, nodes[i].start, nodes[i].end); @@ -418,7 +406,6 @@ int __init acpi_scan_nodes(unsigned long if (!node_isset(cpu_to_node[i], nodes_parsed)) numa_set_node(i, NUMA_NO_NODE); } - numa_init_array(); return 0; } diff -puN include/asm-x86_64/numa.h~x86_64-fix-fake-numa include/asm-x86_64/numa.h --- a/include/asm-x86_64/numa.h~x86_64-fix-fake-numa +++ a/include/asm-x86_64/numa.h @@ -7,7 +7,7 @@ struct bootnode { u64 start,end; }; -extern int compute_hash_shift(struct bootnode *nodes, int numnodes); +extern int compute_hash_shift(struct bootnode *nodes); #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) @@ -20,6 +20,7 @@ extern void srat_reserve_add_area(int no extern int hotadd_percent; extern unsigned char apicid_to_node[256]; +extern struct bootnode nodes[MAX_NUMNODES]; #ifdef CONFIG_NUMA extern void __init init_cpu_to_node(void); _ Patches currently in -mm which might be from rientjes@xxxxxxxxxx are x86_64-fix-fake-numa.patch - To unsubscribe from this list: send the line "unsubscribe mm-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html