The patch titled x86_64: fix numa-fake allocation has been removed from the -mm tree. Its filename was 1-fix-numa-fake-allocation.patch This patch was dropped because it is obsolete ------------------------------------------------------ Subject: x86_64: fix numa-fake allocation From: David Rientjes <rientjes@xxxxxxxxxx> Fixes numa=fake on x86_64 so that nodes can be allocated in 4M chunks across 64G of total memory. Any node size that does not fall on a 4M boundary is rounded down to the nearest 4M. populate_memnodemap() is replaced with populate_physnode_map(), which fills physnode_map[], a signed array of bytes mapping page frame numbers to node IDs. Memory is spread evenly among the nodes specified on the command line. Reserved memory is taken into account so that there is (hopefully) an even spread of available pages among the nodes. Since memory can seldom be divided evenly against the maximum system RAM at boot, there may be larger nodes (+4M) to balance the distribution. These nodes are moved to lower memory so that they are collected together at one end and may assist in ZONE_DMA if NUMA emulation is later used in combination with CPUsets. 
Signed-off-by: David Rientjes <rientjes@xxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxx> --- arch/x86_64/mm/k8topology.c | 10 +- arch/x86_64/mm/numa.c | 121 ++++++++++++---------------------- arch/x86_64/mm/srat.c | 17 +--- include/asm-x86_64/mmzone.h | 31 ++++---- include/asm-x86_64/numa.h | 2 5 files changed, 72 insertions(+), 109 deletions(-) diff -puN arch/x86_64/mm/k8topology.c~1-fix-numa-fake-allocation arch/x86_64/mm/k8topology.c --- a/arch/x86_64/mm/k8topology.c~1-fix-numa-fake-allocation +++ a/arch/x86_64/mm/k8topology.c @@ -138,6 +138,9 @@ int __init k8_scan_nodes(unsigned long s prevbase,base); return -1; } + + /* Round down to nearest 4MB for hash function */ + limit = ((limit - base) & NODE_HASH_MASK) + base; printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", nodeid, base, limit); @@ -155,12 +158,7 @@ int __init k8_scan_nodes(unsigned long s if (!found) return -1; - memnode_shift = compute_hash_shift(nodes, 8); - if (memnode_shift < 0) { - printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); - return -1; - } - printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); + populate_physnode_map(nodes, 8); for (i = 0; i < 8; i++) { if (nodes[i].start != nodes[i].end) { diff -puN arch/x86_64/mm/numa.c~1-fix-numa-fake-allocation arch/x86_64/mm/numa.c --- a/arch/x86_64/mm/numa.c~1-fix-numa-fake-allocation +++ a/arch/x86_64/mm/numa.c @@ -25,8 +25,6 @@ struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; bootmem_data_t plat_node_bdata[MAX_NUMNODES]; -struct memnode memnode; - unsigned char cpu_to_node[NR_CPUS] __read_mostly = { [0 ... 
NR_CPUS-1] = NUMA_NO_NODE }; @@ -35,62 +33,23 @@ unsigned char apicid_to_node[MAX_LOCAL_A }; cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; +s8 physnode_map[MAX_ELEMENTS] __read_mostly; int numa_off __initdata; - -/* - * Given a shift value, try to populate memnodemap[] - * Returns : - * 1 if OK - * 0 if memnodmap[] too small (of shift too small) - * -1 if node overlap or lost ram (shift too big) - */ -static int __init -populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) +void __init populate_physnode_map(struct bootnode *nodes, int numnodes) { int i; - int res = -1; - unsigned long addr, end; + unsigned long addr, end, pfn; - if (shift >= 64) - return -1; - memset(memnodemap, 0xff, sizeof(memnodemap)); for (i = 0; i < numnodes; i++) { addr = nodes[i].start; end = nodes[i].end; if (addr >= end) continue; - if ((end >> shift) >= NODEMAPSIZE) - return 0; - do { - if (memnodemap[addr >> shift] != 0xff) - return -1; - memnodemap[addr >> shift] = i; - addr += (1UL << shift); - } while (addr < end); - res = 1; + for (pfn = addr >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); + pfn += PAGES_PER_ELEMENT) + physnode_map[pfn / PAGES_PER_ELEMENT] = i; } - return res; -} - -int __init compute_hash_shift(struct bootnode *nodes, int numnodes) -{ - int shift = 20; - - while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0) - shift++; - - printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", - shift); - - if (populate_memnodemap(nodes, numnodes, shift) != 1) { - printk(KERN_INFO - "Your memory is not aligned you need to rebuild your kernel " - "with a bigger NODEMAPSIZE shift=%d\n", - shift); - return -1; - } - return shift; } #ifdef CONFIG_SPARSEMEM @@ -227,38 +186,55 @@ int numa_fake __initdata = 0; /* Numa emulation */ static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn) { - int i; - struct bootnode nodes[MAX_NUMNODES]; - unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake; - - /* Kludge needed for the hash 
function */ - if (hweight64(sz) > 1) { - unsigned long x = 1; - while ((x << 1) < sz) - x <<= 1; - if (x < sz/2) - printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n"); - sz = x; - } + int i, big; + struct bootnode nodes[numa_fake]; + u64 addr = start_pfn << PAGE_SHIFT; + u64 sz = ((end_pfn - start_pfn - e820_hole_size(start_pfn, end_pfn)) << + PAGE_SHIFT) / numa_fake; + + /* + * Calculate the number of big nodes that can be allocated as a result + * of consolidating our masked leftovers. + */ + big = ((sz & ~NODE_HASH_MASK) * numa_fake) / NODE_MIN_SIZE; + + /* Round down to nearest 4MB for hash function */ + sz &= NODE_HASH_MASK; + if (sz == 0) { + printk(KERN_ERR "Not enough memory allotted for each node. " + "Numa emulation disabled.\n"); + return -1; + } memset(&nodes,0,sizeof(nodes)); for (i = 0; i < numa_fake; i++) { - nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz; - if (i == numa_fake-1) - sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start; - nodes[i].end = nodes[i].start + sz; + nodes[i].start = addr; + addr += sz; + if (i < big) + addr += NODE_MIN_SIZE; + /* + * The final node cannot be over-allocated so give it the + * remaining memory available and ignore asymmetry. + * + * Other nodes receive roughly the same amount of avaiable + * pages. + */ + if (i == numa_fake - 1) + addr = end_pfn << PAGE_SHIFT; + else + while (addr - nodes[i].start - + (e820_hole_size(nodes[i].start >> PAGE_SHIFT, + addr >> PAGE_SHIFT) << + PAGE_SHIFT) < sz) + addr += NODE_MIN_SIZE; + nodes[i].end = addr; printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", i, nodes[i].start, nodes[i].end, (nodes[i].end - nodes[i].start) >> 20); node_set_online(i); } - memnode_shift = compute_hash_shift(nodes, numa_fake); - if (memnode_shift < 0) { - memnode_shift = 0; - printk(KERN_ERR "No NUMA hash function found. 
Emulation disabled.\n"); - return -1; - } + populate_physnode_map(nodes, numa_fake); for_each_online_node(i) setup_node_bootmem(i, nodes[i].start, nodes[i].end); numa_init_array(); @@ -292,8 +268,6 @@ void __init numa_initmem_init(unsigned l start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); /* setup dummy node covering all memory */ - memnode_shift = 63; - memnodemap[0] = 0; nodes_clear(node_online_map); node_set_online(0); for (i = 0; i < NR_CPUS; i++) @@ -396,7 +370,6 @@ void __init init_cpu_to_node(void) EXPORT_SYMBOL(cpu_to_node); EXPORT_SYMBOL(node_to_cpumask); -EXPORT_SYMBOL(memnode); EXPORT_SYMBOL(node_data); #ifdef CONFIG_DISCONTIGMEM @@ -413,8 +386,6 @@ int pfn_valid(unsigned long pfn) if (pfn >= num_physpages) return 0; nid = pfn_to_nid(pfn); - if (nid == 0xff) - return 0; return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid); } EXPORT_SYMBOL(pfn_valid); diff -puN arch/x86_64/mm/srat.c~1-fix-numa-fake-allocation arch/x86_64/mm/srat.c --- a/arch/x86_64/mm/srat.c~1-fix-numa-fake-allocation +++ a/arch/x86_64/mm/srat.c @@ -38,10 +38,6 @@ int hotadd_percent __initdata = 0; #define hotadd_percent 0 /* Ignore all settings */ #endif -/* Too small nodes confuse the VM badly. Usually they result - from BIOS bugs. 
*/ -#define NODE_MIN_SIZE (4*1024*1024) - static __init int setup_node(int pxm) { return acpi_map_pxm_to_node(pxm); @@ -378,11 +374,14 @@ int __init acpi_scan_nodes(unsigned long /* First clean up the node list */ for (i = 0; i < MAX_NUMNODES; i++) { + signed long sz = nodes[i].end - nodes[i].start; cutoff_node(i, start, end); - if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) { + if (sz < NODE_MIN_SIZE) { unparse_node(i); node_set_offline(i); } + /* Round down to nearest 4MB for hash function */ + nodes[i].end = nodes[i].start + (sz & NODE_HASH_MASK); } if (acpi_numa <= 0) @@ -393,13 +392,7 @@ int __init acpi_scan_nodes(unsigned long return -1; } - memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES); - if (memnode_shift < 0) { - printk(KERN_ERR - "SRAT: No NUMA node hash function found. Contact maintainer\n"); - bad_srat(); - return -1; - } + populate_physnode_map(nodes, MAX_NUMNODES); /* Finally register nodes */ for_each_node_mask(i, nodes_parsed) diff -puN include/asm-x86_64/mmzone.h~1-fix-numa-fake-allocation include/asm-x86_64/mmzone.h --- a/include/asm-x86_64/mmzone.h~1-fix-numa-fake-allocation +++ a/include/asm-x86_64/mmzone.h @@ -11,26 +11,27 @@ #include <asm/smp.h> -/* Should really switch to dynamic allocation at some point */ -#define NODEMAPSIZE 0x4fff - -/* Simple perfect hash to map physical addresses to node numbers */ -struct memnode { - int shift; - u8 map[NODEMAPSIZE]; -} ____cacheline_aligned; -extern struct memnode memnode; -#define memnode_shift memnode.shift -#define memnodemap memnode.map +/* + * Generic node memory support with the following assumptions: + * 1) Memory is allocated in 4 MB contiguous chunks + * 2) There is not more than 64 GB of system memory total + * + * 64GB / (4KB / page) = 16777216 pages + */ +#define MAX_NR_PAGES 16777216 +#define MAX_ELEMENTS 16384 +#define PAGES_PER_ELEMENT (MAX_NR_PAGES / MAX_ELEMENTS) +#define NODE_MIN_SIZE (4*1024*1024) +#define NODE_HASH_MASK 0xffffffffffc00000ul +extern s8 
physnode_map[]; extern struct pglist_data *node_data[]; static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) { - unsigned nid; - VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE); - nid = memnodemap[addr >> memnode_shift]; - VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); + int nid; + nid = ((int) physnode_map[(addr >> PAGE_SHIFT) / PAGES_PER_ELEMENT]); + VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); return nid; } diff -puN include/asm-x86_64/numa.h~1-fix-numa-fake-allocation include/asm-x86_64/numa.h --- a/include/asm-x86_64/numa.h~1-fix-numa-fake-allocation +++ a/include/asm-x86_64/numa.h @@ -7,7 +7,7 @@ struct bootnode { u64 start,end; }; -extern int compute_hash_shift(struct bootnode *nodes, int numnodes); +extern void populate_physnode_map(struct bootnode *nodes, int numnodes); #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) _ Patches currently in -mm which might be from rientjes@xxxxxxxxxx are - To unsubscribe from this list: send the line "unsubscribe mm-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html