The patch titled
     Subject: mm: introduce numa_emulation
has been added to the -mm mm-unstable branch.  Its filename is
     mm-introduce-numa_emulation.patch

This patch will shortly appear at
     https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/mm-introduce-numa_emulation.patch

This patch will later appear in the mm-unstable branch at
    git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Before you just go and hit "reply", please:
   a) Consider who else should be cc'ed
   b) Prefer to cc a suitable mailing list as well
   c) Ideally: find the original patch on the mailing list and do a
      reply-to-all to that, adding suitable additional cc's

*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***

The -mm tree is included into linux-next via the mm-everything branch
at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days

------------------------------------------------------
From: "Mike Rapoport (Microsoft)" <rppt@xxxxxxxxxx>
Subject: mm: introduce numa_emulation
Date: Wed, 7 Aug 2024 09:41:03 +0300

Move the numa_emulation code from arch/x86 to mm/numa_emulation.c.  This
code will later be reused by arch_numa.

No functional changes.

Link: https://lkml.kernel.org/r/20240807064110.1003856-20-rppt@xxxxxxxxxx
Signed-off-by: Mike Rapoport (Microsoft) <rppt@xxxxxxxxxx>
Tested-by: Zi Yan <ziy@xxxxxxxxxx> # for x86_64 and arm64
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@xxxxxxxxxx>
Tested-by: Jonathan Cameron <Jonathan.Cameron@xxxxxxxxxx> [arm64 + CXL via QEMU]
Acked-by: Dan Williams <dan.j.williams@xxxxxxxxx>
Cc: Alexander Gordeev <agordeev@xxxxxxxxxxxxx>
Cc: Andreas Larsson <andreas@xxxxxxxxxxx>
Cc: Arnd Bergmann <arnd@xxxxxxxx>
Cc: Borislav Petkov <bp@xxxxxxxxx>
Cc: Catalin Marinas <catalin.marinas@xxxxxxx>
Cc: Christophe Leroy <christophe.leroy@xxxxxxxxxx>
Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
Cc: David Hildenbrand <david@xxxxxxxxxx>
Cc: Davidlohr Bueso <dave@xxxxxxxxxxxx>
Cc: David S. Miller <davem@xxxxxxxxxxxxx>
Cc: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>
Cc: Heiko Carstens <hca@xxxxxxxxxxxxx>
Cc: Huacai Chen <chenhuacai@xxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Jiaxun Yang <jiaxun.yang@xxxxxxxxxxx>
Cc: John Paul Adrian Glaubitz <glaubitz@xxxxxxxxxxxxxxxxxxx>
Cc: Jonathan Corbet <corbet@xxxxxxx>
Cc: Michael Ellerman <mpe@xxxxxxxxxxxxxx>
Cc: Palmer Dabbelt <palmer@xxxxxxxxxxx>
Cc: Rafael J. Wysocki <rafael@xxxxxxxxxx>
Cc: Rob Herring (Arm) <robh@xxxxxxxxxx>
Cc: Samuel Holland <samuel.holland@xxxxxxxxxx>
Cc: Thomas Bogendoerfer <tsbogend@xxxxxxxxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Vasily Gorbik <gor@xxxxxxxxxxxxx>
Cc: Will Deacon <will@xxxxxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---

 arch/x86/Kconfig             |    8 
 arch/x86/include/asm/numa.h  |   12 
 arch/x86/mm/Makefile         |    1 
 arch/x86/mm/numa_emulation.c |  573 ---------------------------------
 arch/x86/mm/numa_internal.h  |   11 
 include/linux/numa_memblks.h |   17 
 mm/Kconfig                   |    8 
 mm/Makefile                  |    1 
 mm/numa_emulation.c          |  571 ++++++++++++++++++++++++++++++++
 9 files changed, 597 insertions(+), 605 deletions(-)

--- a/arch/x86/include/asm/numa.h~mm-introduce-numa_emulation
+++ a/arch/x86/include/asm/numa.h
@@ -65,16 +65,4 @@ static inline void init_gi_nodes(void)
 void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable);
 #endif
 
-#ifdef CONFIG_NUMA_EMU
-int numa_emu_cmdline(char *str);
-void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys,
-					unsigned int nr_emu_nids);
-u64 __init numa_emu_dma_end(void);
-#else /* CONFIG_NUMA_EMU */
-static inline int numa_emu_cmdline(char *str)
-{
-	return -EINVAL;
-}
-#endif /* CONFIG_NUMA_EMU */
-
 #endif /* _ASM_X86_NUMA_H */
--- a/arch/x86/Kconfig~mm-introduce-numa_emulation
+++ a/arch/x86/Kconfig
@@ -1600,14 +1600,6 @@ config X86_64_ACPI_NUMA
 	help
 	  Enable ACPI SRAT based node topology detection.
 
-config NUMA_EMU
-	bool "NUMA emulation"
-	depends on NUMA
-	help
-	  Enable NUMA emulation. A flat machine will be split
-	  into virtual nodes when booted with "numa=fake=N", where N is the
-	  number of nodes. This is only useful for debugging.
-
 config NODES_SHIFT
 	int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
 	range 1 10
--- a/arch/x86/mm/Makefile~mm-introduce-numa_emulation
+++ a/arch/x86/mm/Makefile
@@ -57,7 +57,6 @@ obj-$(CONFIG_MMIOTRACE_TEST)	+= testmmio
 obj-$(CONFIG_NUMA)		+= numa.o numa_$(BITS).o
 obj-$(CONFIG_AMD_NUMA)		+= amdtopology.o
 obj-$(CONFIG_ACPI_NUMA)		+= srat.o
-obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o
 obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
 obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
diff --git a/arch/x86/mm/numa_emulation.c a/arch/x86/mm/numa_emulation.c
deleted file mode 100644
--- a/arch/x86/mm/numa_emulation.c
+++ /dev/null
@@ -1,573 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * NUMA emulation
- */
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/topology.h>
-#include <linux/memblock.h>
-#include <linux/numa_memblks.h>
-#include <asm/dma.h>
-
-#include "numa_internal.h"
-
-#define FAKE_NODE_MIN_SIZE	((u64)32 << 20)
-#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))
-
-static int emu_nid_to_phys[MAX_NUMNODES];
-static char *emu_cmdline __initdata;
-
-int __init numa_emu_cmdline(char *str)
-{
-	emu_cmdline = str;
-	return 0;
-}
-
-static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
-{
-	int i;
-
-	for (i = 0; i < mi->nr_blks; i++)
-		if (mi->blk[i].nid == nid)
-			return i;
-	return -ENOENT;
-}
-
-static u64 __init mem_hole_size(u64 start, u64 end)
-{
-	unsigned long start_pfn = PFN_UP(start);
-	unsigned long end_pfn = PFN_DOWN(end);
-
-	if (start_pfn < end_pfn)
-		return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
-	return 0;
-}
-
-/*
- * Sets up nid to range from @start to @end.  The return value is -errno if
- * something went wrong, 0 otherwise.
- */ -static int __init emu_setup_memblk(struct numa_meminfo *ei, - struct numa_meminfo *pi, - int nid, int phys_blk, u64 size) -{ - struct numa_memblk *eb = &ei->blk[ei->nr_blks]; - struct numa_memblk *pb = &pi->blk[phys_blk]; - - if (ei->nr_blks >= NR_NODE_MEMBLKS) { - pr_err("NUMA: Too many emulated memblks, failing emulation\n"); - return -EINVAL; - } - - ei->nr_blks++; - eb->start = pb->start; - eb->end = pb->start + size; - eb->nid = nid; - - if (emu_nid_to_phys[nid] == NUMA_NO_NODE) - emu_nid_to_phys[nid] = pb->nid; - - pb->start += size; - if (pb->start >= pb->end) { - WARN_ON_ONCE(pb->start > pb->end); - numa_remove_memblk_from(phys_blk, pi); - } - - printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n", - nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20); - return 0; -} - -/* - * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr - * to max_addr. - * - * Returns zero on success or negative on error. - */ -static int __init split_nodes_interleave(struct numa_meminfo *ei, - struct numa_meminfo *pi, - u64 addr, u64 max_addr, int nr_nodes) -{ - nodemask_t physnode_mask = numa_nodes_parsed; - u64 size; - int big; - int nid = 0; - int i, ret; - - if (nr_nodes <= 0) - return -1; - if (nr_nodes > MAX_NUMNODES) { - pr_info("numa=fake=%d too large, reducing to %d\n", - nr_nodes, MAX_NUMNODES); - nr_nodes = MAX_NUMNODES; - } - - /* - * Calculate target node size. x86_32 freaks on __udivdi3() so do - * the division in ulong number of pages and convert back. - */ - size = max_addr - addr - mem_hole_size(addr, max_addr); - size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); - - /* - * Calculate the number of big nodes that can be allocated as a result - * of consolidating the remainder. - */ - big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / - FAKE_NODE_MIN_SIZE; - - size &= FAKE_NODE_MIN_HASH_MASK; - if (!size) { - pr_err("Not enough memory for each node. " - "NUMA emulation disabled.\n"); - return -1; - } - - /* - * Continue to fill physical nodes with fake nodes until there is no - * memory left on any of them. - */ - while (!nodes_empty(physnode_mask)) { - for_each_node_mask(i, physnode_mask) { - u64 dma32_end = numa_emu_dma_end(); - u64 start, limit, end; - int phys_blk; - - phys_blk = emu_find_memblk_by_nid(i, pi); - if (phys_blk < 0) { - node_clear(i, physnode_mask); - continue; - } - start = pi->blk[phys_blk].start; - limit = pi->blk[phys_blk].end; - end = start + size; - - if (nid < big) - end += FAKE_NODE_MIN_SIZE; - - /* - * Continue to add memory to this fake node if its - * non-reserved memory is less than the per-node size. - */ - while (end - start - mem_hole_size(start, end) < size) { - end += FAKE_NODE_MIN_SIZE; - if (end > limit) { - end = limit; - break; - } - } - - /* - * If there won't be at least FAKE_NODE_MIN_SIZE of - * non-reserved memory in ZONE_DMA32 for the next node, - * this one must extend to the boundary. - */ - if (end < dma32_end && dma32_end - end - - mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) - end = dma32_end; - - /* - * If there won't be enough non-reserved memory for the - * next node, this one must extend to the end of the - * physical node. 
- */ - if (limit - end - mem_hole_size(end, limit) < size) - end = limit; - - ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, - phys_blk, - min(end, limit) - start); - if (ret < 0) - return ret; - } - } - return 0; -} - -/* - * Returns the end address of a node so that there is at least `size' amount of - * non-reserved memory or `max_addr' is reached. - */ -static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) -{ - u64 end = start + size; - - while (end - start - mem_hole_size(start, end) < size) { - end += FAKE_NODE_MIN_SIZE; - if (end > max_addr) { - end = max_addr; - break; - } - } - return end; -} - -static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes) -{ - unsigned long max_pfn = PHYS_PFN(max_addr); - unsigned long base_pfn = PHYS_PFN(base); - unsigned long hole_pfns = PHYS_PFN(hole); - - return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes); -} - -/* - * Sets up fake nodes of `size' interleaved over physical nodes ranging from - * `addr' to `max_addr'. - * - * Returns zero on success or negative on error. - */ -static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei, - struct numa_meminfo *pi, - u64 addr, u64 max_addr, u64 size, - int nr_nodes, struct numa_memblk *pblk, - int nid) -{ - nodemask_t physnode_mask = numa_nodes_parsed; - int i, ret, uniform = 0; - u64 min_size; - - if ((!size && !nr_nodes) || (nr_nodes && !pblk)) - return -1; - - /* - * In the 'uniform' case split the passed in physical node by - * nr_nodes, in the non-uniform case, ignore the passed in - * physical block and try to create nodes of at least size - * @size. - * - * In the uniform case, split the nodes strictly by physical - * capacity, i.e. ignore holes. In the non-uniform case account - * for holes and treat @size as a minimum floor. - */ - if (!nr_nodes) - nr_nodes = MAX_NUMNODES; - else { - nodes_clear(physnode_mask); - node_set(pblk->nid, physnode_mask); - uniform = 1; - } - - if (uniform) { - min_size = uniform_size(max_addr, addr, 0, nr_nodes); - size = min_size; - } else { - /* - * The limit on emulated nodes is MAX_NUMNODES, so the - * size per node is increased accordingly if the - * requested size is too small. This creates a uniform - * distribution of node sizes across the entire machine - * (but not necessarily over physical nodes). - */ - min_size = uniform_size(max_addr, addr, - mem_hole_size(addr, max_addr), nr_nodes); - } - min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE); - if (size < min_size) { - pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", - size >> 20, min_size >> 20); - size = min_size; - } - size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE); - - /* - * Fill physical nodes with fake nodes of size until there is no memory - * left on any of them. - */ - while (!nodes_empty(physnode_mask)) { - for_each_node_mask(i, physnode_mask) { - u64 dma32_end = numa_emu_dma_end(); - u64 start, limit, end; - int phys_blk; - - phys_blk = emu_find_memblk_by_nid(i, pi); - if (phys_blk < 0) { - node_clear(i, physnode_mask); - continue; - } - - start = pi->blk[phys_blk].start; - limit = pi->blk[phys_blk].end; - - if (uniform) - end = start + size; - else - end = find_end_of_node(start, limit, size); - /* - * If there won't be at least FAKE_NODE_MIN_SIZE of - * non-reserved memory in ZONE_DMA32 for the next node, - * this one must extend to the boundary. 
- */ - if (end < dma32_end && dma32_end - end - - mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) - end = dma32_end; - - /* - * If there won't be enough non-reserved memory for the - * next node, this one must extend to the end of the - * physical node. - */ - if ((limit - end - mem_hole_size(end, limit) < size) - && !uniform) - end = limit; - - ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, - phys_blk, - min(end, limit) - start); - if (ret < 0) - return ret; - } - } - return nid; -} - -static int __init split_nodes_size_interleave(struct numa_meminfo *ei, - struct numa_meminfo *pi, - u64 addr, u64 max_addr, u64 size) -{ - return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size, - 0, NULL, 0); -} - -static int __init setup_emu2phys_nid(int *dfl_phys_nid) -{ - int i, max_emu_nid = 0; - - *dfl_phys_nid = NUMA_NO_NODE; - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { - if (emu_nid_to_phys[i] != NUMA_NO_NODE) { - max_emu_nid = i; - if (*dfl_phys_nid == NUMA_NO_NODE) - *dfl_phys_nid = emu_nid_to_phys[i]; - } - } - - return max_emu_nid; -} - -/** - * numa_emulation - Emulate NUMA nodes - * @numa_meminfo: NUMA configuration to massage - * @numa_dist_cnt: The size of the physical NUMA distance table - * - * Emulate NUMA nodes according to the numa=fake kernel parameter. - * @numa_meminfo contains the physical memory configuration and is modified - * to reflect the emulated configuration on success. @numa_dist_cnt is - * used to determine the size of the physical distance table. - * - * On success, the following modifications are made. - * - * - @numa_meminfo is updated to reflect the emulated nodes. - * - * - __apicid_to_node[] is updated such that APIC IDs are mapped to the - * emulated nodes. - * - * - NUMA distance table is rebuilt to represent distances between emulated - * nodes. The distances are determined considering how emulated nodes - * are mapped to physical nodes and match the actual distances. - * - * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical - * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). - * - * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with - * identity mapping and no other modification is made. - */ -void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) -{ - static struct numa_meminfo ei __initdata; - static struct numa_meminfo pi __initdata; - const u64 max_addr = PFN_PHYS(max_pfn); - u8 *phys_dist = NULL; - size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); - int max_emu_nid, dfl_phys_nid; - int i, j, ret; - - if (!emu_cmdline) - goto no_emu; - - memset(&ei, 0, sizeof(ei)); - pi = *numa_meminfo; - - for (i = 0; i < MAX_NUMNODES; i++) - emu_nid_to_phys[i] = NUMA_NO_NODE; - - /* - * If the numa=fake command-line contains a 'M' or 'G', it represents - * the fixed node size. Otherwise, if it is just a single number N, - * split the system RAM into N fake nodes. - */ - if (strchr(emu_cmdline, 'U')) { - nodemask_t physnode_mask = numa_nodes_parsed; - unsigned long n; - int nid = 0; - - n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); - ret = -1; - for_each_node_mask(i, physnode_mask) { - /* - * The reason we pass in blk[0] is due to - * numa_remove_memblk_from() called by - * emu_setup_memblk() will delete entry 0 - * and then move everything else up in the pi.blk - * array. Therefore we should always be looking - * at blk[0]. 
- */ - ret = split_nodes_size_interleave_uniform(&ei, &pi, - pi.blk[0].start, pi.blk[0].end, 0, - n, &pi.blk[0], nid); - if (ret < 0) - break; - if (ret < n) { - pr_info("%s: phys: %d only got %d of %ld nodes, failing\n", - __func__, i, ret, n); - ret = -1; - break; - } - nid = ret; - } - } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { - u64 size; - - size = memparse(emu_cmdline, &emu_cmdline); - ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); - } else { - unsigned long n; - - n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); - ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); - } - if (*emu_cmdline == ':') - emu_cmdline++; - - if (ret < 0) - goto no_emu; - - if (numa_cleanup_meminfo(&ei) < 0) { - pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); - goto no_emu; - } - - /* copy the physical distance table */ - if (numa_dist_cnt) { - phys_dist = memblock_alloc(phys_size, PAGE_SIZE); - if (!phys_dist) { - pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); - goto no_emu; - } - - for (i = 0; i < numa_dist_cnt; i++) - for (j = 0; j < numa_dist_cnt; j++) - phys_dist[i * numa_dist_cnt + j] = - node_distance(i, j); - } - - /* - * Determine the max emulated nid and the default phys nid to use - * for unmapped nodes. - */ - max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); - - /* commit */ - *numa_meminfo = ei; - - /* Make sure numa_nodes_parsed only contains emulated nodes */ - nodes_clear(numa_nodes_parsed); - for (i = 0; i < ARRAY_SIZE(ei.blk); i++) - if (ei.blk[i].start != ei.blk[i].end && - ei.blk[i].nid != NUMA_NO_NODE) - node_set(ei.blk[i].nid, numa_nodes_parsed); - - numa_emu_update_cpu_to_node(emu_nid_to_phys, ARRAY_SIZE(emu_nid_to_phys)); - - /* make sure all emulated nodes are mapped to a physical node */ - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) - if (emu_nid_to_phys[i] == NUMA_NO_NODE) - emu_nid_to_phys[i] = dfl_phys_nid; - - /* transform distance table */ - numa_reset_distance(); - for (i = 0; i < max_emu_nid + 1; i++) { - for (j = 0; j < max_emu_nid + 1; j++) { - int physi = emu_nid_to_phys[i]; - int physj = emu_nid_to_phys[j]; - int dist; - - if (get_option(&emu_cmdline, &dist) == 2) - ; - else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) - dist = physi == physj ? - LOCAL_DISTANCE : REMOTE_DISTANCE; - else - dist = phys_dist[physi * numa_dist_cnt + physj]; - - numa_set_distance(i, j, dist); - } - } - - /* free the copied physical distance table */ - memblock_free(phys_dist, phys_size); - return; - -no_emu: - /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) - emu_nid_to_phys[i] = i; -} - -#ifndef CONFIG_DEBUG_PER_CPU_MAPS -void numa_add_cpu(unsigned int cpu) -{ - int physnid, nid; - - nid = early_cpu_to_node(cpu); - BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); - - physnid = emu_nid_to_phys[nid]; - - /* - * Map the cpu to each emulated node that is allocated on the physical - * node of the cpu's apic id. 
- */ - for_each_online_node(nid) - if (emu_nid_to_phys[nid] == physnid) - cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); -} - -void numa_remove_cpu(unsigned int cpu) -{ - int i; - - for_each_online_node(i) - cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); -} -#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ -static void numa_set_cpumask(unsigned int cpu, bool enable) -{ - int nid, physnid; - - nid = early_cpu_to_node(cpu); - if (nid == NUMA_NO_NODE) { - /* early_cpu_to_node() already emits a warning and trace */ - return; - } - - physnid = emu_nid_to_phys[nid]; - - for_each_online_node(nid) { - if (emu_nid_to_phys[nid] != physnid) - continue; - - debug_cpumask_set_cpu(cpu, nid, enable); - } -} - -void numa_add_cpu(unsigned int cpu) -{ - numa_set_cpumask(cpu, true); -} - -void numa_remove_cpu(unsigned int cpu) -{ - numa_set_cpumask(cpu, false); -} -#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ --- a/arch/x86/mm/numa_internal.h~mm-introduce-numa_emulation +++ a/arch/x86/mm/numa_internal.h @@ -7,15 +7,4 @@ void __init x86_numa_init(void); -struct numa_meminfo; - -#ifdef CONFIG_NUMA_EMU -void __init numa_emulation(struct numa_meminfo *numa_meminfo, - int numa_dist_cnt); -#else -static inline void numa_emulation(struct numa_meminfo *numa_meminfo, - int numa_dist_cnt) -{ } -#endif - #endif /* __X86_MM_NUMA_INTERNAL_H */ --- a/include/linux/numa_memblks.h~mm-introduce-numa_emulation +++ a/include/linux/numa_memblks.h @@ -34,6 +34,23 @@ int __init numa_register_meminfo(struct void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, const struct numa_meminfo *mi); +#ifdef CONFIG_NUMA_EMU +int numa_emu_cmdline(char *str); +void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys, + unsigned int nr_emu_nids); +u64 __init numa_emu_dma_end(void); +void __init numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt); +#else +static inline void numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt) +{ } +static inline int numa_emu_cmdline(char *str) +{ + return -EINVAL; +} +#endif /* CONFIG_NUMA_EMU */ + #endif /* CONFIG_NUMA_MEMBLKS */ #endif /* __NUMA_MEMBLKS_H */ --- a/mm/Kconfig~mm-introduce-numa_emulation +++ a/mm/Kconfig @@ -1270,6 +1270,14 @@ config EXECMEM config NUMA_MEMBLKS bool +config NUMA_EMU + bool "NUMA emulation" + depends on NUMA_MEMBLKS + help + Enable NUMA emulation. A flat machine will be split + into virtual nodes when booted with "numa=fake=N", where N is the + number of nodes. This is only useful for debugging. 
+ source "mm/damon/Kconfig" endmenu --- a/mm/Makefile~mm-introduce-numa_emulation +++ a/mm/Makefile @@ -119,6 +119,7 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += e obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_NUMA_MEMBLKS) += numa_memblks.o +obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o diff --git a/mm/numa_emulation.c a/mm/numa_emulation.c new file mode 100644 --- /dev/null +++ a/mm/numa_emulation.c @@ -0,0 +1,571 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NUMA emulation + */ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/topology.h> +#include <linux/memblock.h> +#include <linux/numa_memblks.h> +#include <asm/numa.h> + +#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) +#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) + +static int emu_nid_to_phys[MAX_NUMNODES]; +static char *emu_cmdline __initdata; + +int __init numa_emu_cmdline(char *str) +{ + emu_cmdline = str; + return 0; +} + +static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < mi->nr_blks; i++) + if (mi->blk[i].nid == nid) + return i; + return -ENOENT; +} + +static u64 __init mem_hole_size(u64 start, u64 end) +{ + unsigned long start_pfn = PFN_UP(start); + unsigned long end_pfn = PFN_DOWN(end); + + if (start_pfn < end_pfn) + return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn)); + return 0; +} + +/* + * Sets up nid to range from @start to @end. The return value is -errno if + * something went wrong, 0 otherwise. + */ +static int __init emu_setup_memblk(struct numa_meminfo *ei, + struct numa_meminfo *pi, + int nid, int phys_blk, u64 size) +{ + struct numa_memblk *eb = &ei->blk[ei->nr_blks]; + struct numa_memblk *pb = &pi->blk[phys_blk]; + + if (ei->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("NUMA: Too many emulated memblks, failing emulation\n"); + return -EINVAL; + } + + ei->nr_blks++; + eb->start = pb->start; + eb->end = pb->start + size; + eb->nid = nid; + + if (emu_nid_to_phys[nid] == NUMA_NO_NODE) + emu_nid_to_phys[nid] = pb->nid; + + pb->start += size; + if (pb->start >= pb->end) { + WARN_ON_ONCE(pb->start > pb->end); + numa_remove_memblk_from(phys_blk, pi); + } + + printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n", + nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20); + return 0; +} + +/* + * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr + * to max_addr. + * + * Returns zero on success or negative on error. + */ +static int __init split_nodes_interleave(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, int nr_nodes) +{ + nodemask_t physnode_mask = numa_nodes_parsed; + u64 size; + int big; + int nid = 0; + int i, ret; + + if (nr_nodes <= 0) + return -1; + if (nr_nodes > MAX_NUMNODES) { + pr_info("numa=fake=%d too large, reducing to %d\n", + nr_nodes, MAX_NUMNODES); + nr_nodes = MAX_NUMNODES; + } + + /* + * Calculate target node size. x86_32 freaks on __udivdi3() so do + * the division in ulong number of pages and convert back. + */ + size = max_addr - addr - mem_hole_size(addr, max_addr); + size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); + + /* + * Calculate the number of big nodes that can be allocated as a result + * of consolidating the remainder. 
+ */ + big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / + FAKE_NODE_MIN_SIZE; + + size &= FAKE_NODE_MIN_HASH_MASK; + if (!size) { + pr_err("Not enough memory for each node. " + "NUMA emulation disabled.\n"); + return -1; + } + + /* + * Continue to fill physical nodes with fake nodes until there is no + * memory left on any of them. + */ + while (!nodes_empty(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 dma32_end = numa_emu_dma_end(); + u64 start, limit, end; + int phys_blk; + + phys_blk = emu_find_memblk_by_nid(i, pi); + if (phys_blk < 0) { + node_clear(i, physnode_mask); + continue; + } + start = pi->blk[phys_blk].start; + limit = pi->blk[phys_blk].end; + end = start + size; + + if (nid < big) + end += FAKE_NODE_MIN_SIZE; + + /* + * Continue to add memory to this fake node if its + * non-reserved memory is less than the per-node size. + */ + while (end - start - mem_hole_size(start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > limit) { + end = limit; + break; + } + } + + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if (limit - end - mem_hole_size(end, limit) < size) + end = limit; + + ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, + phys_blk, + min(end, limit) - start); + if (ret < 0) + return ret; + } + } + return 0; +} + +/* + * Returns the end address of a node so that there is at least `size' amount of + * non-reserved memory or `max_addr' is reached. + */ +static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) +{ + u64 end = start + size; + + while (end - start - mem_hole_size(start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > max_addr) { + end = max_addr; + break; + } + } + return end; +} + +static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes) +{ + unsigned long max_pfn = PHYS_PFN(max_addr); + unsigned long base_pfn = PHYS_PFN(base); + unsigned long hole_pfns = PHYS_PFN(hole); + + return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes); +} + +/* + * Sets up fake nodes of `size' interleaved over physical nodes ranging from + * `addr' to `max_addr'. + * + * Returns zero on success or negative on error. + */ +static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, u64 size, + int nr_nodes, struct numa_memblk *pblk, + int nid) +{ + nodemask_t physnode_mask = numa_nodes_parsed; + int i, ret, uniform = 0; + u64 min_size; + + if ((!size && !nr_nodes) || (nr_nodes && !pblk)) + return -1; + + /* + * In the 'uniform' case split the passed in physical node by + * nr_nodes, in the non-uniform case, ignore the passed in + * physical block and try to create nodes of at least size + * @size. + * + * In the uniform case, split the nodes strictly by physical + * capacity, i.e. ignore holes. In the non-uniform case account + * for holes and treat @size as a minimum floor. 
+ */ + if (!nr_nodes) + nr_nodes = MAX_NUMNODES; + else { + nodes_clear(physnode_mask); + node_set(pblk->nid, physnode_mask); + uniform = 1; + } + + if (uniform) { + min_size = uniform_size(max_addr, addr, 0, nr_nodes); + size = min_size; + } else { + /* + * The limit on emulated nodes is MAX_NUMNODES, so the + * size per node is increased accordingly if the + * requested size is too small. This creates a uniform + * distribution of node sizes across the entire machine + * (but not necessarily over physical nodes). + */ + min_size = uniform_size(max_addr, addr, + mem_hole_size(addr, max_addr), nr_nodes); + } + min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE); + if (size < min_size) { + pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", + size >> 20, min_size >> 20); + size = min_size; + } + size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE); + + /* + * Fill physical nodes with fake nodes of size until there is no memory + * left on any of them. + */ + while (!nodes_empty(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 dma32_end = numa_emu_dma_end(); + u64 start, limit, end; + int phys_blk; + + phys_blk = emu_find_memblk_by_nid(i, pi); + if (phys_blk < 0) { + node_clear(i, physnode_mask); + continue; + } + + start = pi->blk[phys_blk].start; + limit = pi->blk[phys_blk].end; + + if (uniform) + end = start + size; + else + end = find_end_of_node(start, limit, size); + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if ((limit - end - mem_hole_size(end, limit) < size) + && !uniform) + end = limit; + + ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, + phys_blk, + min(end, limit) - start); + if (ret < 0) + return ret; + } + } + return nid; +} + +static int __init split_nodes_size_interleave(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, u64 size) +{ + return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size, + 0, NULL, 0); +} + +static int __init setup_emu2phys_nid(int *dfl_phys_nid) +{ + int i, max_emu_nid = 0; + + *dfl_phys_nid = NUMA_NO_NODE; + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { + if (emu_nid_to_phys[i] != NUMA_NO_NODE) { + max_emu_nid = i; + if (*dfl_phys_nid == NUMA_NO_NODE) + *dfl_phys_nid = emu_nid_to_phys[i]; + } + } + + return max_emu_nid; +} + +/** + * numa_emulation - Emulate NUMA nodes + * @numa_meminfo: NUMA configuration to massage + * @numa_dist_cnt: The size of the physical NUMA distance table + * + * Emulate NUMA nodes according to the numa=fake kernel parameter. + * @numa_meminfo contains the physical memory configuration and is modified + * to reflect the emulated configuration on success. @numa_dist_cnt is + * used to determine the size of the physical distance table. + * + * On success, the following modifications are made. + * + * - @numa_meminfo is updated to reflect the emulated nodes. + * + * - __apicid_to_node[] is updated such that APIC IDs are mapped to the + * emulated nodes. + * + * - NUMA distance table is rebuilt to represent distances between emulated + * nodes. The distances are determined considering how emulated nodes + * are mapped to physical nodes and match the actual distances. 
+ * + * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical + * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). + * + * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with + * identity mapping and no other modification is made. + */ +void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) +{ + static struct numa_meminfo ei __initdata; + static struct numa_meminfo pi __initdata; + const u64 max_addr = PFN_PHYS(max_pfn); + u8 *phys_dist = NULL; + size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); + int max_emu_nid, dfl_phys_nid; + int i, j, ret; + + if (!emu_cmdline) + goto no_emu; + + memset(&ei, 0, sizeof(ei)); + pi = *numa_meminfo; + + for (i = 0; i < MAX_NUMNODES; i++) + emu_nid_to_phys[i] = NUMA_NO_NODE; + + /* + * If the numa=fake command-line contains a 'M' or 'G', it represents + * the fixed node size. Otherwise, if it is just a single number N, + * split the system RAM into N fake nodes. + */ + if (strchr(emu_cmdline, 'U')) { + nodemask_t physnode_mask = numa_nodes_parsed; + unsigned long n; + int nid = 0; + + n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); + ret = -1; + for_each_node_mask(i, physnode_mask) { + /* + * The reason we pass in blk[0] is due to + * numa_remove_memblk_from() called by + * emu_setup_memblk() will delete entry 0 + * and then move everything else up in the pi.blk + * array. Therefore we should always be looking + * at blk[0]. + */ + ret = split_nodes_size_interleave_uniform(&ei, &pi, + pi.blk[0].start, pi.blk[0].end, 0, + n, &pi.blk[0], nid); + if (ret < 0) + break; + if (ret < n) { + pr_info("%s: phys: %d only got %d of %ld nodes, failing\n", + __func__, i, ret, n); + ret = -1; + break; + } + nid = ret; + } + } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { + u64 size; + + size = memparse(emu_cmdline, &emu_cmdline); + ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); + } else { + unsigned long n; + + n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); + ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); + } + if (*emu_cmdline == ':') + emu_cmdline++; + + if (ret < 0) + goto no_emu; + + if (numa_cleanup_meminfo(&ei) < 0) { + pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); + goto no_emu; + } + + /* copy the physical distance table */ + if (numa_dist_cnt) { + phys_dist = memblock_alloc(phys_size, PAGE_SIZE); + if (!phys_dist) { + pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); + goto no_emu; + } + + for (i = 0; i < numa_dist_cnt; i++) + for (j = 0; j < numa_dist_cnt; j++) + phys_dist[i * numa_dist_cnt + j] = + node_distance(i, j); + } + + /* + * Determine the max emulated nid and the default phys nid to use + * for unmapped nodes. 
+ */ + max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); + + /* commit */ + *numa_meminfo = ei; + + /* Make sure numa_nodes_parsed only contains emulated nodes */ + nodes_clear(numa_nodes_parsed); + for (i = 0; i < ARRAY_SIZE(ei.blk); i++) + if (ei.blk[i].start != ei.blk[i].end && + ei.blk[i].nid != NUMA_NO_NODE) + node_set(ei.blk[i].nid, numa_nodes_parsed); + + numa_emu_update_cpu_to_node(emu_nid_to_phys, ARRAY_SIZE(emu_nid_to_phys)); + + /* make sure all emulated nodes are mapped to a physical node */ + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + if (emu_nid_to_phys[i] == NUMA_NO_NODE) + emu_nid_to_phys[i] = dfl_phys_nid; + + /* transform distance table */ + numa_reset_distance(); + for (i = 0; i < max_emu_nid + 1; i++) { + for (j = 0; j < max_emu_nid + 1; j++) { + int physi = emu_nid_to_phys[i]; + int physj = emu_nid_to_phys[j]; + int dist; + + if (get_option(&emu_cmdline, &dist) == 2) + ; + else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) + dist = physi == physj ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + else + dist = phys_dist[physi * numa_dist_cnt + physj]; + + numa_set_distance(i, j, dist); + } + } + + /* free the copied physical distance table */ + memblock_free(phys_dist, phys_size); + return; + +no_emu: + /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + emu_nid_to_phys[i] = i; +} + +#ifndef CONFIG_DEBUG_PER_CPU_MAPS +void numa_add_cpu(unsigned int cpu) +{ + int physnid, nid; + + nid = early_cpu_to_node(cpu); + BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); + + physnid = emu_nid_to_phys[nid]; + + /* + * Map the cpu to each emulated node that is allocated on the physical + * node of the cpu's apic id. + */ + for_each_online_node(nid) + if (emu_nid_to_phys[nid] == physnid) + cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); +} + +void numa_remove_cpu(unsigned int cpu) +{ + int i; + + for_each_online_node(i) + cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); +} +#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ +static void numa_set_cpumask(unsigned int cpu, bool enable) +{ + int nid, physnid; + + nid = early_cpu_to_node(cpu); + if (nid == NUMA_NO_NODE) { + /* early_cpu_to_node() already emits a warning and trace */ + return; + } + + physnid = emu_nid_to_phys[nid]; + + for_each_online_node(nid) { + if (emu_nid_to_phys[nid] != physnid) + continue; + + debug_cpumask_set_cpu(cpu, nid, enable); + } +} + +void numa_add_cpu(unsigned int cpu) +{ + numa_set_cpumask(cpu, true); +} + +void numa_remove_cpu(unsigned int cpu) +{ + numa_set_cpumask(cpu, false); +} +#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ _ Patches currently in -mm which might be from rppt@xxxxxxxxxx are mm-move-kernel-numac-to-mm.patch mips-sgi-ip27-make-node_data-the-same-as-on-all-other-architectures.patch mips-sgi-ip27-ensure-node_possible_map-only-contains-valid-nodes.patch mips-sgi-ip27-drop-have_arch_nodedata_extension.patch mips-loongson64-rename-__node_data-to-node_data.patch mips-loongson64-drop-have_arch_nodedata_extension.patch arch-mm-move-definition-of-node_data-to-generic-code.patch mm-drop-config_have_arch_nodedata_extension.patch arch-mm-pull-out-allocation-of-node_data-to-generic-code.patch x86-numa-simplify-numa_distance-allocation.patch x86-numa-use-get_pfn_range_for_nid-to-verify-that-node-spans-memory.patch x86-numa-move-fake_node_-defines-to-numa_emu.patch x86-numa_emu-simplify-allocation-of-phys_dist.patch x86-numa_emu-split-__apicid_to_node-update-to-a-helper-function.patch 
x86-numa_emu-use-a-helper-function-to-get-max_dma32_pfn.patch
x86-numa-numa_addremove_cpu-make-cpu-parameter-unsigned.patch
mm-introduce-numa_memblks.patch
mm-move-numa_distance-and-related-code-from-x86-to-numa_memblks.patch
mm-introduce-numa_emulation.patch
mm-numa_memblks-introduce-numa_memblks_init.patch
mm-numa_memblks-make-several-functions-and-variables-static.patch
mm-numa_memblks-use-memblock_startend_of_dram-when-sanitizing-meminfo.patch
of-numa-return-einval-when-no-numa-node-id-is-found.patch
arch_numa-switch-over-to-numa_memblks.patch
arch_numa-switch-over-to-numa_memblks-fix.patch
mm-make-range-to-target_node-lookup-facility-a-part-of-numa_memblks.patch
docs-move-numa=fake-description-to-kernel-parameterstxt.patch
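
The numa=fake= forms parsed by numa_emulation() above correspond to boot
options along these lines (illustrative examples only, not part of this
patch; the authoritative syntax lives in kernel-parameters.txt):

    numa=fake=8     split system RAM into 8 emulated nodes
    numa=fake=2G    create emulated nodes of roughly 2G each, interleaved
                    over the physical nodes
    numa=fake=4U    split each physical node into 4 emulated nodes of
                    uniform size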