On Fri, 25 Jan 2013 17:42:09 +0800 Tang Chen <tangchen@xxxxxxxxxxxxxx> wrote: > We now provide an option for users who don't want to specify physical > memory address in kernel commandline. > > /* > * For movablemem_map=acpi: > * > * SRAT: |_____| |_____| |_________| |_________| ...... > * node id: 0 1 1 2 > * hotpluggable: n y y n > * movablemem_map: |_____| |_________| > * > * Using movablemem_map, we can prevent memblock from allocating memory > * on ZONE_MOVABLE at boot time. > */ > > So user just specify movablemem_map=acpi, and the kernel will use hotpluggable > info in SRAT to determine which memory ranges should be set as ZONE_MOVABLE. Well, as a reult of my previous hackery, arch/x86/mm/srat.c now looks rather different. Please check it carefully and runtime test this code when it appears in linux-next? /* * ACPI 3.0 based NUMA setup * Copyright 2004 Andi Kleen, SuSE Labs. * * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. * * Called from acpi_numa_init while reading the SRAT and SLIT tables. * Assumes all memory regions belonging to a single proximity domain * are in one chunk. Holes between them will be included in the node. */ #include <linux/kernel.h> #include <linux/acpi.h> #include <linux/mmzone.h> #include <linux/bitmap.h> #include <linux/module.h> #include <linux/topology.h> #include <linux/bootmem.h> #include <linux/memblock.h> #include <linux/mm.h> #include <asm/proto.h> #include <asm/numa.h> #include <asm/e820.h> #include <asm/apic.h> #include <asm/uv/uv.h> int acpi_numa __initdata; static __init int setup_node(int pxm) { return acpi_map_pxm_to_node(pxm); } static __init void bad_srat(void) { printk(KERN_ERR "SRAT: SRAT not used.\n"); acpi_numa = -1; } static __init inline int srat_disabled(void) { return acpi_numa < 0; } /* Callback for SLIT parsing */ void __init acpi_numa_slit_init(struct acpi_table_slit *slit) { int i, j; for (i = 0; i < slit->locality_count; i++) for (j = 0; j < slit->locality_count; j++) numa_set_distance(pxm_to_node(i), pxm_to_node(j), slit->entry[slit->locality_count * i + j]); } /* Callback for Proximity Domain -> x2APIC mapping */ void __init acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) { int pxm, node; int apic_id; if (srat_disabled()) return; if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) { bad_srat(); return; } if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) return; pxm = pa->proximity_domain; apic_id = pa->apic_id; if (!apic->apic_id_valid(apic_id)) { printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n", pxm, apic_id); return; } node = setup_node(pxm); if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); bad_srat(); return; } if (apic_id >= MAX_LOCAL_APIC) { printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); return; } set_apicid_to_node(apic_id, node); node_set(node, numa_nodes_parsed); acpi_numa = 1; printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node); } /* Callback for Proximity Domain -> LAPIC mapping */ void __init acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) { int pxm, node; int apic_id; if (srat_disabled()) return; if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { bad_srat(); return; } if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) return; pxm = pa->proximity_domain_lo; if (acpi_srat_revision >= 2) pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8; node = setup_node(pxm); if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); bad_srat(); return; } if (get_uv_system_type() >= UV_X2APIC) apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; else apic_id = pa->apic_id; if (apic_id >= MAX_LOCAL_APIC) { printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); return; } set_apicid_to_node(apic_id, node); node_set(node, numa_nodes_parsed); acpi_numa = 1; printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node); } #ifdef CONFIG_MEMORY_HOTPLUG static inline int save_add_info(void) {return 1;} #else static inline int save_add_info(void) {return 0;} #endif #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP static void __init handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable) { int overlap; unsigned long start_pfn, end_pfn; start_pfn = PFN_DOWN(start); end_pfn = PFN_UP(end); /* * For movablemem_map=acpi: * * SRAT: |_____| |_____| |_________| |_________| ...... * node id: 0 1 1 2 * hotpluggable: n y y n * movablemem_map: |_____| |_________| * * Using movablemem_map, we can prevent memblock from allocating memory * on ZONE_MOVABLE at boot time. */ if (hotpluggable && movablemem_map.acpi) { insert_movablemem_map(start_pfn, end_pfn); goto out; } /* * For movablemem_map=nn[KMG]@ss[KMG]: * * SRAT: |_____| |_____| |_________| |_________| ...... * node id: 0 1 1 2 * user specified: |__| |___| * movablemem_map: |___| |_________| |______| ...... * * Using movablemem_map, we can prevent memblock from allocating memory * on ZONE_MOVABLE at boot time. * * NOTE: In this case, SRAT info will be ingored. */ overlap = movablemem_map_overlap(start_pfn, end_pfn); if (overlap >= 0) { /* * If part of this range is in movablemem_map, we need to * add the range after it to extend the range to the end * of the node, because from the min address specified to * the end of the node will be ZONE_MOVABLE. */ start_pfn = max(start_pfn, movablemem_map.map[overlap].start_pfn); insert_movablemem_map(start_pfn, end_pfn); /* * Set the nodemask, so that if the address range on one node * is not continuse, we can add the subsequent ranges on the * same node into movablemem_map. */ node_set(node, movablemem_map.numa_nodes_hotplug); } else { if (node_isset(node, movablemem_map.numa_nodes_hotplug)) /* * Insert the range if we already have movable ranges * on the same node. */ insert_movablemem_map(start_pfn, end_pfn); } out: return; } #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static inline void handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable) { } #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ int __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { u64 start, end; u32 hotpluggable; int node, pxm; if (srat_disabled()) goto out_err; if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) goto out_err_bad_srat; if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) goto out_err; hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE; if (hotpluggable && !save_add_info()) goto out_err; start = ma->base_address; end = start + ma->length; pxm = ma->proximity_domain; if (acpi_srat_revision <= 1) pxm &= 0xff; node = setup_node(pxm); if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains.\n"); goto out_err_bad_srat; } if (numa_add_memblk(node, start, end) < 0) goto out_err_bad_srat; node_set(node, numa_nodes_parsed); printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx] %s\n", node, pxm, (unsigned long long) start, (unsigned long long) end - 1, hotpluggable ? "Hot Pluggable": ""); handle_movablemem(node, start, end, hotpluggable); return 0; out_err_bad_srat: bad_srat(); out_err: return -1; } void __init acpi_numa_arch_fixup(void) {} int __init x86_acpi_numa_init(void) { int ret; ret = acpi_numa_init(); if (ret < 0) return ret; return srat_disabled() ? -EINVAL : 0; } -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>