On Fri, 19 Nov 2010, Shaohui Zheng wrote: > nr_node_ids is the possible node number. when we do regular memory online, > it is oline to a possible node, and it is already counted in to nr_node_ids. > > if you increment nr_node_ids dynamically when node online, it causes a lot of > problems. Many data are initialized according to nr_node_ids. That is our > experience when we debug the emulator. > I think what we'll end up wanting to do is something like this, which adds a numa=possible=<N> parameter for x86; this will add an additional N possible nodes to node_possible_map that we can use to online later. It also adds a new /sys/devices/system/memory/add_node file which takes a typical "size@start" value to hot-add an emulated node. For example, using "mem=2G numa=possible=1" on the command line and doing echo 128M@0x80000000" > /sys/devices/system/memory/add_node would hot-add a node of 128M. Comments? --- diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -33,6 +33,7 @@ s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { int numa_off __initdata; static unsigned long __initdata nodemap_addr; static unsigned long __initdata nodemap_size; +static unsigned long __initdata numa_possible_nodes; /* * Map cpu index to node index @@ -611,7 +612,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, #ifdef CONFIG_NUMA_EMU if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8)) - return; + goto out; nodes_clear(node_possible_map); nodes_clear(node_online_map); #endif @@ -619,14 +620,14 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, #ifdef CONFIG_ACPI_NUMA if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT)) - return; + goto out; nodes_clear(node_possible_map); nodes_clear(node_online_map); #endif #ifdef CONFIG_K8_NUMA if (!numa_off && k8 && !k8_scan_nodes()) - return; + goto out; nodes_clear(node_possible_map); nodes_clear(node_online_map); #endif @@ -646,6 +647,15 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, numa_set_node(i, 0); memblock_x86_register_active_regions(0, start_pfn, last_pfn); setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); +out: __maybe_unused + for (i = 0; i < numa_possible_nodes; i++) { + int nid; + + nid = first_unset_node(node_possible_map); + if (nid == MAX_NUMNODES) + break; + node_set(nid, node_possible_map); + } } unsigned long __init numa_free_all_bootmem(void) @@ -675,6 +685,8 @@ static __init int numa_setup(char *opt) if (!strncmp(opt, "noacpi", 6)) acpi_numa = -1; #endif + if (!strncmp(opt, "possible=", 9)) + numa_possible_nodes = simple_strtoul(opt + 9, NULL, 0); return 0; } early_param("numa", numa_setup); diff --git a/drivers/base/memory.c b/drivers/base/memory.c --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -353,10 +353,44 @@ memory_probe_store(struct class *class, struct class_attribute *attr, } static CLASS_ATTR(probe, S_IWUSR, NULL, memory_probe_store); +static ssize_t +memory_add_node_store(struct class *class, struct class_attribute *attr, + const char *buf, size_t count) +{ + nodemask_t mask; + u64 start, size; + char *p; + int nid; + int ret; + + size = memparse(buf, &p); + if (size < (PAGES_PER_SECTION << PAGE_SHIFT)) + return -EINVAL; + if (*p != '@') + return -EINVAL; + + start = simple_strtoull(p + 1, NULL, 0); + + nodes_andnot(mask, node_possible_map, node_online_map); + nid = first_node(mask); + if (nid == MAX_NUMNODES) + return -EINVAL; + + ret = add_memory(nid, start, size); + return ret ? ret : count; +} +static CLASS_ATTR(add_node, S_IWUSR, NULL, memory_add_node_store); + static int memory_probe_init(void) { - return sysfs_create_file(&memory_sysdev_class.kset.kobj, + int err; + + err = sysfs_create_file(&memory_sysdev_class.kset.kobj, &class_attr_probe.attr); + if (err) + return err; + return sysfs_create_file(&memory_sysdev_class.kset.kobj, + &class_attr_add_node.attr); } #else static inline int memory_probe_init(void) -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxxx For more info on Linux MM, see: http://www.linux-mm.org/ . Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/ Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>