On 07/26/2010 05:41 PM, Daniel Kiper wrote:
Hi, Currently there is fully working version. It has been tested on Xen Ver. 4.0.0 in PV guest i386/x86_64 with Linux kernel Ver. 2.6.32.16 and Ver. 2.6.34.1. This patch cleanly applys to Ver. 2.6.34.1
Thanks. I've pushed this into xen.git as xen/memory-hotplug so people can play with it more easily (but I haven't merged it into any of the other branches yet).
Some quick notes:
This patch enables two modes of operation: - enabled by CONFIG_XEN_MEMORY_HOTPLUG config option: - set memory limit for chosen domU from dom0: xm mem-max<domU> <new_memory_size_limit> - add memory in chosen domU: echo<unused_address> > \ /sys/devices/system/memory/probe; memory is added in sections which sizes differ from arch to arch (i386: 512 MiB, x86_64: 128 MiB; it could by checked by cat /sys/devices/system/memory/block_size_bytes; this value is in HEX); it is preffered to choose address at section boundary,
It seems odd that the user should choose the address. Can't it work it out for itself? What if the address is actually in use?
Is there any reason you would want to do this, rather than use the balloon driver method?
- online memory in chosen domU: echo online> \ /sys/devices/system/memory/memory<section_number>/state; <section_number> could be established in following manner: (int)(<unused_address> /<section_size>) - enabled by CONFIG_XEN_BALLOON_MEMORY_HOTPLUG config option: - set memory limit for chosen domU from dom0: xm mem-max<domU> <new_memory_size_limit> - add memory for chosen domU from dom0: xm mem-set<domU> <new_memory_size> If you have a questions please drop me a line. Daniel Signed-off-by: Daniel Kiper<dkiper@xxxxxxxxxxxx> --- arch/x86/Kconfig | 2 +- drivers/base/memory.c | 23 +++++ drivers/xen/Kconfig | 10 ++ drivers/xen/balloon.c | 196 +++++++++++++++++++++++++++++++++++++++- include/linux/memory_hotplug.h | 8 ++ include/xen/balloon.h | 6 ++ mm/Kconfig | 9 ++ mm/memory_hotplug.c | 140 ++++++++++++++++++++++++++++ 8 files changed, 390 insertions(+), 4 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9458685..38434da 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1273,7 +1273,7 @@ config ARCH_SELECT_MEMORY_MODEL depends on ARCH_SPARSEMEM_ENABLE config ARCH_MEMORY_PROBE - def_bool X86_64 + def_bool y depends on MEMORY_HOTPLUG config ILLEGAL_POINTER_VALUE diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 933442f..709457b 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -27,6 +27,14 @@ #include<asm/atomic.h> #include<asm/uaccess.h> +#ifdef CONFIG_XEN_MEMORY_HOTPLUG +#include<xen/xen.h> +#endif + +#if defined(CONFIG_XEN_MEMORY_HOTPLUG)&& defined(CONFIG_XEN_BALLOON) +#include<xen/balloon.h> +#endif
It's not necessary to #ifdef #includes.
+ #define MEMORY_CLASS_NAME "memory" static struct sysdev_class memory_sysdev_class = { @@ -215,6 +223,10 @@ memory_block_action(struct memory_block *mem, unsigned long action) case MEM_ONLINE: start_pfn = page_to_pfn(first_page); ret = online_pages(start_pfn, PAGES_PER_SECTION); +#if defined(CONFIG_XEN_MEMORY_HOTPLUG)&& defined(CONFIG_XEN_BALLOON) + if (xen_domain()&& !ret) + balloon_update_stats(PAGES_PER_SECTION); +#endif
We need to work out a way to avoid having these kinds of Xen-specific calls in generic code.
Also, as a matter of style, I presume CONFIG_XEN_MEMORY_HOTPLUG already depends on CONFIG_XEN_BALLOON (and if not, it should), so just testing CONFIG_XEN_MEMORY_HOTPLUG should be enough.
break; case MEM_OFFLINE: mem->state = MEM_GOING_OFFLINE; @@ -225,6 +237,10 @@ memory_block_action(struct memory_block *mem, unsigned long action) mem->state = old_state; break; } +#if defined(CONFIG_XEN_MEMORY_HOTPLUG)&& defined(CONFIG_XEN_BALLOON) + if (xen_domain()) + balloon_update_stats(-PAGES_PER_SECTION); +#endif break; default: WARN(1, KERN_WARNING "%s(%p, %ld) unknown action: %ld\n", @@ -341,6 +357,13 @@ memory_probe_store(struct class *class, struct class_attribute *attr, phys_addr = simple_strtoull(buf, NULL, 0); +#ifdef CONFIG_XEN_MEMORY_HOTPLUG + if (xen_domain()) { + ret = xen_memory_probe(phys_addr); + return ret ? ret : count; + } +#endif + nid = memory_add_physaddr_to_nid(phys_addr); ret = add_memory(nid, phys_addr, PAGES_PER_SECTION<< PAGE_SHIFT); diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index fad3df2..9713048 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -9,6 +9,16 @@ config XEN_BALLOON the system to expand the domain's memory allocation, or alternatively return unneeded memory to the system. +config XEN_BALLOON_MEMORY_HOTPLUG + bool "Xen memory balloon driver with memory hotplug support" + depends on EXPERIMENTAL&& XEN_BALLOON&& MEMORY_HOTPLUG
(EXPERIMENTAL is pretty useless)
+ default n + help + Xen memory balloon driver with memory hotplug support allows expanding + memory available for the system above limit declared at system startup. + It is very useful on critical systems which require long run without + rebooting. + config XEN_SCRUB_PAGES bool "Scrub pages before returning them to system" depends on XEN_BALLOON diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 1a0d8c2..f80bba0 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -6,6 +6,7 @@ * Copyright (c) 2003, B Dragovic * Copyright (c) 2003-2004, M Williamson, K Fraser * Copyright (c) 2005 Dan M. Smith, IBM Corporation + * Copyright (c) 2010 Daniel Kiper * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version 2 @@ -61,6 +62,10 @@ #include<xen/features.h> #include<xen/page.h> +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +#include<linux/memory.h> +#endif
Again re #ifdefing #includes.
+ #define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) #define BALLOON_CLASS_NAME "xen_memory" @@ -77,6 +82,11 @@ struct balloon_stats { /* Number of pages in high- and low-memory balloons. */ unsigned long balloon_low; unsigned long balloon_high; +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + unsigned long boot_max_pfn; + u64 hotplug_start_paddr; + u64 hotplug_size; +#endif }; static DEFINE_MUTEX(balloon_mutex); @@ -184,6 +194,12 @@ static void balloon_alarm(unsigned long unused) schedule_work(&balloon_worker); } +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +static inline unsigned long current_target(void) +{ + return balloon_stats.target_pages;
Why does this need its own version?
+} +#else static unsigned long current_target(void) { unsigned long target = balloon_stats.target_pages; @@ -195,11 +211,12 @@ static unsigned long current_target(void) return target; } +#endif static int increase_reservation(unsigned long nr_pages) { - unsigned long pfn, i, flags; - struct page *page; + unsigned long uninitialized_var(pfn), i, flags; + struct page *uninitialized_var(page); long rc; struct xen_memory_reservation reservation = { .address_bits = 0, @@ -207,11 +224,63 @@ static int increase_reservation(unsigned long nr_pages) .domid = DOMID_SELF }; +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + resource_size_t r_min, r_size; + struct resource *r; +#endif + if (nr_pages> ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); spin_lock_irqsave(&balloon_lock, flags); +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + if (!balloon_stats.balloon_low&& !balloon_stats.balloon_high) { + if (!balloon_stats.hotplug_start_paddr) {
This should be in its own function. Overall, I think you should refactor increase_reservation to avoid all the gotos and - especially - all the #ifdefs.
+ + /* + * Look for first unused memory region starting + * at page boundary. Skip last memory section created + * at boot time becuase it may contains unused memory + * pages with PG_reserved bit not set (online_pages + * require PG_reserved bit set). + */ + + r = kzalloc(sizeof(struct resource), GFP_KERNEL); + + if (!r) { + rc = -ENOMEM; + goto out; + } + + r->name = "System RAM"; + r->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + r_min = PFN_PHYS(section_nr_to_pfn(pfn_to_section_nr(balloon_stats.boot_max_pfn) + 1)); + r_size = (balloon_stats.target_pages - balloon_stats.current_pages)<< PAGE_SHIFT; + + rc = allocate_resource(&iomem_resource, r, + r_size, r_min, ULONG_MAX, + PAGE_SIZE, NULL, NULL); + + if (rc< 0) { + kfree(r); + goto out; + } + + balloon_stats.hotplug_start_paddr = r->start; + } + + pfn = PFN_DOWN(balloon_stats.hotplug_start_paddr + + balloon_stats.hotplug_size); + + for (i = 0; i< nr_pages; ++i, ++pfn) + frame_list[i] = pfn; + + pfn -= nr_pages + 1; + goto populate_physmap; + } +#endif + page = balloon_first_page(); for (i = 0; i< nr_pages; i++) { BUG_ON(page == NULL); @@ -219,6 +288,9 @@ static int increase_reservation(unsigned long nr_pages) page = balloon_next_page(page); } +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +populate_physmap: +#endif set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = nr_pages; rc = HYPERVISOR_memory_op(XENMEM_populate_physmap,&reservation); @@ -226,17 +298,33 @@ static int increase_reservation(unsigned long nr_pages) goto out; for (i = 0; i< rc; i++) { +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + if (balloon_stats.hotplug_start_paddr) { + ++pfn; + goto set_p2m; + }
What does this do?
+#endif + page = balloon_retrieve(); BUG_ON(page == NULL); pfn = page_to_pfn(page); + +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +set_p2m: +#endif BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap)&& phys_to_machine_mapping_valid(pfn)); set_phys_to_machine(pfn, frame_list[i]); +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + if (balloon_stats.hotplug_start_paddr) + continue; +#endif + /* Link back into the page tables if not highmem. */ - if (pfn< max_low_pfn) { + if (!PageHighMem(page)) { int ret; ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn<< PAGE_SHIFT), @@ -251,6 +339,11 @@ static int increase_reservation(unsigned long nr_pages) __free_page(page); } +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + if (balloon_stats.hotplug_start_paddr) + balloon_stats.hotplug_size += rc<< PAGE_SHIFT; +#endif + balloon_stats.current_pages += rc; out: @@ -331,6 +424,12 @@ static void balloon_process(struct work_struct *work) int need_sleep = 0; long credit; +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + int nid, ret; + struct memory_block *mem; + unsigned long pfn, pfn_limit; +#endif + mutex_lock(&balloon_mutex); do { @@ -349,10 +448,93 @@ static void balloon_process(struct work_struct *work) /* Schedule more work if there is some still to be done. */ if (current_target() != balloon_stats.current_pages) mod_timer(&balloon_timer, jiffies + HZ); +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + else if (balloon_stats.hotplug_start_paddr) {
This should be its own function.
+ nid = memory_add_physaddr_to_nid(balloon_stats.hotplug_start_paddr); + + ret = xen_add_memory(nid, balloon_stats.hotplug_start_paddr, + balloon_stats.hotplug_size); + + if (ret) { + printk(KERN_ERR "%s: xen_add_memory: " + "Memory hotplug failed: %i\n", + __func__, ret); + goto error; + } + + pfn = PFN_DOWN(balloon_stats.hotplug_start_paddr); + pfn_limit = pfn + (balloon_stats.hotplug_size>> PAGE_SHIFT); + + for (; pfn< pfn_limit; ++pfn) + if (!PageHighMem(pfn_to_page(pfn))) + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn<< PAGE_SHIFT), + mfn_pte(pfn_to_mfn(pfn), PAGE_KERNEL), 0)); + + ret = online_pages(PFN_DOWN(balloon_stats.hotplug_start_paddr), + balloon_stats.hotplug_size>> PAGE_SHIFT); + + if (ret) { + printk(KERN_ERR "%s: online_pages: Failed: %i\n", + __func__, ret); + goto error; + } + + pfn = PFN_DOWN(balloon_stats.hotplug_start_paddr); + pfn_limit = pfn + (balloon_stats.hotplug_size>> PAGE_SHIFT); + + for (; pfn< pfn_limit; pfn += PAGES_PER_SECTION) { + mem = find_memory_block(__pfn_to_section(pfn)); + BUG_ON(!mem); + BUG_ON(!present_section_nr(mem->phys_index)); + mutex_lock(&mem->state_mutex); + mem->state = MEM_ONLINE; + mutex_unlock(&mem->state_mutex); + } + + goto out; + +error: + balloon_stats.current_pages -= balloon_stats.hotplug_size>> PAGE_SHIFT; + balloon_stats.target_pages -= balloon_stats.hotplug_size>> PAGE_SHIFT; + +out: + balloon_stats.hotplug_start_paddr = 0; + balloon_stats.hotplug_size = 0; + } +#endif mutex_unlock(&balloon_mutex); } +#ifdef CONFIG_XEN_MEMORY_HOTPLUG + +/* Resets the Xen limit, sets new target, and kicks off processing. */ +static void balloon_set_new_target(unsigned long target) +{ + mutex_lock(&balloon_mutex); + balloon_stats.target_pages = target; + mutex_unlock(&balloon_mutex); +
I don't think it should be necessary to define a new version of this function.
+ schedule_work(&balloon_worker); +} + +void balloon_update_stats(long nr_pages) +{ + mutex_lock(&balloon_mutex); + + balloon_stats.current_pages += nr_pages; + balloon_stats.target_pages += nr_pages; + + xenbus_printf(XBT_NIL, "memory", "target", "%llu", + (unsigned long long)balloon_stats.target_pages<< (PAGE_SHIFT - 10));
Hm, this looks odd. In what situations does this get called?
+ + mutex_unlock(&balloon_mutex); +} +EXPORT_SYMBOL_GPL(balloon_update_stats); + +#else + /* Resets the Xen limit, sets new target, and kicks off processing. */ static void balloon_set_new_target(unsigned long target) { @@ -361,6 +543,8 @@ static void balloon_set_new_target(unsigned long target) schedule_work(&balloon_worker); } +#endif + static struct xenbus_watch target_watch = { .node = "memory/target" @@ -416,6 +600,12 @@ static int __init balloon_init(void) balloon_stats.balloon_high = 0; balloon_stats.driver_pages = 0UL; +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + balloon_stats.boot_max_pfn = max_pfn; + balloon_stats.hotplug_start_paddr = 0; + balloon_stats.hotplug_size = 0; +#endif + init_timer(&balloon_timer); balloon_timer.data = 0; balloon_timer.function = balloon_alarm; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 35b07b7..04e67b8 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -211,4 +211,12 @@ extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); +#if defined(CONFIG_XEN_MEMORY_HOTPLUG) || defined(CONFIG_XEN_BALLOON_MEMORY_HOTPLUG) +extern int xen_add_memory(int nid, u64 start, u64 size); +#endif + +#ifdef CONFIG_XEN_MEMORY_HOTPLUG +extern int xen_memory_probe(u64 phys_addr); +#endif
Not necessary to #ifdef prototypes.
+ #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/xen/balloon.h b/include/xen/balloon.h new file mode 100644 index 0000000..84b17b7 --- /dev/null +++ b/include/xen/balloon.h @@ -0,0 +1,6 @@ +#ifndef _XEN_BALLOON_H +#define _XEN_BALLOON_H + +extern void balloon_update_stats(long nr_pages);
If this is Xen-specific, it should have xen_ in the name.
+ +#endif /* _XEN_BALLOON_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 9c61158..b04f3a8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -144,6 +144,15 @@ config MEMORY_HOTREMOVE depends on MEMORY_HOTPLUG&& ARCH_ENABLE_MEMORY_HOTREMOVE depends on MIGRATION +config XEN_MEMORY_HOTPLUG + bool "Allow for memory hot-add in Xen guests" + depends on EXPERIMENTAL&& ARCH_MEMORY_PROBE&& XEN
Do we need both XEN_MEMORY_HOTPLUG and XEN_BALLOON_MEMORY_HOTPLUG?
+ default n + help + Memory hotplug allows expanding memory available for the system + above limit declared at system startup. It is very useful on critical + systems which require long run without rebooting. + # # If we have space for more page flags then we can enable additional # optimizations and functionality. diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index be211a5..1c73703 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -32,6 +32,14 @@ #include<asm/tlbflush.h> +#ifdef CONFIG_XEN_MEMORY_HOTPLUG +#include<asm/xen/hypercall.h> +#include<xen/interface/xen.h> +#include<xen/interface/memory.h> +#include<xen/features.h> +#include<xen/page.h> +#endif
We need to think carefully about what the real requirements are here and try to abstract this away from being Xen-specific.
+ #include "internal.h" /* add this memory to iomem resource */ @@ -542,6 +550,138 @@ out: } EXPORT_SYMBOL_GPL(add_memory); +#if defined(CONFIG_XEN_MEMORY_HOTPLUG) || defined(CONFIG_XEN_BALLOON_MEMORY_HOTPLUG) +/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ +int __ref xen_add_memory(int nid, u64 start, u64 size) +{ + pg_data_t *pgdat = NULL; + int new_pgdat = 0, ret; + + lock_system_sleep(); + + if (!node_online(nid)) { + pgdat = hotadd_new_pgdat(nid, start); + ret = -ENOMEM; + if (!pgdat) + goto out; + new_pgdat = 1; + } + + /* call arch's memory hotadd */ + ret = arch_add_memory(nid, start, size); + + if (ret< 0) + goto error; + + /* we online node here. we can't roll back from here. */ + node_set_online(nid); + + if (new_pgdat) { + ret = register_one_node(nid); + /* + * If sysfs file of new node can't create, cpu on the node + * can't be hot-added. There is no rollback way now. + * So, check by BUG_ON() to catch it reluctantly.. + */ + BUG_ON(ret); + } +
This doesn't seem to be doing anything particularly xen-specific.
+ goto out; + +error: + /* rollback pgdat allocation */ + if (new_pgdat) + rollback_node_hotadd(nid, pgdat); + +out: + unlock_system_sleep(); + return ret; +} +EXPORT_SYMBOL_GPL(xen_add_memory); +#endif + +#ifdef CONFIG_XEN_MEMORY_HOTPLUG +int xen_memory_probe(u64 phys_addr) +{ + int nr_pages, ret; + struct resource *r; + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF, + .nr_extents = PAGES_PER_SECTION + }; + unsigned long *frame_list, i, pfn; + + r = register_memory_resource(phys_addr, PAGES_PER_SECTION<< PAGE_SHIFT); + + if (!r) + return -EEXIST; + + frame_list = vmalloc(PAGES_PER_SECTION * sizeof(unsigned long));
Does this need to be done in PAGES_PER_SECTION chunks? Couldn't you just allocate a single page and batch it in 1024/512 page pieces?
+ + if (!frame_list) { + printk(KERN_ERR "%s: vmalloc: Out of memory\n", __func__); + ret = -ENOMEM; + goto error; + } + + set_xen_guest_handle(reservation.extent_start, frame_list); + for (i = 0, pfn = PFN_DOWN(phys_addr); i< PAGES_PER_SECTION; ++i, ++pfn) + frame_list[i] = pfn; + + ret = HYPERVISOR_memory_op(XENMEM_populate_physmap,&reservation); + + if (ret< PAGES_PER_SECTION) { + if (ret> 0) { + printk(KERN_ERR "%s: PHYSMAP is not fully " + "populated: %i/%lu\n", __func__, + ret, PAGES_PER_SECTION); + reservation.nr_extents = nr_pages = ret; + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,&reservation); + BUG_ON(ret != nr_pages);
BUG_ON is pretty severe; it should only be used if its impossible to recover from the failure, or if failure indicates that some severe bug has occurred.
+ ret = -ENOMEM; + } else { + ret = (ret< 0) ? ret : -ENOMEM; + printk(KERN_ERR "%s: Can't populate PHYSMAP: %i\n", __func__, ret); + } + goto error; + } + + for (i = 0, pfn = PFN_DOWN(phys_addr); i< PAGES_PER_SECTION; ++i, ++pfn) { + BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap)&& + phys_to_machine_mapping_valid(pfn)); + set_phys_to_machine(pfn, frame_list[i]); + } + + ret = xen_add_memory(memory_add_physaddr_to_nid(phys_addr), phys_addr, + PAGES_PER_SECTION<< PAGE_SHIFT); + + if (ret) { + printk(KERN_ERR "%s: xen_add_memory: Memory hotplug " + "failed: %i\n", __func__, ret); + goto out; + } + + for (i = 0, pfn = PFN_DOWN(phys_addr); i< PAGES_PER_SECTION; ++i, ++pfn) + if (!PageHighMem(pfn_to_page(pfn))) + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn<< PAGE_SHIFT), + mfn_pte(frame_list[i], PAGE_KERNEL), 0)); + + goto out; + +error: + release_memory_resource(r); + +out: + vfree(frame_list); + + return (ret< 0) ? ret : 0; +} +EXPORT_SYMBOL_GPL(xen_memory_probe); +#endif + #ifdef CONFIG_MEMORY_HOTREMOVE /* * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
-- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxxx For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>