offline_pages() should not be called outside of the MM core. Especially, having to manually fiddle with the memory block states is a sign that this is not a good idea. To offline memory block devices cleanly, device_offline() should be used. This is the only remaining caller of offline_pages(), except the official device_offline() way. E.g., when trying to allocate right now we trigger messages like [ 11.227817] page:c00c000000182000 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 [ 11.228056] raw: 007ffff000000000 c000000001538860 c000000001538860 0000000000000000 [ 11.228070] raw: 0000000000000000 0000000000000001 00000001ffffffff 0000000000000000 [ 11.228097] page dumped because: unmovable page and theoretically we might end up looping quite a long time trying to offline memory, which would have to be canceled by the user manually (CTRL-C). Memtrace needs to identify+allocate multiple consecutive memory blocks. It also has to remove the memory blocks to remove all page tables (HW requirement). Let's use alloc_contig_pages() to allocate memory that spans multiple memory block devices. We can then set all pages PageOffline() to allow these pages to get isolated. A temporary memory notifier can then make offlining of these pages succeed by dropping its reference to the pages on MEM_GOING_OFFLINE events(as documented in include/linux/page-flags.h for PageOffline() pages). Error handling is a bit tricky. Note1: ZONE_MOVABLE memory blocks won't be considered. Not sure if that was ever really relevant. (unmovable data would end up on these memory blocks for a tiny little time frame) Note2: We don't have to care about online_page_callback_t, as we forbid re-onlining from our memory notifier. Note3: I was told this feature is never used along with DIMM-based memory hotunplug - otherwise bad things will happen when a DIMM would try to remove "alread-removed" memory (that is still in use). Tested under QEMU with powernv emulation (kernel + initramfs). $ mount -t debugfs none /sys/kernel/debug/ $ cat /sys/devices/system/memory/block_size_bytes 10000000 $ echo 0x20000000 > /sys/kernel/debug/powerpc/memtrace/enable [ 19.809790] Offlined Pages 4096 [ 19.835842] Offlined Pages 4096 [ 19.853136] memtrace: Allocated trace memory on node 0 at 0x0000000040000000 Unfortunately, QEMU does not support NUMA for powernv yet, so I cannot test that. Cc: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx> Cc: Paul Mackerras <paulus@xxxxxxxxx> Cc: Michael Ellerman <mpe@xxxxxxxxxxxxxx> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Cc: David Hildenbrand <david@xxxxxxxxxx> Cc: Allison Randal <allison@xxxxxxxxxxx> Cc: Jens Axboe <axboe@xxxxxxxxx> Cc: Anshuman Khandual <anshuman.khandual@xxxxxxx> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx> Cc: Michal Hocko <mhocko@xxxxxxxxxx> Cc: Oscar Salvador <osalvador@xxxxxxx> Cc: Balbir Singh <bsingharora@xxxxxxxxx> Cc: Rashmica Gupta <rashmica.g@xxxxxxxxx> Cc: linuxppc-dev@xxxxxxxxxxxxxxxx Signed-off-by: David Hildenbrand <david@xxxxxxxxxx> --- arch/powerpc/platforms/powernv/Kconfig | 1 + arch/powerpc/platforms/powernv/memtrace.c | 175 ++++++++++++++-------- 2 files changed, 112 insertions(+), 64 deletions(-) diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 938803eab0ad..571a0fa9f055 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -29,6 +29,7 @@ config OPAL_PRD config PPC_MEMTRACE bool "Enable removal of RAM from kernel mappings for tracing" depends on PPC_POWERNV && MEMORY_HOTREMOVE + select CONTIG_ALLOC help Enabling this option allows for the removal of memory (RAM) from the kernel mappings to be used for hardware tracing. diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index 2d2a0a2acd60..fe1e8f3926a1 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -76,83 +76,130 @@ static int memtrace_free_node(int nid, unsigned long start, unsigned long size) return ret; } -static int check_memblock_online(struct memory_block *mem, void *arg) -{ - if (mem->state != MEM_ONLINE) - return -1; - - return 0; -} - -static int change_memblock_state(struct memory_block *mem, void *arg) -{ - unsigned long state = (unsigned long)arg; - - mem->state = state; - - return 0; -} +struct memtrace_alloc_info { + struct notifier_block memory_notifier; + unsigned long base_pfn; + unsigned long nr_pages; +}; -/* called with device_hotplug_lock held */ -static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages) +static int memtrace_memory_notifier_cb(struct notifier_block *nb, + unsigned long action, void *arg) { - const unsigned long start = PFN_PHYS(start_pfn); - const unsigned long size = PFN_PHYS(nr_pages); - - if (walk_memory_blocks(start, size, NULL, check_memblock_online)) - return false; - - walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE, - change_memblock_state); - - if (offline_pages(start_pfn, nr_pages)) { - walk_memory_blocks(start, size, (void *)MEM_ONLINE, - change_memblock_state); - return false; + struct memtrace_alloc_info *info = container_of(nb, + struct memtrace_alloc_info, + memory_notifier); + unsigned long pfn, start_pfn, end_pfn; + const struct memory_notify *mhp = arg; + static bool going_offline; + + /* Ignore ranges that don't overlap. */ + if (mhp->start_pfn + mhp->nr_pages <= info->base_pfn || + info->base_pfn + info->nr_pages <= mhp->start_pfn) + return NOTIFY_OK; + + start_pfn = max_t(unsigned long, info->base_pfn, mhp->start_pfn); + end_pfn = min_t(unsigned long, info->base_pfn + info->nr_pages, + mhp->start_pfn + mhp->nr_pages); + + /* + * Drop our reference to the allocated (PageOffline()) pages, but + * reaquire them in case offlining fails. We might get called for + * MEM_CANCEL_OFFLINE but not for MEM_GOING_OFFLINE in case another + * notifier aborted offlining. + */ + switch (action) { + case MEM_GOING_OFFLINE: + for (pfn = start_pfn; pfn < end_pfn; pfn++) + page_ref_dec(pfn_to_page(pfn)); + going_offline = true; + break; + case MEM_CANCEL_OFFLINE: + if (going_offline) + for (pfn = start_pfn; pfn < end_pfn; pfn++) + page_ref_inc(pfn_to_page(pfn)); + going_offline = false; + break; + case MEM_GOING_ONLINE: + /* + * While our notifier is active, user space could + * offline+re-online this memory. Disallow any such activity. + */ + return notifier_to_errno(-EBUSY); } - - walk_memory_blocks(start, size, (void *)MEM_OFFLINE, - change_memblock_state); - - - return true; + return NOTIFY_OK; } static u64 memtrace_alloc_node(u32 nid, u64 size) { - u64 start_pfn, end_pfn, nr_pages, pfn; - u64 base_pfn; - u64 bytes = memory_block_size_bytes(); + const unsigned long memory_block_bytes = memory_block_size_bytes(); + const unsigned long nr_pages = size >> PAGE_SHIFT; + struct memtrace_alloc_info info = { + .memory_notifier = { + .notifier_call = memtrace_memory_notifier_cb, + }, + }; + unsigned long base_pfn, to_remove_pfn, pfn; + struct page *page; + int ret; if (!node_spanned_pages(nid)) return 0; - start_pfn = node_start_pfn(nid); - end_pfn = node_end_pfn(nid); - nr_pages = size >> PAGE_SHIFT; - - /* Trace memory needs to be aligned to the size */ - end_pfn = round_down(end_pfn - nr_pages, nr_pages); - - lock_device_hotplug(); - for (base_pfn = end_pfn; base_pfn > start_pfn; base_pfn -= nr_pages) { - if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) { - /* - * Remove memory in memory block size chunks so that - * iomem resources are always split to the same size and - * we never try to remove memory that spans two iomem - * resources. - */ - end_pfn = base_pfn + nr_pages; - for (pfn = base_pfn; pfn < end_pfn; pfn += bytes>> PAGE_SHIFT) { - __remove_memory(nid, pfn << PAGE_SHIFT, bytes); - } - unlock_device_hotplug(); - return base_pfn << PAGE_SHIFT; - } + /* + * Try to allocate memory (that might span multiple memory blocks) + * on the requested node. Trace memory needs to be aligned to the size, + * which is guaranteed by alloc_contig_pages(). + */ + page = alloc_contig_pages(nr_pages, __GFP_THISNODE, nid, NULL); + if (!page) + return 0; + to_remove_pfn = base_pfn = page_to_pfn(page); + info.base_pfn = base_pfn; + info.nr_pages = nr_pages; + + /* PageOffline() allows to isolate the memory when offlining. */ + for (pfn = base_pfn; pfn < base_pfn + nr_pages; pfn++) + __SetPageOffline(pfn_to_page(pfn)); + + /* A temporary memory notifier allows to offline the isolated memory. */ + ret = register_memory_notifier(&info.memory_notifier); + if (ret) + goto out_free_pages; + + /* + * Try to offline and remove all involved memory blocks. This will + * only fail in the unlikely event that another memory notifier NACKs + * the offlining request - no memory has to be migrated. + * + * Remove memory in memory block size chunks so that iomem resources + * are always split to the same size and we never try to remove memory + * that spans two iomem resources. + */ + for (; to_remove_pfn < base_pfn + nr_pages; + to_remove_pfn += PHYS_PFN(memory_block_bytes)) { + ret = offline_and_remove_memory(nid, PFN_PHYS(to_remove_pfn), + memory_block_bytes); + if (ret) + goto out_readd_memory; } - unlock_device_hotplug(); + unregister_memory_notifier(&info.memory_notifier); + return PFN_PHYS(base_pfn); +out_readd_memory: + /* Unregister before adding+onlining (notifer blocks onlining). */ + unregister_memory_notifier(&info.memory_notifier); + if (to_remove_pfn != base_pfn) { + ret = memtrace_free_node(nid, PFN_PHYS(base_pfn), + PFN_PHYS(to_remove_pfn - base_pfn)); + if (ret) + /* Even more unlikely, log and ignore. */ + pr_err("Failed to add trace memory to node %d\n", nid); + } +out_free_pages: + /* Only free memory that was not temporarily offlined+removed. */ + for (pfn = to_remove_pfn; pfn < base_pfn + nr_pages; pfn++) + __ClearPageOffline(pfn_to_page(pfn)); + free_contig_range(to_remove_pfn, nr_pages - (to_remove_pfn - base_pfn)); return 0; } -- 2.23.0