Hello,

I'm having a bit of trouble with the NUMA allocator in the kernel.  This is
in a numa=fake test setup (though that shouldn't matter, I guess).  I'm
trying to allocate pages for KVM VMs from selected nodes (minimal PoC diff
attached, hard-coding the preferred node to 7 - the last node in my setup;
the exact number doesn't matter, it just demonstrates the point most
effectively).  The call of interest is:

	page = alloc_pages_node(7, GFP_KERNEL | GFP_THISNODE, 0);

The problem is that, while page_to_nid() reports that the page is from node
7, "numactl --hardware" doesn't show any allocations from node 7.  In fact
it seems that the memory is allocated from the first node with free pages
until those run out; only then are pages from the selected (last) node
given out.  Once the selected node is full, alloc_pages_node(...
GFP_THISNODE ...) returns NULL - as it should - and I fall back to a normal
allocation, which then also reports a different node ID from page_to_nid()
(c.f. the attached diff).

The strange thing is that a simple test module (attached as well) works as
expected: the allocation succeeds, page_to_nid() reports the selected node,
*and* the free memory "numactl --hardware" reports for the selected node
decreases.

Any insight as to why the KVM allocation might be special is very much
appreciated.  I tried to follow the call path, but didn't find any red
flags that would explain the difference.

Thanks.

-- 
/"\  Best regards,                      | mlaier@xxxxxxxxxxx
\ /  Max Laier                          | ICQ #67774661
 X   http://pf4freebsd.love2party.net/  | mlaier@EFnet
/ \  ASCII Ribbon Campaign              | Against HTML Mail and News
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 479e748..82a3f56 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -300,9 +300,13 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
 	if (cache->nobjs >= min)
 		return 0;
 	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
-		page = alloc_page(GFP_KERNEL);
-		if (!page)
-			return -ENOMEM;
+		page = alloc_pages_node(7, GFP_KERNEL | GFP_THISNODE, 0);
+		if (!page) {
+			page = alloc_page(GFP_KERNEL);
+			if (!page)
+				return -ENOMEM;
+			printk("Page from node %d\n", page_to_nid(page));
+		}
 		set_page_private(page, 0);
 		cache->objects[cache->nobjs++] = page_address(page);
 	}
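For reference, the hunk above only logs the node on the fallback path.  A
self-contained variant that logs page_to_nid() for every page (sketch only,
not part of the actual patch; the helper name alloc_page_prefer_node() is
made up) would look roughly like this and makes the successful per-node
allocations visible in dmesg as well:

#include <linux/gfp.h>
#include <linux/mm.h>

/*
 * Hypothetical helper, not part of the diff above: try a strict
 * allocation from @nid first, fall back to a normal allocation, and
 * always report which node the page really came from.
 */
static struct page *alloc_page_prefer_node(int nid)
{
	struct page *page;

	page = alloc_pages_node(nid, GFP_KERNEL | GFP_THISNODE, 0);
	if (!page) {
		/* Strict per-node allocation failed; take any node. */
		page = alloc_page(GFP_KERNEL);
		if (!page)
			return NULL;
	}
	printk("wanted node %d, page_to_nid() says %d\n",
	       nid, page_to_nid(page));
	return page;
}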
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/mm.h>

MODULE_LICENSE("Dual BSD/GPL");

/* 200 MB */
#define NUM_PAGES	(51200)

struct page *pages[NUM_PAGES];

static int nodemem_init(void)
{
	int i;

	printk("Trying to allocate %d pages from node 7\n", NUM_PAGES);
	for (i = 0; i < NUM_PAGES; i++) {
		pages[i] = alloc_pages_node(7, GFP_KERNEL | GFP_THISNODE, 0);
		if (!pages[i]) {
			for (i--; i >= 0; i--)
				__free_pages(pages[i], 0);
			return -ENOMEM;
		}
		printk("Page %d from node %d\n", i, page_to_nid(pages[i]));
	}

	return 0;
}

static void nodemem_exit(void)
{
	int i;

	for (i = 0; i < NUM_PAGES; i++)
		__free_pages(pages[i], 0);
}

module_init(nodemem_init);
module_exit(nodemem_exit);
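One more data point that might help narrow this down.  "numactl --hardware"
takes its per-node free figures from /sys/devices/system/node/nodeN/meminfo,
so comparing against the kernel's own per-node accounting from inside the
test could rule out a user-space reporting issue.  A rough sketch (the
helper print_node_free() is hypothetical; si_meminfo_node() requires
CONFIG_NUMA and, depending on the kernel version, may only be callable
from built-in code rather than a module):

#include <linux/kernel.h>
#include <linux/mm.h>

/*
 * Hypothetical helper, not part of the test module above: print the
 * kernel's own idea of the free memory on @nid, i.e. the same counters
 * that back nodeN/meminfo in sysfs.
 */
static void print_node_free(int nid)
{
	struct sysinfo si;

	si_meminfo_node(&si, nid);
	printk("node %d: %lu free pages of %u bytes\n",
	       nid, si.freeram, si.mem_unit);
}

Calling it once before and once after the allocation loop should show
whether node 7's free count actually drops, independent of what user space
reports.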