The patch titled Subject: secretmem: use PMD-size pages to amortize direct map fragmentation has been added to the -mm tree. Its filename is secretmem-use-pmd-size-pages-to-amortize-direct-map-fragmentation.patch This patch should soon appear at https://ozlabs.org/~akpm/mmots/broken-out/secretmem-use-pmd-size-pages-to-amortize-direct-map-fragmentation.patch and later at https://ozlabs.org/~akpm/mmotm/broken-out/secretmem-use-pmd-size-pages-to-amortize-direct-map-fragmentation.patch Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next and is updated there every 3-4 working days ------------------------------------------------------ From: Mike Rapoport <rppt@xxxxxxxxxxxxx> Subject: secretmem: use PMD-size pages to amortize direct map fragmentation Removing a PAGE_SIZE page from the direct map every time such page is allocated for a secret memory mapping will cause severe fragmentation of the direct map. This fragmentation can be reduced by using PMD-size pages as a pool for small pages for secret memory mappings. Add a gen_pool per secretmem inode and lazily populate this pool with PMD-size pages. As pages allocated by secretmem become unmovable, use CMA to back large page caches so that page allocator won't be surprised by failing attempt to migrate these pages. The CMA area used by secretmem is controlled by the "secretmem=" kernel parameter. This allows explicit control over the memory available for secretmem and provides upper hard limit for secretmem consumption. Link: https://lkml.kernel.org/r/20201110151444.20662-6-rppt@xxxxxxxxxx Signed-off-by: Mike Rapoport <rppt@xxxxxxxxxxxxx> Cc: Alexander Viro <viro@xxxxxxxxxxxxxxxxxx> Cc: Andy Lutomirski <luto@xxxxxxxxxx> Cc: Arnd Bergmann <arnd@xxxxxxxx> Cc: Borislav Petkov <bp@xxxxxxxxx> Cc: Catalin Marinas <catalin.marinas@xxxxxxx> Cc: Christopher Lameter <cl@xxxxxxxxx> Cc: Dan Williams <dan.j.williams@xxxxxxxxx> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx> Cc: David Hildenbrand <david@xxxxxxxxxx> Cc: Elena Reshetova <elena.reshetova@xxxxxxxxx> Cc: Hagen Paul Pfeifer <hagen@xxxxxxxx> Cc: "H. Peter Anvin" <hpa@xxxxxxxxx> Cc: Ingo Molnar <mingo@xxxxxxxxxx> Cc: James Bottomley <jejb@xxxxxxxxxxxxx> Cc: "Kirill A. Shutemov" <kirill@xxxxxxxxxxxxx> Cc: Mark Rutland <mark.rutland@xxxxxxx> Cc: Matthew Wilcox <willy@xxxxxxxxxxxxx> Cc: Michael Kerrisk <mtk.manpages@xxxxxxxxx> Cc: Palmer Dabbelt <palmerdabbelt@xxxxxxxxxx> Cc: Paul Walmsley <paul.walmsley@xxxxxxxxxx> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx> Cc: Rick Edgecombe <rick.p.edgecombe@xxxxxxxxx> Cc: Shuah Khan <shuah@xxxxxxxxxx> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx> Cc: Tycho Andersen <tycho@xxxxxxxx> Cc: Will Deacon <will@xxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- mm/Kconfig | 2 mm/secretmem.c | 151 +++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 135 insertions(+), 18 deletions(-) --- a/mm/Kconfig~secretmem-use-pmd-size-pages-to-amortize-direct-map-fragmentation +++ a/mm/Kconfig @@ -890,5 +890,7 @@ config KMAP_LOCAL config SECRETMEM def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED + select GENERIC_ALLOCATOR + select CMA endmenu --- a/mm/secretmem.c~secretmem-use-pmd-size-pages-to-amortize-direct-map-fragmentation +++ a/mm/secretmem.c @@ -7,12 +7,15 @@ #include <linux/mm.h> #include <linux/fs.h> +#include <linux/cma.h> #include <linux/mount.h> #include <linux/memfd.h> #include <linux/bitops.h> #include <linux/printk.h> #include <linux/pagemap.h> +#include <linux/genalloc.h> #include <linux/syscalls.h> +#include <linux/memblock.h> #include <linux/pseudo_fs.h> #include <linux/set_memory.h> #include <linux/sched/signal.h> @@ -40,24 +43,79 @@ #define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK struct secretmem_ctx { + struct gen_pool *pool; unsigned int mode; }; -static struct page *secretmem_alloc_page(gfp_t gfp) +static struct cma *secretmem_cma; + +static int secretmem_pool_increase(struct secretmem_ctx *ctx, gfp_t gfp) { + unsigned long nr_pages = (1 << PMD_PAGE_ORDER); + struct gen_pool *pool = ctx->pool; + unsigned long addr; + struct page *page; + int err; + + page = cma_alloc(secretmem_cma, nr_pages, PMD_SIZE, gfp & __GFP_NOWARN); + if (!page) + return -ENOMEM; + + err = set_direct_map_invalid_noflush(page, nr_pages); + if (err) + goto err_cma_release; + + addr = (unsigned long)page_address(page); + err = gen_pool_add(pool, addr, PMD_SIZE, NUMA_NO_NODE); + if (err) + goto err_set_direct_map; + + flush_tlb_kernel_range(addr, addr + PMD_SIZE); + + return 0; + +err_set_direct_map: /* - * FIXME: use a cache of large pages to reduce the direct map - * fragmentation + * If a split of PUD-size page was required, it already happened + * when we marked the pages invalid which guarantees that this call + * won't fail */ - return alloc_page(gfp); + set_direct_map_default_noflush(page, nr_pages); +err_cma_release: + cma_release(secretmem_cma, page, nr_pages); + return err; +} + +static struct page *secretmem_alloc_page(struct secretmem_ctx *ctx, + gfp_t gfp) +{ + struct gen_pool *pool = ctx->pool; + unsigned long addr; + struct page *page; + int err; + + if (gen_pool_avail(pool) < PAGE_SIZE) { + err = secretmem_pool_increase(ctx, gfp); + if (err) + return NULL; + } + + addr = gen_pool_alloc(pool, PAGE_SIZE); + if (!addr) + return NULL; + + page = virt_to_page(addr); + get_page(page); + + return page; } static vm_fault_t secretmem_fault(struct vm_fault *vmf) { + struct secretmem_ctx *ctx = vmf->vma->vm_file->private_data; struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct inode *inode = file_inode(vmf->vma->vm_file); pgoff_t offset = vmf->pgoff; - unsigned long addr; struct page *page; int ret = 0; @@ -66,7 +124,7 @@ static vm_fault_t secretmem_fault(struct page = find_get_entry(mapping, offset); if (!page) { - page = secretmem_alloc_page(vmf->gfp_mask); + page = secretmem_alloc_page(ctx, vmf->gfp_mask); if (!page) return vmf_error(-EINVAL); @@ -74,14 +132,8 @@ static vm_fault_t secretmem_fault(struct if (unlikely(ret)) goto err_put_page; - ret = set_direct_map_invalid_noflush(page, 1); - if (ret) - goto err_del_page_cache; - - addr = (unsigned long)page_address(page); - flush_tlb_kernel_range(addr, addr + PAGE_SIZE); - __SetPageUptodate(page); + set_page_private(page, (unsigned long)ctx); ret = VM_FAULT_LOCKED; } @@ -89,8 +141,6 @@ static vm_fault_t secretmem_fault(struct vmf->page = page; return ret; -err_del_page_cache: - delete_from_page_cache(page); err_put_page: put_page(page); return vmf_error(ret); @@ -143,8 +193,11 @@ static int secretmem_migratepage(struct static void secretmem_freepage(struct page *page) { - set_direct_map_default_noflush(page, 1); - clear_highpage(page); + unsigned long addr = (unsigned long)page_address(page); + struct secretmem_ctx *ctx = (struct secretmem_ctx *)page_private(page); + struct gen_pool *pool = ctx->pool; + + gen_pool_free(pool, addr, PAGE_SIZE); } static const struct address_space_operations secretmem_aops = { @@ -179,13 +232,18 @@ static struct file *secretmem_file_creat if (!ctx) goto err_free_inode; + ctx->pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE); + if (!ctx->pool) + goto err_free_ctx; + file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem", O_RDWR, &secretmem_fops); if (IS_ERR(file)) - goto err_free_ctx; + goto err_free_pool; mapping_set_unevictable(inode->i_mapping); + inode->i_private = ctx; inode->i_mapping->private_data = ctx; inode->i_mapping->a_ops = &secretmem_aops; @@ -199,6 +257,8 @@ static struct file *secretmem_file_creat return file; +err_free_pool: + gen_pool_destroy(ctx->pool); err_free_ctx: kfree(ctx); err_free_inode: @@ -217,6 +277,9 @@ SYSCALL_DEFINE1(memfd_secret, unsigned l if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC)) return -EINVAL; + if (!secretmem_cma) + return -ENOMEM; + fd = get_unused_fd_flags(flags & O_CLOEXEC); if (fd < 0) return fd; @@ -237,11 +300,37 @@ err_put_fd: return err; } +static void secretmem_cleanup_chunk(struct gen_pool *pool, + struct gen_pool_chunk *chunk, void *data) +{ + unsigned long start = chunk->start_addr; + unsigned long end = chunk->end_addr; + struct page *page = virt_to_page(start); + unsigned long nr_pages = (end - start + 1) / PAGE_SIZE; + int i; + + set_direct_map_default_noflush(page, nr_pages); + + for (i = 0; i < nr_pages; i++) + clear_highpage(page + i); + + cma_release(secretmem_cma, page, nr_pages); +} + +static void secretmem_cleanup_pool(struct secretmem_ctx *ctx) +{ + struct gen_pool *pool = ctx->pool; + + gen_pool_for_each_chunk(pool, secretmem_cleanup_chunk, ctx); + gen_pool_destroy(pool); +} + static void secretmem_evict_inode(struct inode *inode) { struct secretmem_ctx *ctx = inode->i_private; truncate_inode_pages_final(&inode->i_data); + secretmem_cleanup_pool(ctx); clear_inode(inode); kfree(ctx); } @@ -278,3 +367,29 @@ static int secretmem_init(void) return ret; } fs_initcall(secretmem_init); + +static int __init secretmem_setup(char *str) +{ + phys_addr_t align = PMD_SIZE; + unsigned long reserved_size; + int err; + + reserved_size = memparse(str, NULL); + if (!reserved_size) + return 0; + + if (reserved_size * 2 > PUD_SIZE) + align = PUD_SIZE; + + err = cma_declare_contiguous(0, reserved_size, 0, align, 0, false, + "secretmem", &secretmem_cma); + if (err) { + pr_err("failed to create CMA: %d\n", err); + return err; + } + + pr_info("reserved %luM\n", reserved_size >> 20); + + return 0; +} +__setup("secretmem=", secretmem_setup); _ Patches currently in -mm which might be from rppt@xxxxxxxxxxxxx are alpha-switch-from-discontigmem-to-sparsemem.patch ia64-remove-custom-__early_pfn_to_nid.patch ia64-remove-ifdef-config_zone_dma32-statements.patch ia64-discontig-paging_init-remove-local-max_pfn-calculation.patch ia64-split-virtual-map-initialization-out-of-paging_init.patch ia64-forbid-using-virtual_mem_map-with-flatmem.patch ia64-make-sparsemem-default-and-disable-discontigmem.patch arm-remove-config_arch_has_holes_memorymodel.patch arm-arm64-move-free_unused_memmap-to-generic-mm.patch arc-use-flatmem-with-freeing-of-unused-memory-map-instead-of-discontigmem.patch m68k-mm-make-node-data-and-node-setup-depend-on-config_discontigmem.patch m68k-mm-enable-use-of-generic-memory_modelh-for-discontigmem.patch m68k-deprecate-discontigmem.patch mm-introduce-debug_pagealloc_mapunmap_pages-helpers.patch pm-hibernate-make-direct-map-manipulations-more-explicit.patch arch-mm-restore-dependency-of-__kernel_map_pages-on-debug_pagealloc.patch arch-mm-make-kernel_page_present-always-available.patch mm-add-definition-of-pmd_page_order.patch mmap-make-mlock_future_check-global.patch set_memory-allow-set_direct_map__noflush-for-multiple-pages.patch mm-introduce-memfd_secret-system-call-to-create-secret-memory-areas.patch secretmem-use-pmd-size-pages-to-amortize-direct-map-fragmentation.patch secretmem-add-memcg-accounting.patch pm-hibernate-disable-when-there-are-active-secretmem-users.patch arch-mm-wire-up-memfd_secret-system-call-were-relevant.patch secretmem-test-add-basic-selftest-for-memfd_secret2.patch