When a process mmaps a file with the MAP_SHARED flag, any other
process that mmaps the same file with MAP_SHARED and the same
permissions could share the page table entries as well instead of
creating duplicate entries. This patch introduces a new flag,
MAP_SHARED_PT, which a process can use to hint that it is willing to
share page tables with other processes using the same mapping. It
creates a new mm struct to hold the shareable page table entries for
the newly mapped region. This new mm is not associated with a task;
its lifetime lasts until the last shared mapping is deleted. It adds
a new pointer, "ptshare_data", to struct address_space, which points
to the data structure holding a pointer to this newly created mm.

Add support for creating a new set of shared page tables in a new
mm_struct upon mmap of a region that can potentially share page
tables. Add page fault handling for this now-shared region. Modify
the free_pgtables path to make sure page tables in shared regions are
kept intact when a process using the shared region exits while other
mappers of the region remain. Clean up the mm_struct holding the
shared page tables when the last process sharing the region exits.

Signed-off-by: Khalid Aziz <khalid.aziz@xxxxxxxxxx>
Suggested-by: Matthew Wilcox (Oracle) <willy@xxxxxxxxxxxxx>
---
 include/linux/fs.h                     |   1 +
 include/uapi/asm-generic/mman-common.h |   1 +
 mm/Makefile                            |   2 +-
 mm/internal.h                          |  16 ++
 mm/memory.c                            |  52 ++++-
 mm/mmap.c                              |  87 ++++++++
 mm/ptshare.c                           | 262 +++++++++++++++++++++++++
 7 files changed, 418 insertions(+), 3 deletions(-)
 create mode 100644 mm/ptshare.c
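A minimal userspace sketch of how this flag is meant to be used,
for illustration only (not part of the patch). It assumes x86-64
with 2 MiB PMDs, defines MAP_SHARED_PT locally since installed uapi
headers will not have it yet, and uses an arbitrary file path and
PMD-aligned hint address. Sharing only engages when every mapper
uses the same start address, size, and permissions, so cooperating
processes would agree on the address out of band:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MAP_SHARED_PT
#define MAP_SHARED_PT	0x200000	/* value added by this patch */
#endif
#ifndef MAP_FIXED_NOREPLACE
#define MAP_FIXED_NOREPLACE	0x100000
#endif

#define PMD_SZ		(2UL << 20)	/* assumption: x86-64, 2 MiB PMDs */
/* Arbitrary PMD-aligned address; all mappers must use the same one */
#define SHARE_HINT	((void *)(0x7f00UL << 32))

int main(void)
{
	size_t len = 4 * PMD_SZ;	/* must be a multiple of PMD size */
	int fd = open("/tmp/ptshare-demo", O_RDWR | O_CREAT, 0644);

	if (fd < 0 || ftruncate(fd, len) < 0) {
		perror("open/ftruncate");
		return 1;
	}

	/*
	 * MAP_SHARED_PT without MAP_SHARED or MAP_SHARED_VALIDATE fails
	 * with EINVAL. MAP_FIXED_NOREPLACE pins the mapping to the agreed
	 * PMD-aligned address without clobbering an existing mapping.
	 */
	char *p = mmap(SHARE_HINT, len, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_SHARED_PT | MAP_FIXED_NOREPLACE,
		       fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Faults on this vma are handled in the shared (host) mm */
	memset(p, 0xa5, len);

	munmap(p, len);
	close(fd);
	return 0;
}

A second process running the same code against the same file would
then reuse the first mapper's PMD-level page tables instead of
building its own.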
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 59ae95ddb679..f940bf03303b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -441,6 +441,7 @@ struct address_space {
 	spinlock_t		private_lock;
 	struct list_head	private_list;
 	void			*private_data;
+	void			*ptshare_data;
 } __attribute__((aligned(sizeof(long)))) __randomize_layout;
 	/*
 	 * On most architectures that alignment is already the case; but
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 6ce1f1ceb432..4d23456b5915 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -29,6 +29,7 @@
 #define MAP_HUGETLB		0x040000	/* create a huge page mapping */
 #define MAP_SYNC		0x080000 /* perform synchronous page faults for the mapping */
 #define MAP_FIXED_NOREPLACE	0x100000	/* MAP_FIXED which doesn't unmap underlying mapping */
+#define MAP_SHARED_PT		0x200000	/* Shared page table mappings */
 
 #define MAP_UNINITIALIZED 0x4000000	/* For anonymous mmap, memory could be
 					 * uninitialized */
diff --git a/mm/Makefile b/mm/Makefile
index 8e105e5b3e29..d9bb14fdf220 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -40,7 +40,7 @@ mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= highmem.o memory.o mincore.o \
 			   mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
 			   msync.o page_vma_mapped.o pagewalk.o \
-			   pgtable-generic.o rmap.o vmalloc.o
+			   pgtable-generic.o rmap.o vmalloc.o ptshare.o
 
 
 ifdef CONFIG_CROSS_MEMORY_ATTACH
diff --git a/mm/internal.h b/mm/internal.h
index 16083eca720e..22cae2ff97fa 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -861,4 +861,20 @@ static inline bool vma_is_shared(const struct vm_area_struct *vma)
 	return vma->vm_flags & VM_SHARED_PT;
 }
 
+/*
+ * mm/ptshare.c
+ */
+struct mshare_data {
+	struct mm_struct *mm;
+	refcount_t refcnt;
+	unsigned long start;
+	unsigned long size;
+	unsigned long mode;
+};
+int ptshare_new_mm(struct file *file, struct vm_area_struct *vma);
+void ptshare_del_mm(struct vm_area_struct *vma);
+int ptshare_insert_vma(struct mm_struct *mm, struct vm_area_struct *vma);
+extern vm_fault_t find_shared_vma(struct vm_area_struct **vmap,
+		unsigned long *addrp, unsigned int flags);
+
 #endif /* __MM_INTERNAL_H */
diff --git a/mm/memory.c b/mm/memory.c
index 8a6d5c823f91..051c82e13627 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -416,15 +416,21 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
 		unlink_anon_vmas(vma);
 		unlink_file_vma(vma);
 
+		/*
+		 * If a vma is sharing page tables through a host mm, do not
+		 * free its page tables here. Those page tables will be freed
+		 * when the host mm is released.
+		 */
 		if (is_vm_hugetlb_page(vma)) {
 			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
 				floor, next ? next->vm_start : ceiling);
-		} else {
+		} else if (!vma_is_shared(vma)) {
 			/*
 			 * Optimization: gather nearby vmas into one call down
 			 */
 			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
-			       && !is_vm_hugetlb_page(next)) {
+			       && !is_vm_hugetlb_page(next)
+			       && !vma_is_shared(next)) {
 				vma = next;
 				next = mas_find(&mas, ceiling - 1);
 				unlink_anon_vmas(vma);
@@ -5189,6 +5195,8 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 			   unsigned int flags, struct pt_regs *regs)
 {
 	vm_fault_t ret;
+	bool shared = false;
+	struct mm_struct *orig_mm;
 
 	__set_current_state(TASK_RUNNING);
 
@@ -5198,6 +5206,16 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	/* do counter updates before entering really critical section. */
 	check_sync_rss_stat(current);
 
+	orig_mm = vma->vm_mm;
+	if (unlikely(vma_is_shared(vma))) {
+		ret = find_shared_vma(&vma, &address, flags);
+		if (ret)
+			return ret;
+		if (!vma)
+			return VM_FAULT_SIGSEGV;
+		shared = true;
+	}
+
 	if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
 					    flags & FAULT_FLAG_INSTRUCTION,
 					    flags & FAULT_FLAG_REMOTE))
@@ -5219,6 +5237,36 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 
 	lru_gen_exit_fault();
 
+	/*
+	 * Release the read lock on the shared vma's parent mm unless
+	 * __handle_mm_fault released the lock already.
+	 * __handle_mm_fault sets VM_FAULT_RETRY in its return value if
+	 * it released the mmap lock. If the lock was released, that implies
+	 * the lock would have been released on the task's original mm if
+	 * this were not a shared PTE vma. To keep the lock state consistent,
+	 * make sure to release the lock on the task's original mm.
+	 */
+	if (shared) {
+		int release_mmlock = 1;
+
+		if (!(ret & VM_FAULT_RETRY)) {
+			mmap_read_unlock(vma->vm_mm);
+			release_mmlock = 0;
+		} else if ((flags & FAULT_FLAG_ALLOW_RETRY) &&
+			   (flags & FAULT_FLAG_RETRY_NOWAIT)) {
+			mmap_read_unlock(vma->vm_mm);
+			release_mmlock = 0;
+		}
+
+		/*
+		 * Reset the guest vma pointers that were set up in
+		 * find_shared_vma() to process this fault.
+		 */
+		vma->vm_mm = orig_mm;
+		if (release_mmlock)
+			mmap_read_unlock(orig_mm);
+	}
+
 	if (flags & FAULT_FLAG_USER) {
 		mem_cgroup_exit_user_fault();
 		/*
diff --git a/mm/mmap.c b/mm/mmap.c
index 74a84eb33b90..c1adb9d52f5d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1245,6 +1245,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 	struct mm_struct *mm = current->mm;
 	vm_flags_t vm_flags;
 	int pkey = 0;
+	int ptshare = 0;
 
 	validate_mm(mm);
 	*populate = 0;
@@ -1282,6 +1283,21 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 	if (mm->map_count > sysctl_max_map_count)
 		return -ENOMEM;
 
+	/*
+	 * If MAP_SHARED_PT is set, MAP_SHARED or MAP_SHARED_VALIDATE must
+	 * be set as well.
+	 */
+	if (flags & MAP_SHARED_PT) {
+#if VM_SHARED_PT
+		if (flags & (MAP_SHARED | MAP_SHARED_VALIDATE))
+			ptshare = 1;
+		else
+			return -EINVAL;
+#else
+		return -EINVAL;
+#endif
+	}
+
 	/* Obtain the address to map to. we verify (or select) it and ensure
 	 * that it represents a valid section of the address space.
 	 */
@@ -1414,6 +1430,60 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 	    ((vm_flags & VM_LOCKED) ||
 	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
 		*populate = len;
+
+#if VM_SHARED_PT
+	/*
+	 * Check if this mapping is a candidate for page table sharing
+	 * at the PMD level. It is if the following conditions hold:
+	 * - It is not an anonymous mapping
+	 * - It is not a hugetlbfs mapping (for now)
+	 * - flags contains MAP_SHARED or MAP_SHARED_VALIDATE and
+	 *   MAP_SHARED_PT
+	 * - The start address is aligned to PMD size
+	 * - The mapping size is a multiple of PMD size
+	 */
+	if (ptshare && file && !is_file_hugepages(file)) {
+		struct vm_area_struct *vma;
+
+		vma = find_vma(mm, addr);
+		if (!((vma->vm_start | vma->vm_end) & (PMD_SIZE - 1))) {
+			struct mshare_data *info = file->f_mapping->ptshare_data;
+
+			/*
+			 * If this mapping has not been set up for page table
+			 * sharing yet, do so by creating a new mm to hold the
+			 * shared page tables for this mapping.
+			 */
+			if (info == NULL) {
+				int ret;
+
+				ret = ptshare_new_mm(file, vma);
+				if (ret < 0)
+					return ret;
+				info = file->f_mapping->ptshare_data;
+				ret = ptshare_insert_vma(info->mm, vma);
+				if (ret < 0)
+					addr = ret;
+				else
+					vma->vm_flags |= VM_SHARED_PT;
+			} else {
+				/*
+				 * Page tables will be shared only if the
+				 * file is mapped in with the same permissions
+				 * across all mappers, with the same starting
+				 * address and size.
+				 */
+				if (((prot & info->mode) == info->mode) &&
+				    (addr == info->start) &&
+				    (len == info->size)) {
+					vma->vm_flags |= VM_SHARED_PT;
+					refcount_inc(&info->refcnt);
+				}
+			}
+		}
+	}
+#endif
+
 	return addr;
 }
 
@@ -2491,6 +2561,21 @@ int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm,
 	if (end == start)
 		return -EINVAL;
 
+	/*
+	 * Check if this vma uses shared page tables.
+	 */
+	vma = find_vma_intersection(mm, start, end);
+	if (vma && unlikely(vma_is_shared(vma))) {
+		struct mshare_data *info = NULL;
+
+		if (vma->vm_file && vma->vm_file->f_mapping)
+			info = vma->vm_file->f_mapping->ptshare_data;
+		/* Don't allow partial munmaps */
+		if (info && ((start != info->start) || (len != info->size)))
+			return -EINVAL;
+		ptshare_del_mm(vma);
+	}
+
 	 /* arch_unmap() might do unmaps itself.  */
 	arch_unmap(mm, start, end);
 
@@ -2660,6 +2745,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 			}
 		}
 
+		if (vm_flags & VM_SHARED_PT)
+			vma->vm_flags |= VM_SHARED_PT;
 		vm_flags = vma->vm_flags;
 	} else if (vm_flags & VM_SHARED) {
 		error = shmem_zero_setup(vma);
diff --git a/mm/ptshare.c b/mm/ptshare.c
new file mode 100644
index 000000000000..97322f822233
--- /dev/null
+++ b/mm/ptshare.c
@@ -0,0 +1,262 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Share page table entries when possible to reduce the amount of extra
+ * memory consumed by page tables
+ *
+ * Copyright (C) 2022 Oracle Corp. All rights reserved.
+ * Authors:	Khalid Aziz <khalid.aziz@xxxxxxxxxx>
+ *		Matthew Wilcox <willy@xxxxxxxxxxxxx>
+ */
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <asm/pgalloc.h>
+#include "internal.h"
+
+/*
+ * Return the pmd for addr in the given mm, allocating the intermediate
+ * p4d, pud and pmd levels as needed. Returns NULL if the pgd entry is
+ * not present or an allocation fails.
+ */
+static pmd_t
+*get_pmd(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pgd = pgd_offset(mm, addr);
+	if (pgd_none(*pgd))
+		return NULL;
+
+	p4d = p4d_offset(pgd, addr);
+	if (p4d_none(*p4d)) {
+		p4d = p4d_alloc(mm, pgd, addr);
+		if (!p4d)
+			return NULL;
+	}
+
+	pud = pud_offset(p4d, addr);
+	if (pud_none(*pud)) {
+		pud = pud_alloc(mm, p4d, addr);
+		if (!pud)
+			return NULL;
+	}
+
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none(*pmd)) {
+		pmd = pmd_alloc(mm, pud, addr);
+		if (!pmd)
+			return NULL;
+	}
+
+	return pmd;
+}
+
+/*
+ * Find the shared pmd entries in the host mm struct and install them
+ * into the guest page tables.
+ */
+static int
+ptshare_copy_pmd(struct mm_struct *host_mm, struct mm_struct *guest_mm,
+		struct vm_area_struct *vma, unsigned long addr)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *host_pmd;
+	spinlock_t *host_ptl, *guest_ptl;
+
+	pgd = pgd_offset(guest_mm, addr);
+	p4d = p4d_offset(pgd, addr);
+	if (p4d_none(*p4d)) {
+		p4d = p4d_alloc(guest_mm, pgd, addr);
+		if (!p4d)
+			return 1;
+	}
+
+	pud = pud_offset(p4d, addr);
+	if (pud_none(*pud)) {
+		host_pmd = get_pmd(host_mm, addr);
+		if (!host_pmd)
+			return 1;
+
+		get_page(virt_to_page(host_pmd));
+		host_ptl = pmd_lockptr(host_mm, host_pmd);
+		guest_ptl = pud_lockptr(guest_mm, pud);
+		spin_lock(host_ptl);
+		spin_lock(guest_ptl);
+		pud_populate(guest_mm, pud,
+			(pmd_t *)((unsigned long)host_pmd & PAGE_MASK));
+		put_page(virt_to_page(host_pmd));
+		spin_unlock(guest_ptl);
+		spin_unlock(host_ptl);
+	}
+
+	return 0;
+}
+
+/*
+ * Find the shared page tables in the hosting mm struct and install
+ * them in the guest mm struct.
+ */
+vm_fault_t
+find_shared_vma(struct vm_area_struct **vmap, unsigned long *addrp,
+		unsigned int flags)
+{
+	struct mshare_data *info;
+	struct mm_struct *host_mm;
+	struct vm_area_struct *host_vma, *guest_vma = *vmap;
+	unsigned long host_addr;
+	pmd_t *guest_pmd, *host_pmd;
+
+	if ((!guest_vma->vm_file) || (!guest_vma->vm_file->f_mapping))
+		return 0;
+	info = guest_vma->vm_file->f_mapping->ptshare_data;
+	if (!info) {
+		pr_warn("VM_SHARED_PT vma with NULL ptshare_data\n");
+		dump_stack_print_info(KERN_WARNING);
+		return 0;
+	}
+	host_mm = info->mm;
+
+	mmap_read_lock(host_mm);
+	host_addr = *addrp - guest_vma->vm_start + host_mm->mmap_base;
+	host_pmd = get_pmd(host_mm, host_addr);
+	guest_pmd = get_pmd(guest_vma->vm_mm, *addrp);
+	if (!pmd_same(*guest_pmd, *host_pmd)) {
+		set_pmd(guest_pmd, *host_pmd);
+		mmap_read_unlock(host_mm);
+		return VM_FAULT_NOPAGE;
+	}
+
+	*addrp = host_addr;
+	host_vma = find_vma(host_mm, host_addr);
+	if (!host_vma)
+		return VM_FAULT_SIGSEGV;
+
+	/*
+	 * Point vm_mm for the faulting vma to the mm struct holding shared
+	 * page tables so the fault handling will happen in the right
+	 * shared context.
+	 */
+	guest_vma->vm_mm = host_mm;
+
+	return 0;
+}
+
+/*
+ * Create a new mm struct that will hold the shared PTEs. A pointer to
+ * this new mm is stored in the mshare_data structure, which also
+ * includes a refcount for any current references to PTEs in this new
+ * mm. This refcount is used to determine when the mm struct for shared
+ * PTEs can be deleted.
+ */
+int
+ptshare_new_mm(struct file *file, struct vm_area_struct *vma)
+{
+	struct mm_struct *new_mm;
+	struct mshare_data *info = NULL;
+	int retval = 0;
+	unsigned long start = vma->vm_start;
+	unsigned long len = vma->vm_end - vma->vm_start;
+
+	new_mm = mm_alloc();
+	if (!new_mm) {
+		retval = -ENOMEM;
+		goto err_free;
+	}
+	new_mm->mmap_base = start;
+	new_mm->task_size = len;
+	if (!new_mm->task_size)
+		new_mm->task_size--;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info) {
+		retval = -ENOMEM;
+		goto err_free;
+	}
+	info->mm = new_mm;
+	info->start = start;
+	info->size = len;
+	refcount_set(&info->refcnt, 1);
+	file->f_mapping->ptshare_data = info;
+
+	return retval;
+
+err_free:
+	if (new_mm)
+		mmput(new_mm);
+	kfree(info);
+	return retval;
+}
+
+/*
+ * Insert a vma into the mm holding shared page tables.
+ */
+int
+ptshare_insert_vma(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+	struct vm_area_struct *new_vma;
+	int err = 0;
+
+	new_vma = vm_area_dup(vma);
+	if (!new_vma)
+		return -ENOMEM;
+
+	new_vma->vm_file = NULL;
+	/*
+	 * This new vma belongs to the host mm, so clear the VM_SHARED_PT
+	 * flag on it; that is how the host vma is recognized when page
+	 * tables are cleaned up. Do not use THP for page table shared
+	 * regions.
+	 */
+	new_vma->vm_flags &= ~(VM_SHARED | VM_SHARED_PT);
+	new_vma->vm_flags |= VM_NOHUGEPAGE;
+	new_vma->vm_mm = mm;
+
+	err = insert_vm_struct(mm, new_vma);
+	if (err)
+		return -ENOMEM;
+
+	/*
+	 * Copy the PMD entries from the host mm to the guest so they
+	 * use the same PTEs.
+	 */
+	err = ptshare_copy_pmd(mm, vma->vm_mm, vma, vma->vm_start);
+
+	return err;
+}
+
+/*
+ * Free the mm struct created to hold shared PTEs and the associated
+ * data structures.
+ */
+static inline void
+free_ptshare_mm(struct mshare_data *info)
+{
+	mmput(info->mm);
+	kfree(info);
+}
+
+/*
+ * This function is called when a reference to the shared PTEs in the
+ * mm struct is dropped. It updates the refcount and checks whether the
+ * last reference to the mm struct holding the shared PTEs has been
+ * dropped. If so, it cleans up the mm struct and the associated data
+ * structures.
+ */
+void
+ptshare_del_mm(struct vm_area_struct *vma)
+{
+	struct mshare_data *info;
+	struct file *file = vma->vm_file;
+
+	if (!file || (!file->f_mapping))
+		return;
+	info = file->f_mapping->ptshare_data;
+	WARN_ON(!info);
+	if (!info)
+		return;
+
+	if (refcount_dec_and_test(&info->refcnt)) {
+		free_ptshare_mm(info);
+		file->f_mapping->ptshare_data = NULL;
+	}
+}
-- 
2.34.1
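One user-visible consequence of the do_mas_munmap() change above may
be worth noting: partial unmaps of a shared page table region are
rejected. A sketch of what that looks like from userspace, under the
same assumptions (and the same hypothetical file and hint address) as
the example after the diffstat:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MAP_SHARED_PT
#define MAP_SHARED_PT	0x200000
#endif
#ifndef MAP_FIXED_NOREPLACE
#define MAP_FIXED_NOREPLACE	0x100000
#endif
#define PMD_SZ	(2UL << 20)		/* assumption: x86-64, 2 MiB PMDs */

int main(void)
{
	size_t len = 2 * PMD_SZ;
	int fd = open("/tmp/ptshare-demo", O_RDWR | O_CREAT, 0644);
	char *p;

	if (fd < 0 || ftruncate(fd, len) < 0)
		return 1;
	p = mmap((void *)(0x7f00UL << 32), len, PROT_READ | PROT_WRITE,
		 MAP_SHARED | MAP_SHARED_PT | MAP_FIXED_NOREPLACE,
		 fd, 0);
	if (p == MAP_FAILED)
		return 1;

	/* len != info->size here, so do_mas_munmap() returns -EINVAL */
	if (munmap(p, PMD_SZ) < 0)
		printf("partial munmap: %s (EINVAL expected)\n",
		       strerror(errno));

	/*
	 * A full unmap drops the mshare_data refcount taken at mmap
	 * time; the last unmapper frees the host mm via
	 * ptshare_del_mm().
	 */
	if (munmap(p, len) == 0)
		printf("full munmap: ok\n");
	return 0;
}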