This adds a new syscall to duplicate a VMA of another process into the current one. Flag PVMMAP_FIXED may be specified; its meaning is similar to mmap()'s MAP_FIXED. VMAs are merged on the destination, i.e. if the source task has a VMA with address [start; end], and we map it sequentially twice: process_vm_mmap(@pid, start, start + (end - start)/2, ...); process_vm_mmap(@pid, start + (end - start)/2, end, ...); the destination task will have a single vma [start, end]. v2: Add PVMMAP_FIXED_NOREPLACE flag. Use find_vma_without_flags() and may_mmap_overlapped_region() helpers. Fix whitespace. The previous version was able to duplicate a VMA from current to a remote process, but it contained an error, so I removed that. get_unmapped_area needs to be extended to make it work with a remote VMA (which I missed initially). This requires a lot of refactoring, which may hide the main logic away from a reader, so let me do that later in a separate series. Signed-off-by: Kirill Tkhai <ktkhai@xxxxxxxxxxxxx> --- include/linux/mm.h | 4 + include/linux/mm_types.h | 2 + include/uapi/asm-generic/mman-common.h | 6 ++ mm/mmap.c | 107 ++++++++++++++++++++++++++++++++ mm/process_vm_access.c | 69 +++++++++++++++++++++ 5 files changed, 188 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 65ceb56acd44..9d1c79a44128 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2385,6 +2385,10 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf, bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); +extern unsigned long mmap_process_vm(struct mm_struct *, unsigned long, + struct mm_struct *, unsigned long, + unsigned long, unsigned long, + struct list_head *); static inline unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1815fbc40926..885f256f2fb7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h 
@@ -261,11 +261,13 @@ struct vm_region {
 
 #ifdef CONFIG_USERFAULTFD
 #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, })
+#define IS_NULL_VM_UFFD_CTX(uctx) ((uctx)->ctx == NULL)
 struct vm_userfaultfd_ctx {
 	struct userfaultfd_ctx *ctx;
 };
 #else /* CONFIG_USERFAULTFD */
 #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
+#define IS_NULL_VM_UFFD_CTX(uctx) (true)
 struct vm_userfaultfd_ctx {};
 #endif /* CONFIG_USERFAULTFD */
 
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index abd238d0f7a4..91803e6ada7c 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -28,6 +28,12 @@
 /* 0x0100 - 0x80000 flags are defined in asm-generic/mman.h */
 #define MAP_FIXED_NOREPLACE	0x100000	/* MAP_FIXED which doesn't unmap underlying mapping */
 
+/*
+ * Flags for process_vm_mmap
+ */
+#define PVMMAP_FIXED		0x01
+#define PVMMAP_FIXED_NOREPLACE	0x02	/* PVMMAP_FIXED which doesn't unmap underlying mapping */
+
 /*
  * Flags for mlock
  */
diff --git a/mm/mmap.c b/mm/mmap.c
index 260e47e917e6..3123ecee5fb8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3282,6 +3282,139 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 	return NULL;
 }
 
+/*
+ * Duplicate [src_addr, src_addr + len) of @src_vma into @dst_mm at @dst_addr.
+ *
+ * Whatever is mapped in the destination range is unmapped first; the VMA is
+ * then cloned with copy_vma() and its page tables are filled in with
+ * copy_page_range(). Returns 0 on success or -ENOMEM on failure.
+ */
+static int do_mmap_process_vm(struct vm_area_struct *src_vma,
+				unsigned long src_addr,
+				struct mm_struct *dst_mm,
+				unsigned long dst_addr,
+				unsigned long len,
+				struct list_head *uf)
+{
+	struct vm_area_struct *dst_vma;
+	unsigned long pgoff;
+	bool unused, charged;
+	int ret;
+
+	if (do_munmap(dst_mm, dst_addr, len, uf))
+		return -ENOMEM;
+
+	/* Remember whether a commit charge was taken, so only that is undone. */
+	charged = !!(src_vma->vm_flags & VM_ACCOUNT);
+	if (charged && security_vm_enough_memory_mm(dst_mm, len >> PAGE_SHIFT))
+		return -ENOMEM;
+
+	pgoff = src_vma->vm_pgoff +
+			((src_addr - src_vma->vm_start) >> PAGE_SHIFT);
+	dst_vma = copy_vma(&src_vma, dst_mm, dst_addr,
+			   len, pgoff, &unused, false);
+	if (!dst_vma) {
+		ret = -ENOMEM;
+		goto unacct;
+	}
+
+	/*
+	 * Account the new range before populating it: do_munmap() on the
+	 * failure path below subtracts the range from the mm counters (and
+	 * drops the VM_ACCOUNT charge), so they must already include it.
+	 */
+	vm_stat_account(dst_mm, dst_vma->vm_flags, len >> PAGE_SHIFT);
+
+	ret = copy_page_range(dst_mm, src_vma->vm_mm, src_vma,
+			      dst_addr, src_addr, src_addr + len);
+	if (ret) {
+		do_munmap(dst_mm, dst_addr, len, uf);
+		return -ENOMEM;
+	}
+
+	if (dst_vma->vm_file)
+		uprobe_mmap(dst_vma);
+	perf_event_mmap(dst_vma);
+
+	dst_vma->vm_flags |= VM_SOFTDIRTY;
+	vma_set_page_prot(dst_vma);
+
+	return 0;
+
+unacct:
+	if (charged)
+		vm_unacct_memory(len >> PAGE_SHIFT);
+	return ret;
+}
+
+/*
+ * Map @len bytes starting at @src_addr in @src_mm into @dst_mm.
+ *
+ * @dst_addr is handed to get_unmapped_area() as a hint, or as the exact
+ * address when @flags contains PVMMAP_FIXED or PVMMAP_FIXED_NOREPLACE; with
+ * PVMMAP_FIXED_NOREPLACE an existing mapping in the range yields -EEXIST.
+ * The caller (process_vm_mmap()) holds mmap_sem of both mms for write.
+ *
+ * Returns the destination address, or a negative errno cast to
+ * unsigned long on failure.
+ */
+unsigned long mmap_process_vm(struct mm_struct *src_mm,
+			      unsigned long src_addr,
+			      struct mm_struct *dst_mm,
+			      unsigned long dst_addr,
+			      unsigned long len,
+			      unsigned long flags,
+			      struct list_head *uf)
+{
+	struct vm_area_struct *src_vma, *dst_vma;
+	unsigned long gua_flags = 0;
+	unsigned long ret;
+
+	src_vma = find_vma_without_flags(src_mm, src_addr, len,
+				VM_HUGETLB|VM_DONTEXPAND|VM_PFNMAP|VM_IO);
+	if (IS_ERR(src_vma))
+		return -EFAULT;
+	if (dst_mm->map_count > sysctl_max_map_count - 2)
+		return -ENOMEM;
+	if (!IS_NULL_VM_UFFD_CTX(&src_vma->vm_userfaultfd_ctx))
+		return -ENOTSUPP;
+
+	if (src_vma->vm_flags & VM_SHARED)
+		gua_flags |= MAP_SHARED;
+	else
+		gua_flags |= MAP_PRIVATE;
+	if (vma_is_anonymous(src_vma) || vma_is_shmem(src_vma))
+		gua_flags |= MAP_ANONYMOUS;
+	if (flags & PVMMAP_FIXED)
+		gua_flags |= MAP_FIXED;
+	if (flags & PVMMAP_FIXED_NOREPLACE)
+		gua_flags |= MAP_FIXED | MAP_FIXED_NOREPLACE;
+
+	ret = get_unmapped_area(src_vma->vm_file, dst_addr, len,
+				src_vma->vm_pgoff +
+				((src_addr - src_vma->vm_start) >> PAGE_SHIFT),
+				gua_flags);
+	if (offset_in_page(ret))
+		return ret;
+	if (flags & PVMMAP_FIXED_NOREPLACE) {
+		dst_vma = find_vma(dst_mm, dst_addr);
+		if (dst_vma && dst_vma->vm_start < dst_addr + len)
+			return -EEXIST;
+	}
+
+	dst_addr = ret;
+
+	/* Check against address space limit. */
+	if (!may_mmap_overlapped_region(dst_mm, src_vma->vm_flags, dst_addr, len))
+		return -ENOMEM;
+
+	ret = do_mmap_process_vm(src_vma, src_addr, dst_mm, dst_addr, len, uf);
+	if (ret)
+		return ret;
+
+	return dst_addr;
+}
+
 /*
  * Return true if the calling process may expand its vm space by the passed
  * number of pages
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index a447092d4635..e2073f52415b 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -17,6 +17,8 @@
 #include <linux/ptrace.h>
 #include <linux/slab.h>
 #include <linux/syscalls.h>
+#include <linux/mman.h>
+#include <linux/userfaultfd_k.h>
 
 #ifdef CONFIG_COMPAT
 #include <linux/compat.h>
@@ -295,6 +297,74 @@ static ssize_t process_vm_rw(pid_t pid,
 	return rc;
 }
 
+/*
+ * Duplicate a range of the address space of task @pid into the caller's mm.
+ *
+ * @len is rounded up and @src_addr down to a page boundary. @dst_addr is an
+ * exact address (rounded down to a page) when PVMMAP_FIXED or
+ * PVMMAP_FIXED_NOREPLACE is set and only a hint otherwise. Returns the
+ * address of the new mapping, or a negative errno.
+ */
+static unsigned long process_vm_mmap(pid_t pid, unsigned long src_addr,
+				     unsigned long len, unsigned long dst_addr,
+				     unsigned long flags)
+{
+	struct mm_struct *src_mm, *dst_mm;
+	struct task_struct *task;
+	unsigned long ret;
+	int depth = 0;
+	LIST_HEAD(uf);
+
+	len = PAGE_ALIGN(len);
+	src_addr = round_down(src_addr, PAGE_SIZE);
+	if (flags & (PVMMAP_FIXED|PVMMAP_FIXED_NOREPLACE))
+		dst_addr = round_down(dst_addr, PAGE_SIZE);
+	else
+		dst_addr = round_hint_to_min(dst_addr);
+
+	if ((flags & ~(PVMMAP_FIXED|PVMMAP_FIXED_NOREPLACE)) ||
+	    len == 0 || len > TASK_SIZE || src_addr == 0 ||
+	    dst_addr > TASK_SIZE - len || pid <= 0)
+		return -EINVAL;
+	task = find_get_task_by_vpid(pid);
+	if (!task)
+		return -ESRCH;
+	if (unlikely(task->flags & PF_KTHREAD)) {
+		ret = -EINVAL;
+		goto out_put_task;
+	}
+
+	src_mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);
+	if (!src_mm || IS_ERR(src_mm)) {
+		ret = IS_ERR(src_mm) ? PTR_ERR(src_mm) : -ESRCH;
+		goto out_put_task;
+	}
+	dst_mm = current->mm;
+	mmget(dst_mm);
+
+	/* Double lock mm in address order: smallest is the first */
+	if (src_mm < dst_mm) {
+		down_write(&src_mm->mmap_sem);
+		depth = SINGLE_DEPTH_NESTING;
+	}
+	down_write_nested(&dst_mm->mmap_sem, depth);
+	if (src_mm > dst_mm)
+		down_write_nested(&src_mm->mmap_sem, SINGLE_DEPTH_NESTING);
+
+	ret = mmap_process_vm(src_mm, src_addr, dst_mm, dst_addr, len, flags, &uf);
+
+	up_write(&dst_mm->mmap_sem);
+	if (dst_mm != src_mm)
+		up_write(&src_mm->mmap_sem);
+
+	userfaultfd_unmap_complete(dst_mm, &uf);
+	mmput(src_mm);
+	mmput(dst_mm);
+out_put_task:
+	put_task_struct(task);
+	return ret;
+}
+
 SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec,
 		unsigned long, liovcnt, const struct iovec __user *, rvec,
 		unsigned long, riovcnt, unsigned long, flags)
@@ -310,6 +380,13 @@ SYSCALL_DEFINE6(process_vm_writev, pid_t, pid,
 	return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1);
 }
 
+SYSCALL_DEFINE5(process_vm_mmap, pid_t, pid,
+		unsigned long, src_addr, unsigned long, len,
+		unsigned long, dst_addr, unsigned long, flags)
+{
+	return process_vm_mmap(pid, src_addr, len, dst_addr, flags);
+}
+
 #ifdef CONFIG_COMPAT
 
 static ssize_t