This adds a new syscall to map from or to another process vma.
Flag PVMMAP_FIXED may be specified, its meaning is similar to
mmap()'s MAP_FIXED.

@pid > 0 means to map from process of @pid to current,
@pid < 0 means to map from current to @pid process.

VMAs are merged on destination, i.e. if the source task has a VMA
with address [start, end], and we map it sequentially twice:

process_vm_mmap(@pid, start, (end - start)/2, ...);
process_vm_mmap(@pid, start + (end - start)/2, (end - start)/2, ...);

the destination task will have a single vma [start, end].

Signed-off-by: Kirill Tkhai <ktkhai@xxxxxxxxxxxxx>
---
 include/linux/mm.h                     |    4 +
 include/linux/mm_types.h               |    2 +
 include/uapi/asm-generic/mman-common.h |    5 +
 mm/mmap.c                              |  145 ++++++++++++++++++++++++++++++++
 mm/process_vm_access.c                 |   79 +++++++++++++++++
 5 files changed, 235 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 54328d08dbdd..c49bcfac593c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2382,6 +2382,10 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
 		       struct list_head *uf, bool downgrade);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t,
 		     struct list_head *uf);
+extern unsigned long mmap_process_vm(struct mm_struct *, unsigned long,
+				     struct mm_struct *, unsigned long,
+				     unsigned long, unsigned long,
+				     struct list_head *);
 
 static inline unsigned long
 do_mmap_pgoff(struct file *file, unsigned long addr,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 1815fbc40926..885f256f2fb7 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -261,11 +261,13 @@ struct vm_region {
 
 #ifdef CONFIG_USERFAULTFD
 #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, })
+#define IS_NULL_VM_UFFD_CTX(uctx) ((uctx)->ctx == NULL)
 struct vm_userfaultfd_ctx {
 	struct userfaultfd_ctx *ctx;
 };
 #else /* CONFIG_USERFAULTFD */
 #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
+#define IS_NULL_VM_UFFD_CTX(uctx) (true)
 struct vm_userfaultfd_ctx {};
 #endif /* CONFIG_USERFAULTFD */
 
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index abd238d0f7a4..44cb6cf77e93 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -28,6 +28,11 @@
 /* 0x0100 - 0x80000 flags are defined in asm-generic/mman.h */
 #define MAP_FIXED_NOREPLACE	0x100000	/* MAP_FIXED which doesn't unmap underlying mapping */
 
+/*
+ * Flags for process_vm_mmap
+ */
+#define PVMMAP_FIXED	0x01
+
 /*
  * Flags for mlock
  */
diff --git a/mm/mmap.c b/mm/mmap.c
index b2a1f77643cd..3dbf280e9f8e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3274,6 +3274,151 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 	return NULL;
 }
 
+/*
+ * Copy the part [@src_addr, @src_addr + @len) of @src_vma into @dst_mm at
+ * @dst_addr, unmapping whatever was mapped there before. The caller,
+ * process_vm_mmap(), holds mmap_sem of both mms for write.
+ *
+ * Returns 0 on success, -ENOMEM otherwise.
+ */
+static int do_mmap_process_vm(struct vm_area_struct *src_vma,
+			      unsigned long src_addr,
+			      struct mm_struct *dst_mm,
+			      unsigned long dst_addr,
+			      unsigned long len,
+			      struct list_head *uf)
+{
+	unsigned long nr_pages = len >> PAGE_SHIFT;
+	struct vm_area_struct *dst_vma;
+	bool charged = false;
+	unsigned long pgoff;
+	bool unused;
+
+	if (do_munmap(dst_mm, dst_addr, len, uf))
+		return -ENOMEM;
+
+	if (src_vma->vm_flags & VM_ACCOUNT) {
+		if (security_vm_enough_memory_mm(dst_mm, nr_pages))
+			return -ENOMEM;
+		charged = true;
+	}
+
+	pgoff = src_vma->vm_pgoff +
+			((src_addr - src_vma->vm_start) >> PAGE_SHIFT);
+	dst_vma = copy_vma(&src_vma, dst_mm, dst_addr,
+			   len, pgoff, &unused, false);
+	if (!dst_vma) {
+		/* Give back only what was actually charged above. */
+		if (charged)
+			vm_unacct_memory(nr_pages);
+		return -ENOMEM;
+	}
+
+	if (copy_page_range(dst_mm, src_vma->vm_mm, src_vma,
+			    dst_addr, src_addr, src_addr + len)) {
+		do_munmap(dst_mm, dst_addr, len, uf);
+		return -ENOMEM;
+	}
+
+	if (dst_vma->vm_file)
+		uprobe_mmap(dst_vma);
+	perf_event_mmap(dst_vma);
+
+	dst_vma->vm_flags |= VM_SOFTDIRTY;
+	vma_set_page_prot(dst_vma);
+
+	vm_stat_account(dst_mm, dst_vma->vm_flags, nr_pages);
+	return 0;
+}
+
+/*
+ * mmap_process_vm() - map a range of @src_mm into @dst_mm
+ * @src_mm:   mm to map from
+ * @src_addr: start of the source range; [@src_addr, @src_addr + @len) must
+ *            lie within a single vma
+ * @dst_mm:   mm to map into
+ * @dst_addr: destination address, only a hint unless PVMMAP_FIXED is set
+ * @len:      length of the range in bytes
+ * @flags:    PVMMAP_* flags
+ * @uf:       list handed to do_munmap() for userfaultfd unmap events
+ *
+ * Returns the destination address on success, or a negative errno value cast
+ * to unsigned long on failure.
+ */
+unsigned long mmap_process_vm(struct mm_struct *src_mm,
+			      unsigned long src_addr,
+			      struct mm_struct *dst_mm,
+			      unsigned long dst_addr,
+			      unsigned long len,
+			      unsigned long flags,
+			      struct list_head *uf)
+{
+	struct vm_area_struct *src_vma = find_vma(src_mm, src_addr);
+	unsigned long gua_flags = 0;
+	unsigned long ret;
+
+	if (!src_vma || src_vma->vm_start > src_addr)
+		return -EFAULT;
+	if (len > src_vma->vm_end - src_addr)
+		return -EFAULT;
+	if (src_vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
+		return -EFAULT;
+	if (is_vm_hugetlb_page(src_vma) || (src_vma->vm_flags & VM_IO))
+		return -EINVAL;
+	if (dst_mm->map_count + 2 > sysctl_max_map_count)
+		return -ENOMEM;
+	/* ENOTSUPP is kernel-internal; userspace must see EOPNOTSUPP. */
+	if (!IS_NULL_VM_UFFD_CTX(&src_vma->vm_userfaultfd_ctx))
+		return -EOPNOTSUPP;
+
+	if (src_vma->vm_flags & VM_SHARED)
+		gua_flags |= MAP_SHARED;
+	else
+		gua_flags |= MAP_PRIVATE;
+	if (vma_is_anonymous(src_vma) || vma_is_shmem(src_vma))
+		gua_flags |= MAP_ANONYMOUS;
+	if (flags & PVMMAP_FIXED)
+		gua_flags |= MAP_FIXED;
+	/*
+	 * NOTE(review): get_unmapped_area() presumably searches current->mm,
+	 * which is not @dst_mm when mapping into another task (@pid < 0).
+	 * Confirm, and search the destination mm instead if so.
+	 */
+	ret = get_unmapped_area(src_vma->vm_file, dst_addr, len,
+				src_vma->vm_pgoff +
+				((src_addr - src_vma->vm_start) >> PAGE_SHIFT),
+				gua_flags);
+	if (offset_in_page(ret))
+		return ret;
+	dst_addr = ret;
+
+	/*
+	 * Within a single mm, unmapping a destination range that overlaps
+	 * @src_vma would split or free the very vma we are copying from and
+	 * leave @src_vma dangling. Refuse such requests.
+	 */
+	if (src_mm == dst_mm &&
+	    dst_addr < src_vma->vm_end && dst_addr + len > src_vma->vm_start)
+		return -EINVAL;
+
+	/* Check against address space limit. */
+	if (!may_expand_vm(dst_mm, src_vma->vm_flags, len >> PAGE_SHIFT)) {
+		unsigned long nr_pages;
+
+		/* Do not count pages that are about to be unmapped. */
+		nr_pages = count_vma_pages_range(dst_mm, dst_addr, dst_addr + len);
+		if (!may_expand_vm(dst_mm, src_vma->vm_flags,
+					(len >> PAGE_SHIFT) - nr_pages))
+			return -ENOMEM;
+	}
+
+	ret = do_mmap_process_vm(src_vma, src_addr, dst_mm, dst_addr, len, uf);
+	if (ret)
+		return ret;
+
+	return dst_addr;
+}
+
 /*
  * Return true if the calling process may expand its vm space by the passed
  * number of pages
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index a447092d4635..7fca2c5c7edd 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -17,6 +17,8 @@
 #include <linux/ptrace.h>
 #include <linux/slab.h>
 #include <linux/syscalls.h>
+#include <linux/mman.h>
+#include <linux/userfaultfd_k.h>
 
 #ifdef CONFIG_COMPAT
 #include <linux/compat.h>
@@ -295,6 +297,76 @@ static ssize_t process_vm_rw(pid_t pid,
 	return rc;
 }
 
+/*
+ * Map @len bytes at @src_addr of one task to @dst_addr of another one.
+ * @pid > 0 maps from task @pid to current, @pid < 0 from current to task
+ * -@pid. @src_addr is rounded down and @len rounded up to a page boundary.
+ *
+ * Returns the destination address on success or a negative errno value.
+ */
+static unsigned long process_vm_mmap(pid_t pid, unsigned long src_addr,
+				     unsigned long len, unsigned long dst_addr,
+				     unsigned long flags)
+{
+	struct mm_struct *src_mm, *dst_mm;
+	struct task_struct *task;
+	unsigned long ret;
+	int depth = 0;
+	LIST_HEAD(uf);
+
+	len = PAGE_ALIGN(len);
+	src_addr = round_down(src_addr, PAGE_SIZE);
+	if (flags & PVMMAP_FIXED)
+		dst_addr = round_down(dst_addr, PAGE_SIZE);
+	else
+		dst_addr = round_hint_to_min(dst_addr);
+
+	if ((flags & ~PVMMAP_FIXED) || len == 0 || len > TASK_SIZE ||
+	    src_addr == 0 || dst_addr > TASK_SIZE - len)
+		return -EINVAL;
+	task = find_get_task_by_vpid(pid > 0 ? pid : -pid);
+	if (!task)
+		return -ESRCH;
+	if (unlikely(task->flags & PF_KTHREAD)) {
+		ret = -EINVAL;
+		goto out_put_task;
+	}
+
+	src_mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);
+	if (!src_mm || IS_ERR(src_mm)) {
+		ret = IS_ERR(src_mm) ? PTR_ERR(src_mm) : -ESRCH;
+		goto out_put_task;
+	}
+	dst_mm = current->mm;
+	mmget(dst_mm);
+
+	/* So far src_mm is the other task's mm; flip the direction if asked. */
+	if (pid < 0)
+		swap(src_mm, dst_mm);
+
+	/* Double lock mm in address order: smallest is the first */
+	if (src_mm < dst_mm) {
+		down_write(&src_mm->mmap_sem);
+		depth = SINGLE_DEPTH_NESTING;
+	}
+	down_write_nested(&dst_mm->mmap_sem, depth);
+	if (src_mm > dst_mm)
+		down_write_nested(&src_mm->mmap_sem, SINGLE_DEPTH_NESTING);
+
+	ret = mmap_process_vm(src_mm, src_addr, dst_mm, dst_addr, len, flags, &uf);
+
+	up_write(&dst_mm->mmap_sem);
+	if (dst_mm != src_mm)
+		up_write(&src_mm->mmap_sem);
+
+	userfaultfd_unmap_complete(dst_mm, &uf);
+	mmput(src_mm);
+	mmput(dst_mm);
+out_put_task:
+	put_task_struct(task);
+	return ret;
+}
+
 SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec,
 		unsigned long, liovcnt, const struct iovec __user *, rvec,
 		unsigned long, riovcnt,	unsigned long, flags)
@@ -310,6 +382,13 @@ SYSCALL_DEFINE6(process_vm_writev, pid_t, pid,
 	return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1);
 }
 
+SYSCALL_DEFINE5(process_vm_mmap, pid_t, pid,
+		unsigned long, src_addr, unsigned long, len,
+		unsigned long, dst_addr, unsigned long, flags)
+{
+	return process_vm_mmap(pid, src_addr, len, dst_addr, flags);
+}
+
 #ifdef CONFIG_COMPAT
 
 static ssize_t