This ioctl can be used to watch the process's memory and perform atomic operations which aren't possible through procfs. Three operations have been implemented: - PAGEMAP_SD_GET gets the soft dirty pages in a address range. - PAGEMAP_SD_CLEAR clears the soft dirty bit from dirty pages in a address range. - PAGEMAP_SD_GET_AND_CLEAR gets and clears the soft dirty bit in a address range. struct pagemap_sd_args is used as the argument of the IOCTL. In this struct: - The range is specified through start and len. - The output buffer and size is specified as vec and vec_len. - The flags can be specified in the flags field. Currently only one PAGEMAP_SD_NO_REUSED_REGIONS is supported which can be specified to ignore the VMA dirty flags. This is based on a patch from Gabriel Krisman Bertazi. Signed-off-by: Muhammad Usama Anjum <usama.anjum@xxxxxxxxxxxxx> --- Changes in v3: - Tighten the user-kernel interface by using explicit types and add more error checking Changes in v2: - Convert the interface from syscall to ioctl - Remove pidfd support as it doesn't make sense in ioctl --- fs/proc/task_mmu.c | 260 ++++++++++++++++++++++++++++++++++ include/uapi/linux/fs.h | 23 +++ tools/include/uapi/linux/fs.h | 23 +++ 3 files changed, 306 insertions(+) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f66674033207..33d3d5c2ab40 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -19,6 +19,8 @@ #include <linux/shmem_fs.h> #include <linux/uaccess.h> #include <linux/pkeys.h> +#include <uapi/linux/fs.h> +#include <linux/vmalloc.h> #include <asm/elf.h> #include <asm/tlb.h> @@ -1775,11 +1777,269 @@ static int pagemap_release(struct inode *inode, struct file *file) return 0; } +#ifdef CONFIG_MEM_SOFT_DIRTY +#define IS_CLEAR_SD_OP(op) (op == PAGEMAP_SD_CLEAR || op == PAGEMAP_SD_GET_AND_CLEAR) +#define IS_GET_SD_OP(op) (op == PAGEMAP_SD_GET || op == PAGEMAP_SD_GET_AND_CLEAR) +#define PAGEMAP_SD_FLAGS_MASK (PAGEMAP_SD_NO_REUSED_REGIONS) + +struct pagemap_sd_private { + unsigned long start; + __u64 *vec; + unsigned long vec_len; + unsigned long index; + unsigned int op; + unsigned int flags; +}; + +static int pagemap_sd_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct pagemap_sd_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long start = addr; + spinlock_t *ptl; + pte_t *pte; + int dirty; + bool dirty_vma = (p->flags & PAGEMAP_SD_NO_REUSED_REGIONS) ? 0 : + (vma->vm_flags & VM_SOFTDIRTY); + + end = min(end, walk->vma->vm_end); + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { + if (dirty_vma || check_soft_dirty_pmd(vma, addr, pmd, false)) { + /* + * Break huge page into small pages if operation needs to be performed is + * on a portion of the huge page or the return buffer cannot store complete + * data. Then process this PMD as having normal pages. + */ + if ((IS_CLEAR_SD_OP(p->op) && (end - addr < HPAGE_SIZE)) || + (IS_GET_SD_OP(p->op) && (p->index + HPAGE_SIZE/PAGE_SIZE > p->vec_len))) { + spin_unlock(ptl); + split_huge_pmd(vma, pmd, addr); + goto process_smaller_pages; + } else { + dirty = check_soft_dirty_pmd(vma, addr, pmd, IS_CLEAR_SD_OP(p->op)); + if (IS_GET_SD_OP(p->op) && (dirty_vma || dirty)) { + for (; addr != end && p->index < p->vec_len; + addr += PAGE_SIZE) + p->vec[p->index++] = addr - p->start; + } + } + } + spin_unlock(ptl); + return 0; + } + +process_smaller_pages: + if (pmd_trans_unstable(pmd)) + return 0; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) { + dirty = check_soft_dirty(vma, addr, pte, IS_CLEAR_SD_OP(p->op)); + + if (IS_GET_SD_OP(p->op) && (dirty_vma || dirty)) { + p->vec[p->index++] = addr - p->start; + WARN_ON(p->index > p->vec_len); + } + } + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + + if (IS_CLEAR_SD_OP(p->op)) + flush_tlb_mm_range(vma->vm_mm, start, end, PAGE_SHIFT, false); + + return 0; +} + +static int pagemap_sd_pte_hole(unsigned long addr, unsigned long end, int depth, + struct mm_walk *walk) +{ + struct pagemap_sd_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + + if (p->flags & PAGEMAP_SD_NO_REUSED_REGIONS) + return 0; + + if (vma && (vma->vm_flags & VM_SOFTDIRTY) && IS_GET_SD_OP(p->op)) { + for (; addr != end && p->index < p->vec_len; addr += PAGE_SIZE) + p->vec[p->index++] = addr - p->start; + } + + return 0; +} + +static int pagemap_sd_pre_vma(unsigned long start, unsigned long end, struct mm_walk *walk) +{ + struct pagemap_sd_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + int ret; + unsigned long end_cut = end; + + if (p->flags & PAGEMAP_SD_NO_REUSED_REGIONS) + return 0; + + if (IS_CLEAR_SD_OP(p->op) && (vma->vm_flags & VM_SOFTDIRTY)) { + if (vma->vm_start < start) { + ret = split_vma(vma->vm_mm, vma, start, 1); + if (ret) + return ret; + } + + if (IS_GET_SD_OP(p->op)) + end_cut = min(start + p->vec_len * PAGE_SIZE, end); + + if (vma->vm_end > end_cut) { + ret = split_vma(vma->vm_mm, vma, end_cut, 0); + if (ret) + return ret; + } + } + + return 0; +} + +static void pagemap_sd_post_vma(struct mm_walk *walk) +{ + struct pagemap_sd_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + + if (p->flags & PAGEMAP_SD_NO_REUSED_REGIONS) + return; + + if (IS_CLEAR_SD_OP(p->op) && (vma->vm_flags & VM_SOFTDIRTY)) { + vma->vm_flags &= ~VM_SOFTDIRTY; + vma_set_page_prot(vma); + } +} + +static int pagemap_sd_pmd_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct pagemap_sd_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + + if (IS_GET_SD_OP(p->op) && (p->index == p->vec_len)) + return -1; + + if (vma->vm_flags & VM_PFNMAP) + return 1; + + return 0; +} + +static const struct mm_walk_ops pagemap_sd_ops = { + .test_walk = pagemap_sd_pmd_test_walk, + .pre_vma = pagemap_sd_pre_vma, + .pmd_entry = pagemap_sd_pmd_entry, + .pte_hole = pagemap_sd_pte_hole, + .post_vma = pagemap_sd_post_vma, +}; + +static long do_pagemap_sd_cmd(struct mm_struct *mm, unsigned int cmd, struct pagemap_sd_args *arg) +{ + struct pagemap_sd_private sd_data; + struct mmu_notifier_range range; + unsigned long start, end; + int ret; + + start = (unsigned long)untagged_addr(arg->start); + if ((!IS_ALIGNED(start, PAGE_SIZE)) || (!access_ok((void __user *)start, arg->len))) + return -EINVAL; + + if (IS_GET_SD_OP(cmd) && + ((arg->vec_len == 0) || (!arg->vec) || (!access_ok((loff_t *)arg->vec, arg->vec_len)))) + return -EINVAL; + + if ((arg->flags & ~PAGEMAP_SD_FLAGS_MASK) || (arg->__reserved)) + return -EINVAL; + + end = start + arg->len; + sd_data.start = start; + sd_data.op = cmd; + sd_data.flags = arg->flags; + sd_data.index = 0; + sd_data.vec_len = arg->vec_len; + + if (IS_GET_SD_OP(cmd)) { + sd_data.vec = vzalloc(arg->vec_len * sizeof(loff_t)); + if (!sd_data.vec) + return -ENOMEM; + } + + if (IS_CLEAR_SD_OP(cmd)) { + mmap_write_lock(mm); + + mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, 0, NULL, + mm, start, end); + mmu_notifier_invalidate_range_start(&range); + inc_tlb_flush_pending(mm); + } else { + mmap_read_lock(mm); + } + + ret = walk_page_range(mm, start, end, &pagemap_sd_ops, &sd_data); + + if (IS_CLEAR_SD_OP(cmd)) { + mmu_notifier_invalidate_range_end(&range); + dec_tlb_flush_pending(mm); + + mmap_write_unlock(mm); + } else { + mmap_read_unlock(mm); + } + + if (ret < 0) + goto free_sd_data; + + if (IS_GET_SD_OP(cmd)) { + ret = copy_to_user((loff_t *)arg->vec, sd_data.vec, sd_data.index * sizeof(loff_t)); + if (ret) { + ret = -EIO; + goto free_sd_data; + } + ret = sd_data.index; + } else { + ret = 0; + } + +free_sd_data: + if (IS_GET_SD_OP(cmd)) + vfree(sd_data.vec); + + return ret; +} + +static long pagemap_sd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct pagemap_sd_args __user *uarg = (struct pagemap_sd_args __user *)arg; + struct mm_struct *mm = file->private_data; + struct pagemap_sd_args arguments; + + switch (cmd) { + case PAGEMAP_SD_GET: + fallthrough; + case PAGEMAP_SD_CLEAR: + fallthrough; + case PAGEMAP_SD_GET_AND_CLEAR: + if (copy_from_user(&arguments, uarg, sizeof(struct pagemap_sd_args))) + return -EFAULT; + return do_pagemap_sd_cmd(mm, cmd, &arguments); + default: + return -EINVAL; + } +} +#endif /* CONFIG_MEM_SOFT_DIRTY */ + const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, .open = pagemap_open, .release = pagemap_release, +#ifdef CONFIG_MEM_SOFT_DIRTY + .unlocked_ioctl = pagemap_sd_ioctl, + .compat_ioctl = pagemap_sd_ioctl, +#endif /* CONFIG_MEM_SOFT_DIRTY */ }; #endif /* CONFIG_PROC_PAGE_MONITOR */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index b7b56871029c..4f6d1c0ae524 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -305,4 +305,27 @@ typedef int __bitwise __kernel_rwf_t; #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ RWF_APPEND) +/** + * struct pagemap_sd_args - Soft-dirty IOCTL argument + * @start: Starting address + * @len: Length of the region + * @vec: Output buffer address + * @vec_len: Length of the output buffer + * @flags: Special flags for the IOCTL + * @__reserved: Reserved member to preserve data alignment. Should be 0. + */ +struct pagemap_sd_args { + __u64 __user start; + __u64 len; + __u64 __user vec; + __u64 vec_len; + __u32 flags; + __u32 __reserved; +}; + +#define PAGEMAP_SD_GET _IOWR('f', 16, struct pagemap_sd_args) +#define PAGEMAP_SD_CLEAR _IOWR('f', 17, struct pagemap_sd_args) +#define PAGEMAP_SD_GET_AND_CLEAR _IOWR('f', 18, struct pagemap_sd_args) +#define PAGEMAP_SD_NO_REUSED_REGIONS 0x1 + #endif /* _UAPI_LINUX_FS_H */ diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h index b7b56871029c..4f6d1c0ae524 100644 --- a/tools/include/uapi/linux/fs.h +++ b/tools/include/uapi/linux/fs.h @@ -305,4 +305,27 @@ typedef int __bitwise __kernel_rwf_t; #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ RWF_APPEND) +/** + * struct pagemap_sd_args - Soft-dirty IOCTL argument + * @start: Starting address + * @len: Length of the region + * @vec: Output buffer address + * @vec_len: Length of the output buffer + * @flags: Special flags for the IOCTL + * @__reserved: Reserved member to preserve data alignment. Should be 0. + */ +struct pagemap_sd_args { + __u64 __user start; + __u64 len; + __u64 __user vec; + __u64 vec_len; + __u32 flags; + __u32 __reserved; +}; + +#define PAGEMAP_SD_GET _IOWR('f', 16, struct pagemap_sd_args) +#define PAGEMAP_SD_CLEAR _IOWR('f', 17, struct pagemap_sd_args) +#define PAGEMAP_SD_GET_AND_CLEAR _IOWR('f', 18, struct pagemap_sd_args) +#define PAGEMAP_SD_NO_REUSED_REGIONS 0x1 + #endif /* _UAPI_LINUX_FS_H */ -- 2.30.2