This IOCTL, PAGEMAP_SCAN can be used to get and/or clear the info about page table entries. The following operations are supported in this ioctl: - Get the information if the pages are soft-dirty, file mapped, present or swapped. - Clear the soft-dirty PTE bit of the pages. - Get and clear the soft-dirty PTE bit of the pages. Only the soft-dirty bit can be read and cleared atomically. struct pagemap_sd_args is used as the argument of the IOCTL. In this struct: - The range is specified through start and len. - The output buffer and size is specified as vec and vec_len. - The optional maximum requested pages are specified in the max_pages. - The flags can be specified in the flags field. The PAGEMAP_SD_CLEAR and PAGEMAP_SD_NO_REUSED_REGIONS are supported. - The masks are specified in rmask, amask, emask and return_mask. This IOCTL can be extended to get information about more PTE bits. This is based on a patch from Gabriel Krisman Bertazi. Signed-off-by: Muhammad Usama Anjum <usama.anjum@xxxxxxxxxxxxx> --- Changes in v5: - Remove tlb flushing even for clear operation Changes in v4: - Update the interface and implementation Changes in v3: - Tighten the user-kernel interface by using explicit types and add more error checking Changes in v2: - Convert the interface from syscall to ioctl - Remove pidfd support as it doesn't make sense in ioctl --- fs/proc/task_mmu.c | 314 ++++++++++++++++++++++++++++++++++ include/uapi/linux/fs.h | 53 ++++++ tools/include/uapi/linux/fs.h | 53 ++++++ 3 files changed, 420 insertions(+) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8235c536ac70..9690a44eb1fc 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -19,6 +19,9 @@ #include <linux/shmem_fs.h> #include <linux/uaccess.h> #include <linux/pkeys.h> +#include <uapi/linux/fs.h> +#include <linux/vmalloc.h> +#include <linux/minmax.h> #include <asm/elf.h> #include <asm/tlb.h> @@ -1775,11 +1778,322 @@ static int pagemap_release(struct inode *inode, struct file *file) return 0; } +#ifdef CONFIG_MEM_SOFT_DIRTY + +#define PAGEMAP_OP_MASK (PAGE_IS_SD | PAGE_IS_FILE | \ + PAGE_IS_PRESENT | PAGE_IS_SWAPED) +#define PAGEMAP_NON_SD_MASK (PAGE_IS_FILE | PAGE_IS_PRESENT | PAGE_IS_SWAPED) +#define PAGEMAP_SD_FLAGS_MASK (PAGEMAP_SD_CLEAR | PAGEMAP_NO_REUSED_REGIONS) +#define IS_CLEAR_OP(flags) (flags & PAGEMAP_SD_CLEAR) +#define IS_GET_OP(vec) (vec) + +struct pagemap_scan_private { + struct page_region *vec; + unsigned long vec_len; + unsigned long index; + unsigned int max_pages; + unsigned int found_pages; + unsigned int flags; + unsigned int rmask; + unsigned int amask; + unsigned int emask; + unsigned int return_mask; +}; + +static int add_to_out(bool sd, bool file, bool pres, bool swap, struct pagemap_scan_private *p, + unsigned long addr, unsigned int len) +{ + unsigned int bitmap, cpy = true, cur = sd | file << 1 | pres << 2 | swap << 3; + + if (p->rmask) + cpy = ((p->rmask & cur) == p->rmask) ? true : false; + if (cpy && p->amask) + cpy = (p->amask & cur) ? true : false; + if (cpy && p->emask) + cpy = (p->emask & cur) ? false : true; + + bitmap = cur & p->return_mask; + + if (cpy && bitmap) { + if (p->index && p->vec[p->index - 1].bitmap == bitmap && + p->vec[p->index - 1].start + p->vec[p->index - 1].len * PAGE_SIZE == addr) { + p->vec[p->index - 1].len += len; + p->found_pages += len; + } else if (p->index < p->vec_len) { + p->vec[p->index].start = addr; + p->vec[p->index].len = len; + p->vec[p->index].bitmap = bitmap; + p->index++; + p->found_pages += len; + } else { + return -ENOMEM; + } + } + + return 0; +} + +static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + int dirty, ret = 0; + spinlock_t *ptl; + pte_t *pte; + bool dirty_vma = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ? + (false) : (vma->vm_flags & VM_SOFTDIRTY); + + if ((walk->vma->vm_end < addr) || (p->max_pages && p->found_pages == p->max_pages)) + return 0; + + end = min(end, walk->vma->vm_end); + + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { + if (dirty_vma || check_soft_dirty_pmd(vma, addr, pmd, false)) { + /* + * Break huge page into small pages if operation needs to be performed is + * on a portion of the huge page or the return buffer cannot store complete + * data. + */ + if ((IS_CLEAR_OP(p->flags) && (end - addr < HPAGE_SIZE)) || + (IS_GET_OP(p->vec) && p->max_pages && + (p->found_pages + HPAGE_SIZE/PAGE_SIZE > p->max_pages))) { + spin_unlock(ptl); + split_huge_pmd(vma, pmd, addr); + goto process_smaller_pages; + } else { + dirty = check_soft_dirty_pmd(vma, addr, pmd, IS_CLEAR_OP(p->flags)); + if (IS_GET_OP(p->vec)) + add_to_out(dirty_vma || dirty, vma->vm_file, + pmd_present(*pmd), is_swap_pmd(*pmd), p, + addr, (end - addr)/PAGE_SIZE); + } + } + spin_unlock(ptl); + return 0; + } + +process_smaller_pages: + if (pmd_trans_unstable(pmd)) + return 0; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr < end && !ret; pte++, addr += PAGE_SIZE) { + dirty = check_soft_dirty(vma, addr, pte, IS_CLEAR_OP(p->flags)); + if (IS_GET_OP(p->vec)) { + ret = add_to_out(dirty_vma || dirty, vma->vm_file, pte_present(*pte), + is_swap_pte(*pte), p, addr, 1); + if (p->max_pages && (p->found_pages == p->max_pages)) + break; + } + } + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + + return 0; +} + +static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, int depth, + struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned int len; + bool sd; + + if (vma) { + /* Individual pages haven't been allocated and written */ + sd = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ? (false) : + (vma->vm_flags & VM_SOFTDIRTY); + + len = (end - addr)/PAGE_SIZE; + if (p->max_pages && p->max_pages - p->found_pages < len) + len = p->max_pages - p->found_pages; + + add_to_out(sd, vma->vm_file, false, false, p, addr, len); + } + + return 0; +} + +static int pagemap_scan_pre_vma(unsigned long start, unsigned long end, struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long end_cut = end; + int ret; + + if (!(p->flags & PAGEMAP_NO_REUSED_REGIONS) && IS_CLEAR_OP(p->flags) && + (vma->vm_flags & VM_SOFTDIRTY)) { + if (vma->vm_start < start) { + ret = split_vma(vma->vm_mm, vma, start, 1); + if (ret) + return ret; + } + /* Calculate end_cut because of max_pages */ + if (IS_GET_OP(p->vec) && p->max_pages) + end_cut = min(start + (p->max_pages - p->found_pages) * PAGE_SIZE, end); + + if (vma->vm_end > end_cut) { + ret = split_vma(vma->vm_mm, vma, end_cut, 0); + if (ret) + return ret; + } + } + + return 0; +} + +static void pagemap_scan_post_vma(struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + + if (!(p->flags & PAGEMAP_NO_REUSED_REGIONS) && IS_CLEAR_OP(p->flags) && + (vma->vm_flags & VM_SOFTDIRTY)) { + vma->vm_flags &= ~VM_SOFTDIRTY; + vma_set_page_prot(vma); + } +} + +static int pagemap_scan_pmd_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + + if (IS_GET_OP(p->vec) && p->max_pages && (p->found_pages == p->max_pages)) + return -1; + + if (vma->vm_flags & VM_PFNMAP) + return 1; + + return 0; +} + +static const struct mm_walk_ops pagemap_scan_ops = { + .test_walk = pagemap_scan_pmd_test_walk, + .pmd_entry = pagemap_scan_pmd_entry, + .pte_hole = pagemap_scan_pte_hole, + + /* Only for clearing SD bit over VMAs */ + .pre_vma = pagemap_scan_pre_vma, + .post_vma = pagemap_scan_post_vma, +}; + +static long do_pagemap_sd_cmd(struct mm_struct *mm, struct pagemap_scan_arg *arg) +{ + struct mmu_notifier_range range; + unsigned long __user start, end; + struct pagemap_scan_private p; + int ret; + + start = (unsigned long)untagged_addr(arg->start); + if ((!IS_ALIGNED(start, PAGE_SIZE)) || (!access_ok((void __user *)start, arg->len))) + return -EINVAL; + + if (IS_GET_OP(arg->vec) && + ((arg->vec_len == 0) || (!access_ok((struct page_region *)arg->vec, arg->vec_len)))) + return -ENOMEM; + + if ((arg->flags & ~PAGEMAP_SD_FLAGS_MASK) || (arg->rmask & ~PAGEMAP_OP_MASK) || + (arg->amask & ~PAGEMAP_OP_MASK) || (arg->emask & ~PAGEMAP_OP_MASK) || + (arg->return_mask & ~PAGEMAP_OP_MASK)) + return -EINVAL; + + if ((!arg->rmask && !arg->amask && !arg->emask) || !arg->return_mask) + return -EINVAL; + + if ((arg->flags & PAGEMAP_SD_FLAGS_MASK) && ((arg->rmask & PAGEMAP_NON_SD_MASK) || + (arg->amask & PAGEMAP_NON_SD_MASK))) + return -EINVAL; + + end = start + arg->len; + p.max_pages = arg->max_pages; + p.found_pages = 0; + p.flags = arg->flags; + p.rmask = arg->rmask; + p.amask = arg->amask; + p.emask = arg->emask; + p.return_mask = arg->return_mask; + p.index = 0; + p.vec_len = arg->vec_len; + + if (IS_GET_OP(arg->vec)) { + p.vec = vzalloc(arg->vec_len * sizeof(struct page_region)); + if (!p.vec) + return -ENOMEM; + } else { + p.vec = NULL; + } + + if (IS_CLEAR_OP(arg->flags)) { + mmap_write_lock(mm); + + mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, 0, NULL, mm, start, end); + mmu_notifier_invalidate_range_start(&range); + inc_tlb_flush_pending(mm); + } else { + mmap_read_lock(mm); + } + + ret = walk_page_range(mm, start, end, &pagemap_scan_ops, &p); + + if (IS_CLEAR_OP(arg->flags)) { + mmu_notifier_invalidate_range_end(&range); + dec_tlb_flush_pending(mm); + + mmap_write_unlock(mm); + } else { + mmap_read_unlock(mm); + } + + if (ret < 0) + goto free_data; + + if (IS_GET_OP(arg->vec) && p.index) { + if (copy_to_user((struct page_region *)arg->vec, p.vec, + p.index * sizeof(struct page_region))) { + ret = -EFAULT; + goto free_data; + } + ret = p.index; + } else { + ret = 0; + } + +free_data: + if (IS_GET_OP(arg->vec)) + vfree(p.vec); + + return ret; +} + +static long pagemap_sd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct pagemap_scan_arg __user *uarg = (struct pagemap_scan_arg __user *)arg; + struct mm_struct *mm = file->private_data; + struct pagemap_scan_arg argument; + + if (cmd == PAGEMAP_SCAN) { + if (copy_from_user(&argument, uarg, sizeof(struct pagemap_scan_arg))) + return -EFAULT; + return do_pagemap_sd_cmd(mm, &argument); + } + return -EINVAL; +} +#endif /* CONFIG_MEM_SOFT_DIRTY */ + const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, .open = pagemap_open, .release = pagemap_release, +#ifdef CONFIG_MEM_SOFT_DIRTY + .unlocked_ioctl = pagemap_sd_ioctl, + .compat_ioctl = pagemap_sd_ioctl, +#endif /* CONFIG_MEM_SOFT_DIRTY */ }; #endif /* CONFIG_PROC_PAGE_MONITOR */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index b7b56871029c..5d6c0d85dac4 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -305,4 +305,57 @@ typedef int __bitwise __kernel_rwf_t; #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ RWF_APPEND) +/* PAGEMAP IOCTL */ +#define PAGEMAP_SCAN _IOWR('f', 16, struct pagemap_scan_arg) + +/* Bits are set in the bitmap of the page_region and masks in pagemap_sd_args */ +#define PAGE_IS_SD (1 << 0) +#define PAGE_IS_FILE (1 << 1) +#define PAGE_IS_PRESENT (1 << 2) +#define PAGE_IS_SWAPED (1 << 3) + +/* + * struct page_region - Page region with bitmap flags + * @start: Start of the region + * @len: Length of the region + * bitmap: Bits sets for the region + */ +struct page_region { + __u64 start; + __u64 len; + __u32 bitmap; + __u32 __reserved; +}; + +/* + * struct pagemap_scan_arg - Soft-dirty IOCTL argument + * @start: Starting address of the region + * @len: Length of the region (All the pages in this length are included) + * @vec: Address of page_region struct array for output + * @vec_len: Length of the page_region struct array + * @max_pages: Optional max return pages (It must be less than vec_len if specified) + * @flags: Special flags for the IOCTL + * @rmask: Required mask - All of these bits have to be set in the PTE + * @amask: Any mask - Any of these bits are set in the PTE + * @emask: Exclude mask - None of these bits are set in the PTE + * @return_mask: Bits that have to be reported to the user in page_region + */ +struct pagemap_scan_arg { + __u64 start; + __u64 len; + __u64 vec; + __u64 vec_len; + __u32 max_pages; + __u32 flags; + __u32 rmask; + __u32 amask; + __u32 emask; + __u32 return_mask; +}; + +/* Special flags */ +#define PAGEMAP_SD_CLEAR (1 << 0) +/* Check the individual pages if they are soft-dirty to find dirty pages faster. */ +#define PAGEMAP_NO_REUSED_REGIONS (1 << 1) + #endif /* _UAPI_LINUX_FS_H */ diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h index b7b56871029c..5d6c0d85dac4 100644 --- a/tools/include/uapi/linux/fs.h +++ b/tools/include/uapi/linux/fs.h @@ -305,4 +305,57 @@ typedef int __bitwise __kernel_rwf_t; #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ RWF_APPEND) +/* PAGEMAP IOCTL */ +#define PAGEMAP_SCAN _IOWR('f', 16, struct pagemap_scan_arg) + +/* Bits are set in the bitmap of the page_region and masks in pagemap_sd_args */ +#define PAGE_IS_SD (1 << 0) +#define PAGE_IS_FILE (1 << 1) +#define PAGE_IS_PRESENT (1 << 2) +#define PAGE_IS_SWAPED (1 << 3) + +/* + * struct page_region - Page region with bitmap flags + * @start: Start of the region + * @len: Length of the region + * bitmap: Bits sets for the region + */ +struct page_region { + __u64 start; + __u64 len; + __u32 bitmap; + __u32 __reserved; +}; + +/* + * struct pagemap_scan_arg - Soft-dirty IOCTL argument + * @start: Starting address of the region + * @len: Length of the region (All the pages in this length are included) + * @vec: Address of page_region struct array for output + * @vec_len: Length of the page_region struct array + * @max_pages: Optional max return pages (It must be less than vec_len if specified) + * @flags: Special flags for the IOCTL + * @rmask: Required mask - All of these bits have to be set in the PTE + * @amask: Any mask - Any of these bits are set in the PTE + * @emask: Exclude mask - None of these bits are set in the PTE + * @return_mask: Bits that have to be reported to the user in page_region + */ +struct pagemap_scan_arg { + __u64 start; + __u64 len; + __u64 vec; + __u64 vec_len; + __u32 max_pages; + __u32 flags; + __u32 rmask; + __u32 amask; + __u32 emask; + __u32 return_mask; +}; + +/* Special flags */ +#define PAGEMAP_SD_CLEAR (1 << 0) +/* Check the individual pages if they are soft-dirty to find dirty pages faster. */ +#define PAGEMAP_NO_REUSED_REGIONS (1 << 1) + #endif /* _UAPI_LINUX_FS_H */ -- 2.30.2