Hi Muhammad,

Here are a few inline comments.

On Wed, Nov 09, 2022 at 03:23:02PM +0500, Muhammad Usama Anjum wrote:
> This IOCTL, PAGEMAP_SCAN can be used to get and/or clear the info about
> page table entries. The following operations are supported in this ioctl:
> - Get the information if the pages are soft-dirty, file mapped, present
>   or swapped.
> - Clear the soft-dirty PTE bit of the pages.
> - Get and clear the soft-dirty PTE bit of the pages.
>
> Only the soft-dirty bit can be read and cleared atomically. struct
> pagemap_sd_args is used as the argument of the IOCTL. In this struct:
> - The range is specified through start and len.
> - The output buffer and size is specified as vec and vec_len.
> - The optional maximum requested pages are specified in the max_pages.
> - The flags can be specified in the flags field. The PAGEMAP_SD_CLEAR
>   and PAGEMAP_SD_NO_REUSED_REGIONS are supported.
> - The masks are specified in rmask, amask, emask and return_mask.
>
> This IOCTL can be extended to get information about more PTE bits.
>
> This is based on a patch from Gabriel Krisman Bertazi.
>
> Signed-off-by: Muhammad Usama Anjum <usama.anjum@xxxxxxxxxxxxx>
> ---
> Changes in v6:
> - Rename variables and update comments
> - Make IOCTL independent of soft_dirty config
> - Change masks and bitmap type to _u64
> - Improve code quality
>
> Changes in v5:
> - Remove tlb flushing even for clear operation
>
> Changes in v4:
> - Update the interface and implementation
>
> Changes in v3:
> - Tighten the user-kernel interface by using explicit types and add more
>   error checking
>
> Changes in v2:
> - Convert the interface from syscall to ioctl
> - Remove pidfd support as it doesn't make sense in ioctl
> ---
>  fs/proc/task_mmu.c            | 328 ++++++++++++++++++++++++++++++++++
>  include/uapi/linux/fs.h       |  56 ++++++
>  tools/include/uapi/linux/fs.h |  56 ++++++
>  3 files changed, 440 insertions(+)
>
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 8235c536ac70..8d6a84ec5ef7 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -19,6 +19,9 @@
>  #include <linux/shmem_fs.h>
>  #include <linux/uaccess.h>
>  #include <linux/pkeys.h>
> +#include <uapi/linux/fs.h>
> +#include <linux/vmalloc.h>
> +#include <linux/minmax.h>
>
>  #include <asm/elf.h>
>  #include <asm/tlb.h>
> @@ -1775,11 +1778,336 @@ static int pagemap_release(struct inode *inode, struct file *file)
>  	return 0;
>  }
>
> +#define PAGEMAP_OP_MASK		(PAGE_IS_SOFTDIRTY | PAGE_IS_FILE |	\
> +				 PAGE_IS_PRESENT | PAGE_IS_SWAPPED)
> +#define PAGEMAP_NONSD_OP_MASK	(PAGE_IS_FILE | PAGE_IS_PRESENT | PAGE_IS_SWAPPED)
> +#define PAGEMAP_SD_FLAGS	(PAGEMAP_SOFTDIRTY_CLEAR | PAGEMAP_NO_REUSED_REGIONS)
> +#define IS_CLEAR_OP(a)		(a->flags & PAGEMAP_SOFTDIRTY_CLEAR)
> +#define IS_GET_OP(a)		(a->vec)
> +#define IS_SD_OP(a)		(a->flags & PAGEMAP_SD_FLAGS)
> +
> +struct pagemap_scan_private {
> +	struct page_region *vec;
> +	unsigned long vec_len;
> +	unsigned long vec_index;
> +	unsigned int max_pages;
> +	unsigned int found_pages;
> +	unsigned int flags;
> +	unsigned long required_mask;
> +	unsigned long anyof_mask;
> +	unsigned long excluded_mask;
> +	unsigned long return_mask;
> +};
> +
> +static int pagemap_scan_pmd_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk)
> +{
> +	struct pagemap_scan_private *p = walk->private;
> +	struct vm_area_struct *vma = walk->vma;
> +
> +	if (IS_GET_OP(p) && p->max_pages && (p->found_pages == p->max_pages))
> +		return -1;
> +
> +	if (vma->vm_flags & VM_PFNMAP)
> +		return 1;
> +
> +	return 0;
> +}
> +
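Before going further into the code, one high-level question to make sure I
read the interface right: user-space is expected to drive this roughly as
below, correct? (Untested sketch; I'm using struct pagemap_scan_arg,
struct page_region and the PAGE_IS_* flags as they appear on the kernel
side of this patch, and assuming the PAGEMAP_SCAN ioctl is issued on the
/proc/<pid>/pagemap fd.)

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/fs.h>

int main(void)
{
	size_t len = 16 * 4096;
	char *mem = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct page_region vec[16];
	struct pagemap_scan_arg arg = {
		.start = (uintptr_t)mem,	/* page-aligned */
		.len = len,
		.vec = (uintptr_t)vec,
		.vec_len = 16,
		.max_pages = 0,			/* no limit */
		.flags = 0,			/* get only, no clearing */
		.required_mask = PAGE_IS_SOFTDIRTY,
		.return_mask = PAGE_IS_SOFTDIRTY,
	};
	int fd = open("/proc/self/pagemap", O_RDONLY);

	mem[0] = 1;	/* make the first page present and soft-dirty */
	if (ioctl(fd, PAGEMAP_SCAN, &arg) < 0)
		perror("PAGEMAP_SCAN");
	return 0;
}

If I'm misreading the intended usage, some of the comments below may be
off-base.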
> +static int add_to_out(bool sd, bool file, bool pres, bool swap, struct pagemap_scan_private *p,
> +		      unsigned long addr, unsigned int len)
> +{
> +	unsigned long bitmap, cur = sd | file << 1 | pres << 2 | swap << 3;

Should we define constants for each of these bits?

> +	bool cpy = true;
> +
> +	if (p->required_mask)
> +		cpy = ((p->required_mask & cur) == p->required_mask);
> +	if (cpy && p->anyof_mask)
> +		cpy = (p->anyof_mask & cur);
> +	if (cpy && p->excluded_mask)
> +		cpy = !(p->excluded_mask & cur);
> +
> +	bitmap = cur & p->return_mask;
> +
> +	if (cpy && bitmap) {
> +		if ((p->vec_index) && (p->vec[p->vec_index - 1].bitmap == bitmap) &&
> +		    (p->vec[p->vec_index - 1].start + p->vec[p->vec_index - 1].len * PAGE_SIZE ==
> +		     addr)) {

I think it is better to define a variable for p->vec_index - 1.

nit: len can be in bytes rather than pages.

> +			p->vec[p->vec_index - 1].len += len;
> +			p->found_pages += len;
> +		} else if (p->vec_index < p->vec_len) {
> +			p->vec[p->vec_index].start = addr;
> +			p->vec[p->vec_index].len = len;
> +			p->found_pages += len;
> +			p->vec[p->vec_index].bitmap = bitmap;
> +			p->vec_index++;
> +		} else {
> +			return -ENOMEM;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long addr,
> +				  unsigned long end, struct mm_walk *walk)
> +{
> +	struct pagemap_scan_private *p = walk->private;
> +	struct vm_area_struct *vma = walk->vma;
> +	unsigned int len;
> +	spinlock_t *ptl;
> +	int ret = 0;
> +	pte_t *pte;
> +	bool dirty_vma = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ?
> +			 (false) : (vma->vm_flags & VM_SOFTDIRTY);
> +
> +	if ((walk->vma->vm_end < addr) || (p->max_pages && p->found_pages == p->max_pages))
> +		return 0;
> +
> +	end = min(end, walk->vma->vm_end);
> +
> +	ptl = pmd_trans_huge_lock(pmd, vma);
> +	if (ptl) {
> +		if (dirty_vma || check_soft_dirty_pmd(vma, addr, pmd, false)) {
> +			/*
> +			 * Break huge page into small pages if operation needs to be performed is
> +			 * on a portion of the huge page or the return buffer cannot store complete
> +			 * data.
> +			 */
> +			if ((IS_CLEAR_OP(p) && (end - addr < HPAGE_SIZE))) {
> +				spin_unlock(ptl);
> +				split_huge_pmd(vma, pmd, addr);
> +				goto process_smaller_pages;
> +			}
> +
> +			if (IS_GET_OP(p)) {
> +				len = (end - addr)/PAGE_SIZE;
> +				if (p->max_pages && p->found_pages + len > p->max_pages)
> +					len = p->max_pages - p->found_pages;
> +
> +				ret = add_to_out(dirty_vma ||
> +						 check_soft_dirty_pmd(vma, addr, pmd, false),

Can we reuse the return code of the previous call to check_soft_dirty_pmd()?

> +						 vma->vm_file, pmd_present(*pmd), is_swap_pmd(*pmd),
> +						 p, addr, len);
> +			}
> +			if (!ret && IS_CLEAR_OP(p))
> +				check_soft_dirty_pmd(vma, addr, pmd, true);

Should we return an error in this case? We need to be sure that:
* we stop walking page tables after this point;
* the error is returned to user-space if we were not able to add anything
  to the vector.
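Something along these lines is what I have in mind (untested sketch, only
the pmd_entry side shown):

			if (ret) {
				spin_unlock(ptl);
				return ret;	/* abort the walk on a full vector */
			}
			if (IS_CLEAR_OP(p))
				check_soft_dirty_pmd(vma, addr, pmd, true);

walk_page_range() stops as soon as a callback returns a negative value, so
do_pagemap_sd_cmd() could then report the error to user-space only when
p.vec_index == 0, i.e. when nothing at all could be added to the vector.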
> +		}
> +		spin_unlock(ptl);
> +		return 0;
> +	}
> +
> +process_smaller_pages:
> +	if (pmd_trans_unstable(pmd))
> +		return 0;
> +
> +	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> +	for (; addr < end && !ret && (!p->max_pages || (p->found_pages < p->max_pages))
> +	     ; pte++, addr += PAGE_SIZE) {
> +		if (IS_GET_OP(p))
> +			ret = add_to_out(dirty_vma || check_soft_dirty(vma, addr, pte, false),
> +					 vma->vm_file, pte_present(*pte),
> +					 is_swap_pte(*pte), p, addr, 1);
> +		if (!ret && IS_CLEAR_OP(p))
> +			check_soft_dirty(vma, addr, pte, true);
> +	}
> +	pte_unmap_unlock(pte - 1, ptl);
> +	cond_resched();
> +
> +	return 0;
> +}
> +
> +static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, int depth,
> +				 struct mm_walk *walk)
> +{
> +	struct pagemap_scan_private *p = walk->private;
> +	struct vm_area_struct *vma = walk->vma;
> +	unsigned int len;
> +	bool sd;
> +
> +	if (vma) {
> +		/* Individual pages haven't been allocated and written */
> +		sd = (p->flags & PAGEMAP_NO_REUSED_REGIONS) ? (false) :
> +		     (vma->vm_flags & VM_SOFTDIRTY);
> +
> +		len = (end - addr)/PAGE_SIZE;
> +		if (p->max_pages && p->found_pages + len > p->max_pages)
> +			len = p->max_pages - p->found_pages;
> +
> +		add_to_out(sd, vma->vm_file, false, false, p, addr, len);
> +	}
> +
> +	return 0;
> +}
> +
> +#ifdef CONFIG_MEM_SOFT_DIRTY
> +static int pagemap_scan_pre_vma(unsigned long start, unsigned long end, struct mm_walk *walk)
> +{
> +	struct pagemap_scan_private *p = walk->private;
> +	struct vm_area_struct *vma = walk->vma;
> +	unsigned long end_cut = end;
> +	int ret;
> +
> +	if (!(p->flags & PAGEMAP_NO_REUSED_REGIONS) && IS_CLEAR_OP(p) &&
> +	    (vma->vm_flags & VM_SOFTDIRTY)) {
> +		if (vma->vm_start < start) {
> +			ret = split_vma(vma->vm_mm, vma, start, 1);
> +			if (ret)
> +				return ret;
> +		}
> +		/* Calculate end_cut because of max_pages */
> +		if (IS_GET_OP(p) && p->max_pages)
> +			end_cut = min(start + (p->max_pages - p->found_pages) * PAGE_SIZE, end);
> +
> +		if (vma->vm_end > end_cut) {
> +			ret = split_vma(vma->vm_mm, vma, end_cut, 0);
> +			if (ret)
> +				return ret;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static void pagemap_scan_post_vma(struct mm_walk *walk)
> +{
> +	struct pagemap_scan_private *p = walk->private;
> +	struct vm_area_struct *vma = walk->vma;
> +
> +	if (!(p->flags & PAGEMAP_NO_REUSED_REGIONS) && IS_CLEAR_OP(p) &&
> +	    (vma->vm_flags & VM_SOFTDIRTY)) {
> +		vma->vm_flags &= ~VM_SOFTDIRTY;
> +		vma_set_page_prot(vma);
> +	}
> +}
> +#endif /* CONFIG_MEM_SOFT_DIRTY */
> +
> +static const struct mm_walk_ops pagemap_scan_ops = {
> +	.test_walk = pagemap_scan_pmd_test_walk,
> +	.pmd_entry = pagemap_scan_pmd_entry,
> +	.pte_hole = pagemap_scan_pte_hole,
> +
> +#ifdef CONFIG_MEM_SOFT_DIRTY
> +	/* Only for clearing SD bit over VMAs */
> +	.pre_vma = pagemap_scan_pre_vma,
> +	.post_vma = pagemap_scan_post_vma,
> +#endif /* CONFIG_MEM_SOFT_DIRTY */
> +};
> +
> +static long do_pagemap_sd_cmd(struct mm_struct *mm, struct pagemap_scan_arg *arg)
> +{
> +	struct mmu_notifier_range range;
> +	unsigned long __user start, end;
> +	struct pagemap_scan_private p;
> +	int ret;
> +
> +	start = (unsigned long)untagged_addr(arg->start);
> +	if ((!IS_ALIGNED(start, PAGE_SIZE)) || (!access_ok((void __user *)start, arg->len)))
> +		return -EINVAL;
> +
> +	if (IS_GET_OP(arg) &&
> +	    ((arg->vec_len == 0) || (!access_ok((struct page_region *)arg->vec, arg->vec_len))))
> +		return -ENOMEM;
> +
> +#ifndef CONFIG_MEM_SOFT_DIRTY
> +	if (IS_SD_OP(arg) || (arg->required_mask & PAGE_IS_SOFTDIRTY) ||
> +	    (arg->anyof_mask & PAGE_IS_SOFTDIRTY))
> +		return -EINVAL;
> +#endif
> +
> +	if ((arg->flags & ~PAGEMAP_SD_FLAGS) || (arg->required_mask & ~PAGEMAP_OP_MASK) ||
> +	    (arg->anyof_mask & ~PAGEMAP_OP_MASK) || (arg->excluded_mask & ~PAGEMAP_OP_MASK) ||
> +	    (arg->return_mask & ~PAGEMAP_OP_MASK))
> +		return -EINVAL;
> +
> +	if ((!arg->required_mask && !arg->anyof_mask && !arg->excluded_mask) || !arg->return_mask)
> +		return -EINVAL;
> +
> +	if (IS_SD_OP(arg) && ((arg->required_mask & PAGEMAP_NONSD_OP_MASK) ||
> +	     (arg->anyof_mask & PAGEMAP_NONSD_OP_MASK)))
> +		return -EINVAL;
> +
> +	end = start + arg->len;
> +	p.max_pages = arg->max_pages;
> +	p.found_pages = 0;
> +	p.flags = arg->flags;
> +	p.required_mask = arg->required_mask;
> +	p.anyof_mask = arg->anyof_mask;
> +	p.excluded_mask = arg->excluded_mask;
> +	p.return_mask = arg->return_mask;
> +	p.vec_index = 0;
> +	p.vec_len = arg->vec_len;
> +
> +	if (IS_GET_OP(arg)) {
> +		p.vec = vzalloc(arg->vec_len * sizeof(struct page_region));

I think we need to set a reasonable limit on vec_len to avoid large
allocations in the kernel. We could consider using kmalloc or kvmalloc
here.
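Something like this sketch is what I mean (untested; PAGEMAP_SCAN_MAX_VEC_LEN
is just a placeholder for whatever bound you choose, and kvcalloc() is the
zeroing array flavour of kvmalloc(), freed with kvfree()):

	if (IS_GET_OP(arg)) {
		if (arg->vec_len > PAGEMAP_SCAN_MAX_VEC_LEN)
			return -EINVAL;
		p.vec = kvcalloc(arg->vec_len, sizeof(struct page_region),
				 GFP_KERNEL);
		if (!p.vec)
			return -ENOMEM;

Thanks,
Andrei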