On Tue, Sep 03, 2024 at 04:22:39PM -0700, Anthony Yznaga wrote:
> From: Khalid Aziz <khalid@xxxxxxxxxx>
>
> Add support for handling page faults in an mshare region by
> redirecting the faults to operate on the mshare mm_struct and
> vmas contained in it and to link the page tables of the faulting
> process with the shared page tables in the mshare mm.
> Modify the unmap path to ensure that page tables in mshare regions
> are kept intact when a process exits. Note that they are also
> kept intact and not unlinked from a process when an mshare region
> is explicitly unmapped, which is a bug to be addressed.
>
> Signed-off-by: Khalid Aziz <khalid@xxxxxxxxxx>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@xxxxxxxxxxxxx>
> Signed-off-by: Anthony Yznaga <anthony.yznaga@xxxxxxxxxx>
> ---
>  mm/internal.h |  1 +
>  mm/memory.c   | 62 ++++++++++++++++++++++++++++++++++++++++++++++++---
>  mm/mshare.c   | 38 +++++++++++++++++++++++++++++++
>  3 files changed, 98 insertions(+), 3 deletions(-)
>
> diff --git a/mm/internal.h b/mm/internal.h
> index 8005d5956b6e..8ac224d96806 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -1578,6 +1578,7 @@ void unlink_file_vma_batch_init(struct unlink_vma_file_batch *);
>  void unlink_file_vma_batch_add(struct unlink_vma_file_batch *, struct vm_area_struct *);
>  void unlink_file_vma_batch_final(struct unlink_vma_file_batch *);
>
> +extern vm_fault_t find_shared_vma(struct vm_area_struct **vma, unsigned long *addrp);
>  static inline bool vma_is_shared(const struct vm_area_struct *vma)
>  {
>  	return VM_SHARED_PT && (vma->vm_flags & VM_SHARED_PT);
> diff --git a/mm/memory.c b/mm/memory.c
> index 3c01d68065be..f526aef71a61 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -387,11 +387,15 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
>  		vma_start_write(vma);
>  		unlink_anon_vmas(vma);
>
> +		/*
> +		 * There is no page table to be freed for vmas that
> +		 * are mapped in mshare regions
> +		 */
>  		if (is_vm_hugetlb_page(vma)) {
>  			unlink_file_vma(vma);
>  			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
>  				floor, next ? next->vm_start : ceiling);
> -		} else {
> +		} else if (!vma_is_shared(vma)) {
>  			unlink_file_vma_batch_init(&vb);
>  			unlink_file_vma_batch_add(&vb, vma);
>
> @@ -399,7 +403,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
>  			 * Optimization: gather nearby vmas into one call down
>  			 */
>  			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
> -			       && !is_vm_hugetlb_page(next)) {
> +			       && !is_vm_hugetlb_page(next)
> +			       && !vma_is_shared(next)) {
>  				vma = next;
>  				next = mas_find(mas, ceiling - 1);
>  				if (unlikely(xa_is_zero(next)))
> @@ -412,7 +417,9 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
>  			unlink_file_vma_batch_final(&vb);
>  			free_pgd_range(tlb, addr, vma->vm_end,
>  				floor, next ? next->vm_start : ceiling);
> -		}
> +		} else
> +			unlink_file_vma(vma);
> +
>  		vma = next;

I would rather have a vma->vm_ops->free_pgtables() hook that would be
non-NULL for mshare and hugetlb VMAs.

>  	} while (vma);
>  }
> @@ -1797,6 +1804,13 @@ void unmap_page_range(struct mmu_gather *tlb,
>  	pgd_t *pgd;
>  	unsigned long next;
>
> +	/*
> +	 * No need to unmap vmas that share page table through
> +	 * mshare region
> +	 */
> +	if (vma_is_shared(vma))
> +		return;
> +

Ditto. vma->vm_ops->unmap_page_range().
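Roughly what I have in mind, as an untested sketch (the hook names and
signatures here are illustrative only, not a proposal of the exact
interface):

	struct vm_operations_struct {
		...
		/* non-NULL for VMAs that manage their own page tables */
		void (*free_pgtables)(struct mmu_gather *tlb,
				      struct vm_area_struct *vma,
				      unsigned long floor,
				      unsigned long ceiling);
		void (*unmap_page_range)(struct mmu_gather *tlb,
					 struct vm_area_struct *vma,
					 unsigned long addr, unsigned long end,
					 struct zap_details *details);
	};

free_pgtables() would then stay free of mshare/hugetlb special cases,
along the lines of:

		if (vma->vm_ops && vma->vm_ops->free_pgtables)
			vma->vm_ops->free_pgtables(tlb, vma, floor,
					next ? next->vm_start : ceiling);
		else
			free_pgd_range(tlb, addr, vma->vm_end, floor,
					next ? next->vm_start : ceiling);

and the mshare implementation could simply do nothing, since the shared
page tables belong to the host mm:

	static void mshare_free_pgtables(struct mmu_gather *tlb,
					 struct vm_area_struct *vma,
					 unsigned long floor,
					 unsigned long ceiling)
	{
		/* page tables are owned by the mshare mm, keep them */
	}

hugetlb would provide a hook wrapping hugetlb_free_pgd_range().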
>  	BUG_ON(addr >= end);
>  	tlb_start_vma(tlb, vma);
>  	pgd = pgd_offset(vma->vm_mm, addr);
> @@ -5801,6 +5815,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
>  	struct mm_struct *mm = vma->vm_mm;
>  	vm_fault_t ret;
>  	bool is_droppable;
> +	bool shared = false;
>
>  	__set_current_state(TASK_RUNNING);
>
> @@ -5808,6 +5823,21 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
>  	if (ret)
>  		goto out;
>
> +	if (unlikely(vma_is_shared(vma))) {
> +		/* XXX mshare does not support per-VMA locks yet so fallback to mm lock */
> +		if (flags & FAULT_FLAG_VMA_LOCK) {
> +			vma_end_read(vma);
> +			return VM_FAULT_RETRY;
> +		}
> +
> +		ret = find_shared_vma(&vma, &address);
> +		if (ret)
> +			return ret;
> +		if (!vma)
> +			return VM_FAULT_SIGSEGV;
> +		shared = true;

Do we need to update the 'mm' variable here? It is going to be used to
account the fault below. Not sure which mm has to account such faults.

> +	}
> +
>  	if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
>  					    flags & FAULT_FLAG_INSTRUCTION,
>  					    flags & FAULT_FLAG_REMOTE)) {
> @@ -5843,6 +5873,32 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
>  	if (is_droppable)
>  		ret &= ~VM_FAULT_OOM;
>
> +	/*
> +	 * Release the read lock on the shared mm of a shared VMA unless
> +	 * the lock has already been released.
> +	 * The mmap lock will already have been released if __handle_mm_fault
> +	 * returns VM_FAULT_COMPLETED or if it returns VM_FAULT_RETRY and
> +	 * the flags FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT are
> +	 * _not_ both set.
> +	 * If the lock was released earlier, release the lock on the task's
> +	 * mm now to keep lock state consistent.
> +	 */
> +	if (shared) {
> +		int release_mmlock = 1;
> +
> +		if ((ret & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)) == 0) {
> +			mmap_read_unlock(vma->vm_mm);
> +			release_mmlock = 0;
> +		} else if ((flags & FAULT_FLAG_ALLOW_RETRY) &&
> +			   (flags & FAULT_FLAG_RETRY_NOWAIT)) {
> +			mmap_read_unlock(vma->vm_mm);
> +			release_mmlock = 0;
> +		}
> +
> +		if (release_mmlock)
> +			mmap_read_unlock(mm);
> +	}
> +
>  	if (flags & FAULT_FLAG_USER) {
>  		mem_cgroup_exit_user_fault();
>  		/*
> diff --git a/mm/mshare.c b/mm/mshare.c
> index f3f6ed9c3761..8f47c8d6e6a4 100644
> --- a/mm/mshare.c
> +++ b/mm/mshare.c
> @@ -19,6 +19,7 @@
>  #include <linux/spinlock_types.h>
>  #include <uapi/linux/magic.h>
>  #include <uapi/linux/msharefs.h>
> +#include "internal.h"
>
>  struct mshare_data {
>  	struct mm_struct *mm;
> @@ -33,6 +34,43 @@ struct msharefs_info {
>  static const struct inode_operations msharefs_dir_inode_ops;
>  static const struct inode_operations msharefs_file_inode_ops;
>
> +/* Returns holding the host mm's lock for read. Caller must release. */
> +vm_fault_t
> +find_shared_vma(struct vm_area_struct **vmap, unsigned long *addrp)
> +{
> +	struct vm_area_struct *vma, *guest = *vmap;
> +	struct mshare_data *m_data = guest->vm_private_data;
> +	struct mm_struct *host_mm = m_data->mm;
> +	unsigned long host_addr;
> +	pgd_t *pgd, *guest_pgd;
> +
> +	mmap_read_lock(host_mm);

Hm. So we have current->mm locked here, right? So this is a nested mmap
lock. Have you tested it under lockdep? I expected it to complain.
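For illustration only (untested, and it reaches around the mmap_lock
wrapper API, so a real fix would likely want a dedicated nested-lock
helper or a different locking scheme): if the nesting is intentional,
the inner acquisition would at least need a lockdep nesting annotation,
something like:

	/*
	 * current->mm->mmap_lock is already held for read, and
	 * host_mm->mmap_lock is the same lock class, so a plain
	 * mmap_read_lock(host_mm) should trip lockdep's "possible
	 * recursive locking" check.  An explicit annotation would be:
	 */
	down_read_nested(&host_mm->mmap_lock, SINGLE_DEPTH_NESTING);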
> +	host_addr = *addrp - guest->vm_start + host_mm->mmap_base;
> +	pgd = pgd_offset(host_mm, host_addr);
> +	guest_pgd = pgd_offset(guest->vm_mm, *addrp);
> +	if (!pgd_same(*guest_pgd, *pgd)) {
> +		set_pgd(guest_pgd, *pgd);
> +		mmap_read_unlock(host_mm);
> +		return VM_FAULT_NOPAGE;
> +	}
> +
> +	*addrp = host_addr;
> +	vma = find_vma(host_mm, host_addr);
> +
> +	/* XXX: expand stack? */
> +	if (vma && vma->vm_start > host_addr)
> +		vma = NULL;
> +
> +	*vmap = vma;
> +
> +	/*
> +	 * release host mm lock unless a matching vma is found
> +	 */
> +	if (!vma)
> +		mmap_read_unlock(host_mm);
> +	return 0;
> +}
> +
>  /*
>   * Disallow partial unmaps of an mshare region for now. Unmapping at
>   * boundaries aligned to the level page tables are shared at could
> --
> 2.43.5
>

-- 
  Kiryl Shutsemau / Kirill A. Shutemov