Currently, there does not appear to be any mechanism for letting
drivers or other kernel entities know about updates made to a mapping,
particularly when a new page is faulted in. Providing notifications
for such situations is useful when using memfds backed by RAM-based
filesystems such as shmem or hugetlbfs that also allow
FALLOC_FL_PUNCH_HOLE. More specifically, when a hole is punched in a
memfd (backed by shmem or hugetlbfs), a driver can already register
for notifications associated with the range invalidation. However, it
would also be useful to be notified when new pages are faulted in as
a result of writes made to the region of the mapping that overlaps
with a previously punched hole.

Cc: David Hildenbrand <david@xxxxxxxxxx>
Cc: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
Cc: Hugh Dickins <hughd@xxxxxxxxxx>
Cc: Peter Xu <peterx@xxxxxxxxxx>
Cc: Jason Gunthorpe <jgg@xxxxxxxxxx>
Cc: Gerd Hoffmann <kraxel@xxxxxxxxxx>
Cc: Dongwon Kim <dongwon.kim@xxxxxxxxx>
Cc: Junxiao Chang <junxiao.chang@xxxxxxxxx>
Signed-off-by: Vivek Kasireddy <vivek.kasireddy@xxxxxxxxx>
---
 include/linux/mmu_notifier.h | 27 +++++++++++++++++++++++++++
 mm/hugetlb.c                 |  9 ++++++++-
 mm/mmu_notifier.c            | 17 +++++++++++++++++
 mm/shmem.c                   |  7 ++++++-
 4 files changed, 58 insertions(+), 2 deletions(-)
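As a note for reviewers (not part of this patch): the rough sketch
below shows how a driver might consume the new notification. The
sample_* names are hypothetical; the sketch assumes only the
update_mapping op added by this patch plus the existing
mmu_notifier_register() API.

#include <linux/mmu_notifier.h>

/* Hypothetical consumer of the new update_mapping notification. */
static void sample_update_mapping(struct mmu_notifier *subscription,
				  struct mm_struct *mm,
				  unsigned long address,
				  unsigned long pfn)
{
	/*
	 * Refresh whatever references/mappings the driver holds for
	 * the page that was previously at @address; the new page's
	 * pfn is passed in @pfn.
	 */
}

static const struct mmu_notifier_ops sample_mn_ops = {
	.update_mapping = sample_update_mapping,
};

static struct mmu_notifier sample_mn = {
	.ops = &sample_mn_ops,
};

/*
 * Subscribe to notifications for @mm; mmu_notifier_register()
 * takes the mmap_lock internally, so no locking is needed here.
 */
static int sample_subscribe(struct mm_struct *mm)
{
	return mmu_notifier_register(&sample_mn, mm);
}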
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 64a3e051c3c4..218ddc3b4bc7 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -131,6 +131,16 @@ struct mmu_notifier_ops {
 			   unsigned long address,
 			   pte_t pte);
 
+	/*
+	 * update_mapping is called when a page is replaced (at a given offset)
+	 * in a mapping backed by shmem or hugetlbfs. The new page's pfn is
+	 * passed via the pfn parameter.
+	 */
+	void (*update_mapping)(struct mmu_notifier *subscription,
+			       struct mm_struct *mm,
+			       unsigned long address,
+			       unsigned long pfn);
+
 	/*
 	 * invalidate_range_start() and invalidate_range_end() must be
 	 * paired and are called only when the mmap_lock and/or the
@@ -394,6 +404,9 @@ extern int __mmu_notifier_test_young(struct mm_struct *mm,
 				     unsigned long address);
 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
 				      unsigned long address, pte_t pte);
+extern void __mmu_notifier_update_mapping(struct mm_struct *mm,
+					  unsigned long address,
+					  unsigned long pfn);
 extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r);
 extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r,
 						bool only_end);
@@ -447,6 +460,14 @@ static inline void mmu_notifier_change_pte(struct mm_struct *mm,
 		__mmu_notifier_change_pte(mm, address, pte);
 }
 
+static inline void mmu_notifier_update_mapping(struct mm_struct *mm,
+					       unsigned long address,
+					       unsigned long pfn)
+{
+	if (mm_has_notifiers(mm))
+		__mmu_notifier_update_mapping(mm, address, pfn);
+}
+
 static inline void
 mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
 {
@@ -695,6 +716,12 @@ static inline void mmu_notifier_change_pte(struct mm_struct *mm,
 {
 }
 
+static inline void mmu_notifier_update_mapping(struct mm_struct *mm,
+					       unsigned long address,
+					       unsigned long pfn)
+{
+}
+
 static inline void
 mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
 {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 64a3239b6407..1f2f0209101a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6096,8 +6096,12 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * hugetlb_no_page will drop vma lock and hugetlb fault
 		 * mutex internally, which make us return immediately.
 		 */
-		return hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
+		ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
 				      entry, flags);
+		if (!ret)
+			mmu_notifier_update_mapping(vma->vm_mm, address,
+						    pte_pfn(*ptep));
+		return ret;
 	}
 
 	ret = 0;
@@ -6223,6 +6227,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 */
 	if (need_wait_lock)
 		folio_wait_locked(folio);
+	if (!ret)
+		mmu_notifier_update_mapping(vma->vm_mm, address,
+					    pte_pfn(*ptep));
 	return ret;
 }
 
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 50c0dde1354f..6421405334b9 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -441,6 +441,23 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
 	srcu_read_unlock(&srcu, id);
 }
 
+void __mmu_notifier_update_mapping(struct mm_struct *mm, unsigned long address,
+				   unsigned long pfn)
+{
+	struct mmu_notifier *subscription;
+	int id;
+
+	id = srcu_read_lock(&srcu);
+	hlist_for_each_entry_rcu(subscription,
+				 &mm->notifier_subscriptions->list, hlist,
+				 srcu_read_lock_held(&srcu)) {
+		if (subscription->ops->update_mapping)
+			subscription->ops->update_mapping(subscription, mm,
+							  address, pfn);
+	}
+	srcu_read_unlock(&srcu, id);
+}
+
 static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions,
 			       const struct mmu_notifier_range *range)
 {
diff --git a/mm/shmem.c b/mm/shmem.c
index 2f2e0e618072..e59eb5fafadb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -77,6 +77,7 @@ static struct vfsmount *shm_mnt;
 #include <linux/fcntl.h>
 #include <uapi/linux/memfd.h>
 #include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
 #include <linux/uuid.h>
 
 #include <linux/uaccess.h>
@@ -2164,8 +2165,12 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
 				  gfp, vma, vmf, &ret);
 	if (err)
 		return vmf_error(err);
-	if (folio)
+	if (folio) {
 		vmf->page = folio_file_page(folio, vmf->pgoff);
+		if (ret == VM_FAULT_LOCKED)
+			mmu_notifier_update_mapping(vma->vm_mm, vmf->address,
+						    page_to_pfn(vmf->page));
+	}
 	return ret;
 }
 
-- 
2.39.2