Re: [PATCH 10/10] fsnotify: generate pre-content permission event on page fault

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, Jul 25, 2024 at 9:20 PM Josef Bacik <josef@xxxxxxxxxxxxxx> wrote:
>
> FS_PRE_ACCESS or FS_PRE_MODIFY will be generated on page fault depending
> on the faulting method.
>
> This pre-content event is meant to be used by hierarchical storage
> managers that want to fill in the file content on first read access.
>
> Signed-off-by: Josef Bacik <josef@xxxxxxxxxxxxxx>
> ---
>  fs/notify/fsnotify.c             | 13 +++++++++
>  include/linux/fsnotify_backend.h | 14 +++++++++
>  mm/filemap.c                     | 50 ++++++++++++++++++++++++++++----
>  3 files changed, 71 insertions(+), 6 deletions(-)
>
> diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
> index 1ca4a8da7f29..435232d46b4f 100644
> --- a/fs/notify/fsnotify.c
> +++ b/fs/notify/fsnotify.c
> @@ -28,6 +28,19 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
>         fsnotify_clear_marks_by_mount(mnt);
>  }
>
> +bool fsnotify_file_has_content_watches(struct file *file)

nit: has_pre_content_watches...

> +{
> +       struct inode *inode = file_inode(file);
> +       struct super_block *sb = inode->i_sb;
> +       struct mount *mnt = real_mount(file->f_path.mnt);
> +       u32 mask = inode->i_fsnotify_mask;
> +
> +       mask |= mnt->mnt_fsnotify_mask;
> +       mask |= sb->s_fsnotify_mask;
> +
> +       return !!(mask & FSNOTIFY_PRE_CONTENT_EVENTS);

This can use the fsnotify_object_watched() helper, and it will need
the READ_ONCE() that are just being added to avoid data races.

> +}
> +
>  /**
>   * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
>   * @sb: superblock being unmounted.
> diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
> index 36c3d18cc40a..6983fbf096b8 100644
> --- a/include/linux/fsnotify_backend.h
> +++ b/include/linux/fsnotify_backend.h
> @@ -900,6 +900,15 @@ static inline void fsnotify_init_event(struct fsnotify_event *event)
>         INIT_LIST_HEAD(&event->list);
>  }
>
> +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
> +bool fsnotify_file_has_content_watches(struct file *file);
> +#else
> +static inline bool fsnotify_file_has_content_watches(struct file *file)
> +{
> +       return false;
> +}
> +#endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */
> +
>  #else
>
>  static inline int fsnotify(__u32 mask, const void *data, int data_type,
> @@ -938,6 +947,11 @@ static inline u32 fsnotify_get_cookie(void)
>  static inline void fsnotify_unmount_inodes(struct super_block *sb)
>  {}
>
> +static inline bool fsnotify_file_has_content_watches(struct file *file)
> +{
> +       return false;
> +}
> +
>  #endif /* CONFIG_FSNOTIFY */
>
>  #endif /* __KERNEL __ */
> diff --git a/mm/filemap.c b/mm/filemap.c
> index ca8c8d889eef..cc9d7885bbe3 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -46,6 +46,7 @@
>  #include <linux/pipe_fs_i.h>
>  #include <linux/splice.h>
>  #include <linux/rcupdate_wait.h>
> +#include <linux/fsnotify.h>
>  #include <asm/pgalloc.h>
>  #include <asm/tlbflush.h>
>  #include "internal.h"
> @@ -3112,13 +3113,13 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
>   * that.  If we didn't pin a file then we return NULL.  The file that is
>   * returned needs to be fput()'ed when we're done with it.
>   */
> -static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
> +static struct file *do_sync_mmap_readahead(struct vm_fault *vmf,
> +                                          struct file *fpin)
>  {
>         struct file *file = vmf->vma->vm_file;
>         struct file_ra_state *ra = &file->f_ra;
>         struct address_space *mapping = file->f_mapping;
>         DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
> -       struct file *fpin = NULL;
>         unsigned long vm_flags = vmf->vma->vm_flags;
>         unsigned int mmap_miss;
>
> @@ -3182,12 +3183,12 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
>   * was pinned if we have to drop the mmap_lock in order to do IO.
>   */
>  static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
> -                                           struct folio *folio)
> +                                           struct folio *folio,
> +                                           struct file *fpin)
>  {

If I am reading correctly, iomap (i.e. xfs) write shared memory fault
does not reach this code?

Do we care about writable shared memory faults use case for HSM?
It does not sound very relevant to HSM, but we cannot just ignore it..

>         struct file *file = vmf->vma->vm_file;
>         struct file_ra_state *ra = &file->f_ra;
>         DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
> -       struct file *fpin = NULL;
>         unsigned int mmap_miss;
>
>         /* If we don't want any read-ahead, don't bother */
> @@ -3287,6 +3288,35 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
>         if (unlikely(index >= max_idx))
>                 return VM_FAULT_SIGBUS;
>
> +       /*
> +        * If we have pre-content watchers then we need to generate events on
> +        * page fault so that we can populate any data before the fault.
> +        *
> +        * We only do this on the first pass through, otherwise the populating
> +        * application could potentially deadlock on the mmap lock if it tries
> +        * to populate it with mmap.
> +        */
> +       if (fault_flag_allow_retry_first(vmf->flags) &&
> +           fsnotify_file_has_content_watches(file)) {
> +               int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_READ;
> +               loff_t pos = vmf->pgoff << PAGE_SHIFT;
> +
> +               fpin = maybe_unlock_mmap_for_io(vmf, fpin);
> +
> +               /*
> +                * We can only emit the event if we did actually release the
> +                * mmap lock.
> +                */
> +               if (fpin) {
> +                       error = fsnotify_file_area_perm(fpin, mask, &pos,
> +                                                       PAGE_SIZE);

This is going to also emit a FAN_ACCESS_PERM event.
Heritage of the fact that read() has to emit FAN_ACCESS_PERM
for backward compat and a design decision to emit both
FAN_ACCESS_PERM and FAN_PRE_ACCESS to avoid carrying the
API baggage of the legacy FAN_ACCESS_PERM to pre-content events.

Suggestion as workaround - use MAY_ACCESS instead of MAY_READ
here and emit FAN_PRE_ACCESS with MAY_READ | MAY_ACCESS.

Thank you for pushing my patches through!
Amir.





[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux