On Mon, Apr 26, 2021 at 9:42 PM Gabriel Krisman Bertazi
<krisman@xxxxxxxxxxxxx> wrote:
>
> This adds support for the ring buffer mode in fanotify. It is enabled
> by a new flag FAN_PREALLOC_QUEUE passed to fanotify_init. If this flag
> is enabled, the group only allows marks that support the ring buffer

I don't like this limitation. I think FAN_PREALLOC_QUEUE can work
with other events, why not?

In any case, if we keep the ring buffer, please hook it up through a
fanotify_ring_buffer_ops struct instead of spraying if/else all over
the event queue implementation.

> submission. In a following patch, FAN_ERROR will make use of this
> mechanism.
>
> Signed-off-by: Gabriel Krisman Bertazi <krisman@xxxxxxxxxxxxx>
> ---
>  fs/notify/fanotify/fanotify.c      | 77 +++++++++++++++++++---------
>  fs/notify/fanotify/fanotify_user.c | 81 ++++++++++++++++++------------
>  include/linux/fanotify.h           |  5 +-
>  include/uapi/linux/fanotify.h      |  1 +
>  4 files changed, 105 insertions(+), 59 deletions(-)
>
> diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
> index e3669d8a4a64..98591a8155a7 100644
> --- a/fs/notify/fanotify/fanotify.c
> +++ b/fs/notify/fanotify/fanotify.c
> @@ -612,6 +612,26 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
>  	return event;
>  }
>
> +static struct fanotify_event *fanotify_ring_get_slot(struct fsnotify_group *group,
> +						     u32 mask, const void *data,
> +						     int data_type)
> +{
> +	size_t size = 0;
> +
> +	pr_debug("%s: group=%p mask=%x size=%lu\n", __func__, group, mask, size);
> +
> +	return FANOTIFY_E(fsnotify_ring_alloc_event_slot(group, size));
> +}
> +
> +static void fanotify_ring_write_event(struct fsnotify_group *group,
> +				      struct fanotify_event *event, u32 mask,
> +				      const void *data, __kernel_fsid_t *fsid)
> +{
> +	fanotify_init_event(group, event, 0, mask);
> +
> +	event->pid = get_pid(task_tgid(current));
> +}
> +
>  /*
>   * Get cached fsid of the filesystem containing the object from any connector.
>   * All connectors are supposed to have the same fsid, but we do not verify that
> @@ -701,31 +721,38 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
>  		return 0;
>  	}
>
> -	event = fanotify_alloc_event(group, mask, data, data_type, dir,
> -				     file_name, &fsid);
> -	ret = -ENOMEM;
> -	if (unlikely(!event)) {
> -		/*
> -		 * We don't queue overflow events for permission events as
> -		 * there the access is denied and so no event is in fact lost.
> -		 */
> -		if (!fanotify_is_perm_event(mask))
> -			fsnotify_queue_overflow(group);
> -		goto finish;
> -	}
> -
> -	fsn_event = &event->fse;
> -	ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
> -	if (ret) {
> -		/* Permission events shouldn't be merged */
> -		BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS);
> -		/* Our event wasn't used in the end. Free it. */
> -		fsnotify_destroy_event(group, fsn_event);
> -
> -		ret = 0;
> -	} else if (fanotify_is_perm_event(mask)) {
> -		ret = fanotify_get_response(group, FANOTIFY_PERM(event),
> -					    iter_info);
> +	if (group->flags & FSN_SUBMISSION_RING_BUFFER) {
> +		event = fanotify_ring_get_slot(group, mask, data, data_type);
> +		if (IS_ERR(event))
> +			return PTR_ERR(event);

So no FAN_Q_OVERFLOW with the ring buffer implementation?
This will be unexpected for fanotify users and frankly, less useful
IMO. I also don't see the technical reason to omit the overflow event.
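Even when the ring is full, the group can still report the drop.
Roughly (just a sketch, reusing the helpers introduced by this patch
and assuming an error from fsnotify_ring_alloc_event_slot() means "no
slot available"):

	if (group->flags & FSN_SUBMISSION_RING_BUFFER) {
		event = fanotify_ring_get_slot(group, mask, data, data_type);
		if (IS_ERR(event)) {
			/*
			 * No free slot - report the lost events with
			 * FAN_Q_OVERFLOW like the regular queue does.
			 */
			if (!fanotify_is_perm_event(mask))
				fsnotify_queue_overflow(group);
			return PTR_ERR(event);
		}
		fanotify_ring_write_event(group, event, mask, data, &fsid);
		fsnotify_ring_commit_slot(group, &event->fse);
	}

And to clarify the ops suggestion above, I mean something in this
spirit (all names below are made up):

	struct fsnotify_queue_ops {
		/* Allocate a new event or grab a pre-allocated slot */
		struct fsnotify_event *(*alloc_event)(struct fsnotify_group *group,
						      u32 mask, const void *data,
						      int data_type);
		/* Publish the event to readers */
		int (*queue_event)(struct fsnotify_group *group,
				   struct fsnotify_event *event);
		/* Release the event after it has been read */
		void (*free_event)(struct fsnotify_group *group,
				   struct fsnotify_event *event);
	};

Then fanotify_handle_event(), fanotify_read() and fanotify_release()
call the group's ops unconditionally and the ring buffer becomes just
another implementation of those ops, instead of a special case in
every path.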
> +		fanotify_ring_write_event(group, event, mask, data, &fsid);
> +		fsnotify_ring_commit_slot(group, &event->fse);
> +	} else {
> +		event = fanotify_alloc_event(group, mask, data, data_type, dir,
> +					     file_name, &fsid);
> +		ret = -ENOMEM;
> +		if (unlikely(!event)) {
> +			/*
> +			 * We don't queue overflow events for permission events as
> +			 * there the access is denied and so no event is in fact lost.
> +			 */
> +			if (!fanotify_is_perm_event(mask))
> +				fsnotify_queue_overflow(group);
> +			goto finish;
> +		}
> +		fsn_event = &event->fse;
> +		ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
> +		if (ret) {
> +			/* Permission events shouldn't be merged */
> +			BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS);
> +			/* Our event wasn't used in the end. Free it. */
> +			fsnotify_destroy_event(group, fsn_event);
> +
> +			ret = 0;
> +		} else if (fanotify_is_perm_event(mask)) {
> +			ret = fanotify_get_response(group, FANOTIFY_PERM(event),
> +						    iter_info);
> +		}
>  	}
>  finish:
>  	if (fanotify_is_perm_event(mask))
> diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
> index fe605359af88..5031198bf7db 100644
> --- a/fs/notify/fanotify/fanotify_user.c
> +++ b/fs/notify/fanotify/fanotify_user.c
> @@ -521,7 +521,9 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
>  	 * Permission events get queued to wait for response. Other
>  	 * events can be destroyed now.
>  	 */
> -	if (!fanotify_is_perm_event(event->mask)) {
> +	if (group->fanotify_data.flags & FAN_PREALLOC_QUEUE) {
> +		fsnotify_ring_buffer_consume_event(group, &event->fse);
> +	} else if (!fanotify_is_perm_event(event->mask)) {
>  		fsnotify_destroy_event(group, &event->fse);
>  	} else {
>  		if (ret <= 0) {
> @@ -587,40 +589,39 @@ static int fanotify_release(struct inode *ignored, struct file *file)
>  	 */
>  	fsnotify_group_stop_queueing(group);
>
> -	/*
> -	 * Process all permission events on access_list and notification queue
> -	 * and simulate reply from userspace.
> -	 */
> -	spin_lock(&group->notification_lock);
> -	while (!list_empty(&group->fanotify_data.access_list)) {
> -		struct fanotify_perm_event *event;
> -
> -		event = list_first_entry(&group->fanotify_data.access_list,
> -					 struct fanotify_perm_event, fae.fse.list);
> -		list_del_init(&event->fae.fse.list);
> -		finish_permission_event(group, event, FAN_ALLOW);
> +	if (!(group->flags & FSN_SUBMISSION_RING_BUFFER)) {
> +		/*
> +		 * Process all permission events on access_list and notification queue
> +		 * and simulate reply from userspace.
> +		 */
>  		spin_lock(&group->notification_lock);
> -	}
> -
> -	/*
> -	 * Destroy all non-permission events. For permission events just
> -	 * dequeue them and set the response. They will be freed once the
> -	 * response is consumed and fanotify_get_response() returns.
> -	 */
> -	while (!fsnotify_notify_queue_is_empty(group)) {
> -		struct fanotify_event *event;
> -
> -		event = FANOTIFY_E(fsnotify_remove_first_event(group));
> -		if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
> -			spin_unlock(&group->notification_lock);
> -			fsnotify_destroy_event(group, &event->fse);
> -		} else {
> -			finish_permission_event(group, FANOTIFY_PERM(event),
> -						FAN_ALLOW);
> +		while (!list_empty(&group->fanotify_data.access_list)) {
> +			struct fanotify_perm_event *event;
> +			event = list_first_entry(&group->fanotify_data.access_list,
> +						 struct fanotify_perm_event, fae.fse.list);
> +			list_del_init(&event->fae.fse.list);
> +			finish_permission_event(group, event, FAN_ALLOW);
> +			spin_lock(&group->notification_lock);
> +		}
> -		spin_lock(&group->notification_lock);
> +		/*
> +		 * Destroy all non-permission events. For permission events just
> +		 * dequeue them and set the response. They will be freed once the
> +		 * response is consumed and fanotify_get_response() returns.
> +		 */
> +		while (!fsnotify_notify_queue_is_empty(group)) {
> +			struct fanotify_event *event;
> +			event = FANOTIFY_E(fsnotify_remove_first_event(group));
> +			if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
> +				spin_unlock(&group->notification_lock);
> +				fsnotify_destroy_event(group, &event->fse);
> +			} else {
> +				finish_permission_event(group, FANOTIFY_PERM(event),
> +							FAN_ALLOW);
> +			}
> +			spin_lock(&group->notification_lock);
> +		}
> +		spin_unlock(&group->notification_lock);
>  	}
> -	spin_unlock(&group->notification_lock);
>
>  	/* Response for all permission events it set, wakeup waiters */
>  	wake_up(&group->fanotify_data.access_waitq);
> @@ -981,6 +982,16 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
>  	if (flags & FAN_NONBLOCK)
>  		f_flags |= O_NONBLOCK;
>
> +	if (flags & FAN_PREALLOC_QUEUE) {
> +		if (!capable(CAP_SYS_ADMIN))
> +			return -EPERM;
> +
> +		if (flags & FAN_UNLIMITED_QUEUE)
> +			return -EINVAL;
> +
> +		fsn_flags = FSN_SUBMISSION_RING_BUFFER;
> +	}
> +
>  	/* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
>  	group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops, fsn_flags);
>  	if (IS_ERR(group)) {
> @@ -1223,6 +1234,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
>  		goto fput_and_out;
>  	}
>
> +	if ((group->flags & FSN_SUBMISSION_RING_BUFFER) &&
> +	    (mask & ~FANOTIFY_SUBMISSION_BUFFER_EVENTS))
> +		goto fput_and_out;
> +
>  	ret = fanotify_find_path(dfd, pathname, &path, flags,
>  				 (mask & ALL_FSNOTIFY_EVENTS), obj_type);
>  	if (ret)
> @@ -1327,7 +1342,7 @@ SYSCALL32_DEFINE6(fanotify_mark,
>   */
>  static int __init fanotify_user_setup(void)
>  {
> -	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
> +	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11);
>  	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
>
>  	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
> diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
> index 3e9c56ee651f..5a4cefb4b1c3 100644
> --- a/include/linux/fanotify.h
> +++ b/include/linux/fanotify.h
> @@ -23,7 +23,8 @@
>  #define FANOTIFY_INIT_FLAGS	(FANOTIFY_CLASS_BITS | FANOTIFY_FID_BITS | \
>  				 FAN_REPORT_TID | \
>  				 FAN_CLOEXEC | FAN_NONBLOCK | \
> -				 FAN_UNLIMITED_QUEUE | FAN_UNLIMITED_MARKS)
> +				 FAN_UNLIMITED_QUEUE | FAN_UNLIMITED_MARKS | \
> +				 FAN_PREALLOC_QUEUE)
>
>  #define FANOTIFY_MARK_TYPE_BITS	(FAN_MARK_INODE | FAN_MARK_MOUNT | \
>  				 FAN_MARK_FILESYSTEM)
>
> @@ -71,6 +72,8 @@
>  					 FANOTIFY_PERM_EVENTS | \
>  					 FAN_Q_OVERFLOW | FAN_ONDIR)
>
> +#define FANOTIFY_SUBMISSION_BUFFER_EVENTS	0

FANOTIFY_RING_BUFFER_EVENTS? FANOTIFY_PREALLOC_EVENTS?
Please leave a comment above to state what this group means.

I *think* there is no reason to limit the set of events, only the sort
of information that can be reported with FAN_PREALLOC_QUEUE.
Perhaps FAN_REPORT_FID cannot be allowed and as a result
FANOTIFY_INODE_EVENTS will not be allowed, but I am not even sure
that this limitation is needed.
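For example (the name FANOTIFY_RING_BUFFER_EVENTS and the comment
wording are only my suggestion - the actual set of event bits depends
on what FAN_ERROR ends up needing):

	/*
	 * Events that may be reported by a group initialized with
	 * FAN_PREALLOC_QUEUE, i.e. events that can be submitted via a
	 * pre-allocated ring buffer slot.
	 */
	#define FANOTIFY_RING_BUFFER_EVENTS	0

Thanks,
Amir.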