Re: [PATCH 11/22] fsnotify: Provide framework for dropping SRCU lock in ->handle_event

Amir Goldstein <amir73il@xxxxxxxxx> · Sun, 8 Jan 2017 11:02:24 +0200

On Fri, Jan 6, 2017 at 12:43 PM, Jan Kara <jack@xxxxxxx> wrote:
> fanotify wants to drop fsnotify_mark_srcu lock when waiting for response
> from userspace so that the whole notification subsystem is not blocked
> during that time. This patch provides a framework for safely getting
> mark reference for a mark found in the object list which pins the mark
> in that list. We can then drop fsnotify_mark_srcu, wait for userspace
> response and then safely continue iteration of the object list once we
> reaquire fsnotify_mark_srcu.
>
> Reviewed-by: Amir Goldstein <amir73il@xxxxxxxxx>
> Signed-off-by: Jan Kara <jack@xxxxxxx>

Patch looks good, but I wonder out-loud:

Shouldn't this infrastructure be kept more generic and not only for
the use case of
grabbing refcount before user sleep, i.e.:

fsnotify_prepare_user_wait(inode_mark, vfsmount_mark, srcu_idx) ->
    fsnotify_get_marks_safe(inode_mark, vfsmount_mark, srcu_idx)

I am not sure it will ever make sense for performance, but is there
an inherent reason why marks references could not be taken for
any group on object list iteration (in fsnotify()) *before* handle_event()
and regardless of user wait?

If there is no "inherent" reason, then I think the API should remain
generic (i.e. fsnotify_get_marks_safe()) and in your implementation
it can be called only for user wait case.

If you think that using this infrastructure for anything other then user
sleep case, then disregard my comment.

> ---
>  fs/notify/group.c                |  1 +
>  fs/notify/mark.c                 | 86 ++++++++++++++++++++++++++++++++++++++++
>  include/linux/fsnotify_backend.h |  8 ++++
>  3 files changed, 95 insertions(+)
>
> diff --git a/fs/notify/group.c b/fs/notify/group.c
> index 0fb4aadcc19f..79439cdf16e0 100644
> --- a/fs/notify/group.c
> +++ b/fs/notify/group.c
> @@ -126,6 +126,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
>         /* set to 0 when there a no external references to this group */
>         atomic_set(&group->refcnt, 1);
>         atomic_set(&group->num_marks, 0);
> +       atomic_set(&group->user_waits, 0);
>
>         spin_lock_init(&group->notification_lock);
>         INIT_LIST_HEAD(&group->notification_list);
> diff --git a/fs/notify/mark.c b/fs/notify/mark.c
> index 47162c13fc0f..fb9b9f080da6 100644
> --- a/fs/notify/mark.c
> +++ b/fs/notify/mark.c
> @@ -109,6 +109,16 @@ void fsnotify_get_mark(struct fsnotify_mark *mark)
>         atomic_inc(&mark->refcnt);
>  }
>
> +/*
> + * Get mark reference when we found the mark via lockless traversal of object
> + * list. Mark can be already removed from the list by now and on its way to be
> + * destroyed once SRCU period ends.
> + */
> +static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark)
> +{
> +       return atomic_inc_not_zero(&mark->refcnt);
> +}
> +
>  static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
>  {
>         u32 new_mask = 0;
> @@ -242,6 +252,76 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
>         }
>  }
>
> +bool fsnotify_prepare_user_wait(struct fsnotify_mark *inode_mark,
> +                               struct fsnotify_mark *vfsmount_mark,
> +                               int *srcu_idx)
> +{
> +       struct fsnotify_group *group;
> +
> +       if (WARN_ON_ONCE(!inode_mark && !vfsmount_mark))
> +               return false;
> +
> +       if (inode_mark)
> +               group = inode_mark->group;
> +       else
> +               group = vfsmount_mark->group;
> +
> +       /*
> +        * Since acquisition of mark reference is an atomic op as well, we can
> +        * be sure this inc is seen before any effect of refcount increment.
> +        */
> +       atomic_inc(&group->user_waits);
> +
> +       if (inode_mark) {
> +               /* This can fail if mark is being removed */
> +               if (!fsnotify_get_mark_safe(inode_mark))
> +                       goto out_wait;
> +       }
> +       if (vfsmount_mark) {
> +               if (!fsnotify_get_mark_safe(vfsmount_mark))
> +                       goto out_inode;
> +       }
> +
> +       /*
> +        * Now that both marks are pinned by refcount in the inode / vfsmount
> +        * lists, we can drop SRCU lock, and safely resume the list iteration
> +        * once userspace returns.
> +        */
> +       srcu_read_unlock(&fsnotify_mark_srcu, *srcu_idx);
> +
> +       return true;
> +out_inode:
> +       if (inode_mark)
> +               fsnotify_put_mark(inode_mark);
> +out_wait:
> +       if (atomic_dec_and_test(&group->user_waits) && group->shutdown)
> +               wake_up(&group->notification_waitq);
> +       return false;
> +}
> +
> +void fsnotify_finish_user_wait(struct fsnotify_mark *inode_mark,
> +                              struct fsnotify_mark *vfsmount_mark,
> +                              int *srcu_idx)
> +{
> +       struct fsnotify_group *group = NULL;
> +
> +       *srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
> +       if (inode_mark) {
> +               group = inode_mark->group;
> +               fsnotify_put_mark(inode_mark);
> +       }
> +       if (vfsmount_mark) {
> +               group = vfsmount_mark->group;
> +               fsnotify_put_mark(vfsmount_mark);
> +       }
> +       /*
> +        * We abuse notification_waitq on group shutdown for waiting for all
> +        * marks pinned when waiting for userspace.
> +        */
> +       if (atomic_dec_and_test(&group->user_waits) && group->shutdown)
> +               wake_up(&group->notification_waitq);
> +}
> +
>  /*
>   * Mark mark as dead, remove it from group list. Mark still stays in object
>   * list until its last reference is dropped. Note that we rely on mark being
> @@ -631,6 +711,12 @@ void fsnotify_detach_group_marks(struct fsnotify_group *group)
>                 fsnotify_free_mark(mark);
>                 fsnotify_put_mark(mark);
>         }
> +       /*
> +        * Some marks can still be pinned when waiting for response from
> +        * userspace. Wait for those now. fsnotify_prepare_user_wait() will
> +        * not succeed now so this wait is race-free.
> +        */
> +       wait_event(group->notification_waitq, !atomic_read(&group->user_waits));
>  }
>
>  /* Destroy all marks attached to inode / vfsmount */
> diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
> index 64ce249288ad..786dc1cf715c 100644
> --- a/include/linux/fsnotify_backend.h
> +++ b/include/linux/fsnotify_backend.h
> @@ -162,6 +162,8 @@ struct fsnotify_group {
>         struct fsnotify_event *overflow_event;  /* Event we queue when the
>                                                  * notification list is too
>                                                  * full */
> +       atomic_t user_waits;            /* Number of tasks waiting for user
> +                                        * response */
>
>         /* groups can define private fields here or use the void *private */
>         union {
> @@ -367,6 +369,12 @@ extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, un
>  extern void fsnotify_get_mark(struct fsnotify_mark *mark);
>  extern void fsnotify_put_mark(struct fsnotify_mark *mark);
>  extern void fsnotify_unmount_inodes(struct super_block *sb);
> +extern void fsnotify_finish_user_wait(struct fsnotify_mark *inode_mark,
> +                                     struct fsnotify_mark *vfsmount_mark,
> +                                     int *srcu_idx);
> +extern bool fsnotify_prepare_user_wait(struct fsnotify_mark *inode_mark,
> +                                      struct fsnotify_mark *vfsmount_mark,
> +                                      int *srcu_idx);
>
>  /* put here because inotify does some weird stuff when destroying watches */
>  extern void fsnotify_init_event(struct fsnotify_event *event,
> --
> 2.10.2
>
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html