Re: [PATCH 11/22] fsnotify: Provide framework for dropping SRCU lock in ->handle_event

Jan Kara <jack@xxxxxxx> · Tue, 10 Jan 2017 14:00:23 +0100

On Sun 08-01-17 11:02:24, Amir Goldstein wrote:
> On Fri, Jan 6, 2017 at 12:43 PM, Jan Kara <jack@xxxxxxx> wrote:
> > fanotify wants to drop fsnotify_mark_srcu lock when waiting for response
> > from userspace so that the whole notification subsystem is not blocked
> > during that time. This patch provides a framework for safely getting
> > mark reference for a mark found in the object list which pins the mark
> > in that list. We can then drop fsnotify_mark_srcu, wait for userspace
> > response and then safely continue iteration of the object list once we
> > reaquire fsnotify_mark_srcu.
> >
> > Reviewed-by: Amir Goldstein <amir73il@xxxxxxxxx>
> > Signed-off-by: Jan Kara <jack@xxxxxxx>
> 
> Patch looks good, but I wonder out-loud:
> 
> Shouldn't this infrastructure be kept more generic and not only for
> the use case of
> grabbing refcount before user sleep, i.e.:
> 
> fsnotify_prepare_user_wait(inode_mark, vfsmount_mark, srcu_idx) ->
>     fsnotify_get_marks_safe(inode_mark, vfsmount_mark, srcu_idx)
> 
> I am not sure it will ever make sense for performance, but is there
> an inherent reason why marks references could not be taken for
> any group on object list iteration (in fsnotify()) *before* handle_event()
> and regardless of user wait?
> 
> If there is no "inherent" reason, then I think the API should remain
> generic (i.e. fsnotify_get_marks_safe()) and in your implementation
> it can be called only for user wait case.

So there's no inherent reason (except for performance as you noted).
However the SRCU handling and user_waits counter in the notification group
handling are quite special so if anybody wanted to use it somewhere else,
he'd better give it a good thought. So for now I'd leave the naming as is
and if we ever come with another use for the framework, we can easily
rename the functions...

								Honza

> > ---
> >  fs/notify/group.c                |  1 +
> >  fs/notify/mark.c                 | 86 ++++++++++++++++++++++++++++++++++++++++
> >  include/linux/fsnotify_backend.h |  8 ++++
> >  3 files changed, 95 insertions(+)
> >
> > diff --git a/fs/notify/group.c b/fs/notify/group.c
> > index 0fb4aadcc19f..79439cdf16e0 100644
> > --- a/fs/notify/group.c
> > +++ b/fs/notify/group.c
> > @@ -126,6 +126,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
> >         /* set to 0 when there a no external references to this group */
> >         atomic_set(&group->refcnt, 1);
> >         atomic_set(&group->num_marks, 0);
> > +       atomic_set(&group->user_waits, 0);
> >
> >         spin_lock_init(&group->notification_lock);
> >         INIT_LIST_HEAD(&group->notification_list);
> > diff --git a/fs/notify/mark.c b/fs/notify/mark.c
> > index 47162c13fc0f..fb9b9f080da6 100644
> > --- a/fs/notify/mark.c
> > +++ b/fs/notify/mark.c
> > @@ -109,6 +109,16 @@ void fsnotify_get_mark(struct fsnotify_mark *mark)
> >         atomic_inc(&mark->refcnt);
> >  }
> >
> > +/*
> > + * Get mark reference when we found the mark via lockless traversal of object
> > + * list. Mark can be already removed from the list by now and on its way to be
> > + * destroyed once SRCU period ends.
> > + */
> > +static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark)
> > +{
> > +       return atomic_inc_not_zero(&mark->refcnt);
> > +}
> > +
> >  static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> >  {
> >         u32 new_mask = 0;
> > @@ -242,6 +252,76 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
> >         }
> >  }
> >
> > +bool fsnotify_prepare_user_wait(struct fsnotify_mark *inode_mark,
> > +                               struct fsnotify_mark *vfsmount_mark,
> > +                               int *srcu_idx)
> > +{
> > +       struct fsnotify_group *group;
> > +
> > +       if (WARN_ON_ONCE(!inode_mark && !vfsmount_mark))
> > +               return false;
> > +
> > +       if (inode_mark)
> > +               group = inode_mark->group;
> > +       else
> > +               group = vfsmount_mark->group;
> > +
> > +       /*
> > +        * Since acquisition of mark reference is an atomic op as well, we can
> > +        * be sure this inc is seen before any effect of refcount increment.
> > +        */
> > +       atomic_inc(&group->user_waits);
> > +
> > +       if (inode_mark) {
> > +               /* This can fail if mark is being removed */
> > +               if (!fsnotify_get_mark_safe(inode_mark))
> > +                       goto out_wait;
> > +       }
> > +       if (vfsmount_mark) {
> > +               if (!fsnotify_get_mark_safe(vfsmount_mark))
> > +                       goto out_inode;
> > +       }
> > +
> > +       /*
> > +        * Now that both marks are pinned by refcount in the inode / vfsmount
> > +        * lists, we can drop SRCU lock, and safely resume the list iteration
> > +        * once userspace returns.
> > +        */
> > +       srcu_read_unlock(&fsnotify_mark_srcu, *srcu_idx);
> > +
> > +       return true;
> > +out_inode:
> > +       if (inode_mark)
> > +               fsnotify_put_mark(inode_mark);
> > +out_wait:
> > +       if (atomic_dec_and_test(&group->user_waits) && group->shutdown)
> > +               wake_up(&group->notification_waitq);
> > +       return false;
> > +}
> > +
> > +void fsnotify_finish_user_wait(struct fsnotify_mark *inode_mark,
> > +                              struct fsnotify_mark *vfsmount_mark,
> > +                              int *srcu_idx)
> > +{
> > +       struct fsnotify_group *group = NULL;
> > +
> > +       *srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
> > +       if (inode_mark) {
> > +               group = inode_mark->group;
> > +               fsnotify_put_mark(inode_mark);
> > +       }
> > +       if (vfsmount_mark) {
> > +               group = vfsmount_mark->group;
> > +               fsnotify_put_mark(vfsmount_mark);
> > +       }
> > +       /*
> > +        * We abuse notification_waitq on group shutdown for waiting for all
> > +        * marks pinned when waiting for userspace.
> > +        */
> > +       if (atomic_dec_and_test(&group->user_waits) && group->shutdown)
> > +               wake_up(&group->notification_waitq);
> > +}
> > +
> >  /*
> >   * Mark mark as dead, remove it from group list. Mark still stays in object
> >   * list until its last reference is dropped. Note that we rely on mark being
> > @@ -631,6 +711,12 @@ void fsnotify_detach_group_marks(struct fsnotify_group *group)
> >                 fsnotify_free_mark(mark);
> >                 fsnotify_put_mark(mark);
> >         }
> > +       /*
> > +        * Some marks can still be pinned when waiting for response from
> > +        * userspace. Wait for those now. fsnotify_prepare_user_wait() will
> > +        * not succeed now so this wait is race-free.
> > +        */
> > +       wait_event(group->notification_waitq, !atomic_read(&group->user_waits));
> >  }
> >
> >  /* Destroy all marks attached to inode / vfsmount */
> > diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
> > index 64ce249288ad..786dc1cf715c 100644
> > --- a/include/linux/fsnotify_backend.h
> > +++ b/include/linux/fsnotify_backend.h
> > @@ -162,6 +162,8 @@ struct fsnotify_group {
> >         struct fsnotify_event *overflow_event;  /* Event we queue when the
> >                                                  * notification list is too
> >                                                  * full */
> > +       atomic_t user_waits;            /* Number of tasks waiting for user
> > +                                        * response */
> >
> >         /* groups can define private fields here or use the void *private */
> >         union {
> > @@ -367,6 +369,12 @@ extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, un
> >  extern void fsnotify_get_mark(struct fsnotify_mark *mark);
> >  extern void fsnotify_put_mark(struct fsnotify_mark *mark);
> >  extern void fsnotify_unmount_inodes(struct super_block *sb);
> > +extern void fsnotify_finish_user_wait(struct fsnotify_mark *inode_mark,
> > +                                     struct fsnotify_mark *vfsmount_mark,
> > +                                     int *srcu_idx);
> > +extern bool fsnotify_prepare_user_wait(struct fsnotify_mark *inode_mark,
> > +                                      struct fsnotify_mark *vfsmount_mark,
> > +                                      int *srcu_idx);
> >
> >  /* put here because inotify does some weird stuff when destroying watches */
> >  extern void fsnotify_init_event(struct fsnotify_event *event,
> > --
> > 2.10.2
> >
-- 
Jan Kara <jack@xxxxxxxx>
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html