On Sun 08-01-17 11:02:24, Amir Goldstein wrote: > On Fri, Jan 6, 2017 at 12:43 PM, Jan Kara <jack@xxxxxxx> wrote: > > fanotify wants to drop fsnotify_mark_srcu lock when waiting for response > > from userspace so that the whole notification subsystem is not blocked > > during that time. This patch provides a framework for safely getting > > mark reference for a mark found in the object list which pins the mark > > in that list. We can then drop fsnotify_mark_srcu, wait for userspace > > response and then safely continue iteration of the object list once we > > reaquire fsnotify_mark_srcu. > > > > Reviewed-by: Amir Goldstein <amir73il@xxxxxxxxx> > > Signed-off-by: Jan Kara <jack@xxxxxxx> > > Patch looks good, but I wonder out-loud: > > Shouldn't this infrastructure be kept more generic and not only for > the use case of > grabbing refcount before user sleep, i.e.: > > fsnotify_prepare_user_wait(inode_mark, vfsmount_mark, srcu_idx) -> > fsnotify_get_marks_safe(inode_mark, vfsmount_mark, srcu_idx) > > I am not sure it will ever make sense for performance, but is there > an inherent reason why marks references could not be taken for > any group on object list iteration (in fsnotify()) *before* handle_event() > and regardless of user wait? > > If there is no "inherent" reason, then I think the API should remain > generic (i.e. fsnotify_get_marks_safe()) and in your implementation > it can be called only for user wait case. So there's no inherent reason (except for performance as you noted). However the SRCU handling and user_waits counter in the notification group handling are quite special so if anybody wanted to use it somewhere else, he'd better give it a good thought. So for now I'd leave the naming as is and if we ever come with another use for the framework, we can easily rename the functions... Honza > > --- > > fs/notify/group.c | 1 + > > fs/notify/mark.c | 86 ++++++++++++++++++++++++++++++++++++++++ > > include/linux/fsnotify_backend.h | 8 ++++ > > 3 files changed, 95 insertions(+) > > > > diff --git a/fs/notify/group.c b/fs/notify/group.c > > index 0fb4aadcc19f..79439cdf16e0 100644 > > --- a/fs/notify/group.c > > +++ b/fs/notify/group.c > > @@ -126,6 +126,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) > > /* set to 0 when there a no external references to this group */ > > atomic_set(&group->refcnt, 1); > > atomic_set(&group->num_marks, 0); > > + atomic_set(&group->user_waits, 0); > > > > spin_lock_init(&group->notification_lock); > > INIT_LIST_HEAD(&group->notification_list); > > diff --git a/fs/notify/mark.c b/fs/notify/mark.c > > index 47162c13fc0f..fb9b9f080da6 100644 > > --- a/fs/notify/mark.c > > +++ b/fs/notify/mark.c > > @@ -109,6 +109,16 @@ void fsnotify_get_mark(struct fsnotify_mark *mark) > > atomic_inc(&mark->refcnt); > > } > > > > +/* > > + * Get mark reference when we found the mark via lockless traversal of object > > + * list. Mark can be already removed from the list by now and on its way to be > > + * destroyed once SRCU period ends. > > + */ > > +static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark) > > +{ > > + return atomic_inc_not_zero(&mark->refcnt); > > +} > > + > > static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) > > { > > u32 new_mask = 0; > > @@ -242,6 +252,76 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) > > } > > } > > > > +bool fsnotify_prepare_user_wait(struct fsnotify_mark *inode_mark, > > + struct fsnotify_mark *vfsmount_mark, > > + int *srcu_idx) > > +{ > > + struct fsnotify_group *group; > > + > > + if (WARN_ON_ONCE(!inode_mark && !vfsmount_mark)) > > + return false; > > + > > + if (inode_mark) > > + group = inode_mark->group; > > + else > > + group = vfsmount_mark->group; > > + > > + /* > > + * Since acquisition of mark reference is an atomic op as well, we can > > + * be sure this inc is seen before any effect of refcount increment. > > + */ > > + atomic_inc(&group->user_waits); > > + > > + if (inode_mark) { > > + /* This can fail if mark is being removed */ > > + if (!fsnotify_get_mark_safe(inode_mark)) > > + goto out_wait; > > + } > > + if (vfsmount_mark) { > > + if (!fsnotify_get_mark_safe(vfsmount_mark)) > > + goto out_inode; > > + } > > + > > + /* > > + * Now that both marks are pinned by refcount in the inode / vfsmount > > + * lists, we can drop SRCU lock, and safely resume the list iteration > > + * once userspace returns. > > + */ > > + srcu_read_unlock(&fsnotify_mark_srcu, *srcu_idx); > > + > > + return true; > > +out_inode: > > + if (inode_mark) > > + fsnotify_put_mark(inode_mark); > > +out_wait: > > + if (atomic_dec_and_test(&group->user_waits) && group->shutdown) > > + wake_up(&group->notification_waitq); > > + return false; > > +} > > + > > +void fsnotify_finish_user_wait(struct fsnotify_mark *inode_mark, > > + struct fsnotify_mark *vfsmount_mark, > > + int *srcu_idx) > > +{ > > + struct fsnotify_group *group = NULL; > > + > > + *srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); > > + if (inode_mark) { > > + group = inode_mark->group; > > + fsnotify_put_mark(inode_mark); > > + } > > + if (vfsmount_mark) { > > + group = vfsmount_mark->group; > > + fsnotify_put_mark(vfsmount_mark); > > + } > > + /* > > + * We abuse notification_waitq on group shutdown for waiting for all > > + * marks pinned when waiting for userspace. > > + */ > > + if (atomic_dec_and_test(&group->user_waits) && group->shutdown) > > + wake_up(&group->notification_waitq); > > +} > > + > > /* > > * Mark mark as dead, remove it from group list. Mark still stays in object > > * list until its last reference is dropped. Note that we rely on mark being > > @@ -631,6 +711,12 @@ void fsnotify_detach_group_marks(struct fsnotify_group *group) > > fsnotify_free_mark(mark); > > fsnotify_put_mark(mark); > > } > > + /* > > + * Some marks can still be pinned when waiting for response from > > + * userspace. Wait for those now. fsnotify_prepare_user_wait() will > > + * not succeed now so this wait is race-free. > > + */ > > + wait_event(group->notification_waitq, !atomic_read(&group->user_waits)); > > } > > > > /* Destroy all marks attached to inode / vfsmount */ > > diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h > > index 64ce249288ad..786dc1cf715c 100644 > > --- a/include/linux/fsnotify_backend.h > > +++ b/include/linux/fsnotify_backend.h > > @@ -162,6 +162,8 @@ struct fsnotify_group { > > struct fsnotify_event *overflow_event; /* Event we queue when the > > * notification list is too > > * full */ > > + atomic_t user_waits; /* Number of tasks waiting for user > > + * response */ > > > > /* groups can define private fields here or use the void *private */ > > union { > > @@ -367,6 +369,12 @@ extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, un > > extern void fsnotify_get_mark(struct fsnotify_mark *mark); > > extern void fsnotify_put_mark(struct fsnotify_mark *mark); > > extern void fsnotify_unmount_inodes(struct super_block *sb); > > +extern void fsnotify_finish_user_wait(struct fsnotify_mark *inode_mark, > > + struct fsnotify_mark *vfsmount_mark, > > + int *srcu_idx); > > +extern bool fsnotify_prepare_user_wait(struct fsnotify_mark *inode_mark, > > + struct fsnotify_mark *vfsmount_mark, > > + int *srcu_idx); > > > > /* put here because inotify does some weird stuff when destroying watches */ > > extern void fsnotify_init_event(struct fsnotify_event *event, > > -- > > 2.10.2 > > -- Jan Kara <jack@xxxxxxxx> SUSE Labs, CR -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html