On Wed, 2016-04-27 at 09:56 +0200, Jan Kara wrote: > Inotify instance is destroyed when all references to it are dropped. > That not only means that the corresponding file descriptor needs to be > closed but also that all corresponding instance marks are freed (as each > mark holds a reference to the inotify instance). However marks are freed > only after SRCU period ends which can take some time and thus if > user rapidly creates and frees inotify instances, number of existing > inotify instances can exceed max_user_instances limit although from user > point of view there is always at most one existing instance. Thus > inotify_init() returns EMFILE error which is hard to justify from user > point of view. This problem is exposed by LTP inotify06 testcase on some > machines. > > We fix the problem by making sure all group marks are properly freed > while destroying inotify instance. We wait for SRCU period to end in > that path anyway since we have to make sure there is no event being > added to the instance while we are tearing down the instance. So it > takes only some plumbing to allow for marks to be destroyed in that path > as well and not from a dedicated work item. > > Reported-and-tested-by: Xiaoguang Wang <wangxg.fnst@xxxxxxxxxxxxxx> > Signed-off-by: Jan Kara <jack@xxxxxxx> Acked-by: Jeff Layton <jlayton@xxxxxxxxxxxxxxx> > --- > fs/notify/fsnotify.h | 5 +++ > fs/notify/group.c | 17 ++++++--- > fs/notify/mark.c | 78 +++++++++++++++++++++++++++++++--------- > include/linux/fsnotify_backend.h | 2 -- > 4 files changed, 79 insertions(+), 23 deletions(-) > > Andrew, can you please merge this patch? Thanks! > > diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h > index b44c68a857e7..72fb4f33982f 100644 > --- a/fs/notify/fsnotify.h > +++ b/fs/notify/fsnotify.h > @@ -56,6 +56,11 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) > fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks, > &mnt->mnt_root->d_lock); > } > +/* prepare for freeing all marks associated with given group */ > +extern void fsnotify_detach_group_marks(struct fsnotify_group *group); > +/* wait for fsnotify_mark_srcu period to end and free all marks in destroy_list */ > +extern void fsnotify_mark_destroy_list(void); > + > /* > * update the dentry->d_flags of all of inode's children to indicate if inode cares > * about events that happen to its children. > diff --git a/fs/notify/group.c b/fs/notify/group.c > index d16b62cb2854..3e2dd85be5dd 100644 > --- a/fs/notify/group.c > +++ b/fs/notify/group.c > @@ -47,12 +47,21 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) > */ > void fsnotify_destroy_group(struct fsnotify_group *group) > { > - /* clear all inode marks for this group */ > - fsnotify_clear_marks_by_group(group); > + /* clear all inode marks for this group, attach them to destroy_list */ > + fsnotify_detach_group_marks(group); > > - synchronize_srcu(&fsnotify_mark_srcu); > + /* > + * Wait for fsnotify_mark_srcu period to end and free all marks in > + * destroy_list > + */ > + fsnotify_mark_destroy_list(); > > - /* clear the notification queue of all events */ > + /* > + * Since we have waited for fsnotify_mark_srcu in > + * fsnotify_mark_destroy_list() there can be no outstanding event > + * notification against this group. So clearing the notification queue > + * of all events is reliable now. > + */ > fsnotify_flush_notify(group); > > /* > diff --git a/fs/notify/mark.c b/fs/notify/mark.c > index 7115c5d7d373..d3fea0bd89e2 100644 > --- a/fs/notify/mark.c > +++ b/fs/notify/mark.c > @@ -97,8 +97,8 @@ struct srcu_struct fsnotify_mark_srcu; > static DEFINE_SPINLOCK(destroy_lock); > static LIST_HEAD(destroy_list); > > -static void fsnotify_mark_destroy(struct work_struct *work); > -static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy); > +static void fsnotify_mark_destroy_workfn(struct work_struct *work); > +static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn); > > void fsnotify_get_mark(struct fsnotify_mark *mark) > { > @@ -173,11 +173,15 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) > } > > /* > - * Free fsnotify mark. The freeing is actually happening from a kthread which > - * first waits for srcu period end. Caller must have a reference to the mark > - * or be protected by fsnotify_mark_srcu. > + * Prepare mark for freeing and add it to the list of marks prepared for > + * freeing. The actual freeing must happen after SRCU period ends and the > + * caller is responsible for this. > + * > + * The function returns true if the mark was added to the list of marks for > + * freeing. The function returns false if someone else has already called > + * __fsnotify_free_mark() for the mark. > */ > -void fsnotify_free_mark(struct fsnotify_mark *mark) > +static bool __fsnotify_free_mark(struct fsnotify_mark *mark) > { > struct fsnotify_group *group = mark->group; > > @@ -185,17 +189,11 @@ void fsnotify_free_mark(struct fsnotify_mark *mark) > /* something else already called this function on this mark */ > if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { > spin_unlock(&mark->lock); > - return; > + return false; > } > mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; > spin_unlock(&mark->lock); > > - spin_lock(&destroy_lock); > - list_add(&mark->g_list, &destroy_list); > - spin_unlock(&destroy_lock); > - queue_delayed_work(system_unbound_wq, &reaper_work, > - FSNOTIFY_REAPER_DELAY); > - > /* > * Some groups like to know that marks are being freed. This is a > * callback to the group function to let it know that this mark > @@ -203,6 +201,25 @@ void fsnotify_free_mark(struct fsnotify_mark *mark) > */ > if (group->ops->freeing_mark) > group->ops->freeing_mark(mark, group); > + > + spin_lock(&destroy_lock); > + list_add(&mark->g_list, &destroy_list); > + spin_unlock(&destroy_lock); > + > + return true; > +} > + > +/* > + * Free fsnotify mark. The freeing is actually happening from a workqueue which > + * first waits for srcu period end. Caller must have a reference to the mark > + * or be protected by fsnotify_mark_srcu. > + */ > +void fsnotify_free_mark(struct fsnotify_mark *mark) > +{ > + if (__fsnotify_free_mark(mark)) { > + queue_delayed_work(system_unbound_wq, &reaper_work, > + FSNOTIFY_REAPER_DELAY); > + } > } > > void fsnotify_destroy_mark(struct fsnotify_mark *mark, > @@ -468,11 +485,29 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, > } > > /* > - * Given a group, destroy all of the marks associated with that group. > + * Given a group, prepare for freeing all the marks associated with that group. > + * The marks are attached to the list of marks prepared for destruction, the > + * caller is responsible for freeing marks in that list after SRCU period has > + * ended. > */ > -void fsnotify_clear_marks_by_group(struct fsnotify_group *group) > +void fsnotify_detach_group_marks(struct fsnotify_group *group) > { > - fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1); > + struct fsnotify_mark *mark; > + > + while (1) { > + mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); > + if (list_empty(&group->marks_list)) { > + mutex_unlock(&group->mark_mutex); > + break; > + } > + mark = list_first_entry(&group->marks_list, > + struct fsnotify_mark, g_list); > + fsnotify_get_mark(mark); > + fsnotify_detach_mark(mark); > + mutex_unlock(&group->mark_mutex); > + __fsnotify_free_mark(mark); > + fsnotify_put_mark(mark); > + } > } > > void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old) > @@ -499,7 +534,11 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, > mark->free_mark = free_mark; > } > > -static void fsnotify_mark_destroy(struct work_struct *work) > +/* > + * Destroy all marks in destroy_list, waits for SRCU period to finish before > + * actually freeing marks. > + */ > +void fsnotify_mark_destroy_list(void) > { > struct fsnotify_mark *mark, *next; > struct list_head private_destroy_list; > @@ -516,3 +555,8 @@ static void fsnotify_mark_destroy(struct work_struct *work) > fsnotify_put_mark(mark); > } > } > + > +static void fsnotify_mark_destroy_workfn(struct work_struct *work) > +{ > + fsnotify_mark_destroy_list(); > +} > diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h > index 1259e53d9296..29f917517299 100644 > --- a/include/linux/fsnotify_backend.h > +++ b/include/linux/fsnotify_backend.h > @@ -359,8 +359,6 @@ extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) > extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group); > /* run all the marks in a group, and clear all of the marks where mark->flags & flags is true*/ > extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags); > -/* run all the marks in a group, and flag them to be freed */ > -extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group); > extern void fsnotify_get_mark(struct fsnotify_mark *mark); > extern void fsnotify_put_mark(struct fsnotify_mark *mark); > extern void fsnotify_unmount_inodes(struct super_block *sb); -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html