Re: [PATCH] fsnotify: Avoid spurious EMFILE errors from inotify_init()

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, 2016-04-27 at 09:56 +0200, Jan Kara wrote:
> Inotify instance is destroyed when all references to it are dropped.
> That not only means that the corresponding file descriptor needs to be
> closed but also that all corresponding instance marks are freed (as each
> mark holds a reference to the inotify instance). However marks are freed
> only after SRCU period ends which can take some time and thus if
> user rapidly creates and frees inotify instances, number of existing
> inotify instances can exceed max_user_instances limit although from user
> point of view there is always at most one existing instance. Thus
> inotify_init() returns EMFILE error which is hard to justify from user
> point of view. This problem is exposed by LTP inotify06 testcase on some
> machines.
> 
> We fix the problem by making sure all group marks are properly freed
> while destroying inotify instance. We wait for SRCU period to end in
> that path anyway since we have to make sure there is no event being
> added to the instance while we are tearing down the instance. So it
> takes only some plumbing to allow for marks to be destroyed in that path
> as well and not from a dedicated work item.
> 
> Reported-and-tested-by: Xiaoguang Wang <wangxg.fnst@xxxxxxxxxxxxxx>
> Signed-off-by: Jan Kara <jack@xxxxxxx>

Acked-by: Jeff Layton <jlayton@xxxxxxxxxxxxxxx>

> ---
>  fs/notify/fsnotify.h             |  5 +++
>  fs/notify/group.c                | 17 ++++++---
>  fs/notify/mark.c                 | 78 +++++++++++++++++++++++++++++++---------
>  include/linux/fsnotify_backend.h |  2 --
>  4 files changed, 79 insertions(+), 23 deletions(-)
> 
> Andrew, can you please merge this patch? Thanks!
> 
> diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
> index b44c68a857e7..72fb4f33982f 100644
> --- a/fs/notify/fsnotify.h
> +++ b/fs/notify/fsnotify.h
> @@ -56,6 +56,11 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
>  	fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks,
>  			       &mnt->mnt_root->d_lock);
>  }
> +/* prepare for freeing all marks associated with given group */
> +extern void fsnotify_detach_group_marks(struct fsnotify_group *group);
> +/* wait for fsnotify_mark_srcu period to end and free all marks in destroy_list */
> +extern void fsnotify_mark_destroy_list(void);
> +
>  /*
>   * update the dentry->d_flags of all of inode's children to indicate if inode cares
>   * about events that happen to its children.
> diff --git a/fs/notify/group.c b/fs/notify/group.c
> index d16b62cb2854..3e2dd85be5dd 100644
> --- a/fs/notify/group.c
> +++ b/fs/notify/group.c
> @@ -47,12 +47,21 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
>   */
>  void fsnotify_destroy_group(struct fsnotify_group *group)
>  {
> -	/* clear all inode marks for this group */
> -	fsnotify_clear_marks_by_group(group);
> +	/* clear all inode marks for this group, attach them to destroy_list */
> +	fsnotify_detach_group_marks(group);
>  
> -	synchronize_srcu(&fsnotify_mark_srcu);
> +	/*
> +	 * Wait for fsnotify_mark_srcu period to end and free all marks in
> +	 * destroy_list
> +	 */
> +	fsnotify_mark_destroy_list();
>  
> -	/* clear the notification queue of all events */
> +	/*
> +	 * Since we have waited for fsnotify_mark_srcu in
> +	 * fsnotify_mark_destroy_list() there can be no outstanding event
> +	 * notification against this group. So clearing the notification queue
> +	 * of all events is reliable now.
> +	 */
>  	fsnotify_flush_notify(group);
>  
>  	/*
> diff --git a/fs/notify/mark.c b/fs/notify/mark.c
> index 7115c5d7d373..d3fea0bd89e2 100644
> --- a/fs/notify/mark.c
> +++ b/fs/notify/mark.c
> @@ -97,8 +97,8 @@ struct srcu_struct fsnotify_mark_srcu;
>  static DEFINE_SPINLOCK(destroy_lock);
>  static LIST_HEAD(destroy_list);
>  
> -static void fsnotify_mark_destroy(struct work_struct *work);
> -static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy);
> +static void fsnotify_mark_destroy_workfn(struct work_struct *work);
> +static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn);
>  
>  void fsnotify_get_mark(struct fsnotify_mark *mark)
>  {
> @@ -173,11 +173,15 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
>  }
>  
>  /*
> - * Free fsnotify mark. The freeing is actually happening from a kthread which
> - * first waits for srcu period end. Caller must have a reference to the mark
> - * or be protected by fsnotify_mark_srcu.
> + * Prepare mark for freeing and add it to the list of marks prepared for
> + * freeing. The actual freeing must happen after SRCU period ends and the
> + * caller is responsible for this.
> + *
> + * The function returns true if the mark was added to the list of marks for
> + * freeing. The function returns false if someone else has already called
> + * __fsnotify_free_mark() for the mark.
>   */
> -void fsnotify_free_mark(struct fsnotify_mark *mark)
> +static bool __fsnotify_free_mark(struct fsnotify_mark *mark)
>  {
>  	struct fsnotify_group *group = mark->group;
>  
> @@ -185,17 +189,11 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
>  	/* something else already called this function on this mark */
>  	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
>  		spin_unlock(&mark->lock);
> -		return;
> +		return false;
>  	}
>  	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
>  	spin_unlock(&mark->lock);
>  
> -	spin_lock(&destroy_lock);
> -	list_add(&mark->g_list, &destroy_list);
> -	spin_unlock(&destroy_lock);
> -	queue_delayed_work(system_unbound_wq, &reaper_work,
> -				FSNOTIFY_REAPER_DELAY);
> -
>  	/*
>  	 * Some groups like to know that marks are being freed.  This is a
>  	 * callback to the group function to let it know that this mark
> @@ -203,6 +201,25 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
>  	 */
>  	if (group->ops->freeing_mark)
>  		group->ops->freeing_mark(mark, group);
> +
> +	spin_lock(&destroy_lock);
> +	list_add(&mark->g_list, &destroy_list);
> +	spin_unlock(&destroy_lock);
> +
> +	return true;
> +}
> +
> +/*
> + * Free fsnotify mark. The freeing is actually happening from a workqueue which
> + * first waits for srcu period end. Caller must have a reference to the mark
> + * or be protected by fsnotify_mark_srcu.
> + */
> +void fsnotify_free_mark(struct fsnotify_mark *mark)
> +{
> +	if (__fsnotify_free_mark(mark)) {
> +		queue_delayed_work(system_unbound_wq, &reaper_work,
> +				   FSNOTIFY_REAPER_DELAY);
> +	}
>  }
>  
>  void fsnotify_destroy_mark(struct fsnotify_mark *mark,
> @@ -468,11 +485,29 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
>  }
>  
>  /*
> - * Given a group, destroy all of the marks associated with that group.
> + * Given a group, prepare for freeing all the marks associated with that group.
> + * The marks are attached to the list of marks prepared for destruction, the
> + * caller is responsible for freeing marks in that list after SRCU period has
> + * ended.
>   */
> -void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
> +void fsnotify_detach_group_marks(struct fsnotify_group *group)
>  {
> -	fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1);
> +	struct fsnotify_mark *mark;
> +
> +	while (1) {
> +		mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
> +		if (list_empty(&group->marks_list)) {
> +			mutex_unlock(&group->mark_mutex);
> +			break;
> +		}
> +		mark = list_first_entry(&group->marks_list,
> +					struct fsnotify_mark, g_list);
> +		fsnotify_get_mark(mark);
> +		fsnotify_detach_mark(mark);
> +		mutex_unlock(&group->mark_mutex);
> +		__fsnotify_free_mark(mark);
> +		fsnotify_put_mark(mark);
> +	}
>  }
>  
>  void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
> @@ -499,7 +534,11 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
>  	mark->free_mark = free_mark;
>  }
>  
> -static void fsnotify_mark_destroy(struct work_struct *work)
> +/*
> + * Destroy all marks in destroy_list, waits for SRCU period to finish before
> + * actually freeing marks.
> + */
> +void fsnotify_mark_destroy_list(void)
>  {
>  	struct fsnotify_mark *mark, *next;
>  	struct list_head private_destroy_list;
> @@ -516,3 +555,8 @@ static void fsnotify_mark_destroy(struct work_struct *work)
>  		fsnotify_put_mark(mark);
>  	}
>  }
> +
> +static void fsnotify_mark_destroy_workfn(struct work_struct *work)
> +{
> +	fsnotify_mark_destroy_list();
> +}
> diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
> index 1259e53d9296..29f917517299 100644
> --- a/include/linux/fsnotify_backend.h
> +++ b/include/linux/fsnotify_backend.h
> @@ -359,8 +359,6 @@ extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
>  extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group);
>  /* run all the marks in a group, and clear all of the marks where mark->flags & flags is true*/
>  extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags);
> -/* run all the marks in a group, and flag them to be freed */
> -extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
>  extern void fsnotify_get_mark(struct fsnotify_mark *mark);
>  extern void fsnotify_put_mark(struct fsnotify_mark *mark);
>  extern void fsnotify_unmount_inodes(struct super_block *sb);
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux