Re: [PATCH 06/22] fsnotify: Attach marks to object via dedicated head structure

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Jan 6, 2017 at 12:43 PM, Jan Kara <jack@xxxxxxx> wrote:
> Currently notification marks are attached to object (inode or vfsmnt) by
> a hlist_head in the object. The list is also protected by a spinlock in
> the object. So while there is any mark attached to the list of marks,
> the object must be pinned in memory (and thus e.g. last iput() deleting
> inode cannot happen). Also for list iteration in fsnotify() to work, we
> must hold fsnotify_mark_srcu lock so that mark itself and
> mark->obj_list.next cannot get freed. Thus we are required to wait for
> response to fanotify events from userspace process with
> fsnotify_mark_srcu lock held. That causes issues when userspace process
> is buggy and does not reply to some event - basically the whole
> notification subsystem gets eventually stuck.
>
> So to be able to drop fsnotify_mark_srcu lock while waiting for
> response, we have to pin the mark in memory and make sure it stays in
> the object list (as removing the mark waiting for response could lead to
> lost notification events for groups later in the list). However we don't
> want inode reclaim to block on such mark as that would lead to system
> just locking up elsewhere.
>
> This commit tries to pave a way towards solving these conflicting
> lifetime needs. Instead of anchoring the list of marks directly in the
> object, we anchor it in a dedicated structure (fsnotify_mark_connector) and
> just point to that structure from the object. Also the list is protected
> by a spinlock contained in that structure. With this, we can detach
> notification marks from object without having to modify the list itself.
>
> Signed-off-by: Jan Kara <jack@xxxxxxx>
> ---

Looks good, except for some confusing semantics of grab/put
and question about rcu_assign_pointer()

...
> diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
> index b41515d3f081..d512ef9f75fc 100644
> --- a/fs/notify/fsnotify.c
> +++ b/fs/notify/fsnotify.c
> @@ -193,6 +193,7 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
>         struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
>         struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
>         struct fsnotify_group *inode_group, *vfsmount_group;
> +       struct fsnotify_mark_connector *inode_conn, *vfsmount_conn;
>         struct mount *mnt;
>         int idx, ret = 0;
>         /* global tests shouldn't care about events on child only the specific event */
> @@ -210,8 +211,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
>          * SRCU because we have no references to any objects and do not
>          * need SRCU to keep them "alive".
>          */
> -       if (hlist_empty(&to_tell->i_fsnotify_marks) &&
> -           (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks)))
> +       if (!to_tell->i_fsnotify_marks &&
> +           (!mnt || !mnt->mnt_fsnotify_marks))
>                 return 0;
>         /*
>          * if this is a modify event we may need to clear the ignored masks
> @@ -226,16 +227,27 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
>         idx = srcu_read_lock(&fsnotify_mark_srcu);
>
>         if ((mask & FS_MODIFY) ||
> -           (test_mask & to_tell->i_fsnotify_mask))
> -               inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
> +           (test_mask & to_tell->i_fsnotify_mask)) {
> +               inode_conn = srcu_dereference(to_tell->i_fsnotify_marks,
>                                               &fsnotify_mark_srcu);
> +               if (inode_conn)
> +                       inode_node = srcu_dereference(inode_conn->list.first,
> +                                                     &fsnotify_mark_srcu);
> +       }
>
>         if (mnt && ((mask & FS_MODIFY) ||
>                     (test_mask & mnt->mnt_fsnotify_mask))) {
> -               vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
> -                                                &fsnotify_mark_srcu);
> -               inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
> +               inode_conn = srcu_dereference(to_tell->i_fsnotify_marks,
>                                               &fsnotify_mark_srcu);
> +               if (inode_conn)
> +                       inode_node = srcu_dereference(inode_conn->list.first,
> +                                                     &fsnotify_mark_srcu);
> +               vfsmount_conn = srcu_dereference(mnt->mnt_fsnotify_marks,
> +                                                &fsnotify_mark_srcu);
> +               if (vfsmount_conn)
> +                       vfsmount_node = srcu_dereference(
> +                                               vfsmount_conn->list.first,
> +                                               &fsnotify_mark_srcu);
>         }
>
...
> +
> +static struct inode *fsnotify_detach_from_object(struct fsnotify_mark *mark)
> +{
> +       struct fsnotify_mark_connector *conn;
> +       struct inode *inode = NULL;
> +       bool free_conn = false;
> +
> +       conn = mark->connector;
> +       spin_lock(&conn->lock);
> +       hlist_del_init_rcu(&mark->obj_list);
> +       if (hlist_empty(&conn->list)) {
> +               if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) {
> +                       inode = conn->inode;
> +                       inode->i_fsnotify_marks = NULL;

Shouldn't that assignment be done using rcu_assign_pointer() to
match srcu_dereference(to_tell->i_fsnotify_marks, &fsnotify_mark_srcu); ?


> +                       inode->i_fsnotify_mask = 0;
> +                       conn->inode = NULL;
> +                       conn->flags &= ~FSNOTIFY_OBJ_TYPE_INODE;
> +               } else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
> +                       real_mount(conn->mnt)->mnt_fsnotify_marks = NULL;

Same here

> +                       real_mount(conn->mnt)->mnt_fsnotify_mask = 0;
> +                       conn->mnt = NULL;
> +                       conn->flags &= ~FSNOTIFY_OBJ_TYPE_VFSMOUNT;
> +               }
> +               free_conn = true;
> +       } else
> +               __fsnotify_recalc_mask(conn);
> +       mark->connector = NULL;
> +       spin_unlock(&conn->lock);
> +
> +       if (free_conn) {
> +               spin_lock(&destroy_lock);
> +               conn->destroy_next = connector_destroy_list;
> +               connector_destroy_list = conn;
> +               spin_unlock(&destroy_lock);
> +               queue_work(system_unbound_wq, &connector_reaper_work);
> +       }
> +
> +       return inode;
>  }
>
>  /*
> @@ -137,13 +227,8 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
>
>         mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;
>
> -       if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
> -               inode = mark->inode;
> -               fsnotify_destroy_inode_mark(mark);
> -       } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT)
> -               fsnotify_destroy_vfsmount_mark(mark);
> -       else
> -               BUG();
> +       inode = fsnotify_detach_from_object(mark);
> +
>         /*
>          * Note that we didn't update flags telling whether inode cares about
>          * what's happening with children. We update these flags from
> @@ -155,7 +240,7 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
>
>         spin_unlock(&mark->lock);
>
> -       if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
> +       if (inode)
>                 iput(inode);
>
>         atomic_dec(&group->num_marks);
> @@ -220,45 +305,11 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark,
>         fsnotify_free_mark(mark);
>  }
>
> -void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock)
> -{
> -       struct fsnotify_mark *mark;
> -
> -       while (1) {
> -               /*
> -                * We have to be careful since we can race with e.g.
> -                * fsnotify_clear_marks_by_group() and once we drop 'lock',
> -                * mark can get removed from the obj_list and destroyed. But
> -                * we are holding mark reference so mark cannot be freed and
> -                * calling fsnotify_destroy_mark() more than once is fine.
> -                */
> -               spin_lock(lock);
> -               if (hlist_empty(head)) {
> -                       spin_unlock(lock);
> -                       break;
> -               }
> -               mark = hlist_entry(head->first, struct fsnotify_mark, obj_list);
> -               /*
> -                * We don't update i_fsnotify_mask / mnt_fsnotify_mask here
> -                * since inode / mount is going away anyway. So just remove
> -                * mark from the list.
> -                */
> -               hlist_del_init_rcu(&mark->obj_list);
> -               fsnotify_get_mark(mark);
> -               spin_unlock(lock);
> -               fsnotify_destroy_mark(mark, mark->group);
> -               fsnotify_put_mark(mark);
> -       }
> -}
> -
>  void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
>  {
>         assert_spin_locked(&mark->lock);
>
>         mark->mask = mask;
> -
> -       if (mark->flags & FSNOTIFY_MARK_FLAG_INODE)
> -               fsnotify_set_inode_mark_mask_locked(mark, mask);
>  }
>
>  void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask)
> @@ -304,37 +355,118 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
>         return -1;
>  }
>
> -/* Add mark into proper place in given list of marks */
> -int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark,
> -                          int allow_dups)
> +/*
> + * Get mark connector, make sure it is alive and return with its lock held.
> + * This is for users that get connector pointer from inode or mount. Users that
> + * hold reference to a mark on the list may directly lock connector->lock as
> + * they are sure list cannot go away under them.
> + */
> +static struct fsnotify_mark_connector *fsnotify_grab_connector(
> +                               struct fsnotify_mark_connector * __rcu *connp)
> +{
> +       struct fsnotify_mark_connector *conn;
> +       int idx;
> +
> +       idx = srcu_read_lock(&fsnotify_mark_srcu);
> +       conn = srcu_dereference(*connp, &fsnotify_mark_srcu);
> +       if (!conn)
> +               goto out;
> +       spin_lock(&conn->lock);
> +       if (!(conn->flags & (FSNOTIFY_OBJ_TYPE_INODE |
> +                            FSNOTIFY_OBJ_TYPE_VFSMOUNT))) {
> +               spin_unlock(&conn->lock);
> +               srcu_read_unlock(&fsnotify_mark_srcu, idx);
> +               return NULL;
> +       }
> +out:
> +       srcu_read_unlock(&fsnotify_mark_srcu, idx);
> +       return conn;
> +}
> +
> +static void fsnotify_put_connector(struct fsnotify_mark_connector *conn)
> +{
> +       spin_unlock(&conn->lock);
> +}
> +

Is there a precedent in the kernel for using grab()/put() semantics for
what is essentially aquire()/release() for exclusive object access?

The counterpart for grab_cache_page(), for example, is
unlock_page(page); put_page(page); (and not just put_page()).

igrab() does not return with i_lock held, so igrab()/iput() sematics
do not apply to this case.

I find a function that is called put_connector() and actually does unlock
to be confusing and if that function is a one liner helper, I prefer that
it did not exist at all and call site should use spin_unlock(&conn->lock);
explicitly instead of hiding it.


> +/*
> + * Add mark into proper place in given list of marks. These marks may be used
> + * for the fsnotify backend to determine which event types should be delivered
> + * to which group and for which inodes. These marks are ordered according to
> + * priority, highest number first, and then by the group's location in memory.
> + */
> +static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
> +                                 struct inode *inode, struct vfsmount *mnt,
> +                                 int allow_dups)
>  {
>         struct fsnotify_mark *lmark, *last = NULL;
> +       struct fsnotify_mark_connector *conn;
> +       struct fsnotify_mark_connector **connp;
>         int cmp;
> +       int err = 0;
> +
> +       BUG_ON(!inode && !mnt);
> +       if (inode)
> +               connp = &inode->i_fsnotify_marks;
> +       else
> +               connp = &real_mount(mnt)->mnt_fsnotify_marks;
> +restart:
> +       spin_lock(&mark->lock);
> +       conn = fsnotify_grab_connector(connp);
> +       if (!conn) {
> +               spin_unlock(&mark->lock);
> +               conn = kmem_cache_alloc(fsnotify_mark_connector_cachep,
> +                                       GFP_KERNEL);
> +               if (!conn)
> +                       return -ENOMEM;
> +               spin_lock_init(&conn->lock);
> +               INIT_HLIST_HEAD(&conn->list);
> +               if (inode) {
> +                       conn->flags = FSNOTIFY_OBJ_TYPE_INODE;
> +                       conn->inode = igrab(inode);
> +               } else {
> +                       conn->flags = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
> +                       conn->mnt = mnt;
> +               }
> +               if (cmpxchg(connp, NULL, conn)) {

Same question as in fsnotify_detach_from_object()
Is there a need for any smb_wmb() here to match
srcu_dereference() in fsnotify(), or is there no need because
cmpxchg from NULL to a live object is always safe for the reader?


> +                       /* Someone else created list structure for us */
> +                       if (inode)
> +                               iput(inode);
> +                       kmem_cache_free(fsnotify_mark_connector_cachep, conn);
> +               }
> +               goto restart;
> +       }
>
>         /* is mark the first mark? */
> -       if (hlist_empty(head)) {
> -               hlist_add_head_rcu(&mark->obj_list, head);
> -               return 0;
> +       if (hlist_empty(&conn->list)) {
> +               hlist_add_head_rcu(&mark->obj_list, &conn->list);
> +               goto added;
>         }
>
>         /* should mark be in the middle of the current list? */
> -       hlist_for_each_entry(lmark, head, obj_list) {
> +       hlist_for_each_entry(lmark, &conn->list, obj_list) {
>                 last = lmark;
>
> -               if ((lmark->group == mark->group) && !allow_dups)
> -                       return -EEXIST;
> +               if ((lmark->group == mark->group) && !allow_dups) {
> +                       err = -EEXIST;
> +                       goto out_err;
> +               }
>
>                 cmp = fsnotify_compare_groups(lmark->group, mark->group);
>                 if (cmp >= 0) {
>                         hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list);
> -                       return 0;
> +                       goto added;
>                 }
>         }
>
>         BUG_ON(last == NULL);
>         /* mark should be the last entry.  last is the current last entry */
>         hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
> -       return 0;
> +added:
> +       mark->connector = conn;
> +out_err:
> +       fsnotify_put_connector(conn);
> +       spin_unlock(&mark->lock);
> +       return err;
>  }
>
>  /*
> @@ -356,7 +488,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
>          * LOCKING ORDER!!!!
>          * group->mark_mutex
>          * mark->lock
> -        * inode->i_lock
> +        * mark->connector->lock
>          */
>         spin_lock(&mark->lock);
>         mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED;
> @@ -366,25 +498,14 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
>         list_add(&mark->g_list, &group->marks_list);
>         atomic_inc(&group->num_marks);
>         fsnotify_get_mark(mark); /* for i_list and g_list */
> -
> -       if (inode) {
> -               ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups);
> -               if (ret)
> -                       goto err;
> -       } else if (mnt) {
> -               ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups);
> -               if (ret)
> -                       goto err;
> -       } else {
> -               BUG();
> -       }
> -
> -       /* this will pin the object if appropriate */
> -       fsnotify_set_mark_mask_locked(mark, mark->mask);
>         spin_unlock(&mark->lock);
>
> -       if (inode)
> -               __fsnotify_update_child_dentry_flags(inode);
> +       ret = fsnotify_add_mark_list(mark, inode, mnt, allow_dups);
> +       if (ret)
> +               goto err;
> +
> +       if (mark->mask)
> +               fsnotify_recalc_mask(mark->connector);
>
>         return ret;
>  err:
> @@ -419,17 +540,23 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
>   * Given a list of marks, find the mark associated with given group. If found
>   * take a reference to that mark and return it, else return NULL.
>   */
> -struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
> +struct fsnotify_mark *fsnotify_find_mark(struct fsnotify_mark_connector **connp,
>                                          struct fsnotify_group *group)
>  {
>         struct fsnotify_mark *mark;
> +       struct fsnotify_mark_connector *conn;
>
> -       hlist_for_each_entry(mark, head, obj_list) {
> +       conn = fsnotify_grab_connector(connp);
> +       if (!conn)
> +               return NULL;
> +       hlist_for_each_entry(mark, &conn->list, obj_list) {
>                 if (mark->group == group) {
>                         fsnotify_get_mark(mark);
> +                       fsnotify_put_connector(conn);
>                         return mark;
>                 }
>         }
> +       fsnotify_put_connector(conn);
>         return NULL;
>  }
>
> @@ -453,7 +580,7 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
>          */
>         mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
>         list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
> -               if (mark->flags & flags)
> +               if (mark->connector->flags & flags)
>                         list_move(&mark->g_list, &to_free);
>         }
>         mutex_unlock(&group->mark_mutex);
> @@ -499,6 +626,29 @@ void fsnotify_detach_group_marks(struct fsnotify_group *group)
>         }
>  }
>
> +/* Destroy all marks attached to inode / vfsmount */
> +void fsnotify_destroy_marks(struct fsnotify_mark_connector **connp)
> +{
> +       struct fsnotify_mark_connector *conn;
> +       struct fsnotify_mark *mark;
> +
> +       while ((conn = fsnotify_grab_connector(connp))) {
> +               /*
> +                * We have to be careful since we can race with e.g.
> +                * fsnotify_clear_marks_by_group() and once we drop the list
> +                * lock, mark can get removed from the obj_list and destroyed.
> +                * But we are holding mark reference so mark cannot be freed
> +                * and calling fsnotify_destroy_mark() more than once is fine.
> +                */
> +               mark = hlist_entry(conn->list.first, struct fsnotify_mark,
> +                                  obj_list);
> +               fsnotify_get_mark(mark);
> +               fsnotify_put_connector(conn);
> +               fsnotify_destroy_mark(mark, mark->group);
> +               fsnotify_put_mark(mark);
> +       }
> +}
> +
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux