Re: [PATCH 06/25] fsnotify: Attach marks to object via dedicated head structure

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Feb 1, 2017 at 11:44 AM, Jan Kara <jack@xxxxxxx> wrote:
> Currently notification marks are attached to object (inode or vfsmnt) by
> a hlist_head in the object. The list is also protected by a spinlock in
> the object. So while there is any mark attached to the list of marks,
> the object must be pinned in memory (and thus e.g. last iput() deleting
> inode cannot happen). Also for list iteration in fsnotify() to work, we
> must hold fsnotify_mark_srcu lock so that mark itself and
> mark->obj_list.next cannot get freed. Thus we are required to wait for
> response to fanotify events from userspace process with
> fsnotify_mark_srcu lock held. That causes issues when userspace process
> is buggy and does not reply to some event - basically the whole
> notification subsystem gets eventually stuck.
>
> So to be able to drop fsnotify_mark_srcu lock while waiting for
> response, we have to pin the mark in memory and make sure it stays in
> the object list (as removing the mark waiting for response could lead to
> lost notification events for groups later in the list). However we don't
> want inode reclaim to block on such mark as that would lead to system
> just locking up elsewhere.
>
> This commit tries to pave a way towards solving these conflicting
> lifetime needs. Instead of anchoring the list of marks directly in the
> object, we anchor it in a dedicated structure (fsnotify_mark_connector)
> and just point to that structure from the object. In the following patch
> we will also protect the list by a spinlock contained in that structure.
> With this, we will be able to detach notification marks from object
> without having to modify the list itself.

This is still a bit big to review easily.  I'd suggest further splitup into:

- move hlist_head from inode/mnt into connector
- move inode/mnt ptr from mark into connector (no refcounting changes)
- inode refcounting cleanup

More comments inline.

>
> Signed-off-by: Jan Kara <jack@xxxxxxx>
> ---
>  fs/inode.c                       |   3 -
>  fs/mount.h                       |   2 +-
>  fs/namespace.c                   |   3 -
>  fs/notify/dnotify/dnotify.c      |   4 +-
>  fs/notify/fdinfo.c               |  12 +--
>  fs/notify/fsnotify.c             |  32 +++++---
>  fs/notify/fsnotify.h             |  17 +++--
>  fs/notify/inode_mark.c           |  74 ++++--------------
>  fs/notify/mark.c                 | 157 +++++++++++++++++++++++++++++----------
>  fs/notify/vfsmount_mark.c        |  41 +++-------
>  include/linux/fs.h               |   4 +-
>  include/linux/fsnotify_backend.h |  38 +++++++---
>  kernel/audit_tree.c              |  30 ++++++--
>  kernel/auditsc.c                 |   4 +-
>  14 files changed, 236 insertions(+), 185 deletions(-)
>
> diff --git a/fs/inode.c b/fs/inode.c
> index 88110fd0b282..131b2bcebc48 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -371,9 +371,6 @@ void inode_init_once(struct inode *inode)
>         INIT_LIST_HEAD(&inode->i_lru);
>         address_space_init_once(&inode->i_data);
>         i_size_ordered_init(inode);
> -#ifdef CONFIG_FSNOTIFY
> -       INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
> -#endif
>  }
>  EXPORT_SYMBOL(inode_init_once);
>
> diff --git a/fs/mount.h b/fs/mount.h
> index 2c856fc47ae3..c92c78d135dc 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -59,7 +59,7 @@ struct mount {
>         struct mountpoint *mnt_mp;      /* where is it mounted */
>         struct hlist_node mnt_mp_list;  /* list mounts with the same mountpoint */
>  #ifdef CONFIG_FSNOTIFY
> -       struct hlist_head mnt_fsnotify_marks;
> +       struct fsnotify_mark_connector *mnt_fsnotify_marks;
>         __u32 mnt_fsnotify_mask;
>  #endif
>         int mnt_id;                     /* mount identifier */
> diff --git a/fs/namespace.c b/fs/namespace.c
> index b5b1259e064f..22a91352e2d7 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -233,9 +233,6 @@ static struct mount *alloc_vfsmnt(const char *name)
>                 INIT_LIST_HEAD(&mnt->mnt_slave_list);
>                 INIT_LIST_HEAD(&mnt->mnt_slave);
>                 INIT_HLIST_NODE(&mnt->mnt_mp_list);
> -#ifdef CONFIG_FSNOTIFY
> -               INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
> -#endif
>                 init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
>         }
>         return mnt;
> diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
> index 5a4ec309e283..3dd2e0ece262 100644
> --- a/fs/notify/dnotify/dnotify.c
> +++ b/fs/notify/dnotify/dnotify.c
> @@ -69,8 +69,8 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
>         if (old_mask == new_mask)
>                 return;
>
> -       if (fsn_mark->inode)
> -               fsnotify_recalc_inode_mask(fsn_mark->inode);
> +       if (fsn_mark->connector->inode

This doesn't make sense in the context of this patch.  Now
mark->connector is set to NULL where previously mark->inode was set to
null.   So the right condition would be "if (fsn_mark->connector)
...".

)
> +               fsnotify_recalc_inode_mask(fsn_mark->connector->inode);
>  }
>
>  /*
> diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
> index 601a59c8d87e..dd63aa9a6f9a 100644
> --- a/fs/notify/fdinfo.c
> +++ b/fs/notify/fdinfo.c
> @@ -76,11 +76,11 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
>         struct inotify_inode_mark *inode_mark;
>         struct inode *inode;
>
> -       if (!(mark->flags & FSNOTIFY_MARK_FLAG_INODE))
> +       if (!(mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE))
>                 return;
>
>         inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
> -       inode = igrab(mark->inode);
> +       inode = igrab(mark->connector->inode);
>         if (inode) {
>                 /*
>                  * IN_ALL_EVENTS represents all of the mask bits
> @@ -115,8 +115,8 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
>         if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
>                 mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
>
> -       if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
> -               inode = igrab(mark->inode);
> +       if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE) {
> +               inode = igrab(mark->connector->inode);
>                 if (!inode)
>                         return;
>                 seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ",
> @@ -125,8 +125,8 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
>                 show_mark_fhandle(m, inode);
>                 seq_putc(m, '\n');
>                 iput(inode);
> -       } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) {
> -               struct mount *mnt = real_mount(mark->mnt);
> +       } else if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
> +               struct mount *mnt = real_mount(mark->connector->mnt);
>
>                 seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n",
>                            mnt->mnt_id, mflags, mark->mask, mark->ignored_mask);
> diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
> index b41515d3f081..421c7431a16e 100644
> --- a/fs/notify/fsnotify.c
> +++ b/fs/notify/fsnotify.c
> @@ -193,6 +193,7 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
>         struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
>         struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
>         struct fsnotify_group *inode_group, *vfsmount_group;
> +       struct fsnotify_mark_connector *inode_conn, *vfsmount_conn;
>         struct mount *mnt;
>         int idx, ret = 0;
>         /* global tests shouldn't care about events on child only the specific event */
> @@ -210,8 +211,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
>          * SRCU because we have no references to any objects and do not
>          * need SRCU to keep them "alive".
>          */
> -       if (hlist_empty(&to_tell->i_fsnotify_marks) &&
> -           (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks)))
> +       if (!to_tell->i_fsnotify_marks &&
> +           (!mnt || !mnt->mnt_fsnotify_marks))

The checks against hlist_empty() could still be useful to optimize
away the memory barrier imposed by srcu_read_lock() in case there were
marks on the object but not anymore (i.e. the connector is non-NULL,
but the hlist is empty).

>                 return 0;
>         /*
>          * if this is a modify event we may need to clear the ignored masks
> @@ -226,16 +227,24 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
>         idx = srcu_read_lock(&fsnotify_mark_srcu);
>
>         if ((mask & FS_MODIFY) ||
> -           (test_mask & to_tell->i_fsnotify_mask))
> -               inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
> -                                             &fsnotify_mark_srcu);
> +           (test_mask & to_tell->i_fsnotify_mask)) {
> +               inode_conn = to_tell->i_fsnotify_marks;

Needs lockless_dereference().

> +               if (inode_conn)
> +                       inode_node = srcu_dereference(inode_conn->list.first,
> +                                                     &fsnotify_mark_srcu);
> +       }
>
>         if (mnt && ((mask & FS_MODIFY) ||
>                     (test_mask & mnt->mnt_fsnotify_mask))) {
> -               vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
> -                                                &fsnotify_mark_srcu);
> -               inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
> -                                             &fsnotify_mark_srcu);
> +               inode_conn = to_tell->i_fsnotify_marks;
> +               if (inode_conn)
> +                       inode_node = srcu_dereference(inode_conn->list.first,
> +                                                     &fsnotify_mark_srcu);
> +               vfsmount_conn = mnt->mnt_fsnotify_marks;

And this one too.

> +               if (vfsmount_conn)
> +                       vfsmount_node = srcu_dereference(
> +                                               vfsmount_conn->list.first,
> +                                               &fsnotify_mark_srcu);
>         }
>
>         /*
> @@ -293,6 +302,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
>  }
>  EXPORT_SYMBOL_GPL(fsnotify);
>
> +extern struct kmem_cache *fsnotify_mark_connector_cachep;
> +
>  static __init int fsnotify_init(void)
>  {
>         int ret;
> @@ -303,6 +314,9 @@ static __init int fsnotify_init(void)
>         if (ret)
>                 panic("initializing fsnotify_mark_srcu");
>
> +       fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector,
> +                                                   SLAB_PANIC);
> +
>         return 0;
>  }
>  core_initcall(fsnotify_init);
> diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
> index 0a3bc2cf192c..7a95436dc8d3 100644
> --- a/fs/notify/fsnotify.h
> +++ b/fs/notify/fsnotify.h
> @@ -8,6 +8,8 @@
>
>  #include "../mount.h"
>
> +struct fsnotify_mark_connector;
> +
>  /* destroy all events sitting in this groups notification queue */
>  extern void fsnotify_flush_notify(struct fsnotify_group *group);
>
> @@ -15,7 +17,7 @@ extern void fsnotify_flush_notify(struct fsnotify_group *group);
>  extern struct srcu_struct fsnotify_mark_srcu;
>
>  /* Calculate mask of events for a list of marks */
> -extern u32 fsnotify_recalc_mask(struct hlist_head *head);
> +extern u32 fsnotify_recalc_mask(struct fsnotify_mark_connector *conn);
>
>  /* compare two groups for sorting of marks lists */
>  extern int fsnotify_compare_groups(struct fsnotify_group *a,
> @@ -23,10 +25,6 @@ extern int fsnotify_compare_groups(struct fsnotify_group *a,
>
>  extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark,
>                                                 __u32 mask);
> -/* Add mark to a proper place in mark list */
> -extern int fsnotify_add_mark_list(struct hlist_head *head,
> -                                 struct fsnotify_mark *mark,
> -                                 int allow_dups);
>  /* add a mark to an inode */
>  extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
>                                    struct fsnotify_group *group, struct inode *inode,
> @@ -39,20 +37,25 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
>  /* vfsmount specific destruction of a mark */
>  extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
>  /* inode specific destruction of a mark */
> -extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
> +extern struct inode *fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
>  /* Find mark belonging to given group in the list of marks */
>  extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
>                                                 struct fsnotify_group *group);
>  /* Destroy all marks in the given list protected by 'lock' */
> -extern void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock);
> +extern void fsnotify_destroy_marks(struct fsnotify_mark_connector **connp,
> +                                  spinlock_t *lock);
>  /* run the list of all marks associated with inode and destroy them */
>  static inline void fsnotify_clear_marks_by_inode(struct inode *inode)
>  {
> +       if (!inode->i_fsnotify_marks)
> +               return;

Should move the above check inside fsnotify_destroy_marks().  Locking
rules also dictate that the check be done under inode lock, although
that may not matter in practice.

>         fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock);
>  }
>  /* run the list of all marks associated with vfsmount and destroy them */
>  static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
>  {
> +       if (!real_mount(mnt)->mnt_fsnotify_marks)
> +               return;

And the same here.

>         fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks,
>                                &mnt->mnt_root->d_lock);
>  }
> diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
> index a3645249f7ec..564641becb39 100644
> --- a/fs/notify/inode_mark.c
> +++ b/fs/notify/inode_mark.c
> @@ -37,15 +37,16 @@
>  void fsnotify_recalc_inode_mask(struct inode *inode)
>  {
>         spin_lock(&inode->i_lock);
> -       inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
> +       inode->i_fsnotify_mask = fsnotify_recalc_mask(inode->i_fsnotify_marks);
>         spin_unlock(&inode->i_lock);
>
>         __fsnotify_update_child_dentry_flags(inode);
>  }
>
> -void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
> +struct inode *fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
>  {
> -       struct inode *inode = mark->inode;
> +       struct inode *inode = mark->connector->inode;
> +       bool empty;
>
>         BUG_ON(!mutex_is_locked(&mark->group->mark_mutex));
>         assert_spin_locked(&mark->lock);
> @@ -53,15 +54,18 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
>         spin_lock(&inode->i_lock);
>
>         hlist_del_init_rcu(&mark->obj_list);
> -       mark->inode = NULL;
> +       empty = hlist_empty(&mark->connector->list);
> +       mark->connector = NULL;
>
>         /*
>          * this mark is now off the inode->i_fsnotify_marks list and we
>          * hold the inode->i_lock, so this is the perfect time to update the
>          * inode->i_fsnotify_mask
>          */
> -       inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
> +       inode->i_fsnotify_mask = fsnotify_recalc_mask(inode->i_fsnotify_marks);
>         spin_unlock(&inode->i_lock);
> +
> +       return empty ? inode : NULL;
>  }
>
>  /*
> @@ -69,7 +73,7 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
>   */
>  void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
>  {
> -       fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_INODE);
> +       fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_OBJ_TYPE_INODE);
>  }
>
>  /*
> @@ -81,64 +85,14 @@ struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group,
>  {
>         struct fsnotify_mark *mark;
>
> -       spin_lock(&inode->i_lock);
> -       mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
> -       spin_unlock(&inode->i_lock);
> -
> -       return mark;
> -}
> -
> -/*
> - * If we are setting a mark mask on an inode mark we should pin the inode
> - * in memory.
> - */
> -void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark,
> -                                        __u32 mask)
> -{
> -       struct inode *inode;
> -
> -       assert_spin_locked(&mark->lock);
> -
> -       if (mask &&
> -           mark->inode &&
> -           !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) {
> -               mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED;
> -               inode = igrab(mark->inode);
> -               /*
> -                * we shouldn't be able to get here if the inode wasn't
> -                * already safely held in memory.  But bug in case it
> -                * ever is wrong.
> -                */
> -               BUG_ON(!inode);
> -       }
> -}

I think the above warrants a separate patch.  This patch subtly
changes the rules of grabbing the inode reference and it's not
explained why this is okay.   Removing whole functions also confuses
the patch itself.

> -
> -/*
> - * Attach an initialized mark to a given inode.
> - * These marks may be used for the fsnotify backend to determine which
> - * event types should be delivered to which group and for which inodes.  These
> - * marks are ordered according to priority, highest number first, and then by
> - * the group's location in memory.
> - */
> -int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
> -                           struct fsnotify_group *group, struct inode *inode,
> -                           int allow_dups)
> -{
> -       int ret;
> -
> -       mark->flags |= FSNOTIFY_MARK_FLAG_INODE;
> -
> -       BUG_ON(!mutex_is_locked(&group->mark_mutex));
> -       assert_spin_locked(&mark->lock);
> +       if (!inode->i_fsnotify_marks)
> +               return NULL;

Check in fsnotify_find_mark()?

>
>         spin_lock(&inode->i_lock);
> -       mark->inode = inode;
> -       ret = fsnotify_add_mark_list(&inode->i_fsnotify_marks, mark,
> -                                    allow_dups);
> -       inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
> +       mark = fsnotify_find_mark(&inode->i_fsnotify_marks->list, group);
>         spin_unlock(&inode->i_lock);
>
> -       return ret;
> +       return mark;
>  }
>
>  /**
> diff --git a/fs/notify/mark.c b/fs/notify/mark.c
> index 44836e539169..ce2314ae96d0 100644
> --- a/fs/notify/mark.c
> +++ b/fs/notify/mark.c
> @@ -83,6 +83,8 @@
>  #define FSNOTIFY_REAPER_DELAY  (1)     /* 1 jiffy */
>
>  struct srcu_struct fsnotify_mark_srcu;
> +struct kmem_cache *fsnotify_mark_connector_cachep;
> +
>  static DEFINE_SPINLOCK(destroy_lock);
>  static LIST_HEAD(destroy_list);
>
> @@ -104,12 +106,12 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
>  }
>
>  /* Calculate mask of events for a list of marks */
> -u32 fsnotify_recalc_mask(struct hlist_head *head)
> +u32 fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
>  {
>         u32 new_mask = 0;
>         struct fsnotify_mark *mark;
>
> -       hlist_for_each_entry(mark, head, obj_list)
> +       hlist_for_each_entry(mark, &conn->list, obj_list)
>                 new_mask |= mark->mask;
>         return new_mask;
>  }
> @@ -137,10 +139,9 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
>
>         mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;
>
> -       if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
> -               inode = mark->inode;
> -               fsnotify_destroy_inode_mark(mark);
> -       } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT)
> +       if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE)
> +               inode = fsnotify_destroy_inode_mark(mark);
> +       else if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT)
>                 fsnotify_destroy_vfsmount_mark(mark);
>         else
>                 BUG();
> @@ -155,7 +156,7 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
>
>         spin_unlock(&mark->lock);
>
> -       if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
> +       if (inode)

And this should also be part of the inode refcounting change.

>                 iput(inode);
>
>         atomic_dec(&group->num_marks);
> @@ -220,7 +221,8 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark,
>         fsnotify_free_mark(mark);
>  }
>
> -void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock)
> +void fsnotify_destroy_marks(struct fsnotify_mark_connector **connp,
> +                           spinlock_t *lock)
>  {
>         struct fsnotify_mark *mark;
>
> @@ -233,11 +235,12 @@ void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock)
>                  * calling fsnotify_destroy_mark() more than once is fine.
>                  */
>                 spin_lock(lock);
> -               if (hlist_empty(head)) {
> +               if (hlist_empty(&(*connp)->list)) {
>                         spin_unlock(lock);
>                         break;
>                 }
> -               mark = hlist_entry(head->first, struct fsnotify_mark, obj_list);
> +               mark = hlist_entry((*connp)->list.first, struct fsnotify_mark,
> +                                  obj_list);
>                 /*
>                  * We don't update i_fsnotify_mask / mnt_fsnotify_mask here
>                  * since inode / mount is going away anyway. So just remove
> @@ -249,6 +252,16 @@ void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock)
>                 fsnotify_destroy_mark(mark, mark->group);
>                 fsnotify_put_mark(mark);
>         }
> +       /*
> +        * Skip freeing of connector when inode is just unlinked. We leave that
> +        * until destroy_inode() is called at which moment nobody will be
> +        * looking at the inode anymore.
> +        */
> +       if ((*connp)->flags & FSNOTIFY_OBJ_TYPE_INODE &&
> +           !((*connp)->inode->i_state & I_CLEAR))
> +               return;

This is ugly.   Why not rather add a fsnotify_connector_free() and
call that explicitly on inode/vfsmount destruction?

> +       kmem_cache_free(fsnotify_mark_connector_cachep, *connp);
> +       *connp = NULL;
>  }
>
>  void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
> @@ -256,9 +269,6 @@ void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
>         assert_spin_locked(&mark->lock);
>
>         mark->mask = mask;
> -
> -       if (mark->flags & FSNOTIFY_MARK_FLAG_INODE)
> -               fsnotify_set_inode_mark_mask_locked(mark, mask);

Also related to inode refcounting?

>  }
>
>  void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask)
> @@ -304,37 +314,111 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
>         return -1;
>  }
>
> -/* Add mark into proper place in given list of marks */
> -int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark,
> -                          int allow_dups)
> +/*
> + * Get mark connector, make sure it is alive and return with its lock held.
> + * This is for users that get connector pointer from inode or mount. Users that
> + * hold reference to a mark on the list may directly lock connector->lock as
> + * they are sure list cannot go away under them.
> + */
> +static struct fsnotify_mark_connector *fsnotify_grab_connector(
> +                               struct fsnotify_mark_connector *conn)
> +{
> +       if (!conn)
> +               return NULL;
> +       if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE)
> +               spin_lock(&conn->inode->i_lock);
> +       else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT)
> +               spin_lock(&conn->mnt->mnt_root->d_lock);
> +       else
> +               return NULL;

Can this happen?  If not, then add a WARNING().

> +       return conn;
> +}
> +
> +static void fsnotify_put_connector(struct fsnotify_mark_connector *conn)
> +{
> +       if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE)
> +               spin_unlock(&conn->inode->i_lock);
> +       else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT)
> +               spin_unlock(&conn->mnt->mnt_root->d_lock);
> +}
> +
> +/*
> + * Add mark into proper place in given list of marks. These marks may be used
> + * for the fsnotify backend to determine which event types should be delivered
> + * to which group and for which inodes. These marks are ordered according to
> + * priority, highest number first, and then by the group's location in memory.
> + */
> +static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
> +                                 struct inode *inode, struct vfsmount *mnt,
> +                                 int allow_dups)
>  {
>         struct fsnotify_mark *lmark, *last = NULL;
> +       struct fsnotify_mark_connector *conn;
> +       struct fsnotify_mark_connector **connp;
>         int cmp;
> +       int err = 0;
> +
> +       BUG_ON(!inode && !mnt);

Use WARN_ON() instead.

> +       if (inode)
> +               connp = &inode->i_fsnotify_marks;
> +       else
> +               connp = &real_mount(mnt)->mnt_fsnotify_marks;
> +restart:
> +       spin_lock(&mark->lock);
> +       conn = fsnotify_grab_connector(*connp);

This looks a bit too subtle.  We have the inode/vfsmount.  We have the
matching lock.  Could store the address of the lock in a local
variable and use that for locking.

> +       if (!conn) {
> +               spin_unlock(&mark->lock);
> +               conn = kmem_cache_alloc(fsnotify_mark_connector_cachep,
> +                                       GFP_KERNEL);
> +               if (!conn)
> +                       return -ENOMEM;
> +               INIT_HLIST_HEAD(&conn->list);
> +               if (inode) {
> +                       conn->flags = FSNOTIFY_OBJ_TYPE_INODE;
> +                       conn->inode = inode;
> +               } else {
> +                       conn->flags = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
> +                       conn->mnt = mnt;
> +               }
> +               if (cmpxchg(connp, NULL, conn)) {
> +                       /* Someone else created list structure for us */
> +                       kmem_cache_free(fsnotify_mark_connector_cachep, conn);
> +               }

Setting *connp should be done with inode/vfsmount lock already
regained (if I understand the locking rules correctly) and then no
need to have cmpxchg magic in there.

> +               goto restart;
> +       }
>
>         /* is mark the first mark? */
> -       if (hlist_empty(head)) {
> -               hlist_add_head_rcu(&mark->obj_list, head);
> -               return 0;
> +       if (hlist_empty(&conn->list)) {
> +               hlist_add_head_rcu(&mark->obj_list, &conn->list);
> +               igrab(inode);

And this is again an inode grabbing change

> +               goto added;
>         }
>
>         /* should mark be in the middle of the current list? */
> -       hlist_for_each_entry(lmark, head, obj_list) {
> +       hlist_for_each_entry(lmark, &conn->list, obj_list) {
>                 last = lmark;
>
> -               if ((lmark->group == mark->group) && !allow_dups)
> -                       return -EEXIST;
> +               if ((lmark->group == mark->group) && !allow_dups) {
> +                       err = -EEXIST;
> +                       goto out_err;
> +               }
>
>                 cmp = fsnotify_compare_groups(lmark->group, mark->group);
>                 if (cmp >= 0) {
>                         hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list);
> -                       return 0;
> +                       goto added;
>                 }
>         }
>
>         BUG_ON(last == NULL);
>         /* mark should be the last entry.  last is the current last entry */
>         hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
> -       return 0;
> +added:
> +       mark->connector = conn;
> +out_err:
> +       fsnotify_put_connector(conn);
> +       spin_unlock(&mark->lock);
> +       return err;

And AFAICS, this is also due to the inode grabbing change.

>  }
>
>  /*
> @@ -366,25 +450,16 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
>         list_add(&mark->g_list, &group->marks_list);
>         atomic_inc(&group->num_marks);
>         fsnotify_get_mark(mark); /* for i_list and g_list */
> -
> -       if (inode) {
> -               ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups);
> -               if (ret)
> -                       goto err;
> -       } else if (mnt) {
> -               ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups);
> -               if (ret)
> -                       goto err;
> -       } else {
> -               BUG();
> -       }
> -
> -       /* this will pin the object if appropriate */
> -       fsnotify_set_mark_mask_locked(mark, mark->mask);
>         spin_unlock(&mark->lock);
>
> +       ret = fsnotify_add_mark_list(mark, inode, mnt, allow_dups);
> +       if (ret)
> +               goto err;
> +
>         if (inode)
> -               __fsnotify_update_child_dentry_flags(inode);
> +               fsnotify_recalc_inode_mask(inode);
> +       else
> +               fsnotify_recalc_vfsmount_mask(mnt);
>
>         return ret;
>  err:
> @@ -453,7 +528,7 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
>          */
>         mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
>         list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
> -               if (mark->flags & flags)
> +               if (mark->connector->flags & flags)
>                         list_move(&mark->g_list, &to_free);
>         }
>         mutex_unlock(&group->mark_mutex);
> diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
> index a8fcab68faef..9e676b4b0068 100644
> --- a/fs/notify/vfsmount_mark.c
> +++ b/fs/notify/vfsmount_mark.c
> @@ -31,7 +31,7 @@
>
>  void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
>  {
> -       fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_VFSMOUNT);
> +       fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT);
>  }
>
>  /*
> @@ -43,13 +43,13 @@ void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt)
>         struct mount *m = real_mount(mnt);
>
>         spin_lock(&mnt->mnt_root->d_lock);
> -       m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
> +       m->mnt_fsnotify_mask = fsnotify_recalc_mask(m->mnt_fsnotify_marks);
>         spin_unlock(&mnt->mnt_root->d_lock);
>  }
>
>  void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark)
>  {
> -       struct vfsmount *mnt = mark->mnt;
> +       struct vfsmount *mnt = mark->connector->mnt;
>         struct mount *m = real_mount(mnt);
>
>         BUG_ON(!mutex_is_locked(&mark->group->mark_mutex));
> @@ -58,9 +58,9 @@ void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark)
>         spin_lock(&mnt->mnt_root->d_lock);
>
>         hlist_del_init_rcu(&mark->obj_list);
> -       mark->mnt = NULL;
> +       mark->connector = NULL;
>
> -       m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
> +       m->mnt_fsnotify_mask = fsnotify_recalc_mask(m->mnt_fsnotify_marks);
>         spin_unlock(&mnt->mnt_root->d_lock);
>  }
>
> @@ -74,35 +74,12 @@ struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group,
>         struct mount *m = real_mount(mnt);
>         struct fsnotify_mark *mark;
>
> -       spin_lock(&mnt->mnt_root->d_lock);
> -       mark = fsnotify_find_mark(&m->mnt_fsnotify_marks, group);
> -       spin_unlock(&mnt->mnt_root->d_lock);
> -
> -       return mark;
> -}
> -
> -/*
> - * Attach an initialized mark to a given group and vfsmount.
> - * These marks may be used for the fsnotify backend to determine which
> - * event types should be delivered to which groups.
> - */
> -int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
> -                              struct fsnotify_group *group, struct vfsmount *mnt,
> -                              int allow_dups)
> -{
> -       struct mount *m = real_mount(mnt);
> -       int ret;
> -
> -       mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT;
> -
> -       BUG_ON(!mutex_is_locked(&group->mark_mutex));
> -       assert_spin_locked(&mark->lock);
> +       if (!m->mnt_fsnotify_marks)
> +               return NULL;

Move check to fsnotify_find_mark()?

>
>         spin_lock(&mnt->mnt_root->d_lock);
> -       mark->mnt = mnt;
> -       ret = fsnotify_add_mark_list(&m->mnt_fsnotify_marks, mark, allow_dups);
> -       m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
> +       mark = fsnotify_find_mark(&m->mnt_fsnotify_marks->list, group);
>         spin_unlock(&mnt->mnt_root->d_lock);
>
> -       return ret;
> +       return mark;
>  }
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 2ba074328894..2588a24ad11f 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -545,6 +545,8 @@ is_uncached_acl(struct posix_acl *acl)
>  #define IOP_XATTR      0x0008
>  #define IOP_DEFAULT_READLINK   0x0010
>
> +struct fsnotify_mark_connector;
> +
>  /*
>   * Keep mostly read-only and often accessed (especially for
>   * the RCU path lookup and 'stat' data) fields at the beginning
> @@ -644,7 +646,7 @@ struct inode {
>
>  #ifdef CONFIG_FSNOTIFY
>         __u32                   i_fsnotify_mask; /* all events this inode cares about */
> -       struct hlist_head       i_fsnotify_marks;
> +       struct fsnotify_mark_connector  *i_fsnotify_marks;
>  #endif
>
>  #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
> diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
> index 487246546ebe..01df00327683 100644
> --- a/include/linux/fsnotify_backend.h
> +++ b/include/linux/fsnotify_backend.h
> @@ -194,6 +194,27 @@ struct fsnotify_group {
>  #define FSNOTIFY_EVENT_INODE   2
>
>  /*
> + * Inode / vfsmount point to this structure which tracks all marks attached to
> + * the inode / vfsmount. The reference to inode / vfsmount is held by this
> + * structure. We destroy this structure when there are no more marks attached
> + * to it. The structure is protected by fsnotify_mark_srcu.
> + */
> +struct fsnotify_mark_connector {
> +#define FSNOTIFY_OBJ_TYPE_INODE                0x01
> +#define FSNOTIFY_OBJ_TYPE_VFSMOUNT     0x02
> +       unsigned int flags;     /* Type of object [lock] */
> +       union { /* Object pointer [lock] */
> +               struct inode *inode;
> +               struct vfsmount *mnt;
> +       };
> +       union {
> +               struct hlist_head list;
> +               /* Used listing heads to free after srcu period expires */
> +               struct fsnotify_mark_connector *destroy_next;

This field is not used in this patch at all.

> +       };
> +};
> +
> +/*
>   * A mark is simply an object attached to an in core inode which allows an
>   * fsnotify listener to indicate they are either no longer interested in events
>   * of a type matching mask or only interested in those events.
> @@ -224,18 +245,13 @@ struct fsnotify_mark {
>         spinlock_t lock;
>         /* List of marks for inode / vfsmount [obj_lock] */
>         struct hlist_node obj_list;
> -       union { /* Object pointer [mark->lock, group->mark_mutex] */
> -               struct inode *inode;    /* inode this mark is associated with */
> -               struct vfsmount *mnt;   /* vfsmount this mark is associated with */
> -       };
> +       /* Head of list of marks for an object [mark->lock, group->mark_mutex] */
> +       struct fsnotify_mark_connector *connector;
>         /* Events types to ignore [mark->lock, group->mark_mutex] */
>         __u32 ignored_mask;
> -#define FSNOTIFY_MARK_FLAG_INODE               0x01
> -#define FSNOTIFY_MARK_FLAG_VFSMOUNT            0x02
> -#define FSNOTIFY_MARK_FLAG_OBJECT_PINNED       0x04
> -#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08
> -#define FSNOTIFY_MARK_FLAG_ALIVE               0x10
> -#define FSNOTIFY_MARK_FLAG_ATTACHED            0x20
> +#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x01
> +#define FSNOTIFY_MARK_FLAG_ALIVE               0x02
> +#define FSNOTIFY_MARK_FLAG_ATTACHED            0x04
>         unsigned int flags;             /* flags [mark->lock] */
>         void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */
>  };
> @@ -343,7 +359,7 @@ extern void fsnotify_free_mark(struct fsnotify_mark *mark);
>  extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group);
>  /* run all the marks in a group, and clear all of the inode marks */
>  extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group);
> -/* run all the marks in a group, and clear all of the marks where mark->flags & flags is true*/
> +/* run all the marks in a group, and clear all of the marks attached to given object type */
>  extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags);
>  extern void fsnotify_get_mark(struct fsnotify_mark *mark);
>  extern void fsnotify_put_mark(struct fsnotify_mark *mark);
> diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
> index 11c7ac441624..a878a98f55b3 100644
> --- a/kernel/audit_tree.c
> +++ b/kernel/audit_tree.c
> @@ -172,10 +172,25 @@ static unsigned long inode_to_key(const struct inode *inode)
>  /*
>   * Function to return search key in our hash from chunk. Key 0 is special and
>   * should never be present in the hash.
> + *
> + * Must be called with chunk->mark.lock held to protect from connector
> + * becoming NULL.
>   */
> +static unsigned long __chunk_to_key(struct audit_chunk *chunk)
> +{
> +       if (!chunk->mark.connector)
> +               return 0;
> +       return (unsigned long)chunk->mark.connector->inode;
> +}
> +
>  static unsigned long chunk_to_key(struct audit_chunk *chunk)
>  {
> -       return (unsigned long)chunk->mark.inode;
> +       unsigned long key;
> +
> +       spin_lock(&chunk->mark.lock);
> +       key = __chunk_to_key(chunk);
> +       spin_unlock(&chunk->mark.lock);
> +       return key;
>  }
>
>  static inline struct list_head *chunk_hash(unsigned long key)
> @@ -187,7 +202,7 @@ static inline struct list_head *chunk_hash(unsigned long key)
>  /* hash_lock & entry->lock is held by caller */
>  static void insert_hash(struct audit_chunk *chunk)
>  {
> -       unsigned long key = chunk_to_key(chunk);
> +       unsigned long key = __chunk_to_key(chunk);
>         struct list_head *list;
>
>         if (!key)
> @@ -248,7 +263,8 @@ static void untag_chunk(struct node *p)
>
>         mutex_lock(&entry->group->mark_mutex);
>         spin_lock(&entry->lock);
> -       if (chunk->dead || !entry->inode) {
> +       if (chunk->dead || !entry->connector ||
> +           !entry->connector->inode) {

entry->connector->inode will always be positive, at least in this patch.

>                 spin_unlock(&entry->lock);
>                 mutex_unlock(&entry->group->mark_mutex);
>                 if (new)
> @@ -276,8 +292,8 @@ static void untag_chunk(struct node *p)
>         if (!new)
>                 goto Fallback;
>
> -       if (fsnotify_add_mark_locked(&new->mark, entry->group, entry->inode,
> -                                    NULL, 1)) {
> +       if (fsnotify_add_mark_locked(&new->mark, entry->group,
> +                                    entry->connector->inode, NULL, 1)) {
>                 fsnotify_put_mark(&new->mark);
>                 goto Fallback;
>         }
> @@ -408,7 +424,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
>
>         mutex_lock(&old_entry->group->mark_mutex);
>         spin_lock(&old_entry->lock);
> -       if (!old_entry->inode) {
> +       if (!old_entry->connector || !old_entry->connector->inode) {

Same here, connector->inode will never be NULL.

>                 /* old_entry is being shot, lets just lie */
>                 spin_unlock(&old_entry->lock);
>                 mutex_unlock(&old_entry->group->mark_mutex);
> @@ -418,7 +434,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
>         }
>
>         if (fsnotify_add_mark_locked(chunk_entry, old_entry->group,
> -                                    old_entry->inode, NULL, 1)) {
> +                            old_entry->connector->inode, NULL, 1)) {
>                 spin_unlock(&old_entry->lock);
>                 mutex_unlock(&old_entry->group->mark_mutex);
>                 fsnotify_put_mark(chunk_entry);
> diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> index cf1fa43512c1..e990de915341 100644
> --- a/kernel/auditsc.c
> +++ b/kernel/auditsc.c
> @@ -1591,7 +1591,7 @@ static inline void handle_one(const struct inode *inode)
>         struct audit_tree_refs *p;
>         struct audit_chunk *chunk;
>         int count;
> -       if (likely(hlist_empty(&inode->i_fsnotify_marks)))
> +       if (likely(!inode->i_fsnotify_marks))

Need to check hlist_empty() as well?  Perhaps add an inline helper to
fsnotify_backend.h: fsnotify_inode_marks_empty().

>                 return;
>         context = current->audit_context;
>         p = context->trees;
> @@ -1634,7 +1634,7 @@ static void handle_path(const struct dentry *dentry)
>         seq = read_seqbegin(&rename_lock);
>         for(;;) {
>                 struct inode *inode = d_backing_inode(d);
> -               if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) {
> +               if (inode && unlikely(inode->i_fsnotify_marks)) {

Here as well.

>                         struct audit_chunk *chunk;
>                         chunk = audit_tree_lookup(inode);
>                         if (chunk) {
> --
> 2.10.2
>



[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux