On Wed, Feb 1, 2017 at 11:44 AM, Jan Kara <jack@xxxxxxx> wrote: > Currently notification marks are attached to object (inode or vfsmnt) by > a hlist_head in the object. The list is also protected by a spinlock in > the object. So while there is any mark attached to the list of marks, > the object must be pinned in memory (and thus e.g. last iput() deleting > inode cannot happen). Also for list iteration in fsnotify() to work, we > must hold fsnotify_mark_srcu lock so that mark itself and > mark->obj_list.next cannot get freed. Thus we are required to wait for > response to fanotify events from userspace process with > fsnotify_mark_srcu lock held. That causes issues when userspace process > is buggy and does not reply to some event - basically the whole > notification subsystem gets eventually stuck. > > So to be able to drop fsnotify_mark_srcu lock while waiting for > response, we have to pin the mark in memory and make sure it stays in > the object list (as removing the mark waiting for response could lead to > lost notification events for groups later in the list). However we don't > want inode reclaim to block on such mark as that would lead to system > just locking up elsewhere. > > This commit tries to pave a way towards solving these conflicting > lifetime needs. Instead of anchoring the list of marks directly in the > object, we anchor it in a dedicated structure (fsnotify_mark_connector) > and just point to that structure from the object. In the following patch > we will also protect the list by a spinlock contained in that structure. > With this, we will be able to detach notification marks from object > without having to modify the list itself. This is still a bit big to review easily. I'd suggest further splitup into: - move hlist_head from inode/mnt into connector - move inode/mnt ptr from mark into connector (no refcounting changes) - inode refcounting cleanup More comments inline. > > Signed-off-by: Jan Kara <jack@xxxxxxx> > --- > fs/inode.c | 3 - > fs/mount.h | 2 +- > fs/namespace.c | 3 - > fs/notify/dnotify/dnotify.c | 4 +- > fs/notify/fdinfo.c | 12 +-- > fs/notify/fsnotify.c | 32 +++++--- > fs/notify/fsnotify.h | 17 +++-- > fs/notify/inode_mark.c | 74 ++++-------------- > fs/notify/mark.c | 157 +++++++++++++++++++++++++++++---------- > fs/notify/vfsmount_mark.c | 41 +++------- > include/linux/fs.h | 4 +- > include/linux/fsnotify_backend.h | 38 +++++++--- > kernel/audit_tree.c | 30 ++++++-- > kernel/auditsc.c | 4 +- > 14 files changed, 236 insertions(+), 185 deletions(-) > > diff --git a/fs/inode.c b/fs/inode.c > index 88110fd0b282..131b2bcebc48 100644 > --- a/fs/inode.c > +++ b/fs/inode.c > @@ -371,9 +371,6 @@ void inode_init_once(struct inode *inode) > INIT_LIST_HEAD(&inode->i_lru); > address_space_init_once(&inode->i_data); > i_size_ordered_init(inode); > -#ifdef CONFIG_FSNOTIFY > - INIT_HLIST_HEAD(&inode->i_fsnotify_marks); > -#endif > } > EXPORT_SYMBOL(inode_init_once); > > diff --git a/fs/mount.h b/fs/mount.h > index 2c856fc47ae3..c92c78d135dc 100644 > --- a/fs/mount.h > +++ b/fs/mount.h > @@ -59,7 +59,7 @@ struct mount { > struct mountpoint *mnt_mp; /* where is it mounted */ > struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ > #ifdef CONFIG_FSNOTIFY > - struct hlist_head mnt_fsnotify_marks; > + struct fsnotify_mark_connector *mnt_fsnotify_marks; > __u32 mnt_fsnotify_mask; > #endif > int mnt_id; /* mount identifier */ > diff --git a/fs/namespace.c b/fs/namespace.c > index b5b1259e064f..22a91352e2d7 100644 > --- a/fs/namespace.c > +++ b/fs/namespace.c > @@ -233,9 +233,6 @@ static struct mount *alloc_vfsmnt(const char *name) > INIT_LIST_HEAD(&mnt->mnt_slave_list); > INIT_LIST_HEAD(&mnt->mnt_slave); > INIT_HLIST_NODE(&mnt->mnt_mp_list); > -#ifdef CONFIG_FSNOTIFY > - INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); > -#endif > init_fs_pin(&mnt->mnt_umount, drop_mountpoint); > } > return mnt; > diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c > index 5a4ec309e283..3dd2e0ece262 100644 > --- a/fs/notify/dnotify/dnotify.c > +++ b/fs/notify/dnotify/dnotify.c > @@ -69,8 +69,8 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) > if (old_mask == new_mask) > return; > > - if (fsn_mark->inode) > - fsnotify_recalc_inode_mask(fsn_mark->inode); > + if (fsn_mark->connector->inode This doesn't make sense in the context of this patch. Now mark->connector is set to NULL where previously mark->inode was set to null. So the right condition would be "if (fsn_mark->connector) ...". ) > + fsnotify_recalc_inode_mask(fsn_mark->connector->inode); > } > > /* > diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c > index 601a59c8d87e..dd63aa9a6f9a 100644 > --- a/fs/notify/fdinfo.c > +++ b/fs/notify/fdinfo.c > @@ -76,11 +76,11 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) > struct inotify_inode_mark *inode_mark; > struct inode *inode; > > - if (!(mark->flags & FSNOTIFY_MARK_FLAG_INODE)) > + if (!(mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE)) > return; > > inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); > - inode = igrab(mark->inode); > + inode = igrab(mark->connector->inode); > if (inode) { > /* > * IN_ALL_EVENTS represents all of the mask bits > @@ -115,8 +115,8 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) > if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) > mflags |= FAN_MARK_IGNORED_SURV_MODIFY; > > - if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { > - inode = igrab(mark->inode); > + if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE) { > + inode = igrab(mark->connector->inode); > if (!inode) > return; > seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", > @@ -125,8 +125,8 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) > show_mark_fhandle(m, inode); > seq_putc(m, '\n'); > iput(inode); > - } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) { > - struct mount *mnt = real_mount(mark->mnt); > + } else if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) { > + struct mount *mnt = real_mount(mark->connector->mnt); > > seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", > mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); > diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c > index b41515d3f081..421c7431a16e 100644 > --- a/fs/notify/fsnotify.c > +++ b/fs/notify/fsnotify.c > @@ -193,6 +193,7 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, > struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; > struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; > struct fsnotify_group *inode_group, *vfsmount_group; > + struct fsnotify_mark_connector *inode_conn, *vfsmount_conn; > struct mount *mnt; > int idx, ret = 0; > /* global tests shouldn't care about events on child only the specific event */ > @@ -210,8 +211,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, > * SRCU because we have no references to any objects and do not > * need SRCU to keep them "alive". > */ > - if (hlist_empty(&to_tell->i_fsnotify_marks) && > - (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks))) > + if (!to_tell->i_fsnotify_marks && > + (!mnt || !mnt->mnt_fsnotify_marks)) The checks against hlist_empty() could still be useful to optimize away the memory barrier imposed by srcu_read_lock() in case there were marks on the object but not anymore (i.e. the connector is non-NULL, but the hlist is empty). > return 0; > /* > * if this is a modify event we may need to clear the ignored masks > @@ -226,16 +227,24 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, > idx = srcu_read_lock(&fsnotify_mark_srcu); > > if ((mask & FS_MODIFY) || > - (test_mask & to_tell->i_fsnotify_mask)) > - inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, > - &fsnotify_mark_srcu); > + (test_mask & to_tell->i_fsnotify_mask)) { > + inode_conn = to_tell->i_fsnotify_marks; Needs lockless_dereference(). > + if (inode_conn) > + inode_node = srcu_dereference(inode_conn->list.first, > + &fsnotify_mark_srcu); > + } > > if (mnt && ((mask & FS_MODIFY) || > (test_mask & mnt->mnt_fsnotify_mask))) { > - vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first, > - &fsnotify_mark_srcu); > - inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, > - &fsnotify_mark_srcu); > + inode_conn = to_tell->i_fsnotify_marks; > + if (inode_conn) > + inode_node = srcu_dereference(inode_conn->list.first, > + &fsnotify_mark_srcu); > + vfsmount_conn = mnt->mnt_fsnotify_marks; And this one too. > + if (vfsmount_conn) > + vfsmount_node = srcu_dereference( > + vfsmount_conn->list.first, > + &fsnotify_mark_srcu); > } > > /* > @@ -293,6 +302,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, > } > EXPORT_SYMBOL_GPL(fsnotify); > > +extern struct kmem_cache *fsnotify_mark_connector_cachep; > + > static __init int fsnotify_init(void) > { > int ret; > @@ -303,6 +314,9 @@ static __init int fsnotify_init(void) > if (ret) > panic("initializing fsnotify_mark_srcu"); > > + fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector, > + SLAB_PANIC); > + > return 0; > } > core_initcall(fsnotify_init); > diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h > index 0a3bc2cf192c..7a95436dc8d3 100644 > --- a/fs/notify/fsnotify.h > +++ b/fs/notify/fsnotify.h > @@ -8,6 +8,8 @@ > > #include "../mount.h" > > +struct fsnotify_mark_connector; > + > /* destroy all events sitting in this groups notification queue */ > extern void fsnotify_flush_notify(struct fsnotify_group *group); > > @@ -15,7 +17,7 @@ extern void fsnotify_flush_notify(struct fsnotify_group *group); > extern struct srcu_struct fsnotify_mark_srcu; > > /* Calculate mask of events for a list of marks */ > -extern u32 fsnotify_recalc_mask(struct hlist_head *head); > +extern u32 fsnotify_recalc_mask(struct fsnotify_mark_connector *conn); > > /* compare two groups for sorting of marks lists */ > extern int fsnotify_compare_groups(struct fsnotify_group *a, > @@ -23,10 +25,6 @@ extern int fsnotify_compare_groups(struct fsnotify_group *a, > > extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark, > __u32 mask); > -/* Add mark to a proper place in mark list */ > -extern int fsnotify_add_mark_list(struct hlist_head *head, > - struct fsnotify_mark *mark, > - int allow_dups); > /* add a mark to an inode */ > extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark, > struct fsnotify_group *group, struct inode *inode, > @@ -39,20 +37,25 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, > /* vfsmount specific destruction of a mark */ > extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); > /* inode specific destruction of a mark */ > -extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); > +extern struct inode *fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); > /* Find mark belonging to given group in the list of marks */ > extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, > struct fsnotify_group *group); > /* Destroy all marks in the given list protected by 'lock' */ > -extern void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock); > +extern void fsnotify_destroy_marks(struct fsnotify_mark_connector **connp, > + spinlock_t *lock); > /* run the list of all marks associated with inode and destroy them */ > static inline void fsnotify_clear_marks_by_inode(struct inode *inode) > { > + if (!inode->i_fsnotify_marks) > + return; Should move the above check inside fsnotify_destroy_marks(). Locking rules also dictate that the check be done under inode lock, although that may not matter in practice. > fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock); > } > /* run the list of all marks associated with vfsmount and destroy them */ > static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) > { > + if (!real_mount(mnt)->mnt_fsnotify_marks) > + return; And the same here. > fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks, > &mnt->mnt_root->d_lock); > } > diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c > index a3645249f7ec..564641becb39 100644 > --- a/fs/notify/inode_mark.c > +++ b/fs/notify/inode_mark.c > @@ -37,15 +37,16 @@ > void fsnotify_recalc_inode_mask(struct inode *inode) > { > spin_lock(&inode->i_lock); > - inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); > + inode->i_fsnotify_mask = fsnotify_recalc_mask(inode->i_fsnotify_marks); > spin_unlock(&inode->i_lock); > > __fsnotify_update_child_dentry_flags(inode); > } > > -void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) > +struct inode *fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) > { > - struct inode *inode = mark->inode; > + struct inode *inode = mark->connector->inode; > + bool empty; > > BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); > assert_spin_locked(&mark->lock); > @@ -53,15 +54,18 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) > spin_lock(&inode->i_lock); > > hlist_del_init_rcu(&mark->obj_list); > - mark->inode = NULL; > + empty = hlist_empty(&mark->connector->list); > + mark->connector = NULL; > > /* > * this mark is now off the inode->i_fsnotify_marks list and we > * hold the inode->i_lock, so this is the perfect time to update the > * inode->i_fsnotify_mask > */ > - inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); > + inode->i_fsnotify_mask = fsnotify_recalc_mask(inode->i_fsnotify_marks); > spin_unlock(&inode->i_lock); > + > + return empty ? inode : NULL; > } > > /* > @@ -69,7 +73,7 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) > */ > void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) > { > - fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_INODE); > + fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_OBJ_TYPE_INODE); > } > > /* > @@ -81,64 +85,14 @@ struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, > { > struct fsnotify_mark *mark; > > - spin_lock(&inode->i_lock); > - mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); > - spin_unlock(&inode->i_lock); > - > - return mark; > -} > - > -/* > - * If we are setting a mark mask on an inode mark we should pin the inode > - * in memory. > - */ > -void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark, > - __u32 mask) > -{ > - struct inode *inode; > - > - assert_spin_locked(&mark->lock); > - > - if (mask && > - mark->inode && > - !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) { > - mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED; > - inode = igrab(mark->inode); > - /* > - * we shouldn't be able to get here if the inode wasn't > - * already safely held in memory. But bug in case it > - * ever is wrong. > - */ > - BUG_ON(!inode); > - } > -} I think the above warrants a separate patch. This patch subtly changes the rules of grabbing the inode reference and it's not explained why this is okay. Removing whole functions also confuses the patch itself. > - > -/* > - * Attach an initialized mark to a given inode. > - * These marks may be used for the fsnotify backend to determine which > - * event types should be delivered to which group and for which inodes. These > - * marks are ordered according to priority, highest number first, and then by > - * the group's location in memory. > - */ > -int fsnotify_add_inode_mark(struct fsnotify_mark *mark, > - struct fsnotify_group *group, struct inode *inode, > - int allow_dups) > -{ > - int ret; > - > - mark->flags |= FSNOTIFY_MARK_FLAG_INODE; > - > - BUG_ON(!mutex_is_locked(&group->mark_mutex)); > - assert_spin_locked(&mark->lock); > + if (!inode->i_fsnotify_marks) > + return NULL; Check in fsnotify_find_mark()? > > spin_lock(&inode->i_lock); > - mark->inode = inode; > - ret = fsnotify_add_mark_list(&inode->i_fsnotify_marks, mark, > - allow_dups); > - inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); > + mark = fsnotify_find_mark(&inode->i_fsnotify_marks->list, group); > spin_unlock(&inode->i_lock); > > - return ret; > + return mark; > } > > /** > diff --git a/fs/notify/mark.c b/fs/notify/mark.c > index 44836e539169..ce2314ae96d0 100644 > --- a/fs/notify/mark.c > +++ b/fs/notify/mark.c > @@ -83,6 +83,8 @@ > #define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */ > > struct srcu_struct fsnotify_mark_srcu; > +struct kmem_cache *fsnotify_mark_connector_cachep; > + > static DEFINE_SPINLOCK(destroy_lock); > static LIST_HEAD(destroy_list); > > @@ -104,12 +106,12 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) > } > > /* Calculate mask of events for a list of marks */ > -u32 fsnotify_recalc_mask(struct hlist_head *head) > +u32 fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) > { > u32 new_mask = 0; > struct fsnotify_mark *mark; > > - hlist_for_each_entry(mark, head, obj_list) > + hlist_for_each_entry(mark, &conn->list, obj_list) > new_mask |= mark->mask; > return new_mask; > } > @@ -137,10 +139,9 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) > > mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; > > - if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { > - inode = mark->inode; > - fsnotify_destroy_inode_mark(mark); > - } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) > + if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE) > + inode = fsnotify_destroy_inode_mark(mark); > + else if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) > fsnotify_destroy_vfsmount_mark(mark); > else > BUG(); > @@ -155,7 +156,7 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) > > spin_unlock(&mark->lock); > > - if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) > + if (inode) And this should also be part of the inode refcounting change. > iput(inode); > > atomic_dec(&group->num_marks); > @@ -220,7 +221,8 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, > fsnotify_free_mark(mark); > } > > -void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock) > +void fsnotify_destroy_marks(struct fsnotify_mark_connector **connp, > + spinlock_t *lock) > { > struct fsnotify_mark *mark; > > @@ -233,11 +235,12 @@ void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock) > * calling fsnotify_destroy_mark() more than once is fine. > */ > spin_lock(lock); > - if (hlist_empty(head)) { > + if (hlist_empty(&(*connp)->list)) { > spin_unlock(lock); > break; > } > - mark = hlist_entry(head->first, struct fsnotify_mark, obj_list); > + mark = hlist_entry((*connp)->list.first, struct fsnotify_mark, > + obj_list); > /* > * We don't update i_fsnotify_mask / mnt_fsnotify_mask here > * since inode / mount is going away anyway. So just remove > @@ -249,6 +252,16 @@ void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock) > fsnotify_destroy_mark(mark, mark->group); > fsnotify_put_mark(mark); > } > + /* > + * Skip freeing of connector when inode is just unlinked. We leave that > + * until destroy_inode() is called at which moment nobody will be > + * looking at the inode anymore. > + */ > + if ((*connp)->flags & FSNOTIFY_OBJ_TYPE_INODE && > + !((*connp)->inode->i_state & I_CLEAR)) > + return; This is ugly. Why not rather add a fsnotify_connector_free() and call that explicitly on inode/vfsmount destruction? > + kmem_cache_free(fsnotify_mark_connector_cachep, *connp); > + *connp = NULL; > } > > void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) > @@ -256,9 +269,6 @@ void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) > assert_spin_locked(&mark->lock); > > mark->mask = mask; > - > - if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) > - fsnotify_set_inode_mark_mask_locked(mark, mask); Also related to inode refcounting? > } > > void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask) > @@ -304,37 +314,111 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) > return -1; > } > > -/* Add mark into proper place in given list of marks */ > -int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark, > - int allow_dups) > +/* > + * Get mark connector, make sure it is alive and return with its lock held. > + * This is for users that get connector pointer from inode or mount. Users that > + * hold reference to a mark on the list may directly lock connector->lock as > + * they are sure list cannot go away under them. > + */ > +static struct fsnotify_mark_connector *fsnotify_grab_connector( > + struct fsnotify_mark_connector *conn) > +{ > + if (!conn) > + return NULL; > + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) > + spin_lock(&conn->inode->i_lock); > + else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) > + spin_lock(&conn->mnt->mnt_root->d_lock); > + else > + return NULL; Can this happen? If not, then add a WARNING(). > + return conn; > +} > + > +static void fsnotify_put_connector(struct fsnotify_mark_connector *conn) > +{ > + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) > + spin_unlock(&conn->inode->i_lock); > + else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) > + spin_unlock(&conn->mnt->mnt_root->d_lock); > +} > + > +/* > + * Add mark into proper place in given list of marks. These marks may be used > + * for the fsnotify backend to determine which event types should be delivered > + * to which group and for which inodes. These marks are ordered according to > + * priority, highest number first, and then by the group's location in memory. > + */ > +static int fsnotify_add_mark_list(struct fsnotify_mark *mark, > + struct inode *inode, struct vfsmount *mnt, > + int allow_dups) > { > struct fsnotify_mark *lmark, *last = NULL; > + struct fsnotify_mark_connector *conn; > + struct fsnotify_mark_connector **connp; > int cmp; > + int err = 0; > + > + BUG_ON(!inode && !mnt); Use WARN_ON() instead. > + if (inode) > + connp = &inode->i_fsnotify_marks; > + else > + connp = &real_mount(mnt)->mnt_fsnotify_marks; > +restart: > + spin_lock(&mark->lock); > + conn = fsnotify_grab_connector(*connp); This looks a bit too subtle. We have the inode/vfsmount. We have the matching lock. Could store the address of the lock in a local variable and use that for locking. > + if (!conn) { > + spin_unlock(&mark->lock); > + conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, > + GFP_KERNEL); > + if (!conn) > + return -ENOMEM; > + INIT_HLIST_HEAD(&conn->list); > + if (inode) { > + conn->flags = FSNOTIFY_OBJ_TYPE_INODE; > + conn->inode = inode; > + } else { > + conn->flags = FSNOTIFY_OBJ_TYPE_VFSMOUNT; > + conn->mnt = mnt; > + } > + if (cmpxchg(connp, NULL, conn)) { > + /* Someone else created list structure for us */ > + kmem_cache_free(fsnotify_mark_connector_cachep, conn); > + } Setting *connp should be done with inode/vfsmount lock already regained (if I understand the locking rules correctly) and then no need to have cmpxchg magic in there. > + goto restart; > + } > > /* is mark the first mark? */ > - if (hlist_empty(head)) { > - hlist_add_head_rcu(&mark->obj_list, head); > - return 0; > + if (hlist_empty(&conn->list)) { > + hlist_add_head_rcu(&mark->obj_list, &conn->list); > + igrab(inode); And this is again an inode grabbing change > + goto added; > } > > /* should mark be in the middle of the current list? */ > - hlist_for_each_entry(lmark, head, obj_list) { > + hlist_for_each_entry(lmark, &conn->list, obj_list) { > last = lmark; > > - if ((lmark->group == mark->group) && !allow_dups) > - return -EEXIST; > + if ((lmark->group == mark->group) && !allow_dups) { > + err = -EEXIST; > + goto out_err; > + } > > cmp = fsnotify_compare_groups(lmark->group, mark->group); > if (cmp >= 0) { > hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list); > - return 0; > + goto added; > } > } > > BUG_ON(last == NULL); > /* mark should be the last entry. last is the current last entry */ > hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); > - return 0; > +added: > + mark->connector = conn; > +out_err: > + fsnotify_put_connector(conn); > + spin_unlock(&mark->lock); > + return err; And AFAICS, this is also due to the inode grabbing change. > } > > /* > @@ -366,25 +450,16 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, > list_add(&mark->g_list, &group->marks_list); > atomic_inc(&group->num_marks); > fsnotify_get_mark(mark); /* for i_list and g_list */ > - > - if (inode) { > - ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups); > - if (ret) > - goto err; > - } else if (mnt) { > - ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups); > - if (ret) > - goto err; > - } else { > - BUG(); > - } > - > - /* this will pin the object if appropriate */ > - fsnotify_set_mark_mask_locked(mark, mark->mask); > spin_unlock(&mark->lock); > > + ret = fsnotify_add_mark_list(mark, inode, mnt, allow_dups); > + if (ret) > + goto err; > + > if (inode) > - __fsnotify_update_child_dentry_flags(inode); > + fsnotify_recalc_inode_mask(inode); > + else > + fsnotify_recalc_vfsmount_mask(mnt); > > return ret; > err: > @@ -453,7 +528,7 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, > */ > mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); > list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { > - if (mark->flags & flags) > + if (mark->connector->flags & flags) > list_move(&mark->g_list, &to_free); > } > mutex_unlock(&group->mark_mutex); > diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c > index a8fcab68faef..9e676b4b0068 100644 > --- a/fs/notify/vfsmount_mark.c > +++ b/fs/notify/vfsmount_mark.c > @@ -31,7 +31,7 @@ > > void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) > { > - fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_VFSMOUNT); > + fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT); > } > > /* > @@ -43,13 +43,13 @@ void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt) > struct mount *m = real_mount(mnt); > > spin_lock(&mnt->mnt_root->d_lock); > - m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); > + m->mnt_fsnotify_mask = fsnotify_recalc_mask(m->mnt_fsnotify_marks); > spin_unlock(&mnt->mnt_root->d_lock); > } > > void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) > { > - struct vfsmount *mnt = mark->mnt; > + struct vfsmount *mnt = mark->connector->mnt; > struct mount *m = real_mount(mnt); > > BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); > @@ -58,9 +58,9 @@ void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) > spin_lock(&mnt->mnt_root->d_lock); > > hlist_del_init_rcu(&mark->obj_list); > - mark->mnt = NULL; > + mark->connector = NULL; > > - m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); > + m->mnt_fsnotify_mask = fsnotify_recalc_mask(m->mnt_fsnotify_marks); > spin_unlock(&mnt->mnt_root->d_lock); > } > > @@ -74,35 +74,12 @@ struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, > struct mount *m = real_mount(mnt); > struct fsnotify_mark *mark; > > - spin_lock(&mnt->mnt_root->d_lock); > - mark = fsnotify_find_mark(&m->mnt_fsnotify_marks, group); > - spin_unlock(&mnt->mnt_root->d_lock); > - > - return mark; > -} > - > -/* > - * Attach an initialized mark to a given group and vfsmount. > - * These marks may be used for the fsnotify backend to determine which > - * event types should be delivered to which groups. > - */ > -int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, > - struct fsnotify_group *group, struct vfsmount *mnt, > - int allow_dups) > -{ > - struct mount *m = real_mount(mnt); > - int ret; > - > - mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; > - > - BUG_ON(!mutex_is_locked(&group->mark_mutex)); > - assert_spin_locked(&mark->lock); > + if (!m->mnt_fsnotify_marks) > + return NULL; Move check to fsnotify_find_mark()? > > spin_lock(&mnt->mnt_root->d_lock); > - mark->mnt = mnt; > - ret = fsnotify_add_mark_list(&m->mnt_fsnotify_marks, mark, allow_dups); > - m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); > + mark = fsnotify_find_mark(&m->mnt_fsnotify_marks->list, group); > spin_unlock(&mnt->mnt_root->d_lock); > > - return ret; > + return mark; > } > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 2ba074328894..2588a24ad11f 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -545,6 +545,8 @@ is_uncached_acl(struct posix_acl *acl) > #define IOP_XATTR 0x0008 > #define IOP_DEFAULT_READLINK 0x0010 > > +struct fsnotify_mark_connector; > + > /* > * Keep mostly read-only and often accessed (especially for > * the RCU path lookup and 'stat' data) fields at the beginning > @@ -644,7 +646,7 @@ struct inode { > > #ifdef CONFIG_FSNOTIFY > __u32 i_fsnotify_mask; /* all events this inode cares about */ > - struct hlist_head i_fsnotify_marks; > + struct fsnotify_mark_connector *i_fsnotify_marks; > #endif > > #if IS_ENABLED(CONFIG_FS_ENCRYPTION) > diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h > index 487246546ebe..01df00327683 100644 > --- a/include/linux/fsnotify_backend.h > +++ b/include/linux/fsnotify_backend.h > @@ -194,6 +194,27 @@ struct fsnotify_group { > #define FSNOTIFY_EVENT_INODE 2 > > /* > + * Inode / vfsmount point to this structure which tracks all marks attached to > + * the inode / vfsmount. The reference to inode / vfsmount is held by this > + * structure. We destroy this structure when there are no more marks attached > + * to it. The structure is protected by fsnotify_mark_srcu. > + */ > +struct fsnotify_mark_connector { > +#define FSNOTIFY_OBJ_TYPE_INODE 0x01 > +#define FSNOTIFY_OBJ_TYPE_VFSMOUNT 0x02 > + unsigned int flags; /* Type of object [lock] */ > + union { /* Object pointer [lock] */ > + struct inode *inode; > + struct vfsmount *mnt; > + }; > + union { > + struct hlist_head list; > + /* Used listing heads to free after srcu period expires */ > + struct fsnotify_mark_connector *destroy_next; This field is not used in this patch at all. > + }; > +}; > + > +/* > * A mark is simply an object attached to an in core inode which allows an > * fsnotify listener to indicate they are either no longer interested in events > * of a type matching mask or only interested in those events. > @@ -224,18 +245,13 @@ struct fsnotify_mark { > spinlock_t lock; > /* List of marks for inode / vfsmount [obj_lock] */ > struct hlist_node obj_list; > - union { /* Object pointer [mark->lock, group->mark_mutex] */ > - struct inode *inode; /* inode this mark is associated with */ > - struct vfsmount *mnt; /* vfsmount this mark is associated with */ > - }; > + /* Head of list of marks for an object [mark->lock, group->mark_mutex] */ > + struct fsnotify_mark_connector *connector; > /* Events types to ignore [mark->lock, group->mark_mutex] */ > __u32 ignored_mask; > -#define FSNOTIFY_MARK_FLAG_INODE 0x01 > -#define FSNOTIFY_MARK_FLAG_VFSMOUNT 0x02 > -#define FSNOTIFY_MARK_FLAG_OBJECT_PINNED 0x04 > -#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08 > -#define FSNOTIFY_MARK_FLAG_ALIVE 0x10 > -#define FSNOTIFY_MARK_FLAG_ATTACHED 0x20 > +#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x01 > +#define FSNOTIFY_MARK_FLAG_ALIVE 0x02 > +#define FSNOTIFY_MARK_FLAG_ATTACHED 0x04 > unsigned int flags; /* flags [mark->lock] */ > void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */ > }; > @@ -343,7 +359,7 @@ extern void fsnotify_free_mark(struct fsnotify_mark *mark); > extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group); > /* run all the marks in a group, and clear all of the inode marks */ > extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group); > -/* run all the marks in a group, and clear all of the marks where mark->flags & flags is true*/ > +/* run all the marks in a group, and clear all of the marks attached to given object type */ > extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags); > extern void fsnotify_get_mark(struct fsnotify_mark *mark); > extern void fsnotify_put_mark(struct fsnotify_mark *mark); > diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c > index 11c7ac441624..a878a98f55b3 100644 > --- a/kernel/audit_tree.c > +++ b/kernel/audit_tree.c > @@ -172,10 +172,25 @@ static unsigned long inode_to_key(const struct inode *inode) > /* > * Function to return search key in our hash from chunk. Key 0 is special and > * should never be present in the hash. > + * > + * Must be called with chunk->mark.lock held to protect from connector > + * becoming NULL. > */ > +static unsigned long __chunk_to_key(struct audit_chunk *chunk) > +{ > + if (!chunk->mark.connector) > + return 0; > + return (unsigned long)chunk->mark.connector->inode; > +} > + > static unsigned long chunk_to_key(struct audit_chunk *chunk) > { > - return (unsigned long)chunk->mark.inode; > + unsigned long key; > + > + spin_lock(&chunk->mark.lock); > + key = __chunk_to_key(chunk); > + spin_unlock(&chunk->mark.lock); > + return key; > } > > static inline struct list_head *chunk_hash(unsigned long key) > @@ -187,7 +202,7 @@ static inline struct list_head *chunk_hash(unsigned long key) > /* hash_lock & entry->lock is held by caller */ > static void insert_hash(struct audit_chunk *chunk) > { > - unsigned long key = chunk_to_key(chunk); > + unsigned long key = __chunk_to_key(chunk); > struct list_head *list; > > if (!key) > @@ -248,7 +263,8 @@ static void untag_chunk(struct node *p) > > mutex_lock(&entry->group->mark_mutex); > spin_lock(&entry->lock); > - if (chunk->dead || !entry->inode) { > + if (chunk->dead || !entry->connector || > + !entry->connector->inode) { entry->connector->inode will always be positive, at least in this patch. > spin_unlock(&entry->lock); > mutex_unlock(&entry->group->mark_mutex); > if (new) > @@ -276,8 +292,8 @@ static void untag_chunk(struct node *p) > if (!new) > goto Fallback; > > - if (fsnotify_add_mark_locked(&new->mark, entry->group, entry->inode, > - NULL, 1)) { > + if (fsnotify_add_mark_locked(&new->mark, entry->group, > + entry->connector->inode, NULL, 1)) { > fsnotify_put_mark(&new->mark); > goto Fallback; > } > @@ -408,7 +424,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) > > mutex_lock(&old_entry->group->mark_mutex); > spin_lock(&old_entry->lock); > - if (!old_entry->inode) { > + if (!old_entry->connector || !old_entry->connector->inode) { Same here, connector->inode will never be NULL. > /* old_entry is being shot, lets just lie */ > spin_unlock(&old_entry->lock); > mutex_unlock(&old_entry->group->mark_mutex); > @@ -418,7 +434,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) > } > > if (fsnotify_add_mark_locked(chunk_entry, old_entry->group, > - old_entry->inode, NULL, 1)) { > + old_entry->connector->inode, NULL, 1)) { > spin_unlock(&old_entry->lock); > mutex_unlock(&old_entry->group->mark_mutex); > fsnotify_put_mark(chunk_entry); > diff --git a/kernel/auditsc.c b/kernel/auditsc.c > index cf1fa43512c1..e990de915341 100644 > --- a/kernel/auditsc.c > +++ b/kernel/auditsc.c > @@ -1591,7 +1591,7 @@ static inline void handle_one(const struct inode *inode) > struct audit_tree_refs *p; > struct audit_chunk *chunk; > int count; > - if (likely(hlist_empty(&inode->i_fsnotify_marks))) > + if (likely(!inode->i_fsnotify_marks)) Need to check hlist_empty() as well? Perhaps add an inline helper to fsnotify_backend.h: fsnotify_inode_marks_empty(). > return; > context = current->audit_context; > p = context->trees; > @@ -1634,7 +1634,7 @@ static void handle_path(const struct dentry *dentry) > seq = read_seqbegin(&rename_lock); > for(;;) { > struct inode *inode = d_backing_inode(d); > - if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) { > + if (inode && unlikely(inode->i_fsnotify_marks)) { Here as well. > struct audit_chunk *chunk; > chunk = audit_tree_lookup(inode); > if (chunk) { > -- > 2.10.2 >