On Fri, Jan 6, 2017 at 12:43 PM, Jan Kara <jack@xxxxxxx> wrote: > Currently notification marks are attached to object (inode or vfsmnt) by > a hlist_head in the object. The list is also protected by a spinlock in > the object. So while there is any mark attached to the list of marks, > the object must be pinned in memory (and thus e.g. last iput() deleting > inode cannot happen). Also for list iteration in fsnotify() to work, we > must hold fsnotify_mark_srcu lock so that mark itself and > mark->obj_list.next cannot get freed. Thus we are required to wait for > response to fanotify events from userspace process with > fsnotify_mark_srcu lock held. That causes issues when userspace process > is buggy and does not reply to some event - basically the whole > notification subsystem gets eventually stuck. > > So to be able to drop fsnotify_mark_srcu lock while waiting for > response, we have to pin the mark in memory and make sure it stays in > the object list (as removing the mark waiting for response could lead to > lost notification events for groups later in the list). However we don't > want inode reclaim to block on such mark as that would lead to system > just locking up elsewhere. > > This commit tries to pave a way towards solving these conflicting > lifetime needs. Instead of anchoring the list of marks directly in the > object, we anchor it in a dedicated structure (fsnotify_mark_connector) and > just point to that structure from the object. Also the list is protected > by a spinlock contained in that structure. With this, we can detach > notification marks from object without having to modify the list itself. > > Signed-off-by: Jan Kara <jack@xxxxxxx> This looks much better to me then v1. I reviewed most of the comments I gave for v1 on other patches. There was nothing more serious then inaccurate comment anyway. I'll give the whole series another pass next week. Amir. > --- > fs/inode.c | 3 - > fs/mount.h | 2 +- > fs/namespace.c | 3 - > fs/notify/dnotify/dnotify.c | 3 +- > fs/notify/fdinfo.c | 12 +- > fs/notify/fsnotify.c | 31 +++- > fs/notify/fsnotify.h | 38 +---- > fs/notify/inode_mark.c | 94 +----------- > fs/notify/mark.c | 316 +++++++++++++++++++++++++++++---------- > fs/notify/vfsmount_mark.c | 64 +------- > include/linux/fs.h | 4 +- > include/linux/fsnotify_backend.h | 43 ++++-- > kernel/audit_tree.c | 30 +++- > kernel/auditsc.c | 4 +- > 14 files changed, 339 insertions(+), 308 deletions(-) > > diff --git a/fs/inode.c b/fs/inode.c > index 88110fd0b282..131b2bcebc48 100644 > --- a/fs/inode.c > +++ b/fs/inode.c > @@ -371,9 +371,6 @@ void inode_init_once(struct inode *inode) > INIT_LIST_HEAD(&inode->i_lru); > address_space_init_once(&inode->i_data); > i_size_ordered_init(inode); > -#ifdef CONFIG_FSNOTIFY > - INIT_HLIST_HEAD(&inode->i_fsnotify_marks); > -#endif > } > EXPORT_SYMBOL(inode_init_once); > > diff --git a/fs/mount.h b/fs/mount.h > index 2c856fc47ae3..e1dc79713abe 100644 > --- a/fs/mount.h > +++ b/fs/mount.h > @@ -59,7 +59,7 @@ struct mount { > struct mountpoint *mnt_mp; /* where is it mounted */ > struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ > #ifdef CONFIG_FSNOTIFY > - struct hlist_head mnt_fsnotify_marks; > + struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; > __u32 mnt_fsnotify_mask; > #endif > int mnt_id; /* mount identifier */ > diff --git a/fs/namespace.c b/fs/namespace.c > index b5b1259e064f..22a91352e2d7 100644 > --- a/fs/namespace.c > +++ b/fs/namespace.c > @@ -233,9 +233,6 @@ static struct mount *alloc_vfsmnt(const char *name) > INIT_LIST_HEAD(&mnt->mnt_slave_list); > INIT_LIST_HEAD(&mnt->mnt_slave); > INIT_HLIST_NODE(&mnt->mnt_mp_list); > -#ifdef CONFIG_FSNOTIFY > - INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); > -#endif > init_fs_pin(&mnt->mnt_umount, drop_mountpoint); > } > return mnt; > diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c > index 5a4ec309e283..41b2a070761c 100644 > --- a/fs/notify/dnotify/dnotify.c > +++ b/fs/notify/dnotify/dnotify.c > @@ -69,8 +69,7 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) > if (old_mask == new_mask) > return; > > - if (fsn_mark->inode) > - fsnotify_recalc_inode_mask(fsn_mark->inode); > + fsnotify_recalc_mask(fsn_mark->connector); > } > > /* > diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c > index 601a59c8d87e..dd63aa9a6f9a 100644 > --- a/fs/notify/fdinfo.c > +++ b/fs/notify/fdinfo.c > @@ -76,11 +76,11 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) > struct inotify_inode_mark *inode_mark; > struct inode *inode; > > - if (!(mark->flags & FSNOTIFY_MARK_FLAG_INODE)) > + if (!(mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE)) > return; > > inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); > - inode = igrab(mark->inode); > + inode = igrab(mark->connector->inode); > if (inode) { > /* > * IN_ALL_EVENTS represents all of the mask bits > @@ -115,8 +115,8 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) > if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) > mflags |= FAN_MARK_IGNORED_SURV_MODIFY; > > - if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { > - inode = igrab(mark->inode); > + if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE) { > + inode = igrab(mark->connector->inode); > if (!inode) > return; > seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", > @@ -125,8 +125,8 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) > show_mark_fhandle(m, inode); > seq_putc(m, '\n'); > iput(inode); > - } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) { > - struct mount *mnt = real_mount(mark->mnt); > + } else if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) { > + struct mount *mnt = real_mount(mark->connector->mnt); > > seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", > mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); > diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c > index b41515d3f081..d512ef9f75fc 100644 > --- a/fs/notify/fsnotify.c > +++ b/fs/notify/fsnotify.c > @@ -193,6 +193,7 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, > struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; > struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; > struct fsnotify_group *inode_group, *vfsmount_group; > + struct fsnotify_mark_connector *inode_conn, *vfsmount_conn; > struct mount *mnt; > int idx, ret = 0; > /* global tests shouldn't care about events on child only the specific event */ > @@ -210,8 +211,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, > * SRCU because we have no references to any objects and do not > * need SRCU to keep them "alive". > */ > - if (hlist_empty(&to_tell->i_fsnotify_marks) && > - (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks))) > + if (!to_tell->i_fsnotify_marks && > + (!mnt || !mnt->mnt_fsnotify_marks)) > return 0; > /* > * if this is a modify event we may need to clear the ignored masks > @@ -226,16 +227,27 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, > idx = srcu_read_lock(&fsnotify_mark_srcu); > > if ((mask & FS_MODIFY) || > - (test_mask & to_tell->i_fsnotify_mask)) > - inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, > + (test_mask & to_tell->i_fsnotify_mask)) { > + inode_conn = srcu_dereference(to_tell->i_fsnotify_marks, > &fsnotify_mark_srcu); > + if (inode_conn) > + inode_node = srcu_dereference(inode_conn->list.first, > + &fsnotify_mark_srcu); > + } > > if (mnt && ((mask & FS_MODIFY) || > (test_mask & mnt->mnt_fsnotify_mask))) { > - vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first, > - &fsnotify_mark_srcu); > - inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, > + inode_conn = srcu_dereference(to_tell->i_fsnotify_marks, > &fsnotify_mark_srcu); > + if (inode_conn) > + inode_node = srcu_dereference(inode_conn->list.first, > + &fsnotify_mark_srcu); > + vfsmount_conn = srcu_dereference(mnt->mnt_fsnotify_marks, > + &fsnotify_mark_srcu); > + if (vfsmount_conn) > + vfsmount_node = srcu_dereference( > + vfsmount_conn->list.first, > + &fsnotify_mark_srcu); > } > > /* > @@ -293,6 +305,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, > } > EXPORT_SYMBOL_GPL(fsnotify); > > +extern struct kmem_cache *fsnotify_mark_connector_cachep; > + > static __init int fsnotify_init(void) > { > int ret; > @@ -303,6 +317,9 @@ static __init int fsnotify_init(void) > if (ret) > panic("initializing fsnotify_mark_srcu"); > > + fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector, > + SLAB_PANIC); > + > return 0; > } > core_initcall(fsnotify_init); > diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h > index 0a3bc2cf192c..24bc549e9709 100644 > --- a/fs/notify/fsnotify.h > +++ b/fs/notify/fsnotify.h > @@ -14,47 +14,25 @@ extern void fsnotify_flush_notify(struct fsnotify_group *group); > /* protects reads of inode and vfsmount marks list */ > extern struct srcu_struct fsnotify_mark_srcu; > > -/* Calculate mask of events for a list of marks */ > -extern u32 fsnotify_recalc_mask(struct hlist_head *head); > - > /* compare two groups for sorting of marks lists */ > extern int fsnotify_compare_groups(struct fsnotify_group *a, > struct fsnotify_group *b); > > -extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark, > - __u32 mask); > -/* Add mark to a proper place in mark list */ > -extern int fsnotify_add_mark_list(struct hlist_head *head, > - struct fsnotify_mark *mark, > - int allow_dups); > -/* add a mark to an inode */ > -extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark, > - struct fsnotify_group *group, struct inode *inode, > - int allow_dups); > -/* add a mark to a vfsmount */ > -extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, > - struct fsnotify_group *group, struct vfsmount *mnt, > - int allow_dups); > - > -/* vfsmount specific destruction of a mark */ > -extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); > -/* inode specific destruction of a mark */ > -extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); > -/* Find mark belonging to given group in the list of marks */ > -extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, > - struct fsnotify_group *group); > -/* Destroy all marks in the given list protected by 'lock' */ > -extern void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock); > +/* Find mark belonging to given group attached to given connector */ > +extern struct fsnotify_mark *fsnotify_find_mark( > + struct fsnotify_mark_connector **connp, > + struct fsnotify_group *group); > +/* Destroy all marks connected via given connector */ > +extern void fsnotify_destroy_marks(struct fsnotify_mark_connector **connp); > /* run the list of all marks associated with inode and destroy them */ > static inline void fsnotify_clear_marks_by_inode(struct inode *inode) > { > - fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock); > + fsnotify_destroy_marks(&inode->i_fsnotify_marks); > } > /* run the list of all marks associated with vfsmount and destroy them */ > static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) > { > - fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks, > - &mnt->mnt_root->d_lock); > + fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks); > } > /* prepare for freeing all marks associated with given group */ > extern void fsnotify_detach_group_marks(struct fsnotify_group *group); > diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c > index a3645249f7ec..b9370316727e 100644 > --- a/fs/notify/inode_mark.c > +++ b/fs/notify/inode_mark.c > @@ -30,38 +30,9 @@ > > #include "../internal.h" > > -/* > - * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types > - * any notifier is interested in hearing for this inode. > - */ > void fsnotify_recalc_inode_mask(struct inode *inode) > { > - spin_lock(&inode->i_lock); > - inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); > - spin_unlock(&inode->i_lock); > - > - __fsnotify_update_child_dentry_flags(inode); > -} > - > -void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) > -{ > - struct inode *inode = mark->inode; > - > - BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); > - assert_spin_locked(&mark->lock); > - > - spin_lock(&inode->i_lock); > - > - hlist_del_init_rcu(&mark->obj_list); > - mark->inode = NULL; > - > - /* > - * this mark is now off the inode->i_fsnotify_marks list and we > - * hold the inode->i_lock, so this is the perfect time to update the > - * inode->i_fsnotify_mask > - */ > - inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); > - spin_unlock(&inode->i_lock); > + fsnotify_recalc_mask(inode->i_fsnotify_marks); > } > > /* > @@ -69,7 +40,7 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) > */ > void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) > { > - fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_INODE); > + fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_OBJ_TYPE_INODE); > } > > /* > @@ -79,66 +50,7 @@ void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) > struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, > struct inode *inode) > { > - struct fsnotify_mark *mark; > - > - spin_lock(&inode->i_lock); > - mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); > - spin_unlock(&inode->i_lock); > - > - return mark; > -} > - > -/* > - * If we are setting a mark mask on an inode mark we should pin the inode > - * in memory. > - */ > -void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark, > - __u32 mask) > -{ > - struct inode *inode; > - > - assert_spin_locked(&mark->lock); > - > - if (mask && > - mark->inode && > - !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) { > - mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED; > - inode = igrab(mark->inode); > - /* > - * we shouldn't be able to get here if the inode wasn't > - * already safely held in memory. But bug in case it > - * ever is wrong. > - */ > - BUG_ON(!inode); > - } > -} > - > -/* > - * Attach an initialized mark to a given inode. > - * These marks may be used for the fsnotify backend to determine which > - * event types should be delivered to which group and for which inodes. These > - * marks are ordered according to priority, highest number first, and then by > - * the group's location in memory. > - */ > -int fsnotify_add_inode_mark(struct fsnotify_mark *mark, > - struct fsnotify_group *group, struct inode *inode, > - int allow_dups) > -{ > - int ret; > - > - mark->flags |= FSNOTIFY_MARK_FLAG_INODE; > - > - BUG_ON(!mutex_is_locked(&group->mark_mutex)); > - assert_spin_locked(&mark->lock); > - > - spin_lock(&inode->i_lock); > - mark->inode = inode; > - ret = fsnotify_add_mark_list(&inode->i_fsnotify_marks, mark, > - allow_dups); > - inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); > - spin_unlock(&inode->i_lock); > - > - return ret; > + return fsnotify_find_mark(&inode->i_fsnotify_marks, group); > } > > /** > diff --git a/fs/notify/mark.c b/fs/notify/mark.c > index 44836e539169..97c68589742e 100644 > --- a/fs/notify/mark.c > +++ b/fs/notify/mark.c > @@ -33,7 +33,7 @@ > * > * group->mark_mutex > * mark->lock > - * inode->i_lock > + * mark->connector->lock > * > * group->mark_mutex protects the marks_list anchored inside a given group and > * each mark is hooked via the g_list. It also protects the groups private > @@ -44,10 +44,12 @@ > * is assigned to as well as the access to a reference of the inode/vfsmount > * that is being watched by the mark. > * > - * inode->i_lock protects the i_fsnotify_marks list anchored inside a > - * given inode and each mark is hooked via the i_list. (and sorta the > - * free_i_list) > + * mark->connector->lock protects the list of marks anchored inside an > + * inode / vfsmount and each mark is hooked via the i_list. > * > + * A list of notification marks relating to inode / mnt is contained in > + * fsnotify_mark_connector. That structure is alive as long as there are any marks > + * in the list and is also protected by fsnotify_mark_srcu. > * > * LIFETIME: > * Inode marks survive between when they are added to an inode and when their > @@ -83,12 +85,18 @@ > #define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */ > > struct srcu_struct fsnotify_mark_srcu; > +struct kmem_cache *fsnotify_mark_connector_cachep; > + > static DEFINE_SPINLOCK(destroy_lock); > static LIST_HEAD(destroy_list); > +static struct fsnotify_mark_connector *connector_destroy_list; > > static void fsnotify_mark_destroy_workfn(struct work_struct *work); > static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn); > > +static void fsnotify_connector_destroy_workfn(struct work_struct *work); > +static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn); > + > void fsnotify_get_mark(struct fsnotify_mark *mark) > { > atomic_inc(&mark->refcnt); > @@ -103,15 +111,97 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) > } > } > > -/* Calculate mask of events for a list of marks */ > -u32 fsnotify_recalc_mask(struct hlist_head *head) > +static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) > { > u32 new_mask = 0; > struct fsnotify_mark *mark; > > - hlist_for_each_entry(mark, head, obj_list) > - new_mask |= mark->mask; > - return new_mask; > + assert_spin_locked(&conn->lock); > + hlist_for_each_entry(mark, &conn->list, obj_list) { > + if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) > + new_mask |= mark->mask; > + } > + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) > + conn->inode->i_fsnotify_mask = new_mask; > + else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) > + real_mount(conn->mnt)->mnt_fsnotify_mask = new_mask; > +} > + > +/* > + * Calculate mask of events for a list of marks. The caller must make sure list > + * cannot disappear under us (usually by holding a mark->lock or > + * mark->group->mark_mutex for a mark on this list). > + */ > +void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) > +{ > + struct inode *inode = NULL; > + > + spin_lock(&conn->lock); > + __fsnotify_recalc_mask(conn); > + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) > + inode = igrab(conn->inode); > + spin_unlock(&conn->lock); > + if (inode) { > + __fsnotify_update_child_dentry_flags(inode); > + iput(inode); > + } > +} > + > +/* Free all connectors queued for freeing once SRCU period ends */ > +static void fsnotify_connector_destroy_workfn(struct work_struct *work) > +{ > + struct fsnotify_mark_connector *conn, *free; > + > + spin_lock(&destroy_lock); > + conn = connector_destroy_list; > + connector_destroy_list = NULL; > + spin_unlock(&destroy_lock); > + > + synchronize_srcu(&fsnotify_mark_srcu); > + while (conn) { > + free = conn; > + conn = conn->destroy_next; > + kmem_cache_free(fsnotify_mark_connector_cachep, free); > + } > +} > + > +static struct inode *fsnotify_detach_from_object(struct fsnotify_mark *mark) > +{ > + struct fsnotify_mark_connector *conn; > + struct inode *inode = NULL; > + bool free_conn = false; > + > + conn = mark->connector; > + spin_lock(&conn->lock); > + hlist_del_init_rcu(&mark->obj_list); > + if (hlist_empty(&conn->list)) { > + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) { > + inode = conn->inode; > + inode->i_fsnotify_marks = NULL; > + inode->i_fsnotify_mask = 0; > + conn->inode = NULL; > + conn->flags &= ~FSNOTIFY_OBJ_TYPE_INODE; > + } else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) { > + real_mount(conn->mnt)->mnt_fsnotify_marks = NULL; > + real_mount(conn->mnt)->mnt_fsnotify_mask = 0; > + conn->mnt = NULL; > + conn->flags &= ~FSNOTIFY_OBJ_TYPE_VFSMOUNT; > + } > + free_conn = true; > + } else > + __fsnotify_recalc_mask(conn); > + mark->connector = NULL; > + spin_unlock(&conn->lock); > + > + if (free_conn) { > + spin_lock(&destroy_lock); > + conn->destroy_next = connector_destroy_list; > + connector_destroy_list = conn; > + spin_unlock(&destroy_lock); > + queue_work(system_unbound_wq, &connector_reaper_work); > + } > + > + return inode; > } > > /* > @@ -137,13 +227,8 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) > > mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; > > - if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { > - inode = mark->inode; > - fsnotify_destroy_inode_mark(mark); > - } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) > - fsnotify_destroy_vfsmount_mark(mark); > - else > - BUG(); > + inode = fsnotify_detach_from_object(mark); > + > /* > * Note that we didn't update flags telling whether inode cares about > * what's happening with children. We update these flags from > @@ -155,7 +240,7 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) > > spin_unlock(&mark->lock); > > - if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) > + if (inode) > iput(inode); > > atomic_dec(&group->num_marks); > @@ -220,45 +305,11 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, > fsnotify_free_mark(mark); > } > > -void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock) > -{ > - struct fsnotify_mark *mark; > - > - while (1) { > - /* > - * We have to be careful since we can race with e.g. > - * fsnotify_clear_marks_by_group() and once we drop 'lock', > - * mark can get removed from the obj_list and destroyed. But > - * we are holding mark reference so mark cannot be freed and > - * calling fsnotify_destroy_mark() more than once is fine. > - */ > - spin_lock(lock); > - if (hlist_empty(head)) { > - spin_unlock(lock); > - break; > - } > - mark = hlist_entry(head->first, struct fsnotify_mark, obj_list); > - /* > - * We don't update i_fsnotify_mask / mnt_fsnotify_mask here > - * since inode / mount is going away anyway. So just remove > - * mark from the list. > - */ > - hlist_del_init_rcu(&mark->obj_list); > - fsnotify_get_mark(mark); > - spin_unlock(lock); > - fsnotify_destroy_mark(mark, mark->group); > - fsnotify_put_mark(mark); > - } > -} > - > void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) > { > assert_spin_locked(&mark->lock); > > mark->mask = mask; > - > - if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) > - fsnotify_set_inode_mark_mask_locked(mark, mask); > } > > void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask) > @@ -304,37 +355,118 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) > return -1; > } > > -/* Add mark into proper place in given list of marks */ > -int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark, > - int allow_dups) > +/* > + * Get mark connector, make sure it is alive and return with its lock held. > + * This is for users that get connector pointer from inode or mount. Users that > + * hold reference to a mark on the list may directly lock connector->lock as > + * they are sure list cannot go away under them. > + */ > +static struct fsnotify_mark_connector *fsnotify_grab_connector( > + struct fsnotify_mark_connector * __rcu *connp) > +{ > + struct fsnotify_mark_connector *conn; > + int idx; > + > + idx = srcu_read_lock(&fsnotify_mark_srcu); > + conn = srcu_dereference(*connp, &fsnotify_mark_srcu); > + if (!conn) > + goto out; > + spin_lock(&conn->lock); > + if (!(conn->flags & (FSNOTIFY_OBJ_TYPE_INODE | > + FSNOTIFY_OBJ_TYPE_VFSMOUNT))) { > + spin_unlock(&conn->lock); > + srcu_read_unlock(&fsnotify_mark_srcu, idx); > + return NULL; > + } > +out: > + srcu_read_unlock(&fsnotify_mark_srcu, idx); > + return conn; > +} > + > +static void fsnotify_put_connector(struct fsnotify_mark_connector *conn) > +{ > + spin_unlock(&conn->lock); > +} > + > +/* > + * Add mark into proper place in given list of marks. These marks may be used > + * for the fsnotify backend to determine which event types should be delivered > + * to which group and for which inodes. These marks are ordered according to > + * priority, highest number first, and then by the group's location in memory. > + */ > +static int fsnotify_add_mark_list(struct fsnotify_mark *mark, > + struct inode *inode, struct vfsmount *mnt, > + int allow_dups) > { > struct fsnotify_mark *lmark, *last = NULL; > + struct fsnotify_mark_connector *conn; > + struct fsnotify_mark_connector **connp; > int cmp; > + int err = 0; > + > + BUG_ON(!inode && !mnt); > + if (inode) > + connp = &inode->i_fsnotify_marks; > + else > + connp = &real_mount(mnt)->mnt_fsnotify_marks; > +restart: > + spin_lock(&mark->lock); > + conn = fsnotify_grab_connector(connp); > + if (!conn) { > + spin_unlock(&mark->lock); > + conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, > + GFP_KERNEL); > + if (!conn) > + return -ENOMEM; > + spin_lock_init(&conn->lock); > + INIT_HLIST_HEAD(&conn->list); > + if (inode) { > + conn->flags = FSNOTIFY_OBJ_TYPE_INODE; > + conn->inode = igrab(inode); > + } else { > + conn->flags = FSNOTIFY_OBJ_TYPE_VFSMOUNT; > + conn->mnt = mnt; > + } > + if (cmpxchg(connp, NULL, conn)) { > + /* Someone else created list structure for us */ > + if (inode) > + iput(inode); > + kmem_cache_free(fsnotify_mark_connector_cachep, conn); > + } > + goto restart; > + } > > /* is mark the first mark? */ > - if (hlist_empty(head)) { > - hlist_add_head_rcu(&mark->obj_list, head); > - return 0; > + if (hlist_empty(&conn->list)) { > + hlist_add_head_rcu(&mark->obj_list, &conn->list); > + goto added; > } > > /* should mark be in the middle of the current list? */ > - hlist_for_each_entry(lmark, head, obj_list) { > + hlist_for_each_entry(lmark, &conn->list, obj_list) { > last = lmark; > > - if ((lmark->group == mark->group) && !allow_dups) > - return -EEXIST; > + if ((lmark->group == mark->group) && !allow_dups) { > + err = -EEXIST; > + goto out_err; > + } > > cmp = fsnotify_compare_groups(lmark->group, mark->group); > if (cmp >= 0) { > hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list); > - return 0; > + goto added; > } > } > > BUG_ON(last == NULL); > /* mark should be the last entry. last is the current last entry */ > hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); > - return 0; > +added: > + mark->connector = conn; > +out_err: > + fsnotify_put_connector(conn); > + spin_unlock(&mark->lock); > + return err; > } > > /* > @@ -356,7 +488,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, > * LOCKING ORDER!!!! > * group->mark_mutex > * mark->lock > - * inode->i_lock > + * mark->connector->lock > */ > spin_lock(&mark->lock); > mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED; > @@ -366,25 +498,14 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, > list_add(&mark->g_list, &group->marks_list); > atomic_inc(&group->num_marks); > fsnotify_get_mark(mark); /* for i_list and g_list */ > - > - if (inode) { > - ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups); > - if (ret) > - goto err; > - } else if (mnt) { > - ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups); > - if (ret) > - goto err; > - } else { > - BUG(); > - } > - > - /* this will pin the object if appropriate */ > - fsnotify_set_mark_mask_locked(mark, mark->mask); > spin_unlock(&mark->lock); > > - if (inode) > - __fsnotify_update_child_dentry_flags(inode); > + ret = fsnotify_add_mark_list(mark, inode, mnt, allow_dups); > + if (ret) > + goto err; > + > + if (mark->mask) > + fsnotify_recalc_mask(mark->connector); > > return ret; > err: > @@ -419,17 +540,23 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, > * Given a list of marks, find the mark associated with given group. If found > * take a reference to that mark and return it, else return NULL. > */ > -struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, > +struct fsnotify_mark *fsnotify_find_mark(struct fsnotify_mark_connector **connp, > struct fsnotify_group *group) > { > struct fsnotify_mark *mark; > + struct fsnotify_mark_connector *conn; > > - hlist_for_each_entry(mark, head, obj_list) { > + conn = fsnotify_grab_connector(connp); > + if (!conn) > + return NULL; > + hlist_for_each_entry(mark, &conn->list, obj_list) { > if (mark->group == group) { > fsnotify_get_mark(mark); > + fsnotify_put_connector(conn); > return mark; > } > } > + fsnotify_put_connector(conn); > return NULL; > } > > @@ -453,7 +580,7 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, > */ > mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); > list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { > - if (mark->flags & flags) > + if (mark->connector->flags & flags) > list_move(&mark->g_list, &to_free); > } > mutex_unlock(&group->mark_mutex); > @@ -499,6 +626,29 @@ void fsnotify_detach_group_marks(struct fsnotify_group *group) > } > } > > +/* Destroy all marks attached to inode / vfsmount */ > +void fsnotify_destroy_marks(struct fsnotify_mark_connector **connp) > +{ > + struct fsnotify_mark_connector *conn; > + struct fsnotify_mark *mark; > + > + while ((conn = fsnotify_grab_connector(connp))) { > + /* > + * We have to be careful since we can race with e.g. > + * fsnotify_clear_marks_by_group() and once we drop the list > + * lock, mark can get removed from the obj_list and destroyed. > + * But we are holding mark reference so mark cannot be freed > + * and calling fsnotify_destroy_mark() more than once is fine. > + */ > + mark = hlist_entry(conn->list.first, struct fsnotify_mark, > + obj_list); > + fsnotify_get_mark(mark); > + fsnotify_put_connector(conn); > + fsnotify_destroy_mark(mark, mark->group); > + fsnotify_put_mark(mark); > + } > +} > + > /* > * Nothing fancy, just initialize lists and locks and counters. > */ > diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c > index a8fcab68faef..8fc1aca21bcf 100644 > --- a/fs/notify/vfsmount_mark.c > +++ b/fs/notify/vfsmount_mark.c > @@ -29,39 +29,14 @@ > #include <linux/fsnotify_backend.h> > #include "fsnotify.h" > > -void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) > -{ > - fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_VFSMOUNT); > -} > - > -/* > - * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types > - * any notifier is interested in hearing for this mount point > - */ > void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt) > { > - struct mount *m = real_mount(mnt); > - > - spin_lock(&mnt->mnt_root->d_lock); > - m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); > - spin_unlock(&mnt->mnt_root->d_lock); > + fsnotify_recalc_mask(real_mount(mnt)->mnt_fsnotify_marks); > } > > -void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) > +void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) > { > - struct vfsmount *mnt = mark->mnt; > - struct mount *m = real_mount(mnt); > - > - BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); > - assert_spin_locked(&mark->lock); > - > - spin_lock(&mnt->mnt_root->d_lock); > - > - hlist_del_init_rcu(&mark->obj_list); > - mark->mnt = NULL; > - > - m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); > - spin_unlock(&mnt->mnt_root->d_lock); > + fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT); > } > > /* > @@ -72,37 +47,6 @@ struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, > struct vfsmount *mnt) > { > struct mount *m = real_mount(mnt); > - struct fsnotify_mark *mark; > - > - spin_lock(&mnt->mnt_root->d_lock); > - mark = fsnotify_find_mark(&m->mnt_fsnotify_marks, group); > - spin_unlock(&mnt->mnt_root->d_lock); > - > - return mark; > -} > - > -/* > - * Attach an initialized mark to a given group and vfsmount. > - * These marks may be used for the fsnotify backend to determine which > - * event types should be delivered to which groups. > - */ > -int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, > - struct fsnotify_group *group, struct vfsmount *mnt, > - int allow_dups) > -{ > - struct mount *m = real_mount(mnt); > - int ret; > - > - mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; > - > - BUG_ON(!mutex_is_locked(&group->mark_mutex)); > - assert_spin_locked(&mark->lock); > - > - spin_lock(&mnt->mnt_root->d_lock); > - mark->mnt = mnt; > - ret = fsnotify_add_mark_list(&m->mnt_fsnotify_marks, mark, allow_dups); > - m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); > - spin_unlock(&mnt->mnt_root->d_lock); > > - return ret; > + return fsnotify_find_mark(&m->mnt_fsnotify_marks, group); > } > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 2ba074328894..8fc09316f996 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -545,6 +545,8 @@ is_uncached_acl(struct posix_acl *acl) > #define IOP_XATTR 0x0008 > #define IOP_DEFAULT_READLINK 0x0010 > > +struct fsnotify_mark_connector; > + > /* > * Keep mostly read-only and often accessed (especially for > * the RCU path lookup and 'stat' data) fields at the beginning > @@ -644,7 +646,7 @@ struct inode { > > #ifdef CONFIG_FSNOTIFY > __u32 i_fsnotify_mask; /* all events this inode cares about */ > - struct hlist_head i_fsnotify_marks; > + struct fsnotify_mark_connector __rcu *i_fsnotify_marks; > #endif > > #if IS_ENABLED(CONFIG_FS_ENCRYPTION) > diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h > index 487246546ebe..f3df68a64ee6 100644 > --- a/include/linux/fsnotify_backend.h > +++ b/include/linux/fsnotify_backend.h > @@ -194,6 +194,28 @@ struct fsnotify_group { > #define FSNOTIFY_EVENT_INODE 2 > > /* > + * Inode / vfsmount point to this structure which tracks all marks attached to > + * the inode / vfsmount. The reference to inode / vfsmount is held by this > + * structure. We destroy this structure when there are no more marks attached > + * to it. The structure is protected by fsnotify_mark_srcu. > + */ > +struct fsnotify_mark_connector { > + spinlock_t lock; > +#define FSNOTIFY_OBJ_TYPE_INODE 0x01 > +#define FSNOTIFY_OBJ_TYPE_VFSMOUNT 0x02 > + unsigned int flags; /* Type of object [lock] */ > + union { /* Object pointer [lock] */ > + struct inode *inode; > + struct vfsmount *mnt; > + }; > + union { > + struct hlist_head list; > + /* Used listing heads to free after srcu period expires */ > + struct fsnotify_mark_connector *destroy_next; > + }; > +}; > + > +/* > * A mark is simply an object attached to an in core inode which allows an > * fsnotify listener to indicate they are either no longer interested in events > * of a type matching mask or only interested in those events. > @@ -222,20 +244,15 @@ struct fsnotify_mark { > struct list_head g_list; > /* Protects inode / mnt pointers, flags, masks */ > spinlock_t lock; > - /* List of marks for inode / vfsmount [obj_lock] */ > + /* List of marks for inode / vfsmount [connector->lock] */ > struct hlist_node obj_list; > - union { /* Object pointer [mark->lock, group->mark_mutex] */ > - struct inode *inode; /* inode this mark is associated with */ > - struct vfsmount *mnt; /* vfsmount this mark is associated with */ > - }; > + /* Head of list of marks for an object [mark->lock, group->mark_mutex] */ > + struct fsnotify_mark_connector *connector; > /* Events types to ignore [mark->lock, group->mark_mutex] */ > __u32 ignored_mask; > -#define FSNOTIFY_MARK_FLAG_INODE 0x01 > -#define FSNOTIFY_MARK_FLAG_VFSMOUNT 0x02 > -#define FSNOTIFY_MARK_FLAG_OBJECT_PINNED 0x04 > -#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08 > -#define FSNOTIFY_MARK_FLAG_ALIVE 0x10 > -#define FSNOTIFY_MARK_FLAG_ATTACHED 0x20 > +#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x01 > +#define FSNOTIFY_MARK_FLAG_ALIVE 0x02 > +#define FSNOTIFY_MARK_FLAG_ATTACHED 0x04 > unsigned int flags; /* flags [mark->lock] */ > void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */ > }; > @@ -314,6 +331,8 @@ extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group > > /* functions used to manipulate the marks attached to inodes */ > > +/* Calculate mask of events for a list of marks */ > +extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn); > /* run all marks associated with a vfsmount and update mnt->mnt_fsnotify_mask */ > extern void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt); > /* run all marks associated with an inode and update inode->i_fsnotify_mask */ > @@ -343,7 +362,7 @@ extern void fsnotify_free_mark(struct fsnotify_mark *mark); > extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group); > /* run all the marks in a group, and clear all of the inode marks */ > extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group); > -/* run all the marks in a group, and clear all of the marks where mark->flags & flags is true*/ > +/* run all the marks in a group, and clear all of the marks attached to given object type */ > extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags); > extern void fsnotify_get_mark(struct fsnotify_mark *mark); > extern void fsnotify_put_mark(struct fsnotify_mark *mark); > diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c > index 11c7ac441624..a878a98f55b3 100644 > --- a/kernel/audit_tree.c > +++ b/kernel/audit_tree.c > @@ -172,10 +172,25 @@ static unsigned long inode_to_key(const struct inode *inode) > /* > * Function to return search key in our hash from chunk. Key 0 is special and > * should never be present in the hash. > + * > + * Must be called with chunk->mark.lock held to protect from connector > + * becoming NULL. > */ > +static unsigned long __chunk_to_key(struct audit_chunk *chunk) > +{ > + if (!chunk->mark.connector) > + return 0; > + return (unsigned long)chunk->mark.connector->inode; > +} > + > static unsigned long chunk_to_key(struct audit_chunk *chunk) > { > - return (unsigned long)chunk->mark.inode; > + unsigned long key; > + > + spin_lock(&chunk->mark.lock); > + key = __chunk_to_key(chunk); > + spin_unlock(&chunk->mark.lock); > + return key; > } > > static inline struct list_head *chunk_hash(unsigned long key) > @@ -187,7 +202,7 @@ static inline struct list_head *chunk_hash(unsigned long key) > /* hash_lock & entry->lock is held by caller */ > static void insert_hash(struct audit_chunk *chunk) > { > - unsigned long key = chunk_to_key(chunk); > + unsigned long key = __chunk_to_key(chunk); > struct list_head *list; > > if (!key) > @@ -248,7 +263,8 @@ static void untag_chunk(struct node *p) > > mutex_lock(&entry->group->mark_mutex); > spin_lock(&entry->lock); > - if (chunk->dead || !entry->inode) { > + if (chunk->dead || !entry->connector || > + !entry->connector->inode) { > spin_unlock(&entry->lock); > mutex_unlock(&entry->group->mark_mutex); > if (new) > @@ -276,8 +292,8 @@ static void untag_chunk(struct node *p) > if (!new) > goto Fallback; > > - if (fsnotify_add_mark_locked(&new->mark, entry->group, entry->inode, > - NULL, 1)) { > + if (fsnotify_add_mark_locked(&new->mark, entry->group, > + entry->connector->inode, NULL, 1)) { > fsnotify_put_mark(&new->mark); > goto Fallback; > } > @@ -408,7 +424,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) > > mutex_lock(&old_entry->group->mark_mutex); > spin_lock(&old_entry->lock); > - if (!old_entry->inode) { > + if (!old_entry->connector || !old_entry->connector->inode) { > /* old_entry is being shot, lets just lie */ > spin_unlock(&old_entry->lock); > mutex_unlock(&old_entry->group->mark_mutex); > @@ -418,7 +434,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) > } > > if (fsnotify_add_mark_locked(chunk_entry, old_entry->group, > - old_entry->inode, NULL, 1)) { > + old_entry->connector->inode, NULL, 1)) { > spin_unlock(&old_entry->lock); > mutex_unlock(&old_entry->group->mark_mutex); > fsnotify_put_mark(chunk_entry); > diff --git a/kernel/auditsc.c b/kernel/auditsc.c > index cf1fa43512c1..e990de915341 100644 > --- a/kernel/auditsc.c > +++ b/kernel/auditsc.c > @@ -1591,7 +1591,7 @@ static inline void handle_one(const struct inode *inode) > struct audit_tree_refs *p; > struct audit_chunk *chunk; > int count; > - if (likely(hlist_empty(&inode->i_fsnotify_marks))) > + if (likely(!inode->i_fsnotify_marks)) > return; > context = current->audit_context; > p = context->trees; > @@ -1634,7 +1634,7 @@ static void handle_path(const struct dentry *dentry) > seq = read_seqbegin(&rename_lock); > for(;;) { > struct inode *inode = d_backing_inode(d); > - if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) { > + if (inode && unlikely(inode->i_fsnotify_marks)) { > struct audit_chunk *chunk; > chunk = audit_tree_lookup(inode); > if (chunk) { > -- > 2.10.2 > -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html