On Tue, Mar 28, 2017 at 06:13:16PM +0200, Jan Kara wrote: > Currently we free fsnotify_mark_connector structure only when inode / > vfsmount is getting freed. This can however impose noticeable memory > overhead when marks get attached to inodes only temporarily. So free the > connector structure once the last mark is detached from the object. > Since notification infrastructure can be working with the connector > under the protection of fsnotify_mark_srcu, we have to be careful and > free the fsnotify_mark_connector only after SRCU period passes. Hmm, I'm thinking about the corner case when connector gets created and removed repeatedly. Is this a potential DoS problem? Should perhaps creating new connectors be throttled when the destroy_list becomes too large? Probably not a big issue, but perhaps needs to be addressed in some way at the end of the series... > > Reviewed-by: Amir Goldstein <amir73il@xxxxxxxxx> > Signed-off-by: Jan Kara <jack@xxxxxxx> > --- > fs/inode.c | 3 - > fs/mount.h | 2 +- > fs/namespace.c | 3 - > fs/notify/fsnotify.c | 9 ++- > fs/notify/fsnotify.h | 10 +-- > fs/notify/inode_mark.c | 2 +- > fs/notify/mark.c | 137 ++++++++++++++++++++++++++++----------- > fs/notify/vfsmount_mark.c | 2 +- > include/linux/fs.h | 2 +- > include/linux/fsnotify_backend.h | 11 ++-- > kernel/auditsc.c | 6 +- > 11 files changed, 123 insertions(+), 64 deletions(-) > > diff --git a/fs/inode.c b/fs/inode.c > index 750e952d2918..131b2bcebc48 100644 > --- a/fs/inode.c > +++ b/fs/inode.c > @@ -234,9 +234,6 @@ void __destroy_inode(struct inode *inode) > inode_detach_wb(inode); > security_inode_free(inode); > fsnotify_inode_delete(inode); > -#ifdef CONFIG_FSNOTIFY > - fsnotify_connector_free(&inode->i_fsnotify_marks); > -#endif > locks_free_lock_context(inode); > if (!inode->i_nlink) { > WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); > diff --git a/fs/mount.h b/fs/mount.h > index bc409360a03b..bf1fda6eed8f 100644 > --- a/fs/mount.h > +++ b/fs/mount.h > @@ -59,7 +59,7 @@ struct mount { > struct mountpoint *mnt_mp; /* where is it mounted */ > struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ > #ifdef CONFIG_FSNOTIFY > - struct fsnotify_mark_connector *mnt_fsnotify_marks; > + struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; > __u32 mnt_fsnotify_mask; > #endif > int mnt_id; /* mount identifier */ > diff --git a/fs/namespace.c b/fs/namespace.c > index 2625e1d97a3a..b3b115bd4e1e 100644 > --- a/fs/namespace.c > +++ b/fs/namespace.c > @@ -1108,9 +1108,6 @@ static void cleanup_mnt(struct mount *mnt) > if (unlikely(mnt->mnt_pins.first)) > mnt_pin_kill(mnt); > fsnotify_vfsmount_delete(&mnt->mnt); > -#ifdef CONFIG_FSNOTIFY > - fsnotify_connector_free(&mnt->mnt_fsnotify_marks); > -#endif > dput(mnt->mnt.mnt_root); > deactivate_super(mnt->mnt.mnt_sb); > mnt_free_id(mnt); > diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c > index eae621a18ac9..d512ef9f75fc 100644 > --- a/fs/notify/fsnotify.c > +++ b/fs/notify/fsnotify.c > @@ -228,7 +228,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, > > if ((mask & FS_MODIFY) || > (test_mask & to_tell->i_fsnotify_mask)) { > - inode_conn = lockless_dereference(to_tell->i_fsnotify_marks); > + inode_conn = srcu_dereference(to_tell->i_fsnotify_marks, > + &fsnotify_mark_srcu); > if (inode_conn) > inode_node = srcu_dereference(inode_conn->list.first, > &fsnotify_mark_srcu); > @@ -236,11 +237,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, > > if (mnt && ((mask & FS_MODIFY) || > (test_mask & mnt->mnt_fsnotify_mask))) { > - inode_conn = lockless_dereference(to_tell->i_fsnotify_marks); > + inode_conn = srcu_dereference(to_tell->i_fsnotify_marks, > + &fsnotify_mark_srcu); > if (inode_conn) > inode_node = srcu_dereference(inode_conn->list.first, > &fsnotify_mark_srcu); > - vfsmount_conn = lockless_dereference(mnt->mnt_fsnotify_marks); > + vfsmount_conn = srcu_dereference(mnt->mnt_fsnotify_marks, > + &fsnotify_mark_srcu); > if (vfsmount_conn) > vfsmount_node = srcu_dereference( > vfsmount_conn->list.first, > diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h > index 510f027bdf0f..72050b75ca8c 100644 > --- a/fs/notify/fsnotify.h > +++ b/fs/notify/fsnotify.h > @@ -20,19 +20,19 @@ extern int fsnotify_compare_groups(struct fsnotify_group *a, > > /* Find mark belonging to given group in the list of marks */ > extern struct fsnotify_mark *fsnotify_find_mark( > - struct fsnotify_mark_connector *conn, > - struct fsnotify_group *group); > + struct fsnotify_mark_connector __rcu **connp, > + struct fsnotify_group *group); > /* Destroy all marks connected via given connector */ > -extern void fsnotify_destroy_marks(struct fsnotify_mark_connector *conn); > +extern void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp); > /* run the list of all marks associated with inode and destroy them */ > static inline void fsnotify_clear_marks_by_inode(struct inode *inode) > { > - fsnotify_destroy_marks(inode->i_fsnotify_marks); > + fsnotify_destroy_marks(&inode->i_fsnotify_marks); > } > /* run the list of all marks associated with vfsmount and destroy them */ > static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) > { > - fsnotify_destroy_marks(real_mount(mnt)->mnt_fsnotify_marks); > + fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks); > } > /* prepare for freeing all marks associated with given group */ > extern void fsnotify_detach_group_marks(struct fsnotify_group *group); > diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c > index 080b6d8b9973..b9370316727e 100644 > --- a/fs/notify/inode_mark.c > +++ b/fs/notify/inode_mark.c > @@ -50,7 +50,7 @@ void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) > struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, > struct inode *inode) > { > - return fsnotify_find_mark(inode->i_fsnotify_marks, group); > + return fsnotify_find_mark(&inode->i_fsnotify_marks, group); > } > > /** > diff --git a/fs/notify/mark.c b/fs/notify/mark.c > index 72afa0aed313..3bb012d68eb2 100644 > --- a/fs/notify/mark.c > +++ b/fs/notify/mark.c > @@ -89,10 +89,14 @@ struct kmem_cache *fsnotify_mark_connector_cachep; > > static DEFINE_SPINLOCK(destroy_lock); > static LIST_HEAD(destroy_list); > +static struct fsnotify_mark_connector *connector_destroy_list; > > static void fsnotify_mark_destroy_workfn(struct work_struct *work); > static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn); > > +static void fsnotify_connector_destroy_workfn(struct work_struct *work); > +static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn); > + > void fsnotify_get_mark(struct fsnotify_mark *mark) > { > atomic_inc(&mark->refcnt); > @@ -139,23 +143,63 @@ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) > __fsnotify_update_child_dentry_flags(conn->inode); > } > > +/* Free all connectors queued for freeing once SRCU period ends */ > +static void fsnotify_connector_destroy_workfn(struct work_struct *work) > +{ > + struct fsnotify_mark_connector *conn, *free; > + > + spin_lock(&destroy_lock); > + conn = connector_destroy_list; > + connector_destroy_list = NULL; > + spin_unlock(&destroy_lock); > + > + synchronize_srcu(&fsnotify_mark_srcu); > + while (conn) { > + free = conn; > + conn = conn->destroy_next; > + kmem_cache_free(fsnotify_mark_connector_cachep, free); > + } > +} > + > static struct inode *fsnotify_detach_from_object(struct fsnotify_mark *mark) > { > struct fsnotify_mark_connector *conn; > struct inode *inode = NULL; > + bool free_conn = false; > > conn = mark->connector; > spin_lock(&conn->lock); > hlist_del_init_rcu(&mark->obj_list); > if (hlist_empty(&conn->list)) { > - if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) > + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) { > inode = conn->inode; > + rcu_assign_pointer(inode->i_fsnotify_marks, NULL); > + inode->i_fsnotify_mask = 0; > + conn->inode = NULL; > + conn->flags &= ~FSNOTIFY_OBJ_TYPE_INODE; > + } else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) { > + rcu_assign_pointer( > + real_mount(conn->mnt)->mnt_fsnotify_marks, > + NULL); > + real_mount(conn->mnt)->mnt_fsnotify_mask = 0; > + conn->mnt = NULL; > + conn->flags &= ~FSNOTIFY_OBJ_TYPE_VFSMOUNT; > + } > + free_conn = true; > } else { > __fsnotify_recalc_mask(conn); > } > mark->connector = NULL; > spin_unlock(&conn->lock); > > + if (free_conn) { > + spin_lock(&destroy_lock); > + conn->destroy_next = connector_destroy_list; > + connector_destroy_list = conn; > + spin_unlock(&destroy_lock); > + queue_work(system_unbound_wq, &connector_reaper_work); > + } > + > return inode; > } > > @@ -260,14 +304,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, > fsnotify_free_mark(mark); > } > > -void fsnotify_connector_free(struct fsnotify_mark_connector **connp) > -{ > - if (*connp) { > - kmem_cache_free(fsnotify_mark_connector_cachep, *connp); > - *connp = NULL; > - } > -} > - > void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) > { > assert_spin_locked(&mark->lock); > @@ -319,9 +355,9 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) > } > > static int fsnotify_attach_connector_to_object( > - struct fsnotify_mark_connector **connp, > - struct inode *inode, > - struct vfsmount *mnt) > + struct fsnotify_mark_connector __rcu **connp, > + struct inode *inode, > + struct vfsmount *mnt) > { > struct fsnotify_mark_connector *conn; > > @@ -332,7 +368,7 @@ static int fsnotify_attach_connector_to_object( > INIT_HLIST_HEAD(&conn->list); > if (inode) { > conn->flags = FSNOTIFY_OBJ_TYPE_INODE; > - conn->inode = inode; > + conn->inode = igrab(inode); > } else { > conn->flags = FSNOTIFY_OBJ_TYPE_VFSMOUNT; > conn->mnt = mnt; > @@ -343,6 +379,8 @@ static int fsnotify_attach_connector_to_object( > */ > if (cmpxchg(connp, NULL, conn)) { > /* Someone else created list structure for us */ > + if (inode) > + iput(inode); > kmem_cache_free(fsnotify_mark_connector_cachep, conn); > } > > @@ -350,6 +388,34 @@ static int fsnotify_attach_connector_to_object( > } > > /* > + * Get mark connector, make sure it is alive and return with its lock held. > + * This is for users that get connector pointer from inode or mount. Users that > + * hold reference to a mark on the list may directly lock connector->lock as > + * they are sure list cannot go away under them. > + */ > +static struct fsnotify_mark_connector *fsnotify_grab_connector( > + struct fsnotify_mark_connector __rcu **connp) > +{ > + struct fsnotify_mark_connector *conn; > + int idx; > + > + idx = srcu_read_lock(&fsnotify_mark_srcu); > + conn = srcu_dereference(*connp, &fsnotify_mark_srcu); > + if (!conn) > + goto out; > + spin_lock(&conn->lock); > + if (!(conn->flags & (FSNOTIFY_OBJ_TYPE_INODE | > + FSNOTIFY_OBJ_TYPE_VFSMOUNT))) { > + spin_unlock(&conn->lock); > + srcu_read_unlock(&fsnotify_mark_srcu, idx); > + return NULL; > + } > +out: > + srcu_read_unlock(&fsnotify_mark_srcu, idx); > + return conn; > +} > + > +/* > * Add mark into proper place in given list of marks. These marks may be used > * for the fsnotify backend to determine which event types should be delivered > * to which group and for which inodes. These marks are ordered according to > @@ -361,7 +427,7 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, > { > struct fsnotify_mark *lmark, *last = NULL; > struct fsnotify_mark_connector *conn; > - struct fsnotify_mark_connector **connp; > + struct fsnotify_mark_connector __rcu **connp; > int cmp; > int err = 0; > > @@ -371,21 +437,20 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, > connp = &inode->i_fsnotify_marks; > else > connp = &real_mount(mnt)->mnt_fsnotify_marks; > - > - if (!*connp) { > +restart: > + spin_lock(&mark->lock); > + conn = fsnotify_grab_connector(connp); > + if (!conn) { > + spin_unlock(&mark->lock); > err = fsnotify_attach_connector_to_object(connp, inode, mnt); > if (err) > return err; > + goto restart; > } > - spin_lock(&mark->lock); > - conn = *connp; > - spin_lock(&conn->lock); > > /* is mark the first mark? */ > if (hlist_empty(&conn->list)) { > hlist_add_head_rcu(&mark->obj_list, &conn->list); > - if (inode) > - igrab(inode); > goto added; > } > > @@ -487,15 +552,17 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, > * Given a list of marks, find the mark associated with given group. If found > * take a reference to that mark and return it, else return NULL. > */ > -struct fsnotify_mark *fsnotify_find_mark(struct fsnotify_mark_connector *conn, > - struct fsnotify_group *group) > +struct fsnotify_mark *fsnotify_find_mark( > + struct fsnotify_mark_connector __rcu **connp, > + struct fsnotify_group *group) > { > + struct fsnotify_mark_connector *conn; > struct fsnotify_mark *mark; > > + conn = fsnotify_grab_connector(connp); > if (!conn) > return NULL; > > - spin_lock(&conn->lock); > hlist_for_each_entry(mark, &conn->list, obj_list) { > if (mark->group == group) { > fsnotify_get_mark(mark); > @@ -573,26 +640,20 @@ void fsnotify_detach_group_marks(struct fsnotify_group *group) > } > } > > -void fsnotify_destroy_marks(struct fsnotify_mark_connector *conn) > +/* Destroy all marks attached to inode / vfsmount */ > +void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp) > { > + struct fsnotify_mark_connector *conn; > struct fsnotify_mark *mark; > > - if (!conn) > - return; > - > - while (1) { > + while ((conn = fsnotify_grab_connector(connp))) { > /* > * We have to be careful since we can race with e.g. > - * fsnotify_clear_marks_by_group() and once we drop 'lock', > - * mark can get removed from the obj_list and destroyed. But > - * we are holding mark reference so mark cannot be freed and > - * calling fsnotify_destroy_mark() more than once is fine. > + * fsnotify_clear_marks_by_group() and once we drop the list > + * lock, mark can get removed from the obj_list and destroyed. > + * But we are holding mark reference so mark cannot be freed > + * and calling fsnotify_destroy_mark() more than once is fine. > */ > - spin_lock(&conn->lock); > - if (hlist_empty(&conn->list)) { > - spin_unlock(&conn->lock); > - break; > - } > mark = hlist_entry(conn->list.first, struct fsnotify_mark, > obj_list); > fsnotify_get_mark(mark); > diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c > index 26da5c209944..dd5f3fcbccfb 100644 > --- a/fs/notify/vfsmount_mark.c > +++ b/fs/notify/vfsmount_mark.c > @@ -48,5 +48,5 @@ struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, > { > struct mount *m = real_mount(mnt); > > - return fsnotify_find_mark(m->mnt_fsnotify_marks, group); > + return fsnotify_find_mark(&m->mnt_fsnotify_marks, group); > } > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 66e52342be2d..c0b6150c5fcc 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -647,7 +647,7 @@ struct inode { > > #ifdef CONFIG_FSNOTIFY > __u32 i_fsnotify_mask; /* all events this inode cares about */ > - struct fsnotify_mark_connector *i_fsnotify_marks; > + struct fsnotify_mark_connector __rcu *i_fsnotify_marks; > #endif > > #if IS_ENABLED(CONFIG_FS_ENCRYPTION) > diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h > index 02c6fac652a4..84d71b6f75f6 100644 > --- a/include/linux/fsnotify_backend.h > +++ b/include/linux/fsnotify_backend.h > @@ -197,8 +197,8 @@ struct fsnotify_group { > /* > * Inode / vfsmount point to this structure which tracks all marks attached to > * the inode / vfsmount. The reference to inode / vfsmount is held by this > - * structure whenever the list is non-empty. The structure is freed only when > - * inode / vfsmount gets freed. > + * structure. We destroy this structure when there are no more marks attached > + * to it. The structure is protected by fsnotify_mark_srcu. > */ > struct fsnotify_mark_connector { > spinlock_t lock; > @@ -209,7 +209,11 @@ struct fsnotify_mark_connector { > struct inode *inode; > struct vfsmount *mnt; > }; > - struct hlist_head list; > + union { > + struct hlist_head list; > + /* Used listing heads to free after srcu period expires */ > + struct fsnotify_mark_connector *destroy_next; > + }; > }; > > /* > @@ -361,7 +365,6 @@ extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) > extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group); > /* run all the marks in a group, and clear all of the marks attached to given object type */ > extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags); > -extern void fsnotify_connector_free(struct fsnotify_mark_connector **connp); > extern void fsnotify_get_mark(struct fsnotify_mark *mark); > extern void fsnotify_put_mark(struct fsnotify_mark *mark); > extern void fsnotify_unmount_inodes(struct super_block *sb); > diff --git a/kernel/auditsc.c b/kernel/auditsc.c > index bf7b7ca295d0..d383c33540af 100644 > --- a/kernel/auditsc.c > +++ b/kernel/auditsc.c > @@ -1597,8 +1597,7 @@ static inline void handle_one(const struct inode *inode) > struct audit_tree_refs *p; > struct audit_chunk *chunk; > int count; > - if (likely(!inode->i_fsnotify_marks || > - hlist_empty(&inode->i_fsnotify_marks->list))) > + if (likely(!inode->i_fsnotify_marks)) > return; > context = current->audit_context; > p = context->trees; > @@ -1641,8 +1640,7 @@ static void handle_path(const struct dentry *dentry) > seq = read_seqbegin(&rename_lock); > for(;;) { > struct inode *inode = d_backing_inode(d); > - if (inode && unlikely(inode->i_fsnotify_marks && > - !hlist_empty(&inode->i_fsnotify_marks->list))) { > + if (inode && unlikely(inode->i_fsnotify_marks)) { > struct audit_chunk *chunk; > chunk = audit_tree_lookup(inode); > if (chunk) { > -- > 2.10.2 >