On Wed, Feb 1, 2017 at 11:44 AM, Jan Kara <jack@xxxxxxx> wrote: > Currently we free fsnotify_mark_connector structure only when inode / > vfsmount is getting freed. This can however impose noticeable memory > overhead when marks get attached to inodes only temporarily. So free the > connector structure once the last mark is detached from the object. > Since notification infrastructure can be working with the connector > under the protection of fsnotify_mark_srcu, we have to be careful and > free the fsnotify_mark_connector only after SRCU period passes. This adds a bunch of complexity. How about instead: - protect setting *_fsnotify_marks (forward pointer) with lock in containing object - keep count of marks not freed for connector - when object (inode or vfsmount) is being destroyed and *_fsnotify_marks is non-NULL: o lock object; re-check forward pointer; lock connector; set back-pointer to NULL; unlock connector; unlock object - when mark counter goes to zero: o lock connector; check back pointer, if nonzero: + trylock object, if failed: unlock connector, relax & retry + clear forward pointer; unlock object o unlock connector This ensures freeing only after srcu period (since the mark is already srcu protected). It also ensures that multiple connectors for the same object won't exist at the same time (the connector will be reused if a new mark is added before the srcu period has passed). While the locking looks somewhat challenging, I think it's still going to be simpler than the current one. Thanks, Miklos > > Signed-off-by: Jan Kara <jack@xxxxxxx> > --- > fs/mount.h | 2 +- > fs/notify/fsnotify.c | 9 +++-- > fs/notify/fsnotify.h | 4 --- > fs/notify/mark.c | 97 ++++++++++++++++++++++++++++++++++++++-------------- > include/linux/fs.h | 2 +- > 5 files changed, 79 insertions(+), 35 deletions(-) > > diff --git a/fs/mount.h b/fs/mount.h > index c92c78d135dc..e1dc79713abe 100644 > --- a/fs/mount.h > +++ b/fs/mount.h > @@ -59,7 +59,7 @@ struct mount { > struct mountpoint *mnt_mp; /* where is it mounted */ > struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ > #ifdef CONFIG_FSNOTIFY > - struct fsnotify_mark_connector *mnt_fsnotify_marks; > + struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; > __u32 mnt_fsnotify_mask; > #endif > int mnt_id; /* mount identifier */ > diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c > index 421c7431a16e..d512ef9f75fc 100644 > --- a/fs/notify/fsnotify.c > +++ b/fs/notify/fsnotify.c > @@ -228,7 +228,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, > > if ((mask & FS_MODIFY) || > (test_mask & to_tell->i_fsnotify_mask)) { > - inode_conn = to_tell->i_fsnotify_marks; > + inode_conn = srcu_dereference(to_tell->i_fsnotify_marks, > + &fsnotify_mark_srcu); > if (inode_conn) > inode_node = srcu_dereference(inode_conn->list.first, > &fsnotify_mark_srcu); > @@ -236,11 +237,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, > > if (mnt && ((mask & FS_MODIFY) || > (test_mask & mnt->mnt_fsnotify_mask))) { > - inode_conn = to_tell->i_fsnotify_marks; > + inode_conn = srcu_dereference(to_tell->i_fsnotify_marks, > + &fsnotify_mark_srcu); > if (inode_conn) > inode_node = srcu_dereference(inode_conn->list.first, > &fsnotify_mark_srcu); > - vfsmount_conn = mnt->mnt_fsnotify_marks; > + vfsmount_conn = srcu_dereference(mnt->mnt_fsnotify_marks, > + &fsnotify_mark_srcu); > if (vfsmount_conn) > vfsmount_node = srcu_dereference( > vfsmount_conn->list.first, > diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h > index 317ff2327cc8..24bc549e9709 100644 > --- a/fs/notify/fsnotify.h > +++ b/fs/notify/fsnotify.h > @@ -27,15 +27,11 @@ extern void fsnotify_destroy_marks(struct fsnotify_mark_connector **connp); > /* run the list of all marks associated with inode and destroy them */ > static inline void fsnotify_clear_marks_by_inode(struct inode *inode) > { > - if (!inode->i_fsnotify_marks) > - return; > fsnotify_destroy_marks(&inode->i_fsnotify_marks); > } > /* run the list of all marks associated with vfsmount and destroy them */ > static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) > { > - if (!real_mount(mnt)->mnt_fsnotify_marks) > - return; > fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks); > } > /* prepare for freeing all marks associated with given group */ > diff --git a/fs/notify/mark.c b/fs/notify/mark.c > index 299d5f0f42d6..719e71d313e8 100644 > --- a/fs/notify/mark.c > +++ b/fs/notify/mark.c > @@ -89,10 +89,14 @@ struct kmem_cache *fsnotify_mark_connector_cachep; > > static DEFINE_SPINLOCK(destroy_lock); > static LIST_HEAD(destroy_list); > +static struct fsnotify_mark_connector *connector_destroy_list; > > static void fsnotify_mark_destroy_workfn(struct work_struct *work); > static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn); > > +static void fsnotify_connector_destroy_workfn(struct work_struct *work); > +static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn); > + > void fsnotify_get_mark(struct fsnotify_mark *mark) > { > atomic_inc(&mark->refcnt); > @@ -143,22 +147,62 @@ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) > } > } > > +/* Free all connectors queued for freeing once SRCU period ends */ > +static void fsnotify_connector_destroy_workfn(struct work_struct *work) > +{ > + struct fsnotify_mark_connector *conn, *free; > + > + spin_lock(&destroy_lock); > + conn = connector_destroy_list; > + connector_destroy_list = NULL; > + spin_unlock(&destroy_lock); > + > + synchronize_srcu(&fsnotify_mark_srcu); > + while (conn) { > + free = conn; > + conn = conn->destroy_next; > + kmem_cache_free(fsnotify_mark_connector_cachep, free); > + } > +} > + > static struct inode *fsnotify_detach_from_object(struct fsnotify_mark *mark) > { > struct fsnotify_mark_connector *conn; > struct inode *inode = NULL; > + bool free_conn = false; > > conn = mark->connector; > spin_lock(&conn->lock); > hlist_del_init_rcu(&mark->obj_list); > if (hlist_empty(&conn->list)) { > - if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) > + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) { > inode = conn->inode; > + rcu_assign_pointer(inode->i_fsnotify_marks, NULL); > + inode->i_fsnotify_mask = 0; > + conn->inode = NULL; > + conn->flags &= ~FSNOTIFY_OBJ_TYPE_INODE; > + } else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) { > + rcu_assign_pointer( > + real_mount(conn->mnt)->mnt_fsnotify_marks, > + NULL); > + real_mount(conn->mnt)->mnt_fsnotify_mask = 0; > + conn->mnt = NULL; > + conn->flags &= ~FSNOTIFY_OBJ_TYPE_VFSMOUNT; > + } > + free_conn = true; > } else > __fsnotify_recalc_mask(conn); > mark->connector = NULL; > spin_unlock(&conn->lock); > > + if (free_conn) { > + spin_lock(&destroy_lock); > + conn->destroy_next = connector_destroy_list; > + connector_destroy_list = conn; > + spin_unlock(&destroy_lock); > + queue_work(system_unbound_wq, &connector_reaper_work); > + } > + > return inode; > } > > @@ -320,11 +364,24 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) > * they are sure list cannot go away under them. > */ > static struct fsnotify_mark_connector *fsnotify_grab_connector( > - struct fsnotify_mark_connector *conn) > + struct fsnotify_mark_connector * __rcu *connp) > { > + struct fsnotify_mark_connector *conn; > + int idx; > + > + idx = srcu_read_lock(&fsnotify_mark_srcu); > + conn = srcu_dereference(*connp, &fsnotify_mark_srcu); > if (!conn) > - return NULL; > + goto out; > spin_lock(&conn->lock); > + if (!(conn->flags & (FSNOTIFY_OBJ_TYPE_INODE | > + FSNOTIFY_OBJ_TYPE_VFSMOUNT))) { > + spin_unlock(&conn->lock); > + srcu_read_unlock(&fsnotify_mark_srcu, idx); > + return NULL; > + } > +out: > + srcu_read_unlock(&fsnotify_mark_srcu, idx); > return conn; > } > > @@ -351,7 +408,7 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, > connp = &real_mount(mnt)->mnt_fsnotify_marks; > restart: > spin_lock(&mark->lock); > - conn = fsnotify_grab_connector(*connp); > + conn = fsnotify_grab_connector(connp); > if (!conn) { > spin_unlock(&mark->lock); > conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, > @@ -362,13 +419,15 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, > INIT_HLIST_HEAD(&conn->list); > if (inode) { > conn->flags = FSNOTIFY_OBJ_TYPE_INODE; > - conn->inode = inode; > + conn->inode = igrab(inode); > } else { > conn->flags = FSNOTIFY_OBJ_TYPE_VFSMOUNT; > conn->mnt = mnt; > } > if (cmpxchg(connp, NULL, conn)) { > /* Someone else created list structure for us */ > + if (inode) > + iput(inode); > kmem_cache_free(fsnotify_mark_connector_cachep, conn); > } > goto restart; > @@ -377,7 +436,6 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, > /* is mark the first mark? */ > if (hlist_empty(&conn->list)) { > hlist_add_head_rcu(&mark->obj_list, &conn->list); > - igrab(inode); > goto added; > } > > @@ -485,7 +543,7 @@ struct fsnotify_mark *fsnotify_find_mark(struct fsnotify_mark_connector **connp, > struct fsnotify_mark *mark; > struct fsnotify_mark_connector *conn; > > - conn = fsnotify_grab_connector(*connp); > + conn = fsnotify_grab_connector(connp); > if (!conn) > return NULL; > hlist_for_each_entry(mark, &conn->list, obj_list) { > @@ -565,23 +623,20 @@ void fsnotify_detach_group_marks(struct fsnotify_group *group) > } > } > > +/* Destroy all marks attached to inode / vfsmount */ > void fsnotify_destroy_marks(struct fsnotify_mark_connector **connp) > { > struct fsnotify_mark_connector *conn; > struct fsnotify_mark *mark; > > - while ((conn = fsnotify_grab_connector(*connp))) { > + while ((conn = fsnotify_grab_connector(connp))) { > /* > * We have to be careful since we can race with e.g. > - * fsnotify_clear_marks_by_group() and once we drop 'lock', > - * mark can get removed from the obj_list and destroyed. But > - * we are holding mark reference so mark cannot be freed and > - * calling fsnotify_destroy_mark() more than once is fine. > + * fsnotify_clear_marks_by_group() and once we drop the list > + * lock, mark can get removed from the obj_list and destroyed. > + * But we are holding mark reference so mark cannot be freed > + * and calling fsnotify_destroy_mark() more than once is fine. > */ > - if (hlist_empty(&conn->list)) { > - spin_unlock(&conn->lock); > - break; > - } > mark = hlist_entry(conn->list.first, struct fsnotify_mark, > obj_list); > fsnotify_get_mark(mark); > @@ -589,16 +644,6 @@ void fsnotify_destroy_marks(struct fsnotify_mark_connector **connp) > fsnotify_destroy_mark(mark, mark->group); > fsnotify_put_mark(mark); > } > - /* > - * Skip freeing of connector when inode is just unlinked. We leave that > - * until destroy_inode() is called at which moment nobody will be > - * looking at the inode anymore. > - */ > - if ((*connp)->flags & FSNOTIFY_OBJ_TYPE_INODE && > - !((*connp)->inode->i_state & I_CLEAR)) > - return; > - kmem_cache_free(fsnotify_mark_connector_cachep, *connp); > - *connp = NULL; > } > > /* > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 2588a24ad11f..8fc09316f996 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -646,7 +646,7 @@ struct inode { > > #ifdef CONFIG_FSNOTIFY > __u32 i_fsnotify_mask; /* all events this inode cares about */ > - struct fsnotify_mark_connector *i_fsnotify_marks; > + struct fsnotify_mark_connector __rcu *i_fsnotify_marks; > #endif > > #if IS_ENABLED(CONFIG_FS_ENCRYPTION) > -- > 2.10.2 >