When an inode is interested in events on its children, it must set DCACHE_FSNOTIFY_PARENT_WATCHED flag on all its children. Currently, when the fsnotify connector is removed and i_fsnotify_mask becomes zero, we lazily allow __fsnotify_parent() to do this the next time we see an event on a child. However, if the list of children is very long (e.g., in the millions), and lots of activity is occurring on the directory, then it's possible for many CPUs to end up blocked on the inode spinlock in __fsnotify_update_child_flags(). Each CPU will then redundantly iterate over the very long list of children. This situation can cause soft lockups. To avoid this, stop lazily updating child flags in __fsnotify_parent(). Protect the child flag update with i_rwsem held exclusive, to ensure that we only iterate over the child list when it's absolutely necessary, and even then, only once. Signed-off-by: Stephen Brennan <stephen.s.brennan@xxxxxxxxxx> --- Notes: v3 changes: * Moved fsnotify_update_children_dentry_flags() into fsnotify.c, declared it in the header. Made __fsnotify_update_children_dentry_flags() static since it has no external callers except fsnotify_update...(). * Use bitwise xor operator in children_need_update() * Eliminated FSNOTIFY_OBJ_FLAG_* constants, reused CONN_FLAG_* * Updated documentation of fsnotify_update_inode_conn_flags() to reflect its behavior * Renamed "flags" to "update_flags" in all its uses, so that it's a clear pattern and matches renamed fsnotify_update_object(). fs/notify/fsnotify.c | 45 ++++++++++- fs/notify/fsnotify.h | 13 +++- fs/notify/mark.c | 124 ++++++++++++++++++++----------- include/linux/fsnotify_backend.h | 1 + 4 files changed, 137 insertions(+), 46 deletions(-) diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 7939aa911931..ccb8a3a6c522 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -103,13 +103,15 @@ void fsnotify_sb_delete(struct super_block *sb) * parent cares. Thus when an event happens on a child it can quickly tell * if there is a need to find a parent and send the event to the parent. */ -void __fsnotify_update_child_dentry_flags(struct inode *inode) +static bool __fsnotify_update_children_dentry_flags(struct inode *inode) { struct dentry *alias, *child; int watched; if (!S_ISDIR(inode->i_mode)) - return; + return false; + + lockdep_assert_held_write(&inode->i_rwsem); /* determine if the children should tell inode about their events */ watched = fsnotify_inode_watches_children(inode); @@ -136,6 +138,43 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) } spin_unlock(&alias->d_lock); dput(alias); + return watched; +} + +void fsnotify_update_children_dentry_flags(struct fsnotify_mark_connector *conn, + struct inode *inode) +{ + bool need_update; + + inode_lock(inode); + spin_lock(&conn->lock); + need_update = fsnotify_children_need_update(conn, inode); + spin_unlock(&conn->lock); + if (need_update) { + bool watched = __fsnotify_update_children_dentry_flags(inode); + + spin_lock(&conn->lock); + if (watched) + conn->flags |= FSNOTIFY_CONN_FLAG_WATCHES_CHILDREN; + else + conn->flags &= ~FSNOTIFY_CONN_FLAG_WATCHES_CHILDREN; + spin_unlock(&conn->lock); + } + inode_unlock(inode); +} + + +static void fsnotify_update_child_dentry_flags(struct inode *inode, struct dentry *dentry) +{ + /* + * Flag would be cleared soon by + * __fsnotify_update_child_dentry_flags(), but as an + * optimization, clear it now. + */ + spin_lock(&dentry->d_lock); + if (!fsnotify_inode_watches_children(inode)) + dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; + spin_unlock(&dentry->d_lock); } /* Are inode/sb/mount interested in parent and name info with this event? */ @@ -206,7 +245,7 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, p_inode = parent->d_inode; p_mask = fsnotify_inode_watches_children(p_inode); if (unlikely(parent_watched && !p_mask)) - __fsnotify_update_child_dentry_flags(p_inode); + fsnotify_update_child_dentry_flags(p_inode, dentry); /* * Include parent/name in notification either if some notification diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index fde74eb333cc..621e78a6f0fb 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -70,11 +70,22 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) fsnotify_destroy_marks(&sb->s_fsnotify_marks); } +static inline bool fsnotify_children_need_update(struct fsnotify_mark_connector *conn, + struct inode *inode) +{ + bool watched, flags_set; + + watched = fsnotify_inode_watches_children(inode); + flags_set = conn->flags & FSNOTIFY_CONN_FLAG_WATCHES_CHILDREN; + return watched ^ flags_set; +} + /* * update the dentry->d_flags of all of inode's children to indicate if inode cares * about events that happen to its children. */ -extern void __fsnotify_update_child_dentry_flags(struct inode *inode); +extern void fsnotify_update_children_dentry_flags(struct fsnotify_mark_connector *conn, + struct inode *inode); extern struct kmem_cache *fsnotify_mark_connector_cachep; diff --git a/fs/notify/mark.c b/fs/notify/mark.c index c74ef947447d..8969128dacc1 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -123,37 +123,52 @@ static void fsnotify_get_inode_ref(struct inode *inode) } /* - * Grab or drop inode reference for the connector if needed. + * Determine the connector flags that it is necessary to update * - * When it's time to drop the reference, we only clear the HAS_IREF flag and - * return the inode object. fsnotify_drop_object() will be resonsible for doing - * iput() outside of spinlocks. This happens when last mark that wanted iref is - * detached. + * If any action needs to be taken on the connector's inode outside of a spinlock, + * we return the inode and set *update_flags accordingly. + * + * If FSNOTIFY_CONN_FLAG_HAS_IREF is set in *update_flags, then the caller needs + * to drop the last inode reference using fsnotify_put_inode_ref(). + * + * If FSNOTIFY_CONN_FLAG_WATCHES_CHILDREN is set in *update_flags, then the + * caller needs to update the children dentry flags so that their + * DCACHE_FSNOTIFY_PARENT_WATCHED flag matches the i_fsnotify_mask value, using + * fsnotify_update_children_dentry_flags(). */ -static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn, - bool want_iref) +static struct inode *fsnotify_update_inode_conn_flags(struct fsnotify_mark_connector *conn, + bool want_iref, int *update_flags) { bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF; - struct inode *inode = NULL; + struct inode *inode = NULL, *ret = NULL; - if (conn->type != FSNOTIFY_OBJ_TYPE_INODE || - want_iref == has_iref) + if (conn->type != FSNOTIFY_OBJ_TYPE_INODE) return NULL; - if (want_iref) { - /* Pin inode if any mark wants inode refcount held */ - fsnotify_get_inode_ref(fsnotify_conn_inode(conn)); - conn->flags |= FSNOTIFY_CONN_FLAG_HAS_IREF; - } else { - /* Unpin inode after detach of last mark that wanted iref */ - inode = fsnotify_conn_inode(conn); - conn->flags &= ~FSNOTIFY_CONN_FLAG_HAS_IREF; + inode = fsnotify_conn_inode(conn); + + if (want_iref != has_iref) { + if (want_iref) { + /* Pin inode if any mark wants inode refcount held */ + fsnotify_get_inode_ref(inode); + conn->flags |= FSNOTIFY_CONN_FLAG_HAS_IREF; + } else { + /* Unpin inode after detach of last mark that wanted iref */ + conn->flags &= ~FSNOTIFY_CONN_FLAG_HAS_IREF; + ret = inode; + *update_flags |= FSNOTIFY_CONN_FLAG_HAS_IREF; + } + } + if (fsnotify_children_need_update(conn, inode)) { + ret = inode; + *update_flags |= FSNOTIFY_CONN_FLAG_WATCHES_CHILDREN; } - return inode; + return ret; } -static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) +static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn, + int *update_flags) { u32 new_mask = 0; bool want_iref = false; @@ -173,7 +188,7 @@ static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) } *fsnotify_conn_mask_p(conn) = new_mask; - return fsnotify_update_iref(conn, want_iref); + return fsnotify_update_inode_conn_flags(conn, want_iref, update_flags); } /* @@ -184,15 +199,19 @@ static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) */ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { + struct inode *inode = NULL; + int flags = 0; + if (!conn) return; spin_lock(&conn->lock); - __fsnotify_recalc_mask(conn); + inode = __fsnotify_recalc_mask(conn, &flags); spin_unlock(&conn->lock); - if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) - __fsnotify_update_child_dentry_flags( - fsnotify_conn_inode(conn)); + + if (flags & FSNOTIFY_CONN_FLAG_WATCHES_CHILDREN) + fsnotify_update_children_dentry_flags(conn, inode); + WARN_ON_ONCE(flags & FSNOTIFY_CONN_FLAG_HAS_IREF); } /* Free all connectors queued for freeing once SRCU period ends */ @@ -240,7 +259,8 @@ static void fsnotify_put_sb_connectors(struct fsnotify_mark_connector *conn) static void *fsnotify_detach_connector_from_object( struct fsnotify_mark_connector *conn, - unsigned int *type) + unsigned int *type, + unsigned int *update_flags) { struct inode *inode = NULL; @@ -252,8 +272,10 @@ static void *fsnotify_detach_connector_from_object( inode = fsnotify_conn_inode(conn); inode->i_fsnotify_mask = 0; - /* Unpin inode when detaching from connector */ - if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF)) + *update_flags = conn->flags & + (FSNOTIFY_CONN_FLAG_HAS_IREF | + FSNOTIFY_CONN_FLAG_WATCHES_CHILDREN); + if (!*update_flags) inode = NULL; } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0; @@ -279,15 +301,37 @@ static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark) fsnotify_put_group(group); } -/* Drop object reference originally held by a connector */ -static void fsnotify_drop_object(unsigned int type, void *objp) +/* Apply the update_flags for a connector after recalculating mask */ +static void fsnotify_update_object(struct fsnotify_mark_connector *conn, + unsigned int type, void *objp, + int update_flags) { if (!objp) return; /* Currently only inode references are passed to be dropped */ if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE)) return; - fsnotify_put_inode_ref(objp); + + if (update_flags & FSNOTIFY_CONN_FLAG_WATCHES_CHILDREN) + /*. + * At this point, we've already detached the connector from the + * inode. It's entirely possible that another connector has been + * attached, and that connector would assume that the children's + * flags are all clear. There are two possibilities: + * (a) The connector has not yet attached a mark that watches its + * children. In this case, we will properly clear out the flags, + * and the connector's flags will be consistent with the + * children. + * (b) The connector attaches a mark that watches its children. + * It may have even already altered i_fsnotify_mask and/or + * altered the child dentry flags. In this case, our call here + * will read the correct value of i_fsnotify_mask and apply it + * to the children, which duplicates some work, but isn't + * harmful. + */ + fsnotify_update_children_dentry_flags(conn, objp); + if (update_flags & FSNOTIFY_CONN_FLAG_HAS_IREF) + fsnotify_put_inode_ref(objp); } void fsnotify_put_mark(struct fsnotify_mark *mark) @@ -296,6 +340,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) void *objp = NULL; unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED; bool free_conn = false; + int flags = 0; /* Catch marks that were actually never attached to object */ if (!conn) { @@ -313,16 +358,16 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) hlist_del_init_rcu(&mark->obj_list); if (hlist_empty(&conn->list)) { - objp = fsnotify_detach_connector_from_object(conn, &type); + objp = fsnotify_detach_connector_from_object(conn, &type, &flags); free_conn = true; } else { - objp = __fsnotify_recalc_mask(conn); + objp = __fsnotify_recalc_mask(conn, &flags); type = conn->type; } WRITE_ONCE(mark->connector, NULL); spin_unlock(&conn->lock); - fsnotify_drop_object(type, objp); + fsnotify_update_object(conn, type, objp, flags); if (free_conn) { spin_lock(&destroy_lock); @@ -331,12 +376,6 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) spin_unlock(&destroy_lock); queue_work(system_unbound_wq, &connector_reaper_work); } - /* - * Note that we didn't update flags telling whether inode cares about - * what's happening with children. We update these flags from - * __fsnotify_parent() lazily when next event happens on one of our - * children. - */ spin_lock(&destroy_lock); list_add(&mark->g_list, &destroy_list); spin_unlock(&destroy_lock); @@ -834,6 +873,7 @@ void fsnotify_destroy_marks(fsnotify_connp_t *connp) struct fsnotify_mark *mark, *old_mark = NULL; void *objp; unsigned int type; + int update_flags = 0; conn = fsnotify_grab_connector(connp); if (!conn) @@ -859,11 +899,11 @@ void fsnotify_destroy_marks(fsnotify_connp_t *connp) * mark references get dropped. It would lead to strange results such * as delaying inode deletion or blocking unmount. */ - objp = fsnotify_detach_connector_from_object(conn, &type); + objp = fsnotify_detach_connector_from_object(conn, &type, &update_flags); spin_unlock(&conn->lock); if (old_mark) fsnotify_put_mark(old_mark); - fsnotify_drop_object(type, objp); + fsnotify_update_object(conn, type, objp, update_flags); } /* diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index d7d96c806bff..7b8d1cdac0ce 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -474,6 +474,7 @@ struct fsnotify_mark_connector { unsigned short type; /* Type of object [lock] */ #define FSNOTIFY_CONN_FLAG_HAS_FSID 0x01 #define FSNOTIFY_CONN_FLAG_HAS_IREF 0x02 +#define FSNOTIFY_CONN_FLAG_WATCHES_CHILDREN 0x04 unsigned short flags; /* flags [lock] */ __kernel_fsid_t fsid; /* fsid of filesystem containing object */ union { -- 2.34.1