mntput as part of path_put is called from all over the vfs sometimes as in the case of symlink chasing from some rather deep call chains. During lazy filesystem unmount with the right set of races those innocuous little mntput calls (that take very little stack space) can call deactivate_super and wind up taking 3k of stack space or more (David Chinner reports 5k for xfs). Avoid deactivate_super being called from a deep stack by moving the cleanup of the mount into a work queue. To avoid semantic changes mntput waits for deactivate_super to complete before returning. With this change all filesystem unmounting happens with about 7400 bytes free on the stack at the point where deactivate_super is called. Giving filesystems plenty of room to do I/O and not overflow the kernel stack during unmounting. The extra fields mnt_cleanup_work, and mnt_undone are added in a union to avoid growing struct mount unnecessarily. Signed-off-by: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx> --- fs/mount.h | 14 +++++++++++--- fs/namespace.c | 32 +++++++++++++++++++++++++++----- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index a4111f19fd2e..7272fc416580 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -48,8 +48,17 @@ struct mount { struct list_head mnt_slave; /* slave list entry */ struct mount *mnt_master; /* slave is on master->mnt_slave_list */ struct mnt_namespace *mnt_ns; /* containing namespace */ - struct mountpoint *mnt_mp; /* where is it mounted */ - struct list_head mnt_mp_list; /* list mounts with the same mountpoint */ + union { + struct { + struct path mnt_ex_mountpoint; + struct list_head mnt_mp_list; /* list mounts with the same mountpoint */ + struct mountpoint *mnt_mp; /* where is it mounted */ + }; + struct { + struct work_struct mnt_cleanup_work; + struct completion *mnt_undone; + }; + }; #ifdef CONFIG_FSNOTIFY struct hlist_head mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; @@ -58,7 +67,6 @@ struct mount { int mnt_group_id; /* peer group identifier */ int mnt_expiry_mark; /* true if marked for expiry */ int mnt_pinned; - struct path mnt_ex_mountpoint; }; #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */ diff --git a/fs/namespace.c b/fs/namespace.c index 2abd5a2416d0..ac589ad9f22d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -983,8 +983,25 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, return ERR_PTR(err); } +static void cleanup_mnt(struct mount *mnt) +{ + fsnotify_vfsmount_delete(&mnt->mnt); + dput(mnt->mnt.mnt_root); + deactivate_super(mnt->mnt.mnt_sb); + mnt_free_id(mnt); + complete(mnt->mnt_undone); + call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); +} + +static void cleanup_mnt_work(struct work_struct *work) +{ + cleanup_mnt(container_of(work, struct mount, mnt_cleanup_work)); +} + static void mntput_no_expire(struct mount *mnt) { + struct completion undone; + rcu_read_lock(); mnt_add_count(mnt, -1); if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ @@ -1027,11 +1044,16 @@ static void mntput_no_expire(struct mount *mnt) * so mnt_get_writers() below is safe. */ WARN_ON(mnt_get_writers(mnt)); - fsnotify_vfsmount_delete(&mnt->mnt); - dput(mnt->mnt.mnt_root); - deactivate_super(mnt->mnt.mnt_sb); - mnt_free_id(mnt); - call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); + /* The stack may be deep here, cleanup the mount on a work + * queue where the stack is guaranteed to be shallow. + */ + init_completion(&undone); + mnt->mnt_undone = &undone; + + INIT_WORK(&mnt->mnt_cleanup_work, cleanup_mnt_work); + schedule_work(&mnt->mnt_cleanup_work); + + wait_for_completion(&undone); } void mntput(struct vfsmount *mnt) -- 1.9.1 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html