The check_unsafe_exec() counting of n_fs would not add up under a heavily threaded process trying to perform a suid exec, causing the suid portion to fail. This counting error appears to be unneeded, but to catch any possible conditions, explicitly unshare fs_struct on exec, if it ends up needing to happen. This means fs->in_exec must be removed (since it would never get cleared in the old copy), and is specifically no longer needed. See also commit 498052bba55e ("New locking/refcounting for fs_struct"). This additionally allows the "in_exec" member to be dropped, showing an 8 bytes savings per fs_struct on 64-bit. Before: struct fs_struct { int users; /* 0 4 */ spinlock_t lock; /* 4 4 */ seqcount_spinlock_t seq; /* 8 4 */ int umask; /* 12 4 */ int in_exec; /* 16 4 */ /* XXX 4 bytes hole, try to pack */ struct path root; /* 24 16 */ struct path pwd; /* 40 16 */ /* size: 56, cachelines: 1, members: 7 */ /* sum members: 52, holes: 1, sum holes: 4 */ /* last cacheline: 56 bytes */ }; After: struct fs_struct { int users; /* 0 4 */ spinlock_t lock; /* 4 4 */ seqcount_spinlock_t seq; /* 8 4 */ int umask; /* 12 4 */ struct path root; /* 16 16 */ struct path pwd; /* 32 16 */ /* size: 48, cachelines: 1, members: 6 */ /* last cacheline: 48 bytes */ }; Reported-by: Jorge Merlino <jorge.merlino@xxxxxxxxxxxxx> Link: https://lore.kernel.org/lkml/20220910211215.140270-1-jorge.merlino@xxxxxxxxxxxxx/ Cc: Eric Biederman <ebiederm@xxxxxxxxxxxx> Cc: Alexander Viro <viro@xxxxxxxxxxxxxxxxxx> Cc: "Christian Brauner (Microsoft)" <brauner@xxxxxxxxxx> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx> Cc: Andy Lutomirski <luto@xxxxxxxxxx> Cc: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Cc: linux-mm@xxxxxxxxx Cc: linux-fsdevel@xxxxxxxxxxxxxxx Signed-off-by: Kees Cook <keescook@xxxxxxxxxxxx> --- fs/exec.c | 9 +++--- fs/fs_struct.c | 1 - include/linux/fdtable.h | 1 + include/linux/fs_struct.h | 1 - kernel/fork.c | 62 ++++++++++++++++++++++++++------------- 5 files changed, 48 insertions(+), 26 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 902bce45b116..7d5f63f03c58 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1272,6 +1272,11 @@ int begin_new_exec(struct linux_binprm * bprm) if (retval) goto out; + /* Ensure the fs_struct is not shared. */ + retval = unshare_fs(); + if (retval) + goto out; + /* * Must be called _before_ exec_mmap() as bprm->mm is * not visible until then. This also enables the update @@ -1583,8 +1588,6 @@ static void check_unsafe_exec(struct linux_binprm *bprm) if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; - else - p->fs->in_exec = 1; spin_unlock(&p->fs->lock); } @@ -1843,7 +1846,6 @@ static int bprm_execve(struct linux_binprm *bprm, goto out; /* execve succeeded */ - current->fs->in_exec = 0; current->in_execve = 0; rseq_execve(current); acct_update_integrals(current); @@ -1861,7 +1863,6 @@ static int bprm_execve(struct linux_binprm *bprm, force_fatal_sig(SIGSEGV); out_unmark: - current->fs->in_exec = 0; current->in_execve = 0; return retval; diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 04b3f5b9c629..c27c67023d01 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -115,7 +115,6 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) /* We don't need to lock fs - think why ;-) */ if (fs) { fs->users = 1; - fs->in_exec = 0; spin_lock_init(&fs->lock); seqcount_spinlock_init(&fs->seq, &fs->lock); fs->umask = old->umask; diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index e066816f3519..fbfb89ee3bda 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -117,6 +117,7 @@ struct task_struct; void put_files_struct(struct files_struct *fs); int unshare_files(void); +int unshare_fs(void); struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy; void do_close_on_exec(struct files_struct *); int iterate_fd(struct files_struct *, unsigned, diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 783b48dedb72..08b82a90ceae 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -11,7 +11,6 @@ struct fs_struct { spinlock_t lock; seqcount_spinlock_t seq; int umask; - int in_exec; struct path root, pwd; } __randomize_layout; diff --git a/kernel/fork.c b/kernel/fork.c index b4a799d9c50f..53b7248f7a4b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1589,10 +1589,6 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) if (clone_flags & CLONE_FS) { /* tsk->fs is already what we want */ spin_lock(&fs->lock); - if (fs->in_exec) { - spin_unlock(&fs->lock); - return -EAGAIN; - } fs->users++; spin_unlock(&fs->lock); return 0; @@ -3070,10 +3066,9 @@ static int check_unshare_flags(unsigned long unshare_flags) return 0; } -/* - * Unshare the filesystem structure if it is being shared - */ -static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) +/* Allocate an fs_struct if it is currently shared and CLONE_FS requested. */ +static int unshare_fs_alloc(unsigned long unshare_flags, + struct fs_struct **new_fsp) { struct fs_struct *fs = current->fs; @@ -3091,6 +3086,41 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) return 0; } +/* Swap out fs_struct, returning old fs if it needs freeing. */ +static void unshare_fs_finalize(struct fs_struct **new_fsp) +{ + struct task_struct *task = current; + struct fs_struct *fs = task->fs; + + spin_lock(&fs->lock); + task->fs = *new_fsp; + if (--fs->users) + *new_fsp = NULL; + else + *new_fsp = fs; + spin_unlock(&fs->lock); +} + +/* + * Unshare the filesystem structure if it is being shared + */ +int unshare_fs(void) +{ + struct fs_struct *new_fs = NULL; + int error; + + error = unshare_fs_alloc(CLONE_FS, &new_fs); + if (error || !new_fs) + return error; + + unshare_fs_finalize(&new_fs); + + if (new_fs) + free_fs_struct(new_fs); + + return 0; +} + /* * Unshare file descriptor table if it is being shared */ @@ -3120,7 +3150,7 @@ int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, */ int ksys_unshare(unsigned long unshare_flags) { - struct fs_struct *fs, *new_fs = NULL; + struct fs_struct *new_fs = NULL; struct files_struct *new_fd = NULL; struct cred *new_cred = NULL; struct nsproxy *new_nsproxy = NULL; @@ -3159,7 +3189,7 @@ int ksys_unshare(unsigned long unshare_flags) */ if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) do_sysvsem = 1; - err = unshare_fs(unshare_flags, &new_fs); + err = unshare_fs_alloc(unshare_flags, &new_fs); if (err) goto bad_unshare_out; err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd); @@ -3197,16 +3227,8 @@ int ksys_unshare(unsigned long unshare_flags) task_lock(current); - if (new_fs) { - fs = current->fs; - spin_lock(&fs->lock); - current->fs = new_fs; - if (--fs->users) - new_fs = NULL; - else - new_fs = fs; - spin_unlock(&fs->lock); - } + if (new_fs) + unshare_fs_finalize(&new_fs); if (new_fd) swap(current->files, new_fd); -- 2.34.1