On Fri, Aug 16, 2024 at 04:03:41AM +0100, Al Viro wrote: > Thread 1: > if (dup2(0, 1023) >= 0) > dup2(0, 10); > > Thread 2: > if (close_range(64, 127, CLOSE_RANGE_UNSHARE) == 0 && > fcntl(10, F_GETFD) >= 0 && > fcntl(1023, F_GETFD) == -1) > printf("broken"); > > > Note that close_range() call in the second thread does not > affect any of the descriptors we work with in the first thread > and at no point does thread 1 have descriptor 10 opened without > descriptor 1023 also being opened. > > It *can* actually happen - all it takes is close_range(2) decision > to trim the copied descriptor table made before the first dup2() > and actual copying done after both dup2() are done. > > I would not expect that printf to trigger - not without having > looked through the close_range(2) implementation. Note that > manpage doesn't even hint at anything of that sort. > > IMO it's a QoI issue at the very least, and arguably an outright > bug. > > Note that there is a case where everything works fine, and I suspect > that most of the callers where we want trimming are of that sort - > if the second argument of close_range() is above the INT_MAX. > > If that's the only case where we want trimming to happen, the > fix is trivial; if not... also doable. We just need to pass the > range to be punched out all way down to sane_fdtable_size() > (e.g. as a pointer, NULL meaning "no holes to punch"). I wouldn't > bother with unshare_fd() - it's not hard for __close_range() to > use dup_fd() instead. > > Something like this (completely untested), perhaps? or, with a braino fixed, this: diff --git a/fs/file.c b/fs/file.c index 655338effe9c..312574fda320 100644 --- a/fs/file.c +++ b/fs/file.c @@ -272,20 +272,6 @@ static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) return test_bit(fd, fdt->open_fds); } -static unsigned int count_open_files(struct fdtable *fdt) -{ - unsigned int size = fdt->max_fds; - unsigned int i; - - /* Find the last open fd */ - for (i = size / BITS_PER_LONG; i > 0; ) { - if (fdt->open_fds[--i]) - break; - } - i = (i + 1) * BITS_PER_LONG; - return i; -} - /* * Note that a sane fdtable size always has to be a multiple of * BITS_PER_LONG, since we have bitmaps that are sized by this. @@ -299,14 +285,18 @@ static unsigned int count_open_files(struct fdtable *fdt) * just make that BITS_PER_LONG alignment be part of a sane * fdtable size. Becuase that's really what it is. */ -static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds) +static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int *punch_hole) { - unsigned int count; - - count = count_open_files(fdt); - if (max_fds < NR_OPEN_DEFAULT) - max_fds = NR_OPEN_DEFAULT; - return ALIGN(min(count, max_fds), BITS_PER_LONG); + unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds); + + if (last == fdt->max_fds) // empty + return NR_OPEN_DEFAULT; + if (punch_hole && punch_hole[1] >= last && punch_hole[0] <= last) { + last = find_last_bit(fdt->open_fds, punch_hole[0]); + if (last == punch_hole[0]) + return NR_OPEN_DEFAULT; + } + return ALIGN(last + 1, BITS_PER_LONG); } /* @@ -314,7 +304,7 @@ static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds) * passed in files structure. * errorp will be valid only when the returned files_struct is NULL. */ -struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp) +struct files_struct *dup_fd(struct files_struct *oldf, unsigned int *punch_hole, int *errorp) { struct files_struct *newf; struct file **old_fds, **new_fds; @@ -341,7 +331,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); - open_files = sane_fdtable_size(old_fdt, max_fds); + open_files = sane_fdtable_size(old_fdt, punch_hole); /* * Check whether we need to allocate a larger fd array and fd set. @@ -372,7 +362,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int */ spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); - open_files = sane_fdtable_size(old_fdt, max_fds); + open_files = sane_fdtable_size(old_fdt, punch_hole); } copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG); @@ -748,37 +738,26 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) if (fd > max_fd) return -EINVAL; - if (flags & CLOSE_RANGE_UNSHARE) { + if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) { int ret; - unsigned int max_unshare_fds = NR_OPEN_MAX; + unsigned int range[2] = {fd, max_fd}, *punch_hole = range; /* * If the caller requested all fds to be made cloexec we always * copy all of the file descriptors since they still want to * use them. */ - if (!(flags & CLOSE_RANGE_CLOEXEC)) { - /* - * If the requested range is greater than the current - * maximum, we're closing everything so only copy all - * file descriptors beneath the lowest file descriptor. - */ - rcu_read_lock(); - if (max_fd >= last_fd(files_fdtable(cur_fds))) - max_unshare_fds = fd; - rcu_read_unlock(); - } + if (flags & CLOSE_RANGE_CLOEXEC) + punch_hole = NULL; - ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds); - if (ret) + fds = dup_fd(cur_fds, punch_hole, &ret); + if (!fds) return ret; - /* * We used to share our file descriptor table, and have now * created a private one, make sure we're using it below. */ - if (fds) - swap(cur_fds, fds); + swap(cur_fds, fds); } if (flags & CLOSE_RANGE_CLOEXEC) diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 2944d4aa413b..6bc4353f919e 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -22,7 +22,6 @@ * as this is the granularity returned by copy_fdset(). */ #define NR_OPEN_DEFAULT BITS_PER_LONG -#define NR_OPEN_MAX ~0U struct fdtable { unsigned int max_fds; @@ -106,7 +105,7 @@ struct task_struct; void put_files_struct(struct files_struct *fs); int unshare_files(void); -struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy; +struct files_struct *dup_fd(struct files_struct *, unsigned *, int *) __latent_entropy; void do_close_on_exec(struct files_struct *); int iterate_fd(struct files_struct *, unsigned, int (*)(const void *, struct file *, unsigned), @@ -115,8 +114,7 @@ int iterate_fd(struct files_struct *, unsigned, extern int close_fd(unsigned int fd); extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); extern struct file *file_close_fd(unsigned int fd); -extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, - struct files_struct **new_fdp); +extern int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp); extern struct kmem_cache *files_cachep; diff --git a/kernel/fork.c b/kernel/fork.c index cc760491f201..a7c905f06048 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1773,7 +1773,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk, goto out; } - newf = dup_fd(oldf, NR_OPEN_MAX, &error); + newf = dup_fd(oldf, NULL, &error); if (!newf) goto out; @@ -3232,15 +3232,14 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) /* * Unshare file descriptor table if it is being shared */ -int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, - struct files_struct **new_fdp) +int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) { struct files_struct *fd = current->files; int error = 0; if ((unshare_flags & CLONE_FILES) && (fd && atomic_read(&fd->count) > 1)) { - *new_fdp = dup_fd(fd, max_fds, &error); + *new_fdp = dup_fd(fd, NULL, &error); if (!*new_fdp) return error; } @@ -3300,7 +3299,7 @@ int ksys_unshare(unsigned long unshare_flags) err = unshare_fs(unshare_flags, &new_fs); if (err) goto bad_unshare_out; - err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd); + err = unshare_fd(unshare_flags, &new_fd); if (err) goto bad_unshare_cleanup_fs; err = unshare_userns(unshare_flags, &new_cred); @@ -3392,7 +3391,7 @@ int unshare_files(void) struct files_struct *old, *copy = NULL; int error; - error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, ©); + error = unshare_fd(CLONE_FILES, ©); if (error || !copy) return error;