Complete the conversion of the file descriptor allocation code to use
the IDR.

This patch includes the following changes:

- Move max_fds from struct fdtable to files_struct.
- Add a fill_max_fds() routine to calculate the new value of max_fds;
  it matches the old behaviour of the alloc_fdtable() code, which is
  user-visible through /proc.
- Remove struct fdtable.
- Remove resize_in_progress and resize_wait from files_struct.
- Delete open_fds() and count_open_files().
- Use READ_ONCE() instead of rcu_read_lock()/rcu_read_unlock().  The
  rcu_read_lock()/unlock() pair was needed to dereference files->fdt;
  files->max_fds is no longer protected by RCU, so a READ_ONCE() is
  sufficient to read its value.

Signed-off-by: Sandhya Bankar <bankarsandhya512@xxxxxxxxx>
Signed-off-by: Matthew Wilcox <mawilcox@xxxxxxxxxxxxx>
---
 fs/compat.c             |   6 +-
 fs/file.c               | 354 +++++++++-----------------------------------
 fs/proc/array.c         |   2 +-
 fs/proc/fd.c            |   2 +-
 fs/select.c             |   6 +-
 include/linux/fdtable.h |  31 +----
 6 files changed, 54 insertions(+), 347 deletions(-)

diff --git a/fs/compat.c b/fs/compat.c
index c61b506..7483c9c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1153,17 +1153,13 @@ int compat_core_sys_select(int n, compat_ulong_t __user *inp,
 	fd_set_bits fds;
 	void *bits;
 	int size, max_fds, ret = -EINVAL;
-	struct fdtable *fdt;
 	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
 
 	if (n < 0)
 		goto out_nofds;
 
 	/* max_fds can increase, so grab it once to avoid race */
-	rcu_read_lock();
-	fdt = files_fdtable(current->files);
-	max_fds = fdt->max_fds;
-	rcu_read_unlock();
+	max_fds = READ_ONCE(current->files->max_fds);
 	if (n > max_fds)
 		n = max_fds;
 
diff --git a/fs/file.c b/fs/file.c
index 23f198b..3e6cf10 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -31,191 +31,36 @@ unsigned int sysctl_nr_open_max =
 	__const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;
 
-static void *alloc_fdmem(size_t size)
+static int fill_max_fds(struct files_struct *files, unsigned int nr)
 {
-	/*
-	 * Very large allocations can stress page reclaim, so fall back to
-	 * vmalloc() if the allocation size will be considered "large" by the VM.
-	 */
-	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
-		void *data = kmalloc(size, GFP_KERNEL_ACCOUNT |
-				     __GFP_NOWARN | __GFP_NORETRY);
-		if (data != NULL)
-			return data;
-	}
-	return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL);
-}
+	unsigned int nr_open;
 
-static void __free_fdtable(struct fdtable *fdt)
-{
-	kvfree(fdt->open_fds);
-	kfree(fdt);
-}
-
-static void free_fdtable_rcu(struct rcu_head *rcu)
-{
-	__free_fdtable(container_of(rcu, struct fdtable, rcu));
-}
-
-/*
- * Copy 'count' fd bits from the old table to the new table and clear the extra
- * space if any. This does not copy the file pointers. Called with the files
- * spinlock held for write.
- */
-static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
-			    unsigned int count)
-{
-	unsigned int cpy, set;
-
-	cpy = count / BITS_PER_BYTE;
-	set = (nfdt->max_fds - count) / BITS_PER_BYTE;
-	memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
-	memset((char *)nfdt->open_fds + cpy, 0, set);
-}
+	if (likely(nr < files->max_fds))
+		return 0;
 
-/*
- * Copy all file descriptors from the old table to the new, expanded table and
- * clear the extra space. Called with the files spinlock held for write.
- */
-static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
-{
-	BUG_ON(nfdt->max_fds < ofdt->max_fds);
-	copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
-}
+	nr_open = READ_ONCE(sysctl_nr_open);
 
-static struct fdtable * alloc_fdtable(unsigned int nr)
-{
-	struct fdtable *fdt;
-	void *data;
+	if (nr >= nr_open)
+		return -EMFILE;
 
 	/*
-	 * Figure out how many fds we actually want to support in this fdtable.
-	 * Allocation steps are keyed to the size of the fdarray, since it
-	 * grows far faster than any of the other dynamic data. We try to fit
-	 * the fdarray into comfortable page-tuned chunks: starting at 1024B
-	 * and growing in powers of two from there on.
+	 * This calculation of the new value of max_fds matches the old
+	 * behaviour of this code which is user-visible through /proc.
+	 * nr may exceed the sysctl_nr_open value by a small amount.
+	 * This used to be necessary to keep the bitmaps aligned, and
+	 * we keep it the same even though the IDR handles the bitmaps now.
 	 */
+	nr /= (1024 / sizeof(struct file *));
 	nr = roundup_pow_of_two(nr + 1);
 	nr *= (1024 / sizeof(struct file *));
-	/*
-	 * Note that this can drive nr *below* what we had passed if sysctl_nr_open
-	 * had been set lower between the check in expand_files() and here. Deal
-	 * with that in caller, it's cheaper that way.
-	 *
-	 * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
-	 * bitmaps handling below becomes unpleasant, to put it mildly...
-	 */
-	if (unlikely(nr > sysctl_nr_open))
-		nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
-	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
-	if (!fdt)
-		goto out;
-	fdt->max_fds = nr;
+	if (unlikely(nr > nr_open))
+		nr = ((nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
 
-	data = alloc_fdmem(max_t(size_t, nr / BITS_PER_BYTE, L1_CACHE_BYTES));
-	if (!data)
-		goto out_fdt;
-	fdt->open_fds = data;
+	files->max_fds = nr;
 
-	return fdt;
-
-out_fdt:
-	kfree(fdt);
-out:
-	return NULL;
-}
-
-/*
- * Expand the file descriptor table.
- * This function will allocate a new fdtable and both fd array and fdset, of
- * the given size.
- * Return <0 error code on error; 1 on successful completion.
- * The files->file_lock should be held on entry, and will be held on exit.
- */
-static int expand_fdtable(struct files_struct *files, unsigned int nr)
-	__releases(files->file_lock)
-	__acquires(files->file_lock)
-{
-	struct fdtable *new_fdt, *cur_fdt;
-
-	spin_unlock(&files->file_lock);
-	idr_preload_end();
-	new_fdt = alloc_fdtable(nr);
-
-	/* make sure all __fd_install() have seen resize_in_progress
-	 * or have finished their rcu_read_lock_sched() section.
-	 */
-	if (atomic_read(&files->count) > 1)
-		synchronize_sched();
-
-	idr_preload(GFP_KERNEL);
-	spin_lock(&files->file_lock);
-	if (!new_fdt)
-		return -ENOMEM;
-	/*
-	 * extremely unlikely race - sysctl_nr_open decreased between the check in
-	 * caller and alloc_fdtable(). Cheaper to catch it here...
-	 */
-	if (unlikely(new_fdt->max_fds <= nr)) {
-		__free_fdtable(new_fdt);
-		return -EMFILE;
-	}
-	cur_fdt = files_fdtable(files);
-	BUG_ON(nr < cur_fdt->max_fds);
-	copy_fdtable(new_fdt, cur_fdt);
-	rcu_assign_pointer(files->fdt, new_fdt);
-	if (cur_fdt != &files->fdtab)
-		call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
-	/* coupled with smp_rmb() in __fd_install() */
-	smp_wmb();
-	return 1;
-}
-
-/*
- * Expand files.
- * This function will expand the file structures, if the requested size exceeds
- * the current capacity and there is room for expansion.
- * Return <0 error code on error; 0 when nothing done; 1 when files were
- * expanded and execution may have blocked.
- * The files->file_lock should be held on entry, and will be held on exit.
- */
-static int expand_files(struct files_struct *files, unsigned int nr)
-	__releases(files->file_lock)
-	__acquires(files->file_lock)
-{
-	struct fdtable *fdt;
-	int expanded = 0;
-
-repeat:
-	fdt = files_fdtable(files);
-
-	/* Do we need to expand? */
-	if (nr < fdt->max_fds)
-		return expanded;
-
-	/* Can we expand? */
-	if (nr >= sysctl_nr_open)
-		return -EMFILE;
-
-	if (unlikely(files->resize_in_progress)) {
-		spin_unlock(&files->file_lock);
-		idr_preload_end();
-		expanded = 1;
-		wait_event(files->resize_wait, !files->resize_in_progress);
-		idr_preload(GFP_KERNEL);
-		spin_lock(&files->file_lock);
-		goto repeat;
-	}
-
-	/* All good, so we try */
-	files->resize_in_progress = true;
-	expanded = expand_fdtable(files, nr);
-	files->resize_in_progress = false;
-
-	wake_up_all(&files->resize_wait);
-	return expanded;
+	return 0;
 }
 
 static inline bool fd_is_open(unsigned int fd, struct files_struct *files)
@@ -235,28 +80,21 @@ static inline void __clear_close_on_exec(unsigned int fd,
 	idr_tag_clear(&files->fd_idr, fd, FD_TAG_CLOEXEC);
 }
 
-static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
-{
-	__set_bit(fd, fdt->open_fds);
-}
-
-static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
-{
-	__clear_bit(fd, fdt->open_fds);
-}
-
-static unsigned int count_open_files(struct fdtable *fdt)
+static void close_files(struct files_struct * files)
 {
-	unsigned int size = fdt->max_fds;
-	unsigned int i;
+	/*
+	 * No need for RCU or ->file_lock protection because
+	 * this is the last reference to the files structure.
+	 */
+	struct file *file;
+	int fd;
 
-	/* Find the last open fd */
-	for (i = size / BITS_PER_LONG; i > 0; ) {
-		if (fdt->open_fds[--i])
-			break;
+	idr_for_each_entry(&files->fd_idr, file, fd) {
+		filp_close(file, files);
+		cond_resched_rcu_qs();
 	}
-	i = (i + 1) * BITS_PER_LONG;
-	return i;
+
+	idr_destroy(&files->fd_idr);
 }
 
 /*
@@ -267,9 +105,8 @@ static unsigned int count_open_files(struct fdtable *fdt)
 struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 {
 	struct files_struct *newf;
-	unsigned int open_files, i;
+	unsigned int i;
 	struct file *f;
-	struct fdtable *old_fdt, *new_fdt;
 
 	*errorp = -ENOMEM;
 	newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
@@ -280,51 +117,11 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 
 	spin_lock_init(&newf->file_lock);
 	idr_init(&newf->fd_idr);
-	newf->resize_in_progress = false;
-	init_waitqueue_head(&newf->resize_wait);
-	new_fdt = &newf->fdtab;
-	new_fdt->max_fds = NR_OPEN_DEFAULT;
-	new_fdt->open_fds = newf->open_fds_init;
+	newf->max_fds = NR_OPEN_DEFAULT;
 
 restart:
	idr_copy_preload(&oldf->fd_idr, GFP_KERNEL);
 	spin_lock(&oldf->file_lock);
-	old_fdt = files_fdtable(oldf);
-	open_files = count_open_files(old_fdt);
-
-	/*
-	 * Check whether we need to allocate a larger fd array and fd set.
-	 */
-	while (unlikely(open_files > new_fdt->max_fds)) {
-		spin_unlock(&oldf->file_lock);
-		idr_preload_end();
-
-		if (new_fdt != &newf->fdtab)
-			__free_fdtable(new_fdt);
-
-		new_fdt = alloc_fdtable(open_files - 1);
-		if (!new_fdt) {
-			*errorp = -ENOMEM;
-			goto out_release;
-		}
-
-		/* beyond sysctl_nr_open; nothing to do */
-		if (unlikely(new_fdt->max_fds < open_files)) {
-			__free_fdtable(new_fdt);
-			*errorp = -EMFILE;
-			goto out_release;
-		}
-
-		/*
-		 * Reacquire the oldf lock and a pointer to its fd table
-		 * who knows it may have a new bigger fd table. We need
-		 * the latest pointer.
-		 */
-		idr_copy_preload(&oldf->fd_idr, GFP_KERNEL);
-		spin_lock(&oldf->file_lock);
-		old_fdt = files_fdtable(oldf);
-		open_files = count_open_files(old_fdt);
-	}
 
 	if (!idr_check_preload(&oldf->fd_idr)) {
 		spin_unlock(&oldf->file_lock);
@@ -332,8 +129,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 		goto restart;
 	}
 
-	copy_fd_bitmaps(new_fdt, old_fdt, open_files);
-
 	idr_for_each_entry(&oldf->fd_idr, f, i) {
 		int err;
 
@@ -341,6 +136,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 		err = idr_alloc(&newf->fd_idr, f, i, i + 1, GFP_NOWAIT);
 		if (WARN(err != i, "Could not allocate %d: %d", i, err)) {
 			spin_unlock(&oldf->file_lock);
+			idr_preload_end();
 			goto out;
 		}
 
@@ -351,42 +147,18 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	spin_unlock(&oldf->file_lock);
 	idr_preload_end();
 
-	/*
-	 * The fd may be claimed in the fd bitmap but not yet
-	 * instantiated in the files array if a sibling thread
-	 * is partway through open().
-	 */
-	for_each_set_bit(i, new_fdt->open_fds, new_fdt->max_fds) {
-		if (!idr_find(&newf->fd_idr, i))
-			__clear_bit(i, new_fdt->open_fds);
+	if (unlikely(fill_max_fds(newf, i) < 0)) {
+		*errorp = -EMFILE;
+		goto out;
 	}
-	rcu_assign_pointer(newf->fdt, new_fdt);
-
 	return newf;
 
-out_release:
-	idr_destroy(&newf->fd_idr);
-	kmem_cache_free(files_cachep, newf);
 out:
-	return NULL;
-}
-
-static void close_files(struct files_struct * files)
-{
-	/*
-	 * No need for RCU or ->file_lock protection because
-	 * this is the last reference to the files structure.
-	 */
-	struct file *file;
-	int fd;
-
-	idr_for_each_entry(&files->fd_idr, file, fd) {
-		filp_close(file, files);
-		cond_resched_rcu_qs();
-	}
+	close_files(newf);
+	kmem_cache_free(files_cachep, newf);
 
-	idr_destroy(&files->fd_idr);
+	return NULL;
 }
 
 struct files_struct *get_files_struct(struct task_struct *task)
@@ -405,12 +177,7 @@ void put_files_struct(struct files_struct *files)
 {
 	if (atomic_dec_and_test(&files->count)) {
-		struct fdtable *fdt = rcu_dereference_raw(files->fdt);
-
 		close_files(files);
-
-		/* free the arrays if they are not embedded */
-		if (fdt != &files->fdtab)
-			__free_fdtable(fdt);
 		kmem_cache_free(files_cachep, files);
 	}
 }
@@ -441,11 +208,7 @@ void exit_files(struct task_struct *tsk)
 
 struct files_struct init_files = {
 	.count		= ATOMIC_INIT(1),
-	.fdt		= &init_files.fdtab,
-	.fdtab		= {
-		.max_fds	= NR_OPEN_DEFAULT,
-		.open_fds	= init_files.open_fds_init,
-	},
+	.max_fds	= NR_OPEN_DEFAULT,
 	.file_lock	= __SPIN_LOCK_UNLOCKED(init_files.file_lock),
 	.fd_idr		= IDR_INIT,
 };
@@ -458,7 +221,6 @@ int __alloc_fd(struct files_struct *files,
 {
 	unsigned int fd;
 	int error;
-	struct fdtable *fdt;
 
 	idr_preload(GFP_KERNEL);
 	spin_lock(&files->file_lock);
@@ -471,14 +233,12 @@ int __alloc_fd(struct files_struct *files,
 	BUG_ON(error < 0);
 	fd = error;
 
-	error = expand_files(files, fd);
+	error = fill_max_fds(files, fd);
 	if (error < 0) {
 		idr_remove(&files->fd_idr, fd);
 		goto out;
 	}
 
-	fdt = files_fdtable(files);
-	__set_open_fd(fd, fdt);
 	if (flags & O_CLOEXEC)
 		__set_close_on_exec(fd, files);
 	else
@@ -502,18 +262,11 @@ int get_unused_fd_flags(unsigned flags)
 }
 EXPORT_SYMBOL(get_unused_fd_flags);
 
-static void __put_unused_fd(struct files_struct *files, unsigned int fd)
-{
-	struct fdtable *fdt = files_fdtable(files);
-	__clear_open_fd(fd, fdt);
-}
-
 void put_unused_fd(unsigned int fd)
 {
 	struct files_struct *files = current->files;
 	spin_lock(&files->file_lock);
 	BUG_ON(idr_remove(&files->fd_idr, fd));
-	__put_unused_fd(files, fd);
 	spin_unlock(&files->file_lock);
 }
 
@@ -560,17 +313,12 @@ void fd_install(unsigned int fd, struct file *file)
 int __close_fd(struct files_struct *files, unsigned fd)
 {
 	struct file *file;
-	struct fdtable *fdt;
 
 	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
-	if (fd >= fdt->max_fds)
-		goto out_unlock;
 	file = idr_remove(&files->fd_idr, fd);
 	if (!file)
 		goto out_unlock;
 	__clear_close_on_exec(fd, files);
-	__put_unused_fd(files, fd);
 	spin_unlock(&files->file_lock);
 	return filp_close(file, files);
 
@@ -589,7 +337,6 @@ void do_close_on_exec(struct files_struct *files)
 	idr_for_each_entry_tagged(&files->fd_idr, file, fd, FD_TAG_CLOEXEC) {
 		idr_remove(&files->fd_idr, fd);
-		__put_unused_fd(files, fd);
 		spin_unlock(&files->file_lock);
 		filp_close(file, files);
 		cond_resched();
@@ -718,7 +465,6 @@ static int do_dup2(struct files_struct *files,
 	__releases(&files->file_lock)
 {
 	struct file *tofree;
-	struct fdtable *fdt;
 
 	/*
 	 * We need to detect attempts to do dup2() over allocated but still
 	 * scope of POSIX or SUS, since neither considers shared descriptor
 	 * tables and this condition does not arise without those.
 	 */
-	fdt = files_fdtable(files);
 	tofree = idr_find(&files->fd_idr, fd);
 	if (!tofree && fd_is_open(fd, files))
 		goto Ebusy;
@@ -749,7 +494,6 @@ static int do_dup2(struct files_struct *files,
 			goto Ebusy;
 		}
 	}
-	__set_open_fd(fd, fdt);
 	if (flags & O_CLOEXEC)
 		__set_close_on_exec(fd, files);
 	else
@@ -781,9 +525,10 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags)
 
 	idr_preload(GFP_KERNEL);
 	spin_lock(&files->file_lock);
-	err = expand_files(files, fd);
+	err = fill_max_fds(files, fd);
 	if (unlikely(err < 0))
 		goto out_unlock;
+
 	return do_dup2(files, file, fd, flags);
 
 out_unlock:
@@ -809,20 +554,18 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags)
 
 	idr_preload(GFP_KERNEL);
 	spin_lock(&files->file_lock);
-	err = expand_files(files, newfd);
+	err = fill_max_fds(files, newfd);
+	if (unlikely(err < 0))
+		goto Ebadf;
+
 	file = fcheck(oldfd);
 	if (unlikely(!file))
 		goto Ebadf;
-	if (unlikely(err < 0)) {
-		if (err == -EMFILE)
-			goto Ebadf;
-		goto out_unlock;
-	}
+
 	return do_dup2(files, file, newfd, flags);
 
 Ebadf:
 	err = -EBADF;
-out_unlock:
 	spin_unlock(&files->file_lock);
 	idr_preload_end();
 	return err;
@@ -875,12 +618,11 @@ int iterate_fd(struct files_struct *files, unsigned n,
 		int (*f)(const void *, struct file *, unsigned),
 		const void *p)
 {
-	struct fdtable *fdt;
 	int res = 0;
 	if (!files)
 		return 0;
 	spin_lock(&files->file_lock);
-	for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
+	for (; n < files->max_fds; n++) {
 		struct file *file;
 		file = __fcheck_files(files, n);
 		if (!file)
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 88c3555..ec6fdaf 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -186,7 +186,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 
 	task_lock(p);
 	if (p->files)
-		max_fds = files_fdtable(p->files)->max_fds;
+		max_fds = READ_ONCE(p->files->max_fds);
 	task_unlock(p);
 	rcu_read_unlock();
 
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 2735ccc..7e5aeca 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -231,7 +231,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
 
 	rcu_read_lock();
 	for (fd = ctx->pos - 2;
-	     fd < files_fdtable(files)->max_fds;
+	     fd < files->max_fds;
 	     fd++, ctx->pos++) {
 		char name[PROC_NUMBUF];
 		int len;
diff --git a/fs/select.c b/fs/select.c
index 5d20a14..14da393 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -557,7 +557,6 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 	void *bits;
 	int ret, max_fds;
 	size_t size, alloc_size;
-	struct fdtable *fdt;
 	/* Allocate small arguments on the stack to save memory and be faster */
 	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
 
@@ -566,10 +565,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 		goto out_nofds;
 
 	/* max_fds can increase, so grab it once to avoid race */
-	rcu_read_lock();
-	fdt = files_fdtable(current->files);
-	max_fds = fdt->max_fds;
-	rcu_read_unlock();
+	max_fds = READ_ONCE(current->files->max_fds);
 	if (n > max_fds)
 		n = max_fds;
 
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 7f1ab82..ff94541 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -24,43 +24,16 @@
 
 #define FD_TAG_CLOEXEC	1
 
-struct fdtable {
-	unsigned int max_fds;
-	unsigned long *open_fds;
-	struct rcu_head rcu;
-};
-
 /*
  * Open file table structure
  */
 struct files_struct {
-	/*
-	 * read mostly part
-	 */
+	spinlock_t file_lock;
 	atomic_t count;
-	bool resize_in_progress;
-	wait_queue_head_t resize_wait;
-
 	struct idr fd_idr;
-	struct fdtable __rcu *fdt;
-	struct fdtable fdtab;
-	/*
-	 * written part on a separate cache line in SMP
-	 */
-	spinlock_t file_lock ____cacheline_aligned_in_smp;
-	unsigned long open_fds_init[1];
+	unsigned int max_fds;
 };
 
-struct file_operations;
-struct vfsmount;
-struct dentry;
-
-#define rcu_dereference_check_fdtable(files, fdtfd) \
-	rcu_dereference_check((fdtfd), lockdep_is_held(&(files)->file_lock))
-
-#define files_fdtable(files) \
-	rcu_dereference_check_fdtable((files), (files)->fdt)
-
 /*
  * The caller must ensure that fd table isn't shared or hold rcu or file lock
 */
-- 
1.8.3.1
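
As a sanity check on the fill_max_fds() rounding in the hunk above, the
resulting max_fds growth sequence can be reproduced in userspace.  The
standalone sketch below is illustrative only and not part of the patch:
new_max_fds() is a hypothetical stand-in that hard-codes
sizeof(struct file *) == 8 and BITS_PER_LONG == 64, open-codes a
simplified roundup_pow_of_two() in place of the kernel helper, and
models only the rounding (fill_max_fds() itself bails out with -EMFILE
first when nr >= nr_open):

#include <stdio.h>

/* Simplified stand-in for the kernel's roundup_pow_of_two(). */
static unsigned int roundup_pow_of_two(unsigned int n)
{
	unsigned int r = 1;

	while (r < n)
		r <<= 1;
	return r;
}

/*
 * Mirrors the max_fds calculation in fill_max_fds(): grow in
 * power-of-two multiples of 128 slots (1024 bytes of struct file
 * pointers), then clamp to nr_open rounded up to BITS_PER_LONG.
 */
static unsigned int new_max_fds(unsigned int nr, unsigned int nr_open)
{
	nr /= (1024 / 8);			/* 1024 / sizeof(struct file *) */
	nr = roundup_pow_of_two(nr + 1);
	nr *= (1024 / 8);
	if (nr > nr_open)
		nr = ((nr_open - 1) | (64 - 1)) + 1;	/* BITS_PER_LONG == 64 */
	return nr;
}

int main(void)
{
	unsigned int nr[] = { 64, 128, 256, 1000, 5000 };
	unsigned int i;

	for (i = 0; i < sizeof(nr) / sizeof(nr[0]); i++)
		printf("fd %u -> max_fds %u\n", nr[i], new_max_fds(nr[i], 1048576));
	return 0;
}

With nr_open at its default of 1048576, this prints 128, 256, 512, 1024
and 8192 for the sample descriptors, i.e. the same power-of-two
multiples of 128 that alloc_fdtable() used to produce, which is the
user-visible behaviour the changelog says is preserved.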