Instead of storing all the file pointers in a single array, use an IDR. It is RCU-safe, and does not need to be reallocated when the fd array grows. It also handles allocation of new file descriptors. Signed-off-by: Sandhya Bankar <bankarsandhya512@xxxxxxxxx> [mawilcox@xxxxxxxxxxxxx: fixes] Signed-off-by: Matthew Wilcox <mawilcox@xxxxxxxxxxxxx> --- fs/file.c | 180 ++++++++++++++++++++---------------------------- include/linux/fdtable.h | 10 +-- 2 files changed, 79 insertions(+), 111 deletions(-) diff --git a/fs/file.c b/fs/file.c index ad6f094..1c000d8 100644 --- a/fs/file.c +++ b/fs/file.c @@ -47,7 +47,6 @@ static void *alloc_fdmem(size_t size) static void __free_fdtable(struct fdtable *fdt) { - kvfree(fdt->fd); kvfree(fdt->open_fds); kfree(fdt); } @@ -89,15 +88,7 @@ static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, */ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) { - unsigned int cpy, set; - BUG_ON(nfdt->max_fds < ofdt->max_fds); - - cpy = ofdt->max_fds * sizeof(struct file *); - set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); - memcpy(nfdt->fd, ofdt->fd, cpy); - memset((char *)nfdt->fd + cpy, 0, set); - copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds); } @@ -131,15 +122,11 @@ static struct fdtable * alloc_fdtable(unsigned int nr) if (!fdt) goto out; fdt->max_fds = nr; - data = alloc_fdmem(nr * sizeof(struct file *)); - if (!data) - goto out_fdt; - fdt->fd = data; data = alloc_fdmem(max_t(size_t, 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES)); if (!data) - goto out_arr; + goto out_fdt; fdt->open_fds = data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = data; @@ -148,8 +135,6 @@ static struct fdtable * alloc_fdtable(unsigned int nr) return fdt; -out_arr: - kvfree(fdt->fd); out_fdt: kfree(fdt); out: @@ -170,6 +155,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) struct fdtable *new_fdt, *cur_fdt; spin_unlock(&files->file_lock); + idr_preload_end(); new_fdt = alloc_fdtable(nr); /* make sure all __fd_install() have seen resize_in_progress @@ -178,6 +164,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) if (atomic_read(&files->count) > 1) synchronize_sched(); + idr_preload(GFP_KERNEL); spin_lock(&files->file_lock); if (!new_fdt) return -ENOMEM; @@ -228,8 +215,10 @@ static int expand_files(struct files_struct *files, unsigned int nr) if (unlikely(files->resize_in_progress)) { spin_unlock(&files->file_lock); + idr_preload_end(); expanded = 1; wait_event(files->resize_wait, !files->resize_in_progress); + idr_preload(GFP_KERNEL); spin_lock(&files->file_lock); goto repeat; } @@ -290,8 +279,8 @@ static unsigned int count_open_files(struct fdtable *fdt) struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) { struct files_struct *newf; - struct file **old_fds, **new_fds; unsigned int open_files, i; + struct file *f; struct fdtable *old_fdt, *new_fdt; *errorp = -ENOMEM; @@ -302,6 +291,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); + idr_init(&newf->fd_idr); newf->resize_in_progress = false; init_waitqueue_head(&newf->resize_wait); newf->next_fd = 0; @@ -310,8 +300,9 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) new_fdt->close_on_exec = newf->close_on_exec_init; new_fdt->open_fds = newf->open_fds_init; new_fdt->full_fds_bits = newf->full_fds_bits_init; - new_fdt->fd = &newf->fd_array[0]; +restart: + idr_copy_preload(&oldf->fd_idr, GFP_KERNEL); spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = count_open_files(old_fdt); @@ -321,6 +312,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) */ while (unlikely(open_files > new_fdt->max_fds)) { spin_unlock(&oldf->file_lock); + idr_preload_end(); if (new_fdt != &newf->fdtab) __free_fdtable(new_fdt); @@ -343,41 +335,50 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) * who knows it may have a new bigger fd table. We need * the latest pointer. */ + idr_copy_preload(&oldf->fd_idr, GFP_KERNEL); spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = count_open_files(old_fdt); } + if (!idr_check_preload(&oldf->fd_idr)) { + spin_unlock(&oldf->file_lock); + idr_preload_end(); + goto restart; + } + copy_fd_bitmaps(new_fdt, old_fdt, open_files); - old_fds = old_fdt->fd; - new_fds = new_fdt->fd; - - for (i = open_files; i != 0; i--) { - struct file *f = *old_fds++; - if (f) { - get_file(f); - } else { - /* - * The fd may be claimed in the fd bitmap but not yet - * instantiated in the files array if a sibling thread - * is partway through open(). So make sure that this - * fd is available to the new process. - */ - __clear_open_fd(open_files - i, new_fdt); + idr_for_each_entry(&oldf->fd_idr, f, i) { + int err; + + get_file(f); + err = idr_alloc(&newf->fd_idr, f, i, i + 1, GFP_NOWAIT); + if (WARN(err != i, "Could not allocate %d: %d", i, err)) { + spin_unlock(&oldf->file_lock); + goto out; } - rcu_assign_pointer(*new_fds++, f); } + spin_unlock(&oldf->file_lock); + idr_preload_end(); - /* clear the remainder */ - memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *)); + /* + * The fd may be claimed in the fd bitmap but not yet + * instantiated in the files array if a sibling thread + * is partway through open(). + */ + for_each_set_bit(i, new_fdt->open_fds, new_fdt->max_fds) { + if (!idr_find(&newf->fd_idr, i)) + __clear_bit(i, new_fdt->open_fds); + } rcu_assign_pointer(newf->fdt, new_fdt); return newf; out_release: + idr_destroy(&newf->fd_idr); kmem_cache_free(files_cachep, newf); out: return NULL; @@ -401,7 +402,8 @@ static struct fdtable *close_files(struct files_struct * files) set = fdt->open_fds[j++]; while (set) { if (set & 1) { - struct file * file = xchg(&fdt->fd[i], NULL); + struct file *file; + file = idr_remove(&files->fd_idr, i); if (file) { filp_close(file, files); cond_resched_rcu_qs(); @@ -469,28 +471,14 @@ struct files_struct init_files = { .fdt = &init_files.fdtab, .fdtab = { .max_fds = NR_OPEN_DEFAULT, - .fd = &init_files.fd_array[0], .close_on_exec = init_files.close_on_exec_init, .open_fds = init_files.open_fds_init, .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), + .fd_idr = IDR_INIT, }; -static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) -{ - unsigned int maxfd = fdt->max_fds; - unsigned int maxbit = maxfd / BITS_PER_LONG; - unsigned int bitbit = start / BITS_PER_LONG; - - bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; - if (bitbit > maxfd) - return maxfd; - if (bitbit > start) - start = bitbit; - return find_next_zero_bit(fdt->open_fds, maxfd, start); -} - /* * allocate a file descriptor, mark it busy. */ @@ -501,54 +489,37 @@ int __alloc_fd(struct files_struct *files, int error; struct fdtable *fdt; + idr_preload(GFP_KERNEL); spin_lock(&files->file_lock); -repeat: - fdt = files_fdtable(files); - fd = start; - if (fd < files->next_fd) - fd = files->next_fd; - if (fd < fdt->max_fds) - fd = find_next_fd(fdt, fd); - - /* - * N.B. For clone tasks sharing a files structure, this test - * will limit the total number of files that can be opened. - */ - error = -EMFILE; - if (fd >= end) + error = idr_alloc(&files->fd_idr, NULL, start, end, GFP_NOWAIT); + if (error == -ENOSPC) { + error = -EMFILE; goto out; + } + BUG_ON(error < 0); + fd = error; error = expand_files(files, fd); - if (error < 0) + if (error < 0) { + idr_remove(&files->fd_idr, fd); goto out; - - /* - * If we needed to expand the fs array we - * might have blocked - try again. - */ - if (error) - goto repeat; + } if (start <= files->next_fd) files->next_fd = fd + 1; + fdt = files_fdtable(files); __set_open_fd(fd, fdt); if (flags & O_CLOEXEC) __set_close_on_exec(fd, fdt); else __clear_close_on_exec(fd, fdt); error = fd; -#if 1 - /* Sanity check */ - if (rcu_access_pointer(fdt->fd[fd]) != NULL) { - printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); - rcu_assign_pointer(fdt->fd[fd], NULL); - } -#endif out: spin_unlock(&files->file_lock); + idr_preload_end(); return error; } @@ -575,6 +546,7 @@ void put_unused_fd(unsigned int fd) { struct files_struct *files = current->files; spin_lock(&files->file_lock); + BUG_ON(idr_remove(&files->fd_idr, fd)); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); } @@ -604,22 +576,9 @@ void put_unused_fd(unsigned int fd) void __fd_install(struct files_struct *files, unsigned int fd, struct file *file) { - struct fdtable *fdt; - - might_sleep(); - rcu_read_lock_sched(); - - while (unlikely(files->resize_in_progress)) { - rcu_read_unlock_sched(); - wait_event(files->resize_wait, !files->resize_in_progress); - rcu_read_lock_sched(); - } - /* coupled with smp_wmb() in expand_fdtable() */ - smp_rmb(); - fdt = rcu_dereference_sched(files->fdt); - BUG_ON(fdt->fd[fd] != NULL); - rcu_assign_pointer(fdt->fd[fd], file); - rcu_read_unlock_sched(); + rcu_read_lock(); + BUG_ON(idr_replace(&files->fd_idr, file, fd)); + rcu_read_unlock(); } void fd_install(unsigned int fd, struct file *file) @@ -641,10 +600,9 @@ int __close_fd(struct files_struct *files, unsigned fd) fdt = files_fdtable(files); if (fd >= fdt->max_fds) goto out_unlock; - file = fdt->fd[fd]; + file = idr_remove(&files->fd_idr, fd); if (!file) goto out_unlock; - rcu_assign_pointer(fdt->fd[fd], NULL); __clear_close_on_exec(fd, fdt); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); @@ -676,10 +634,9 @@ void do_close_on_exec(struct files_struct *files) struct file *file; if (!(set & 1)) continue; - file = fdt->fd[fd]; + file = idr_remove(&files->fd_idr, fd); if (!file) continue; - rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); filp_close(file, files); @@ -842,17 +799,27 @@ static int do_dup2(struct files_struct *files, * tables and this condition does not arise without those. */ fdt = files_fdtable(files); - tofree = fdt->fd[fd]; + tofree = idr_find(&files->fd_idr, fd); if (!tofree && fd_is_open(fd, fdt)) goto Ebusy; get_file(file); - rcu_assign_pointer(fdt->fd[fd], file); + if (tofree) { + idr_replace(&files->fd_idr, file, fd); + } else { + int err = idr_alloc(&files->fd_idr, file, fd, fd + 1, + GFP_NOWAIT); + if (err != fd) { + WARN(1, "do_dup2 got %d for fd %d\n", err, fd); + goto Ebusy; + } + } __set_open_fd(fd, fdt); if (flags & O_CLOEXEC) __set_close_on_exec(fd, fdt); else __clear_close_on_exec(fd, fdt); spin_unlock(&files->file_lock); + idr_preload_end(); if (tofree) filp_close(tofree, files); @@ -861,6 +828,7 @@ static int do_dup2(struct files_struct *files, Ebusy: spin_unlock(&files->file_lock); + idr_preload_end(); return -EBUSY; } @@ -875,6 +843,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags) if (fd >= rlimit(RLIMIT_NOFILE)) return -EBADF; + idr_preload(GFP_KERNEL); spin_lock(&files->file_lock); err = expand_files(files, fd); if (unlikely(err < 0)) @@ -883,6 +852,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags) out_unlock: spin_unlock(&files->file_lock); + idr_preload_end(); return err; } @@ -901,6 +871,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags) if (newfd >= rlimit(RLIMIT_NOFILE)) return -EBADF; + idr_preload(GFP_KERNEL); spin_lock(&files->file_lock); err = expand_files(files, newfd); file = fcheck(oldfd); @@ -917,6 +888,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags) err = -EBADF; out_unlock: spin_unlock(&files->file_lock); + idr_preload_end(); return err; } @@ -974,7 +946,7 @@ int iterate_fd(struct files_struct *files, unsigned n, spin_lock(&files->file_lock); for (fdt = files_fdtable(files); n < fdt->max_fds; n++) { struct file *file; - file = rcu_dereference_check_fdtable(files, fdt->fd[n]); + file = __fcheck_files(files, n); if (!file) continue; res = f(p, file, n); diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 6e84b2cae..4072f24 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -12,6 +12,7 @@ #include <linux/types.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/idr.h> #include <linux/atomic.h> @@ -23,7 +24,6 @@ struct fdtable { unsigned int max_fds; - struct file __rcu **fd; /* current fd array */ unsigned long *close_on_exec; unsigned long *open_fds; unsigned long *full_fds_bits; @@ -51,6 +51,7 @@ struct files_struct { bool resize_in_progress; wait_queue_head_t resize_wait; + struct idr fd_idr; struct fdtable __rcu *fdt; struct fdtable fdtab; /* @@ -61,7 +62,6 @@ struct files_struct { unsigned long close_on_exec_init[1]; unsigned long open_fds_init[1]; unsigned long full_fds_bits_init[1]; - struct file __rcu * fd_array[NR_OPEN_DEFAULT]; }; struct file_operations; @@ -79,11 +79,7 @@ struct files_struct { */ static inline struct file *__fcheck_files(struct files_struct *files, unsigned int fd) { - struct fdtable *fdt = rcu_dereference_raw(files->fdt); - - if (fd < fdt->max_fds) - return rcu_dereference_raw(fdt->fd[fd]); - return NULL; + return idr_find(&files->fd_idr, fd); } static inline struct file *fcheck_files(struct files_struct *files, unsigned int fd) -- 1.8.3.1