Make it possible to avoid ENFILE checking for kernel-specific open files, such as those used by the CacheFiles module. After, for example, tarring up a kernel source tree over the network, the CacheFiles module may easily have 20000+ files open in the backing filesystem, thus causing all non-root processes to be given error ENFILE when they try to open a file, socket, pipe, etc. Signed-Off-By: David Howells <dhowells@xxxxxxxxxx> --- arch/ia64/kernel/perfmon.c | 2 +- drivers/infiniband/core/uverbs_main.c | 2 +- fs/eventpoll.c | 2 +- fs/file_table.c | 36 ++++++++++++++++++++++++--------- fs/hugetlbfs/inode.c | 2 +- fs/inotify.c | 2 +- fs/namei.c | 2 +- fs/open.c | 22 +++++++++++++++++++- fs/pipe.c | 4 ++-- include/linux/file.h | 1 - include/linux/fs.h | 8 ++++++- kernel/futex.c | 2 +- kernel/sysctl.c | 2 +- mm/shmem.c | 2 +- mm/tiny-shmem.c | 2 +- net/socket.c | 2 +- 16 files changed, 67 insertions(+), 26 deletions(-) diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 077f212..f23ab3a 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2162,7 +2162,7 @@ pfm_alloc_fd(struct file **cfile) ret = -ENFILE; - file = get_empty_filp(); + file = get_empty_filp(0); if (!file) goto out; /* diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index ff092a0..4f7137c 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -525,7 +525,7 @@ struct file *ib_uverbs_alloc_event_file( goto err; } - filp = get_empty_filp(); + filp = get_empty_filp(0); if (!filp) { ret = -ENFILE; goto err_fd; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1b4491c..f774038 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -714,7 +714,7 @@ static int ep_getfd(int *efd, struct ino /* Get an ready to use file */ error = -ENFILE; - file = get_empty_filp(); + file = get_empty_filp(0); if (!file) goto eexit_1; diff --git a/fs/file_table.c b/fs/file_table.c index bcea199..300e7c2 
100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -34,6 +34,7 @@ struct files_stat_struct files_stat = { __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); static struct percpu_counter nr_files __cacheline_aligned_in_smp; +static atomic_t nr_kernel_files; static inline void file_free_rcu(struct rcu_head *head) { @@ -43,7 +44,10 @@ static inline void file_free_rcu(struct static inline void file_free(struct file *f) { - percpu_counter_dec(&nr_files); + if (!(f->f_kernel_flags & FKFLAGS_KERNEL)) + percpu_counter_dec(&nr_files); + else + atomic_dec(&nr_kernel_files); call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } @@ -72,6 +76,7 @@ int proc_nr_files(ctl_table *table, int void __user *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); + files_stat.nr_kernel_files = atomic_read(&nr_kernel_files); return proc_dointvec(table, write, filp, buffer, lenp, ppos); } #else @@ -86,7 +91,7 @@ int proc_nr_files(ctl_table *table, int * Returns NULL, if there are no more free file structures or * we run out of memory. */ -struct file *get_empty_filp(void) +struct file *get_empty_filp(int kernel) { struct task_struct *tsk; static int old_max; @@ -95,20 +100,29 @@ struct file *get_empty_filp(void) /* * Privileged users can go above max_files */ - if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { - /* - * percpu_counters are inaccurate. Do an expensive check before - * we go and fail. - */ - if (percpu_counter_sum(&nr_files) >= files_stat.max_files) - goto over; + if (!kernel) { + if (get_nr_files() >= files_stat.max_files && + !capable(CAP_SYS_ADMIN) + ) { + /* + * percpu_counters are inaccurate. Do an expensive + * check before we go and fail. 
+ */ + if (percpu_counter_sum(&nr_files) >= + files_stat.max_files) + goto over; + } } f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); if (f == NULL) goto fail; - percpu_counter_inc(&nr_files); + if (!kernel) + percpu_counter_inc(&nr_files); + else + atomic_inc(&nr_kernel_files); + memset(f, 0, sizeof(*f)); if (security_file_alloc(f)) goto fail_sec; @@ -117,6 +131,7 @@ struct file *get_empty_filp(void) INIT_LIST_HEAD(&f->f_u.fu_list); atomic_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); + f->f_kernel_flags = kernel ? FKFLAGS_KERNEL : 0; f->f_uid = tsk->fsuid; f->f_gid = tsk->fsgid; eventpoll_init_file(f); @@ -235,6 +250,7 @@ struct file fastcall *fget_light(unsigne return file; } +EXPORT_SYMBOL(fget_light); void put_filp(struct file *file) { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 3a5b4e9..cc27ee8 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -770,7 +770,7 @@ struct file *hugetlb_zero_setup(size_t s goto out_shm_unlock; error = -ENFILE; - file = get_empty_filp(); + file = get_empty_filp(0); if (!file) goto out_dentry; diff --git a/fs/inotify.c b/fs/inotify.c index 1f50302..2e66e05 100644 --- a/fs/inotify.c +++ b/fs/inotify.c @@ -939,7 +939,7 @@ asmlinkage long sys_inotify_init(void) if (fd < 0) return fd; - filp = get_empty_filp(); + filp = get_empty_filp(0); if (!filp) { ret = -ENFILE; goto out_put_fd; diff --git a/fs/namei.c b/fs/namei.c index 96723ae..6713213 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1146,7 +1146,7 @@ static int __path_lookup_intent_open(int unsigned int lookup_flags, struct nameidata *nd, int open_flags, int create_mode) { - struct file *filp = get_empty_filp(); + struct file *filp = get_empty_filp(0); int err; if (filp == NULL) diff --git a/fs/open.c b/fs/open.c index 53ec28c..cea1538 100644 --- a/fs/open.c +++ b/fs/open.c @@ -962,7 +962,7 @@ struct file *dentry_open(struct dentry * struct file *f; error = -ENFILE; - f = get_empty_filp(); + f = get_empty_filp(0); if (f == NULL) { dput(dentry); 
mntput(mnt); @@ -974,6 +974,26 @@ struct file *dentry_open(struct dentry * EXPORT_SYMBOL(dentry_open); /* + * open a specifically in-kernel file + */ +struct file *dentry_open_kernel(struct dentry *dentry, struct vfsmount *mnt, int flags) +{ + int error; + struct file *f; + + error = -ENFILE; + f = get_empty_filp(1); + if (f == NULL) { + dput(dentry); + mntput(mnt); + return ERR_PTR(error); + } + + return __dentry_open(dentry, mnt, flags, f, NULL); +} +EXPORT_SYMBOL(dentry_open_kernel); + +/* * Find an empty file descriptor entry, and mark it busy. */ int get_unused_fd(void) diff --git a/fs/pipe.c b/fs/pipe.c index 7fefb10..6081367 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -795,11 +795,11 @@ int do_pipe(int *fd) int i, j; error = -ENFILE; - f1 = get_empty_filp(); + f1 = get_empty_filp(0); if (!f1) goto no_files; - f2 = get_empty_filp(); + f2 = get_empty_filp(0); if (!f2) goto close_f1; diff --git a/include/linux/file.h b/include/linux/file.h index 9f7c251..da7be8f 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -79,7 +79,6 @@ extern void FASTCALL(set_close_on_exec(u extern void put_filp(struct file *); extern int get_unused_fd(void); extern void FASTCALL(put_unused_fd(unsigned int fd)); -struct kmem_cache; extern struct file ** alloc_fd_array(int); extern void free_fd_array(struct file **, int); diff --git a/include/linux/fs.h b/include/linux/fs.h index 3de2bfb..979b1d3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -33,6 +33,7 @@ struct files_stat_struct { int nr_files; /* read only */ int nr_free_files; /* read only */ int max_files; /* tunable */ + int nr_kernel_files; /* read only */ }; extern struct files_stat_struct files_stat; extern int get_max_files(void); @@ -70,6 +71,8 @@ extern int dir_notify_enable; behavior for cross-node execution/opening_for_writing of files */ #define FMODE_EXEC 16 +#define FKFLAGS_KERNEL 1 /* kernel internal file (not accounted) */ + #define RW_MASK 1 #define RWA_MASK 2 #define READ 0 @@ -640,6 +643,7 
@@ struct file { atomic_t f_count; unsigned int f_flags; mode_t f_mode; + unsigned short f_kernel_flags; loff_t f_pos; struct fown_struct f_owner; unsigned int f_uid, f_gid; @@ -1377,6 +1381,7 @@ extern long do_sys_open(int fdf, const c int mode); extern struct file *filp_open(const char *, int, int); extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); +extern struct file * dentry_open_kernel(struct dentry *, struct vfsmount *, int); extern int filp_close(struct file *, fl_owner_t id); extern char * getname(const char __user *); @@ -1577,7 +1582,7 @@ static inline void insert_inode_hash(str __insert_inode_hash(inode, inode->i_ino); } -extern struct file * get_empty_filp(void); +extern struct file * get_empty_filp(int kernel); extern void file_move(struct file *f, struct list_head *list); extern void file_kill(struct file *f); struct bio; @@ -1603,6 +1608,7 @@ extern ssize_t generic_file_direct_write unsigned long *, loff_t, loff_t *, size_t, size_t); extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *, unsigned long, loff_t, loff_t *, size_t, ssize_t); +extern int generic_file_buffered_write_one_kernel_page(struct file *, pgoff_t, struct page *); extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); ssize_t generic_file_write_nolock(struct file *file, const struct iovec *iov, diff --git a/kernel/futex.c b/kernel/futex.c index 5699c51..7c334f3 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -779,7 +779,7 @@ static int futex_fd(unsigned long uaddr, ret = get_unused_fd(); if (ret < 0) goto out; - filp = get_empty_filp(); + filp = get_empty_filp(0); if (!filp) { put_unused_fd(ret); ret = -ENFILE; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e82726f..e8f9b5f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -943,7 +943,7 @@ static ctl_table fs_table[] = { .ctl_name = 
FS_NRFILE, .procname = "file-nr", .data = &files_stat, - .maxlen = 3*sizeof(int), + .maxlen = 4*sizeof(int), .mode = 0444, .proc_handler = &proc_nr_files, }, diff --git a/mm/shmem.c b/mm/shmem.c index 37eaf42..83bbbe8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2311,7 +2311,7 @@ struct file *shmem_file_setup(char *name goto put_memory; error = -ENFILE; - file = get_empty_filp(); + file = get_empty_filp(0); if (!file) goto put_dentry; diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index f9d6a9c..b014dd5 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c @@ -71,7 +71,7 @@ struct file *shmem_file_setup(char *name goto put_memory; error = -ENFILE; - file = get_empty_filp(); + file = get_empty_filp(0); if (!file) goto put_dentry; diff --git a/net/socket.c b/net/socket.c index 23898f4..9743df2 100644 --- a/net/socket.c +++ b/net/socket.c @@ -377,7 +377,7 @@ static int sock_alloc_fd(struct file **f fd = get_unused_fd(); if (likely(fd >= 0)) { - struct file *file = get_empty_filp(); + struct file *file = get_empty_filp(0); *filep = file; if (unlikely(!file)) { - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html