Oleg Nesterov <oleg@xxxxxxxxxx> writes: > On 09/14, Jeff Layton wrote: >> >> Currently, we have a single refcount variable inside the files_struct. >> When we go to unshare the files_struct, we check this counter and if >> it's elevated, then we allocate a new files_struct instead of just >> repurposing the old one, under the assumption that that indicates that >> it's shared between tasks. >> >> This is not necessarily the case, however. Each task associated with the >> files_struct does get a long-held reference, but the refcount can be >> elevated for other reasons as well, by callers of get_files_struct. >> >> This means that we can end up allocating a new files_struct if we just >> happen to race with a call to get_files_struct. Fix this by adding a new >> counter to track the number associated threads, and use that to >> determine whether to allocate a new files_struct when unsharing. > > But who actually needs get_files_struct() ? All users except binder.c need > struct file, not struct files_struct. I think the difficult case is flock64_to_posix_lock. Where it sets fl_owner to current->files. There is also mandatory locking. Ah. I see you are talking about the cases that increment the count on files_struct. So that a count on files_struct is unambiguously the number of threads sharing it. > See the (completely untested) patch below. It adds the new fget_task() helper > and converts all other users to use it. > > As for binder.c, in this case we probably actually want to unshare ->files > on exec so we can ignore it? Looking at the binder case it only captures ->files on mmap. Exec ditches the mmap. So if the order of operations are correct than the dropping of the old mm will also drop the count on files_struct held by binder. So semantically binder should not effect locks on exec, nor should binder effect the files_count. So I think as long as our order of operations are correct we are good. I have concerns about binder. It is not pid namespace safe. It assumes but does not enforce all threads share a single fs struct. Binder at least limits mmapers to the thread group of the opener of the binder file. This limits the worst of the shenanighans with file descriptor passing that I can imagine. In short as long as we get the oder of operations correct we should be able to safely ignore binder, and not have binder affect the results of this code. Eric > Oleg. > --- > > fs/file.c | 29 ++++++++++++- > fs/proc/fd.c | 117 +++++++++++++++++++-------------------------------- > include/linux/file.h | 3 ++ > kernel/bpf/syscall.c | 23 +++------- > kernel/kcmp.c | 20 ++------- > 5 files changed, 83 insertions(+), 109 deletions(-) > > diff --git a/fs/file.c b/fs/file.c > index 7ffd6e9..a685cc0 100644 > --- a/fs/file.c > +++ b/fs/file.c > @@ -676,9 +676,9 @@ void do_close_on_exec(struct files_struct *files) > spin_unlock(&files->file_lock); > } > > -static struct file *__fget(unsigned int fd, fmode_t mask) > +static struct file *__fget_files(struct files_struct *files, > + unsigned int fd, fmode_t mask) > { > - struct files_struct *files = current->files; > struct file *file; > > rcu_read_lock(); > @@ -699,12 +699,37 @@ static struct file *__fget(unsigned int fd, fmode_t mask) > return file; > } > > +static inline struct file *__fget(unsigned int fd, fmode_t mask) > +{ > + return __fget_files(current->files, fd, mask); > +} > + > struct file *fget(unsigned int fd) > { > return __fget(fd, FMODE_PATH); > } > EXPORT_SYMBOL(fget); > > +struct file *fget_task(struct task_struct *task, unsigned int fd) > +{ > + struct files_struct *files; > + struct file *file = ERR_PTR(-EBADF); > + > + task_lock(task); > + /* > + * __fget_files() checks max_fds itself but we want -EBADF > + * if fd is too big; and files_fdtable() needs rcu lock. > + */ > + rcu_read_lock(); > + files = task->files; > + if (files && fd < files_fdtable(files)->max_fds) > + file = __fget_files(files, fd, FMODE_PATH); > + rcu_read_unlock(); > + task_unlock(task); > + > + return file; > +} > + > struct file *fget_raw(unsigned int fd) > { > return __fget(fd, 0); > diff --git a/fs/proc/fd.c b/fs/proc/fd.c > index 81882a1..bb61890 100644 > --- a/fs/proc/fd.c > +++ b/fs/proc/fd.c > @@ -19,54 +19,45 @@ > > static int seq_show(struct seq_file *m, void *v) > { > - struct files_struct *files = NULL; > + struct task_struct *task = get_proc_task(m->private); > + unsigned int fd = proc_fd(m->private); > int f_flags = 0, ret = -ENOENT; > struct file *file = NULL; > - struct task_struct *task; > > - task = get_proc_task(m->private); > if (!task) > - return -ENOENT; > - > - files = get_files_struct(task); > - put_task_struct(task); > - > - if (files) { > - unsigned int fd = proc_fd(m->private); > - > - spin_lock(&files->file_lock); > - file = fcheck_files(files, fd); > - if (file) { > - struct fdtable *fdt = files_fdtable(files); > + return ret; > > - f_flags = file->f_flags; > - if (close_on_exec(fd, fdt)) > - f_flags |= O_CLOEXEC; > + file = fget_task(task, fd); > + if (IS_ERR_OR_NULL(file)) > + goto out; > > - get_file(file); > - ret = 0; > - } > - spin_unlock(&files->file_lock); > - put_files_struct(files); > + /* TODO: add another helper ? */ > + task_lock(task); > + rcu_read_lock(); > + if (task->files) { > + struct fdtable *fdt = files_fdtable(task->files); > + if (fd < fdt->max_fds && close_on_exec(fd, fdt)) > + f_flags |= O_CLOEXEC; > } > - > - if (ret) > - return ret; > + rcu_read_unlock(); > + task_unlock(task); > > seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n", > (long long)file->f_pos, f_flags, > real_mount(file->f_path.mnt)->mnt_id); > > - show_fd_locks(m, file, files); > + /* show_fd_locks() never dereferences file, NULL is fine too */ > + show_fd_locks(m, file, task->files); > if (seq_has_overflowed(m)) > goto out; > > if (file->f_op->show_fdinfo) > file->f_op->show_fdinfo(m, file); > > -out: > fput(file); > - return 0; > +out: > + put_task_struct(task); > + return ret; > } > > static int seq_fdinfo_open(struct inode *inode, struct file *file) > @@ -83,19 +74,14 @@ static const struct file_operations proc_fdinfo_file_operations = { > > static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode) > { > - struct files_struct *files = get_files_struct(task); > - struct file *file; > + struct file *file = fget_task(task, fd); > > - if (!files) > + if (IS_ERR_OR_NULL(file)) > return false; > > - rcu_read_lock(); > - file = fcheck_files(files, fd); > - if (file) > - *mode = file->f_mode; > - rcu_read_unlock(); > - put_files_struct(files); > - return !!file; > + *mode = file->f_mode; > + fput(file); > + return true; > } > > static void tid_fd_update_inode(struct task_struct *task, struct inode *inode, > @@ -146,31 +132,24 @@ static const struct dentry_operations tid_fd_dentry_operations = { > > static int proc_fd_link(struct dentry *dentry, struct path *path) > { > - struct files_struct *files = NULL; > - struct task_struct *task; > + struct task_struct *task = get_proc_task(d_inode(dentry)); > + unsigned int fd = proc_fd(d_inode(dentry)); > + struct file *fd_file; > int ret = -ENOENT; > > task = get_proc_task(d_inode(dentry)); > - if (task) { > - files = get_files_struct(task); > - put_task_struct(task); > - } > - > - if (files) { > - unsigned int fd = proc_fd(d_inode(dentry)); > - struct file *fd_file; > + if (!task) > + return ret; > > - spin_lock(&files->file_lock); > - fd_file = fcheck_files(files, fd); > - if (fd_file) { > - *path = fd_file->f_path; > - path_get(&fd_file->f_path); > - ret = 0; > - } > - spin_unlock(&files->file_lock); > - put_files_struct(files); > + fd_file = fget_task(task, fd); > + if (!IS_ERR_OR_NULL(fd_file)) { > + *path = fd_file->f_path; > + path_get(&fd_file->f_path); > + fput(fd_file); > + ret = 0; > } > > + put_task_struct(task); > return ret; > } > > @@ -229,7 +208,6 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, > instantiate_t instantiate) > { > struct task_struct *p = get_proc_task(file_inode(file)); > - struct files_struct *files; > unsigned int fd; > > if (!p) > @@ -237,37 +215,30 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, > > if (!dir_emit_dots(file, ctx)) > goto out; > - files = get_files_struct(p); > - if (!files) > - goto out; > > - rcu_read_lock(); > - for (fd = ctx->pos - 2; > - fd < files_fdtable(files)->max_fds; > - fd++, ctx->pos++) { > + for (fd = ctx->pos - 2;; fd++, ctx->pos++) { > struct file *f; > struct fd_data data; > char name[10 + 1]; > unsigned int len; > > - f = fcheck_files(files, fd); > + f = fget_task(p, fd); > if (!f) > continue; > + if (IS_ERR(f)) > + break; > + > data.mode = f->f_mode; > - rcu_read_unlock(); > data.fd = fd; > + fput(f); > > len = snprintf(name, sizeof(name), "%u", fd); > if (!proc_fill_cache(file, ctx, > name, len, instantiate, p, > &data)) > - goto out_fd_loop; > + goto out; > cond_resched(); > - rcu_read_lock(); > } > - rcu_read_unlock(); > -out_fd_loop: > - put_files_struct(files); > out: > put_task_struct(p); > return 0; > diff --git a/include/linux/file.h b/include/linux/file.h > index 6b2fb03..8b7abdb 100644 > --- a/include/linux/file.h > +++ b/include/linux/file.h > @@ -43,6 +43,9 @@ static inline void fdput(struct fd fd) > fput(fd.file); > } > > +struct task_struct; > +extern struct file *fget_task(struct task_struct *task, unsigned int fd); > + > extern struct file *fget(unsigned int fd); > extern struct file *fget_raw(unsigned int fd); > extern unsigned long __fdget(unsigned int fd); > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index 8339d81..2cbf26d 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -2260,7 +2260,6 @@ static int bpf_task_fd_query(const union bpf_attr *attr, > pid_t pid = attr->task_fd_query.pid; > u32 fd = attr->task_fd_query.fd; > const struct perf_event *event; > - struct files_struct *files; > struct task_struct *task; > struct file *file; > int err; > @@ -2278,24 +2277,13 @@ static int bpf_task_fd_query(const union bpf_attr *attr, > if (!task) > return -ENOENT; > > - files = get_files_struct(task); > - put_task_struct(task); > - if (!files) > - return -ENOENT; > - > - err = 0; > - spin_lock(&files->file_lock); > - file = fcheck_files(files, fd); > - if (!file) > - err = -EBADF; > - else > - get_file(file); > - spin_unlock(&files->file_lock); > - put_files_struct(files); > - > - if (err) > + err = -EBADF; > + file = fget_task(task, fd); > + if (IS_ERR_OR_NULL(file)) > goto out; > > + err = -ENOTSUPP; > + > if (file->f_op == &bpf_raw_tp_fops) { > struct bpf_raw_tracepoint *raw_tp = file->private_data; > struct bpf_raw_event_map *btp = raw_tp->btp; > @@ -2324,7 +2312,6 @@ static int bpf_task_fd_query(const union bpf_attr *attr, > goto put_file; > } > > - err = -ENOTSUPP; > put_file: > fput(file); > out: > diff --git a/kernel/kcmp.c b/kernel/kcmp.c > index a0e3d7a..067639e 100644 > --- a/kernel/kcmp.c > +++ b/kernel/kcmp.c > @@ -107,7 +107,6 @@ static int kcmp_epoll_target(struct task_struct *task1, > { > struct file *filp, *filp_epoll, *filp_tgt; > struct kcmp_epoll_slot slot; > - struct files_struct *files; > > if (copy_from_user(&slot, uslot, sizeof(slot))) > return -EFAULT; > @@ -116,23 +115,12 @@ static int kcmp_epoll_target(struct task_struct *task1, > if (!filp) > return -EBADF; > > - files = get_files_struct(task2); > - if (!files) > + filp_epoll = fget_task(task2, slot.efd); > + if (IS_ERR_OR_NULL(filp_epoll)) > return -EBADF; > > - spin_lock(&files->file_lock); > - filp_epoll = fcheck_files(files, slot.efd); > - if (filp_epoll) > - get_file(filp_epoll); > - else > - filp_tgt = ERR_PTR(-EBADF); > - spin_unlock(&files->file_lock); > - put_files_struct(files); > - > - if (filp_epoll) { > - filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); > - fput(filp_epoll); > - } > + filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); > + fput(filp_epoll); > > if (IS_ERR(filp_tgt)) > return PTR_ERR(filp_tgt);