On Sun, Feb 19, 2017 at 09:14:41AM +0000, Al Viro wrote: > On Fri, Feb 17, 2017 at 05:09:29PM +0100, Miklos Szeredi wrote: > > A file is opened for read-only, opened read-write (resulting in a copy up) > > and modified. The data read back from the the read-only fd will be stale > > in this case (the read-only file descriptor still refers to the lower, > > unmodified file). > > > > This patchset fixes issues related to this corner case. This is a > > requirement from various parties for accepting overlayfs as a "POSIX" > > filesystem. > > > > When an operation (read, mmap, fsync) is done on an overlay fd opened > > read-only that is referring to a lower file, check if it has been copied up > > in the mean time. If so, open the upper file and use that for the operation. > > > > To make the performance impact minimal for non-overlay case, use a flag in > > file->f_mode to indicate that this is an overlay file. > > This is one hell of a DoS vector - it's really easy to eat tons of struct > file that way. Preparatory parts of that series make sense on their own, > but your "let's allocate a struct file, call ->open() and schedule that > struct file for closing upon the exit to userland on each kernel_read()" > is not. How about this? Do you see a problem with calling __fput() synchronously here? Thanks, Miklos --- fs/Makefile | 2 - fs/file_table.c | 2 - fs/internal.h | 1 fs/overlay_util.c | 53 +++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 11 ++++++++ include/linux/overlay_util.h | 13 ++++++++++ 6 files changed, 80 insertions(+), 2 deletions(-) --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -31,6 +31,7 @@ #include <linux/workqueue.h> #include <linux/percpu-rwsem.h> #include <linux/delayed_call.h> +#include <linux/overlay_util.h> #include <asm/byteorder.h> #include <uapi/linux/fs.h> @@ -1721,9 +1722,19 @@ struct inode_operations { int (*set_acl)(struct inode *, struct posix_acl *, int); } ____cacheline_aligned; + +static inline bool overlay_file_inconsistent(struct file *file) +{ + return unlikely(file->f_path.dentry->d_flags & DCACHE_OP_REAL) && + unlikely(d_real_inode(file->f_path.dentry) != file_inode(file)); +} + static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio, struct iov_iter *iter) { + if (overlay_file_inconsistent(file)) + return overlay_read_iter(file, kio, iter); + return file->f_op->read_iter(kio, iter); } --- a/fs/Makefile +++ b/fs/Makefile @@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table. attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o \ - stack.o fs_struct.o statfs.o fs_pin.o nsfs.o + stack.o fs_struct.o statfs.o fs_pin.o nsfs.o overlay_util.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o --- /dev/null +++ b/fs/overlay_util.c @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2017 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/overlay_util.h> +#include <linux/fs.h> +#include <linux/file.h> +#include "internal.h" + +static struct file *overlay_clone_file(struct file *file) +{ + file = filp_clone_open(file); + if (!IS_ERR(file)) + file->f_mode |= FMODE_NONOTIFY; + + return file; +} + +/* + * Do the release synchronously. Otherwise we'd have a DoS problem when doing + * multiple reads (e.g. through kernel_read()) and only releasing the cloned + * files when returning to userspace. + * + * There's no risk of final dput or final mntput happening, since caller holds + * ref to both through the original file. + */ +static void overlay_put_cloned_file(struct file *file) +{ + if (WARN_ON(!atomic_long_dec_and_test(&file->f_count))) + return; + + __fput(file); +} + +ssize_t overlay_read_iter(struct file *file, struct kiocb *kio, + struct iov_iter *iter) +{ + ssize_t ret; + + file = overlay_clone_file(file); + if (IS_ERR(file)) + return PTR_ERR(file); + + ret = vfs_iter_read(file, iter, &kio->ki_pos); + overlay_put_cloned_file(file); + + return ret; +} +EXPORT_SYMBOL(overlay_read_iter); --- /dev/null +++ b/include/linux/overlay_util.h @@ -0,0 +1,13 @@ +#ifndef _LINUX_OVERLAY_FS_H +#define _LINUX_OVERLAY_FS_H + +#include <linux/types.h> + +struct file; +struct kiocb; +struct iov_iter; + +extern ssize_t overlay_read_iter(struct file *file, struct kiocb *kio, + struct iov_iter *iter); + +#endif /* _LINUX_OVERLAY_FS_H */ --- a/fs/file_table.c +++ b/fs/file_table.c @@ -184,7 +184,7 @@ EXPORT_SYMBOL(alloc_file); /* the real guts of fput() - releasing the last reference to file */ -static void __fput(struct file *file) +void __fput(struct file *file) { struct dentry *dentry = file->f_path.dentry; struct vfsmount *mnt = file->f_path.mnt; --- a/fs/internal.h +++ b/fs/internal.h @@ -83,6 +83,7 @@ extern void chroot_fs_refs(const struct * file_table.c */ extern struct file *get_empty_filp(void); +extern void __fput(struct file *); /* * super.c -- To unsubscribe from this list: send the line "unsubscribe linux-unionfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html