Re: [RFC][PATCH v2 0/5] Experiments with overlayfs filemap

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



There's some progress: it makes it through xfstests without crashing
or deadlocking.  There are still some regressing test cases, but
things definitely look better.

What remains is ctime/mtime support.

Also fallocate/copyfile/reflink/etc will need some thought as we need
to not only flush out dirty pages, but in some cases prevent new
faults while the operation is in progress.

And after that it needs to be optimized (->readpages() and
->writepages()), but I think that's the easy part.

Thanks,
Miklos
commit 248c44a527ca1a88c76cac90b1127ae2d6b09b75
Author: Miklos Szeredi <mszeredi@xxxxxxxxxx>
Date:   Thu Jan 31 10:14:03 2019 +0100

    ovl-page-cache-fix.patch

diff --git a/fs/buffer.c b/fs/buffer.c
index d80b9f821dea..52d024bfdbc1 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2117,7 +2117,6 @@ int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied,
 		mark_inode_dirty(inode);
 	return copied;
 }
-EXPORT_SYMBOL(__generic_write_end);
 
 int block_write_end(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned copied,
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index c922682ff1b2..19ff61362a14 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -17,6 +17,7 @@
 #include <linux/fadvise.h>
 #include <linux/writeback.h>
 #include <linux/ratelimit.h>
+#include <linux/workqueue.h>
 #include "overlayfs.h"
 
 static char ovl_whatisit(struct inode *inode, struct inode *realinode)
@@ -35,15 +36,7 @@ static struct file *ovl_open_realfile(const struct file *file,
 	struct inode *inode = file_inode(file);
 	struct file *realfile;
 	const struct cred *old_cred;
-	int flags = file->f_flags | O_NOATIME;
-
-	if (realinode == ovl_inode_upper(inode)) {
-		/* tmpfs has no readpage a_op, so need to read realfile */
-		if ((flags & O_WRONLY) &&
-		    (!realinode->i_mapping ||
-		     !realinode->i_mapping->a_ops->readpage))
-			flags = (flags & ~O_ACCMODE) | O_RDWR;
-	}
+	int flags = (file->f_flags & O_ACCMODE) | O_NOATIME | O_LARGEFILE;
 
 	old_cred = ovl_override_creds(inode->i_sb);
 	realfile = open_with_fake_path(&file->f_path, flags,
@@ -57,6 +50,44 @@ static struct file *ovl_open_realfile(const struct file *file,
 	return realfile;
 }
 
+/* Get the inode-wide upper file, opening and caching it on first use */
+static struct file *ovl_open_upper(const struct file *file,
+				   struct inode *realinode)
+{
+	struct inode *inode = file_inode(file);
+	struct ovl_inode *oi = OVL_I(inode);
+	struct file *upper, *winner;
+	const struct cred *saved_cred;
+	struct path upperpath;
+
+	/* Fast path: an upper file is already cached on the overlay inode */
+	upper = READ_ONCE(oi->upper_file);
+	if (upper)
+		return upper;
+
+	ovl_path_upper(file_dentry(file), &upperpath);
+
+	saved_cred = ovl_override_creds(inode->i_sb);
+	upper = open_with_fake_path(&upperpath,
+				    O_RDWR | O_NOATIME | O_LARGEFILE,
+				    realinode, current_cred());
+	revert_creds(saved_cred);
+
+	pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n",
+		 file, file, ovl_whatisit(inode, realinode), file->f_flags,
+		 upper, IS_ERR(upper) ? 0 : upper->f_flags);
+	if (IS_ERR(upper))
+		return upper;
+
+	/* If another opener published first, drop ours and return theirs */
+	winner = cmpxchg(&oi->upper_file, NULL, upper);
+	if (!winner)
+		return upper;
+	fput(upper);
+	return winner;
+}
+
+#if 0
 #define OVL_SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT)
 
 static int ovl_change_flags(struct file *file, unsigned int flags)
@@ -94,13 +125,14 @@ static int ovl_change_flags(struct file *file, unsigned int flags)
 
 	return 0;
 }
+#endif
 
 static bool ovl_filemap_support(const struct file *file)
 {
-	struct ovl_fs *ofs = file_inode(file)->i_sb->s_fs_info;
+	//struct ovl_fs *ofs = file_inode(file)->i_sb->s_fs_info;
 
 	/* TODO: implement aops to upper inode data */
-	return ofs->upper_mnt && ovl_aops.writepage;
+	return true;
 }
 
 static int ovl_file_maybe_copy_up(const struct file *file, bool allow_meta)
@@ -119,15 +151,17 @@ static int ovl_file_maybe_copy_up(const struct file *file, bool allow_meta)
 	return ovl_maybe_copy_up(file_dentry(file), copy_up_flags);
 }
 
-static int ovl_real_fdget_meta(const struct file *file, struct fd *real,
-			       bool allow_meta)
+static int ovl_real_fdget(const struct file *file, struct fd *real)
 {
 	struct inode *inode = file_inode(file);
+	struct ovl_inode *oi = OVL_I(inode);
 	struct inode *realinode;
 	int err;
 
 	real->flags = 0;
-	real->file = file->private_data;
+	real->file = READ_ONCE(oi->upper_file);
+	if (!real->file)
+		real->file = file->private_data;
 
 	/*
 	 * Lazy copy up caches the meta copy upper file on open O_RDWR.
@@ -136,33 +170,41 @@ static int ovl_real_fdget_meta(const struct file *file, struct fd *real,
 	 * we may try to open a lower file O_RDWR or perform data operations
 	 * (e.g. fallocate) on the metacopy inode.
 	 */
-	err = ovl_file_maybe_copy_up(file, allow_meta);
+	err = ovl_file_maybe_copy_up(file, false);
 	if (err)
 		return err;
 
-	if (allow_meta)
-		realinode = ovl_inode_real(inode);
-	else
-		realinode = ovl_inode_realdata(inode);
+	realinode = ovl_inode_realdata(inode);
 
 	/* Has it been copied up since we'd opened it? */
 	if (unlikely(file_inode(real->file) != realinode)) {
-		real->flags = FDPUT_FPUT;
-		real->file = ovl_open_realfile(file, realinode);
+		real->file = ovl_open_upper(file, realinode);
 
 		return PTR_ERR_OR_ZERO(real->file);
 	}
 
+#if 0
 	/* Did the flags change since open? */
 	if (unlikely((file->f_flags ^ real->file->f_flags) & ~O_NOATIME))
 		return ovl_change_flags(real->file, file->f_flags);
+#endif
 
 	return 0;
 }
 
-static int ovl_real_fdget(const struct file *file, struct fd *real)
+static int ovl_real_fdget_meta(const struct file *file, struct fd *real,
+			       bool allow_meta)
 {
-	return ovl_real_fdget_meta(file, real, false);
+	struct inode *inode = file_inode(file);
+	struct inode *upperinode = ovl_inode_upper(inode);
+
+	if (!allow_meta || !upperinode || ovl_has_upperdata(inode))
+		return ovl_real_fdget(file, real);
+
+	real->flags = FDPUT_FPUT;
+	real->file = ovl_open_realfile(file, upperinode);
+
+	return PTR_ERR_OR_ZERO(real->file);
 }
 
 static bool ovl_should_use_filemap_meta(struct file *file, bool allow_meta)
@@ -220,6 +262,7 @@ static int ovl_flush_filemap(struct file *file, loff_t offset, loff_t len)
 static int ovl_open(struct inode *inode, struct file *file)
 {
 	struct file *realfile;
+	struct inode *realinode;
 	int err;
 	bool allow_meta = (file->f_mode & FMODE_WRITE) &&
 			ovl_filemap_support(file);
@@ -228,22 +271,28 @@ static int ovl_open(struct inode *inode, struct file *file)
 	if (err)
 		return err;
 
-	/* No longer need these flags, so don't pass them on to underlying fs */
-	file->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
+	realinode = allow_meta ?
+		ovl_inode_real(inode) : ovl_inode_realdata(inode);
 
-	realfile = ovl_open_realfile(file, allow_meta ? ovl_inode_real(inode) :
-				     ovl_inode_realdata(inode));
-	if (IS_ERR(realfile))
-		return PTR_ERR(realfile);
+	if (realinode == ovl_inode_upper(inode)) {
+		realfile = ovl_open_upper(file, realinode);
+		if (IS_ERR(realfile))
+			return PTR_ERR(realfile);
+	} else {
+		realfile = ovl_open_realfile(file, realinode);
+		if (IS_ERR(realfile))
+			return PTR_ERR(realfile);
 
-	file->private_data = realfile;
+		file->private_data = realfile;
+	}
 
 	return 0;
 }
 
 static int ovl_release(struct inode *inode, struct file *file)
 {
-	fput(file->private_data);
+	if (file->private_data)
+		fput(file->private_data);
 
 	return 0;
 }
@@ -254,7 +303,7 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
 
 	return generic_file_llseek_size(file, offset, whence,
 					realinode->i_sb->s_maxbytes,
-					i_size_read(realinode));
+					i_size_read(file_inode(file)));
 }
 
 static void ovl_file_accessed(struct file *file)
@@ -292,6 +341,10 @@ static rwf_t ovl_iocb_to_rwf(struct kiocb *iocb)
 		flags |= RWF_DSYNC;
 	if (ifl & IOCB_SYNC)
 		flags |= RWF_SYNC;
+	if (ifl & IOCB_APPEND)
+		flags |= RWF_APPEND;
+	if (ifl & IOCB_DIRECT)
+		flags |= RWF_DIRECT;
 
 	return flags;
 }
@@ -362,7 +415,7 @@ static ssize_t ovl_real_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 	revert_creds(old_cred);
 
 	/* Update size */
-	ovl_copyattr(ovl_inode_real(inode), inode);
+	ovl_copyattr_size(ovl_inode_real(inode), inode);
 
 	fdput(real);
 
@@ -375,9 +428,14 @@ static ssize_t ovl_real_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
+
+	if (ovl_should_use_filemap(file)) {
+		/* Update mode for file_remove_privs() */
+		ovl_copyattr(ovl_inode_real(inode), inode);
 
-	if (ovl_should_use_filemap(file))
 		return generic_file_write_iter(iocb, iter);
+	}
 
 	return ovl_real_write_iter(iocb, iter);
 }
@@ -387,6 +445,7 @@ static int ovl_real_fsync(struct file *file, loff_t start, loff_t end,
 			  int datasync)
 {
 	struct fd real;
+	struct inode *inode = file_inode(file);
 	const struct cred *old_cred;
 	int ret;
 
@@ -395,12 +454,11 @@ static int ovl_real_fsync(struct file *file, loff_t start, loff_t end,
 		return ret;
 
 	/* Don't sync lower file for fear of receiving EROFS error */
-	if (file_inode(real.file) == ovl_inode_upper(file_inode(file))) {
-		old_cred = ovl_override_creds(file_inode(file)->i_sb);
+	if (file_inode(real.file) == ovl_inode_upper(inode)) {
+		old_cred = ovl_override_creds(inode->i_sb);
 		ret = vfs_fsync_range(real.file, start, end, datasync);
 		revert_creds(old_cred);
 	}
-
 	fdput(real);
 
 	return ret;
@@ -408,8 +466,13 @@ static int ovl_real_fsync(struct file *file, loff_t start, loff_t end,
 
 static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
-	if (ovl_should_use_filemap_meta(file, true))
-		return __generic_file_fsync(file, start, end, datasync);
+	int err;
+
+	if (ovl_should_use_filemap_meta(file, true)) {
+		err = file_write_and_wait_range(file, start, end);
+		if (err)
+			return err;
+	}
 
 	return ovl_real_fsync(file, start, end, datasync);
 }
@@ -417,7 +480,6 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 static vm_fault_t ovl_fault(struct vm_fault *vmf)
 {
 	struct file *file = vmf->vma->vm_file;
-	struct file *realfile = file->private_data;
 	struct inode *inode = file_inode(file);
 	bool blocking = (vmf->flags & FAULT_FLAG_KILLABLE) ||
 			((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
@@ -443,13 +505,7 @@ static vm_fault_t ovl_fault(struct vm_fault *vmf)
 			goto out_err;
 
 		return ret;
-	} else {
-		err = vfs_fadvise(realfile, vmf->pgoff << PAGE_SHIFT,
-				  file->f_ra.ra_pages, POSIX_FADV_WILLNEED);
-		if (err)
-			goto out_err;
 	}
-
 	return filemap_fault(vmf);
 
 out_err:
@@ -526,23 +582,30 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset,
 	const struct cred *old_cred;
 	int ret;
 
+	inode_lock(inode);
+
 	/* XXX: Different modes need to flush different ranges... */
 	ret = ovl_flush_filemap(file, 0, LLONG_MAX);
 	if (ret)
-		return ret;
+		goto unlock;
 
 	ret = ovl_real_fdget(file, &real);
 	if (ret)
-		return ret;
+		goto unlock;
 
 	old_cred = ovl_override_creds(file_inode(file)->i_sb);
 	ret = vfs_fallocate(real.file, mode, offset, len);
 	revert_creds(old_cred);
 
 	/* Update size */
-	ovl_copyattr(ovl_inode_real(inode), inode);
+	ovl_copyattr_size(ovl_inode_real(inode), inode);
+
+	/* and invalidate mappings and page cache */
+	truncate_pagecache_range(inode, 0, LLONG_MAX);
 
 	fdput(real);
+unlock:
+	inode_unlock(inode);
 
 	return ret;
 }
@@ -670,24 +733,23 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
 	const struct cred *old_cred;
 	loff_t ret;
 
-	/* XXX: For some ops zero length means EOF... */
-	ret = ovl_flush_filemap(file_out, pos_out, len ?: LLONG_MAX);
+	inode_lock(inode_out);
+
+	ret = ovl_flush_filemap(file_out, pos_out, LLONG_MAX);
 	if (ret)
-		return ret;
+		goto unlock;
 
-	ret = ovl_flush_filemap(file_in, pos_in, len ?: LLONG_MAX);
+	ret = ovl_flush_filemap(file_in, pos_in, LLONG_MAX);
 	if (ret)
-		return ret;
+		goto unlock;
 
 	ret = ovl_real_fdget(file_out, &real_out);
 	if (ret)
-		return ret;
+		goto unlock;
 
 	ret = ovl_real_fdget(file_in, &real_in);
-	if (ret) {
-		fdput(real_out);
-		return ret;
-	}
+	if (ret)
+		goto fdput_out;
 
 	old_cred = ovl_override_creds(file_inode(file_out)->i_sb);
 	switch (op) {
@@ -710,10 +772,18 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
 	revert_creds(old_cred);
 
 	/* Update size */
-	ovl_copyattr(ovl_inode_real(inode_out), inode_out);
+	ovl_copyattr_size(ovl_inode_real(inode_out), inode_out);
+
+	if (op != OVL_DEDUPE)
+		truncate_pagecache_range(inode_out,
+					 round_down(pos_out, PAGE_SIZE),
+					 LLONG_MAX);
 
 	fdput(real_in);
+fdput_out:
 	fdput(real_out);
+unlock:
+	inode_unlock(inode_out);
 
 	return ret;
 }
@@ -770,39 +840,6 @@ const struct file_operations ovl_file_operations = {
 	.remap_file_range	= ovl_remap_file_range,
 };
 
-static struct page *ovl_real_get_page(struct file *realfile, pgoff_t index)
-{
-	struct page *page;
-
-	page = read_mapping_page(file_inode(realfile)->i_mapping, index, NULL);
-	if (IS_ERR(page))
-		return page;
-
-	if (!PageUptodate(page)) {
-		put_page(page);
-		return ERR_PTR(-EIO);
-	}
-
-	lock_page(page);
-
-	return page;
-}
-
-static int ovl_real_copy_page(struct file *realfile, struct page *page)
-{
-	struct page *realpage;
-
-	realpage = ovl_real_get_page(realfile, page->index);
-	if (IS_ERR(realpage))
-		return PTR_ERR(realpage);
-
-	copy_highpage(page, realpage);
-	unlock_page(realpage);
-	put_page(realpage);
-
-	return 0;
-}
-
 static int ovl_real_readpage(struct file *realfile, struct page *page)
 {
 	struct bio_vec bvec = {
@@ -817,22 +854,25 @@ static int ovl_real_readpage(struct file *realfile, struct page *page)
 	iov_iter_bvec(&iter, READ, &bvec, 1, PAGE_SIZE);
 
 	ret = vfs_iter_read(realfile, &iter, &pos, 0);
+	if (ret >= 0 && ret < PAGE_SIZE)
+		zero_user_segment(page, ret, PAGE_SIZE);
 
 	return ret < 0 ? ret : 0;
 }
 
 static int ovl_do_readpage(struct file *file, struct page *page)
 {
-	struct file *realfile = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct ovl_inode *oi = OVL_I(inode);
+	struct file *realfile = READ_ONCE(oi->upper_file);
 	const struct cred *old_cred;
 	int ret;
 
-	/* tmpfs has no readpage a_op, so need to read with f_op */
+	if (!realfile)
+		realfile = file->private_data;
+
 	old_cred = ovl_override_creds(file_inode(file)->i_sb);
-	if (!realfile->f_mapping || !realfile->f_mapping->a_ops->readpage)
-		ret = ovl_real_readpage(realfile, page);
-	else
-		ret = ovl_real_copy_page(realfile, page);
+	ret = ovl_real_readpage(realfile, page);
 	revert_creds(old_cred);
 
 	if (!ret)
@@ -841,14 +881,37 @@ static int ovl_do_readpage(struct file *file, struct page *page)
 	return 0;
 }
 
+struct ovl_readpage_work {
+	struct file *file;		/* handed to ovl_do_readpage() */
+	struct page *page;		/* read into, then unlocked */
+	struct work_struct work;	/* queued on ovl_wq by ovl_readpage() */
+};
+
+/* Worker for ovl_readpage(): read the page, unlock it, free the work item */
+static void ovl_readpage_work_fn(struct work_struct *work)
+{
+	struct ovl_readpage_work *ow = container_of(work, typeof(*ow), work);
+
+	ovl_do_readpage(ow->file, ow->page);
+	unlock_page(ow->page);
+	kfree(ow);	/* kmalloc'ed in ovl_readpage(); nothing else frees it */
+}
+
 static int ovl_readpage(struct file *file, struct page *page)
 {
-	int ret;
+	struct ovl_readpage_work *ow;
 
-	ret = ovl_do_readpage(file, page);
-	unlock_page(page);
+	ow = kmalloc(sizeof(*ow), GFP_KERNEL);
+	if (!ow) {
+		unlock_page(page);
+		return -ENOMEM;
+	}
+	ow->file = file;
+	ow->page = page;
+	INIT_WORK(&ow->work, ovl_readpage_work_fn);
+	queue_work(ovl_wq, &ow->work);
 
-	return ret;
+	return 0;
 }
 
 static int ovl_write_begin(struct file *file, struct address_space *mapping,
@@ -881,76 +944,74 @@ static int ovl_write_begin(struct file *file, struct address_space *mapping,
 	return 0;
 }
 
-static int ovl_real_write_end(struct file *file, loff_t pos,
-			      unsigned int copied, struct page *page)
-{
-	struct file *realfile = file->private_data;
-	unsigned int offset = (pos & (PAGE_SIZE - 1));
-	struct bio_vec bvec = {
-		.bv_page = page,
-		.bv_len = copied,
-		.bv_offset = offset,
-	};
-	struct iov_iter iter;
-	const struct cred *old_cred;
-	ssize_t ret;
-
-	iov_iter_bvec(&iter, WRITE, &bvec, 1, copied);
-
-	old_cred = ovl_override_creds(file_inode(file)->i_sb);
-	ret = vfs_iter_write(realfile, &iter, &pos, 0);
-	revert_creds(old_cred);
-
-	return ret < 0 ? ret : 0;
-}
-
-extern int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied,
-			       struct page *page);
-
 static int ovl_write_end(struct file *file, struct address_space *mapping,
 			 loff_t pos, unsigned len, unsigned copied,
 			 struct page *page, void *fsdata)
 {
-	int err;
+	int res;
 
-	err = ovl_real_write_end(file, pos, copied, page);
-	if (err) {
-		pr_warn("ovl_write_end: %i", err);
-		unlock_page(page);
-		put_page(page);
+	res = simple_write_end(file, mapping, pos, len, copied, page, fsdata);
 
-		return -EIO;
-	}
+	//pr_info("ovl_write_end: page->flags: %lx\n", page->flags);
 
-	return __generic_write_end(file_inode(file), pos, copied, page);
+	return res;
 }
 
 static int ovl_real_writepage(struct page *page, struct writeback_control *wbc)
 {
-	struct inode *realinode = ovl_inode_real(page->mapping->host);
-	struct page *realpage;
-	int ret;
+	struct ovl_inode *oi = OVL_I(page->mapping->host);
+	struct file *realfile = oi->upper_file;
+	loff_t pos = page->index << PAGE_SHIFT;
+	loff_t size = i_size_read(page->mapping->host);
+	size_t len = PAGE_SIZE;
+	struct bio_vec bvec = {
+		.bv_page = page,
+		.bv_len = PAGE_SIZE,
+		.bv_offset = 0,
+	};
+	struct iov_iter iter;
+	ssize_t ret;
+	rwf_t flags = 0;
 
-	if (!realinode->i_mapping || !realinode->i_mapping->a_ops->writepage)
-		return -EIO;
+	if (size <= pos) {
+		pr_info("ovl_writepage: size = %lli pos = %lli\n", size, pos);
+		return 0;
+	}
+	
+	if (size < pos + PAGE_SIZE) {
+		/* Can't do direct I/O for tail pages */
+		len = size - pos;
+	} else if (realfile->f_mapping->a_ops &&
+		   realfile->f_mapping->a_ops->direct_IO) {
+		flags |= RWF_DIRECT;
+	}
 
-	realpage = grab_cache_page(realinode->i_mapping, page->index);
-	copy_highpage(realpage, page);
-	set_page_dirty(realpage);
+	//pr_info("ovl_real_writepage(%lli)\n", pos);
+	iov_iter_bvec(&iter, WRITE, &bvec, 1, len);
 
-	/* Start writeback on and unlock real page */
-	ret = realinode->i_mapping->a_ops->writepage(realpage, wbc);
-	put_page(realpage);
+	ret = vfs_iter_write(realfile, &iter, &pos, flags);
+	if (ret != len)
+		pr_warn("ovl_read_writepage: write returned %zi (not %zi)\n",
+			ret, len);
 
-	return ret;
+	/* FADV_DONTNEED for tail pages*/
+
+	//pr_info("ovl_real_writepage(%lli) = %li\n", pos, ret);
+
+	return ret < 0 ? ret : 0;
 }
 
 static int ovl_writepage(struct page *page, struct writeback_control *wbc)
 {
 	int ret;
+	const struct cred *old_cred;
 
 	set_page_writeback(page);
+
+	old_cred = ovl_override_creds(page->mapping->host->i_sb);
 	ret = ovl_real_writepage(page, wbc);
+	revert_creds(old_cred);
+
 	unlock_page(page);
 
 	/*
@@ -962,6 +1023,36 @@ static int ovl_writepage(struct page *page, struct writeback_control *wbc)
 	return ret;
 }
 
+/* ->direct_IO: forward the I/O to the real file; returns bytes or -errno */
+static ssize_t ovl_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
+	rwf_t flags = ovl_iocb_to_rwf(iocb);
+	const struct cred *old_cred;
+	struct fd real;
+	ssize_t ret;
+
+	ret = ovl_real_fdget(file, &real);
+	if (ret)
+		return ret;
+
+	old_cred = ovl_override_creds(inode->i_sb);
+	if (iov_iter_rw(iter) == READ) {
+		/* No early return: creds and the real fd are dropped below */
+		ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, flags);
+	} else {
+		file_start_write(real.file);
+		ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, flags);
+		file_end_write(real.file);
+		/* Update size */
+		ovl_copyattr_size(ovl_inode_real(inode), inode);
+	}
+	revert_creds(old_cred);
+	fdput(real);
+	return ret;
+}
+
 const struct address_space_operations ovl_aops = {
 	.readpage	= ovl_readpage,
 	.write_begin	= ovl_write_begin,
@@ -969,5 +1060,5 @@ const struct address_space_operations ovl_aops = {
 	.set_page_dirty	= __set_page_dirty_nobuffers,
 	.writepage	= ovl_writepage,
 	/* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */
-	.direct_IO	= noop_direct_IO,
+	.direct_IO	= ovl_direct_IO,
 };
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 766505598ed1..e139f92b64ba 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/cred.h>
 #include <linux/xattr.h>
@@ -18,10 +19,12 @@
 
 int ovl_setattr(struct dentry *dentry, struct iattr *attr)
 {
+	struct inode *upperinode, *inode = d_inode(dentry);
 	int err;
 	bool full_copy_up = false;
 	struct dentry *upperdentry;
 	const struct cred *old_cred;
+	bool is_truncate = attr->ia_valid & ATTR_SIZE;
 
 	err = setattr_prepare(dentry, attr);
 	if (err)
@@ -31,47 +34,57 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
 	if (err)
 		goto out;
 
-	if (attr->ia_valid & ATTR_SIZE) {
+	if (is_truncate) {
 		struct inode *realinode = d_inode(ovl_dentry_real(dentry));
 
 		err = -ETXTBSY;
 		if (atomic_read(&realinode->i_writecount) < 0)
 			goto out_drop_write;
 
+		err = filemap_write_and_wait_range(inode->i_mapping,
+						   0, LLONG_MAX);
+		if (err)
+			goto out_drop_write;
+
+
 		/* Truncate should trigger data copy up as well */
 		full_copy_up = true;
+
 	}
 
 	if (!full_copy_up)
 		err = ovl_copy_up(dentry);
 	else
 		err = ovl_copy_up_with_data(dentry);
-	if (!err) {
-		struct inode *winode = NULL;
 
-		upperdentry = ovl_dentry_upper(dentry);
+	if (err)
+		goto out_drop_write;
 
-		if (attr->ia_valid & ATTR_SIZE) {
-			winode = d_inode(upperdentry);
-			err = get_write_access(winode);
-			if (err)
-				goto out_drop_write;
-		}
+	upperdentry = ovl_dentry_upper(dentry);
+	upperinode = d_inode(upperdentry);
 
-		if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
-			attr->ia_valid &= ~ATTR_MODE;
+	if (is_truncate) {
+		err = get_write_access(upperinode);
+		if (err)
+			goto out_drop_write;
+	}
 
-		inode_lock(upperdentry->d_inode);
-		old_cred = ovl_override_creds(dentry->d_sb);
-		err = notify_change(upperdentry, attr, NULL);
-		revert_creds(old_cred);
-		if (!err)
-			ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
-		inode_unlock(upperdentry->d_inode);
+	if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
+		attr->ia_valid &= ~ATTR_MODE;
 
-		if (winode)
-			put_write_access(winode);
+	inode_lock(upperinode);
+	old_cred = ovl_override_creds(dentry->d_sb);
+	err = notify_change(upperdentry, attr, NULL);
+	revert_creds(old_cred);
+	if (!err) {
+		if (is_truncate)
+			truncate_setsize(inode, i_size_read(upperinode));
+		ovl_copyattr(upperinode, inode);
 	}
+	inode_unlock(upperinode);
+
+	if (is_truncate)
+		put_write_access(upperinode);
 out_drop_write:
 	ovl_drop_write(dentry);
 out:
@@ -148,7 +161,8 @@ int ovl_getattr(const struct path *path, struct kstat *stat,
 	enum ovl_path_type type;
 	struct path realpath;
 	const struct cred *old_cred;
-	bool is_dir = S_ISDIR(dentry->d_inode->i_mode);
+	struct inode *inode = d_inode(dentry);
+	bool is_dir = S_ISDIR(inode->i_mode);
 	bool samefs = ovl_same_sb(dentry->d_sb);
 	struct ovl_layer *lower_layer = NULL;
 	int err;
@@ -162,6 +176,13 @@ int ovl_getattr(const struct path *path, struct kstat *stat,
 	if (err)
 		goto out;
 
+	/*
+	 * With overlay page cache, overlay inode i_size is more uptodate than
+	 * real inode i_size. Perhaps we should generic_fillattr() and only
+	 * update individual stats from real inode?
+	 */
+	stat->size = i_size_read(inode);
+
 	/*
 	 * For non-dir or same fs, we use st_ino of the copy up origin.
 	 * This guaranties constant st_dev/st_ino across copy up.
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index b5155ed6d516..576488162257 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -12,6 +12,9 @@
 #include <linux/fs.h>
 #include "ovl_entry.h"
 
+struct workqueue_struct;
+extern struct workqueue_struct *ovl_wq;
+
 enum ovl_path_type {
 	__OVL_PATH_UPPER	= (1 << 0),
 	__OVL_PATH_MERGE	= (1 << 1),
@@ -376,6 +379,11 @@ static inline void ovl_copyattr(struct inode *from, struct inode *to)
 	to->i_atime = from->i_atime;
 	to->i_mtime = from->i_mtime;
 	to->i_ctime = from->i_ctime;
+}
+
+static inline void ovl_copyattr_size(struct inode *from, struct inode *to)
+{
+	ovl_copyattr(from, to);
 	i_size_write(to, i_size_read(from));
 }
 
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index ec237035333a..ab10c75c4b98 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -99,6 +99,7 @@ struct ovl_inode {
 	struct inode vfs_inode;
 	struct dentry *__upperdentry;
 	struct inode *lower;
+	struct file *upper_file;
 
 	/* synchronize copy up and more */
 	struct mutex lock;
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 0116735cc321..cfbf1d8994c0 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -9,6 +9,7 @@
 
 #include <uapi/linux/magic.h>
 #include <linux/fs.h>
+#include <linux/file.h>
 #include <linux/namei.h>
 #include <linux/xattr.h>
 #include <linux/mount.h>
@@ -16,6 +17,7 @@
 #include <linux/module.h>
 #include <linux/statfs.h>
 #include <linux/seq_file.h>
+#include <linux/workqueue.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/exportfs.h>
 #include "overlayfs.h"
@@ -27,6 +29,8 @@ MODULE_LICENSE("GPL");
 
 struct ovl_dir_cache;
 
+struct workqueue_struct *ovl_wq;
+
 #define OVL_MAX_STACK 500
 
 static bool ovl_redirect_dir_def = IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_DIR);
@@ -185,6 +189,7 @@ static struct inode *ovl_alloc_inode(struct super_block *sb)
 	oi->__upperdentry = NULL;
 	oi->lower = NULL;
 	oi->lowerdata = NULL;
+	oi->upper_file = NULL;
 	mutex_init(&oi->lock);
 
 	return &oi->vfs_inode;
@@ -201,6 +206,8 @@ static void ovl_destroy_inode(struct inode *inode)
 {
 	struct ovl_inode *oi = OVL_I(inode);
 
+	if (oi->upper_file)
+		fput(oi->upper_file);
 	dput(oi->__upperdentry);
 	iput(oi->lower);
 	if (S_ISDIR(inode->i_mode))
@@ -378,7 +385,6 @@ static int ovl_remount(struct super_block *sb, int *flags, char *data)
 static const struct super_operations ovl_super_operations = {
 	.alloc_inode	= ovl_alloc_inode,
 	.destroy_inode	= ovl_destroy_inode,
-	.drop_inode	= generic_delete_inode,
 	.put_super	= ovl_put_super,
 	.sync_fs	= ovl_sync_fs,
 	.statfs		= ovl_statfs,
@@ -1427,6 +1433,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 	struct cred *cred;
 	int err;
 
+	err = super_setup_bdi(sb);
+	if (err)
+		goto out;
+
 	err = -ENOMEM;
 	ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL);
 	if (!ofs)
@@ -1585,7 +1595,11 @@ static void ovl_inode_init_once(void *foo)
 
 static int __init ovl_init(void)
 {
-	int err;
+	int err = -ENOMEM;
+
+	ovl_wq = alloc_workqueue("ovl_writeback", 0, 0);
+	if (!ovl_wq)
+		goto out;
 
 	ovl_inode_cachep = kmem_cache_create("ovl_inode",
 					     sizeof(struct ovl_inode), 0,
@@ -1593,12 +1607,19 @@ static int __init ovl_init(void)
 					      SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     ovl_inode_init_once);
 	if (ovl_inode_cachep == NULL)
-		return -ENOMEM;
+		goto out_destroy_wq;
 
 	err = register_filesystem(&ovl_fs_type);
 	if (err)
-		kmem_cache_destroy(ovl_inode_cachep);
+		goto out_cache_destroy;
+
+	return 0;
 
+out_cache_destroy:
+	kmem_cache_destroy(ovl_inode_cachep);
+out_destroy_wq:
+	destroy_workqueue(ovl_wq);
+out:
 	return err;
 }
 
@@ -1613,6 +1634,8 @@ static void __exit ovl_exit(void)
 	rcu_barrier();
 	kmem_cache_destroy(ovl_inode_cachep);
 
+	destroy_workqueue(ovl_wq);
+
 }
 
 module_init(ovl_init);
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 54ed62a1d9d6..b1989c479947 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -413,7 +413,7 @@ void ovl_inode_init(struct inode *inode, struct dentry *upperdentry,
 	if (lowerdata)
 		OVL_I(inode)->lowerdata = igrab(d_inode(lowerdata));
 
-	ovl_copyattr(realinode, inode);
+	ovl_copyattr_size(realinode, inode);
 	ovl_copyflags(realinode, inode);
 	if (!inode->i_ino)
 		inode->i_ino = realinode->i_ino;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 811c77743dad..2901ffc43c11 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3339,6 +3339,8 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
 		ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
 	if (flags & RWF_APPEND)
 		ki->ki_flags |= IOCB_APPEND;
+	if (flags & RWF_DIRECT)
+		ki->ki_flags |= IOCB_DIRECT;
 	return 0;
 }
 
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 121e82ce296b..b6e821ddb066 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -342,8 +342,11 @@ typedef int __bitwise __kernel_rwf_t;
 /* per-IO O_APPEND */
 #define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)
 
+/* per-IO O_DIRECT */
+#define RWF_DIRECT	((__force __kernel_rwf_t)0x00000020)
+
 /* mask of flags supported by the kernel */
 #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
-			 RWF_APPEND)
+			 RWF_APPEND | RWF_DIRECT)
 
 #endif /* _UAPI_LINUX_FS_H */

[Index of Archives]     [Linux Filesystems Devel]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux