On Thu, Jan 24, 2019 at 11:35 PM Amir Goldstein <amir73il@xxxxxxxxx> wrote: > > On Thu, Jan 24, 2019 at 7:18 PM Amir Goldstein <amir73il@xxxxxxxxx> wrote: > > > > > > > > Can you please point me in the direction of how to implement proper > > > aops? Or better yet, send me a demo patch? > > > I figure we need to call vfs_iter_read/vfs_iter_write of real file > > > with IOCB_DIRECT for page io? But I am very uncertain about locking > > > order requirements and whether there is already some infrastructure > > > that we can use (some splice variant)? > > > > > > Any other thoughts? > > > > > > > Miklos, > > > > I've made a small progress. > > > > Pushed two more commits to ovl-lazy-copyup-wip: > > ea9a0083d1ed - ovl: readahead upper inode page cache on overlay page fault > > d1ff6e6366d5 - ovl: implement overlay stacked readpage > > Pushed two more: > 838c119934a2 - ovl: implement write_begin/write_end aops > 44b9e89154ba - ovl: return overlay inode i_size for stat() > > > > > That implements readpage by reading and copying from upper page cache. > > > > With that change, the basic unionmount test on tmpfs passes. > > That is not surprising. overlay filemap ops are disabled for upper > tmpfs in current > implementation, because tmpfs has no readpage(). > > > With unionmount over xfs, tests fail because partial page write > > is not working correctly, although mmap partial write works well. > > > > But that is fixed now, so ./run --ov --samefs also passes over xfs. > Which says something bad about our test coverage... > > > I will try to follow similar pattern to implement writepage by copy to > > upper page cache. > > > > ...Luckily, I have implemented ./run --ov --recycle way back, otherwise we > would have had very poor test coverage for persistency of overlayfs writes. > > So we currently have 6 xfstests failing on write persistency: > overlay/018 overlay/032 overlay/044 overlay/047 overlay/050 overlay/051 > > and 2 unionmount tests (over /base xfs): > ./run --ov --samefs --recycle rename-new-pop-dir > ./run --ov --samefs --recycle rename-mass-dir Great progress. Thanks! I don't think you need to mess with dcache flushing, since this is all kernel (page cache) pages without any possible dcache aliasing issues that plague user pages. So a simple copy_page() should do. It would be even better to do it with O_DIRECT to remove cache duplication. Something like the attached... Thanks, Miklos
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 8e4056b262db..6f1e50202779 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -35,9 +35,19 @@ static struct file *ovl_open_realfile(const struct file *file, struct inode *inode = file_inode(file); struct file *realfile; const struct cred *old_cred; + int flags = file->f_flags | O_NOATIME; + + if (realinode == ovl_inode_upper(inode)) { + if (flags & O_WRONLY) + flags = (flags & ~O_ACCMODE) | O_RDWR; + + if (realinode->i_mapping->a_ops && + realinode->i_mapping->a_ops->direct_IO) + flags |= O_DIRECT; + } old_cred = ovl_override_creds(inode->i_sb); - realfile = open_with_fake_path(&file->f_path, file->f_flags | O_NOATIME, + realfile = open_with_fake_path(&file->f_path, flags, realinode, current_cred()); revert_creds(old_cred); @@ -122,6 +132,9 @@ static int ovl_real_fdget_meta(const struct file *file, struct fd *real, return PTR_ERR_OR_ZERO(real->file); } + if (realinode == ovl_inode_upper(inode)) + return 0; + /* Did the flags change since open? */ if (unlikely((file->f_flags ^ real->file->f_flags) & ~O_NOATIME)) return ovl_change_flags(real->file, file->f_flags); @@ -145,7 +158,6 @@ static bool ovl_filemap_support(struct file *file) static bool ovl_should_use_filemap_meta(struct file *file, bool meta_only) { struct inode *inode = file_inode(file); - struct inode *upper; int err; if (!ovl_filemap_support(file)) @@ -168,12 +180,6 @@ static bool ovl_should_use_filemap_meta(struct file *file, bool meta_only) } } - /* Does upper inode support page cache operations? */ - upper = ovl_inode_upper(inode); - if (!upper || unlikely(!upper->i_mapping || - !upper->i_mapping->a_ops->readpage)) - return false; - /* * If file was opened O_RDWR and @meta_only is true, we use overlay * inode filemap operations, but defer data copy up further. @@ -186,7 +192,7 @@ static bool ovl_should_use_filemap_meta(struct file *file, bool meta_only) * including pure upper inodes, so ovl_sync_fs() can sync all dirty * overlay inodes without having to sync all upper fs dirty inodes. */ - return upper && ovl_has_upperdata(inode); + return ovl_inode_upper(inode) && ovl_has_upperdata(inode); } static bool ovl_should_use_filemap(struct file *file) @@ -404,52 +410,43 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) return ovl_real_fsync(file, start, end, datasync); } -static struct page *ovl_real_get_page(struct file *file, pgoff_t index) -{ - struct file *realfile = file->private_data; - struct page *page; - page = read_mapping_page(file_inode(realfile)->i_mapping, index, NULL); - if (IS_ERR(page)) - return page; - if (!PageUptodate(page)) { - put_page(page); - return ERR_PTR(-EIO); - } - lock_page(page); - - return page; -} - -static void ovl_copy_page(struct page *dst_page, struct page *src_page) +static int ovl_real_readpage(struct file *realfile, struct page *page) { - void *src_addr = kmap_atomic(src_page); - void *dst_addr = kmap_atomic(dst_page); + struct bio_vec bvec = { + .bv_page = page, + .bv_len = PAGE_SIZE, + .bv_offset = 0, + }; + loff_t pos = page->index << PAGE_SHIFT; + struct iov_iter iter; + ssize_t ret; - flush_dcache_page(src_page); - memcpy(dst_addr, src_addr, PAGE_SIZE); + iov_iter_bvec(&iter, READ, &bvec, 1, PAGE_SIZE); - kunmap_atomic(dst_addr); - kunmap_atomic(src_addr); + ret = vfs_iter_read(realfile, &iter, &pos, 0); + + return ret < 0 ? ret : 0; } static int ovl_do_readpage(struct file *file, struct page *page) { - struct page *realpage; - int ret = 0; + int ret; + struct fd real; + const struct cred *old_cred; - realpage = ovl_real_get_page(file, page->index); - if (!IS_ERR(realpage)) { - ovl_copy_page(page, realpage); - unlock_page(realpage); - put_page(realpage); - } else { - ret = PTR_ERR(realpage); - clear_highpage(page); - } + ret = ovl_real_fdget(file, &real); + if (ret) + return ret; + + old_cred = ovl_override_creds(file_inode(file)->i_sb); + ret = ovl_real_readpage(real.file, page); + revert_creds(old_cred); + + fdput(real); - flush_dcache_page(page); - SetPageUptodate(page); + if (!ret) + SetPageUptodate(page); return ret; } @@ -470,6 +467,7 @@ static int ovl_write_begin(struct file *file, struct address_space *mapping, { struct page *page; pgoff_t index; + int err; index = pos >> PAGE_SHIFT; @@ -477,11 +475,20 @@ static int ovl_write_begin(struct file *file, struct address_space *mapping, if (!page) return -ENOMEM; - *pagep = page; - if (!PageUptodate(page) && (len != PAGE_SIZE)) - return ovl_do_readpage(file, page); + if (!PageUptodate(page) && len != PAGE_SIZE) { + err = ovl_do_readpage(file, page); + if (err) { + pr_warn("ovl_do_readpage: %i", err); + + unlock_page(page); + put_page(page); + return -EIO; + } + } + + *pagep = page; return 0; }