As per recommendations made at LSF'08, a stackable file system which does not change the data across layers, should try to use vm_operations instead of address_space_operations. This means we now have to implement out own ->read and ->write methods because we cannot rely on VFS helpers which require us to have a ->readpage method. Either way there are two caveats: (1) It's not possible currently to set inode->i_mapping->a_ops to NULL, because too many code paths expect i_mapping to be non-NULL. (2) a small/dummy ->readpage is still needed because generic_file_mmap, which we used in unionfs_mmap, still check for the existence of the ->readpage method. These code paths may have to be changed to remove the need for readpage(). Signed-off-by: Erez Zadok <ezk@xxxxxxxxxxxxx> --- fs/unionfs/file.c | 109 +++++++++++++++-- fs/unionfs/mmap.c | 338 +++++++--------------------------------------------- fs/unionfs/union.h | 4 +- 3 files changed, 147 insertions(+), 304 deletions(-) diff --git a/fs/unionfs/file.c b/fs/unionfs/file.c index 1fcaf8e..9397ff3 100644 --- a/fs/unionfs/file.c +++ b/fs/unionfs/file.c @@ -18,17 +18,83 @@ #include "union.h" +static ssize_t unionfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + int err; + struct file *lower_file; + struct dentry *dentry = file->f_path.dentry; + + unionfs_read_lock(dentry->d_sb, UNIONFS_SMUTEX_PARENT); + unionfs_lock_dentry(dentry, UNIONFS_DMUTEX_CHILD); + err = unionfs_file_revalidate_locked(file, false); + if (unlikely(err)) + goto out; + + lower_file = unionfs_lower_file(file); + err = vfs_read(lower_file, buf, count, ppos); + /* update our inode atime upon a successful lower read */ + if (err >= 0) { + fsstack_copy_attr_atime(file->f_path.dentry->d_inode, + lower_file->f_path.dentry->d_inode); + unionfs_check_file(file); + } + +out: + unionfs_unlock_dentry(dentry); + unionfs_read_unlock(file->f_path.dentry->d_sb); + return err; +} + +static ssize_t unionfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + int err = 0; + struct file *lower_file; + struct dentry *dentry = file->f_path.dentry; + + unionfs_read_lock(dentry->d_sb, UNIONFS_SMUTEX_PARENT); + unionfs_lock_dentry(dentry, UNIONFS_DMUTEX_CHILD); + err = unionfs_file_revalidate_locked(file, true); + if (unlikely(err)) + goto out; + + lower_file = unionfs_lower_file(file); + err = vfs_write(lower_file, buf, count, ppos); + /* update our inode times+sizes upon a successful lower write */ + if (err >= 0) { + fsstack_copy_inode_size(file->f_path.dentry->d_inode, + lower_file->f_path.dentry->d_inode); + fsstack_copy_attr_times(file->f_path.dentry->d_inode, + lower_file->f_path.dentry->d_inode); + unionfs_check_file(file); + } + +out: + unionfs_unlock_dentry(dentry); + unionfs_read_unlock(file->f_path.dentry->d_sb); + return err; +} + + static int unionfs_file_readdir(struct file *file, void *dirent, filldir_t filldir) { return -ENOTDIR; } +int unionfs_readpage_dummy(struct file *file, struct page *page) +{ + BUG(); + return -EINVAL; +} + static int unionfs_mmap(struct file *file, struct vm_area_struct *vma) { int err = 0; bool willwrite; struct file *lower_file; + struct vm_operations_struct *saved_vm_ops = NULL; unionfs_read_lock(file->f_path.dentry->d_sb, UNIONFS_SMUTEX_PARENT); @@ -54,12 +120,39 @@ static int unionfs_mmap(struct file *file, struct vm_area_struct *vma) err = -EINVAL; printk(KERN_ERR "unionfs: branch %d file system does not " "support writeable mmap\n", fbstart(file)); - } else { - err = generic_file_mmap(file, vma); - if (err) - printk(KERN_ERR - "unionfs: generic_file_mmap failed %d\n", err); + goto out; + } + + /* + * find and save lower vm_ops. + * + * XXX: the VFS should have a cleaner way of finding the lower vm_ops + */ + if (!UNIONFS_F(file)->lower_vm_ops) { + err = lower_file->f_op->mmap(lower_file, vma); + if (err) { + printk(KERN_ERR "unionfs: lower mmap failed %d\n", err); + goto out; + } + saved_vm_ops = vma->vm_ops; + err = do_munmap(current->mm, vma->vm_start, + vma->vm_end - vma->vm_start); + if (err) { + printk(KERN_ERR "unionfs: do_munmap failed %d\n", err); + goto out; + } + } + + file->f_mapping->a_ops = &unionfs_dummy_aops; + err = generic_file_mmap(file, vma); + file->f_mapping->a_ops = &unionfs_aops; + if (err) { + printk(KERN_ERR "unionfs: generic_file_mmap failed %d\n", err); + goto out; } + vma->vm_ops = &unionfs_vm_ops; + if (!UNIONFS_F(file)->lower_vm_ops) + UNIONFS_F(file)->lower_vm_ops = saved_vm_ops; out: if (!err) { @@ -222,10 +315,8 @@ out: struct file_operations unionfs_main_fops = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = unionfs_read, + .write = unionfs_write, .readdir = unionfs_file_readdir, .unlocked_ioctl = unionfs_ioctl, .mmap = unionfs_mmap, diff --git a/fs/unionfs/mmap.c b/fs/unionfs/mmap.c index d6ac61e..07db5b0 100644 --- a/fs/unionfs/mmap.c +++ b/fs/unionfs/mmap.c @@ -19,316 +19,66 @@ #include "union.h" -static int unionfs_writepage(struct page *page, struct writeback_control *wbc) -{ - int err = -EIO; - struct inode *inode; - struct inode *lower_inode; - struct page *lower_page; - struct address_space *lower_mapping; /* lower inode mapping */ - gfp_t mask; - - BUG_ON(!PageUptodate(page)); - inode = page->mapping->host; - /* if no lower inode, nothing to do */ - if (!inode || !UNIONFS_I(inode) || UNIONFS_I(inode)->lower_inodes) { - err = 0; - goto out; - } - lower_inode = unionfs_lower_inode(inode); - lower_mapping = lower_inode->i_mapping; - - /* - * find lower page (returns a locked page) - * - * We turn off __GFP_FS while we look for or create a new lower - * page. This prevents a recursion into the file system code, which - * under memory pressure conditions could lead to a deadlock. This - * is similar to how the loop driver behaves (see loop_set_fd in - * drivers/block/loop.c). If we can't find the lower page, we - * redirty our page and return "success" so that the VM will call us - * again in the (hopefully near) future. - */ - mask = mapping_gfp_mask(lower_mapping) & ~(__GFP_FS); - lower_page = find_or_create_page(lower_mapping, page->index, mask); - if (!lower_page) { - err = 0; - set_page_dirty(page); - goto out; - } - - /* copy page data from our upper page to the lower page */ - copy_highpage(lower_page, page); - flush_dcache_page(lower_page); - SetPageUptodate(lower_page); - set_page_dirty(lower_page); - - /* - * Call lower writepage (expects locked page). However, if we are - * called with wbc->for_reclaim, then the VFS/VM just wants to - * reclaim our page. Therefore, we don't need to call the lower - * ->writepage: just copy our data to the lower page (already done - * above), then mark the lower page dirty and unlock it, and return - * success. - */ - if (wbc->for_reclaim) { - unlock_page(lower_page); - goto out_release; - } - - BUG_ON(!lower_mapping->a_ops->writepage); - wait_on_page_writeback(lower_page); /* prevent multiple writers */ - clear_page_dirty_for_io(lower_page); /* emulate VFS behavior */ - err = lower_mapping->a_ops->writepage(lower_page, wbc); - if (err < 0) - goto out_release; - - /* - * Lower file systems such as ramfs and tmpfs, may return - * AOP_WRITEPAGE_ACTIVATE so that the VM won't try to (pointlessly) - * write the page again for a while. But those lower file systems - * also set the page dirty bit back again. Since we successfully - * copied our page data to the lower page, then the VM will come - * back to the lower page (directly) and try to flush it. So we can - * save the VM the hassle of coming back to our page and trying to - * flush too. Therefore, we don't re-dirty our own page, and we - * never return AOP_WRITEPAGE_ACTIVATE back to the VM (we consider - * this a success). - * - * We also unlock the lower page if the lower ->writepage returned - * AOP_WRITEPAGE_ACTIVATE. (This "anomalous" behaviour may be - * addressed in future shmem/VM code.) - */ - if (err == AOP_WRITEPAGE_ACTIVATE) { - err = 0; - unlock_page(lower_page); - } - - /* all is well */ - - /* lower mtimes have changed: update ours */ - unionfs_copy_attr_times(inode); - -out_release: - /* b/c find_or_create_page increased refcnt */ - page_cache_release(lower_page); -out: - /* - * We unlock our page unconditionally, because we never return - * AOP_WRITEPAGE_ACTIVATE. - */ - unlock_page(page); - return err; -} -static int unionfs_writepages(struct address_space *mapping, - struct writeback_control *wbc) +/* + * XXX: we need a dummy readpage handler because generic_file_mmap (which we + * use in unionfs_mmap) checks for the existence of + * mapping->a_ops->readpage, else it returns -ENOEXEC. The VFS will need to + * be fixed to allow a file system to define vm_ops->fault without any + * address_space_ops whatsoever. + * + * Otherwise, we don't want to use our readpage method at all. + */ +static int unionfs_readpage(struct file *file, struct page *page) { - int err = 0; - struct inode *lower_inode; - struct inode *inode; - - inode = mapping->host; - if (ibstart(inode) < 0 && ibend(inode) < 0) - goto out; - lower_inode = unionfs_lower_inode(inode); - if (!lower_inode) - goto out; - - err = generic_writepages(mapping, wbc); - if (!err) - unionfs_copy_attr_times(inode); -out: - return err; + BUG(); + return -EINVAL; } -/* Readpage expects a locked page, and must unlock it */ -static int unionfs_readpage(struct file *file, struct page *page) +static int unionfs_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int err; - struct file *lower_file; - struct inode *inode; - mm_segment_t old_fs; - char *page_data = NULL; - mode_t orig_mode; + struct file *file, *lower_file; + struct vm_operations_struct *lower_vm_ops; - unionfs_read_lock(file->f_path.dentry->d_sb, UNIONFS_SMUTEX_PARENT); - err = unionfs_file_revalidate(file, false); - if (unlikely(err)) - goto out; - unionfs_check_file(file); - - if (!UNIONFS_F(file)) { - err = -ENOENT; - goto out; - } + BUG_ON(!vma); + file = vma->vm_file; + lower_vm_ops = UNIONFS_F(file)->lower_vm_ops; + BUG_ON(!lower_vm_ops); lower_file = unionfs_lower_file(file); - /* FIXME: is this assertion right here? */ - BUG_ON(lower_file == NULL); - - inode = file->f_path.dentry->d_inode; - - page_data = (char *) kmap(page); - /* - * Use vfs_read because some lower file systems don't have a - * readpage method, and some file systems (esp. distributed ones) - * don't like their pages to be accessed directly. Using vfs_read - * may be a little slower, but a lot safer, as the VFS does a lot of - * the necessary magic for us. - */ - lower_file->f_pos = page_offset(page); - old_fs = get_fs(); - set_fs(KERNEL_DS); - /* - * generic_file_splice_write may call us on a file not opened for - * reading, so temporarily allow reading. - */ - orig_mode = lower_file->f_mode; - lower_file->f_mode |= FMODE_READ; - err = vfs_read(lower_file, page_data, PAGE_CACHE_SIZE, - &lower_file->f_pos); - lower_file->f_mode = orig_mode; - set_fs(old_fs); - if (err >= 0 && err < PAGE_CACHE_SIZE) - memset(page_data + err, 0, PAGE_CACHE_SIZE - err); - kunmap(page); - - if (err < 0) - goto out; - err = 0; - - /* if vfs_read succeeded above, sync up our times */ - unionfs_copy_attr_times(inode); - - flush_dcache_page(page); - + BUG_ON(!lower_file); /* - * we have to unlock our page, b/c we _might_ have gotten a locked - * page. but we no longer have to wakeup on our page here, b/c - * UnlockPage does it + * XXX: we set the vm_file to the lower_file, before calling the + * lower ->fault op, then we restore the vm_file back to the upper + * file. Need to change the ->fault prototype to take an explicit + * struct file, and fix all users accordingly. */ -out: - if (err == 0) - SetPageUptodate(page); - else - ClearPageUptodate(page); - - unlock_page(page); - unionfs_check_file(file); - unionfs_read_unlock(file->f_path.dentry->d_sb); - - return err; -} - -static int unionfs_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to) -{ - int err; - - unionfs_read_lock(file->f_path.dentry->d_sb, UNIONFS_SMUTEX_PARENT); - err = unionfs_file_revalidate(file, true); - if (!err) { - unionfs_copy_attr_times(file->f_path.dentry->d_inode); - unionfs_check_file(file); - } - unionfs_read_unlock(file->f_path.dentry->d_sb); - + vma->vm_file = lower_file; + err = lower_vm_ops->fault(vma, vmf); + vma->vm_file = file; return err; } -static int unionfs_commit_write(struct file *file, struct page *page, - unsigned from, unsigned to) -{ - int err = -ENOMEM; - struct inode *inode, *lower_inode; - struct file *lower_file = NULL; - unsigned bytes = to - from; - char *page_data = NULL; - mm_segment_t old_fs; - - BUG_ON(file == NULL); - - unionfs_read_lock(file->f_path.dentry->d_sb, UNIONFS_SMUTEX_PARENT); - err = unionfs_file_revalidate(file, true); - if (unlikely(err)) - goto out; - unionfs_check_file(file); - - inode = page->mapping->host; - - if (UNIONFS_F(file) != NULL) - lower_file = unionfs_lower_file(file); - - /* FIXME: is this assertion right here? */ - BUG_ON(lower_file == NULL); - - page_data = (char *)kmap(page); - lower_file->f_pos = page_offset(page) + from; - - /* - * We use vfs_write instead of copying page data and the - * prepare_write/commit_write combo because file system's like - * GFS/OCFS2 don't like things touching those directly, - * calling the underlying write op, while a little bit slower, will - * call all the FS specific code as well - */ - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = vfs_write(lower_file, page_data + from, bytes, - &lower_file->f_pos); - set_fs(old_fs); - - kunmap(page); - - if (err < 0) - goto out; - - /* if vfs_write succeeded above, sync up our times/sizes */ - lower_inode = lower_file->f_path.dentry->d_inode; - if (!lower_inode) - lower_inode = unionfs_lower_inode(inode); - BUG_ON(!lower_inode); - fsstack_copy_inode_size(inode, lower_inode); - unionfs_copy_attr_times(inode); - mark_inode_dirty_sync(inode); - -out: - if (err < 0) - ClearPageUptodate(page); - - unionfs_check_file(file); - unionfs_read_unlock(file->f_path.dentry->d_sb); - return err; /* assume all is ok */ -} - /* - * Although unionfs isn't a block-based file system, it may stack on one. - * ->bmap is needed, for example, to swapon(2) files. + * XXX: the default address_space_ops for unionfs is empty. We cannot set + * our inode->i_mapping->a_ops to NULL because too many code paths expect + * the a_ops vector to be non-NULL. */ -sector_t unionfs_bmap(struct address_space *mapping, sector_t block) -{ - int err = -EINVAL; - struct inode *inode, *lower_inode; - sector_t (*bmap)(struct address_space *, sector_t); - - inode = (struct inode *)mapping->host; - lower_inode = unionfs_lower_inode(inode); - if (!lower_inode) - goto out; - bmap = lower_inode->i_mapping->a_ops->bmap; - if (bmap) - err = bmap(lower_inode->i_mapping, block); -out: - return err; -} - - struct address_space_operations unionfs_aops = { - .writepage = unionfs_writepage, - .writepages = unionfs_writepages, + /* empty on purpose */ +}; + +/* + * XXX: we need a second, dummy address_space_ops vector, to be used + * temporarily during unionfs_mmap, because the latter calls + * generic_file_mmap, which checks if ->readpage exists, else returns + * -ENOEXEC. + */ +struct address_space_operations unionfs_dummy_aops = { .readpage = unionfs_readpage, - .prepare_write = unionfs_prepare_write, - .commit_write = unionfs_commit_write, - .bmap = unionfs_bmap, +}; + +struct vm_operations_struct unionfs_vm_ops = { + .fault = unionfs_fault, }; diff --git a/fs/unionfs/union.h b/fs/unionfs/union.h index 7b55e33..f6fabf8 100644 --- a/fs/unionfs/union.h +++ b/fs/unionfs/union.h @@ -75,7 +75,8 @@ extern struct inode_operations unionfs_dir_iops; extern struct inode_operations unionfs_symlink_iops; extern struct super_operations unionfs_sops; extern struct dentry_operations unionfs_dops; -extern struct address_space_operations unionfs_aops; +extern struct address_space_operations unionfs_aops, unionfs_dummy_aops; +extern struct vm_operations_struct unionfs_vm_ops; /* How long should an entry be allowed to persist */ #define RDCACHE_JIFFIES (5*HZ) @@ -89,6 +90,7 @@ struct unionfs_file_info { struct unionfs_dir_state *rdstate; struct file **lower_files; int *saved_branch_ids; /* IDs of branches when file was opened */ + struct vm_operations_struct *lower_vm_ops; }; /* unionfs inode data in memory */ -- 1.5.2.2 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html