On Mon, Apr 02, 2007 at 02:09:34PM +0200, Nick Piggin wrote: > Updated aops patchset against 2.6.21-rc5. > > http://www.kernel.org/pub/linux/kernel/people/npiggin/patches/new-aops/ Thanks again for pushing this stuff out Nick. Fwiw, I had merge conflicts working against the latest git code from Linus - it looks like Jens pushed some splice changes (some from you). A refreshed fs-new-write-aops.patch is attached. The ext3 patch also caused me some problems, but I just temporarily dropped it for my immediate work. --Mark -- Mark Fasheh Senior Software Developer, Oracle mark.fasheh@xxxxxxxxxx Introduce write_begin, write_end, and perform_write aops. These are intended to replace prepare_write and commit_write with more flexible alternatives that are also able to avoid the buffered write deadlock problems efficiently (which prepare_write is unable to do). Signed-off-by: Nick Piggin <npiggin@xxxxxxx> Documentation/filesystems/Locking | 9 - Documentation/filesystems/vfs.txt | 36 +++++ drivers/block/loop.c | 69 +++-------- fs/buffer.c | 161 +++++++++++++++++++++++--- fs/libfs.c | 40 ++++++ fs/namei.c | 47 +------ fs/splice.c | 107 +---------------- include/linux/buffer_head.h | 7 + include/linux/fs.h | 31 +++++ include/linux/pagemap.h | 2 mm/filemap.c | 229 ++++++++++++++++++++++++++++++++++---- 11 files changed, 513 insertions(+), 225 deletions(-) Index: linux-2.6.git/include/linux/fs.h =================================================================== --- linux-2.6.git.orig/include/linux/fs.h +++ linux-2.6.git/include/linux/fs.h @@ -389,6 +389,8 @@ enum positive_aop_returns { AOP_TRUNCATED_PAGE = 0x80001, }; +#define AOP_FLAG_UNINTERRUPTIBLE 0x0001 /* will not do a short write */ + /* * oh the beauties of C type declarations. */ @@ -449,6 +451,17 @@ struct address_space_operations { */ int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); int (*commit_write)(struct file *, struct page *, unsigned, unsigned); + + int (*write_begin)(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); + int (*write_end)(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); + + ssize_t (*perform_write)(struct file *, struct address_space *, + struct iov_iter *, loff_t); + /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); void (*invalidatepage) (struct page *, unsigned long); @@ -463,6 +476,18 @@ struct address_space_operations { int (*launder_page) (struct page *); }; +/* + * pagecache_write_begin/pagecache_write_end must be used by general code + * to write into the pagecache. + */ +int pagecache_write_begin(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); + +int pagecache_write_end(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); + struct backing_dev_info; struct address_space { struct inode *host; /* owner: inode, block_device */ @@ -1907,6 +1932,12 @@ extern int simple_prepare_write(struct f unsigned offset, unsigned to); extern int simple_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to); +extern int simple_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); +extern int simple_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *); extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); Index: linux-2.6.git/mm/filemap.c =================================================================== --- linux-2.6.git.orig/mm/filemap.c +++ linux-2.6.git/mm/filemap.c @@ -2049,6 +2049,93 @@ inline int generic_write_checks(struct f } EXPORT_SYMBOL(generic_write_checks); +int pagecache_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + const struct address_space_operations *aops = mapping->a_ops; + + if (aops->write_begin) { + return aops->write_begin(file, mapping, pos, len, flags, + pagep, fsdata); + } else { + int ret; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + struct inode *inode = mapping->host; + struct page *page; +again: + page = __grab_cache_page(mapping, index); + *pagep = page; + if (!page) + return -ENOMEM; + + if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) { + /* + * There is no way to resolve a short write situation + * for a !Uptodate page (except by double copying in + * the caller done by generic_perform_write_2copy). + * + * Instead, we have to bring it uptodate here. + */ + ret = aops->readpage(file, page); + page_cache_release(page); + if (ret) { + if (ret == AOP_TRUNCATED_PAGE) + goto again; + return ret; + } + goto again; + } + + ret = aops->prepare_write(file, page, offset, offset+len); + if (ret) { + if (ret != AOP_TRUNCATED_PAGE) + unlock_page(page); + page_cache_release(page); + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); + if (ret == AOP_TRUNCATED_PAGE) + goto again; + } + return ret; + } +} +EXPORT_SYMBOL(pagecache_write_begin); + +int pagecache_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + const struct address_space_operations *aops = mapping->a_ops; + int ret; + + if (aops->write_begin) { + ret = aops->write_end(file, mapping, pos, len, copied, + page, fsdata); + } else { + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + struct inode *inode = mapping->host; + + flush_dcache_page(page); + ret = aops->commit_write(file, page, offset, offset+len); + unlock_page(page); + page_cache_release(page); + BUG_ON(ret == AOP_TRUNCATED_PAGE); /* can't deal with */ + + if (ret < 0) { + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); + } else if (ret > 0) + ret = min_t(size_t, copied, ret); + else + ret = copied; + } + + return ret; +} +EXPORT_SYMBOL(pagecache_write_end); + ssize_t generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long *nr_segs, loff_t pos, loff_t *ppos, @@ -2092,8 +2179,7 @@ EXPORT_SYMBOL(generic_file_direct_write) * Find or create a page at the given pagecache position. Return the locked * page. This function is specifically for buffered writes. */ -static struct page *__grab_cache_page(struct address_space *mapping, - pgoff_t index) +struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index) { int status; struct page *page; @@ -2114,20 +2200,16 @@ repeat: } return page; } +EXPORT_SYMBOL(__grab_cache_page); -ssize_t -generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, loff_t *ppos, - size_t count, ssize_t written) +static ssize_t generic_perform_write_2copy(struct file *file, + struct iov_iter *i, loff_t pos) { - struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; const struct address_space_operations *a_ops = mapping->a_ops; - struct inode *inode = mapping->host; - long status = 0; - struct iov_iter i; - - iov_iter_init(&i, iov, nr_segs, count, written); + struct inode *inode = mapping->host; + long status = 0; + ssize_t written = 0; do { struct page *src_page; @@ -2140,7 +2222,7 @@ generic_file_buffered_write(struct kiocb offset = (pos & (PAGE_CACHE_SIZE - 1)); index = pos >> PAGE_CACHE_SHIFT; bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, - iov_iter_count(&i)); + iov_iter_count(i)); /* * a non-NULL src_page indicates that we're doing the @@ -2158,7 +2240,7 @@ generic_file_buffered_write(struct kiocb * to check that the address is actually valid, when atomic * usercopies are used, below. */ - if (unlikely(iov_iter_fault_in_readable(&i))) { + if (unlikely(iov_iter_fault_in_readable(i))) { status = -EFAULT; break; } @@ -2189,7 +2271,7 @@ generic_file_buffered_write(struct kiocb * same reason as we can't take a page fault with a * page locked (as explained below). */ - copied = iov_iter_copy_from_user(src_page, &i, + copied = iov_iter_copy_from_user(src_page, i, offset, bytes); if (unlikely(copied == 0)) { status = -EFAULT; @@ -2214,7 +2296,6 @@ generic_file_buffered_write(struct kiocb page_cache_release(src_page); continue; } - } status = a_ops->prepare_write(file, page, offset, offset+bytes); @@ -2236,7 +2317,7 @@ generic_file_buffered_write(struct kiocb * really matter. */ pagefault_disable(); - copied = iov_iter_copy_from_user_atomic(page, &i, + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); pagefault_enable(); } else { @@ -2262,9 +2343,9 @@ generic_file_buffered_write(struct kiocb if (src_page) page_cache_release(src_page); - iov_iter_advance(&i, copied); - written += copied; + iov_iter_advance(i, copied); pos += copied; + written += copied; balance_dirty_pages_ratelimited(mapping); cond_resched(); @@ -2288,13 +2369,119 @@ fs_write_aop_error: continue; else break; - } while (iov_iter_count(&i)); - *ppos = pos; + } while (iov_iter_count(i)); + + return written ? written : status; +} + +static ssize_t generic_perform_write(struct file *file, + struct iov_iter *i, loff_t pos) +{ + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + long status = 0; + ssize_t written = 0; + + do { + struct page *page; + pgoff_t index; /* Pagecache index for current page */ + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + size_t copied; /* Bytes copied from user */ + void *fsdata; + + offset = (pos & (PAGE_CACHE_SIZE - 1)); + index = pos >> PAGE_CACHE_SHIFT; + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_count(i)); + +again: + + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. + */ + if (unlikely(iov_iter_fault_in_readable(i))) { + status = -EFAULT; + break; + } + + status = a_ops->write_begin(file, mapping, pos, bytes, 0, + &page, &fsdata); + if (unlikely(status)) + break; + + pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); + pagefault_enable(); + flush_dcache_page(page); + + status = a_ops->write_end(file, mapping, pos, bytes, copied, + page, fsdata); + if (unlikely(status < 0)) + break; + copied = status; + + cond_resched(); + + if (unlikely(copied == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fallback here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. + */ + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_single_seg_count(i)); + goto again; + } + iov_iter_advance(i, copied); + pos += copied; + written += copied; + + balance_dirty_pages_ratelimited(mapping); + + } while (iov_iter_count(i)); + + return written ? written : status; +} + +ssize_t +generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, loff_t *ppos, + size_t count, ssize_t written) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + struct inode *inode = mapping->host; + ssize_t status; + struct iov_iter i; + + iov_iter_init(&i, iov, nr_segs, count, written); + if (a_ops->perform_write) + status = a_ops->perform_write(file, mapping, &i, pos); + else if (a_ops->write_begin) + status = generic_perform_write(file, &i, pos); + else + status = generic_perform_write_2copy(file, &i, pos); - /* - * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC - */ if (likely(status >= 0)) { + written += status; + *ppos = pos + status; + + /* + * For now, when the user asks for O_SYNC, we'll actually give + * O_DSYNC + */ if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { if (!a_ops->writepage || !is_sync_kiocb(iocb)) status = generic_osync_inode(inode, mapping, Index: linux-2.6.git/fs/buffer.c =================================================================== --- linux-2.6.git.orig/fs/buffer.c +++ linux-2.6.git/fs/buffer.c @@ -1840,44 +1840,54 @@ static int __block_prepare_write(struct if (!buffer_uptodate(*wait_bh)) err = -EIO; } - if (!err) { - bh = head; - do { - if (buffer_new(bh)) - clear_buffer_new(bh); - } while ((bh = bh->b_this_page) != head); - return 0; - } - /* Error case: */ - /* - * Zero out any newly allocated blocks to avoid exposing stale - * data. If BH_New is set, we know that the block was newly - * allocated in the above loop. - */ - bh = head; + if (unlikely(err)) + page_zero_new_buffers(page, from, to); + return err; +} + +/* + * If a page has any new buffers, zero them out here, and mark them uptodate + * and dirty so they'll be written out (in order to prevent uninitialised + * block data from leaking). And clear the new bit. + */ +void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) +{ + unsigned int block_start, block_end; + struct buffer_head *head, *bh; + + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + return; + + bh = head = page_buffers(page); block_start = 0; do { - block_end = block_start+blocksize; - if (block_end <= from) - goto next_bh; - if (block_start >= to) - break; + block_end = block_start + bh->b_size; + if (buffer_new(bh)) { - void *kaddr; + if (block_end > from && block_start < to) { + if (!PageUptodate(page)) { + unsigned start, end; + void *kaddr; + + start = max(from, block_start); + end = min(to, block_end); + + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr+start, 0, end - start); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + set_buffer_uptodate(bh); + } - clear_buffer_new(bh); - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr+block_start, 0, bh->b_size); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); + clear_buffer_new(bh); + mark_buffer_dirty(bh); + } } -next_bh: + block_start = block_end; bh = bh->b_this_page; } while (bh != head); - return err; } static int __block_commit_write(struct inode *inode, struct page *page, @@ -1901,6 +1911,7 @@ static int __block_commit_write(struct i set_buffer_uptodate(bh); mark_buffer_dirty(bh); } + clear_buffer_new(bh); } /* @@ -1915,6 +1926,115 @@ static int __block_commit_write(struct i } /* + * block_write_begin takes care of the basic task of block allocation and + * bringing partial write blocks uptodate first. + * + * If *pagep is not NULL, then block_write_begin uses the locked page + * at *pagep rather than allocating its own. In this case, the page will + * not be unlocked or deallocated on failure. + */ +int block_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block) +{ + struct inode *inode = mapping->host; + int status = 0; + struct page *page; + pgoff_t index; + unsigned start, end; + int ownpage = 0; + + index = pos >> PAGE_CACHE_SHIFT; + start = pos & (PAGE_CACHE_SIZE - 1); + end = start + len; + + page = *pagep; + if (page == NULL) { + ownpage = 1; + page = __grab_cache_page(mapping, index); + if (!page) { + status = -ENOMEM; + goto out; + } + *pagep = page; + } else + BUG_ON(!PageLocked(page)); + + status = __block_prepare_write(inode, page, start, end, get_block); + if (unlikely(status)) { + ClearPageUptodate(page); + + if (ownpage) { + unlock_page(page); + page_cache_release(page); + + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); + } + goto out; + } + +out: + return status; +} +EXPORT_SYMBOL(block_write_begin); + +int block_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + unsigned start; + + start = pos & (PAGE_CACHE_SIZE - 1); + + if (unlikely(copied < len)) { + /* + * The buffers that were written will now be uptodate, so we + * don't have to worry about a readpage reading them and + * overwriting a partial write. However if we have encountered + * a short write and only partially written into a buffer, it + * will not be marked uptodate, so a readpage might come in and + * destroy our partial write. + * + * Do the simplest thing, and just treat any short write to a + * non uptodate page as a zero-length write, and force the + * caller to redo the whole thing. + */ + if (!PageUptodate(page)) + copied = 0; + + page_zero_new_buffers(page, start+copied, start+len); + } + flush_dcache_page(page); + + /* This could be a short (even 0-length) commit */ + __block_commit_write(inode, page, start, start+copied); + + unlock_page(page); + mark_page_accessed(page); /* XXX: put this in caller? */ + page_cache_release(page); + + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_mutex. + */ + if (pos+copied > inode->i_size) { + i_size_write(inode, pos+copied); + mark_inode_dirty(inode); + } + + return copied; +} +EXPORT_SYMBOL(block_write_end); + +/* * Generic "read page" function for block devices that have the normal * get_block functionality. This is most of the block device filesystems. * Reads the page asynchronously --- the unlock_buffer() and Index: linux-2.6.git/include/linux/buffer_head.h =================================================================== --- linux-2.6.git.orig/include/linux/buffer_head.h +++ linux-2.6.git/include/linux/buffer_head.h @@ -202,6 +202,13 @@ void block_invalidatepage(struct page *p int block_write_full_page(struct page *page, get_block_t *get_block, struct writeback_control *wbc); int block_read_full_page(struct page*, get_block_t*); +int block_write_begin(struct file *, struct address_space *, + loff_t, unsigned, unsigned, + struct page **, void **, get_block_t*); +int block_write_end(struct file *, struct address_space *, + loff_t, unsigned, unsigned, + struct page *, void *); +void page_zero_new_buffers(struct page *page, unsigned from, unsigned to); int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*, loff_t *); Index: linux-2.6.git/include/linux/pagemap.h =================================================================== --- linux-2.6.git.orig/include/linux/pagemap.h +++ linux-2.6.git/include/linux/pagemap.h @@ -85,6 +85,8 @@ unsigned find_get_pages_contig(struct ad unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, int tag, unsigned int nr_pages, struct page **pages); +struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index); + /* * Returns locked page at given index in given cache, creating it if needed. */ Index: linux-2.6.git/fs/libfs.c =================================================================== --- linux-2.6.git.orig/fs/libfs.c +++ linux-2.6.git/fs/libfs.c @@ -342,6 +342,26 @@ int simple_prepare_write(struct file *fi return 0; } +int simple_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct page *page; + pgoff_t index; + unsigned from; + + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + + page = __grab_cache_page(mapping, index); + if (!page) + return -ENOMEM; + + *pagep = page; + + return simple_prepare_write(file, page, from, from+len); +} + int simple_commit_write(struct file *file, struct page *page, unsigned from, unsigned to) { @@ -360,6 +380,28 @@ int simple_commit_write(struct file *fil return 0; } +int simple_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + + /* zero the stale part of the page if we did a short copy */ + if (copied < len) { + void *kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + from + copied, 0, len - copied); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + } + + simple_commit_write(file, page, from, from+copied); + + unlock_page(page); + page_cache_release(page); + + return copied; +} + int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files) { struct inode *inode; @@ -616,6 +658,8 @@ EXPORT_SYMBOL(dcache_dir_open); EXPORT_SYMBOL(dcache_readdir); EXPORT_SYMBOL(generic_read_dir); EXPORT_SYMBOL(get_sb_pseudo); +EXPORT_SYMBOL(simple_write_begin); +EXPORT_SYMBOL(simple_write_end); EXPORT_SYMBOL(simple_commit_write); EXPORT_SYMBOL(simple_dir_inode_operations); EXPORT_SYMBOL(simple_dir_operations); Index: linux-2.6.git/drivers/block/loop.c =================================================================== --- linux-2.6.git.orig/drivers/block/loop.c +++ linux-2.6.git/drivers/block/loop.c @@ -206,11 +206,10 @@ lo_do_transfer(struct loop_device *lo, i * space operations prepare_write and commit_write. */ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec, - int bsize, loff_t pos, struct page *page) + int bsize, loff_t pos, struct page *unused) { struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */ struct address_space *mapping = file->f_mapping; - const struct address_space_operations *aops = mapping->a_ops; pgoff_t index; unsigned offset, bv_offs; int len, ret; @@ -222,67 +221,47 @@ static int do_lo_send_aops(struct loop_d len = bvec->bv_len; while (len > 0) { sector_t IV; - unsigned size; + unsigned size, copied; int transfer_result; + struct page *page; + void *fsdata; IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9); size = PAGE_CACHE_SIZE - offset; if (size > len) size = len; - page = grab_cache_page(mapping, index); - if (unlikely(!page)) + + ret = pagecache_write_begin(file, mapping, pos, size, 0, + &page, &fsdata); + if (ret) goto fail; - ret = aops->prepare_write(file, page, offset, - offset + size); - if (unlikely(ret)) { - if (ret == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - continue; - } - goto unlock; - } + transfer_result = lo_do_transfer(lo, WRITE, page, offset, bvec->bv_page, bv_offs, size, IV); - if (unlikely(transfer_result)) { - char *kaddr; + copied = size; + if (unlikely(transfer_result)) + copied = 0; + + ret = pagecache_write_end(file, mapping, pos, size, copied, + page, fsdata); + if (ret < 0) + goto fail; + if (ret < copied) + copied = ret; - /* - * The transfer failed, but we still write the data to - * keep prepare/commit calls balanced. - */ - printk(KERN_ERR "loop: transfer error block %llu\n", - (unsigned long long)index); - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr + offset, 0, size); - kunmap_atomic(kaddr, KM_USER0); - } - flush_dcache_page(page); - ret = aops->commit_write(file, page, offset, - offset + size); - if (unlikely(ret)) { - if (ret == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - continue; - } - goto unlock; - } if (unlikely(transfer_result)) - goto unlock; - bv_offs += size; - len -= size; + goto fail; + + bv_offs += copied; + len -= copied; offset = 0; index++; - pos += size; - unlock_page(page); - page_cache_release(page); + pos += copied; } ret = 0; out: mutex_unlock(&mapping->host->i_mutex); return ret; -unlock: - unlock_page(page); - page_cache_release(page); fail: ret = -1; goto out; Index: linux-2.6.git/fs/namei.c =================================================================== --- linux-2.6.git.orig/fs/namei.c +++ linux-2.6.git/fs/namei.c @@ -2688,53 +2688,30 @@ int __page_symlink(struct inode *inode, { struct address_space *mapping = inode->i_mapping; struct page *page; + void *fsdata; int err; char *kaddr; retry: - err = -ENOMEM; - page = find_or_create_page(mapping, 0, gfp_mask); - if (!page) - goto fail; - err = mapping->a_ops->prepare_write(NULL, page, 0, len-1); - if (err == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto retry; - } + err = pagecache_write_begin(NULL, mapping, 0, PAGE_CACHE_SIZE, + AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); if (err) - goto fail_map; + goto fail; + kaddr = kmap_atomic(page, KM_USER0); memcpy(kaddr, symname, len-1); + memset(kaddr+len-1, 0, PAGE_CACHE_SIZE-(len-1)); kunmap_atomic(kaddr, KM_USER0); - err = mapping->a_ops->commit_write(NULL, page, 0, len-1); - if (err == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto retry; - } - if (err) - goto fail_map; - /* - * Notice that we are _not_ going to block here - end of page is - * unmapped, so this will only try to map the rest of page, see - * that it is unmapped (typically even will not look into inode - - * ->i_size will be enough for everything) and zero it out. - * OTOH it's obviously correct and should make the page up-to-date. - */ - if (!PageUptodate(page)) { - err = mapping->a_ops->readpage(NULL, page); - if (err != AOP_TRUNCATED_PAGE) - wait_on_page_locked(page); - } else { - unlock_page(page); - } - page_cache_release(page); + + err = pagecache_write_end(NULL, mapping, 0, PAGE_CACHE_SIZE, PAGE_CACHE_SIZE, + page, fsdata); if (err < 0) goto fail; + if (err < PAGE_CACHE_SIZE) + goto retry; + mark_inode_dirty(inode); return 0; -fail_map: - unlock_page(page); - page_cache_release(page); fail: return err; } Index: linux-2.6.git/fs/splice.c =================================================================== --- linux-2.6.git.orig/fs/splice.c +++ linux-2.6.git/fs/splice.c @@ -559,7 +559,7 @@ static int pipe_to_file(struct pipe_inod struct address_space *mapping = file->f_mapping; unsigned int offset, this_len; struct page *page; - pgoff_t index; + void *fsdata; int ret; /* @@ -569,49 +569,16 @@ static int pipe_to_file(struct pipe_inod if (unlikely(ret)) return ret; - index = sd->pos >> PAGE_CACHE_SHIFT; offset = sd->pos & ~PAGE_CACHE_MASK; this_len = sd->len; if (this_len + offset > PAGE_CACHE_SIZE) this_len = PAGE_CACHE_SIZE - offset; -find_page: - page = find_lock_page(mapping, index); - if (!page) { - ret = -ENOMEM; - page = page_cache_alloc_cold(mapping); - if (unlikely(!page)) - goto out_ret; - - /* - * This will also lock the page - */ - ret = add_to_page_cache_lru(page, mapping, index, - GFP_KERNEL); - if (unlikely(ret)) - goto out; - } - - ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len); - if (unlikely(ret)) { - loff_t isize = i_size_read(mapping->host); - - if (ret != AOP_TRUNCATED_PAGE) - unlock_page(page); - page_cache_release(page); - if (ret == AOP_TRUNCATED_PAGE) - goto find_page; - - /* - * prepare_write() may have instantiated a few blocks - * outside i_size. Trim these off again. - */ - if (sd->pos + this_len > isize) - vmtruncate(mapping->host, isize); - - goto out_ret; - } + ret = pagecache_write_begin(file, mapping, sd->pos, sd->len, + AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); + if (unlikely(ret)) + goto out; if (buf->page != page) { /* @@ -621,35 +588,14 @@ find_page: char *dst = kmap_atomic(page, KM_USER1); memcpy(dst + offset, src + buf->offset, this_len); - flush_dcache_page(page); kunmap_atomic(dst, KM_USER1); buf->ops->unmap(pipe, buf, src); } - ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); - if (ret) { - if (ret == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto find_page; - } - if (ret < 0) - goto out; - /* - * Partial write has happened, so 'ret' already initialized by - * number of bytes written, Where is nothing we have to do here. - */ - } else - ret = this_len; - /* - * Return the number of bytes written and mark page as - * accessed, we are now done! - */ - mark_page_accessed(page); - balance_dirty_pages_ratelimited(mapping); + ret = pagecache_write_end(file, mapping, sd->pos, sd->len, sd->len, page, fsdata); + out: - page_cache_release(page); - unlock_page(page); -out_ret: + return ret; } Index: linux-2.6.git/Documentation/filesystems/Locking =================================================================== --- linux-2.6.git.orig/Documentation/filesystems/Locking +++ linux-2.6.git/Documentation/filesystems/Locking @@ -176,15 +176,18 @@ prototypes: locking rules: All except set_page_dirty may block - BKL PageLocked(page) + BKL PageLocked(page) i_sem writepage: no yes, unlocks (see below) readpage: no yes, unlocks sync_page: no maybe writepages: no set_page_dirty no no readpages: no -prepare_write: no yes -commit_write: no yes +prepare_write: no yes yes +commit_write: no yes yes +write_begin: no locks the page yes +write_end: no yes, unlocks yes +perform_write: no n/a yes bmap: yes invalidatepage: no yes releasepage: no yes Index: linux-2.6.git/Documentation/filesystems/vfs.txt =================================================================== --- linux-2.6.git.orig/Documentation/filesystems/vfs.txt +++ linux-2.6.git/Documentation/filesystems/vfs.txt @@ -534,6 +534,14 @@ struct address_space_operations { struct list_head *pages, unsigned nr_pages); int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); int (*commit_write)(struct file *, struct page *, unsigned, unsigned); + int (*write_begin)(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); + int (*write_end)(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); + int (*perform_write)(struct file *, struct address_space *, + struct iov_iter *, loff_t); sector_t (*bmap)(struct address_space *, sector_t); int (*invalidatepage) (struct page *, unsigned long); int (*releasepage) (struct page *, int); @@ -629,6 +637,34 @@ struct address_space_operations { operations. It should avoid returning an error if possible - errors should have been handled by prepare_write. + write_begin: This is intended as a replacement for prepare_write. Called + by the generic buffered write code to ask the filesystem to prepare + to write len bytes at the given offset in the file. flags is a field + for AOP_FLAG_xxx flags, described in include/linux/mm.h. + + The filesystem must return the locked pagecache page for the caller + to write into. + + A void * may be returned in fsdata, which then gets passed into + write_end. + + Returns < 0 on failure, in which case all cleanup must be done and + write_end not called. 0 on success, in which case write_end must + be called. + + write_end: After a successful write_begin, and data copy, write_end must + be called. len is the original len passed to write_begin, and copied + is the amount that was able to be copied (they must be equal if + write_begin was called with intr == 0). + + The filesystem must take care of unlocking the page and dropping its + refcount, and updating i_size. + + Returns < 0 on failure, otherwise the number of bytes (<= 'copied') + that were able to be copied into the file. + + perform_write: + bmap: called by the VFS to map a logical block offset within object to physical block number. This method is used by the FIBMAP ioctl and for working with swap-files. To be able to swap to - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html