Nick Piggin <npiggin@xxxxxxx> writes: > Introduce write_begin, write_end, and perform_write aops. > > These are intended to replace prepare_write and commit_write with more > flexible alternatives that are also able to avoid the buffered write > deadlock problems efficiently (which prepare_write is unable to do). > > Documentation/filesystems/Locking | 9 + > Documentation/filesystems/vfs.txt | 38 ++++++ > drivers/block/loop.c | 69 ++++------- > fs/buffer.c | 144 ++++++++++++++++++++---- > fs/libfs.c | 40 ++++++ > fs/namei.c | 47 ++----- > fs/splice.c | 106 +---------------- > include/linux/buffer_head.h | 7 + > include/linux/fs.h | 29 ++++ > include/linux/pagemap.h | 2 > mm/filemap.c | 228 ++++++++++++++++++++++++++++++++++---- > 11 files changed, 494 insertions(+), 225 deletions(-) > > Index: linux-2.6/include/linux/fs.h > =================================================================== > --- linux-2.6.orig/include/linux/fs.h > +++ linux-2.6/include/linux/fs.h > @@ -449,6 +449,17 @@ struct address_space_operations { > */ > int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); > int (*commit_write)(struct file *, struct page *, unsigned, unsigned); > + > + int (*write_begin)(struct file *, struct address_space *mapping, > + loff_t pos, unsigned len, int intr, > + struct page **pagep, void **fsdata); > + int (*write_end)(struct file *, struct address_space *mapping, > + loff_t pos, unsigned len, unsigned copied, > + struct page *page, void *fsdata); > + > + int (*perform_write)(struct file *, struct address_space *, > + struct iov_iter *, loff_t); > + > /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ > sector_t (*bmap)(struct address_space *, sector_t); > void (*invalidatepage) (struct page *, unsigned long); > @@ -463,6 +474,18 @@ struct address_space_operations { > int (*launder_page) (struct page *); > }; > > +/* > + * pagecache_write_begin/pagecache_write_end must be used by general code > + * to write into the pagecache. 
> + */ > +int pagecache_write_begin(struct file *, struct address_space *mapping, > + loff_t pos, unsigned len, int intr, > + struct page **pagep, void **fsdata); > + > +int pagecache_write_end(struct file *, struct address_space *mapping, > + loff_t pos, unsigned len, unsigned copied, > + struct page *page, void *fsdata); > + > struct backing_dev_info; > struct address_space { > struct inode *host; /* owner: inode, block_device */ > @@ -1902,6 +1925,12 @@ extern int simple_prepare_write(struct f > unsigned offset, unsigned to); > extern int simple_commit_write(struct file *file, struct page *page, > unsigned offset, unsigned to); > +extern int simple_write_begin(struct file *file, struct address_space *mapping, > + loff_t pos, unsigned len, int intr, > + struct page **pagep, void **fsdata); > +extern int simple_write_end(struct file *file, struct address_space *mapping, > + loff_t pos, unsigned len, unsigned copied, > + struct page *page, void *fsdata); > > extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *); > extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); > Index: linux-2.6/mm/filemap.c > =================================================================== > --- linux-2.6.orig/mm/filemap.c > +++ linux-2.6/mm/filemap.c > @@ -2049,6 +2049,88 @@ inline int generic_write_checks(struct f > } > EXPORT_SYMBOL(generic_write_checks); > > +int pagecache_write_begin(struct file *file, struct address_space *mapping, > + loff_t pos, unsigned len, int intr, > + struct page **pagep, void **fsdata) > +{ > + const struct address_space_operations *aops = mapping->a_ops; > + > + if (aops->write_begin) > + return aops->write_begin(file, mapping, pos, len, intr, pagep, fsdata); > + else { > + int ret; > + pgoff_t index = pos >> PAGE_CACHE_SHIFT; > + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); > + struct inode *inode = mapping->host; > + struct page *page; > +again: > + page = __grab_cache_page(mapping, index); > + *pagep = page; > + if (!page) > + return -ENOMEM; > + > + if (intr && !PageUptodate(page)) { > + /* > + * There is no way to resolve a short write situation > + * for a !Uptodate page (except by double copying in > + * the caller done by generic_perform_write_2copy). > + * > + * Instead, we have to bring it uptodate here. 
*/ > + ret = aops->readpage(file, page); > + page_cache_release(page); > + if (ret) { > + if (ret == AOP_TRUNCATED_PAGE) > + goto again; > + return ret; > + } > + goto again; > + } > + > + ret = aops->prepare_write(file, page, offset, offset+len); > + if (ret) { > + if (ret != AOP_TRUNCATED_PAGE) > + unlock_page(page); > + page_cache_release(page); > + if (pos + len > inode->i_size) > + vmtruncate(inode, inode->i_size); > + if (ret == AOP_TRUNCATED_PAGE) > + goto again; > + } > + return ret; Note that if prepare_write succeeds, we return here with the page still locked and with a reference held; let's remember this as [1]. > + } > +} > +EXPORT_SYMBOL(pagecache_write_begin); > + > +int pagecache_write_end(struct file *file, struct address_space *mapping, > + loff_t pos, unsigned len, unsigned copied, > + struct page *page, void *fsdata) > +{ > + const struct address_space_operations *aops = mapping->a_ops; > + int ret; > + > + if (aops->write_begin) > + ret = aops->write_end(file, mapping, pos, len, copied, page, fsdata); > + else { > + int ret; > + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); > + struct inode *inode = mapping->host; > + > + flush_dcache_page(page); > + ret = aops->commit_write(file, page, offset, offset+len); > + if (ret < 0) { > + unlock_page(page); > + page_cache_release(page); > + if (pos + len > inode->i_size) > + vmtruncate(inode, inode->i_size); > + } else > + ret = copied; What about AOP_TRUNCATED_PAGE? Of course we can't just "goto retry" here :), but we could return it to the caller and let the caller handle it. > + } > + > + return copied; If ->commit_write returns a non-negative value, we return with the page still locked; see [1] above. Could the caller be expected to unlock it? I don't think so; it looks like this was simply forgotten. > +} > +EXPORT_SYMBOL(pagecache_write_end); > + > ssize_t > generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, > unsigned long *nr_segs, loff_t pos, loff_t *ppos, > @@ -2092,8 +2174,7 @@ EXPORT_SYMBOL(generic_file_direct_write) > * Find or create a page at the given pagecache position. Return the locked > * page. This function is specifically for buffered writes. > */ > -static struct page *__grab_cache_page(struct address_space *mapping, > - pgoff_t index) > +struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index) > { > int status; > struct page *page; > @@ -2115,19 +2196,14 @@ repeat: > return page; > } > > -ssize_t > -generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, > - unsigned long nr_segs, loff_t pos, loff_t *ppos, > - size_t count, ssize_t written) > +static ssize_t generic_perform_write_2copy(struct file *file, > + struct iov_iter *i, loff_t pos) > { > - struct file *file = iocb->ki_filp; > struct address_space *mapping = file->f_mapping; > const struct address_space_operations *a_ops = mapping->a_ops; > - struct inode *inode = mapping->host; > - long status = 0; > - struct iov_iter i; > - > - iov_iter_init(&i, iov, nr_segs, count, written); > + struct inode *inode = mapping->host; > + long status = 0; > + ssize_t written = 0; > > do { > struct page *src_page; > @@ -2140,7 +2216,7 @@ generic_file_buffered_write > offset = (pos & (PAGE_CACHE_SIZE - 1)); > index = pos >> PAGE_CACHE_SHIFT; > bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, > - iov_iter_count(&i)); > + iov_iter_count(i)); > > /* > * a non-NULL src_page indicates that we're doing the > @@ -2158,7 +2234,7 @@ generic_file_buffered_write > * to check that the address is actually valid, when atomic > * usercopies are used, below.
*/ > - if (unlikely(iov_iter_fault_in_readable(&i))) { > + if (unlikely(iov_iter_fault_in_readable(i))) { > status = -EFAULT; > break; > } > @@ -2189,7 +2265,7 @@ generic_file_buffered_write > * same reason as we can't take a page fault with a > * page locked (as explained below). > */ > - copied = iov_iter_copy_from_user(src_page, &i, > + copied = iov_iter_copy_from_user(src_page, i, > offset, bytes); > if (unlikely(copied == 0)) { > status = -EFAULT; > @@ -2228,7 +2304,7 @@ generic_file_buffered_write > * really matter. > */ > pagefault_disable(); > - copied = iov_iter_copy_from_user_atomic(page, &i, > + copied = iov_iter_copy_from_user_atomic(page, i, > offset, bytes); > pagefault_enable(); > } else { > @@ -2254,9 +2330,9 @@ generic_file_buffered_write > if (src_page) > page_cache_release(src_page); > > - iov_iter_advance(&i, copied); > - written += copied; > + iov_iter_advance(i, copied); > pos += copied; > + written += copied; > > balance_dirty_pages_ratelimited(mapping); > cond_resched(); > @@ -2280,13 +2356,119 @@ fs_write_aop_error: > continue; > else > break; > - } while (iov_iter_count(&i)); > - *ppos = pos; > + } while (iov_iter_count(i)); > + > + return written ? written : status; > +} > + > +static ssize_t generic_perform_write(struct file *file, > + struct iov_iter *i, loff_t pos) > +{ > + struct address_space *mapping = file->f_mapping; > + const struct address_space_operations *a_ops = mapping->a_ops; > + long status = 0; > + ssize_t written = 0; > + > + do { > + struct page *page; > + pgoff_t index; /* Pagecache index for current page */ > + unsigned long offset; /* Offset into pagecache page */ > + unsigned long bytes; /* Bytes to write to page */ > + size_t copied; /* Bytes copied from user */ > + void *fsdata; > + > + offset = (pos & (PAGE_CACHE_SIZE - 1)); > + index = pos >> PAGE_CACHE_SHIFT; > + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, > + iov_iter_count(i)); > + > +again: > + > + /* > + * Bring in the user page that we will copy from _first_. > + * Otherwise there's a nasty deadlock on copying from the > + * same page as we're writing to, without it being marked > + * up-to-date. > + * > + * Not only is this an optimisation, but it is also required > + * to check that the address is actually valid, when atomic > + * usercopies are used, below. > + */ > + if (unlikely(iov_iter_fault_in_readable(i))) { > + status = -EFAULT; > + break; > + } > + > + status = a_ops->write_begin(file, mapping, pos, bytes, 1, > + &page, &fsdata); > + if (unlikely(status)) > + break; > + > + pagefault_disable(); > + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); > + pagefault_enable(); > + flush_dcache_page(page); > + > + status = a_ops->write_end(file, mapping, pos, bytes, copied, > + page, fsdata); > + if (unlikely(status < 0)) > + break; > + copied = status; > + > + cond_resched(); > + > + if (unlikely(copied == 0)) { > + /* > + * If we were unable to copy any data at all, we must > + * fall back to a single segment length write. > + * > + * If we didn't fallback here, we could livelock > + * because not all segments in the iov can be copied at > + * once without a pagefault. > + */ > + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, > + iov_iter_single_seg_count(i)); > + goto again; > + } > + iov_iter_advance(i, copied); > + pos += copied; > + written += copied; WOW, the caller forgets to unlock the page here too; see [1] above.
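Unless I'm misreading the patch, this and my comments above all point at the legacy fallback in pagecache_write_end(). Here is a completely untested sketch of what I would expect that fallback to do instead (same locals as in the patch; the AOP_TRUNCATED_PAGE handling is my suggestion from above, not something the patch defines):

	flush_dcache_page(page);
	ret = aops->commit_write(file, page, offset, offset+len);
	if (ret == AOP_TRUNCATED_PAGE) {
		/*
		 * The aop already unlocked the page; drop our reference
		 * and pass AOP_TRUNCATED_PAGE up so the caller can retry,
		 * since we cannot "goto again" from here ourselves.
		 */
		page_cache_release(page);
		return ret;
	}
	/*
	 * Success or failure, the page from pagecache_write_begin() is
	 * still locked and referenced (see [1]), so always drop both.
	 */
	unlock_page(page);
	page_cache_release(page);
	if (ret < 0) {
		if (pos + len > inode->i_size)
			vmtruncate(inode, inode->i_size);
		return ret;
	}
	return copied;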
> + > + balance_dirty_pages_ratelimited(mapping); > + > + } while (iov_iter_count(i)); > + > + return written ? written : status; > +} > + > +ssize_t > +generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, > + unsigned long nr_segs, loff_t pos, loff_t *ppos, > + size_t count, ssize_t written) > +{ > + struct file *file = iocb->ki_filp; > + struct address_space *mapping = file->f_mapping; > + const struct address_space_operations *a_ops = mapping->a_ops; > + struct inode *inode = mapping->host; > + long status = 0; > + struct iov_iter i; > + > + iov_iter_init(&i, iov, nr_segs, count, written); > + if (a_ops->perform_write) > + status = a_ops->perform_write(file, mapping, &i, pos); > + else if (a_ops->write_begin) > + status = generic_perform_write(file, &i, pos); > + else > + status = generic_perform_write_2copy(file, &i, pos); > > - /* > - * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC > - */ > if (likely(status >= 0)) { > + written += status; > + *ppos = pos + status; > + > + /* > + * For now, when the user asks for O_SYNC, we'll actually give > + * O_DSYNC > + */ > if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { > if (!a_ops->writepage || !is_sync_kiocb(iocb)) > status = generic_osync_inode(inode, mapping, > Index: linux-2.6/fs/buffer.c > =================================================================== > --- linux-2.6.orig/fs/buffer.c > +++ linux-2.6/fs/buffer.c > @@ -1846,36 +1846,50 @@ static int __block_prepare_write(struct > } while ((bh = bh->b_this_page) != head); > return 0; > } > + > /* Error case: */ > - /* > - * Zero out any newly allocated blocks to avoid exposing stale > - * data. If BH_New is set, we know that the block was newly > - * allocated in the above loop. > - */ > - bh = head; > + page_zero_new_buffers(page, from, to); > + return err; > +} > + > +void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) > +{ > + unsigned int block_start, block_end; > + struct buffer_head *head, *bh; > + > + BUG_ON(!PageLocked(page)); > + if (!page_has_buffers(page)) > + return; > + > + bh = head = page_buffers(page); > block_start = 0; > do { > - block_end = block_start+blocksize; > - if (block_end <= from) > - goto next_bh; > - if (block_start >= to) > - break; > + block_end = block_start + bh->b_size; > + > if (buffer_new(bh)) { > - void *kaddr; > + if (block_end > from && block_start < to) { > + if (!PageUptodate(page)) { > + unsigned start, end; > + void *kaddr; > + > + start = max(from, block_start); > + end = min(to, block_end); > + > + kaddr = kmap_atomic(page, KM_USER0); > + memset(kaddr+start, 0, block_end-end); <<<< OOPS: why is the new block no longer fully zeroed, as it was before? At the least this results in an information leak when (start == from). Imagine an fs with blocksize == 1k containing a file with i_size == 4096 whose first two blocks are not mapped (a hole), and a write from 1023 to 2048. Say we succeed in allocating the first block but fail while allocating the second; we then call page_zero_new_buffers(... from == 1023, to == 2048), which zeroes only the last byte of the first block yet marks the buffer uptodate. After that, a simple read(from == 0, to == 1023) steals the old block content.
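The memset itself also looks like a plain thinko: it starts at kaddr+start but computes the length as block_end-end, so the minimal fix would presumably be memset(kaddr + start, 0, end - start). But given the hole scenario above, unless __block_prepare_write() is guaranteed to have already zeroed the parts of a new block outside [from, to), the safe thing is what the old error path did, i.e. clear the whole block. Untested sketch:

	/*
	 * Untested: zero the entire newly allocated block, as the old
	 * error path did, so stale disk contents cannot become visible
	 * once the buffer is marked uptodate below.
	 */
	kaddr = kmap_atomic(page, KM_USER0);
	memset(kaddr + block_start, 0, bh->b_size);
	flush_dcache_page(page);
	kunmap_atomic(kaddr, KM_USER0);
	set_buffer_uptodate(bh);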
> + flush_dcache_page(page); > + kunmap_atomic(kaddr, KM_USER0); > + set_buffer_uptodate(bh); > + } > > - clear_buffer_new(bh); > - kaddr = kmap_atomic(page, KM_USER0); > - memset(kaddr+block_start, 0, bh->b_size); > - flush_dcache_page(page); > - kunmap_atomic(kaddr, KM_USER0); > - set_buffer_uptodate(bh); > - mark_buffer_dirty(bh); > + clear_buffer_new(bh); > + mark_buffer_dirty(bh); > + } > } > -next_bh: > + > block_start = block_end; > bh = bh->b_this_page; > } while (bh != head); > - return err; > } > > static int __block_commit_write(struct inode *inode, struct page *page, > @@ -1899,6 +1913,7 @@ static int __block_commit_write(struct i > set_buffer_uptodate(bh); > mark_buffer_dirty(bh); > } > + clear_buffer_new(bh); > } > > /* > @@ -1912,6 +1927,93 @@ static int __block_commit_write(struct i > return 0; > } > > +int block_write_begin(struct file *file, struct address_space *mapping, > + loff_t pos, unsigned len, int intr, > + struct page **pagep, void **fsdata, > + get_block_t *get_block) > +{ > + struct inode *inode = mapping->host; > + int status = 0; > + struct page *page; > + pgoff_t index; > + unsigned start, end; > + > + index = pos >> PAGE_CACHE_SHIFT; > + start = pos & (PAGE_CACHE_SIZE - 1); > + end = start + len; > + > + page = *pagep; > + if (page == NULL) { > + page = __grab_cache_page(mapping, index); > + if (!page) { > + status = -ENOMEM; > + goto out; > + } > + *pagep = page; > + } > + > + status = __block_prepare_write(inode, page, start, end, get_block); > + if (unlikely(status)) { > + ClearPageUptodate(page); > + > + unlock_page(page); > + page_cache_release(page); > + > + /* > + * prepare_write() may have instantiated a few blocks > + * outside i_size. Trim these off again. Don't need > + * i_size_read because we hold i_mutex. > + */ > + if (pos + len > inode->i_size) > + vmtruncate(inode, inode->i_size); > + goto out; > + } > + > +out: > + return status; > +} > + > +int block_write_end(struct file *file, struct address_space *mapping, > + loff_t pos, unsigned len, unsigned copied, > + struct page *page, void *fsdata) > +{ > + struct inode *inode = mapping->host; > + unsigned start; > + > + start = pos & (PAGE_CACHE_SIZE - 1); > + > + if (unlikely(copied < len)) > + page_zero_new_buffers(page, start+copied, start+len); > + flush_dcache_page(page); > + > + /* This could be a short (even 0-length) commit */ > + __block_commit_write(inode, page, start, start+copied); > + > + /* > + * The buffers that were written will now be uptodate, so we don't > + * have to worry about a readpage reading them again. > + * XXX: however I'm not sure about partial writes to buffers. Must > + * recheck that, and hopefully we can remove the below test. > + */ > + if (!PageUptodate(page)) > + copied = 0; > + > + unlock_page(page); > + mark_page_accessed(page); /* XXX: put this in caller? */ > + page_cache_release(page); > + > + /* > + * No need to use i_size_read() here, the i_size > + * cannot change under us because we hold i_mutex. > + */ > + if (pos+copied > inode->i_size) { > + i_size_write(inode, pos+copied); > + mark_inode_dirty(inode); > + } > + > + return copied; > +} > + > /* > * Generic "read page" function for block devices that have the normal > * get_block functionality. This is most of the block device filesystems. 
> Index: linux-2.6/include/linux/buffer_head.h > =================================================================== > --- linux-2.6.orig/include/linux/buffer_head.h > +++ linux-2.6/include/linux/buffer_head.h > @@ -202,6 +202,13 @@ void block_invalidatepage(struct page *p > int block_write_full_page(struct page *page, get_block_t *get_block, > struct writeback_control *wbc); > int block_read_full_page(struct page*, get_block_t*); > +int block_write_begin(struct file *, struct address_space *, > + loff_t, unsigned, int, > + struct page **, void **, get_block_t*); > +int block_write_end(struct file *, struct address_space *, > + loff_t, unsigned, unsigned, > + struct page *, void *); > +void page_zero_new_buffers(struct page *page, unsigned from, unsigned to); > int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); > int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*, > loff_t *); > Index: linux-2.6/include/linux/pagemap.h > =================================================================== > --- linux-2.6.orig/include/linux/pagemap.h > +++ linux-2.6/include/linux/pagemap.h > @@ -85,6 +85,8 @@ unsigned find_get_pages_contig(struct ad > unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, > int tag, unsigned int nr_pages, struct page **pages); > > +struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index); > + > /* > * Returns locked page at given index in given cache, creating it if needed. > */ > Index: linux-2.6/fs/libfs.c > =================================================================== > --- linux-2.6.orig/fs/libfs.c > +++ linux-2.6/fs/libfs.c > @@ -342,6 +342,24 @@ int simple_prepare_write(struct file *fi > return 0; > } > > +int simple_write_begin(struct file *file, struct address_space *mapping, > + loff_t pos, unsigned len, int intr, > + struct page **pagep, void **fsdata) > +{ > + struct page *page; > + pgoff_t index; > + unsigned from; > + > + index = pos >> PAGE_CACHE_SHIFT; > + from = pos & (PAGE_CACHE_SIZE - 1); > + > + page = __grab_cache_page(mapping, index); > + if (!page) > + return -ENOMEM; > + > + return simple_prepare_write(file, page, from, from+len); > +} > + > int simple_commit_write(struct file *file, struct page *page, > unsigned from, unsigned to) > { > @@ -360,6 +378,28 @@ int simple_commit_write(struct file *fil > return 0; > } > > +int simple_write_end(struct file *file, struct address_space *mapping, > + loff_t pos, unsigned len, unsigned copied, > + struct page *page, void *fsdata) > +{ > + unsigned from = pos & (PAGE_CACHE_SIZE - 1); > + > + /* zero the stale part of the page if we did a short copy */ > + if (copied < len) { > + void *kaddr = kmap_atomic(page, KM_USER0); > + memset(kaddr + from + copied, 0, len - copied); > + flush_dcache_page(page); > + kunmap_atomic(kaddr, KM_USER0); > + } > + > + simple_commit_write(file, page, from, from+copied); > + > + unlock_page(page); > + page_cache_release(page); > + > + return copied; > +} > + > int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files) > { > struct inode *inode; > Index: linux-2.6/drivers/block/loop.c > =================================================================== > --- linux-2.6.orig/drivers/block/loop.c > +++ linux-2.6/drivers/block/loop.c > @@ -206,11 +206,10 @@ lo_do_transfer(struct loop_device *lo, i > * space operations prepare_write and commit_write. 
*/ > static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec, > - int bsize, loff_t pos, struct page *page) > + int bsize, loff_t pos, struct page *unused) > { > struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */ > struct address_space *mapping = file->f_mapping; > - const struct address_space_operations *aops = mapping->a_ops; > pgoff_t index; > unsigned offset, bv_offs; > int len, ret; > @@ -222,67 +221,47 @@ static int do_lo_send_aops(struct loop_d > len = bvec->bv_len; > while (len > 0) { > sector_t IV; > - unsigned size; > + unsigned size, copied; > int transfer_result; > + struct page *page; > + void *fsdata; > > IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9); > size = PAGE_CACHE_SIZE - offset; > if (size > len) > size = len; > - page = grab_cache_page(mapping, index); > - if (unlikely(!page)) > + > + ret = pagecache_write_begin(file, mapping, pos, size, 1, > + &page, &fsdata); > + if (ret) > goto fail; > - ret = aops->prepare_write(file, page, offset, > - offset + size); > - if (unlikely(ret)) { > - if (ret == AOP_TRUNCATED_PAGE) { > - page_cache_release(page); > - continue; > - } > - goto unlock; > - } > + > transfer_result = lo_do_transfer(lo, WRITE, page, offset, > bvec->bv_page, bv_offs, size, IV); > - if (unlikely(transfer_result)) { > - char *kaddr; > + copied = size; > + if (unlikely(transfer_result)) > + copied = 0; > + > + ret = pagecache_write_end(file, mapping, pos, size, copied, > + page, fsdata); > + if (ret < 0) > + goto fail; > + if (ret < copied) > + copied = ret; > > - /* > - * The transfer failed, but we still write the data to > - * keep prepare/commit calls balanced. > - */ > - printk(KERN_ERR "loop: transfer error block %llu\n", > - (unsigned long long)index); > - kaddr = kmap_atomic(page, KM_USER0); > - memset(kaddr + offset, 0, size); > - kunmap_atomic(kaddr, KM_USER0); > - } > - flush_dcache_page(page); > - ret = aops->commit_write(file, page, offset, > - offset + size); > - if (unlikely(ret)) { > - if (ret == AOP_TRUNCATED_PAGE) { > - page_cache_release(page); > - continue; > - } > - goto unlock; <<<<<< BTW: the do_lo_send_aops() code itself is really ugly; for example, partial writes are not supported.
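And speaking of partial writes: now that pagecache_write_end() may return a short count, the hand-maintained offset/index counters in this loop look fragile to me. After a short write, pos advances by copied, yet offset is reset to 0 and index is incremented as if the whole chunk had gone through. Unless I'm misreading it, recomputing both from pos on each iteration would avoid that; untested sketch:

	while (len > 0) {
		/*
		 * Untested: derive index/offset from pos every iteration,
		 * so a short write (pos advanced by copied < size) cannot
		 * get the IV and size computations out of sync.
		 */
		index = pos >> PAGE_CACHE_SHIFT;
		offset = pos & (PAGE_CACHE_SIZE - 1);
		IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9)) + (offset >> 9);
		size = PAGE_CACHE_SIZE - offset;
		if (size > len)
			size = len;
		/* ... rest of the loop body as in the patch ... */
	}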
> - } > if (unlikely(transfer_result)) > - goto unlock; > - bv_offs += size; > - len -= size; > + goto fail; > + > + bv_offs += copied; > + len -= copied; > offset = 0; > index++; > - pos += size; > - unlock_page(page); > - page_cache_release(page); > + pos += copied; > } > ret = 0; > out: > mutex_unlock(&mapping->host->i_mutex); > return ret; > -unlock: > - unlock_page(page); > - page_cache_release(page); > fail: > ret = -1; > goto out; > Index: linux-2.6/fs/namei.c > =================================================================== > --- linux-2.6.orig/fs/namei.c > +++ linux-2.6/fs/namei.c > @@ -2688,53 +2688,30 @@ int __page_symlink(struct inode *inode, > { > struct address_space *mapping = inode->i_mapping; > struct page *page; > + void *fsdata; > int err; > char *kaddr; > > retry: > - err = -ENOMEM; > - page = find_or_create_page(mapping, 0, gfp_mask); > - if (!page) > - goto fail; > - err = mapping->a_ops->prepare_write(NULL, page, 0, len-1); > - if (err == AOP_TRUNCATED_PAGE) { > - page_cache_release(page); > - goto retry; > - } > + err = pagecache_write_begin(NULL, mapping, 0, PAGE_CACHE_SIZE, 0, > + &page, &fsdata); > if (err) > - goto fail_map; > + goto fail; > + > kaddr = kmap_atomic(page, KM_USER0); > memcpy(kaddr, symname, len-1); > + memset(kaddr+len-1, 0, PAGE_CACHE_SIZE-(len-1)); > kunmap_atomic(kaddr, KM_USER0); > - err = mapping->a_ops->commit_write(NULL, page, 0, len-1); > - if (err == AOP_TRUNCATED_PAGE) { > - page_cache_release(page); > - goto retry; > - } > - if (err) > - goto fail_map; > - /* > - * Notice that we are _not_ going to block here - end of page is > - * unmapped, so this will only try to map the rest of page, see > - * that it is unmapped (typically even will not look into inode - > - * ->i_size will be enough for everything) and zero it out. > - * OTOH it's obviously correct and should make the page up-to-date. > - */ > - if (!PageUptodate(page)) { > - err = mapping->a_ops->readpage(NULL, page); > - if (err != AOP_TRUNCATED_PAGE) > - wait_on_page_locked(page); > - } else { > - unlock_page(page); > - } > - page_cache_release(page); > + > + err = pagecache_write_end(NULL, mapping, 0, PAGE_CACHE_SIZE, PAGE_CACHE_SIZE, > + page, fsdata); > if (err < 0) > goto fail; > + if (err < PAGE_CACHE_SIZE) > + goto retry; > + > mark_inode_dirty(inode); > return 0; > -fail_map: > - unlock_page(page); > - page_cache_release(page); > fail: > return err; > } > Index: linux-2.6/fs/splice.c > =================================================================== > --- linux-2.6.orig/fs/splice.c > +++ linux-2.6/fs/splice.c > @@ -559,7 +559,7 @@ static int pipe_to_file(struct pipe_inod > struct address_space *mapping = file->f_mapping; > unsigned int offset, this_len; > struct page *page; > - pgoff_t index; > + void *fsdata; > int ret; > > /* > @@ -569,13 +569,13 @@ static int pipe_to_file(struct pipe_inod > if (unlikely(ret)) > return ret; > > - index = sd->pos >> PAGE_CACHE_SHIFT; > offset = sd->pos & ~PAGE_CACHE_MASK; > > this_len = sd->len; > if (this_len + offset > PAGE_CACHE_SIZE) > this_len = PAGE_CACHE_SIZE - offset; > > +#if 0 > /* > * Reuse buf page, if SPLICE_F_MOVE is set and we are doing a full > * page. > @@ -587,86 +587,11 @@ static int pipe_to_file(struct pipe_inod > * locked on successful return. 
> */ > if (buf->ops->steal(pipe, buf)) > - goto find_page; > +#endif > > - page = buf->page; > - if (add_to_page_cache(page, mapping, index, GFP_KERNEL)) { > - unlock_page(page); > - goto find_page; > - } > - > - page_cache_get(page); > - > - if (!(buf->flags & PIPE_BUF_FLAG_LRU)) > - lru_cache_add(page); > - } else { > -find_page: > - page = find_lock_page(mapping, index); > - if (!page) { > - ret = -ENOMEM; > - page = page_cache_alloc_cold(mapping); > - if (unlikely(!page)) > - goto out_ret; > - > - /* > - * This will also lock the page > - */ > - ret = add_to_page_cache_lru(page, mapping, index, > - GFP_KERNEL); > - if (unlikely(ret)) > - goto out; > - } > - > - /* > - * We get here with the page locked. If the page is also > - * uptodate, we don't need to do more. If it isn't, we > - * may need to bring it in if we are not going to overwrite > - * the full page. > - */ > - if (!PageUptodate(page)) { > - if (this_len < PAGE_CACHE_SIZE) { > - ret = mapping->a_ops->readpage(file, page); > - if (unlikely(ret)) > - goto out; > - > - lock_page(page); > - > - if (!PageUptodate(page)) { > - /* > - * Page got invalidated, repeat. > - */ > - if (!page->mapping) { > - unlock_page(page); > - page_cache_release(page); > - goto find_page; > - } > - ret = -EIO; > - goto out; > - } > - } else > - SetPageUptodate(page); > - } > - } > - > - ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len); > - if (unlikely(ret)) { > - loff_t isize = i_size_read(mapping->host); > - > - if (ret != AOP_TRUNCATED_PAGE) > - unlock_page(page); > - page_cache_release(page); > - if (ret == AOP_TRUNCATED_PAGE) > - goto find_page; > - > - /* > - * prepare_write() may have instantiated a few blocks > - * outside i_size. Trim these off again. > - */ > - if (sd->pos + this_len > isize) > - vmtruncate(mapping->host, isize); > - > - goto out_ret; > - } > + ret = pagecache_write_begin(file, mapping, sd->pos, sd->len, 0, &page, &fsdata); > + if (unlikely(ret)) > + goto out; > > if (buf->page != page) { > /* > @@ -676,28 +601,13 @@ find_page: > char *dst = kmap_atomic(page, KM_USER1); > > memcpy(dst + offset, src + buf->offset, this_len); > - flush_dcache_page(page); > kunmap_atomic(dst, KM_USER1); > buf->ops->unmap(pipe, buf, src); > } > > - ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); > - if (!ret) { > - /* > - * Return the number of bytes written and mark page as > - * accessed, we are now done! 
- */ > - ret = this_len; > - mark_page_accessed(page); > - balance_dirty_pages_ratelimited(mapping); > - } else if (ret == AOP_TRUNCATED_PAGE) { > - page_cache_release(page); > - goto find_page; > - } > + ret = pagecache_write_end(file, mapping, sd->pos, sd->len, sd->len, page, fsdata); > + > out: > - page_cache_release(page); > - unlock_page(page); > -out_ret: > return ret; > } > > Index: linux-2.6/Documentation/filesystems/Locking > =================================================================== > --- linux-2.6.orig/Documentation/filesystems/Locking > +++ linux-2.6/Documentation/filesystems/Locking > @@ -176,15 +176,18 @@ prototypes: > locking rules: > All except set_page_dirty may block > > - BKL PageLocked(page) > + BKL PageLocked(page) i_sem > writepage: no yes, unlocks (see below) > readpage: no yes, unlocks > sync_page: no maybe > writepages: no > set_page_dirty no no > readpages: no > -prepare_write: no yes > -commit_write: no yes > +prepare_write: no yes yes > +commit_write: no yes yes > +write_begin: no locks the page yes > +write_end: no yes, unlocks yes Ohhh, and now I'm totally sure: the unlocking was simply forgotten in pagecache_write_end(). > +perform_write: no n/a yes > bmap: yes > invalidatepage: no yes > releasepage: no yes > Index: linux-2.6/Documentation/filesystems/vfs.txt > =================================================================== > --- linux-2.6.orig/Documentation/filesystems/vfs.txt > +++ linux-2.6/Documentation/filesystems/vfs.txt > @@ -534,6 +534,14 @@ struct address_space_operations { > struct list_head *pages, unsigned nr_pages); > int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); > int (*commit_write)(struct file *, struct page *, unsigned, unsigned); > + int (*write_begin)(struct file *, struct address_space *mapping, > + loff_t pos, unsigned len, int intr, > + struct page **pagep, void **fsdata); > + int (*write_end)(struct file *, struct address_space *mapping, > + loff_t pos, unsigned len, unsigned copied, > + struct page *page, void *fsdata); > + int (*perform_write)(struct file *, struct address_space *, > + struct iov_iter *, loff_t); > sector_t (*bmap)(struct address_space *, sector_t); > int (*invalidatepage) (struct page *, unsigned long); > int (*releasepage) (struct page *, int); > @@ -629,6 +637,36 @@ struct address_space_operations { > operations. It should avoid returning an error if possible - > errors should have been handled by prepare_write. > > + write_begin: This is intended as a replacement for prepare_write. Called > + by the generic buffered write code to ask the filesystem to prepare > + to write len bytes at the given offset in the file. intr is a boolean > + that specifies whether or not the filesystem must be prepared to > + handle a short write (ie. write_end called with less than len bytes > + written). > + > + The filesystem must return the locked pagecache page for the caller > + to write into. > + > + A void * may be returned in fsdata, which then gets passed into > + write_end. > + > + Returns < 0 on failure, in which case all cleanup must be done and > + write_end not called. 0 on success, in which case write_end must > + be called. > + > + write_end: After a successful write_begin, and data copy, write_end must > + be called. len is the original len passed to write_begin, and copied > + is the amount that was able to be copied (they must be equal if > + write_begin was called with intr == 0).
> + > + The filesystem must take care of unlocking the page and dropping its > + refcount, and updating i_size. > + > + Returns < 0 on failure, otherwise the number of bytes (<= 'copied') > + that were able to be copied into the file. > + > + perform_write: > + > bmap: called by the VFS to map a logical block offset within object to > physical block number. This method is used by the FIBMAP > ioctl and for working with swap-files. To be able to swap to
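Finally, as a sanity check on the documented contract, here is what I understand a minimal caller of the new API to look like. This is a hypothetical, untested helper of my own (the name and the single-page restriction are mine, not part of the patch), modelled on the new __page_symlink():

	/*
	 * Hypothetical example: write a kernel buffer that does not cross
	 * a page boundary into the pagecache at 'pos'. With intr == 0 a
	 * short copy is not allowed, so write_end is passed copied == len.
	 */
	static int write_kernel_buf(struct file *file,
			struct address_space *mapping, loff_t pos,
			const char *buf, unsigned len)
	{
		struct page *page;
		void *fsdata;
		char *kaddr;
		int err;

		err = pagecache_write_begin(file, mapping, pos, len, 0,
						&page, &fsdata);
		if (err)
			return err;

		kaddr = kmap_atomic(page, KM_USER0);
		memcpy(kaddr + (pos & (PAGE_CACHE_SIZE - 1)), buf, len);
		kunmap_atomic(kaddr, KM_USER0);
		flush_dcache_page(page);

		/* write_end unlocks the page and drops the reference */
		err = pagecache_write_end(file, mapping, pos, len, len,
						page, fsdata);
		return err < 0 ? err : 0;
	}

...which of course only behaves as the comment says once the fallback path in pagecache_write_end() actually does the unlocking discussed above.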