Nick Piggin <npiggin@xxxxxxx> writes: > Convert ext2 to use ->perform_write. This uses the main loop out of > generic_perform_write, but when encountering a short usercopy, it > zeroes out new uninitialised blocks, and passes in a short-length commit > to __block_commit_write, which does the right thing (in terms of not > setting things uptodate). > > fs/buffer.c | 143 ++++++++++++++++++++++++++++++++++++++++++++ > fs/ext2/inode.c | 7 ++ > include/linux/buffer_head.h | 1 > include/linux/pagemap.h | 2 > 4 files changed, 153 insertions(+) > > Index: linux-2.6/fs/buffer.c > =================================================================== > --- linux-2.6.orig/fs/buffer.c > +++ linux-2.6/fs/buffer.c > @@ -1866,6 +1866,50 @@ next_bh: > return err; > } > > +void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) > +{ > + unsigned int block_start, block_end; > + struct buffer_head *head, *bh; > + > + BUG_ON(!PageLocked(page)); > + if (!page_has_buffers(page)) > + return; > + > + bh = head = page_buffers(page); > + block_start = 0; > + do { > + block_end = block_start + bh->b_size; > + > + if (buffer_new(bh)) { > + if (block_end > from && block_start < to) { > + if (!PageUptodate(page)) { > + unsigned start, end; > + void *kaddr; > + > + start = max(from, block_start); > + end = min(to, block_end); > + > + kaddr = kmap_atomic(page, KM_USER0); > + memset(kaddr+start, 0, block_end-end); > + flush_dcache_page(page); > + kunmap_atomic(kaddr, KM_USER0); > + set_buffer_uptodate(bh); > + } > + > + /* > + * XXX: make buffer_new behaviour more > + * consistent. 
> + * clear_buffer_new(bh); > + */ > + mark_buffer_dirty(bh); > + } > + } > + > + block_start = block_end; > + bh = bh->b_this_page; > + } while (bh != head); > +} > + > static int __block_commit_write(struct inode *inode, struct page *page, > unsigned from, unsigned to) > { > @@ -1900,6 +1944,105 @@ static int __block_commit_write(struct i > return 0; > } > > +ssize_t block_perform_write(struct file *file, struct iovec_iterator *i, > + loff_t pos, get_block_t *get_block) > +{ > + struct address_space *mapping = file->f_mapping; > + struct inode *inode = mapping->host; > + long status = 0; > + ssize_t written = 0; > + > + do { > + struct page *page; > + pgoff_t index; /* Pagecache index for current page */ > + unsigned long offset; /* Offset into pagecache page */ > + unsigned long bytes; /* Bytes to write to page */ > + size_t copied; /* Bytes copied from user */ > + > + offset = (pos & (PAGE_CACHE_SIZE - 1)); > + index = pos >> PAGE_CACHE_SHIFT; > + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, > + iovec_iterator_count(i)); > + > + /* > + * Bring in the user page that we will copy from _first_. > + * Otherwise there's a nasty deadlock on copying from the > + * same page as we're writing to, without it being marked > + * up-to-date. > + * > + * Not only is this an optimisation, but it is also required > + * to check that the address is actually valid, when atomic > + * usercopies are used, below. > + */ > + if (unlikely(iovec_iterator_fault_in_readable(i))) { > + status = -EFAULT; > + break; > + } > + > + page = __grab_cache_page(mapping, index); > + if (!page) { > + status = -ENOMEM; > + break; > + } > + > + status = __block_prepare_write(inode, page, offset, > + offset+bytes, get_block); > + if (unlikely(status)) { > + ClearPageUptodate(page); > + > + page_cache_release(page); > + > + /* > + * prepare_write() may have instantiated a few blocks > + * outside i_size. Trim these off again. Don't need > + * i_size_read because we hold i_mutex. 
> + */ > + if (pos + bytes > inode->i_size) > + vmtruncate(inode, inode->i_size); > + break; > + } > + > + /* > + * Must not enter the pagefault handler here, because > + * we hold the page lock. See mm/filemap.c for more > + * details. > + */ > + pagefault_disable(); > + copied = iovec_iterator_copy_from_user_atomic(page, i, > + offset, bytes); > + pagefault_enable(); > + if (unlikely(copied < bytes)) > + page_zero_new_buffers(page, offset+copied, offset+bytes); > + flush_dcache_page(page); > + <<<<<<<<<<< Here the fs can do some fs-specific stuff without making internal state visible. Cool. > + /* This could be a short (even 0-length) commit */ > + __block_commit_write(inode, page, offset, offset+copied); > + > + unlock_page(page); > + mark_page_accessed(page); > + page_cache_release(page); > + > + iovec_iterator_advance(i, copied); > + pos += copied; > + written += copied; > + > + balance_dirty_pages_ratelimited(mapping); > + cond_resched(); > + > + } while (iovec_iterator_count(i)); > + <<<<<<<<<<< If I've understood correctly, the following scenario is possible: iteration 1: ->iovec_iterator_fault_in_readable(...) = 0 iteration 1: __block_prepare_write = {blocks allocated} iteration 1: iovec_iterator_copy_from_user_atomic(...) = 0 iteration 1: while(iovec_iterator_count(i)) == goto next loop iteration 2: ->iovec_iterator_fault_in_readable(...) = -EFAULT Then the loop breaks. At this point prepare_write() may have instantiated a few blocks outside i_size on iteration 1, so we have to trim these off again. > + /* > + * No need to use i_size_read() here, the i_size > + * cannot change under us because we hold i_mutex. > + */ > + if (pos > inode->i_size) { > + i_size_write(inode, pos); > + mark_inode_dirty(inode); > + } > + > + return written ? written : status; > +} > + > /* > * Generic "read page" function for block devices that have the normal > * get_block functionality. This is most of the block device filesystems. 
> Index: linux-2.6/fs/ext2/inode.c > =================================================================== > --- linux-2.6.orig/fs/ext2/inode.c > +++ linux-2.6/fs/ext2/inode.c > @@ -642,6 +642,12 @@ ext2_readpages(struct file *file, struct > return mpage_readpages(mapping, pages, nr_pages, ext2_get_block); > } > > +static ssize_t > +ext2_perform_write(struct file *file, struct iovec_iterator *i, loff_t pos) > +{ > + return block_perform_write(file, i, pos, ext2_get_block); > +} > + > static int > ext2_prepare_write(struct file *file, struct page *page, > unsigned from, unsigned to) > @@ -689,6 +695,7 @@ const struct address_space_operations ex > .readpages = ext2_readpages, > .writepage = ext2_writepage, > .sync_page = block_sync_page, > + .perform_write = ext2_perform_write, > .prepare_write = ext2_prepare_write, > .commit_write = generic_commit_write, > .bmap = ext2_bmap, > Index: linux-2.6/include/linux/buffer_head.h > =================================================================== > --- linux-2.6.orig/include/linux/buffer_head.h > +++ linux-2.6/include/linux/buffer_head.h > @@ -198,6 +198,7 @@ void block_invalidatepage(struct page *p > int block_write_full_page(struct page *page, get_block_t *get_block, > struct writeback_control *wbc); > int block_read_full_page(struct page*, get_block_t*); > +ssize_t block_perform_write(struct file *, struct iovec_iterator*, loff_t, get_block_t*); > int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); > int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*, > loff_t *); > Index: linux-2.6/include/linux/pagemap.h > =================================================================== > --- linux-2.6.orig/include/linux/pagemap.h > +++ linux-2.6/include/linux/pagemap.h > @@ -87,6 +87,8 @@ unsigned find_get_pages_contig(struct ad > unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, > int tag, unsigned int nr_pages, struct page **pages); > > +struct page 
*__grab_cache_page(struct address_space *mapping, pgoff_t index); > + > /* > * Returns locked page at given index in given cache, creating it if needed. > */ > - > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html