Hi Nick, On Wed, Mar 14, 2007 at 02:38:22PM +0100, Nick Piggin wrote: > Introduce write_begin, write_end, and perform_write aops. > > These are intended to replace prepare_write and commit_write with more > flexible alternatives that are also able to avoid the buffered write > deadlock problems efficiently (which prepare_write is unable to do). > Index: linux-2.6/include/linux/fs.h > =================================================================== > --- linux-2.6.orig/include/linux/fs.h > +++ linux-2.6/include/linux/fs.h > @@ -449,6 +449,17 @@ struct address_space_operations { > */ > int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); > int (*commit_write)(struct file *, struct page *, unsigned, unsigned); > + > + int (*write_begin)(struct file *, struct address_space *mapping, > + loff_t pos, unsigned len, int intr, > + struct page **pagep, void **fsdata); > + int (*write_end)(struct file *, struct address_space *mapping, > + loff_t pos, unsigned len, unsigned copied, > + struct page *page, void *fsdata); Are we going to get rid of the file and intr arguments btw? I'm not sure intr is useful, and mapping is probably enough to get whatever we inside ->write_begin / ->write_end. Also, I noticed that you didn't export block_write_begin(), simple_write_begin(), block_write_end() and simple_write_end() - I think we want those for client modules. Attached is a quick patch to hook up the existing ocfs2 write code. This has been compile tested only for now - one of my test machines isn't cooperating, so a runtime test will have to wait until tommorrow. One interesting side effect is that we no longer pass AOP_TRUNCATE_PAGE up a level. This gives callers less to deal with. And it means that ocfs2 doesn't have to use the ocfs2_*_lock_with_page() cluster lock variants in ocfs2_block_write_begin() because it can order cluster locks outside of the page lock there. My ocfs2 write rework will be a more serious user of these stuff, including the fsdata backpointer. That code will also eliminate the entire ocfs2_*_lock_with_page() cluster locking workarounds for write (they'll have to remain for ->readpage()). I'm beginning work on cleaning those ocfs2 patches up and getting them plugged into this stuff. I hope to post them in the next day or two. --Mark -- Mark Fasheh Senior Software Developer, Oracle mark.fasheh@xxxxxxxxxx ocfs2: Convert to new aops Turn ocfs2_prepare_write() and ocfs2_commit_write() into ocfs2_write_begin() and ocfs2_write_end(). Signed-off-by: Mark Fasheh <mark.fasheh@xxxxxxxxxx> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 93628b0..e7bcbbd 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -293,29 +293,30 @@ int ocfs2_prepare_write_nolock(struct in } /* - * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called - * from loopback. It must be able to perform its own locking around - * ocfs2_get_block(). + * ocfs2_write_begin() can be an outer-most ocfs2 call when it is + * called from elsewhere in the kernel. It must be able to perform its + * own locking around ocfs2_get_block(). */ -static int ocfs2_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int ocfs2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, int intr, + struct page **pagep, void **fsdata) { - struct inode *inode = page->mapping->host; + struct inode *inode = mapping->host; int ret; - mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); - - ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page); + ret = ocfs2_meta_lock(inode, NULL, 0); if (ret != 0) { mlog_errno(ret); goto out; } - ret = ocfs2_prepare_write_nolock(inode, page, from, to); + down_read(&OCFS2_I(inode)->ip_alloc_sem); + ret = block_write_begin(file, mapping, pos, len, intr, pagep, fsdata, + ocfs2_get_block); + up_read(&OCFS2_I(inode)->ip_alloc_sem); ocfs2_meta_unlock(inode, 0); out: - mlog_exit(ret); return ret; } @@ -388,16 +389,21 @@ out: return handle; } -static int ocfs2_commit_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int ocfs2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { int ret; + unsigned from, to; struct buffer_head *di_bh = NULL; struct inode *inode = page->mapping->host; handle_t *handle = NULL; struct ocfs2_dinode *di; - mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); + mlog_entry("(0x%p, 0x%p)\n", file, page); + + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; /* NOTE: ocfs2_file_aio_write has ensured that it's safe for * us to continue here without rechecking the I/O against @@ -441,8 +447,10 @@ static int ocfs2_commit_write(struct fil } /* might update i_size */ - ret = generic_commit_write(file, page, from, to); - if (ret < 0) { + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + if (copied < 0) { + ret = copied; + copied = 0; mlog_errno(ret); goto out_commit; } @@ -458,11 +466,10 @@ static int ocfs2_commit_write(struct fil di->i_size = cpu_to_le64((u64)i_size_read(inode)); ret = ocfs2_journal_dirty(handle, di_bh); - if (ret < 0) { + if (ret < 0) mlog_errno(ret); - goto out_commit; - } + ret = 0; out_commit: ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out_unlock_data: @@ -470,11 +477,29 @@ out_unlock_data: out_unlock_meta: ocfs2_meta_unlock(inode, 1); out: + if (ret && ret != AOP_TRUNCATED_PAGE) { + /* + * We caught an error before block_write_end() - + * unlock and free the page. + */ + unlock_page(page); + page_cache_release(page); + } else if (ret == AOP_TRUNCATED_PAGE) { + /* + * The cluster locking code avoided a deadlock by + * unlocking the page - simply release here and turn + * the error into something negative for the caller to + * key on. + */ + page_cache_release(page); + ret = -EAGAIN; + } + if (di_bh) brelse(di_bh); mlog_exit(ret); - return ret; + return copied ? copied : ret; } static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) @@ -657,8 +682,8 @@ out: const struct address_space_operations ocfs2_aops = { .readpage = ocfs2_readpage, .writepage = ocfs2_writepage, - .prepare_write = ocfs2_prepare_write, - .commit_write = ocfs2_commit_write, + .write_begin = ocfs2_write_begin, + .write_end = ocfs2_write_end, .bmap = ocfs2_bmap, .sync_page = block_sync_page, .direct_IO = ocfs2_direct_IO - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html