This creates a version of ext3_get_block that starts and ends a transaction. By starting and ending the transaction inside get_block, it avoids lock inversion problems when the DIO code tries to take page locks inside blockdev_direct_IO (transaction locks must always be taken after page locks). Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxx> diff -r 385bc75d9266 -r bebaf8972a31 fs/ext3/inode.c --- a/fs/ext3/inode.c Thu Dec 21 15:31:30 2006 -0500 +++ b/fs/ext3/inode.c Thu Dec 21 15:31:30 2006 -0500 @@ -1673,6 +1673,30 @@ static int ext3_releasepage(struct page return journal_try_to_free_buffers(journal, page, wait); } +static int ext3_get_block_direct_IO(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + handle_t *handle = ext3_journal_start(inode, DIO_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = ext3_get_block(inode, iblock, bh_result, create); + /* + * Reacquire the handle: ext3_get_block() can restart the transaction + */ + handle = journal_current_handle(); + if (handle) { + int err; + err = ext3_journal_stop(handle); + if (!ret) + ret = err; + } +out: + return ret; +} + /* * If the O_DIRECT write will extend the file then add this inode to the * orphan list. 
So recovery will truncate it back to the original size @@ -1693,39 +1717,58 @@ static ssize_t ext3_direct_IO(int rw, st int orphan = 0; size_t count = iov_length(iov, nr_segs); - if (rw == WRITE) { - loff_t final_size = offset + count; - + if (rw == WRITE && (offset + count > inode->i_size)) { handle = ext3_journal_start(inode, DIO_CREDITS); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } - if (final_size > inode->i_size) { - ret = ext3_orphan_add(handle, inode); - if (ret) - goto out_stop; - orphan = 1; - ei->i_disksize = inode->i_size; - } - } - + ret = ext3_orphan_add(handle, inode); + if (ret) { + ext3_journal_stop(handle); + goto out; + } + ei->i_disksize = inode->i_size; + ret = ext3_journal_stop(handle); + if (ret) { + /* something has gone horribly wrong, cleanup + * the orphan list in ram + */ + if (inode->i_nlink) + ext3_orphan_del(NULL, inode); + goto out; + } + orphan = 1; + } + + /* + * the placeholder page code may take a page lock, so we have + * to stop any running transactions before calling + * blockdev_direct_IO. Use ext3_get_block_direct_IO to start + * and stop a transaction on each get_block call. 
+ */ ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, - ext3_get_block, NULL); + ext3_get_block_direct_IO, NULL); /* * Reacquire the handle: ext3_get_block() can restart the transaction */ handle = journal_current_handle(); -out_stop: - if (handle) { + if (orphan) { int err; - - if (orphan && inode->i_nlink) + handle = ext3_journal_start(inode, DIO_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + if (inode->i_nlink) + ext3_orphan_del(NULL, inode); + goto out; + } + + if (inode->i_nlink) ext3_orphan_del(handle, inode); - if (orphan && ret > 0) { + if (ret > 0) { loff_t end = offset + ret; if (end > inode->i_size) { ei->i_disksize = end; - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html