This implements the journal callbacks j_submit|finish_inode_data_buffers() with different behavior for data=journal: they write-protect pages under commit, preventing changes to buffers writably mapped to userspace.

If a buffer's content changes between the commit's checksum calculation and its write-out to disk, it can cause journal recovery/mount failures after a kernel crash or power loss:

[ 27.334874] EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support!
[ 27.339492] JBD2: Invalid checksum recovering data block 8705 in log
[ 27.342716] JBD2: recovery failed
[ 27.343316] EXT4-fs (loop0): error loading journal
mount: /ext4: can't read superblock on /dev/loop0.

In j_submit_inode_data_buffers() we write-protect the inode's pages with write_cache_pages() and redirty them with the writepage callback if needed. In j_finish_inode_data_buffers() there is nothing to do.

In order to use the callbacks, inodes are added to the transaction's inode list in __ext4_journalled_writepage() and ext4_page_mkwrite().

In ext4_page_mkwrite() we must make sure that:
1) the inode is always added to the list; thus we skip the 'all buffers mapped' optimization on data=journal;
2) the buffers are attached to the transaction as dirty, as already done in __ext4_journalled_writepage().
Signed-off-by: Mauricio Faria de Oliveira <mfo@xxxxxxxxxxxxx> Suggested-by: Jan Kara <jack@xxxxxxx> Reported-by: Dann Frazier <dann.frazier@xxxxxxxxxxxxx> --- fs/ext4/inode.c | 29 ++++++++++++++------ fs/ext4/super.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 91 insertions(+), 10 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bf596467c234..fa4109da056c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1910,6 +1910,9 @@ static int __ext4_journalled_writepage(struct page *page, err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, write_end_fn); } + if (ret == 0) + ret = err; + err = ext4_jbd2_inode_add_write(handle, inode, 0, len); if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; @@ -6004,9 +6007,12 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) len = PAGE_SIZE; /* * Return if we have all the buffers mapped. This avoids the need to do - * journal_start/journal_stop which can block and take a long time + * journal_start/journal_stop which can block and take a long time. + * + * This cannot be done for data journalling, as we have to add the + * inode to the transaction's list to writeprotect pages on commit. 
*/ - if (page_has_buffers(page)) { + if (page_has_buffers(page) && !ext4_should_journal_data(inode)) { if (!ext4_walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, ext4_bh_unmapped)) { @@ -6032,12 +6038,14 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) err = block_page_mkwrite(vma, vmf, get_block); if (!err && ext4_should_journal_data(inode)) { if (ext4_walk_page_buffers(handle, page_buffers(page), 0, - PAGE_SIZE, NULL, do_journal_get_write_access)) { - unlock_page(page); - ret = VM_FAULT_SIGBUS; - ext4_journal_stop(handle); - goto out; - } + PAGE_SIZE, NULL, do_journal_get_write_access)) + goto out_err; + /* Make sure buffers are attached to the transaction as dirty */ + if (ext4_walk_page_buffers(handle, page_buffers(page), 0, + PAGE_SIZE, NULL, write_end_fn)) + goto out_err; + if (ext4_jbd2_inode_add_write(handle, inode, 0, PAGE_SIZE)) + goto out_err; ext4_set_inode_state(inode, EXT4_STATE_JDATA); } ext4_journal_stop(handle); @@ -6049,6 +6057,11 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(inode->i_sb); return ret; +out_err: + unlock_page(page); + ret = VM_FAULT_SIGBUS; + ext4_journal_stop(handle); + goto out; } vm_fault_t ext4_filemap_fault(struct vm_fault *vmf) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7303839d7ad9..528b5e20b71c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -472,14 +472,82 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) spin_unlock(&sbi->s_md_lock); } +/* + * This writepage callback for write_cache_pages() + * takes care of a few cases after page cleaning. + * + * write_cache_pages() already checks for dirty pages + * and calls clear_page_dirty_for_io(), which we want, + * to write protect the pages. 
+ * + * However, we have to redirty a page in these cases: + * 1) some buffer is dirty (needs checkpointing) + * 2) some buffer is not part of the committing transaction + * 3) some buffer already has b_next_transaction set + */ + +static int ext4_journalled_writepage_callback(struct page *page, + struct writeback_control *wbc, + void *data) +{ + transaction_t *transaction = (transaction_t *) data; + struct buffer_head *bh, *head; + struct journal_head *jh; + + bh = head = page_buffers(page); + do { + jh = bh2jh(bh); + if (buffer_dirty(bh) || + (jh && (jh->b_transaction != transaction || + jh->b_next_transaction))) { + redirty_page_for_writepage(wbc, page); + goto out; + } + } while ((bh = bh->b_this_page) != head); + +out: + return AOP_WRITEPAGE_ACTIVATE; +} + +static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) +{ + struct address_space *mapping = jinode->i_vfs_inode->i_mapping; + transaction_t *transaction = jinode->i_transaction; + loff_t dirty_start = jinode->i_dirty_start; + loff_t dirty_end = jinode->i_dirty_end; + + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = ~0ULL, + .range_start = dirty_start, + .range_end = dirty_end, + }; + + return write_cache_pages(mapping, &wbc, + ext4_journalled_writepage_callback, + transaction); +} + static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) { - return jbd2_journal_submit_inode_data_buffers(jinode); + int ret; + + if (ext4_should_journal_data(jinode->i_vfs_inode)) + ret = ext4_journalled_submit_inode_data_buffers(jinode); + else + ret = jbd2_journal_submit_inode_data_buffers(jinode); + + return ret; } static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode) { - return jbd2_journal_finish_inode_data_buffers(jinode); + int ret = 0; + + if (!ext4_should_journal_data(jinode->i_vfs_inode)) + ret = jbd2_journal_finish_inode_data_buffers(jinode); + + return ret; } static bool system_going_down(void) -- 2.17.1