From: Mauricio Faria de Oliveira <mfo@xxxxxxxxxxxxx> Work in progress / request for comments to fix issue with journal consistency errors on unclean shutdown with mmap() on ext4 data=journal,journal_checksum mode. Reference: https://lore.kernel.org/linux-ext4/20190830012236.GC10779@xxxxxxx/ --- fs/ext4/ext4_jbd2.h | 11 ++++++ fs/ext4/inline.c | 13 +++++++ fs/ext4/inode.c | 82 ++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 102 insertions(+), 4 deletions(-) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index a6b9b66dbfad..8b51ca8231b7 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -144,6 +144,17 @@ struct ext4_journal_cb_entry { /* user data goes here */ }; +/** + * struct ext4_journal_cb_entry_simple - Simple structure for callback information. + * + * This struct is a 'simple' structure on top of the base/seed structure, + * which adds a private data pointer to be used by the callback function. + */ +struct ext4_journal_cb_entry_simple { + struct ext4_journal_cb_entry jce; + void *jce_private; +}; + /** * ext4_journal_callback_add: add a function to call after transaction commit * @handle: active journal transaction handle to register callback on diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 2fec62d764fa..a168fe497d5d 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -565,6 +565,9 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping, ret = -ENOMEM; goto out; } + /* XXX(mfo): deadlock with journal? */ + if (ext4_should_journal_data(inode)) + wait_on_page_writeback(page); ext4_write_lock_xattr(inode, &no_expand); sem_held = 1; @@ -693,6 +696,9 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, ret = -ENOMEM; goto out; } + /* XXX(mfo): deadlock with journal? */ + if (ext4_should_journal_data(inode)) + wait_on_page_writeback(page); *pagep = page; down_read(&EXT4_I(inode)->xattr_sem); @@ -808,6 +814,10 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, page = grab_cache_page_write_begin(mapping, 0, flags); if (!page) return -ENOMEM; + /* XXX(mfo): should not deadlock with journal; + * this is only called after ext4_journal_stop(). */ + if (ext4_should_journal_data(inode)) + wait_on_page_writeback(page); down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { @@ -911,6 +921,9 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, ret = -ENOMEM; goto out_journal; } + /* XXX(mfo): deadlock with journal? */ + if (ext4_should_journal_data(inode)) + wait_on_page_writeback(page); down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 28f28de0c1b6..ca31db5f81f0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -139,7 +139,7 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, static void ext4_invalidatepage(struct page *page, unsigned int offset, unsigned int length); -static int __ext4_journalled_writepage(struct page *page, unsigned int len); +static int __ext4_journalled_writepage(struct page *page, unsigned int len, bool sync); static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); @@ -1155,6 +1155,15 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, unlock_page(page); retry_journal: + + /* XXX(mfo): deadlock with journal: fix attempt. + * does just wait_on_page_writeback() need (un)lock_page() ? */ + if (ext4_should_journal_data(inode)) { + lock_page(page); + wait_on_page_writeback(page); + unlock_page(page); + } + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { put_page(page); @@ -1170,6 +1179,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, goto retry_grab; } /* In case writeback began while the page was unlocked */ + /* XXX(mfo): this may call wait_on_page_writeback() and deadlock. */ wait_for_stable_page(page); #ifdef CONFIG_FS_ENCRYPTION @@ -1841,8 +1851,21 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) return 0; } +static void __ext4_jce_finish_page_writeback(struct super_block *sb, + struct ext4_journal_cb_entry *jce, + int error) +{ + struct ext4_journal_cb_entry_simple *jce_simple = + (struct ext4_journal_cb_entry_simple *) jce; + struct page *page = (struct page *) jce_simple->jce_private; + + end_page_writeback(page); + kfree(jce); +} + static int __ext4_journalled_writepage(struct page *page, - unsigned int len) + unsigned int len, + bool sync) { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; @@ -1851,6 +1874,7 @@ static int __ext4_journalled_writepage(struct page *page, int ret = 0, err = 0; int inline_data = ext4_has_inline_data(inode); struct buffer_head *inode_bh = NULL; + struct ext4_journal_cb_entry_simple *jce = NULL; ClearPageChecked(page); @@ -1875,13 +1899,25 @@ static int __ext4_journalled_writepage(struct page *page, * out from under us. */ get_page(page); + /* XXX(mfo): do NOT set_page_writeback() here; as the next lock_page() + * could deadlock with write_cache_pages() (since it calls lock_page() + * and then wait_on_page_writeback()). + */ + //set_page_writeback(page); unlock_page(page); + if (!sync) { + jce = kzalloc(sizeof(*jce), GFP_NOFS); + if (!jce) { + ret = -ENOMEM; + goto out_no_pagelock; + } + } + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - put_page(page); goto out_no_pagelock; } BUG_ON(!ext4_handle_valid(handle)); @@ -1906,7 +1942,25 @@ static int __ext4_journalled_writepage(struct page *page, } if (ret == 0) ret = err; + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; + + /* XXX(mfo): only call set_page_writeback() and add callback to end_page_writeback() + * on journal commit on the *a*sync case; otherwise msync() blocks until periodic + * journal commit happens, waiting on page writeback. + * msync() -> ext4_sync_file() -> __filemap_fdatawait_range() -> wait_on_page_writeback() + * + * Confirm wether the *sync* case does *not* need ext4_handle_sync() ? + * as msync() -> ext4_sync_file() already calls ext4_force_commit() for data=journal. + */ + if (!sync) { + /* Add journal callback entry to finish page writeback and free itself */ + set_page_writeback(page); + jce->jce_private = page; + ext4_journal_callback_add(handle, __ext4_jce_finish_page_writeback, + (struct ext4_journal_cb_entry *) jce); + } + err = ext4_journal_stop(handle); if (!ret) ret = err; @@ -1918,6 +1972,15 @@ static int __ext4_journalled_writepage(struct page *page, out: unlock_page(page); out_no_pagelock: + /* + * Error leg for handle not yet initialized (jce allocation error) + * or failed to. Either way kfree(jce) is ok (it's NULL or valid.) + */ + if (IS_ERR_OR_NULL(handle)) { + put_page(page); + kfree(jce); + } + brelse(inode_bh); return ret; } @@ -2029,7 +2092,8 @@ static int ext4_writepage(struct page *page, * It's mmapped pagecache. Add buffers and journal it. There * doesn't seem much point in redirtying the page here. */ - return __ext4_journalled_writepage(page, len); + return __ext4_journalled_writepage(page, len, + (wbc->sync_mode == WB_SYNC_ALL)); ext4_io_submit_init(&io_submit, wbc); io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); @@ -2974,6 +3038,14 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, * of file which has an already mapped buffer. */ retry_journal: + + /* XXX(mfo): see comment in non-da ext4_write_begin(). */ + if (ext4_should_journal_data(inode)) { + lock_page(page); + wait_on_page_writeback(page); + unlock_page(page); + } + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, ext4_da_write_credits(inode, pos, len)); if (IS_ERR(handle)) { @@ -5929,6 +6001,8 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ wait_for_stable_page(page); + if (ext4_should_journal_data(inode)) + wait_on_page_writeback(page); ret = VM_FAULT_LOCKED; goto out; } -- 2.17.1