[RFC PATCH V2 1/2] ext4: fix a potential assertion failure due to improperly dirtied buffer

zhangshida <starzhangzsd@xxxxxxxxx> · Fri, 9 Aug 2024 14:46:05 +0800

From: Shida Zhang <zhangshida@xxxxxxxxxx>

On an old kernel version(4.19, ext3, data=journal, pagesize=64k),
an assertion failure will occasionally be triggered by the line below:
-----------
jbd2_journal_commit_transaction
{
...
J_ASSERT_BH(bh, !buffer_dirty(bh));
/*
* The buffer on BJ_Forget list and not jbddirty means
...
}
-----------

The same condition may also be applied to the lattest kernel version.

AFAIC, that's how the problem works:
--------
journal_unmap_buffer
jbd2_journal_invalidatepage
__ext4_journalled_invalidatepage
ext4_journalled_invalidatepage
do_invalidatepage
truncate_inode_pages_range
truncate_inode_pages
truncate_pagecache
ext4_setattr
--------
First try to truncate and invalidate the page.
ext4_setattr() will try to free it by adding it to the BJ_Forget list
for further processing.
Put it more clearly,
when ext4_setattr() truncates the file, the buffer is not fully freed
yet. It's half-freed.
Furthermore,
Because the buffer is half-freed, the reallocating thing won't need to happen.
Now,
under that scenario, can we redirty the half-freed buffer on the BJ_Forget list?
The answer may be 'yes'.

redirty it by the following code:
ext4_block_write_begin
    if (!buffer_mapped(bh)) { // check 1
         _ext4_get_block(inode, block, bh, 1);
        (buffer_new(bh)) { // check 2
             if (folio_test_uptodate(folio)) { // check 3
                 mark_buffer_dirty(bh);

But can it pass the checks?

Is the buffer mapped? no, journal_unmap_buffer() will clear the mapped state.
Pass the check 1.

Is the buffer new? maybe, _ext4_get_block will mark it as new when the
underlying block is unwritten.
Pass the check 2.

Is the folio uptodate? yes.
Pass the check 3.

Yep, the buffer finally gets dirtied and jbd2_journal_commit_transaction() sees
a dirty but not jbd_dirty buffer on the BJ_Forget list.

To fix it:
Trace the user data dirting in ext4_block_write_begin() for data=journal mode,
as suggested by Jan.

Reported-by: Baolin Liu <liubaolin@xxxxxxxxxx>
Suggested-by: Jan Kara <jack@xxxxxxx>
Signed-off-by: Shida Zhang <zhangshida@xxxxxxxxxx>
---
 fs/ext4/inode.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 941c1c0d5c6e..de46c0a6842a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -49,6 +49,11 @@
 
 #include <trace/events/ext4.h>
 
+static void ext4_journalled_zero_new_buffers(handle_t *handle,
+					    struct inode *inode,
+					    struct folio *folio,
+					    unsigned from, unsigned to);
+
 static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
 			      struct ext4_inode_info *ei)
 {
@@ -1042,7 +1047,8 @@ int do_journal_get_write_access(handle_t *handle, struct inode *inode,
 }
 
 #ifdef CONFIG_FS_ENCRYPTION
-static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
+static int ext4_block_write_begin(handle_t *handle, struct folio *folio,
+				  loff_t pos, unsigned len,
 				  get_block_t *get_block)
 {
 	unsigned from = pos & (PAGE_SIZE - 1);
@@ -1056,6 +1062,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
 	struct buffer_head *bh, *head, *wait[2];
 	int nr_wait = 0;
 	int i;
+	bool should_journal_data = ext4_should_journal_data(inode);
 
 	BUG_ON(!folio_test_locked(folio));
 	BUG_ON(from > PAGE_SIZE);
@@ -1084,11 +1091,16 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
 			err = get_block(inode, block, bh, 1);
 			if (err)
 				break;
+			if (should_journal_data)
+				do_journal_get_write_access(handle, inode, bh);
 			if (buffer_new(bh)) {
 				if (folio_test_uptodate(folio)) {
 					clear_buffer_new(bh);
 					set_buffer_uptodate(bh);
-					mark_buffer_dirty(bh);
+					if (should_journal_data)
+						ext4_dirty_journalled_data(handle, bh);
+					else
+						mark_buffer_dirty(bh);
 					continue;
 				}
 				if (block_end > to || block_start < from)
@@ -1118,7 +1130,11 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
 			err = -EIO;
 	}
 	if (unlikely(err)) {
-		folio_zero_new_buffers(folio, from, to);
+		if (should_journal_data)
+			ext4_journalled_zero_new_buffers(handle, inode, folio,
+							 from, to);
+		else
+			folio_zero_new_buffers(folio, from, to);
 	} else if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
 		for (i = 0; i < nr_wait; i++) {
 			int err2;
@@ -1218,10 +1234,11 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 
 #ifdef CONFIG_FS_ENCRYPTION
 	if (ext4_should_dioread_nolock(inode))
-		ret = ext4_block_write_begin(folio, pos, len,
+		ret = ext4_block_write_begin(handle, folio, pos, len,
 					     ext4_get_block_unwritten);
 	else
-		ret = ext4_block_write_begin(folio, pos, len, ext4_get_block);
+		ret = ext4_block_write_begin(handle, folio, pos, len,
+					     ext4_get_block);
 #else
 	if (ext4_should_dioread_nolock(inode))
 		ret = __block_write_begin(&folio->page, pos, len,
@@ -2962,7 +2979,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 		return PTR_ERR(folio);
 
 #ifdef CONFIG_FS_ENCRYPTION
-	ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep);
+	ret = ext4_block_write_begin(NULL, folio, pos, len,
+				     ext4_da_get_block_prep);
 #else
 	ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep);
 #endif
-- 
2.33.0