From: Filipe Manana <fdmanana@xxxxxxxx> [ Upstream commit f9690f426b2134cc3e74bfc5d9dfd6a4b2ca5281 ] Commit dbcc7d57bffc0c ("btrfs: fix race when cloning extent buffer during rewind of an old root"), fixed a race when we need to rewind the extent buffer of an old root. It was caused by picking a new mod log operation for the extent buffer while getting a cloned extent buffer with an outdated number of items (off by -1), because we cloned the extent buffer without locking it first. However there is still another similar race, but in the opposite direction. The cloned extent buffer has a number of items that does not match the number of tree mod log operations that are going to be replayed. This is because right after we got the last (most recent) tree mod log operation to replay and before locking and cloning the extent buffer, another task adds a new pointer to the extent buffer, which results in adding a new tree mod log operation and incrementing the number of items in the extent buffer. So after cloning we have mismatch between the number of items in the extent buffer and the number of mod log operations we are going to apply to it. This results in hitting a BUG_ON() that produces the following stack trace: ------------[ cut here ]------------ kernel BUG at fs/btrfs/tree-mod-log.c:675! invalid opcode: 0000 [#1] SMP KASAN PTI CPU: 3 PID: 4811 Comm: crawl_1215 Tainted: G W 5.12.0-7d1efdf501f8-misc-next+ #99 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 RIP: 0010:tree_mod_log_rewind+0x3b1/0x3c0 Code: 05 48 8d 74 10 (...) RSP: 0018:ffffc90001027090 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffff8880a8514600 RCX: ffffffffaa9e59b6 RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8880a851462c RBP: ffffc900010270e0 R08: 00000000000000c0 R09: ffffed1004333417 R10: ffff88802199a0b7 R11: ffffed1004333416 R12: 000000000000000e R13: ffff888135af8748 R14: ffff88818766ff00 R15: ffff8880a851462c FS: 00007f29acf62700(0000) GS:ffff8881f2200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f0e6013f718 CR3: 000000010d42e003 CR4: 0000000000170ee0 Call Trace: btrfs_get_old_root+0x16a/0x5c0 ? lock_downgrade+0x400/0x400 btrfs_search_old_slot+0x192/0x520 ? btrfs_search_slot+0x1090/0x1090 ? free_extent_buffer.part.61+0xd7/0x140 ? free_extent_buffer+0x13/0x20 resolve_indirect_refs+0x3e9/0xfc0 ? lock_downgrade+0x400/0x400 ? __kasan_check_read+0x11/0x20 ? add_prelim_ref.part.11+0x150/0x150 ? lock_downgrade+0x400/0x400 ? __kasan_check_read+0x11/0x20 ? lock_acquired+0xbb/0x620 ? __kasan_check_write+0x14/0x20 ? do_raw_spin_unlock+0xa8/0x140 ? rb_insert_color+0x340/0x360 ? prelim_ref_insert+0x12d/0x430 find_parent_nodes+0x5c3/0x1830 ? stack_trace_save+0x87/0xb0 ? resolve_indirect_refs+0xfc0/0xfc0 ? fs_reclaim_acquire+0x67/0xf0 ? __kasan_check_read+0x11/0x20 ? lockdep_hardirqs_on_prepare+0x210/0x210 ? fs_reclaim_acquire+0x67/0xf0 ? __kasan_check_read+0x11/0x20 ? ___might_sleep+0x10f/0x1e0 ? __kasan_kmalloc+0x9d/0xd0 ? trace_hardirqs_on+0x55/0x120 btrfs_find_all_roots_safe+0x142/0x1e0 ? find_parent_nodes+0x1830/0x1830 ? trace_hardirqs_on+0x55/0x120 ? ulist_free+0x1f/0x30 ? btrfs_inode_flags_to_xflags+0x50/0x50 iterate_extent_inodes+0x20e/0x580 ? tree_backref_for_extent+0x230/0x230 ? release_extent_buffer+0x225/0x280 ? read_extent_buffer+0xdd/0x110 ? lock_downgrade+0x400/0x400 ? __kasan_check_read+0x11/0x20 ? lock_acquired+0xbb/0x620 ? __kasan_check_write+0x14/0x20 ? do_raw_spin_unlock+0xa8/0x140 ? _raw_spin_unlock+0x22/0x30 ? release_extent_buffer+0x225/0x280 iterate_inodes_from_logical+0x129/0x170 ? iterate_inodes_from_logical+0x129/0x170 ? btrfs_inode_flags_to_xflags+0x50/0x50 ? iterate_extent_inodes+0x580/0x580 ? __vmalloc_node+0x92/0xb0 ? init_data_container+0x34/0xb0 ? init_data_container+0x34/0xb0 ? kvmalloc_node+0x60/0x80 btrfs_ioctl_logical_to_ino+0x158/0x230 btrfs_ioctl+0x2038/0x4360 ? __kasan_check_write+0x14/0x20 ? mmput+0x3b/0x220 ? btrfs_ioctl_get_supported_features+0x30/0x30 ? __kasan_check_read+0x11/0x20 ? __kasan_check_read+0x11/0x20 ? lock_release+0xc8/0x650 ? __might_fault+0x64/0xd0 ? __kasan_check_read+0x11/0x20 ? lock_downgrade+0x400/0x400 ? lockdep_hardirqs_on_prepare+0x210/0x210 ? lockdep_hardirqs_on_prepare+0x13/0x210 ? _raw_spin_unlock_irqrestore+0x51/0x63 ? __kasan_check_read+0x11/0x20 ? do_vfs_ioctl+0xfc/0x9d0 ? ioctl_file_clone+0xe0/0xe0 ? lock_downgrade+0x400/0x400 ? lockdep_hardirqs_on_prepare+0x210/0x210 ? __kasan_check_read+0x11/0x20 ? lock_release+0xc8/0x650 ? __task_pid_nr_ns+0xd3/0x250 ? __kasan_check_read+0x11/0x20 ? __fget_files+0x160/0x230 ? __fget_light+0xf2/0x110 __x64_sys_ioctl+0xc3/0x100 do_syscall_64+0x37/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f29ae85b427 Code: 00 00 90 48 8b (...) RSP: 002b:00007f29acf5fcf8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007f29acf5ff40 RCX: 00007f29ae85b427 RDX: 00007f29acf5ff48 RSI: 00000000c038943b RDI: 0000000000000003 RBP: 0000000001000000 R08: 0000000000000000 R09: 00007f29acf60120 R10: 00005640d5fc7b00 R11: 0000000000000246 R12: 0000000000000003 R13: 00007f29acf5ff48 R14: 00007f29acf5ff40 R15: 00007f29acf5fef8 Modules linked in: ---[ end trace 85e5fce078dfbe04 ]--- (gdb) l *(tree_mod_log_rewind+0x3b1) 0xffffffff819e5b21 is in tree_mod_log_rewind (fs/btrfs/tree-mod-log.c:675). 670 * the modification. As we're going backwards, we do the 671 * opposite of each operation here. 672 */ 673 switch (tm->op) { 674 case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING: 675 BUG_ON(tm->slot < n); 676 fallthrough; 677 case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING: 678 case BTRFS_MOD_LOG_KEY_REMOVE: 679 btrfs_set_node_key(eb, &tm->key, tm->slot); (gdb) quit The following steps explain in more detail how it happens: 1) We have one tree mod log user (through fiemap or the logical ino ioctl), with a sequence number of 1, so we have fs_info->tree_mod_seq == 1. This is task A; 2) Another task is at ctree.c:balance_level() and we have eb X currently as the root of the tree, and we promote its single child, eb Y, as the new root. Then, at ctree.c:balance_level(), we call: ret = btrfs_tree_mod_log_insert_root(root->node, child, true); 3) At btrfs_tree_mod_log_insert_root() we create a tree mod log operation of type BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, with a ->logical field pointing to ebX->start. We only have one item in eb X, so we create only one tree mod log operation, and store in the "tm_list" array; 4) Then, still at btrfs_tree_mod_log_insert_root(), we create a tree mod log element of operation type BTRFS_MOD_LOG_ROOT_REPLACE, ->logical set to ebY->start, ->old_root.logical set to ebX->start, ->old_root.level set to the level of eb X and ->generation set to the generation of eb X; 5) Then btrfs_tree_mod_log_insert_root() calls tree_mod_log_free_eb() with "tm_list" as argument. After that, tree_mod_log_free_eb() calls tree_mod_log_insert(). This inserts the mod log operation of type BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING from step 3 into the rbtree with a sequence number of 2 (and fs_info->tree_mod_seq set to 2); 6) Then, after inserting the "tm_list" single element into the tree mod log rbtree, the BTRFS_MOD_LOG_ROOT_REPLACE element is inserted, which gets the sequence number 3 (and fs_info->tree_mod_seq set to 3); 7) Back to ctree.c:balance_level(), we free eb X by calling btrfs_free_tree_block() on it. Because eb X was created in the current transaction, has no other references and writeback did not happen for it, we add it back to the free space cache/tree; 8) Later some other task B allocates the metadata extent from eb X, since it is marked as free space in the space cache/tree, and uses it as a node for some other btree; 9) The tree mod log user task calls btrfs_search_old_slot(), which calls btrfs_get_old_root(), and finally that calls tree_mod_log_oldest_root() with time_seq == 1 and eb_root == eb Y; 10) The first iteration of the while loop finds the tree mod log element with sequence number 3, for the logical address of eb Y and of type BTRFS_MOD_LOG_ROOT_REPLACE; 11) Because the operation type is BTRFS_MOD_LOG_ROOT_REPLACE, we don't break out of the loop, and set root_logical to point to tm->old_root.logical, which corresponds to the logical address of eb X; 12) On the next iteration of the while loop, the call to tree_mod_log_search_oldest() returns the smallest tree mod log element for the logical address of eb X, which has a sequence number of 2, an operation type of BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING and corresponds to the old slot 0 of eb X (eb X had only 1 item in it before being freed at step 7); 13) We then break out of the while loop and return the tree mod log operation of type BTRFS_MOD_LOG_ROOT_REPLACE (eb Y), and not the one for slot 0 of eb X, to btrfs_get_old_root(); 14) At btrfs_get_old_root(), we process the BTRFS_MOD_LOG_ROOT_REPLACE operation and set "logical" to the logical address of eb X, which was the old root. We then call tree_mod_log_search() passing it the logical address of eb X and time_seq == 1; 15) But before calling tree_mod_log_search(), task B locks eb X, adds a key to eb X, which results in adding a tree mod log operation of type BTRFS_MOD_LOG_KEY_ADD, with a sequence number of 4, to the tree mod log, and increments the number of items in eb X from 0 to 1. Now fs_info->tree_mod_seq has a value of 4; 16) Task A then calls tree_mod_log_search(), which returns the most recent tree mod log operation for eb X, which is the one just added by task B at the previous step, with a sequence number of 4, a type of BTRFS_MOD_LOG_KEY_ADD and for slot 0; 17) Before task A locks and clones eb X, task A adds another key to eb X, which results in adding a new BTRFS_MOD_LOG_KEY_ADD mod log operation, with a sequence number of 5, for slot 1 of eb X, increments the number of items in eb X from 1 to 2, and unlocks eb X. Now fs_info->tree_mod_seq has a value of 5; 18) Task A then locks eb X and clones it. The clone has a value of 2 for the number of items and the pointer "tm" points to the tree mod log operation with sequence number 4, not the most recent one with a sequence number of 5, so there is mismatch between the number of mod log operations that are going to be applied to the cloned version of eb X and the number of items in the clone; 19) Task A then calls tree_mod_log_rewind() with the clone of eb X, the tree mod log operation with sequence number 4 and a type of BTRFS_MOD_LOG_KEY_ADD, and time_seq == 1; 20) At tree_mod_log_rewind(), we set the local variable "n" with a value of 2, which is the number of items in the clone of eb X. Then in the first iteration of the while loop, we process the mod log operation with sequence number 4, which is targeted at slot 0 and has a type of BTRFS_MOD_LOG_KEY_ADD. This results in decrementing "n" from 2 to 1. Then we pick the next tree mod log operation for eb X, which is the tree mod log operation with a sequence number of 2, a type of BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING and for slot 0, it is the one added in step 5 to the tree mod log tree. We go back to the top of the loop to process this mod log operation, and because its slot is 0 and "n" has a value of 1, we hit the BUG_ON: (...) switch (tm->op) { case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING: BUG_ON(tm->slot < n); fallthrough; (...) Fix this by checking for a more recent tree mod log operation after locking and cloning the extent buffer of the old root node, and use it as the first operation to apply to the cloned extent buffer when rewinding it. Stable backport notes: due to moved code and renames, in =< 5.11 the change should be applied to ctree.c:get_old_root. Reported-by: Zygo Blaxell <ce3g8jdj@xxxxxxxxxxxxxxxxxxxxx> Link: https://lore.kernel.org/linux-btrfs/20210404040732.GZ32440@xxxxxxxxxxxxxx/ Fixes: 834328a8493079 ("Btrfs: tree mod log's old roots could still be part of the tree") CC: stable@xxxxxxxxxxxxxxx # 4.4+ Signed-off-by: Filipe Manana <fdmanana@xxxxxxxx> Signed-off-by: David Sterba <dsterba@xxxxxxxx> Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx> --- fs/btrfs/ctree.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 35741901f4e0..ab69e3563b12 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1406,10 +1406,30 @@ get_old_root(struct btrfs_root *root, u64 time_seq) "failed to read tree block %llu from get_old_root", logical); } else { + struct tree_mod_elem *tm2; + btrfs_tree_read_lock(old); eb = btrfs_clone_extent_buffer(old); + /* + * After the lookup for the most recent tree mod operation + * above and before we locked and cloned the extent buffer + * 'old', a new tree mod log operation may have been added. + * So lookup for a more recent one to make sure the number + * of mod log operations we replay is consistent with the + * number of items we have in the cloned extent buffer, + * otherwise we can hit a BUG_ON when rewinding the extent + * buffer. + */ + tm2 = tree_mod_log_search(fs_info, logical, time_seq); btrfs_tree_read_unlock(old); free_extent_buffer(old); + ASSERT(tm2); + ASSERT(tm2 == tm || tm2->seq > tm->seq); + if (!tm2 || tm2->seq < tm->seq) { + free_extent_buffer(eb); + return NULL; + } + tm = tm2; } } else if (old_root) { eb_root_owner = btrfs_header_owner(eb_root); -- 2.30.2