Hi, cgroup based writeback sometimes appears to manipulate inode->i_io_list while holding the lock on the wrong bdi_writeback object. Following is a crash I was able to produce by adding extra delays in between list pointer updates (repro patch attached). [ 116.595958] ------------[ cut here ]------------ [ 116.596508] kernel BUG at lib/list_debug.c:74! [ 116.597000] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC [ 116.597654] CPU: 3 PID: 940 Comm: kworker/u8:6 Not tainted 4.5.0-rc6+ #39 [ 116.598397] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 [ 116.599290] Workqueue: writeback wb_workfn (flush-8:16) [ 116.599895] task: ffff88007c213700 ti: ffff88007c714000 task.ti: ffff88007c714000 [ 116.600006] RIP: 0010:[<ffffffff805a3393>] [<ffffffff805a3393>] __list_del_entry+0x53/0x60 [ 116.600006] RSP: 0018:ffff88007c717c50 EFLAGS: 00010206 [ 116.600006] RAX: dead000000000200 RBX: ffff88007c284818 RCX: ffff88007c717db0 [ 116.600006] RDX: ffff8800765bbdd8 RSI: ffff88007c717c90 RDI: ffff8800768595d8 [ 116.600006] RBP: ffff88007c717c60 R08: ffff8800768591d8 R09: 0000000000000000 [ 116.600006] R10: 0000000000000004 R11: 0000000000000001 R12: ffff88007c284818 [ 116.600006] R13: ffff88007c717c90 R14: ffff88007c284818 R15: ffff88007c717d28 [ 116.600006] FS: 0000000000000000(0000) GS:ffff88007f980000(0000) knlGS:0000000000000000 [ 116.600006] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 116.600006] CR2: 00007fff5822ff40 CR3: 0000000000e0a000 CR4: 00000000000006e0 [ 116.600006] Stack: [ 116.600006] ffff8800768595d8 ffff88007c46f000 ffff88007c717cc8 ffffffff8035e37f [ 116.600006] ffff88007c284828 0000000000000082 000000000000026a ffff88007c717cc0 [ 116.600006] ffff8800768111d8 ffff880076a309d8 ffff88007c284800 ffff88007c284828 [ 116.600006] Call Trace: [ 116.600006] [<ffffffff8035e37f>] move_expired_inodes+0x4f/0x180 [ 116.600006] [<ffffffff8035f0a1>] queue_io+0x61/0xb0 [ 116.600006] [<ffffffff80360813>] wb_writeback+0x1a3/0x1e0 [ 116.600006] [<ffffffff803609db>] wb_workfn+0x18b/0x280 [ 116.600006] [<ffffffff802795a8>] process_one_work+0x128/0x300 [ 116.600006] [<ffffffff802798a0>] worker_thread+0x120/0x480 [ 116.600006] [<ffffffff80279780>] ? process_one_work+0x300/0x300 [ 116.600006] [<ffffffff8027ea64>] kthread+0xc4/0xe0 [ 116.600006] [<ffffffff8032d1e8>] ? kfree+0xc8/0x100 [ 116.600006] [<ffffffff8027e9a0>] ? __kthread_parkme+0x70/0x70 [ 116.600006] [<ffffffff80970bdf>] ret_from_fork+0x3f/0x70 [ 116.600006] [<ffffffff8027e9a0>] ? __kthread_parkme+0x70/0x70 [ 116.600006] Code: 39 c3 74 29 48 3b 3b 75 22 49 3b 7c 24 08 75 19 49 89 5c 24 08 e8 fe fd ff ff 4c 89 23 e8 f6 fd ff ff 5b 41 5c 5d c3 0f 0b 0f 0b <0f> 0b 0f 0b 66 0f 1f 84 00 00 00 00 00 55 48 89 e5 53 48 89 fb [ 116.600006] RIP [<ffffffff805a3393>] __list_del_entry+0x53/0x60 [ 116.600006] RSP <ffff88007c717c50> [ 116.623099] ---[ end trace ff07596b6e4f4928 ]--- I see a few places where a wrong bdi_writeback object may be locked while manipulating inode->i_io_list links. The first one is in writeback_single_inode(): 1353 spin_lock(&wb->list_lock); 1354 spin_lock(&inode->i_lock); 1355 /* 1356 * If inode is clean, remove it from writeback lists. Otherwise don't 1357 * touch it. See comment above for explanation. 1358 */ 1359 if (!(inode->i_state & I_DIRTY_ALL)) 1360 inode_io_list_del_locked(inode, wb); 1361 spin_unlock(&wb->list_lock); The locked wb is passed in as a parameter and equals to inode_to_bdi(inode)->wb. But this may not actually match inode->i_wb. The second one is in writeback_sb_inodes(): 1499 __writeback_single_inode(inode, &wbc); 1500 1501 wbc_detach_inode(&wbc); 1502 work->nr_pages -= write_chunk - wbc.nr_to_write; 1503 wrote += write_chunk - wbc.nr_to_write; 1504 1505 if (need_resched()) { 1506 /* 1507 * We're trying to balance between building up a nice 1508 * long list of IOs to improve our merge rate, and 1509 * getting those IOs out quickly for anyone throttling 1510 * in balance_dirty_pages(). cond_resched() doesn't 1511 * unplug, so get our IOs out the door before we 1512 * give up the CPU. 1513 */ 1514 blk_flush_plug(current); 1515 cond_resched(); 1516 } 1517 1518 1519 spin_lock(&wb->list_lock); 1520 spin_lock(&inode->i_lock); 1521 if (!(inode->i_state & I_DIRTY_ALL)) 1522 wrote++; 1523 requeue_inode(inode, wb, &wbc); After wbc_detach_inode() is called, inode's i_wb could have changed. So locking the original wb seems wrong. The same issue exists in writeback_single_inode(). Repro patch: --- fs/fs-writeback.c | 4 ++++ include/linux/list.h | 6 ++++++ lib/list_debug.c | 29 ++++++++++++++++------------- repro.sh | 31 +++++++++++++++++++++++++++++++ 4 files changed, 57 insertions(+), 13 deletions(-) create mode 100755 repro.sh diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1f76d89..bd1bd75 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1301,6 +1301,8 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) return ret; } +atomic_t slow_down = ATOMIC_INIT(0); + /* * Write out an inode's dirty pages. Either the caller has an active reference * on the inode or the inode has I_WILL_FREE set. @@ -1315,6 +1317,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, { int ret = 0; + atomic_inc(&slow_down); spin_lock(&inode->i_lock); if (!atomic_read(&inode->i_count)) WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); @@ -1362,6 +1365,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, inode_sync_complete(inode); out: spin_unlock(&inode->i_lock); + atomic_dec(&slow_down); return ret; } diff --git a/include/linux/list.h b/include/linux/list.h index 30cf420..e3b032b 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -6,6 +6,8 @@ #include <linux/poison.h> #include <linux/const.h> #include <linux/kernel.h> +#include <asm/atomic.h> +#include <asm/processor.h> /* * Simple doubly linked list implementation. @@ -77,6 +79,8 @@ static inline void list_add_tail(struct list_head *new, struct list_head *head) __list_add(new, head->prev, head); } +void do_delay(void); + /* * Delete a list entry by making the prev/next entries * point to each other. @@ -87,7 +91,9 @@ static inline void list_add_tail(struct list_head *new, struct list_head *head) static inline void __list_del(struct list_head * prev, struct list_head * next) { next->prev = prev; + do_delay(); WRITE_ONCE(prev->next, next); + do_delay(); } /** diff --git a/lib/list_debug.c b/lib/list_debug.c index 3345a08..7195c4f 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -19,6 +19,14 @@ void list_force_poison(struct list_head *entry) entry->prev = &force_poison; } +extern atomic_t slow_down; + +void do_delay() { + int i; + for (i=0; atomic_read(&slow_down) && i<100000; i++) + cpu_relax(); +} + /* * Insert a new entry between two known consecutive entries. * @@ -44,9 +52,13 @@ void __list_add(struct list_head *new, "list_add double add: new=%p, prev=%p, next=%p.\n", new, prev, next); next->prev = new; + do_delay(); new->next = next; + do_delay(); new->prev = prev; + do_delay(); WRITE_ONCE(prev->next, new); + do_delay(); } EXPORT_SYMBOL(__list_add); @@ -57,19 +69,10 @@ void __list_del_entry(struct list_head *entry) prev = entry->prev; next = entry->next; - if (WARN(next == LIST_POISON1, - "list_del corruption, %p->next is LIST_POISON1 (%p)\n", - entry, LIST_POISON1) || - WARN(prev == LIST_POISON2, - "list_del corruption, %p->prev is LIST_POISON2 (%p)\n", - entry, LIST_POISON2) || - WARN(prev->next != entry, - "list_del corruption. prev->next should be %p, " - "but was %p\n", entry, prev->next) || - WARN(next->prev != entry, - "list_del corruption. next->prev should be %p, " - "but was %p\n", entry, next->prev)) - return; + BUG_ON(next == LIST_POISON1); + BUG_ON(prev == LIST_POISON2); + BUG_ON(prev->next != entry); + BUG_ON(next->prev != entry); __list_del(prev, next); } diff --git a/repro.sh b/repro.sh new file mode 100755 index 0000000..05de4c7 --- /dev/null +++ b/repro.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +CGROUP_ROOT=/mnt-cgroup2 + +mkdir -p $CGROUP_ROOT + +if ! mount | grep -qw cgroup2; then + mount -t cgroup2 none $CGROUP_ROOT +fi + +mkdir -p $CGROUP_ROOT/mem1 + +echo '+memory' > $CGROUP_ROOT/cgroup.subtree_control + +echo $$ > $CGROUP_ROOT/mem1/cgroup.procs + +if dumpe2fs -h /dev/sdb |grep -q 'Journal size'; then + echo 'This repro requires ext4 without journaling' + exit 1 +fi + +if ! mount | grep -qw /dev/sdb; then + mount /dev/sdb /mnt/sdb +fi + +(for i in {1..10000}; do dd if=/dev/urandom of=/mnt/sdb/fsync1 bs=4096 count=1 conv=notrunc,fsync &> /dev/null; done)& + +(for i in {1..10000};do dd if=/dev/urandom of=/mnt/sdb/mark_dirty$i bs=4096 count=1 conv=notrunc &> /dev/null;done)& + +wait + -- 2.7.0.rc3.207.g0ac5344 -- To unsubscribe from this list: send the line "unsubscribe cgroups" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html