On Tue 24-05-22 08:05:40, Jchao Sun wrote:
> Commit b35250c0816c ("writeback: Protect inode->i_io_list with
> inode->i_lock") made inode->i_io_list not only protected by
> wb->list_lock but also inode->i_lock, but inode_io_list_move_locked()
> was missed. Add lock there and also update comment describing
> things protected by inode->i_lock. This also fixes a race where
> __mark_inode_dirty() could move inode under flush worker's hands
> and thus sync(2) could miss writing some inodes.
>
> Fixes: b35250c0816c ("writeback: Protect inode->i_io_list with inode->i_lock")
> Signed-off-by: Jchao Sun <sunjunchao2870@xxxxxxxxx>

Thanks for the fix! It looks good to me now (modulo some too long comment
lines but I can wrap those on commit). I'll take the patch to my tree once
I push out stuff I have ready for the merge window.

								Honza

> ---
>  fs/fs-writeback.c | 37 ++++++++++++++++++++++++++++---------
>  fs/inode.c        |  2 +-
>  2 files changed, 29 insertions(+), 10 deletions(-)
>
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index 591fe9cf1659..99793bb838e5 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -120,6 +120,7 @@ static bool inode_io_list_move_locked(struct inode *inode,
>  				      struct list_head *head)
>  {
>  	assert_spin_locked(&wb->list_lock);
> +	assert_spin_locked(&inode->i_lock);
>
>  	list_move(&inode->i_io_list, head);
>
> @@ -1365,9 +1366,9 @@ static int move_expired_inodes(struct list_head *delaying_queue,
>  		inode = wb_inode(delaying_queue->prev);
>  		if (inode_dirtied_after(inode, dirtied_before))
>  			break;
> +		spin_lock(&inode->i_lock);
>  		list_move(&inode->i_io_list, &tmp);
>  		moved++;
> -		spin_lock(&inode->i_lock);
>  		inode->i_state |= I_SYNC_QUEUED;
>  		spin_unlock(&inode->i_lock);
>  		if (sb_is_blkdev_sb(inode->i_sb))
> @@ -1383,7 +1384,12 @@ static int move_expired_inodes(struct list_head *delaying_queue,
>  		goto out;
>  	}
>
> -	/* Move inodes from one superblock together */
> +	/*
> +	 * Although inode's i_io_list is moved from 'tmp' to 'dispatch_queue',
> +	 * we don't take inode->i_lock here because it is just a pointless overhead.
> +	 * Inode is already marked as I_SYNC_QUEUED so writeback list handling is
> +	 * fully under our control.
> +	 */
>  	while (!list_empty(&tmp)) {
>  		sb = wb_inode(tmp.prev)->i_sb;
>  		list_for_each_prev_safe(pos, node, &tmp) {
> @@ -1821,8 +1827,8 @@ static long writeback_sb_inodes(struct super_block *sb,
>  			 * We'll have another go at writing back this inode
>  			 * when we completed a full scan of b_io.
>  			 */
> -			spin_unlock(&inode->i_lock);
>  			requeue_io(inode, wb);
> +			spin_unlock(&inode->i_lock);
>  			trace_writeback_sb_inodes_requeue(inode);
>  			continue;
>  		}
> @@ -2351,6 +2357,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>  {
>  	struct super_block *sb = inode->i_sb;
>  	int dirtytime = 0;
> +	struct bdi_writeback *wb = NULL;
>
>  	trace_writeback_mark_inode_dirty(inode, flags);
>
> @@ -2402,6 +2409,17 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>  		inode->i_state &= ~I_DIRTY_TIME;
>  		inode->i_state |= flags;
>
> +		/*
> +		 * Grab inode's wb early because it requires dropping i_lock and we
> +		 * need to make sure following checks happen atomically with dirty
> +		 * list handling so that we don't move inodes under flush worker's
> +		 * hands.
> +		 */
> +		if (!was_dirty) {
> +			wb = locked_inode_to_wb_and_lock_list(inode);
> +			spin_lock(&inode->i_lock);
> +		}
> +
>  		/*
>  		 * If the inode is queued for writeback by flush worker, just
>  		 * update its dirty state. Once the flush worker is done with
> @@ -2409,7 +2427,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>  		 * list, based upon its state.
>  		 */
>  		if (inode->i_state & I_SYNC_QUEUED)
> -			goto out_unlock_inode;
> +			goto out_unlock;
>
>  		/*
>  		 * Only add valid (hashed) inodes to the superblock's
> @@ -2417,22 +2435,19 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>  		 */
>  		if (!S_ISBLK(inode->i_mode)) {
>  			if (inode_unhashed(inode))
> -				goto out_unlock_inode;
> +				goto out_unlock;
>  		}
>  		if (inode->i_state & I_FREEING)
> -			goto out_unlock_inode;
> +			goto out_unlock;
>
>  		/*
>  		 * If the inode was already on b_dirty/b_io/b_more_io, don't
>  		 * reposition it (that would break b_dirty time-ordering).
>  		 */
>  		if (!was_dirty) {
> -			struct bdi_writeback *wb;
>  			struct list_head *dirty_list;
>  			bool wakeup_bdi = false;
>
> -			wb = locked_inode_to_wb_and_lock_list(inode);
> -
>  			inode->dirtied_when = jiffies;
>  			if (dirtytime)
>  				inode->dirtied_time_when = jiffies;
> @@ -2446,6 +2461,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>  						       dirty_list);
>
>  			spin_unlock(&wb->list_lock);
> +			spin_unlock(&inode->i_lock);
>  			trace_writeback_dirty_inode_enqueue(inode);
>
>  			/*
> @@ -2460,6 +2476,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>  			return;
>  		}
>  	}
> +out_unlock:
> +	if (wb)
> +		spin_unlock(&wb->list_lock);
>  out_unlock_inode:
>  	spin_unlock(&inode->i_lock);
>  }
> diff --git a/fs/inode.c b/fs/inode.c
> index 9d9b422504d1..bd4da9c5207e 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -27,7 +27,7 @@
>   * Inode locking rules:
>   *
>   * inode->i_lock protects:
> - *   inode->i_state, inode->i_hash, __iget()
> + *   inode->i_state, inode->i_hash, __iget(), inode->i_io_list
>   * Inode LRU list locks protect:
>   *   inode->i_sb->s_inode_lru, inode->i_lru
>   * inode->i_sb->s_inode_list_lock protects:
> --
> 2.17.1
>
--
Jan Kara <jack@xxxxxxxx>
SUSE Labs, CR
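
[Illustrative sketch, not part of the thread.] The ordering the fixed
__mark_inode_dirty() relies on is: take wb->list_lock first (the kernel
gets there via locked_inode_to_wb_and_lock_list(), which drops i_lock),
then re-take inode->i_lock, and only then re-check I_SYNC_QUEUED and move
the inode, so the check and the list manipulation are atomic with respect
to the flush worker. Below is a minimal userspace model of that rule. All
toy_* names, the pthread locks, and the simplified state fields are
invented for illustration; this is not kernel code and elides the real
list handling.

#include <pthread.h>
#include <stdio.h>

struct toy_inode {
	pthread_mutex_t i_lock;     /* stands in for inode->i_lock */
	int on_dirty_list;          /* stands in for membership of wb->b_dirty */
	int sync_queued;            /* stands in for I_SYNC_QUEUED */
};

struct toy_wb {
	pthread_mutex_t list_lock;  /* stands in for wb->list_lock */
};

/* Counterpart of inode_io_list_move_locked() after the patch: the caller
 * must hold both locks before the writeback list is touched. */
static void toy_io_list_move_locked(struct toy_wb *wb, struct toy_inode *inode)
{
	(void)wb;                   /* the list itself is elided in this model */
	inode->on_dirty_list = 1;
}

/* Lock ordering of the fixed __mark_inode_dirty(): wb list lock, then the
 * per-inode lock, then the state re-check and the move.  Because the
 * I_SYNC_QUEUED test happens with both locks held, a concurrent flush
 * worker cannot move the inode in between the check and the list update. */
static void toy_mark_inode_dirty(struct toy_wb *wb, struct toy_inode *inode)
{
	pthread_mutex_lock(&wb->list_lock);
	pthread_mutex_lock(&inode->i_lock);

	if (!inode->sync_queued)
		toy_io_list_move_locked(wb, inode);

	pthread_mutex_unlock(&inode->i_lock);
	pthread_mutex_unlock(&wb->list_lock);
}

int main(void)
{
	struct toy_wb wb = { .list_lock = PTHREAD_MUTEX_INITIALIZER };
	struct toy_inode ino = {
		.i_lock = PTHREAD_MUTEX_INITIALIZER,
		.on_dirty_list = 0,
		.sync_queued = 0,
	};

	toy_mark_inode_dirty(&wb, &ino);
	printf("on_dirty_list=%d\n", ino.on_dirty_list);
	return 0;
}

(Build with: cc -pthread toy.c.  The point mirrored from the patch is only
the two-lock discipline, not the real writeback list machinery.)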