On Fri, Mar 09, 2012 at 10:02:28AM +0100, Jan Kara wrote: > Doing iput() from flusher thread (writeback_sb_inodes()) can create problems > because iput() can do a lot of work - for example truncate the inode if it's > the last iput on unlinked file. Some filesystems (e.g. ubifs) may need to > allocate blocks during truncate (due to their COW nature) and in some cases > they thus need to flush dirty data from truncate to reduce uncertainty in the > amount of free space. This effectively creates a deadlock. > > We get rid of iput() in flusher thread by using the fact that I_SYNC inode > flag effectively pins the inode in memory. So if we take care to either hold > i_lock or have I_SYNC set, we can get away without taking inode reference > in writeback_sb_inodes(). I created this graph while trying to understand/prove the new locking rules, and it looks perfectly fine. flusher: I_FREEING check requeue/dequeue . . b_io.prev . writepages write_inode . . . . . . . . . . . I_SYNC . . ============================== i_lock . ============ === ====== list_lock ============= . . ============== . . . . check/wait I_SYNC I_DIRTY handling evict/end_writeback: check I_SYNC set I_FREEING . wait I_SYNC destroy_inode . dequeue . . . I_SYNC . . . ********************* . i_lock ===== . ==== *==== . list_lock ====== It may be worthwhile to note that evict() needs to grab i_lock *after* waiting for I_SYNC, so that the flusher won't be accessing a possibly freed inode when unlocking i_lock. > As a side effect, we also fix possible use-after-free in wb_writeback() because > inode_wait_for_writeback() call could try to reacquire i_lock on the inode that > was already free. Good catch! I think you are referring to this code: /* * Nothing written. Wait for some inode to * become available for writeback. Otherwise * we'll just busyloop. 
*/ if (!list_empty(&wb->b_more_io)) { trace_writeback_wait(wb->bdi, work); inode = wb_inode(wb->b_more_io.prev); spin_lock(&inode->i_lock); inode_wait_for_writeback(inode, wb); spin_unlock(&inode->i_lock); } > Signed-off-by: Jan Kara <jack@xxxxxxx> > --- > fs/fs-writeback.c | 38 ++++++++++++++++++++++++-------------- > fs/inode.c | 11 ++++++++++- > include/linux/fs.h | 7 ++++--- > include/linux/writeback.h | 7 +------ > 4 files changed, 39 insertions(+), 24 deletions(-) > > diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c > index 1e8bf44..f9f9b61 100644 > --- a/fs/fs-writeback.c > +++ b/fs/fs-writeback.c > @@ -325,19 +325,21 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc) > } > > /* > - * Wait for writeback on an inode to complete. > + * Wait for writeback on an inode to complete. Called with i_lock held. > + * Return 1 if we dropped i_lock and waited, 0 is returned otherwise. > */ > -static void inode_wait_for_writeback(struct inode *inode) > +int __must_check inode_wait_for_writeback(struct inode *inode) > { > DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); > wait_queue_head_t *wqh; > > wqh = bit_waitqueue(&inode->i_state, __I_SYNC); > - while (inode->i_state & I_SYNC) { > + if (inode->i_state & I_SYNC) { > spin_unlock(&inode->i_lock); > __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); > - spin_lock(&inode->i_lock); > + return 1; > } > + return 0; > } > > /* > @@ -426,9 +428,12 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, > return 0; > } > /* > - * It's a data-integrity sync. We must wait. > + * It's a data-integrity sync. We must wait. Since callers hold > + * inode reference or inode has I_WILL_FREE set, it cannot go > + * away under us. 
> */ > - inode_wait_for_writeback(inode); > + while (inode_wait_for_writeback(inode)) > + spin_lock(&inode->i_lock); > } > > ret = __writeback_single_inode(inode, wb, wbc); > @@ -604,12 +609,20 @@ static long writeback_sb_inodes(struct super_block *sb, > } > spin_unlock(&wb->list_lock); > > - __iget(inode); > - inode_wait_for_writeback(inode); > + /* Did we drop i_lock to wait for I_SYNC? */ > + if (inode_wait_for_writeback(inode)) { > + /* Inode may be gone, start again */ > + spin_lock(&wb->list_lock); > + continue; > + } > write_chunk = writeback_chunk_size(wb->bdi, work); > wbc.nr_to_write = write_chunk; > wbc.pages_skipped = 0; > > + /* > + * We use I_SYNC to pin the inode in memory. While it is set > + * end_writeback() will wait so the inode cannot be freed. > + */ > __writeback_single_inode(inode, wb, &wbc); > > work->nr_pages -= write_chunk - wbc.nr_to_write; > @@ -633,10 +646,7 @@ static long writeback_sb_inodes(struct super_block *sb, > continue_unlock: > inode_sync_complete(inode); > spin_unlock(&inode->i_lock); > - spin_unlock(&wb->list_lock); > - iput(inode); > - cond_resched(); > - spin_lock(&wb->list_lock); > + cond_resched_lock(&wb->list_lock); > /* > * bail out to wb_writeback() often enough to check > * background threshold and other termination conditions. 
> @@ -831,8 +841,8 @@ static long wb_writeback(struct bdi_writeback *wb, > inode = wb_inode(wb->b_more_io.prev); > spin_lock(&inode->i_lock); > spin_unlock(&wb->list_lock); > - inode_wait_for_writeback(inode); > - spin_unlock(&inode->i_lock); > + if (!inode_wait_for_writeback(inode)) > + spin_unlock(&inode->i_lock); > spin_lock(&wb->list_lock); > } > } > diff --git a/fs/inode.c b/fs/inode.c > index d3ebdbe..b64e2fe 100644 > --- a/fs/inode.c > +++ b/fs/inode.c > @@ -510,7 +510,16 @@ void end_writeback(struct inode *inode) > BUG_ON(!list_empty(&inode->i_data.private_list)); > BUG_ON(!(inode->i_state & I_FREEING)); > BUG_ON(inode->i_state & I_CLEAR); > - inode_sync_wait(inode); > + /* > + * Wait for flusher thread to be done with the inode. Since the inode > + * has I_FREEING set, flusher thread won't start new work on the inode. > + * We just have to wait for running writeback to finish. We must use > + * i_lock here because flusher thread might be working with the inode > + * without I_SYNC set but under i_lock. > + */ > + spin_lock(&inode->i_lock); > + if (!inode_wait_for_writeback(inode)) > + spin_unlock(&inode->i_lock); > /* don't need i_lock here, no concurrent mods to i_state */ > inode->i_state = I_FREEING | I_CLEAR; > } > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 69cd5bb..e1f0f5a 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -1742,9 +1742,10 @@ struct super_operations { > * anew. Other functions will just ignore such inodes, > * if appropriate. I_NEW is used for waiting. > * > - * I_SYNC Synchonized write of dirty inode data. The bits is > - * set during data writeback, and cleared with a wakeup > - * on the bit address once it is done. > + * I_SYNC Writeback of inode is running. The bits is set during
s/bits/bit/
> + * data writeback, and cleared with a wakeup on the bit > + * address once it is done. The bit is also used to pin > + * the inode in memory for flusher thread. 
> * > * I_REFERENCED Marks the inode as recently references on the LRU list. > * > diff --git a/include/linux/writeback.h b/include/linux/writeback.h > index 995b8bf..3a34dc0 100644 > --- a/include/linux/writeback.h > +++ b/include/linux/writeback.h > @@ -94,6 +94,7 @@ long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, > enum wb_reason reason); > long wb_do_writeback(struct bdi_writeback *wb, int force_wait); > void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); > +int __must_check inode_wait_for_writeback(struct inode *inode); > > /* writeback.h requires fs.h; it, too, is not included from here. */ > static inline void wait_on_inode(struct inode *inode) > @@ -101,12 +102,6 @@ static inline void wait_on_inode(struct inode *inode) > might_sleep(); > wait_on_bit(&inode->i_state, __I_NEW, inode_wait, TASK_UNINTERRUPTIBLE); > } > -static inline void inode_sync_wait(struct inode *inode) > -{ > - might_sleep(); > - wait_on_bit(&inode->i_state, __I_SYNC, inode_wait, > - TASK_UNINTERRUPTIBLE); > -} > > > /* > -- > 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html