There is a race between mark inode dirty and writeback thread, see the following scenario. In this case, writeback thread will not run though there is dirty_io. __mark_inode_dirty() bdi_writeback_workfn() ... ... spin_lock(&inode->i_lock); ... if (bdi_cap_writeback_dirty(bdi)) { <<< assume wb has dirty_io, so wakeup_bdi is false. <<< the following inode_dirty also have wakeup_bdi false. if (!wb_has_dirty_io(&bdi->wb)) wakeup_bdi = true; } spin_unlock(&inode->i_lock); <<< assume last dirty_io is removed here. pages_written = wb_do_writeback(wb); ... <<< work_list empty and wb has no dirty_io, <<< delayed_work will not be queued. if (!list_empty(&bdi->work_list) || (wb_has_dirty_io(wb) && dirty_writeback_interval)) queue_delayed_work(bdi_wq, &wb->dwork, msecs_to_jiffies(dirty_writeback_interval * 10)); spin_lock(&bdi->wb.list_lock); inode->dirtied_when = jiffies; <<< new dirty_io is added. list_move(&inode->i_wb_list, &bdi->wb.b_dirty); spin_unlock(&bdi->wb.list_lock); <<< though there is dirty_io, but wakeup_bdi is false, <<< so writeback thread will not be waked up and <<< the new dirty_io will not be flushed. if (wakeup_bdi) bdi_wakeup_thread_delayed(bdi); Writeback will run until there is a new flush work queued. This may cause a lot of dirty pages stay in memory for a long time. Cc: Fengguang Wu <fengguang.wu@xxxxxxxxx> Signed-off-by: Junxiao Bi <junxiao.bi@xxxxxxxxxx> --- fs/fs-writeback.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 68851ff..72e6275 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1140,6 +1140,8 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (unlikely(block_dump)) block_dump___mark_inode_dirty(inode); + bdi = inode_to_bdi(inode); + spin_lock(&bdi->wb.list_lock); spin_lock(&inode->i_lock); if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; @@ -1171,7 +1173,6 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ if (!was_dirty) { bool wakeup_bdi = false; - bdi = inode_to_bdi(inode); if (bdi_cap_writeback_dirty(bdi)) { WARN(!test_bit(BDI_registered, &bdi->state), @@ -1187,10 +1188,9 @@ void __mark_inode_dirty(struct inode *inode, int flags) wakeup_bdi = true; } - spin_unlock(&inode->i_lock); - spin_lock(&bdi->wb.list_lock); inode->dirtied_when = jiffies; list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + spin_unlock(&inode->i_lock); spin_unlock(&bdi->wb.list_lock); if (wakeup_bdi) @@ -1200,6 +1200,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) } out_unlock_inode: spin_unlock(&inode->i_lock); + spin_unlock(&bdi->wb.list_lock); } EXPORT_SYMBOL(__mark_inode_dirty); -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html