Currently, I_SYNC is used to keep track of whether writeback is in progress on the inode or not. With cgroup writeback support, multiple writebacks could be in progress simultaneously and a single bit in inode->i_state isn't sufficient. If CONFIG_CGROUP_WRITEBACK, this patch makes each iwbl (inode_wb_link) track whether writeback is in progress using the new IWBL_SYNC flag on iwbl->data and inode->i_nr_syncs aggregate the total number of writebacks in progress on the inode. New helpers, iwbl_{test|set|clear}_sync(), iwbl_sync_wakeup() and iwbl_wait_for_writeback() are added to manipulate these states and inode_sleep_on_writeback() is converted to iwbl_sleep_on_writeback(). I_SYNC retains the same meaning - it's set if any writeback is in progress and cleared if none. If !CONFIG_CGROUP_WRITEBACK, the helpers simply operate on I_SYNC directly and there are no behavioral changes compared to before. When CONFIG_CGROUP_WRITEBACK, this adds an unsigned int counter (i_nr_syncs) to struct inode. This competes for the same left over 4 byte slot w/ i_readcount from CONFIG_IMA on 64 bit, and, as long as CONFIG_IMA isn't enabled, it doesn't increase the size of struct inode on 64 bit. This allows keeping track of writeback in-progress state per cgroup bdi_writeback. 
Signed-off-by: Tejun Heo <tj@xxxxxxxxxx> Cc: Alexander Viro <viro@xxxxxxxxxxxxxxxxxx> Cc: Christoph Hellwig <hch@xxxxxxxxxxxxx> Cc: Jens Axboe <axboe@xxxxxxxxx> Cc: Jan Kara <jack@xxxxxxx> --- fs/fs-writeback.c | 172 +++++++++++++++++++++++++++++++++------ include/linux/backing-dev-defs.h | 7 ++ include/linux/fs.h | 3 + mm/backing-dev.c | 1 + 4 files changed, 160 insertions(+), 23 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d10c231..df99b5b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -430,26 +430,113 @@ restart: rcu_read_unlock(); } +/** + * iwbl_test_sync - test whether writeback is in progress on an inode_wb_link + * @iwbl: target inode_wb_link + * + * Test whether writeback is in progress for the inode on the bdi_writeback + * specified by @iwbl. The caller is responsible for synchronization. + */ +static bool iwbl_test_sync(struct inode_wb_link *iwbl) +{ + return test_bit(IWBL_SYNC, &iwbl->data); +} + +/** + * iwbl_set_sync - mark an inode_wb_link that writeback is in progress + * @iwbl: target inode_wb_link + * @inode: inode @iwbl is associated with + * + * Mark that writeback is in progress for @inode on the bdi_writeback + * specified by @iwbl. iwbl_test_sync() will return %true on @iwbl and + * %I_SYNC is set on @inode while there's any writeback in progress on it. + */ +static void iwbl_set_sync(struct inode_wb_link *iwbl, struct inode *inode) +{ + lockdep_assert_held(&inode->i_lock); + WARN_ON_ONCE(iwbl_test_sync(iwbl)); + + set_bit(IWBL_SYNC, &iwbl->data); + inode->i_nr_syncs++; + inode->i_state |= I_SYNC; +} + +/** + * iwbl_clear_sync - undo iwbl_set_sync() + * @iwbl: target inode_wb_link + * @inode: inode @iwbl is associated with + * + * Returns %true if this was the last writeback in progress on @inode; + * %false, otherwise. 
+ */ +static bool iwbl_clear_sync(struct inode_wb_link *iwbl, struct inode *inode) +{ + bool sync_complete; + + lockdep_assert_held(&inode->i_lock); + WARN_ON_ONCE(!iwbl_test_sync(iwbl)); + + clear_bit(IWBL_SYNC, &iwbl->data); + sync_complete = !--inode->i_nr_syncs; + if (sync_complete) + inode->i_state &= ~I_SYNC; + return sync_complete; +} + +/** + * iwbl_wait_for_writeback - wait for writeback in progress on an inode_wb_link + * @iwbl: target inode_wb_link + * + * Wait for the writeback in progress for the inode on the bdi_writeback + * specified by @iwbl. + */ +static void iwbl_wait_for_writeback(struct inode_wb_link *iwbl) + __releases(inode->i_lock) + __acquires(inode->i_lock) +{ + struct inode *inode = iwbl_to_inode(iwbl); + DEFINE_WAIT_BIT(wq, &iwbl->data, IWBL_SYNC); + wait_queue_head_t *wqh; + + lockdep_assert_held(&inode->i_lock); + + wqh = bit_waitqueue(&iwbl->data, IWBL_SYNC); + while (test_bit(IWBL_SYNC, &iwbl->data)) { + spin_unlock(&inode->i_lock); + __wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE); + spin_lock(&inode->i_lock); + } +} + /* * Sleep until I_SYNC is cleared. This function must be called with i_lock * held and drops it. It is aimed for callers not holding any inode reference * so once i_lock is dropped, inode can go away. 
*/ -static void inode_sleep_on_writeback(struct inode *inode) - __releases(inode->i_lock) +static void iwbl_sleep_on_writeback(struct inode_wb_link *iwbl) { DEFINE_WAIT(wait); - wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC); + struct inode *inode = iwbl_to_inode(iwbl); + wait_queue_head_t *wqh = bit_waitqueue(&iwbl->data, IWBL_SYNC); int sleep; prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); - sleep = inode->i_state & I_SYNC; + sleep = test_bit(IWBL_SYNC, &iwbl->data); spin_unlock(&inode->i_lock); if (sleep) schedule(); finish_wait(wqh, &wait); } +/** + * iwbl_sync_wakeup - wakeup iwbl_{wait_for|sleep_on}_writeback() waiter + * @iwbl: target inode_wb_link + */ +static void iwbl_sync_wakeup(struct inode_wb_link *iwbl) +{ + wake_up_bit(&iwbl->data, IWBL_SYNC); +} + static inline struct inode_cgwb_link *icgwbl_first(struct inode *inode) { struct hlist_node *node = @@ -504,6 +591,7 @@ static void inode_icgwbls_del(struct inode *inode) * bdi->icgwbls_lock. */ inode_for_each_icgwbl(icgwbl, next, inode) { + WARN_ON_ONCE(test_bit(IWBL_SYNC, &icgwbl->iwbl.data)); hlist_del_rcu(&icgwbl->inode_node); list_move(&icgwbl->wb_node, &to_free); } @@ -544,15 +632,39 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, } } +static bool iwbl_test_sync(struct inode_wb_link *iwbl) +{ + struct inode *inode = iwbl_to_inode(iwbl); + + return inode->i_state & I_SYNC; +} + +static void iwbl_set_sync(struct inode_wb_link *iwbl, struct inode *inode) +{ + inode->i_state |= I_SYNC; +} + +static bool iwbl_clear_sync(struct inode_wb_link *iwbl, struct inode *inode) +{ + inode->i_state &= ~I_SYNC; + return true; +} + +static void iwbl_wait_for_writeback(struct inode_wb_link *iwbl) +{ + __inode_wait_for_writeback(iwbl_to_inode(iwbl)); +} + /* * Sleep until I_SYNC is cleared. This function must be called with i_lock * held and drops it. It is aimed for callers not holding any inode reference * so once i_lock is dropped, inode can go away. 
*/ -static void inode_sleep_on_writeback(struct inode *inode) +static void iwbl_sleep_on_writeback(struct inode_wb_link *iwbl) __releases(inode->i_lock) { DEFINE_WAIT(wait); + struct inode *inode = iwbl_to_inode(iwbl); wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC); int sleep; @@ -564,6 +676,11 @@ static void inode_sleep_on_writeback(struct inode *inode) finish_wait(wqh, &wait); } +static void iwbl_sync_wakeup(struct inode_wb_link *iwbl) +{ + /* noop, __I_SYNC wakeup is enough */ +} + static void inode_icgwbls_del(struct inode *inode) { } @@ -700,14 +817,22 @@ static void requeue_io(struct inode_wb_link *iwbl, struct bdi_writeback *wb) iwbl_move_locked(iwbl, wb, &wb->b_more_io); } -static void inode_sync_complete(struct inode *inode) +static void iwbl_sync_complete(struct inode_wb_link *iwbl) { - inode->i_state &= ~I_SYNC; + struct inode *inode = iwbl_to_inode(iwbl); + bool sync_complete; + + sync_complete = iwbl_clear_sync(iwbl, inode); /* If inode is clean an unused, put it into LRU now... */ - inode_add_lru(inode); + if (sync_complete) + inode_add_lru(inode); + /* Waiters must see I_SYNC cleared before being woken up */ smp_mb(); - wake_up_bit(&inode->i_state, __I_SYNC); + + iwbl_sync_wakeup(iwbl); + if (sync_complete) + wake_up_bit(&inode->i_state, __I_SYNC); } static bool iwbl_dirtied_after(struct inode_wb_link *iwbl, unsigned long t) @@ -888,17 +1013,18 @@ static void requeue_inode(struct inode_wb_link *iwbl, struct bdi_writeback *wb, /* * Write out an inode and its dirty pages. Do not update the writeback list * linkage. That is left to the caller. The caller is also responsible for - * setting I_SYNC flag and calling inode_sync_complete() to clear it. + * setting I_SYNC flag and calling iwbl_sync_complete() to clear it. 
*/ static int __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; + struct inode_wb_link *iwbl = &inode->i_wb_link; long nr_to_write = wbc->nr_to_write; unsigned dirty; int ret; - WARN_ON(!(inode->i_state & I_SYNC)); + WARN_ON(!iwbl_test_sync(iwbl)); trace_writeback_single_inode_start(inode, wbc, nr_to_write); @@ -976,7 +1102,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, else WARN_ON(inode->i_state & I_WILL_FREE); - if (inode->i_state & I_SYNC) { + if (iwbl_test_sync(iwbl)) { if (wbc->sync_mode != WB_SYNC_ALL) goto out; /* @@ -984,9 +1110,9 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * inode reference or inode has I_WILL_FREE set, it cannot go * away under us. */ - __inode_wait_for_writeback(inode); + iwbl_wait_for_writeback(iwbl); } - WARN_ON(inode->i_state & I_SYNC); + WARN_ON(iwbl_test_sync(iwbl)); /* * Skip inode if it is clean and we have no outstanding writeback in * WB_SYNC_ALL mode. 
We don't want to mess with writeback lists in this @@ -999,7 +1125,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, (wbc->sync_mode != WB_SYNC_ALL || !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; - inode->i_state |= I_SYNC; + iwbl_set_sync(iwbl, inode); spin_unlock(&inode->i_lock); ret = __writeback_single_inode(inode, wbc); @@ -1013,7 +1139,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, if (!(inode->i_state & I_DIRTY)) iwbl_del_locked(iwbl, wb); spin_unlock(&wb->list_lock); - inode_sync_complete(inode); + iwbl_sync_complete(iwbl); out: spin_unlock(&inode->i_lock); return ret; @@ -1107,7 +1233,7 @@ static long writeback_sb_inodes(struct super_block *sb, redirty_tail(iwbl, wb); continue; } - if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { + if (iwbl_test_sync(iwbl) && wbc.sync_mode != WB_SYNC_ALL) { /* * If this inode is locked for writeback and we are not * doing writeback-for-data-integrity, move it to @@ -1129,14 +1255,14 @@ static long writeback_sb_inodes(struct super_block *sb, * are doing WB_SYNC_NONE writeback. So this catches only the * WB_SYNC_ALL case. */ - if (inode->i_state & I_SYNC) { + if (iwbl_test_sync(iwbl)) { /* Wait for I_SYNC. This function drops i_lock... 
*/ - inode_sleep_on_writeback(inode); + iwbl_sleep_on_writeback(iwbl); /* Inode may be gone, start again */ spin_lock(&wb->list_lock); continue; } - inode->i_state |= I_SYNC; + iwbl_set_sync(iwbl, inode); spin_unlock(&inode->i_lock); write_chunk = writeback_chunk_size(wb, work); @@ -1156,7 +1282,7 @@ static long writeback_sb_inodes(struct super_block *sb, if (!(inode->i_state & I_DIRTY)) wrote++; requeue_inode(iwbl, wb, &wbc); - inode_sync_complete(inode); + iwbl_sync_complete(iwbl); spin_unlock(&inode->i_lock); cond_resched_lock(&wb->list_lock); /* @@ -1356,7 +1482,7 @@ static long wb_writeback(struct bdi_writeback *wb, spin_lock(&inode->i_lock); spin_unlock(&wb->list_lock); /* This function drops i_lock... */ - inode_sleep_on_writeback(inode); + iwbl_sleep_on_writeback(iwbl); spin_lock(&wb->list_lock); } } @@ -1648,7 +1774,7 @@ void mark_inode_dirty_dctx(struct dirty_context *dctx, int flags) * The unlocker will place the inode on the appropriate * superblock list, based upon its state. */ - if (inode->i_state & I_SYNC) + if (iwbl_test_sync(iwbl)) goto out_unlock_inode; /* diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index e448edc..e3b18f3 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -47,8 +47,15 @@ enum wb_stat_item { * IWBL_* flags which occupy the lower bits of inode_wb_link->data. The * upper bits point to bdi_writeback, so the number of these flags * determines the minimum alignment of bdi_writeback. + * + * IWBL_SYNC + * + * Tracks whether writeback is in progress for an iwbl. If this bit is + * set for any iwbl on an inode, the inode's I_SYNC is set too. 
*/ enum { + IWBL_SYNC = 0, + IWBL_FLAGS_BITS, IWBL_FLAGS_MASK = (1UL << IWBL_FLAGS_BITS) - 1, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index b394821..4c22824 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -625,6 +625,9 @@ struct inode { #ifdef CONFIG_IMA atomic_t i_readcount; /* struct files open RO */ #endif +#ifdef CONFIG_CGROUP_WRITEBACK + unsigned int i_nr_syncs; +#endif const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ struct file_lock *i_flock; struct address_space i_data; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index e4db465..1399ad6 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -539,6 +539,7 @@ static void cgwb_exit(struct bdi_writeback *wb) spin_lock_irqsave(&wb->bdi->icgwbls_lock, flags); list_for_each_entry_safe(icgwbl, next, &wb->icgwbls, wb_node) { WARN_ON_ONCE(!list_empty(&icgwbl->iwbl.dirty_list)); + WARN_ON_ONCE(test_bit(IWBL_SYNC, &icgwbl->iwbl.data)); hlist_del_rcu(&icgwbl->inode_node); list_del(&icgwbl->wb_node); kfree_rcu(icgwbl, rcu); -- 2.1.0 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>