The mechanism for detecting whether an inode should switch its wb (bdi_writeback) association is now in place. This patch build the framework for the actual switching. This patch adds a new inode flag I_WB_SWITCHING, which has two functions. First, the easy one, it ensures that there's only one switching in progress for a give inode. Second, it's used as a mechanism to synchronize wb stat updates. The two stats, WB_RECLAIMABLE and WB_WRITEBACK, aren't event counters but track the current number of dirty pages and pages under writeback respectively. As such, when an inode is moved from one wb to another, the inode's portion of those stats have to be transferred together; unfortunately, this is a bit tricky as those stat updates are percpu operations which are performed without holding any lock in some places. This patch solves the problem in a similar way as memcg. Each such lockless stat updates are wrapped in transaction surrounded by inode_wb_stat_unlocked_begin/end(). During normal operation, they map to rcu_read_lock/unlock(); however, if I_WB_SWITCHING is asserted, mapping->tree_lock is grabbed across the transaction. In turn, the switching path sets I_WB_SWITCHING and waits for a RCU grace period to pass before actually starting to switch, which guarantees that all stat update paths are synchronizing against mapping->tree_lock. Combined with the previous patch, this makes all wb list and stat operations to be protected by either of inode->i_lock, wb->list_lock, or mapping->tree_lock while wb switching is in progress. This patch still doesn't implement the actual switching. Signed-off-by: Tejun Heo <tj@xxxxxxxxxx> Cc: Jens Axboe <axboe@xxxxxxxxx> Cc: Jan Kara <jack@xxxxxxx> Cc: Wu Fengguang <fengguang.wu@xxxxxxxxx> Cc: Greg Thelen <gthelen@xxxxxxxxxx> --- fs/fs-writeback.c | 117 +++++++++++++++++++++++++++++++++++++++++--- include/linux/backing-dev.h | 53 ++++++++++++++++++++ include/linux/fs.h | 6 +++ mm/page-writeback.c | 16 ++++-- mm/truncate.c | 7 ++- 5 files changed, 188 insertions(+), 11 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e888c4b..7a1ab24 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -295,6 +295,115 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) return locked_inode_to_wb_and_lock_list(inode); } +struct inode_switch_wb_context { + struct inode *inode; + struct bdi_writeback *new_wb; + + struct rcu_head rcu_head; + struct work_struct work; +}; + +static void inode_switch_wb_work_fn(struct work_struct *work) +{ + struct inode_switch_wb_context *isw = + container_of(work, struct inode_switch_wb_context, work); + struct inode *inode = isw->inode; + struct bdi_writeback *new_wb = isw->new_wb; + + /* + * By the time control reaches here, RCU grace period has passed + * since I_WB_SWITCH assertion and all wb stat update transactions + * between inode_wb_stat_unlocked_begin/end() are guaranteed to be + * synchronizing against mapping->tree_lock. + */ + spin_lock(&inode->i_lock); + + inode->i_wb_frn_winner = 0; + inode->i_wb_frn_avg_time = 0; + inode->i_wb_frn_history = 0; + + /* + * Paired with load_acquire in inode_wb_stat_unlocked_begin() and + * ensures that the new wb is visible if they see !I_WB_SWITCH. + */ + smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH); + + spin_unlock(&inode->i_lock); + + iput(inode); + wb_put(new_wb); + kfree(isw); +} + +static void inode_switch_wb_rcu_fn(struct rcu_head *rcu_head) +{ + struct inode_switch_wb_context *isw = + container_of(rcu_head, struct inode_switch_wb_context, rcu_head); + + /* needs to grab bh-unsafe locks, bounce to work item */ + INIT_WORK(&isw->work, inode_switch_wb_work_fn); + schedule_work(&isw->work); +} + +/** + * inode_switch_wb - change the wb association of an inode + * @inode: target inode + * @new_wb_id: ID of the new wb + * + * Switch @inode's wb association to the wb identified by @new_wb_id. The + * switching is performed asynchronously and may fail silently. + */ +static void inode_switch_wb(struct inode *inode, int new_wb_id) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + struct cgroup_subsys_state *memcg_css; + struct inode_switch_wb_context *isw; + + /* noop if seems to be already in progress */ + if (inode->i_state & I_WB_SWITCH) + return; + + isw = kzalloc(sizeof(*isw), GFP_ATOMIC); + if (!isw) + return; + + /* find and pin the new wb */ + rcu_read_lock(); + memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys); + if (memcg_css) + isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + rcu_read_unlock(); + if (!isw->new_wb) + goto out_free; + + /* while holding I_WB_SWITCH, no one else can update the association */ + spin_lock(&inode->i_lock); + if (inode->i_state & (I_WB_SWITCH | I_FREEING) || + inode_to_wb(inode) == isw->new_wb) { + spin_unlock(&inode->i_lock); + goto out_free; + } + inode->i_state |= I_WB_SWITCH; + spin_unlock(&inode->i_lock); + + ihold(inode); + isw->inode = inode; + + /* + * In addition to synchronizing among switchers, I_WB_SWITCH tells + * the RCU protected stat update paths to grab the mapping's + * tree_lock so that stat transfer can synchronize against them. + * Let's continue after I_WB_SWITCH is guaranteed to be visible. + */ + call_rcu(&isw->rcu_head, inode_switch_wb_rcu_fn); + return; + +out_free: + if (isw->new_wb) + wb_put(isw->new_wb); + kfree(isw); +} + /** * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it * @wbc: writeback_control of interest @@ -420,12 +529,8 @@ void wbc_detach_inode(struct writeback_control *wbc) * is okay. The main goal is avoiding keeping an inode on * the wrong wb for an extended period of time. */ - if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) { - /* switch */ - max_id = 0; - avg_time = 0; - history = 0; - } + if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) + inode_switch_wb(inode, max_id); } /* diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 3cbbec3..040be1a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -333,6 +333,49 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return inode->i_wb; } +/** + * inode_wb_stat_unlocked_begin - begin wb stat update transaction + * @inode: target inode + * @lockedp: temp bool output param, to be passed to the end function + * + * The caller wants to update stats of the wb associated with @inode but + * isn't holding inode->i_lock, mapping->tree_lock, or wb->list_lock. This + * function determines the wb to use and ensures that the stat update is + * properly synchronized against wb switching. + * + * The caller must call inode_wb_stat_unlocked_end() with *@lockdep after + * updating the stats and can't sleep during transaction. IRQ may or may + * not be disabled on return. + */ +static inline struct bdi_writeback * +inode_wb_stat_unlocked_begin(struct inode *inode, bool *lockedp) +{ + rcu_read_lock(); + + /* + * Paired with store_release in inode_switch_wb_work_fn() and + * ensures that we see the new wb if we see cleared I_WB_SWITCH. + */ + *lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; + + if (unlikely(*lockedp)) + spin_lock_irq(&inode->i_mapping->tree_lock); + return inode_to_wb(inode); +} + +/** + * inode_wb_stat_unlocked_end - end wb stat update transaction + * @inode: target inode + * @locked: *@lockedp from inode_wb_stat_unlocked_begin() + */ +static inline void inode_wb_stat_unlocked_end(struct inode *inode, bool locked) +{ + if (unlikely(locked)) + spin_unlock_irq(&inode->i_mapping->tree_lock); + + rcu_read_unlock(); +} + struct wb_iter { int start_blkcg_id; struct radix_tree_iter tree_iter; @@ -421,6 +464,16 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return &inode_to_bdi(inode)->wb; } +static inline struct bdi_writeback * +inode_wb_stat_unlocked_begin(struct inode *inode, bool *lockedp) +{ + return inode_to_wb(inode); +} + +static inline void inode_wb_stat_unlocked_end(struct inode *inode, bool locked) +{ +} + static inline void wb_memcg_offline(struct mem_cgroup *memcg) { } diff --git a/include/linux/fs.h b/include/linux/fs.h index 6c3ea8a..51221cb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1780,6 +1780,11 @@ struct super_operations { * * I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit(). * + * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to + * synchronize competing switching instances and to tell + * wb stat updates to grab mapping->tree_lock. See + * inode_switch_wb_work_fn() for details. + * * Q: What is the difference between I_WILL_FREE and I_FREEING? */ #define I_DIRTY_SYNC (1 << 0) @@ -1799,6 +1804,7 @@ struct super_operations { #define I_DIRTY_TIME (1 << 11) #define __I_DIRTY_TIME_EXPIRED 12 #define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED) +#define I_WB_SWITCH (1 << 13) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 88e7553..f037ea8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2474,11 +2474,15 @@ void account_page_redirty(struct page *page) struct address_space *mapping = page->mapping; if (mapping && mapping_cap_account_dirty(mapping)) { - struct bdi_writeback *wb = inode_to_wb(mapping->host); + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + bool locked; + wb = inode_wb_stat_unlocked_begin(inode, &locked); current->nr_dirtied--; dec_zone_page_state(page, NR_DIRTIED); dec_wb_stat(wb, WB_DIRTIED); + inode_wb_stat_unlocked_end(inode, locked); } } EXPORT_SYMBOL(account_page_redirty); @@ -2579,12 +2583,16 @@ EXPORT_SYMBOL(set_page_dirty_lock); int clear_page_dirty_for_io(struct page *page) { struct address_space *mapping = page_mapping(page); - struct mem_cgroup *memcg; int ret = 0; BUG_ON(!PageLocked(page)); if (mapping && mapping_cap_account_dirty(mapping)) { + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + struct mem_cgroup *memcg; + bool locked; + /* * Yes, Virginia, this is indeed insane. * @@ -2620,14 +2628,16 @@ int clear_page_dirty_for_io(struct page *page) * always locked coming in here, so we get the desired * exclusion. */ + wb = inode_wb_stat_unlocked_begin(inode, &locked); memcg = mem_cgroup_begin_page_stat(page); if (TestClearPageDirty(page)) { mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); - dec_wb_stat(inode_to_wb(mapping->host), WB_RECLAIMABLE); + dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; } mem_cgroup_end_page_stat(memcg); + inode_wb_stat_unlocked_end(inode, locked); return ret; } return TestClearPageDirty(page); diff --git a/mm/truncate.c b/mm/truncate.c index 9d40cd4..9604b01 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -111,12 +111,14 @@ void cancel_dirty_page(struct page *page, unsigned int account_size) struct address_space *mapping = page->mapping; if (mapping && mapping_cap_account_dirty(mapping)) { + struct inode *inode = mapping->host; + struct bdi_writeback *wb; struct mem_cgroup *memcg; + bool locked; + wb = inode_wb_stat_unlocked_begin(inode, &locked); memcg = mem_cgroup_begin_page_stat(page); if (TestClearPageDirty(page)) { - struct bdi_writeback *wb = inode_to_wb(mapping->host); - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); dec_wb_stat(wb, WB_RECLAIMABLE); @@ -124,6 +126,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size) task_io_account_cancelled_write(account_size); } mem_cgroup_end_page_stat(memcg); + inode_wb_stat_unlocked_end(inode, locked); } else { ClearPageDirty(page); } -- 2.1.0 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html