When the system is under its background dirty memory threshold but a cgroup is over its own background dirty memory threshold, write back only the inodes associated with the over-limit cgroup(s). In addition to checking if the system dirty memory usage is over the system background threshold, over_bground_thresh() also checks if any cgroups are over their respective background dirty memory thresholds. The writeback_control.for_cgroup field is set to distinguish between a system-wide overage and a memcg overage. If performing cgroup writeback, move_expired_inodes() skips inodes that do not contribute dirty pages to the cgroup being written back. After writing some pages, wb_writeback() will call mem_cgroup_writeback_done() to update the set of memcgs that are over their background limits. Signed-off-by: Greg Thelen <gthelen@xxxxxxxxxx> --- fs/fs-writeback.c | 31 +++++++++++++++++++++++-------- 1 files changed, 23 insertions(+), 8 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 0174fcf..b01bb2a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -256,14 +256,17 @@ static void move_expired_inodes(struct list_head *delaying_queue, LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; - struct inode *inode; + struct inode *inode, *tmp_inode; int do_sb_sort = 0; - while (!list_empty(delaying_queue)) { - inode = wb_inode(delaying_queue->prev); + list_for_each_entry_safe_reverse(inode, tmp_inode, delaying_queue, + i_wb_list) { if (wbc->older_than_this && inode_dirtied_after(inode, *wbc->older_than_this)) break; + if (wbc->for_cgroup && + !should_writeback_mem_cgroup_inode(inode, wbc)) + continue; if (sb && sb != inode->i_sb) do_sb_sort = 1; sb = inode->i_sb; @@ -614,14 +617,21 @@ void writeback_inodes_wb(struct bdi_writeback *wb, */ #define MAX_WRITEBACK_PAGES 1024 -static inline bool over_bground_thresh(void) +static inline bool over_bground_thresh(struct bdi_writeback *wb, + struct writeback_control *wbc) { unsigned long background_thresh, dirty_thresh; 
global_dirty_limits(&background_thresh, &dirty_thresh); - return (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) > background_thresh); + if (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) > background_thresh) { + wbc->for_cgroup = 0; + return true; + } + + wbc->for_cgroup = 1; + return mem_cgroups_over_bground_dirty_thresh(); } /* @@ -700,7 +710,7 @@ static long wb_writeback(struct bdi_writeback *wb, * For background writeout, stop when we are below the * background dirty threshold */ - if (work->for_background && !over_bground_thresh()) + if (work->for_background && !over_bground_thresh(wb, &wbc)) break; if (work->for_kupdate || work->for_background) { @@ -729,6 +739,9 @@ retry: work->nr_pages -= write_chunk - wbc.nr_to_write; wrote += write_chunk - wbc.nr_to_write; + if (write_chunk - wbc.nr_to_write > 0) + mem_cgroup_writeback_done(); + /* * Did we write something? Try for more * @@ -809,7 +822,9 @@ static unsigned long get_nr_dirty_pages(void) static long wb_check_background_flush(struct bdi_writeback *wb) { - if (over_bground_thresh()) { + struct writeback_control wbc; + + if (over_bground_thresh(wb, &wbc)) { struct wb_writeback_work work = { .nr_pages = LONG_MAX, -- 1.7.3.1 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html