When the system is under background dirty memory threshold but some cgroups are over their background dirty memory thresholds, then only writeback inodes associated with the over-limit cgroups. In addition to checking if the system dirty memory usage is over the system background threshold, over_bground_thresh() now checks if any cgroups are over their respective background dirty memory thresholds. If over-limit cgroups are found, then the new wb_writeback_work.for_cgroup field is set to distinguish between system and memcg overages. The new wb_writeback_work.shared_inodes field is also set. Inodes written by multiple cgroup are marked owned by I_MEMCG_SHARED rather than a particular cgroup. Such shared inodes cannot easily be attributed to a cgroup, so per-cgroup writeback (futures version of wakeup_flusher_threads and balance_dirty_pages) performs suboptimally in the presence of shared inodes. Therefore, write shared inodes when performing cgroup background writeback. If performing cgroup writeback, move_expired_inodes() skips inodes that do not contribute dirty pages to the cgroup being written back. After writing some pages, wb_writeback() will call mem_cgroup_writeback_done() to update the set of over-bg-limits memcg. This change also makes wakeup_flusher_threads() memcg aware so that per-cgroup try_to_free_pages() is able to operate more efficiently without having to write pages of foreign containers. This change adds a mem_cgroup parameter to wakeup_flusher_threads() to allow callers, especially try_to_free_pages() and foreground writeback from balance_dirty_pages(), to specify a particular cgroup to write inodes from. Signed-off-by: Greg Thelen <gthelen@xxxxxxxxxx> --- Changelog since v8: - Added optional memcg parameter to __bdi_start_writeback(), bdi_start_writeback(), wakeup_flusher_threads(), writeback_inodes_wb(). - move_expired_inodes() now uses pass in struct wb_writeback_work instead of struct writeback_control. - Added comments to over_bground_thresh(). fs/buffer.c | 2 +- fs/fs-writeback.c | 96 +++++++++++++++++++++++++++++++++----------- fs/sync.c | 2 +- include/linux/writeback.h | 6 ++- mm/backing-dev.c | 3 +- mm/page-writeback.c | 3 +- mm/vmscan.c | 3 +- 7 files changed, 84 insertions(+), 31 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index dd0220b..da1fb23 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -293,7 +293,7 @@ static void free_more_memory(void) struct zone *zone; int nid; - wakeup_flusher_threads(1024); + wakeup_flusher_threads(1024, NULL); yield(); for_each_online_node(nid) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e91fb82..ba55336 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -38,10 +38,14 @@ struct wb_writeback_work { struct super_block *sb; unsigned long *older_than_this; enum writeback_sync_modes sync_mode; + unsigned short memcg_id; /* If non-zero, then writeback specified + * cgroup. */ unsigned int tagged_writepages:1; unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; + unsigned int for_cgroup:1; /* cgroup writeback */ + unsigned int shared_inodes:1; /* write inodes spanning cgroups */ struct list_head list; /* pending work list */ struct completion *done; /* set if the caller waits */ @@ -114,9 +118,12 @@ static void bdi_queue_work(struct backing_dev_info *bdi, spin_unlock_bh(&bdi->wb_lock); } +/* + * @memcg is optional. If set, then limit writeback to the specified cgroup. + */ static void __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - bool range_cyclic) + bool range_cyclic, struct mem_cgroup *memcg) { struct wb_writeback_work *work; @@ -136,6 +143,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, work->sync_mode = WB_SYNC_NONE; work->nr_pages = nr_pages; work->range_cyclic = range_cyclic; + work->memcg_id = memcg ? css_id(mem_cgroup_css(memcg)) : 0; + work->for_cgroup = memcg != NULL; bdi_queue_work(bdi, work); } @@ -153,7 +162,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, */ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) { - __bdi_start_writeback(bdi, nr_pages, true); + __bdi_start_writeback(bdi, nr_pages, true, NULL); } /** @@ -257,15 +266,20 @@ static int move_expired_inodes(struct list_head *delaying_queue, LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; - struct inode *inode; + struct inode *inode, *tmp_inode; int do_sb_sort = 0; int moved = 0; - while (!list_empty(delaying_queue)) { - inode = wb_inode(delaying_queue->prev); + list_for_each_entry_safe_reverse(inode, tmp_inode, delaying_queue, + i_wb_list) { if (work->older_than_this && inode_dirtied_after(inode, *work->older_than_this)) break; + if (work->for_cgroup && + !should_writeback_mem_cgroup_inode(inode, + work->memcg_id, + work->shared_inodes)) + continue; if (sb && sb != inode->i_sb) do_sb_sort = 1; sb = inode->i_sb; @@ -643,31 +657,63 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, return wrote; } -long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) +/* + * @memcg is optional. If set, then limit writeback to the specified cgroup. + * If @shared_inodes is set then writeback inodes shared by several memcg. + */ +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, + struct mem_cgroup *memcg, bool shared_inodes) { struct wb_writeback_work work = { .nr_pages = nr_pages, .sync_mode = WB_SYNC_NONE, + .memcg_id = memcg ? css_id(mem_cgroup_css(memcg)) : 0, + .for_cgroup = (memcg != NULL) || shared_inodes, + .shared_inodes = shared_inodes, .range_cyclic = 1, }; spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) - queue_io(wb, NULL); + queue_io(wb, &work); __writeback_inodes_wb(wb, &work); spin_unlock(&wb->list_lock); return nr_pages - work.nr_pages; } -static inline bool over_bground_thresh(void) +static inline bool over_bground_thresh(struct wb_writeback_work *work) { unsigned long background_thresh, dirty_thresh; global_dirty_limits(&background_thresh, &dirty_thresh); - return (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) > background_thresh); + if (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) > background_thresh) { + work->for_cgroup = 0; + return true; + } + + /* + * System dirty memory is below system background limit. Check if any + * memcg are over memcg background limit. + */ + if (mem_cgroups_over_bground_dirty_thresh()) { + work->for_cgroup = 1; + + /* + * Set shared_inodes so that background flusher writes shared + * inodes in addition to inodes in over-limit memcg. Such + * shared inodes should be rarer than inodes written by a single + * memcg. Shared inodes limit the ability to map from memcg to + * inode in wakeup_flusher_threads() and writeback_inodes_wb(). + * So the quicker such shared inodes are written, the better. + */ + work->shared_inodes = 1; + return true; + } + + return false; } /* @@ -729,7 +775,7 @@ static long wb_writeback(struct bdi_writeback *wb, * For background writeout, stop when we are below the * background dirty threshold */ - if (work->for_background && !over_bground_thresh()) + if (work->for_background && !over_bground_thresh(work)) break; if (work->for_kupdate) { @@ -749,6 +795,9 @@ static long wb_writeback(struct bdi_writeback *wb, wb_update_bandwidth(wb, wb_start); + if (progress) + mem_cgroup_writeback_done(); + /* * Did we write something? Try for more * @@ -813,17 +862,15 @@ static unsigned long get_nr_dirty_pages(void) static long wb_check_background_flush(struct bdi_writeback *wb) { - if (over_bground_thresh()) { - - struct wb_writeback_work work = { - .nr_pages = LONG_MAX, - .sync_mode = WB_SYNC_NONE, - .for_background = 1, - .range_cyclic = 1, - }; + struct wb_writeback_work work = { + .nr_pages = LONG_MAX, + .sync_mode = WB_SYNC_NONE, + .for_background = 1, + .range_cyclic = 1, + }; + if (over_bground_thresh(&work)) return wb_writeback(wb, &work); - } return 0; } @@ -968,10 +1015,11 @@ int bdi_writeback_thread(void *data) /* - * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back - * the whole world. + * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back the + * whole world. If 'memcg' is non-NULL, then limit attempt to only write pages + * from the specified cgroup. */ -void wakeup_flusher_threads(long nr_pages) +void wakeup_flusher_threads(long nr_pages, struct mem_cgroup *memcg) { struct backing_dev_info *bdi; @@ -984,7 +1032,7 @@ void wakeup_flusher_threads(long nr_pages) list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { if (!bdi_has_dirty_io(bdi)) continue; - __bdi_start_writeback(bdi, nr_pages, false); + __bdi_start_writeback(bdi, nr_pages, false, memcg); } rcu_read_unlock(); } diff --git a/fs/sync.c b/fs/sync.c index c98a747..7c1ba55 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -98,7 +98,7 @@ static void sync_filesystems(int wait) */ SYSCALL_DEFINE0(sync) { - wakeup_flusher_threads(0); + wakeup_flusher_threads(0, NULL); sync_filesystems(0); sync_filesystems(1); if (unlikely(laptop_mode)) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d12d070..e6790e8 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -40,6 +40,7 @@ #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) struct backing_dev_info; +struct mem_cgroup; /* * fs/fs-writeback.c @@ -85,9 +86,10 @@ void writeback_inodes_sb_nr(struct super_block *, unsigned long nr); int writeback_inodes_sb_if_idle(struct super_block *); int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr); void sync_inodes_sb(struct super_block *); -long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages); +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, + struct mem_cgroup *memcg, bool shared_inodes); long wb_do_writeback(struct bdi_writeback *wb, int force_wait); -void wakeup_flusher_threads(long nr_pages); +void wakeup_flusher_threads(long nr_pages, struct mem_cgroup *memcg); /* writeback.h requires fs.h; it, too, is not included from here. */ static inline void wait_on_inode(struct inode *inode) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index d6edf8d..60d101d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -456,7 +456,8 @@ static int bdi_forker_thread(void *ptr) * the bdi from the thread. Hopefully 1024 is * large enough for efficient IO. */ - writeback_inodes_wb(&bdi->wb, 1024); + writeback_inodes_wb(&bdi->wb, 1024, NULL, + false); } else { /* * The spinlock makes sure we do not lose diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 12b3900..64de98c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -736,7 +736,8 @@ static void balance_dirty_pages(struct address_space *mapping, trace_balance_dirty_start(bdi); if (bdi_nr_reclaimable > task_bdi_thresh) { pages_written += writeback_inodes_wb(&bdi->wb, - write_chunk); + write_chunk, + NULL, false); trace_balance_dirty_written(bdi, pages_written); if (pages_written >= write_chunk) break; /* We've done our duty */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 3153729..fb0ae99 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2223,7 +2223,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, */ writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; if (total_scanned > writeback_threshold) { - wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); + wakeup_flusher_threads(laptop_mode ? 0 : total_scanned, + sc->mem_cgroup); sc->may_writepage = 1; } -- 1.7.3.1 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html