Disclaimer: this code has a lot of rough edges and assumes simple storage. It mainly serves as a proof of concept and focuses on getting the basic control algorithm working. It generally works for pure/mixed buffered/direct writes, except that it still assumes that the direct writes, if there are any, are aggressive ones. The exploration stops here since I see no obvious way for this scheme to support hierarchical cgroups.

Test results can be found at https://github.com/fengguang/io-controller-tests/blob/master/log/

The key ideas and comments can be found in two functions in the patch:
- cfq_scale_slice()
- blkcg_update_dirty_ratelimit()
The other changes are mainly supporting bits.

It adapts the existing interfaces
- blkio.throttle.write_bps_device
- blkio.weight
from the semantics "for direct IO" to "for direct+buffered IO" (it now handles write IO only, but it should be trivial to cover reads). It aims for a 1:1 split of direct:buffered writes inside the cgroup, which essentially implements intra-cgroup proportional weights.

Signed-off-by: Fengguang Wu <fengguang.wu@xxxxxxxxx> --- block/blk-cgroup.c | 19 ++- block/blk-throttle.c | 2 +- block/cfq-iosched.c | 42 ++++- block/cfq.h | 2 +- fs/direct-io.c | 9 + include/linux/backing-dev.h | 13 ++ {block => include/linux}/blk-cgroup.h | 76 +++++++- include/trace/events/writeback.h | 88 ++++++++- mm/backing-dev.c | 2 + mm/filemap.c | 1 + mm/page-writeback.c | 369 +++++++++++++++++++++++++++++++-- 11 files changed, 588 insertions(+), 35 deletions(-) rename {block => include/linux}/blk-cgroup.h (87%) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 126c341..56cb330 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -17,7 +17,7 @@ #include <linux/err.h> #include <linux/blkdev.h> #include <linux/slab.h> -#include "blk-cgroup.h" +#include <linux/blk-cgroup.h> #include <linux/genhd.h> #define MAX_KEY_LEN 100 @@ -25,7 +25,11 @@ static DEFINE_SPINLOCK(blkio_list_lock); static LIST_HEAD(blkio_list); -struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; +struct blkio_cgroup blkio_root_cgroup = +{ + .weight = 2*BLKIO_WEIGHT_DEFAULT, + .dio_weight = 2*BLKIO_WEIGHT_DEFAULT, +}; EXPORT_SYMBOL_GPL(blkio_root_cgroup); /* for encoding cft->private value on file */ @@ -1302,6 +1306,7 @@ static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val) spin_lock(&blkio_list_lock); spin_lock_irq(&blkcg->lock); blkcg->weight = (unsigned int)val; + blkcg->dio_weight = (unsigned int)val; hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { pn = blkio_policy_search_node(blkcg, blkg->dev, @@ -1564,6 +1569,8 @@ static void blkiocg_destroy(struct cgroup *cgroup) free_css_id(&blkio_subsys, &blkcg->css); rcu_read_unlock(); + percpu_counter_destroy(&blkcg->nr_dirtied); + percpu_counter_destroy(&blkcg->nr_direct_write); if (blkcg != &blkio_root_cgroup) kfree(blkcg); } @@ -1583,11 +1590,19 @@ static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup) return ERR_PTR(-ENOMEM); blkcg->weight = 
BLKIO_WEIGHT_DEFAULT; + blkcg->dio_weight = BLKIO_WEIGHT_DEFAULT; + blkcg->dirty_ratelimit = (100 << (20 - PAGE_SHIFT)); + blkcg->balanced_dirty_ratelimit = (100 << (20 - PAGE_SHIFT)); + blkcg->recent_dirtied_error = 1 << BLKCG_DIRTY_ERROR_SHIFT; done: spin_lock_init(&blkcg->lock); INIT_HLIST_HEAD(&blkcg->blkg_list); INIT_LIST_HEAD(&blkcg->policy_list); + + percpu_counter_init(&blkcg->nr_dirtied, 0); + percpu_counter_init(&blkcg->nr_direct_write, 0); + return &blkcg->css; } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index f2ddb94..f004ccc 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -9,7 +9,7 @@ #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/blktrace_api.h> -#include "blk-cgroup.h" +#include <linux/blk-cgroup.h> #include "blk.h" /* Max dispatch from a group in 1 round */ diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3c38536..759c57a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -541,12 +541,47 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); } -static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) +extern unsigned int total_async_weight; + +static inline u64 cfq_scale_slice(unsigned long delta, + struct cfq_group *cfqg, + struct cfq_rb_root *st, + bool sync) { u64 d = delta << CFQ_SERVICE_SHIFT; + unsigned int weight = cfqg->weight; + +#ifdef CONFIG_BLK_CGROUP + struct blkio_cgroup *blkcg; + + if (!sync && cfqg->blkg.blkcg_id == 1) + /* + * weight for the flusher; assume no other IO in the root + * cgroup for now + */ + weight = max_t(int, BLKIO_WEIGHT_MIN, total_async_weight); + else { + rcu_read_lock(); + blkcg = task_blkio_cgroup(current); + if (time_is_after_eq_jiffies(blkcg->bw_time_stamp + HZ)) + /* + * weight for the direct IOs in this cgroup; the other + * weight will be stealed into total_async_weight for + * the async IOs, so that the flusher get proper disk + * time 
to do async writers for duty of this cgroup. + */ + weight = blkcg->dio_weight; + rcu_read_unlock(); + } + + trace_printk("blkcg_id=%d charge=%lu %s_weight=%u weight=%u\n", + (int)cfqg->blkg.blkcg_id, delta, + sync ? "dio" : "async", + weight, cfqg->weight); +#endif d = d * BLKIO_WEIGHT_DEFAULT; - do_div(d, cfqg->weight); + do_div(d, weight); return d; } @@ -989,7 +1024,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, /* Can't update vdisktime while group is on service tree */ cfq_group_service_tree_del(st, cfqg); - cfqg->vdisktime += cfq_scale_slice(charge, cfqg); + cfqg->vdisktime += cfq_scale_slice(charge, cfqg, st, + cfq_cfqq_sync(cfqq)); /* If a new weight was requested, update now, off tree */ cfq_group_service_tree_add(st, cfqg); diff --git a/block/cfq.h b/block/cfq.h index 2a15592..e322f33 100644 --- a/block/cfq.h +++ b/block/cfq.h @@ -1,6 +1,6 @@ #ifndef _CFQ_H #define _CFQ_H -#include "blk-cgroup.h" +#include <linux/blk-cgroup.h> #ifdef CONFIG_CFQ_GROUP_IOSCHED static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg, diff --git a/fs/direct-io.c b/fs/direct-io.c index f4aadd1..e85e4da 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -37,6 +37,7 @@ #include <linux/uio.h> #include <linux/atomic.h> #include <linux/prefetch.h> +#include <linux/blk-cgroup.h> /* * How many user pages to map in one call to get_user_pages(). 
This determines @@ -766,10 +767,18 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, int ret = 0; if (dio->rw & WRITE) { +#ifdef CONFIG_BLK_DEV_THROTTLING + struct blkio_cgroup *blkcg = task_blkio_cgroup(current); + if (blkcg) + __percpu_counter_add(&blkcg->nr_direct_write, len, + BDI_STAT_BATCH); +#endif /* * Read accounting is performed in submit_bio() */ task_io_account_write(len); + add_bdi_stat(dio->inode->i_mapping->backing_dev_info, + BDI_DIRECT_WRITE, len); } /* diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index b1038bd..55bb537 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -42,6 +42,7 @@ enum bdi_stat_item { BDI_WRITEBACK, BDI_DIRTIED, BDI_WRITTEN, + BDI_DIRECT_WRITE, NR_BDI_STAT_ITEMS }; @@ -79,6 +80,8 @@ struct backing_dev_info { unsigned long written_stamp; /* pages written at bw_time_stamp */ unsigned long write_bandwidth; /* the estimated write bandwidth */ unsigned long avg_write_bandwidth; /* further smoothed write bw */ + unsigned long direct_write_stamp; + unsigned long direct_write_bandwidth; /* * The base dirty throttle rate, re-calculated on every 200ms. 
@@ -144,6 +147,16 @@ static inline void __add_bdi_stat(struct backing_dev_info *bdi, __percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH); } +static inline void add_bdi_stat(struct backing_dev_info *bdi, + enum bdi_stat_item item, s64 amount) +{ + unsigned long flags; + + local_irq_save(flags); + __add_bdi_stat(bdi, item, amount); + local_irq_restore(flags); +} + static inline void __inc_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item) { diff --git a/block/blk-cgroup.h b/include/linux/blk-cgroup.h similarity index 87% rename from block/blk-cgroup.h rename to include/linux/blk-cgroup.h index 6f3ace7..87082cc 100644 --- a/block/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -21,6 +21,10 @@ enum blkio_policy_id { BLKIO_POLICY_THROTL, /* Throttling */ }; +#define BLKIO_WEIGHT_MIN 10 +#define BLKIO_WEIGHT_MAX 1000 +#define BLKIO_WEIGHT_DEFAULT 500 + /* Max limits for throttle policy */ #define THROTL_IOPS_MAX UINT_MAX @@ -111,12 +115,34 @@ enum blkcg_file_name_throtl { BLKIO_THROTL_io_serviced, }; +/* keep a history of ~50s (256 * 200ms) */ +#define BLKCG_RECENT_DIRTIED_BUCKETS 256 + struct blkio_cgroup { struct cgroup_subsys_state css; unsigned int weight; + unsigned int dio_weight; spinlock_t lock; struct hlist_head blkg_list; struct list_head policy_list; /* list of blkio_policy_node */ + struct percpu_counter nr_dirtied; + struct percpu_counter nr_direct_write; + unsigned long bw_time_stamp; + unsigned long dirtied_stamp; + unsigned long direct_write_stamp; + unsigned long dio_rate; + unsigned long dirty_rate; + unsigned long avg_dirty_rate; + unsigned long dirty_ratelimit; + unsigned long balanced_dirty_ratelimit; + + /* optional feature: long term dirty error cancellation */ + int recent_dirtied_error; + int recent_dirtied_index; + int recent_dirtied_sum; + int recent_dirtied_target_sum; + int recent_dirtied[BLKCG_RECENT_DIRTIED_BUCKETS]; + int recent_dirtied_target[BLKCG_RECENT_DIRTIED_BUCKETS]; }; struct blkio_group_stats { @@ 
-208,6 +234,29 @@ extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev); +extern struct blkio_cgroup blkio_root_cgroup; + +static inline bool blkcg_is_root(struct blkio_cgroup *blkcg) +{ + return blkcg == &blkio_root_cgroup; +} +static inline unsigned int blkcg_weight(struct blkio_cgroup *blkcg) +{ + return blkcg->weight; +} +static inline unsigned long blkcg_dirty_ratelimit(struct blkio_cgroup *blkcg) +{ + return blkcg->dirty_ratelimit; +} + +#define BLKCG_DIRTY_ERROR_SHIFT 10 +static inline unsigned long blkcg_dirty_position(struct blkio_cgroup *blkcg, + unsigned long pos_ratio) +{ + return pos_ratio * blkcg->recent_dirtied_error >> + BLKCG_DIRTY_ERROR_SHIFT; +} + typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); typedef void (blkio_update_group_weight_fn) (void *key, @@ -247,6 +296,9 @@ static inline char *blkg_path(struct blkio_group *blkg) #else +struct blkio_cgroup { +}; + struct blkio_group { }; @@ -258,11 +310,26 @@ static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } -#endif +static inline bool blkcg_is_root(struct blkio_cgroup *blkcg) +{ + return true; +} +static inline unsigned int blkcg_weight(struct blkio_cgroup *blkcg) +{ + return BLKIO_WEIGHT_DEFAULT; +} +static inline unsigned long blkcg_dirty_ratelimit(struct blkio_cgroup *blkcg) +{ + return 0; +} -#define BLKIO_WEIGHT_MIN 10 -#define BLKIO_WEIGHT_MAX 1000 -#define BLKIO_WEIGHT_DEFAULT 500 +static inline unsigned long blkcg_dirty_position(struct blkio_cgroup *blkcg, + unsigned long pos_ratio) +{ + return pos_ratio; +} + +#endif #ifdef CONFIG_DEBUG_BLK_CGROUP void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg); @@ -304,7 +371,6 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} #endif #if defined(CONFIG_BLK_CGROUP) || 
defined(CONFIG_BLK_CGROUP_MODULE) -extern struct blkio_cgroup blkio_root_cgroup; extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk); extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 7b81887..f04508c 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -248,6 +248,91 @@ TRACE_EVENT(global_dirty_state, #define KBps(x) ((x) << (PAGE_SHIFT - 10)) +TRACE_EVENT(blkcg_dirty_ratelimit, + + TP_PROTO(struct backing_dev_info *bdi, + unsigned long pps, + unsigned long dirty_rate, + unsigned long avg_dirty_rate, + unsigned long task_ratelimit, + unsigned long balanced_dirty_ratelimit, + unsigned long dio_rate, + unsigned long avg_dio_rate, + unsigned int dio_weight, + unsigned int async_weight, + unsigned int total_async_weight, + unsigned int recent_dirtied_error, + unsigned int blkcg_id + ), + + TP_ARGS(bdi, pps, dirty_rate, avg_dirty_rate, + task_ratelimit, balanced_dirty_ratelimit, + dio_rate, avg_dio_rate, dio_weight, async_weight, + total_async_weight, recent_dirtied_error, blkcg_id), + + TP_STRUCT__entry( + __array(char, bdi, 32) + __field(unsigned long, kbps) + __field(unsigned long, dirty_rate) + __field(unsigned long, avg_dirty_rate) + __field(unsigned long, writeout_rate) + __field(unsigned long, dirty_ratelimit) + __field(unsigned long, task_ratelimit) + __field(unsigned long, balanced_dirty_ratelimit) + __field(unsigned long, dio_rate) + __field(unsigned long, avg_dio_rate) + __field(unsigned int, dio_weight) + __field(unsigned int, async_weight) + __field(unsigned int, total_async_weight) + __field(unsigned int, recent_dirtied_error) + __field(unsigned int, blkcg_id) + ), + + TP_fast_assign( + strlcpy(__entry->bdi, dev_name(bdi->dev), 32); + __entry->kbps = KBps(pps); + __entry->dirty_rate = KBps(dirty_rate); + __entry->avg_dirty_rate = 
KBps(avg_dirty_rate); + __entry->writeout_rate = KBps(bdi->avg_write_bandwidth); + __entry->task_ratelimit = KBps(task_ratelimit); + __entry->dirty_ratelimit = KBps(bdi->dirty_ratelimit); + __entry->balanced_dirty_ratelimit = + KBps(balanced_dirty_ratelimit); + __entry->dio_rate = KBps(dio_rate); + __entry->avg_dio_rate = KBps(avg_dio_rate); + __entry->dio_weight = dio_weight; + __entry->async_weight = async_weight; + __entry->total_async_weight = total_async_weight; + __entry->recent_dirtied_error = recent_dirtied_error; + __entry->blkcg_id = blkcg_id; + ), + + TP_printk("bdi %s: kbps=%lu " + "dirty_rate=%lu avg_dirty_rate=%lu bdi_writeout_rate=%lu " + "bdi_dirty_ratelimit=%lu " + "task_ratelimit=%lu " + "balanced_dirty_ratelimit=%lu " + "dio_rate=%lu avg_dio_rate=%lu " + "dio_weight=%u async_weight=%u total_async_weight=%u " + "dirty_error=%u blkcg_id=%u", + __entry->bdi, + __entry->kbps, + __entry->dirty_rate, + __entry->avg_dirty_rate, + __entry->writeout_rate, + __entry->dirty_ratelimit, + __entry->task_ratelimit, + __entry->balanced_dirty_ratelimit, + __entry->dio_rate, + __entry->avg_dio_rate, + __entry->dio_weight, + __entry->async_weight, + __entry->total_async_weight, + __entry->recent_dirtied_error, + __entry->blkcg_id + ) +); + TRACE_EVENT(bdi_dirty_ratelimit, TP_PROTO(struct backing_dev_info *bdi, @@ -269,7 +354,8 @@ TRACE_EVENT(bdi_dirty_ratelimit, TP_fast_assign( strlcpy(__entry->bdi, dev_name(bdi->dev), 32); __entry->write_bw = KBps(bdi->write_bandwidth); - __entry->avg_write_bw = KBps(bdi->avg_write_bandwidth); + __entry->avg_write_bw = KBps(bdi->avg_write_bandwidth + + bdi->direct_write_bandwidth); __entry->dirty_rate = KBps(dirty_rate); __entry->dirty_ratelimit = KBps(bdi->dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index dd8e2aa..b623358 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -99,6 +99,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) 
"BackgroundThresh: %10lu kB\n" "BdiDirtied: %10lu kB\n" "BdiWritten: %10lu kB\n" + "BdiDirectWrite: %10lu kB\n" "BdiWriteBandwidth: %10lu kBps\n" "b_dirty: %10lu\n" "b_io: %10lu\n" @@ -112,6 +113,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) K(background_thresh), (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)), (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), + (unsigned long) K(bdi_stat(bdi, BDI_DIRECT_WRITE)), (unsigned long) K(bdi->write_bandwidth), nr_dirty, nr_io, diff --git a/mm/filemap.c b/mm/filemap.c index 79c4b2b..a945b71 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2294,6 +2294,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, } written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); + inc_bdi_stat(mapping->backing_dev_info, BDI_DIRECT_WRITE); /* * Finally, try again to invalidate clean pages which might have been diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 26adea8..f02c1bf 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -34,6 +34,7 @@ #include <linux/syscalls.h> #include <linux/buffer_head.h> /* __set_page_dirty_buffers */ #include <linux/pagevec.h> +#include <linux/blk-cgroup.h> #include <trace/events/writeback.h> /* @@ -736,13 +737,10 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, return pos_ratio; } -static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, - unsigned long elapsed, - unsigned long written) +static unsigned long calc_bandwidth(unsigned long write_bandwidth, + unsigned long pages, unsigned long elapsed) { const unsigned long period = roundup_pow_of_two(3 * HZ); - unsigned long avg = bdi->avg_write_bandwidth; - unsigned long old = bdi->write_bandwidth; u64 bw; /* @@ -752,26 +750,36 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, * write_bandwidth = --------------------------------------------------- * period */ - bw = written - bdi->written_stamp; - bw *= HZ; + bw = pages * HZ; if 
(unlikely(elapsed > period)) { do_div(bw, elapsed); - avg = bw; - goto out; + return bw; } - bw += (u64)bdi->write_bandwidth * (period - elapsed); + bw += (u64)write_bandwidth * (period - elapsed); bw >>= ilog2(period); + return bw; +} + +static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, + unsigned long elapsed, + unsigned long written) +{ + unsigned long avg = bdi->avg_write_bandwidth; + unsigned long old = bdi->write_bandwidth; + unsigned long bw; + + bw = calc_bandwidth(old, written - bdi->written_stamp, elapsed); + /* * one more level of smoothing, for filtering out sudden spikes */ - if (avg > old && old >= (unsigned long)bw) + if (avg > old && old >= bw) avg -= (avg - old) >> 3; - if (avg < old && old <= (unsigned long)bw) + if (avg < old && old <= bw) avg += (old - avg) >> 3; -out: bdi->write_bandwidth = bw; bdi->avg_write_bandwidth = avg; } @@ -864,6 +872,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, * when dirty pages are truncated by userspace or re-dirtied by FS. */ dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed; + dirty_rate += bdi->direct_write_bandwidth; pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty); @@ -904,13 +913,10 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, * the dirty count meet the setpoint, but also where the slope of * pos_ratio is most flat and hence task_ratelimit is least fluctuated. 
*/ - balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw, - dirty_rate | 1); - /* - * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw - */ - if (unlikely(balanced_dirty_ratelimit > write_bw)) - balanced_dirty_ratelimit = write_bw; + balanced_dirty_ratelimit = + div_u64((u64)task_ratelimit * write_bw + + (u64)dirty_ratelimit * bdi->direct_write_bandwidth, + dirty_rate | 1); /* * We could safely do this and return immediately: @@ -993,6 +999,7 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi, unsigned long elapsed = now - bdi->bw_time_stamp; unsigned long dirtied; unsigned long written; + unsigned long direct_written; /* * rate-limit, only update once every 200ms. @@ -1002,6 +1009,8 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi, dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]); written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); + direct_written = percpu_counter_read(&bdi->bdi_stat[BDI_DIRECT_WRITE]) + >> PAGE_CACHE_SHIFT; /* * Skip quiet periods when disk bandwidth is under-utilized. @@ -1010,17 +1019,23 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi, if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) goto snapshot; + bdi_update_write_bandwidth(bdi, elapsed, written); + bdi->direct_write_bandwidth = + calc_bandwidth(bdi->direct_write_bandwidth, + direct_written - bdi->direct_write_stamp, + elapsed); + if (thresh) { global_update_bandwidth(thresh, dirty, now); bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, dirtied, elapsed); } - bdi_update_write_bandwidth(bdi, elapsed, written); snapshot: bdi->dirtied_stamp = dirtied; bdi->written_stamp = written; + bdi->direct_write_stamp = direct_written; bdi->bw_time_stamp = now; } @@ -1151,6 +1166,299 @@ static long bdi_min_pause(struct backing_dev_info *bdi, return pages >= DIRTY_POLL_THRESH ? 
1 + t / 2 : t; } + +static DEFINE_SPINLOCK(async_weight_lock); +unsigned int total_async_weight; +static unsigned int async_weight_val[100]; +static unsigned long async_weight_timestamp[100]; + +#ifdef CONFIG_BLK_DEV_THROTTLING +/* + * a quick hack for maintaining a sum over all active blkcg's async_weight. + * + * total_async_weight = sum(blkcg->async_weight) + * + * total_async_weight will be used as cfqg weight for the flusher. + */ +static void blkcg_update_async_weight(struct blkio_cgroup *blkcg, + unsigned int async_weight) +{ + int i, j; + + spin_lock(&async_weight_lock); + i = css_id(&blkcg->css); + if (i >= 100) + i = 99; + j = -async_weight_val[i]; + async_weight_val[i] = async_weight; + async_weight_timestamp[i] = jiffies; + j += async_weight_val[i]; + total_async_weight += j; + /* + * retire async weights for the groups that went quiet. Shall also + * clear total_async_weight when no more buffered writes in the system. + */ + for (i = 0; i < 100; i++) { + if (!time_is_after_eq_jiffies(async_weight_timestamp[i] + HZ)) { + total_async_weight -= async_weight_val[i]; + async_weight_val[i] = 0; + } + } + spin_unlock(&async_weight_lock); +} + +/* optional feature: long term dirty error cancellation */ +static void blkcg_update_dirty_position(struct blkio_cgroup *blkcg, + struct backing_dev_info *bdi, + unsigned long pos_ratio, + unsigned long target, + unsigned long dirtied, + unsigned long elapsed) +{ + int i, j; + int recent_dirtied; + + target = (target * pos_ratio * elapsed >> RATELIMIT_CALC_SHIFT) / HZ; + recent_dirtied = blkcg->dirtied_stamp ? 
+ dirtied - blkcg->dirtied_stamp : 0; + i = blkcg->recent_dirtied_index; + blkcg->recent_dirtied_sum += recent_dirtied - blkcg->recent_dirtied[i]; + blkcg->recent_dirtied_target_sum += + target - blkcg->recent_dirtied_target[i]; + blkcg->recent_dirtied[i] = recent_dirtied; + blkcg->recent_dirtied_target[i] = target; + if (++i >= BLKCG_RECENT_DIRTIED_BUCKETS) + i = 0; + blkcg->recent_dirtied_index = i; + + i = blkcg->recent_dirtied_target_sum; + j = blkcg->recent_dirtied_target_sum - blkcg->recent_dirtied_sum; + j = clamp_val(j, -i/8, i/8); + blkcg->recent_dirtied_error = (1 << BLKCG_DIRTY_ERROR_SHIFT) + + (j << BLKCG_DIRTY_ERROR_SHIFT) / i; + + trace_printk("recent_dirtied=%d/%d target=%lu/%d error=%d/%d\n", + recent_dirtied, blkcg->recent_dirtied_sum, + target, blkcg->recent_dirtied_target_sum, + j, blkcg->recent_dirtied_error); +} + +static void blkcg_update_dirty_ratelimit(struct blkio_cgroup *blkcg, + struct backing_dev_info *bdi, + unsigned long write_bps, + unsigned long pos_ratio, + unsigned long dirtied, + unsigned long direct_written, + unsigned long elapsed) +{ + unsigned long async_write_bps; + unsigned long blkcg_pos_ratio; + unsigned long ratelimit; + unsigned long dirty_rate; + unsigned long balanced_dirty_rate; + unsigned long task_ratelimit; + unsigned long dio_rate; + unsigned long step; + unsigned long x; + unsigned int dio_weight; + unsigned int async_weight; + + blkcg_pos_ratio = blkcg_dirty_position(blkcg, pos_ratio); + + dirty_rate = (dirtied - blkcg->dirtied_stamp) * HZ; + dirty_rate /= elapsed; + blkcg->dirty_rate = (blkcg->dirty_rate * 7 + dirty_rate) / 8, + + dio_rate = (direct_written - blkcg->direct_write_stamp) * HZ; + dio_rate /= elapsed; + blkcg->dio_rate = (blkcg->dio_rate * 7 + dio_rate) / 8; + + /* + * write_bps will be the buffered+direct write rate limit for this + * cgroup/bdi. It's computed by the proportional weight and/or + * bandwidth throttle policies, whichever lower limit applies. 
+ * + * If replace bdi->dirty_ratelimit with parent_blkcg->dirty_ratelimit, + * it becomes a hirechichal control (may also need accounting changes). + */ + x = bdi->dirty_ratelimit * blkcg_weight(blkcg) / BLKIO_WEIGHT_DEFAULT; + if (!write_bps || write_bps > x) + write_bps = x; + + /* + * Target for 1:1 direct_IO:buffered_write split inside the cgroup. + * + * When there are both aggressive buffered and direct writers, we'll + * grant half blkcg->weight to the global cgroup that holds the + * flusher and another half for the direct IO inside the cgroup: + * + * if (both agressive buffered and direct writers) { + * total_async_weight += blkcg->weight/2; + * blkcg->dio_weight = blkcg->weight/2; + * } + * + * Otherwise: + * + * if (only aggressive buffered writers) + * total_async_weight += blkcg->weight; + * + * if (only aggressive direct writers) + * blkcg->dio_weight = blkcg->weight; + * + * When the buffered and/or direct writers have long think times and + * are self-throttled under (write_bps/2), it becomes tricky to + * allocate the weight. + * + * It's fine to set + * + * blkcg->dio_weight = blkcg->weight / 2; + * + * for a self-throttled direct writer. The extra weight simply won't be + * utilized. The weight for the flusher will be: + * + * total_async_weight += blkcg->weight * + * min(blkcg->dirty_rate, write_bps - blkcg->dio_rate) / write_bps; + * + * Unfortunately we don't know for sure whether the direct writer is + * self-throttled. So that logic is not enabled currently. + * + * Self-throttled buffered dirtiers can be reliably detected and + * handled easily this way: + * + * blkcg->dio_weight = blkcg->weight * + * (write_bps - blkcg->dirty_rate) / write_bps; + * total_async_weight += blkcg->weight - blkcg->dio_weight; + * + * There will be no side effect if the direct writer happen to be + * self-throttled and cannot utilize the allocated dio_weight. 
+ */ + + balanced_dirty_rate = div_u64((u64)blkcg->dirty_rate << + RATELIMIT_CALC_SHIFT, blkcg_pos_ratio + 1); + if (blkcg->dirty_ratelimit >= write_bps && + balanced_dirty_rate < write_bps / 2) { + /* self throttled buffered writes */ + dio_weight = div_u64((u64)blkcg->weight * + (write_bps - balanced_dirty_rate), write_bps); + } else { + dio_weight = blkcg->weight / 2; + } + blkcg->dio_weight = dio_weight; + + if (!blkcg->dio_rate) { + /* no direct writes at all */ + async_write_bps = write_bps; + async_weight = blkcg->weight; +#if 0 // XXX: need some logic to detect this case, perhaps short lived cfqg? + } else if (dio is self-throttled under write_bps / 2) + async_write_bps = write_bps - blkcg->dio_rate; + async_weight = blkcg->weight * async_write_bps / write_bps; +#endif + } else { + /* assume aggressive direct writes */ + async_write_bps = write_bps / 2; + async_weight = blkcg->weight - dio_weight; + } + + /* + * add this blkcg's async_weight to the global total_async_weight for + * use by the flusher + */ + blkcg_update_async_weight(blkcg, async_weight); + + /* optional feature: long term dirty error cancellation */ + blkcg_update_dirty_position(blkcg, bdi, pos_ratio, + async_write_bps, dirtied, elapsed); + + /* + * given the async_write_bps target, calc the balanced dirty ratelimit + * for the dirtier tasks inside the cgroup. 
+ */ + task_ratelimit = blkcg->dirty_ratelimit * blkcg_pos_ratio >> + RATELIMIT_CALC_SHIFT; + ratelimit = div_u64((u64)task_ratelimit * async_write_bps, + blkcg->dirty_rate + 1); + /* + * update blkcg->dirty_ratelimit towards @ratelimit, limiting the step + * size and filtering out noises + */ + step = 0; + if (blkcg->recent_dirtied_error > (1 << BLKCG_DIRTY_ERROR_SHIFT)) { + x = min(blkcg->balanced_dirty_ratelimit, + min(ratelimit, task_ratelimit)); + if (blkcg->dirty_ratelimit < x) + step = x - blkcg->dirty_ratelimit; + } else { + x = max(blkcg->balanced_dirty_ratelimit, + max(ratelimit, task_ratelimit)); + if (blkcg->dirty_ratelimit > x) + step = blkcg->dirty_ratelimit - x; + } + step >>= blkcg->dirty_ratelimit / (32 * step + 1); + step = (step + 7) / 8; + if (blkcg->dirty_ratelimit < ratelimit) + blkcg->dirty_ratelimit += step; + else + blkcg->dirty_ratelimit -= step; + blkcg->dirty_ratelimit++; /* avoid stucking in 0 */ + blkcg->dirty_ratelimit = min(blkcg->dirty_ratelimit, write_bps); + blkcg->balanced_dirty_ratelimit = ratelimit; + + trace_blkcg_dirty_ratelimit(bdi, write_bps, + dirty_rate, blkcg->dirty_rate, + task_ratelimit, ratelimit, + dio_rate, blkcg->dio_rate, + dio_weight, async_weight, total_async_weight, + blkcg->recent_dirtied_error, + css_id(&blkcg->css)); +} + +void blkcg_update_bandwidth(struct blkio_cgroup *blkcg, + struct backing_dev_info *bdi, + unsigned long write_bps, + unsigned long pos_ratio) +{ + unsigned long now = jiffies; + unsigned long elapsed = now - blkcg->bw_time_stamp; + unsigned long dirtied; + unsigned long direct_written; + unsigned long flags; + + if (elapsed <= BANDWIDTH_INTERVAL) /* avoid unnecessary locks */ + return; + + spin_lock_irqsave(&blkcg->lock, flags); + + if (elapsed <= BANDWIDTH_INTERVAL) + goto unlock; + + dirtied = percpu_counter_read(&blkcg->nr_dirtied); + direct_written = percpu_counter_read(&blkcg->nr_direct_write) >> + PAGE_CACHE_SHIFT; + if (elapsed > HZ) + goto snapshot; + + 
blkcg_update_dirty_ratelimit(blkcg, bdi, write_bps, pos_ratio, + dirtied, direct_written, elapsed); +snapshot: + blkcg->dirtied_stamp = dirtied; + blkcg->direct_write_stamp = direct_written; + blkcg->bw_time_stamp = now; +unlock: + spin_unlock_irqrestore(&blkcg->lock, flags); +} + +#else + +void blkcg_update_bandwidth(struct blkio_cgroup *blkcg, + struct backing_dev_info *bdi, + unsigned long write_bps, + unsigned long pos_ratio) +{ +} + +#endif + /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force @@ -1180,6 +1488,9 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long pos_ratio; struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long start_time = jiffies; + struct blkio_cgroup *blkcg = task_blkio_cgroup(current); + unsigned long blkcg_write_bps = blkcg_get_write_bps(blkcg, 0) >> + PAGE_CACHE_SHIFT; for (;;) { unsigned long now = jiffies; @@ -1258,10 +1569,19 @@ static void balance_dirty_pages(struct address_space *mapping, nr_dirty, bdi_thresh, bdi_dirty, start_time); - dirty_ratelimit = bdi->dirty_ratelimit; pos_ratio = bdi_position_ratio(bdi, dirty_thresh, background_thresh, nr_dirty, bdi_thresh, bdi_dirty); + + if (blkcg_is_root(blkcg)) + dirty_ratelimit = bdi->dirty_ratelimit; + else { + blkcg_update_bandwidth(blkcg, bdi, + blkcg_write_bps, pos_ratio); + pos_ratio = blkcg_dirty_position(blkcg, pos_ratio); + dirty_ratelimit = blkcg_dirty_ratelimit(blkcg); + } + task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >> RATELIMIT_CALC_SHIFT; max_pause = bdi_max_pause(bdi, bdi_dirty); @@ -1936,6 +2256,11 @@ int __set_page_dirty_no_writeback(struct page *page) void account_page_dirtied(struct page *page, struct address_space *mapping) { if (mapping_cap_account_dirty(mapping)) { +#ifdef CONFIG_BLK_DEV_THROTTLING + struct blkio_cgroup *blkcg = task_blkio_cgroup(current); + if (blkcg) + 
__percpu_counter_add(&blkcg->nr_dirtied, 1, BDI_STAT_BATCH); +#endif __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); -- 1.7.9.1 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html