async write IO controllers

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Vivek,

To explore the possibility of an integrated async write cgroup IO
controller in balance_dirty_pages(), I did the attached patches.
They should serve it well to illustrate the basic ideas.

It's based on Andrea's two supporting patches and a slightly
simplified and improved version of this v6 patchset.

        root@fat ~# cat test-blkio-cgroup.sh
        #!/bin/sh

        mount /dev/sda7 /fs  

        rmdir /cgroup/async_write
        mkdir /cgroup/async_write
        echo $$ > /cgroup/async_write/tasks
        # echo "8:16  1048576" > /cgroup/async_write/blkio.throttle.read_bps_device

        dd if=/dev/zero of=/fs/zero1 bs=1M count=100 &
        dd if=/dev/zero of=/fs/zero2 bs=1M count=100 &

2-dd case:

        root@fat ~# 100+0 records in
        100+0 records out
        104857600 bytes (105 MB) copied100+0 records in
        100+0 records out
        , 11.9477 s, 8.8 MB/s
        104857600 bytes (105 MB) copied, 11.9496 s, 8.8 MB/s

1-dd case:

        root@fat ~# 100+0 records in
        100+0 records out
        104857600 bytes (105 MB) copied, 6.21919 s, 16.9 MB/s

The patch hard codes a limit of 16MiB/s or 16.8MB/s.  So the 1-dd case
is pretty accurate, and the 2-dd case is a bit leaked due to the time
to take the throttle bandwidth from its initial value 16MiB/s to
8MiB/s. This could be compensated by some position control in future,
so that it won't leak in normal cases.

The main bits, blkcg_update_throttle_bandwidth() is in fact a minimal
version of bdi_update_throttle_bandwidth(); blkcg_update_bandwidth()
is also a cut-down version of bdi_update_bandwidth().

Thanks,
Fengguang
Subject: blkcg: dirty rate accounting
Date: Sat Apr 02 20:15:28 CST 2011

To be used by the balance_dirty_pages() async write IO controller.

Signed-off-by: Wu Fengguang <fengguang.wu@xxxxxxxxx>
---
 block/blk-cgroup.c         |    4 ++++
 include/linux/blk-cgroup.h |    1 +
 mm/page-writeback.c        |    4 ++++
 3 files changed, 9 insertions(+)

--- linux-next.orig/block/blk-cgroup.c	2011-04-02 20:17:08.000000000 +0800
+++ linux-next/block/blk-cgroup.c	2011-04-02 21:59:24.000000000 +0800
@@ -1458,6 +1458,7 @@ static void blkiocg_destroy(struct cgrou
 
 	free_css_id(&blkio_subsys, &blkcg->css);
 	rcu_read_unlock();
+	percpu_counter_destroy(&blkcg->nr_dirtied);
 	if (blkcg != &blkio_root_cgroup)
 		kfree(blkcg);
 }
@@ -1483,6 +1484,9 @@ done:
 	INIT_HLIST_HEAD(&blkcg->blkg_list);
 
 	INIT_LIST_HEAD(&blkcg->policy_list);
+
+	percpu_counter_init(&blkcg->nr_dirtied, 0);
+
 	return &blkcg->css;
 }
 
--- linux-next.orig/include/linux/blk-cgroup.h	2011-04-02 20:17:08.000000000 +0800
+++ linux-next/include/linux/blk-cgroup.h	2011-04-02 21:59:02.000000000 +0800
@@ -111,6 +111,7 @@ struct blkio_cgroup {
 	spinlock_t lock;
 	struct hlist_head blkg_list;
 	struct list_head policy_list; /* list of blkio_policy_node */
+	struct percpu_counter nr_dirtied;
 };
 
 struct blkio_group_stats {
--- linux-next.orig/mm/page-writeback.c	2011-04-02 20:17:08.000000000 +0800
+++ linux-next/mm/page-writeback.c	2011-04-02 21:59:02.000000000 +0800
@@ -34,6 +34,7 @@
 #include <linux/syscalls.h>
 #include <linux/buffer_head.h>
 #include <linux/pagevec.h>
+#include <linux/blk-cgroup.h>
 #include <trace/events/writeback.h>
 
 /*
@@ -221,6 +222,9 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc);
 
 void task_dirty_inc(struct task_struct *tsk)
 {
+	struct blkio_cgroup *blkcg = task_to_blkio_cgroup(tsk);
+	if (blkcg)
+		__percpu_counter_add(&blkcg->nr_dirtied, 1, BDI_STAT_BATCH);
 	prop_inc_single(&vm_dirties, &tsk->dirties);
 }
 
Subject: writeback: async write IO controllers
Date: Fri Mar 04 10:38:04 CST 2011

- a bare per-task async write IO controller
- a bare per-cgroup async write IO controller

XXX: the per-task user interface is reusing RLIMIT_RSS for now.
XXX: the per-cgroup user interface is missing

CC: Vivek Goyal <vgoyal@xxxxxxxxxx>
CC: Andrea Righi <arighi@xxxxxxxxxxx>
Signed-off-by: Wu Fengguang <fengguang.wu@xxxxxxxxx>
---
 block/blk-cgroup.c         |    2 
 include/linux/blk-cgroup.h |    4 +
 mm/page-writeback.c        |   86 +++++++++++++++++++++++++++++++----
 3 files changed, 84 insertions(+), 8 deletions(-)

--- linux-next.orig/mm/page-writeback.c	2011-04-05 01:26:38.000000000 +0800
+++ linux-next/mm/page-writeback.c	2011-04-05 01:26:53.000000000 +0800
@@ -1117,6 +1117,49 @@ static unsigned long max_pause(struct ba
 	return clamp_val(t, MIN_PAUSE, MAX_PAUSE);
 }
 
+static void blkcg_update_throttle_bandwidth(struct blkio_cgroup *blkcg,
+					    unsigned long dirtied,
+					    unsigned long elapsed)
+{
+	unsigned long bw = blkcg->throttle_bandwidth;
+	unsigned long long ref_bw;
+	unsigned long dirty_bw;
+
+	ref_bw = blkcg->async_write_bps >> (3 + PAGE_SHIFT - RATIO_SHIFT);
+	dirty_bw = ((dirtied - blkcg->dirtied_stamp)*HZ + elapsed/2) / elapsed;
+	do_div(ref_bw, dirty_bw | 1);
+	ref_bw = bw * ref_bw >> RATIO_SHIFT;
+
+	blkcg->throttle_bandwidth = (bw + ref_bw) / 2;
+}
+
+void blkcg_update_bandwidth(struct blkio_cgroup *blkcg)
+{
+	unsigned long now = jiffies;
+	unsigned long dirtied;
+	unsigned long elapsed;
+
+	if (!blkcg)
+		return;
+	if (!spin_trylock(&blkcg->lock))
+		return;
+
+	elapsed = now - blkcg->bw_time_stamp;
+	dirtied = percpu_counter_read(&blkcg->nr_dirtied);
+
+	if (elapsed > MAX_PAUSE * 2)
+		goto snapshot;
+	if (elapsed <= MAX_PAUSE)
+		goto unlock;
+
+	blkcg_update_throttle_bandwidth(blkcg, dirtied, elapsed);
+snapshot:
+	blkcg->dirtied_stamp = dirtied;
+	blkcg->bw_time_stamp = now;
+unlock:
+	spin_unlock(&blkcg->lock);
+}
+
 /*
  * balance_dirty_pages() must be called by processes which are generating dirty
  * data.  It looks at the number of dirty pages in the machine and will force
@@ -1139,6 +1182,10 @@ static void balance_dirty_pages(struct a
 	unsigned long pause_max;
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	unsigned long start_time = jiffies;
+	struct blkio_cgroup *blkcg = task_to_blkio_cgroup(current);
+
+	if (blkcg == &blkio_root_cgroup)
+		blkcg = NULL;
 
 	for (;;) {
 		unsigned long now = jiffies;
@@ -1178,6 +1225,15 @@ static void balance_dirty_pages(struct a
 		 * when the bdi limits are ramping up.
 		 */
 		if (nr_dirty <= (background_thresh + dirty_thresh) / 2) {
+			if (blkcg) {
+				pause_max = max_pause(bdi, 0);
+				goto cgroup_ioc;
+			}
+			if (current->signal->rlim[RLIMIT_RSS].rlim_cur !=
+			    RLIM_INFINITY) {
+				pause_max = max_pause(bdi, 0);
+				goto task_ioc;
+			}
 			current->paused_when = now;
 			current->nr_dirtied = 0;
 			break;
@@ -1190,21 +1246,35 @@ static void balance_dirty_pages(struct a
 			bdi_start_background_writeback(bdi);
 
 		pause_max = max_pause(bdi, bdi_dirty);
-
 		base_bw = bdi->throttle_bandwidth;
-		/*
-		 * Double the bandwidth for PF_LESS_THROTTLE (ie. nfsd) and
-		 * real-time tasks.
-		 */
-		if (current->flags & PF_LESS_THROTTLE || rt_task(current))
-			base_bw *= 2;
 		bw = position_ratio(bdi, dirty_thresh, nr_dirty, bdi_dirty);
 		if (unlikely(bw == 0)) {
 			period = pause_max;
 			pause = pause_max;
 			goto pause;
 		}
-		bw = base_bw * (u64)bw >> RATIO_SHIFT;
+		bw = (u64)base_bw * bw >> RATIO_SHIFT;
+		if (blkcg && bw > blkcg->throttle_bandwidth) {
+cgroup_ioc:
+			blkcg_update_bandwidth(blkcg);
+			bw = blkcg->throttle_bandwidth;
+			base_bw = bw;
+		}
+		if (bw > current->signal->rlim[RLIMIT_RSS].rlim_cur >>
+								PAGE_SHIFT) {
+task_ioc:
+			bw = current->signal->rlim[RLIMIT_RSS].rlim_cur >>
+								PAGE_SHIFT;
+			base_bw = bw;
+		}
+		/*
+		 * Double the bandwidth for PF_LESS_THROTTLE (ie. nfsd) and
+		 * real-time tasks.
+		 */
+		if (current->flags & PF_LESS_THROTTLE || rt_task(current)) {
+			bw *= 2;
+			base_bw = bw;
+		}
 		period = (HZ * pages_dirtied + bw / 2) / (bw | 1);
 		pause = current->paused_when + period - now;
 		/*
--- linux-next.orig/block/blk-cgroup.c	2011-04-05 01:26:38.000000000 +0800
+++ linux-next/block/blk-cgroup.c	2011-04-05 01:26:39.000000000 +0800
@@ -1486,6 +1486,8 @@ done:
 	INIT_LIST_HEAD(&blkcg->policy_list);
 
 	percpu_counter_init(&blkcg->nr_dirtied, 0);
+	blkcg->async_write_bps = 16 << 23; /* XXX: tunable interface */
+	blkcg->throttle_bandwidth = 16 << (20 - PAGE_SHIFT);
 
 	return &blkcg->css;
 }
--- linux-next.orig/include/linux/blk-cgroup.h	2011-04-05 01:26:38.000000000 +0800
+++ linux-next/include/linux/blk-cgroup.h	2011-04-05 01:26:39.000000000 +0800
@@ -112,6 +112,10 @@ struct blkio_cgroup {
 	struct hlist_head blkg_list;
 	struct list_head policy_list; /* list of blkio_policy_node */
 	struct percpu_counter nr_dirtied;
+	unsigned long bw_time_stamp;
+	unsigned long dirtied_stamp;
+	unsigned long throttle_bandwidth;
+	unsigned long async_write_bps;
 };
 
 struct blkio_group_stats {

[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]