On Wed, May 13, 2009 at 10:00:21AM +0800, Gui Jianfeng wrote:
> Hi Vivek,
>
> This patch enables per-cgroup per-device weight and ioprio_class handling.
> A new cgroup interface "policy" is introduced. You can make use of this
> file to configure weight and ioprio_class for each device in a given cgroup.
> The original "weight" and "ioprio_class" files are still available. If you
> don't do special configuration for a particular device, "weight" and
> "ioprio_class" are used as the default values for that device.
>
> You can use the following format to play with the new interface.
> # echo DEV:weight:ioprio_class > /path/to/cgroup/policy
> weight=0 means removing the policy for DEV.
>
> Examples:
> Configure weight=300 ioprio_class=2 on /dev/hdb in this cgroup
> # echo /dev/hdb:300:2 > io.policy
> # cat io.policy
> dev weight class
> /dev/hdb 300 2
>
> Configure weight=500 ioprio_class=1 on /dev/hda in this cgroup
> # echo /dev/hda:500:1 > io.policy
> # cat io.policy
> dev weight class
> /dev/hda 500 1
> /dev/hdb 300 2
>
> Remove the policy for /dev/hda in this cgroup
> # echo /dev/hda:0:1 > io.policy
> # cat io.policy
> dev weight class
> /dev/hdb 300 2
>
> Signed-off-by: Gui Jianfeng <guijianfeng@xxxxxxxxxxxxxx>
> ---
>  block/elevator-fq.c |  239 +++++++++++++++++++++++++++++++++++++++++++++++++-
>  block/elevator-fq.h |   11 +++
>  2 files changed, 245 insertions(+), 5 deletions(-)
>
> diff --git a/block/elevator-fq.c b/block/elevator-fq.c
> index 69435ab..7c95d55 100644
> --- a/block/elevator-fq.c
> +++ b/block/elevator-fq.c
> @@ -12,6 +12,9 @@
>  #include "elevator-fq.h"
>  #include <linux/blktrace_api.h>
>  #include <linux/biotrack.h>
> +#include <linux/seq_file.h>
> +#include <linux/genhd.h>
> +
>
>  /* Values taken from cfq */
>  const int elv_slice_sync = HZ / 10;
> @@ -1045,12 +1048,30 @@ struct io_group *io_lookup_io_group_current(struct request_queue *q)
>  }
>  EXPORT_SYMBOL(io_lookup_io_group_current);
>
> -void io_group_init_entity(struct io_cgroup *iocg, struct io_group *iog)
> +static struct policy_node *policy_search_node(const struct io_cgroup *iocg,
> +                                              void *key);
> +
> +void io_group_init_entity(struct io_cgroup *iocg, struct io_group *iog,
> +                          void *key)
>  {
>          struct io_entity *entity = &iog->entity;
> +        struct policy_node *pn;
> +
> +        spin_lock_irq(&iocg->lock);
> +        pn = policy_search_node(iocg, key);
> +        if (pn) {
> +                entity->weight = pn->weight;
> +                entity->new_weight = pn->weight;
> +                entity->ioprio_class = pn->ioprio_class;
> +                entity->new_ioprio_class = pn->ioprio_class;
> +        } else {
> +                entity->weight = iocg->weight;
> +                entity->new_weight = iocg->weight;
> +                entity->ioprio_class = iocg->ioprio_class;
> +                entity->new_ioprio_class = iocg->ioprio_class;
> +        }
> +        spin_unlock_irq(&iocg->lock);

Hi Gui,

It might make sense to also store the device name or the device major and
minor number in io_group while creating the io group. This will help us
display the io.disk_time and io.disk_sectors statistics per device instead
of as an aggregate.

I am attaching a patch I was playing around with that displays per-device
statistics instead of a single aggregate, which is especially useful when a
user has specified per-device rules.

Thanks
Vivek

o Currently the statistics exported through cgroup are an aggregate of the
  statistics on all devices for that cgroup. Instead of an aggregate, make
  these per device.

o Also export another statistic, io.disk_dequeue. This keeps a count of how
  many times a particular group dropped out of the race for the disk. It is
  a debugging aid for tracking how often we could create continuously
  backlogged queues.
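Each of these files now prints one "major minor value" line per io_group, as
emitted by the seq_printf() calls in the patch below. As a rough illustration
(the device numbers and values here are made up), reading the files would look
something like:

  # cat io.disk_time
  8 0 2340
  8 16 1120

  # cat io.disk_dequeue
  8 0 4
  8 16 9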
Signed-off-by: Vivek Goyal <vgoyal@xxxxxxxxxx>
---
 block/elevator-fq.c |  127 +++++++++++++++++++++++++++++++++-------------------
 block/elevator-fq.h |    3 +
 2 files changed, 85 insertions(+), 45 deletions(-)

Index: linux14/block/elevator-fq.h
===================================================================
--- linux14.orig/block/elevator-fq.h	2009-05-13 11:40:32.000000000 -0400
+++ linux14/block/elevator-fq.h	2009-05-13 11:40:57.000000000 -0400
@@ -250,6 +250,9 @@ struct io_group {

 #ifdef CONFIG_DEBUG_GROUP_IOSCHED
         unsigned short iocg_id;
+        dev_t dev;
+        /* How many times this group has been removed from active tree */
+        unsigned long dequeue;
 #endif
 };

Index: linux14/block/elevator-fq.c
===================================================================
--- linux14.orig/block/elevator-fq.c	2009-05-13 11:40:53.000000000 -0400
+++ linux14/block/elevator-fq.c	2009-05-13 11:40:57.000000000 -0400
@@ -12,6 +12,7 @@
 #include "elevator-fq.h"
 #include <linux/blktrace_api.h>
 #include <linux/biotrack.h>
+#include <linux/seq_file.h>

 /* Values taken from cfq */
 const int elv_slice_sync = HZ / 10;
@@ -758,6 +759,18 @@ int __bfq_deactivate_entity(struct io_en
         BUG_ON(sd->active_entity == entity);
         BUG_ON(sd->next_active == entity);

+#ifdef CONFIG_DEBUG_GROUP_IOSCHED
+        {
+                struct io_group *iog = io_entity_to_iog(entity);
+                /*
+                 * Keep track of how many times a group has been removed
+                 * from active tree because it did not have any active
+                 * backlogged ioq under it
+                 */
+                if (iog)
+                        iog->dequeue++;
+        }
+#endif
         return ret;
 }

@@ -1126,90 +1139,103 @@ STORE_FUNCTION(weight, 0, WEIGHT_MAX);
 STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
 #undef STORE_FUNCTION

-/*
- * traverse through all the io_groups associated with this cgroup and calculate
- * the aggr disk time received by all the groups on respective disks.
- */
-static u64 calculate_aggr_disk_time(struct io_cgroup *iocg)
+static int io_cgroup_disk_time_read(struct cgroup *cgroup,
+                                struct cftype *cftype, struct seq_file *m)
 {
+        struct io_cgroup *iocg;
         struct io_group *iog;
         struct hlist_node *n;
-        u64 disk_time = 0;
+
+        if (!cgroup_lock_live_group(cgroup))
+                return -ENODEV;
+
+        iocg = cgroup_to_io_cgroup(cgroup);

         rcu_read_lock();
+        spin_lock_irq(&iocg->lock);
         hlist_for_each_entry_rcu(iog, n, &iocg->group_data, group_node) {
                 /*
                  * There might be groups which are not functional and
                  * waiting to be reclaimed upon cgoup deletion.
                  */
-                if (rcu_dereference(iog->key))
-                        disk_time += iog->entity.total_service;
+                if (rcu_dereference(iog->key)) {
+                        seq_printf(m, "%u %u %lu\n", MAJOR(iog->dev),
+                                        MINOR(iog->dev),
+                                        iog->entity.total_service);
+                }
         }
+        spin_unlock_irq(&iocg->lock);
         rcu_read_unlock();

-        return disk_time;
+        cgroup_unlock();
+
+        return 0;
 }

-static u64 io_cgroup_disk_time_read(struct cgroup *cgroup,
-                                        struct cftype *cftype)
+static int io_cgroup_disk_sectors_read(struct cgroup *cgroup,
+                                struct cftype *cftype, struct seq_file *m)
 {
         struct io_cgroup *iocg;
-        u64 ret;
+        struct io_group *iog;
+        struct hlist_node *n;

         if (!cgroup_lock_live_group(cgroup))
                 return -ENODEV;

         iocg = cgroup_to_io_cgroup(cgroup);
-        spin_lock_irq(&iocg->lock);
-        ret = jiffies_to_msecs(calculate_aggr_disk_time(iocg));
-        spin_unlock_irq(&iocg->lock);
-
-        cgroup_unlock();
-
-        return ret;
-}
-
-/*
- * traverse through all the io_groups associated with this cgroup and calculate
- * the aggr number of sectors transferred by all the groups on respective disks.
- */
-static u64 calculate_aggr_disk_sectors(struct io_cgroup *iocg)
-{
-        struct io_group *iog;
-        struct hlist_node *n;
-        u64 disk_sectors = 0;

         rcu_read_lock();
+        spin_lock_irq(&iocg->lock);
         hlist_for_each_entry_rcu(iog, n, &iocg->group_data, group_node) {
                 /*
                  * There might be groups which are not functional and
                  * waiting to be reclaimed upon cgoup deletion.
                  */
-                if (rcu_dereference(iog->key))
-                        disk_sectors += iog->entity.total_sector_service;
+                if (rcu_dereference(iog->key)) {
+                        seq_printf(m, "%u %u %lu\n", MAJOR(iog->dev),
+                                        MINOR(iog->dev),
+                                        iog->entity.total_sector_service);
+                }
         }
+        spin_unlock_irq(&iocg->lock);
         rcu_read_unlock();

-        return disk_sectors;
+        cgroup_unlock();
+
+        return 0;
 }

-static u64 io_cgroup_disk_sectors_read(struct cgroup *cgroup,
-                                        struct cftype *cftype)
+static int io_cgroup_disk_dequeue_read(struct cgroup *cgroup,
+                        struct cftype *cftype, struct seq_file *m)
 {
-        struct io_cgroup *iocg;
-        u64 ret;
+        struct io_cgroup *iocg = NULL;
+        struct io_group *iog = NULL;
+        struct hlist_node *n;

         if (!cgroup_lock_live_group(cgroup))
                 return -ENODEV;

         iocg = cgroup_to_io_cgroup(cgroup);
+
+        rcu_read_lock();
         spin_lock_irq(&iocg->lock);
-        ret = calculate_aggr_disk_sectors(iocg);
+        /* Loop through all the io groups and print statistics */
+        hlist_for_each_entry_rcu(iog, n, &iocg->group_data, group_node) {
+                /*
+                 * There might be groups which are not functional and
+                 * waiting to be reclaimed upon cgoup deletion.
+                 */
+                if (rcu_dereference(iog->key)) {
+                        seq_printf(m, "%u %u %lu\n", MAJOR(iog->dev),
+                                        MINOR(iog->dev), iog->dequeue);
+                }
+        }
         spin_unlock_irq(&iocg->lock);
+        rcu_read_unlock();

         cgroup_unlock();

-        return ret;
+        return 0;
 }

 /**
@@ -1222,7 +1248,7 @@ static u64 io_cgroup_disk_sectors_read(s
  * to the root has already an allocated group on @bfqd.
  */
 struct io_group *io_group_chain_alloc(struct request_queue *q, void *key,
-                                        struct cgroup *cgroup)
+                                        struct cgroup *cgroup, struct bio *bio)
 {
         struct io_cgroup *iocg;
         struct io_group *iog, *leaf = NULL, *prev = NULL;
@@ -1250,8 +1276,13 @@ struct io_group *io_group_chain_alloc(st
                 io_group_init_entity(iocg, iog);
                 iog->my_entity = &iog->entity;
+
 #ifdef CONFIG_DEBUG_GROUP_IOSCHED
                 iog->iocg_id = css_id(&iocg->css);
+                if (bio) {
+                        struct gendisk *disk = bio->bi_bdev->bd_disk;
+                        iog->dev = MKDEV(disk->major, disk->first_minor);
+                }
 #endif

                 blk_init_request_list(&iog->rl);
@@ -1364,7 +1395,7 @@ void io_group_chain_link(struct request_
  */
 struct io_group *io_find_alloc_group(struct request_queue *q,
                         struct cgroup *cgroup, struct elv_fq_data *efqd,
-                        int create)
+                        int create, struct bio *bio)
 {
         struct io_cgroup *iocg = cgroup_to_io_cgroup(cgroup);
         struct io_group *iog = NULL;
@@ -1375,7 +1406,7 @@ struct io_group *io_find_alloc_group(str
         if (iog != NULL || !create)
                 return iog;

-        iog = io_group_chain_alloc(q, key, cgroup);
+        iog = io_group_chain_alloc(q, key, cgroup, bio);
         if (iog != NULL)
                 io_group_chain_link(q, key, cgroup, iog, efqd);

@@ -1481,7 +1512,7 @@ struct io_group *io_get_io_group(struct
                 goto out;
         }

-        iog = io_find_alloc_group(q, cgroup, efqd, create);
+        iog = io_find_alloc_group(q, cgroup, efqd, create, bio);
         if (!iog) {
                 if (create)
                         iog = efqd->root_group;
@@ -1554,12 +1585,18 @@ struct cftype bfqio_files[] = {
         },
         {
                 .name = "disk_time",
-                .read_u64 = io_cgroup_disk_time_read,
+                .read_seq_string = io_cgroup_disk_time_read,
         },
         {
                 .name = "disk_sectors",
-                .read_u64 = io_cgroup_disk_sectors_read,
+                .read_seq_string = io_cgroup_disk_sectors_read,
         },
+#ifdef CONFIG_DEBUG_GROUP_IOSCHED
+        {
+                .name = "disk_dequeue",
+                .read_seq_string = io_cgroup_disk_dequeue_read,
+        },
+#endif
 };

 int iocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)

--
dm-devel mailing list
dm-devel@xxxxxxxxxx
https://www.redhat.com/mailman/listinfo/dm-devel