Hello, Boris. On Fri, May 29, 2020 at 04:20:17PM -0700, Boris Burkov wrote: > In order to improve consistency and usability in cgroup stat accounting, > we would like to support the root cgroup's io.stat. > > Since the root cgroup has processes doing io even if the system has no > explicitly created cgroups, we need to be careful to avoid overhead in > that case. For that reason, the rstat algorithms don't handle the root > cgroup, so just turning the file on wouldn't give correct statistics. > > To get around this, we simulate flushing the iostat struct by filling it > out directly from global disk stats. The result is a root cgroup io.stat > file consistent with both /proc/diskstats and io.stat. > > Signed-off-by: Boris Burkov <boris@xxxxxx> > Suggested-by: Tejun Heo <tj@xxxxxxxxxx> ... > +static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) > +{ Can you please separate out code reorganization to a separate patch so that the actual change can be reviewed clearly? > +/* > + * The rstat algorithms intentionally don't handle the root cgroup to avoid > + * incurring overhead when no cgroups are defined. For that reason, > + * cgroup_rstat_flush in blkcg_print_stat does not actually fill out the > + * iostat in the root cgroup's blkcg_gq. > + * > + * However, we would like to re-use the printing code between the root and > + * non-root cgroups to the extent possible. For that reason, we simulate > + * flushing the root cgroup's stats by explicitly filling in the iostat > + * with disk level statistics. > + */ This is clever and neat. > +static void blkcg_fill_root_iostats(void) > +{ > + struct class_dev_iter iter; > + struct device *dev; > + > + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); > + while ((dev = class_dev_iter_next(&iter))) { > + struct gendisk *disk = dev_to_disk(dev); > + struct hd_struct *part = disk_get_part(disk, 0); > + struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue); > + struct blkg_iostat tmp; > + int cpu; > + > + memset(&tmp, 0, sizeof(tmp)); > + for_each_possible_cpu(cpu) { > + struct disk_stats *cpu_dkstats; > + > + cpu_dkstats = per_cpu_ptr(part->dkstats, cpu); > + tmp.ios[BLKG_IOSTAT_READ] += > + cpu_dkstats->ios[STAT_READ]; > + tmp.ios[BLKG_IOSTAT_WRITE] += > + cpu_dkstats->ios[STAT_WRITE]; > + tmp.ios[BLKG_IOSTAT_DISCARD] += > + cpu_dkstats->ios[STAT_DISCARD]; > + // convert sectors to bytes > + tmp.bytes[BLKG_IOSTAT_READ] += > + cpu_dkstats->sectors[STAT_READ] << 9; > + tmp.bytes[BLKG_IOSTAT_WRITE] += > + cpu_dkstats->sectors[STAT_WRITE] << 9; > + tmp.bytes[BLKG_IOSTAT_DISCARD] += > + cpu_dkstats->sectors[STAT_DISCARD] << 9; > + > + u64_stats_update_begin(&blkg->iostat.sync); > + blkg_iostat_set(&blkg->iostat.cur, &tmp); > + u64_stats_update_end(&blkg->iostat.sync); > + } > + } > +} ... > diff --git a/block/genhd.c b/block/genhd.c > index afdb2c3e5b22..4f5f4590517c 100644 > --- a/block/genhd.c > +++ b/block/genhd.c > @@ -38,8 +38,6 @@ static struct kobject *block_depr; > static DEFINE_SPINLOCK(ext_devt_lock); > static DEFINE_IDR(ext_devt_idr); > > -static const struct device_type disk_type; > - > static void disk_check_events(struct disk_events *ev, > unsigned int *clearing_ptr); > static void disk_alloc_events(struct gendisk *disk); > @@ -1566,7 +1564,7 @@ static char *block_devnode(struct device *dev, umode_t *mode, > return NULL; > } > > -static const struct device_type disk_type = { > +const struct device_type disk_type = { > .name = "disk", > .groups = disk_attr_groups, > .release = disk_release, > diff --git a/include/linux/genhd.h b/include/linux/genhd.h > index a9384449465a..ea38bc36bc6d 100644 > --- a/include/linux/genhd.h > +++ b/include/linux/genhd.h > @@ -26,6 +26,7 @@ > #define disk_to_dev(disk) (&(disk)->part0.__dev) > #define part_to_dev(part) (&((part)->__dev)) > > +extern const struct device_type disk_type; So, this is fine but I'd explicitly mention it in the patch description. Thanks. -- tejun