This patch adds to the block layer the ability to intercept flush requests for the purpose of measuring the round-trip times of the write-cache flush command on whatever hardware sits underneath the block layer. The eventual intent of this patch is for higher-level clients (filesystems) to be able to measure flush times to tune their use of them. Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx> --- block/blk-core.c | 13 +++++++++++++ block/genhd.c | 39 +++++++++++++++++++++++++++++++++++++++ fs/bio.c | 11 +++++++++++ include/linux/blk_types.h | 2 ++ include/linux/blkdev.h | 2 ++ include/linux/genhd.h | 23 +++++++++++++++++++++++ 6 files changed, 90 insertions(+), 0 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 4ce953f..233573a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1540,6 +1540,9 @@ static inline void __generic_make_request(struct bio *bio) trace_block_bio_queue(q, bio); + if (bio->bi_rw & REQ_FLUSH) + bio->flush_start_time_ns = ktime_to_ns(ktime_get()); + ret = q->make_request_fn(q, bio); } while (ret); @@ -1954,6 +1957,9 @@ void blk_start_request(struct request *req) req->next_rq->resid_len = blk_rq_bytes(req->next_rq); blk_add_timer(req); + + if (req->cmd_flags & REQ_FLUSH) + req->flush_start_time_ns = ktime_to_ns(ktime_get()); } EXPORT_SYMBOL(blk_start_request); @@ -2182,6 +2188,13 @@ EXPORT_SYMBOL_GPL(blk_unprep_request); */ static void blk_finish_request(struct request *req, int error) { + if (!error && req->cmd_flags & REQ_FLUSH) { + u64 delta; + + delta = ktime_to_ns(ktime_get()) - req->flush_start_time_ns; + update_flush_times(req->rq_disk, delta); + } + if (blk_rq_tagged(req)) blk_queue_end_tag(req->q, req); diff --git a/block/genhd.c b/block/genhd.c index 5fa2b44..465bd00 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -538,6 +538,9 @@ void add_disk(struct gendisk *disk) */ disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); + spin_lock_init(&disk->flush_time_lock); + disk->avg_flush_time_ns = 0; + disk->flush_samples = 0; /* Register BDI before referencing it from bdev */ bdi = &disk->queue->backing_dev_info; @@ -824,6 +827,37 @@ static ssize_t disk_range_show(struct device *dev, return sprintf(buf, "%d\n", disk->minors); } +static ssize_t disk_flush_samples_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%llu\n", disk->flush_samples); +} + +static ssize_t disk_avg_flush_time_ns_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%llu\n", disk->avg_flush_time_ns); +} + +static ssize_t disk_avg_flush_time_ns_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + + spin_lock(&disk->flush_time_lock); + disk->avg_flush_time_ns = 0; + disk->flush_samples = 0; + spin_unlock(&disk->flush_time_lock); + + return count; +} + + static ssize_t disk_ext_range_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -876,6 +910,9 @@ static ssize_t disk_discard_alignment_show(struct device *dev, } static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); +static DEVICE_ATTR(avg_flush_time_ns, S_IRUGO | S_IWUSR, + disk_avg_flush_time_ns_show, disk_avg_flush_time_ns_store); +static DEVICE_ATTR(flush_samples, S_IRUGO, disk_flush_samples_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); @@ -898,6 +935,8 @@ static struct device_attribute dev_attr_fail_timeout = static struct attribute *disk_attrs[] = { &dev_attr_range.attr, + &dev_attr_avg_flush_time_ns.attr, + &dev_attr_flush_samples.attr, &dev_attr_ext_range.attr, &dev_attr_removable.attr, &dev_attr_ro.attr, diff --git a/fs/bio.c b/fs/bio.c index 4bd454f..aab5f27 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1442,11 +1442,22 @@ EXPORT_SYMBOL(bio_flush_dcache_pages); **/ void bio_endio(struct bio *bio, int error) { + struct request_queue *q = NULL; + if (error) clear_bit(BIO_UPTODATE, &bio->bi_flags); else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) error = -EIO; + if (bio->bi_bdev) + q = bdev_get_queue(bio->bi_bdev); + if (!(q && q->prep_rq_fn) && !error && bio->bi_rw & REQ_FLUSH) { + u64 delta; + + delta = ktime_to_ns(ktime_get()) - bio->flush_start_time_ns; + update_flush_times(bio->bi_bdev->bd_disk, delta); + } + if (bio->bi_end_io) bio->bi_end_io(bio, error); } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 46ad519..e8c29b9 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -74,6 +74,8 @@ struct bio { bio_destructor_t *bi_destructor; /* destructor */ + u64 flush_start_time_ns; + /* * We can inline a number of vecs at the end of the bio, to avoid * double allocations for a small number of bio_vecs. This member diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aae86fd..357d669 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -163,6 +163,8 @@ struct request { /* for bidi */ struct request *next_rq; + + u64 flush_start_time_ns; }; static inline unsigned short req_get_ioprio(struct request *req) diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 7a7b9c1..7097396 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -178,8 +178,31 @@ struct gendisk { struct blk_integrity *integrity; #endif int node_id; + + spinlock_t flush_time_lock; + u64 avg_flush_time_ns; + u64 flush_samples; }; +static inline void update_flush_times(struct gendisk *disk, u64 delta) +{ + u64 data; + + spin_lock(&disk->flush_time_lock); + if (disk->flush_samples < 256) + disk->flush_samples++; + if (!disk->avg_flush_time_ns) + disk->avg_flush_time_ns = delta; + else { + data = disk->avg_flush_time_ns; + data *= 255; + data += delta; + data /= 256; + disk->avg_flush_time_ns = data; + } + spin_unlock(&disk->flush_time_lock); +} + static inline struct gendisk *part_to_disk(struct hd_struct *part) { if (likely(part)) { -- To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html