On Thu, Aug 27, 2020 at 08:41:30PM -0600, Jens Axboe wrote: > We currently increment the task/vm counts when we first attempt to queue a > bio. But this isn't necessarily correct - if the request allocation fails > with -EAGAIN, for example, and the caller retries, then we'll over-account > by as many retries as are done. > > This can happen for polled IO, where we cannot wait for requests. Hence > retries can get aggressive, if we're running out of requests. If this > happens, then watching the IO rates in vmstat are incorrect as they count > every issue attempt as successful and hence the stats are inflated by > quite a lot potentially. > > Abstract out the accounting function, and move the blk-mq accounting into > blk_mq_make_request() when we know we got a request. For the non-mq path, > just do it when the bio is queued. > > Signed-off-by: Jens Axboe <axboe@xxxxxxxxx> > > --- > > diff --git a/block/blk-core.c b/block/blk-core.c > index d9d632639bd1..aabd016faf79 100644 > --- a/block/blk-core.c > +++ b/block/blk-core.c > @@ -28,7 +28,6 @@ > #include <linux/slab.h> > #include <linux/swap.h> > #include <linux/writeback.h> > -#include <linux/task_io_accounting_ops.h> > #include <linux/fault-inject.h> > #include <linux/list_sort.h> > #include <linux/delay.h> > @@ -1113,6 +1112,8 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) > struct bio_list bio_list_on_stack[2]; > blk_qc_t ret = BLK_QC_T_NONE; > > + blk_account_bio(bio); > + > BUG_ON(bio->bi_next); > > bio_list_init(&bio_list_on_stack[0]); > @@ -1232,35 +1233,6 @@ blk_qc_t submit_bio(struct bio *bio) > if (blkcg_punt_bio_submit(bio)) > return BLK_QC_T_NONE; > > - /* > - * If it's a regular read/write or a barrier with data attached, > - * go through the normal accounting stuff before submission. 
> - */ > - if (bio_has_data(bio)) { > - unsigned int count; > - > - if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) > - count = queue_logical_block_size(bio->bi_disk->queue) >> 9; > - else > - count = bio_sectors(bio); > - > - if (op_is_write(bio_op(bio))) { > - count_vm_events(PGPGOUT, count); > - } else { > - task_io_account_read(bio->bi_iter.bi_size); > - count_vm_events(PGPGIN, count); > - } > - > - if (unlikely(block_dump)) { > - char b[BDEVNAME_SIZE]; > - printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", > - current->comm, task_pid_nr(current), > - op_is_write(bio_op(bio)) ? "WRITE" : "READ", > - (unsigned long long)bio->bi_iter.bi_sector, > - bio_devname(bio, b), count); > - } > - } > - > /* > * If we're reading data that is part of the userspace workingset, count > * submission time as memory stall. When the device is congested, or > diff --git a/block/blk-mq.c b/block/blk-mq.c > index 0015a1892153..b77c66dfc19c 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -27,6 +27,7 @@ > #include <linux/crash_dump.h> > #include <linux/prefetch.h> > #include <linux/blk-crypto.h> > +#include <linux/task_io_accounting_ops.h> > > #include <trace/events/block.h> > > @@ -2111,6 +2112,39 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) > } > } > > +void blk_account_bio(struct bio *bio) > +{ > + unsigned int count; > + > + /* > + * If it's a regular read/write or a barrier with data attached, > + * go through the normal accounting stuff before submission. 
> + */ > + if (unlikely(!bio_has_data(bio))) > + return; > + > + if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) > + count = queue_logical_block_size(bio->bi_disk->queue) >> 9; > + else > + count = bio_sectors(bio); > + > + if (op_is_write(bio_op(bio))) { > + count_vm_events(PGPGOUT, count); > + } else { > + task_io_account_read(bio->bi_iter.bi_size); > + count_vm_events(PGPGIN, count); > + } > + > + if (unlikely(block_dump)) { > + char b[BDEVNAME_SIZE]; > + printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", > + current->comm, task_pid_nr(current), > + op_is_write(bio_op(bio)) ? "WRITE" : "READ", > + (unsigned long long)bio->bi_iter.bi_sector, > + bio_devname(bio, b), count); > + } > +} > + > /** > * blk_mq_submit_bio - Create and send a request to block device. > * @bio: Bio pointer. > @@ -2165,6 +2199,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) > goto queue_exit; > } > > + blk_account_bio(bio); A bio that is merged into the plug list or into the I/O scheduler will not be accounted this way, since those paths return before reaching this call. Thanks, Ming