On Fri, Nov 24, 2023 at 2:07 AM Jens Axboe <axboe@xxxxxxxxx> wrote:
>
> On 11/23/23 1:19 PM, Jens Axboe wrote:
> > On 11/23/23 11:12 AM, Kanchan Joshi wrote:
> >> On 11/23/2023 9:00 PM, Christoph Hellwig wrote:
> >>> The rest looks good, but that stats overhead seems pretty horrible..
> >>
> >> On my setup
> >> Before[1]: 7.06M
> >> After[2]: 8.29M
> >>
> >> [1]
> >> # taskset -c 2,3 t/io_uring -b512 -d256 -c32 -s32 -p1 -F1 -B1 -O0 -n2
> >> -u1 -r4 /dev/ng0n1 /dev/ng1n1
> >> submitter=0, tid=2076, file=/dev/ng0n1, node=-1
> >> submitter=1, tid=2077, file=/dev/ng1n1, node=-1
> >> polled=1, fixedbufs=1/0, register_files=1, buffered=1, QD=256
> >> Engine=io_uring, sq_ring=256, cq_ring=256
> >> polled=1, fixedbufs=1/0, register_files=1, buffered=1, QD=256
> >> Engine=io_uring, sq_ring=256, cq_ring=256
> >> IOPS=6.95M, BW=3.39GiB/s, IOS/call=32/31
> >> IOPS=7.06M, BW=3.45GiB/s, IOS/call=32/32
> >> IOPS=7.06M, BW=3.45GiB/s, IOS/call=32/31
> >> Exiting on timeout
> >> Maximum IOPS=7.06M
> >>
> >> [2]
> >> # taskset -c 2,3 t/io_uring -b512 -d256 -c32 -s32 -p1 -F1 -B1 -O0 -n2
> >> -u1 -r4 /dev/ng0n1 /dev/ng1n1
> >> submitter=0, tid=2123, file=/dev/ng0n1, node=-1
> >> submitter=1, tid=2124, file=/dev/ng1n1, node=-1
> >> polled=1, fixedbufs=1/0, register_files=1, buffered=1, QD=256
> >> Engine=io_uring, sq_ring=256, cq_ring=256
> >> IOPS=8.27M, BW=4.04GiB/s, IOS/call=32/31
> >> IOPS=8.29M, BW=4.05GiB/s, IOS/call=32/31
> >> IOPS=8.29M, BW=4.05GiB/s, IOS/call=31/31
> >> Exiting on timeout
> >> Maximum IOPS=8.29M
> >
> > It's all really down to how expensive getting the current time is on
> > your box; some will be better and some worse.
> >
> > One idea that has been bounced around in the past is to have a
> > blk_ktime_get_ns() and have it be something ala:
> >
> > u64 blk_ktime_get_ns(void)
> > {
> > 	struct blk_plug *plug = current->plug;
> >
> > 	if (!plug)
> > 		return ktime_get_ns();
> >
> > 	if (!plug->ktime_valid) {
> > 		plug->ktime = ktime_get_ns();
> > 		plug->ktime_valid = true;
> > 	}
> >
> > 	return plug->ktime;
> > }
> >
> > in freestyle form, with the idea being that we don't care about
> > granularity to the extent that we'd need a new stamp every time.
> >
> > If the task is scheduled out, the plug is flushed anyway, which should
> > invalidate the stamp. For preemption this isn't true iirc, so we'd need
> > some kind of blk_flush_plug_ts() or something for that case to
> > invalidate it.
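
As a rough sketch of what that invalidation hook could look like
(hypothetical, not part of any posted patch; it assumes the cur_ktime
field and bit0 valid-tag from the quick'n dirty patch below, and would
still need a call site wired into the scheduler's preemption path):

static inline void blk_flush_plug_ts(struct task_struct *tsk)
{
	struct blk_plug *plug = tsk->plug;

	/* clear bit0 so the next blk_ktime_get_ns() re-reads the clock */
	if (plug)
		plug->cur_ktime = 0;
}
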
> >
> > Hopefully this could then also get away from passing in a cached value
> > that we do in various spots, exactly because all of this time stamping
> > is expensive. It's also a bit of a game of whack-a-mole, as users get
> > added and distro kernels tend to turn on basically everything anyway.
>
> Did a quick'n dirty (see below), which recoups half of the lost
> performance for me. And my kernel deliberately doesn't enable all of the
> gunk in block/ that slows everything down, I suspect it'll be a bigger
> win for you percentage wise:
>
> IOPS=121.42M, BW=59.29GiB/s, IOS/call=31/31
> IOPS=121.47M, BW=59.31GiB/s, IOS/call=32/32
> IOPS=121.44M, BW=59.30GiB/s, IOS/call=31/31
> IOPS=121.47M, BW=59.31GiB/s, IOS/call=31/31
> IOPS=121.45M, BW=59.30GiB/s, IOS/call=32/32
> IOPS=119.95M, BW=58.57GiB/s, IOS/call=31/32
> IOPS=115.30M, BW=56.30GiB/s, IOS/call=32/31
> IOPS=115.38M, BW=56.34GiB/s, IOS/call=32/32
> IOPS=115.35M, BW=56.32GiB/s, IOS/call=32/32
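
How big the win is on a given box tracks how expensive reading the clock
is there, as noted above. Below is a hypothetical userspace proxy for that
cost; clock_gettime(CLOCK_MONOTONIC) is the userspace counterpart of the
kernel's ktime_get_ns(), so while the absolute in-kernel cost differs, the
relative per-machine expense shows up the same way:

#include <stdio.h>
#include <time.h>

static unsigned long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

int main(void)
{
	const int iters = 10 * 1000 * 1000;
	volatile unsigned long long sink = 0;
	unsigned long long start, elapsed;
	int i;

	start = now_ns();
	for (i = 0; i < iters; i++)
		sink += now_ns();	/* volatile sink keeps the calls live */
	elapsed = now_ns() - start;

	printf("~%.1f ns per clock_gettime() call\n", (double)elapsed / iters);
	return 0;
}
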
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index fdf25b8d6e78..6e74af442b94 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1055,6 +1055,7 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
>  	plug->rq_count = 0;
>  	plug->multiple_queues = false;
>  	plug->has_elevator = false;
> +	plug->cur_ktime = 0;
>  	INIT_LIST_HEAD(&plug->cb_list);
>
>  	/*
> diff --git a/block/blk-flush.c b/block/blk-flush.c
> index 3f4d41952ef2..e4e9f7eed346 100644
> --- a/block/blk-flush.c
> +++ b/block/blk-flush.c
> @@ -143,7 +143,7 @@ static void blk_account_io_flush(struct request *rq)
>  	part_stat_lock();
>  	part_stat_inc(part, ios[STAT_FLUSH]);
>  	part_stat_add(part, nsecs[STAT_FLUSH],
> -		      ktime_get_ns() - rq->start_time_ns);
> +		      blk_ktime_get_ns() - rq->start_time_ns);
>  	part_stat_unlock();
>  }
>
> diff --git a/block/blk-iocost.c b/block/blk-iocost.c
> index 089fcb9cfce3..d06cea625462 100644
> --- a/block/blk-iocost.c
> +++ b/block/blk-iocost.c
> @@ -829,7 +829,7 @@ static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk)
>
>  	/* step up/down based on the vrate */
>  	vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
> -	now_ns = ktime_get_ns();
> +	now_ns = blk_ktime_get_ns();
>
>  	if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
>  		if (!ioc->autop_too_fast_at)
> @@ -1044,7 +1044,7 @@ static void ioc_now(struct ioc *ioc, struct ioc_now *now)
>  	unsigned seq;
>  	u64 vrate;
>
> -	now->now_ns = ktime_get();
> +	now->now_ns = blk_ktime_get_ns();
>  	now->now = ktime_to_us(now->now_ns);
>  	vrate = atomic64_read(&ioc->vtime_rate);
>
> @@ -2810,7 +2810,7 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
>  		return;
>  	}
>
> -	on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
> +	on_q_ns = blk_ktime_get_ns() - rq->alloc_time_ns;
>  	rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
>  	size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 900c1be1fee1..9c96dee9e584 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -323,7 +323,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
>  	RB_CLEAR_NODE(&rq->rb_node);
>  	rq->tag = BLK_MQ_NO_TAG;
>  	rq->internal_tag = BLK_MQ_NO_TAG;
> -	rq->start_time_ns = ktime_get_ns();
> +	rq->start_time_ns = blk_ktime_get_ns();
>  	rq->part = NULL;
>  	blk_crypto_rq_set_defaults(rq);
>  }
> @@ -333,7 +333,7 @@ EXPORT_SYMBOL(blk_rq_init);
>  static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
>  {
>  	if (blk_mq_need_time_stamp(rq))
> -		rq->start_time_ns = ktime_get_ns();
> +		rq->start_time_ns = blk_ktime_get_ns();
>  	else
>  		rq->start_time_ns = 0;
>
> @@ -444,7 +444,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
>
>  	/* alloc_time includes depth and tag waits */
>  	if (blk_queue_rq_alloc_time(q))
> -		alloc_time_ns = ktime_get_ns();
> +		alloc_time_ns = blk_ktime_get_ns();
>
>  	if (data->cmd_flags & REQ_NOWAIT)
>  		data->flags |= BLK_MQ_REQ_NOWAIT;
> @@ -629,7 +629,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
>
>  	/* alloc_time includes depth and tag waits */
>  	if (blk_queue_rq_alloc_time(q))
> -		alloc_time_ns = ktime_get_ns();
> +		alloc_time_ns = blk_ktime_get_ns();
>
>  	/*
>  	 * If the tag allocator sleeps we could get an allocation for a
> @@ -1037,7 +1037,7 @@ static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
>  inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
>  {
>  	if (blk_mq_need_time_stamp(rq))
> -		__blk_mq_end_request_acct(rq, ktime_get_ns());
> +		__blk_mq_end_request_acct(rq, blk_ktime_get_ns());
>
>  	blk_mq_finish_request(rq);
>
> @@ -1080,7 +1080,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
>  	u64 now = 0;
>
>  	if (iob->need_ts)
> -		now = ktime_get_ns();
> +		now = blk_ktime_get_ns();
>
>  	while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
>  		prefetch(rq->bio);
> @@ -1249,7 +1249,7 @@ void blk_mq_start_request(struct request *rq)
>  	trace_block_rq_issue(rq);
>
>  	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
> -		rq->io_start_time_ns = ktime_get_ns();
> +		rq->io_start_time_ns = blk_ktime_get_ns();
>  		rq->stats_sectors = blk_rq_sectors(rq);
>  		rq->rq_flags |= RQF_STATS;
>  		rq_qos_issue(q, rq);
> @@ -3066,7 +3066,7 @@ blk_status_t blk_insert_cloned_request(struct request *rq)
>  	blk_mq_run_dispatch_ops(q,
>  			ret = blk_mq_request_issue_directly(rq, true));
>  	if (ret)
> -		blk_account_io_done(rq, ktime_get_ns());
> +		blk_account_io_done(rq, blk_ktime_get_ns());
>  	return ret;
>  }
>  EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
> diff --git a/block/blk-throttle.c b/block/blk-throttle.c
> index 13e4377a8b28..5919919ba1c3 100644
> --- a/block/blk-throttle.c
> +++ b/block/blk-throttle.c
> @@ -1813,7 +1813,7 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg)
>  	time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold);
>  	ret = tg->latency_target == DFL_LATENCY_TARGET ||
>  	      tg->idletime_threshold == DFL_IDLE_THRESHOLD ||
> -	      (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
> +	      (blk_ktime_get_ns() >> 10) - tg->last_finish_time > time ||
>  	      tg->avg_idletime > tg->idletime_threshold ||
>  	      (tg->latency_target && tg->bio_cnt &&
>  		tg->bad_bio_cnt * 5 < tg->bio_cnt);
> @@ -2058,7 +2058,7 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
>  	if (last_finish_time == 0)
>  		return;
>
> -	now = ktime_get_ns() >> 10;
> +	now = blk_ktime_get_ns() >> 10;
>  	if (now <= last_finish_time ||
>  	    last_finish_time == tg->checked_last_finish_time)
>  		return;
> @@ -2325,7 +2325,7 @@ void blk_throtl_bio_endio(struct bio *bio)
>  	if (!tg->td->limit_valid[LIMIT_LOW])
>  		return;
>
> -	finish_time_ns = ktime_get_ns();
> +	finish_time_ns = blk_ktime_get_ns();
>  	tg->last_finish_time = finish_time_ns >> 10;
>
>  	start_time = bio_issue_time(&bio->bi_issue) >> 10;
> diff --git a/block/blk.h b/block/blk.h
> index 08a358bc0919..4f081a00e644 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -518,4 +518,17 @@ static inline int req_ref_read(struct request *req)
>  	return atomic_read(&req->ref);
>  }
>
> +static inline u64 blk_ktime_get_ns(void)
> +{
> +	struct blk_plug *plug = current->plug;
> +
> +	if (!plug)
> +		return ktime_get_ns();
> +	if (!(plug->cur_ktime & 1ULL)) {
> +		plug->cur_ktime = ktime_get_ns();
> +		plug->cur_ktime |= 1ULL;
> +	}
> +	return plug->cur_ktime;
> +}
> +
>  #endif /* BLK_INTERNAL_H */
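
A note on the encoding in blk_ktime_get_ns() above: validity is tagged in
bit0 of the stamp itself rather than in a separate flag, so the
plug->cur_ktime = 0 from blk_start_plug_nr_ios() naturally reads as
"nothing cached", and forcing bit0 on skews the cached time by at most
1 ns. A hypothetical standalone illustration of the trick:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t cur_ktime = 0;		/* as blk_start_plug_nr_ios() leaves it */
	uint64_t now = 123456788ULL;	/* stand-in for a ktime_get_ns() reading */

	assert(!(cur_ktime & 1ULL));	/* bit0 clear: no valid stamp cached */

	cur_ktime = now | 1ULL;		/* cache the stamp and tag it valid */
	assert(cur_ktime - now <= 1);	/* tagging costs at most 1 ns of skew */
	return 0;
}
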
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 51fa7ffdee83..081830279f70 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -972,6 +972,9 @@ struct blk_plug {
>  	bool multiple_queues;
>  	bool has_elevator;
>
> +	/* bit0 set if valid */
> +	u64 cur_ktime;
> +
>  	struct list_head cb_list;	/* md requires an unplug callback */
>  };
>
>
> --
> Jens Axboe

Hi Jens,

This is what I see with your changes on my setup.

Before[1]: 7.06M
After[2]: 7.52M

[1]
# taskset -c 2,3 t/io_uring -b512 -d256 -c32 -s32 -p1 -F1 -B1 -O0 -n2 \
	-u1 -r4 /dev/ng0n1 /dev/ng1n1
submitter=0, tid=2076, file=/dev/ng0n1, node=-1
submitter=1, tid=2077, file=/dev/ng1n1, node=-1
polled=1, fixedbufs=1/0, register_files=1, buffered=1, QD=256
Engine=io_uring, sq_ring=256, cq_ring=256
polled=1, fixedbufs=1/0, register_files=1, buffered=1, QD=256
Engine=io_uring, sq_ring=256, cq_ring=256
IOPS=6.95M, BW=3.39GiB/s, IOS/call=32/31
IOPS=7.06M, BW=3.45GiB/s, IOS/call=32/32
IOPS=7.06M, BW=3.45GiB/s, IOS/call=32/31
Exiting on timeout
Maximum IOPS=7.06M

[2]
# taskset -c 2,3 t/io_uring -b512 -d256 -c32 -s32 -p1 -F1 -B1 -O0 -n2 \
	-u1 -r4 /dev/ng0n1 /dev/ng1n1
submitter=0, tid=2204, file=/dev/ng0n1, node=-1
submitter=1, tid=2205, file=/dev/ng1n1, node=-1
polled=1, fixedbufs=1/0, register_files=1, buffered=1, QD=256
Engine=io_uring, sq_ring=256, cq_ring=256
IOPS=7.40M, BW=3.62GiB/s, IOS/call=32/31
IOPS=7.51M, BW=3.67GiB/s, IOS/call=32/31
IOPS=7.52M, BW=3.67GiB/s, IOS/call=32/32
Exiting on timeout
Maximum IOPS=7.52M

The original patch avoids processing throttle stats and wbt_issue/done
stats for the passthrough I/O path; improvement with the original patch:
7.06M -> 8.29M.

The two optimizations are different: the original patch completely
disables stats for passthrough I/O, while your changes make fetching the
current time cheaper, which improves performance for everyone. I think
they are independent of each other.

--
Kundan Kumar