In order to improve the sequentiality of split io, this patch disables tag preemption for the first split bios and other non-split bios if the device is under high io pressure. Note that this solution relies on the waitqueues of sbitmap being balanced; otherwise it may happen that 'wake_batch' tags are freed and the woken waiters don't obtain 'wake_batch' new tags, thus io concurrency will decrease. The next patch will avoid this problem; however, fixing the unfairness of the waitqueues might be better. Signed-off-by: Yu Kuai <yukuai3@xxxxxxxxxx> --- block/blk-merge.c | 7 ++++++- block/blk-mq-tag.c | 37 ++++++++++++++++++++++++++----------- block/blk-mq.c | 6 ++++++ block/blk-mq.h | 1 + include/linux/blk_types.h | 2 ++ lib/sbitmap.c | 14 ++++++++++---- 6 files changed, 51 insertions(+), 16 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 340860746cac..fd4bbf773b45 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -357,6 +357,11 @@ static unsigned short blk_queue_split_all(struct request_queue *q, if (!first) first = split; + /* + * Except the first split bio, others will always preempt + * tag, so that they can be sequential. 
+ */ + split->bi_opf |= REQ_PREEMPTIVE; nr_split++; submit_bio_noacct(split); } @@ -387,7 +392,7 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio) if (split) { split->bi_nr_split = blk_queue_split_all(q, *bio); - (*bio)->bi_opf |= REQ_SPLIT; + (*bio)->bi_opf |= (REQ_SPLIT | REQ_PREEMPTIVE); submit_bio_noacct(*bio); *bio = split; } diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 83dfbe2f1cfc..4e485bcc5820 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -127,6 +127,13 @@ unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, return ret; } +static inline bool preempt_tag(struct blk_mq_alloc_data *data, + struct sbitmap_queue *bt) +{ + return data->preemption || + atomic_read(&bt->ws_active) <= SBQ_WAIT_QUEUES; +} + unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); @@ -148,12 +155,14 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) tag_offset = tags->nr_reserved_tags; } - tag = __blk_mq_get_tag(data, bt); - if (tag != BLK_MQ_NO_TAG) - goto found_tag; + if (data->flags & BLK_MQ_REQ_NOWAIT || preempt_tag(data, bt)) { + tag = __blk_mq_get_tag(data, bt); + if (tag != BLK_MQ_NO_TAG) + goto found_tag; - if (data->flags & BLK_MQ_REQ_NOWAIT) - return BLK_MQ_NO_TAG; + if (data->flags & BLK_MQ_REQ_NOWAIT) + return BLK_MQ_NO_TAG; + } wait.nr_tags += data->nr_split; ws = bt_wait_ptr(bt, data->hctx); @@ -171,20 +180,26 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) * Retry tag allocation after running the hardware queue, * as running the queue may also have found completions. 
*/ - tag = __blk_mq_get_tag(data, bt); - if (tag != BLK_MQ_NO_TAG) - break; + if (preempt_tag(data, bt)) { + tag = __blk_mq_get_tag(data, bt); + if (tag != BLK_MQ_NO_TAG) + break; + } sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE); - tag = __blk_mq_get_tag(data, bt); - if (tag != BLK_MQ_NO_TAG) - break; + if (preempt_tag(data, bt)) { + tag = __blk_mq_get_tag(data, bt); + if (tag != BLK_MQ_NO_TAG) + break; + } bt_prev = bt; io_schedule(); sbitmap_finish_wait(bt, ws, &wait); + if (!blk_mq_is_tag_preemptive(data->hctx->flags)) + data->preemption = true; data->ctx = blk_mq_get_ctx(data->q); data->hctx = blk_mq_map_queue(data->q, data->cmd_flags, diff --git a/block/blk-mq.c b/block/blk-mq.c index 9bace9e2c5ca..06ba6fa9ec1a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -470,6 +470,9 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) retry: data->ctx = blk_mq_get_ctx(q); data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); + if (blk_mq_is_tag_preemptive(data->hctx->flags)) + data->preemption = true; + if (!(data->rq_flags & RQF_ELV)) blk_mq_tag_busy(data->hctx); @@ -577,6 +580,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, data.hctx = xa_load(&q->hctx_table, hctx_idx); if (!blk_mq_hw_queue_mapped(data.hctx)) goto out_queue_exit; + if (blk_mq_is_tag_preemptive(data.hctx->flags)) + data.preemption = true; cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); data.ctx = __blk_mq_get_ctx(q, cpu); @@ -2738,6 +2743,7 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, .nr_tags = 1, .cmd_flags = bio->bi_opf, .nr_split = bio->bi_nr_split, + .preemption = (bio->bi_opf & REQ_PREEMPTIVE), }; struct request *rq; diff --git a/block/blk-mq.h b/block/blk-mq.h index 3eabe394a5a9..915bb710dd6f 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -157,6 +157,7 @@ struct blk_mq_alloc_data { /* allocate multiple requests/tags in one go */ unsigned int nr_tags; unsigned int 
nr_split; + bool preemption; struct request **cached_rq; /* input & output parameter */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 702f6b83dc88..8fd9756f0a06 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -419,6 +419,7 @@ enum req_flag_bits { __REQ_DRV, __REQ_SWAP, /* swapping request. */ __REQ_SPLIT, /* io is splitted */ + __REQ_PREEMPTIVE, /* io can preempt tag */ __REQ_NR_BITS, /* stops here */ }; @@ -444,6 +445,7 @@ enum req_flag_bits { #define REQ_DRV (1ULL << __REQ_DRV) #define REQ_SWAP (1ULL << __REQ_SWAP) #define REQ_SPLIT (1ULL << __REQ_SPLIT) +#define REQ_PREEMPTIVE (1ULL << __REQ_PREEMPTIVE) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 9d04c0ecc8f7..1655c15ee11d 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -597,7 +597,8 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) return NULL; } -static unsigned int get_wake_nr(struct sbq_wait_state *ws, unsigned int nr_tags) +static unsigned int get_wake_nr(struct sbq_wait_state *ws, + unsigned int *nr_tags) { struct sbq_wait *wait; struct wait_queue_entry *entry; @@ -606,11 +607,13 @@ static unsigned int get_wake_nr(struct sbq_wait_state *ws, unsigned int nr_tags) spin_lock_irq(&ws->wait.lock); list_for_each_entry(entry, &ws->wait.head, entry) { wait = container_of(entry, struct sbq_wait, wait); - if (nr_tags <= wait->nr_tags) + if (*nr_tags <= wait->nr_tags) { + *nr_tags = 0; break; + } nr++; - nr_tags -= wait->nr_tags; + *nr_tags -= wait->nr_tags; } spin_unlock_irq(&ws->wait.lock); @@ -648,7 +651,10 @@ static bool __sbq_wake_up(struct sbitmap_queue *sbq) ret = atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wake_batch); if (ret == wait_cnt) { sbq_index_atomic_inc(&sbq->wake_index); - wake_up_nr(&ws->wait, get_wake_nr(ws, wake_batch)); + wake_up_nr(&ws->wait, get_wake_nr(ws, &wake_batch)); + if (wake_batch) + sbitmap_queue_wake_all(sbq); + 
return false; } -- 2.31.1