Re: [RFC PATCH v2] blk-mq: Introduce per sw queue time-slice

Andreas Herrmann <aherrmann@xxxxxxxx> · Tue, 9 Feb 2016 18:12:39 +0100

[CC-ing linux-block and linux-scsi and adding some comments]

On Mon, Feb 01, 2016 at 11:43:40PM +0100, Andreas Herrmann wrote:
> This introduces a new blk_mq hw attribute time_slice_us which allows
> to specify a time slice in usecs.
> 
> Default value is 0 and implies no modification to blk-mq behaviour.
> 
> A positive value changes blk-mq to service only one software queue
> within this time slice until it expires or the software queue is
> empty. Then the next software queue with pending requests is selected.
> 
> Signed-off-by: Andreas Herrmann <aherrmann@xxxxxxxx>
> ---
>  block/blk-mq-sysfs.c   |  27 +++++++
>  block/blk-mq.c         | 208 +++++++++++++++++++++++++++++++++++++++++--------
>  include/linux/blk-mq.h |   9 +++
>  3 files changed, 211 insertions(+), 33 deletions(-)
> 
> Hi,
> 
> This update is long overdue (sorry for the delay).
> 
> Change to v1:
> - time slice is now specified in usecs instead of msecs.
> - time slice is extended (up to 3 times the initial value) when there
>   was actually a request to be serviced for the software queue
> 
> Fio test results are sent in a separate mail to this.

See http://marc.info/?l=linux-kernel&m=145436682607949&w=2

In short it shows significant performance gains in some tests,
e.g. sequential read iops up by >40% with 8 jobs. But it's never on
par with CFQ when more than 1 job was used during the test.

> Results for fio improved to some extent with this patch. But in
> reality the picture is quite mixed. Performance is highly dependend on
> task scheduling. There is no guarantee that the requests originated
> from one CPU belong to the same process.
> 
> I think for rotary devices CFQ is by far the best choice. A simple
> illustration is:
> 
>   Copying two files (750MB in this case) in parallel on a rotary
>   device. The elapsed wall clock time (seconds) for this is
>                                mean    stdev
>    cfq, slice_idle=8           16.18   4.95
>    cfq, slice_idle=0           23.74   2.82
>    blk-mq, time_slice_usec=0   24.37   2.05
>    blk-mq, time_slice_usec=250 25.58   3.16

This illustrates that although their was performance gain with fio
tests, the patch can cause higher variance and lower performance in
comparison to unmodified blk-mq with other tests. And it underscores
superiority of CFQ for rotary disks.

Meanwhile my opinion is that it's not really worth to look further
into introduction of I/O scheduling support in blk-mq. I don't see the
need for scheduling support (deadline or something else) for fast
storage devices. And rotary devices should really avoid usage of blk-mq
and stick to CFQ.

Thus I think that introducing some coexistence of blk-mq and the
legacy block with CFQ is the best option.

Recently Johannes sent a patch to enable scsi-mq per driver, see
http://marc.info/?l=linux-scsi&m=145347009631192&w=2

Probably that is a good solution (at least in the short term) to allow
users to switch to blk-mq for some host adapters (with fast storage
attached) but to stick to legacy stuff on other host adapters with
rotary devices.

What do others think?

Thanks,

Andreas

> Regards,
> 
> Andreas
> 
> diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
> index 1cf1878..77c875c 100644
> --- a/block/blk-mq-sysfs.c
> +++ b/block/blk-mq-sysfs.c
> @@ -247,6 +247,26 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
>  	return ret;
>  }
>  
> +static ssize_t blk_mq_hw_sysfs_tslice_show(struct blk_mq_hw_ctx *hctx,
> +					  char *page)
> +{
> +	return sprintf(page, "%u\n", hctx->tslice_us);
> +}
> +
> +static ssize_t blk_mq_hw_sysfs_tslice_store(struct blk_mq_hw_ctx *hctx,
> +					const char *page, size_t length)
> +{
> +	unsigned long long store;
> +	int err;
> +
> +	err = kstrtoull(page, 10, &store);
> +	if (err)
> +		return -EINVAL;
> +
> +	hctx->tslice_us = (unsigned)store;
> +	return length;
> +}
> +
>  static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
>  	.attr = {.name = "dispatched", .mode = S_IRUGO },
>  	.show = blk_mq_sysfs_dispatched_show,
> @@ -305,6 +325,12 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = {
>  	.show = blk_mq_hw_sysfs_poll_show,
>  };
>  
> +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tslice = {
> +	.attr = {.name = "time_slice_us", .mode = S_IRUGO | S_IWUSR },
> +	.show = blk_mq_hw_sysfs_tslice_show,
> +	.store = blk_mq_hw_sysfs_tslice_store,
> +};
> +
>  static struct attribute *default_hw_ctx_attrs[] = {
>  	&blk_mq_hw_sysfs_queued.attr,
>  	&blk_mq_hw_sysfs_run.attr,
> @@ -314,6 +340,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
>  	&blk_mq_hw_sysfs_cpus.attr,
>  	&blk_mq_hw_sysfs_active.attr,
>  	&blk_mq_hw_sysfs_poll.attr,
> +	&blk_mq_hw_sysfs_tslice.attr,
>  	NULL,
>  };
>  
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 4c0622f..97d32f2 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -68,6 +68,7 @@ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
>  
>  	if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
>  		set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
> +	cpumask_set_cpu(ctx->cpu, hctx->cpu_pending_mask);
>  }
>  
>  static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
> @@ -76,6 +77,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
>  	struct blk_align_bitmap *bm = get_bm(hctx, ctx);
>  
>  	clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
> +	cpumask_clear_cpu(ctx->cpu, hctx->cpu_pending_mask);
>  }
>  
>  void blk_mq_freeze_queue_start(struct request_queue *q)
> @@ -682,15 +684,41 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
>  	return false;
>  }
>  
> +static int tslice_flush_one_ctx(struct blk_mq_hw_ctx *hctx,
> +				struct list_head *list,	int cpu)
> +{
> +	struct request_queue *q = hctx->queue;
> +	struct blk_mq_ctx *ctx;
> +
> +	if (cpu == -1 || !hctx->tslice_us)
> +		return 0;
> +
> +	ctx = __blk_mq_get_ctx(q, cpu);
> +	spin_lock(&ctx->lock);
> +	if (!list_empty(&ctx->rq_list)) {
> +		list_splice_tail_init(&ctx->rq_list, list);
> +		spin_lock(&hctx->lock);
> +		hctx->tslice_inc = 1;
> +		blk_mq_hctx_clear_pending(hctx, ctx);
> +		spin_unlock(&hctx->lock);
> +	}
> +	spin_unlock(&ctx->lock);
> +	return 1;
> +}
> +
>  /*
>   * Process software queues that have been marked busy, splicing them
>   * to the for-dispatch
>   */
> -static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
> +static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx,
> +			struct list_head *list,	int cpu)
>  {
>  	struct blk_mq_ctx *ctx;
>  	int i;
>  
> +	if (tslice_flush_one_ctx(hctx, list, cpu))
> +		return;
> +
>  	for (i = 0; i < hctx->ctx_map.size; i++) {
>  		struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
>  		unsigned int off, bit;
> @@ -706,9 +734,11 @@ static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
>  				break;
>  
>  			ctx = hctx->ctxs[bit + off];
> -			clear_bit(bit, &bm->word);
>  			spin_lock(&ctx->lock);
>  			list_splice_tail_init(&ctx->rq_list, list);
> +			spin_lock(&hctx->lock);
> +			blk_mq_hctx_clear_pending(hctx, ctx);
> +			spin_unlock(&hctx->lock);
>  			spin_unlock(&ctx->lock);
>  
>  			bit++;
> @@ -717,6 +747,114 @@ static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
>  }
>  
>  /*
> + * It'd be great if the workqueue API had a way to pass
> + * in a mask and had some smarts for more clever placement.
> + * For now we just round-robin here, switching for every
> + * BLK_MQ_CPU_WORK_BATCH queued items.
> + */
> +static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
> +{
> +	if (hctx->queue->nr_hw_queues == 1)
> +		return WORK_CPU_UNBOUND;
> +
> +	if (--hctx->next_cpu_batch <= 0) {
> +		int cpu = hctx->next_cpu, next_cpu;
> +
> +		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
> +		if (next_cpu >= nr_cpu_ids)
> +			next_cpu = cpumask_first(hctx->cpumask);
> +
> +		hctx->next_cpu = next_cpu;
> +		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
> +
> +		return cpu;
> +	}
> +
> +	return hctx->next_cpu;
> +}
> +
> +static int blk_mq_tslice_expired(struct blk_mq_hw_ctx *hctx)
> +{
> +	if (time_after_eq(jiffies, hctx->tslice_expiration))
> +		return 1;
> +
> +	return 0;
> +}
> +
> +static int blk_mq_tslice_next_cpu(struct blk_mq_hw_ctx *hctx)
> +{
> +	int c = hctx->tslice_cpu;
> +
> +	/* allow extension of time slice */
> +	if (hctx->tslice_inc && hctx->tslice_inc_count < 4) {
> +		hctx->tslice_inc = 0;
> +		hctx->tslice_inc_count++;
> +		goto out;
> +	}
> +	hctx->tslice_inc = 0;
> +	hctx->tslice_inc_count = 0;
> +
> +	/* clear CPU for which tslice has expired */
> +	if (c != -1)
> +		cpumask_clear_cpu(c, hctx->cpu_service_mask);
> +
> +	/* calculate mask of remaining CPUs with pending work */
> +	if (cpumask_and(hctx->cpu_next_mask, hctx->cpu_pending_mask,
> +				hctx->cpu_service_mask)) {
> +		c = cpumask_first(hctx->cpu_next_mask);
> +	} else {
> +		/*
> +		 * no remaining CPUs with pending work, reset epoch,
> +		 * start with first CPU that has requests pending
> +		 */
> +		hctx->tslice_expiration = 0;
> +		cpumask_setall(hctx->cpu_service_mask);
> +		c = cpumask_first(hctx->cpu_pending_mask);
> +	}
> +
> +	/* no CPU with pending requests */
> +	if (c >= nr_cpu_ids)
> +		return -1;
> +
> +out:
> +	hctx->tslice_expiration = jiffies + usecs_to_jiffies(hctx->tslice_us);
> +	return c;
> +}
> +
> +static int tslice_get_busy_ctx(struct blk_mq_hw_ctx *hctx)
> +{
> +	int cpu = -1;
> +
> +	if (hctx->tslice_us) {
> +		spin_lock(&hctx->lock);
> +		if (blk_mq_tslice_expired(hctx))
> +			hctx->tslice_cpu = blk_mq_tslice_next_cpu(hctx);
> +		cpu = hctx->tslice_cpu;
> +		spin_unlock(&hctx->lock);
> +	}
> +
> +	return cpu;
> +}
> +
> +/*
> + * Ensure that hardware queue is run if there are pending
> + * requests in any software queue.
> + */
> +static void tslice_schedule_pending_work(struct blk_mq_hw_ctx *hctx)
> +{
> +	long t;
> +
> +	if (!hctx->tslice_us || cpumask_empty(hctx->cpu_pending_mask))
> +		return;
> +
> +	t = hctx->tslice_expiration - jiffies;
> +	if (t < 0)
> +		t = 0;
> +	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
> +					&hctx->run_work, (unsigned int)t);
> +}
> +
> +/*
>   * Run this hardware queue, pulling any software queues mapped to it in.
>   * Note that this function currently has various problems around ordering
>   * of IO. In particular, we'd like FIFO behaviour on handling existing
> @@ -729,7 +867,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
>  	LIST_HEAD(rq_list);
>  	LIST_HEAD(driver_list);
>  	struct list_head *dptr;
> -	int queued;
> +	int queued, cpu;
>  
>  	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
>  
> @@ -739,9 +877,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
>  	hctx->run++;
>  
>  	/*
> -	 * Touch any software queue that has pending entries.
> +	 * Touch dedicated software queue if time slice is set or any
> +	 * software queue that has pending entries (cpu == -1).
>  	 */
> -	flush_busy_ctxs(hctx, &rq_list);
> +	cpu = tslice_get_busy_ctx(hctx);
> +	flush_busy_ctxs(hctx, &rq_list, cpu);
>  
>  	/*
>  	 * If we have previous entries on our dispatch list, grab them
> @@ -826,34 +966,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
>  		 * blk_mq_run_hw_queue() already checks the STOPPED bit
>  		 **/
>  		blk_mq_run_hw_queue(hctx, true);
> -	}
> -}
> -
> -/*
> - * It'd be great if the workqueue API had a way to pass
> - * in a mask and had some smarts for more clever placement.
> - * For now we just round-robin here, switching for every
> - * BLK_MQ_CPU_WORK_BATCH queued items.
> - */
> -static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
> -{
> -	if (hctx->queue->nr_hw_queues == 1)
> -		return WORK_CPU_UNBOUND;
> -
> -	if (--hctx->next_cpu_batch <= 0) {
> -		int cpu = hctx->next_cpu, next_cpu;
> -
> -		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
> -		if (next_cpu >= nr_cpu_ids)
> -			next_cpu = cpumask_first(hctx->cpumask);
> -
> -		hctx->next_cpu = next_cpu;
> -		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
> -
> -		return cpu;
> -	}
> -
> -	return hctx->next_cpu;
> +	} else
> +		tslice_schedule_pending_work(hctx);
>  }
>  
>  void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
> @@ -992,7 +1106,10 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
>  	struct blk_mq_ctx *ctx = rq->mq_ctx;
>  
>  	__blk_mq_insert_req_list(hctx, ctx, rq, at_head);
> +	spin_lock(&hctx->lock);
>  	blk_mq_hctx_mark_pending(hctx, ctx);
> +	spin_unlock(&hctx->lock);
> +
>  }
>  
>  void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
> @@ -1583,7 +1700,9 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
>  	spin_lock(&ctx->lock);
>  	if (!list_empty(&ctx->rq_list)) {
>  		list_splice_init(&ctx->rq_list, &tmp);
> +		spin_lock(&hctx->lock);
>  		blk_mq_hctx_clear_pending(hctx, ctx);
> +		spin_unlock(&hctx->lock);
>  	}
>  	spin_unlock(&ctx->lock);
>  
> @@ -1602,7 +1721,9 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
>  	}
>  
>  	hctx = q->mq_ops->map_queue(q, ctx->cpu);
> +	spin_lock(&hctx->lock);
>  	blk_mq_hctx_mark_pending(hctx, ctx);
> +	spin_unlock(&hctx->lock);
>  
>  	spin_unlock(&ctx->lock);
>  
> @@ -1691,6 +1812,12 @@ static int blk_mq_init_hctx(struct request_queue *q,
>  	hctx->queue_num = hctx_idx;
>  	hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
>  
> +	hctx->tslice_us = 0;
> +	hctx->tslice_expiration = 0;
> +	hctx->tslice_cpu = -1;
> +	hctx->tslice_inc = 0;
> +	hctx->tslice_inc_count = 0;
> +
>  	blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
>  					blk_mq_hctx_notify, hctx);
>  	blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
> @@ -2006,6 +2133,18 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>  						node))
>  			goto err_hctxs;
>  
> +		if (!zalloc_cpumask_var_node(&hctxs[i]->cpu_pending_mask,
> +						GFP_KERNEL, node))
> +			goto err_hctxs;
> +
> +		if (!zalloc_cpumask_var_node(&hctxs[i]->cpu_service_mask,
> +						GFP_KERNEL, node))
> +			goto err_hctxs;
> +
> +		if (!zalloc_cpumask_var_node(&hctxs[i]->cpu_next_mask,
> +						GFP_KERNEL, node))
> +			goto err_hctxs;
> +
>  		atomic_set(&hctxs[i]->nr_active, 0);
>  		hctxs[i]->numa_node = node;
>  		hctxs[i]->queue_num = i;
> @@ -2069,6 +2208,9 @@ err_hctxs:
>  		if (!hctxs[i])
>  			break;
>  		free_cpumask_var(hctxs[i]->cpumask);
> +		free_cpumask_var(hctxs[i]->cpu_pending_mask);
> +		free_cpumask_var(hctxs[i]->cpu_service_mask);
> +		free_cpumask_var(hctxs[i]->cpu_next_mask);
>  		kfree(hctxs[i]);
>  	}
>  err_map:
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 7fc9296..a8ca685 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -57,6 +57,15 @@ struct blk_mq_hw_ctx {
>  
>  	atomic_t		nr_active;
>  
> +	cpumask_var_t		cpu_service_mask; /* CPUs not yet serviced */
> +	cpumask_var_t		cpu_pending_mask; /* CPUs with pending work */
> +	cpumask_var_t		cpu_next_mask;	/* CPUs to be serviced */
> +	int			tslice_us;	/* time slice in µs */
> +	int			tslice_inc;	/* extend time slice */
> +	int			tslice_inc_count;
> +	unsigned long		tslice_expiration; /* in jiffies */
> +	int			tslice_cpu;	/* cpu of time slice */
> +
>  	struct blk_mq_cpu_notifier	cpu_notifier;
>  	struct kobject		kobj;
>  
> -- 
> 1.9.1
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html