On Thu, Oct 27, 2016 at 6:53 AM, Bart Van Assche <bart.vanassche@xxxxxxxxxxx> wrote:
> blk_mq_quiesce_queue() waits until ongoing .queue_rq() invocations
> have finished. This function does *not* wait until all outstanding
> requests have finished (this means invocation of request.end_io()).
> The algorithm used by blk_mq_quiesce_queue() is as follows:
> * Hold either an RCU read lock or an SRCU read lock around
>   .queue_rq() calls. The former is used if .queue_rq() does not
>   block and the latter if .queue_rq() may block.
> * blk_mq_quiesce_queue() calls synchronize_srcu() or
>   synchronize_rcu() to wait for .queue_rq() invocations that
>   started before blk_mq_quiesce_queue() was called.
> * The blk_mq_hctx_stopped() calls that control whether or not
>   .queue_rq() will be called are called with the (S)RCU read lock
>   held. This is necessary to avoid race conditions against
>   the "blk_mq_stop_hw_queues(q); blk_mq_quiesce_queue(q);"
>   sequence from another thread.
>
> Signed-off-by: Bart Van Assche <bart.vanassche@xxxxxxxxxxx>
> Cc: Christoph Hellwig <hch@xxxxxx>
> Cc: Ming Lei <tom.leiming@xxxxxxxxx>
> Cc: Hannes Reinecke <hare@xxxxxxxx>
> Cc: Johannes Thumshirn <jthumshirn@xxxxxxx>
> ---
>  block/Kconfig          |  1 +
>  block/blk-mq.c         | 69 +++++++++++++++++++++++++++++++++++++++++++++-----
>  include/linux/blk-mq.h |  3 +++
>  include/linux/blkdev.h |  1 +
>  4 files changed, 67 insertions(+), 7 deletions(-)
>
> diff --git a/block/Kconfig b/block/Kconfig
> index 1d4d624..0562ef9 100644
> --- a/block/Kconfig
> +++ b/block/Kconfig
> @@ -5,6 +5,7 @@ menuconfig BLOCK
>  	bool "Enable the block layer" if EXPERT
>  	default y
>  	select SBITMAP
> +	select SRCU
>  	help
>  	  Provide block layer support for the kernel.
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 0cf21c2..4945437 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -115,6 +115,31 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
>  }
>  EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
>
> +/**
> + * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have finished
> + * @q: request queue.
> + *
> + * Note: this function does not prevent that the struct request end_io()
> + * callback function is invoked. Additionally, it is not prevented that
> + * new queue_rq() calls occur unless the queue has been stopped first.
> + */
> +void blk_mq_quiesce_queue(struct request_queue *q)
> +{
> +	struct blk_mq_hw_ctx *hctx;
> +	unsigned int i;
> +	bool rcu = false;

Before synchronizing SRCU/RCU, we have to set a per-hctx flag or a
per-queue flag to block incoming .queue_rq() calls; it seems I
mentioned that before:

https://www.spinics.net/lists/linux-rdma/msg41389.html
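
Something along the following lines is what I mean. This is only an
untested sketch; QUEUE_FLAG_QUIESCED is a made-up flag name used
purely to illustrate the idea:

	void blk_mq_quiesce_queue(struct request_queue *q)
	{
		struct blk_mq_hw_ctx *hctx;
		unsigned int i;
		bool rcu = false;

		/* Block .queue_rq() calls that start after this point. */
		spin_lock_irq(q->queue_lock);
		queue_flag_set(QUEUE_FLAG_QUIESCED, q);
		spin_unlock_irq(q->queue_lock);

		/*
		 * Wait for .queue_rq() calls that started before the
		 * flag was set.
		 */
		queue_for_each_hw_ctx(q, hctx, i) {
			if (hctx->flags & BLK_MQ_F_BLOCKING)
				synchronize_srcu(&hctx->queue_rq_srcu);
			else
				rcu = true;
		}
		if (rcu)
			synchronize_rcu();
	}
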
> +
> +	queue_for_each_hw_ctx(q, hctx, i) {
> +		if (hctx->flags & BLK_MQ_F_BLOCKING)
> +			synchronize_srcu(&hctx->queue_rq_srcu);
> +		else
> +			rcu = true;
> +	}
> +	if (rcu)
> +		synchronize_rcu();
> +}
> +EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
> +
>  void blk_mq_wake_waiters(struct request_queue *q)
>  {
>  	struct blk_mq_hw_ctx *hctx;
> @@ -778,7 +803,7 @@ static inline unsigned int queued_to_index(unsigned int queued)
>   * of IO. In particular, we'd like FIFO behaviour on handling existing
>   * items on the hctx->dispatch list. Ignore that for now.
>   */
> -static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
> +static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
>  {
>  	struct request_queue *q = hctx->queue;
>  	struct request *rq;
> @@ -790,9 +815,6 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
>  	if (unlikely(blk_mq_hctx_stopped(hctx)))
>  		return;
>
> -	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
> -		cpu_online(hctx->next_cpu));
> -
>  	hctx->run++;
>
>  	/*
> @@ -883,6 +905,24 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
>  	}
>  }
>
> +static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
> +{
> +	int srcu_idx;
> +
> +	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
> +		cpu_online(hctx->next_cpu));
> +
> +	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
> +		rcu_read_lock();
> +		blk_mq_process_rq_list(hctx);
> +		rcu_read_unlock();
> +	} else {
> +		srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
> +		blk_mq_process_rq_list(hctx);
> +		srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
> +	}
> +}
> +
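
And with such a flag, the dispatch side has to check it inside these
read-side critical sections, next to the existing blk_mq_hctx_stopped()
test. Roughly, with a hypothetical helper that mirrors
blk_mq_hctx_stopped():

	static inline bool blk_queue_quiesced(struct request_queue *q)
	{
		return test_bit(QUEUE_FLAG_QUIESCED, &q->queue_flags);
	}

	/* in blk_mq_process_rq_list(), under the (S)RCU read lock */
	if (unlikely(blk_mq_hctx_stopped(hctx) ||
		     blk_queue_quiesced(hctx->queue)))
		return;

Doing the check with the (S)RCU read lock held is what closes the race
with the synchronize_srcu()/synchronize_rcu() calls.
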
>  /*
>   * It'd be great if the workqueue API had a way to pass
>   * in a mask and had some smarts for more clever placement.
> @@ -1293,7 +1333,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
>  	const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
>  	struct blk_map_ctx data;
>  	struct request *rq;
> -	unsigned int request_count = 0;
> +	unsigned int request_count = 0, srcu_idx;
>  	struct blk_plug *plug;
>  	struct request *same_queue_rq = NULL;
>  	blk_qc_t cookie;
> @@ -1336,7 +1376,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
>  		blk_mq_bio_to_request(rq, bio);
>
>  		/*
> -		 * We do limited pluging. If the bio can be merged, do that.
> +		 * We do limited plugging. If the bio can be merged, do that.
>  		 * Otherwise the existing request in the plug list will be
>  		 * issued. So the plug list will have one request at most
>  		 */
> @@ -1356,7 +1396,16 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
>  		blk_mq_put_ctx(data.ctx);
>  		if (!old_rq)
>  			goto done;
> -		blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
> +
> +		if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) {
> +			rcu_read_lock();
> +			blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
> +			rcu_read_unlock();
> +		} else {
> +			srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu);
> +			blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
> +			srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx);
> +		}
>  		goto done;
>  	}
>
> @@ -1635,6 +1684,9 @@ static void blk_mq_exit_hctx(struct request_queue *q,
>  	if (set->ops->exit_hctx)
>  		set->ops->exit_hctx(hctx, hctx_idx);
>
> +	if (hctx->flags & BLK_MQ_F_BLOCKING)
> +		cleanup_srcu_struct(&hctx->queue_rq_srcu);
> +
>  	blk_mq_remove_cpuhp(hctx);
>  	blk_free_flush_queue(hctx->fq);
>  	sbitmap_free(&hctx->ctx_map);
> @@ -1715,6 +1767,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
>  			flush_start_tag + hctx_idx, node))
>  		goto free_fq;
>
> +	if (hctx->flags & BLK_MQ_F_BLOCKING)
> +		init_srcu_struct(&hctx->queue_rq_srcu);
> +
>  	return 0;
>
>  free_fq:
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index aa93000..61be48b 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -3,6 +3,7 @@
>
>  #include <linux/blkdev.h>
>  #include <linux/sbitmap.h>
> +#include <linux/srcu.h>
>
>  struct blk_mq_tags;
>  struct blk_flush_queue;
> @@ -35,6 +36,8 @@ struct blk_mq_hw_ctx {
>
>  	struct blk_mq_tags	*tags;
>
> +	struct srcu_struct	queue_rq_srcu;
> +
>  	unsigned long		queued;
>  	unsigned long		run;
>  #define BLK_MQ_MAX_DISPATCH_ORDER 7
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index c47c358..8259d87 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -824,6 +824,7 @@ extern void __blk_run_queue(struct request_queue *q);
>  extern void __blk_run_queue_uncond(struct request_queue *q);
>  extern void blk_run_queue(struct request_queue *);
>  extern void blk_run_queue_async(struct request_queue *q);
> +extern void blk_mq_quiesce_queue(struct request_queue *q);
>  extern int blk_rq_map_user(struct request_queue *, struct request *,
>  			   struct rq_map_data *, void __user *, unsigned long,
>  			   gfp_t);
> --
> 2.10.1
>

--
Ming Lei