Hi Ming,

On 09/13/2018 08:15 PM, Ming Lei wrote:
> This patchset introduces per-host admin request queue for submitting
> admin request only, and uses this approach to implement both SCSI
> quiesce and runtime PM in one very simply way. Also runtime PM deadlock
> can be avoided in case that request pool is used up, such as when too
> many IO requests are allocated before resuming device

To be honest, after looking into the SCSI part of this patchset, I really
doubt whether it is worth introducing this per-scsi-host adminq. It adds
too much complexity to SCSI. Compared with that, the current preempt-only
feature looks much simpler.

If the objection is just to the preempt-only check in the hot path, we
could instead introduce a new interface, similar to blk_queue_enter(),
for the non-hot path.

The sketch below builds on your patch "percpu-refcount: relax limit on
percpu_ref_reinit()" (completely untested):

diff --git a/block/blk-core.c b/block/blk-core.c
index 4dbc93f..dd7f007 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -427,14 +427,24 @@ EXPORT_SYMBOL(blk_sync_queue);
  * Returns the previous value of the PREEMPT_ONLY flag - 0 if the flag was not
  * set and 1 if the flag was already set.
  */
-int blk_set_preempt_only(struct request_queue *q)
+int blk_set_preempt_only(struct request_queue *q, bool drain)
 {
-	return blk_queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q);
+	if (blk_queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q))
+		return 1;
+	percpu_ref_kill(&q->q_usage_counter);
+	synchronize_sched();
+
+	if (drain)
+		wait_event(q->mq_freeze_wq,
+			   percpu_ref_is_zero(&q->q_usage_counter));
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(blk_set_preempt_only);
 
 void blk_clear_preempt_only(struct request_queue *q)
 {
+	percpu_ref_reinit(&q->q_usage_counter);
 	blk_queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
 	wake_up_all(&q->mq_freeze_wq);
 }
@@ -913,31 +923,13 @@ EXPORT_SYMBOL(blk_alloc_queue);
 /**
  * blk_queue_enter() - try to increase q->q_usage_counter
  * @q: request queue pointer
- * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT
+ * @flags: BLK_MQ_REQ_NOWAIT
  */
 int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
 {
-	const bool preempt = flags & BLK_MQ_REQ_PREEMPT;
-
 	while (true) {
-		bool success = false;
-
-		rcu_read_lock();
-		if (percpu_ref_tryget_live(&q->q_usage_counter)) {
-			/*
-			 * The code that sets the PREEMPT_ONLY flag is
-			 * responsible for ensuring that that flag is globally
-			 * visible before the queue is unfrozen.
-			 */
-			if (preempt || !blk_queue_preempt_only(q)) {
-				success = true;
-			} else {
-				percpu_ref_put(&q->q_usage_counter);
-			}
-		}
-		rcu_read_unlock();
-		if (success)
+		if (percpu_ref_tryget_live(&q->q_usage_counter))
 			return 0;
 
 		if (flags & BLK_MQ_REQ_NOWAIT)
@@ -954,7 +946,44 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
 
 		wait_event(q->mq_freeze_wq,
 			   (atomic_read(&q->mq_freeze_depth) == 0 &&
-			    (preempt || !blk_queue_preempt_only(q))) ||
+			    !blk_queue_preempt_only(q)) ||
 			   blk_queue_dying(q));
 		if (blk_queue_dying(q))
 			return -ENODEV;
 	}
 }
+
+/*
+ * When PREEMPT_ONLY mode is set, q->q_usage_counter is killed.
+ * If the queue is only in PREEMPT_ONLY mode, carry on;
+ * if the queue is frozen, wait.
+ */
+int blk_queue_preempt_enter(struct request_queue *q,
+			    blk_mq_req_flags_t flags)
+{
+	while (true) {
+		if (percpu_ref_tryget_live(&q->q_usage_counter))
+			return 0;
+
+		smp_rmb();
+
+		/*
+		 * If the queue is only in PREEMPT_ONLY mode, take the ref
+		 * directly. Whoever set PREEMPT_ONLY mode is responsible
+		 * for waking up the waiters on mq_freeze_wq.
+		 */
+		if (!atomic_read(&q->mq_freeze_depth) &&
+		    blk_queue_preempt_only(q)) {
+			percpu_ref_get_many(&q->q_usage_counter, 1);
+			return 0;
+		}
+
+		if (flags & BLK_MQ_REQ_NOWAIT)
+			return -EBUSY;
+
+		wait_event(q->mq_freeze_wq,
+			   atomic_read(&q->mq_freeze_depth) == 0 ||
+			   blk_queue_dying(q));
+		if (blk_queue_dying(q))
+			return -ENODEV;
+	}
+}
@@ -1587,7 +1616,7 @@ static struct request *blk_old_get_request(struct request_queue *q,
 	/* create ioc upfront */
 	create_io_context(gfp_mask, q->node);
 
-	ret = blk_queue_enter(q, flags);
+	ret = blk_queue_preempt_enter(q, flags);
 	if (ret)
 		return ERR_PTR(ret);
 	spin_lock_irq(q->queue_lock);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 85a1c1a..3aea78f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -403,7 +403,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 	struct request *rq;
 	int ret;
 
-	ret = blk_queue_enter(q, flags);
+	ret = blk_queue_preempt_enter(q, flags);
 	if (ret)
 		return ERR_PTR(ret);
 
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index eb97d2d..c338c3a 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -3046,17 +3046,7 @@ scsi_device_quiesce(struct scsi_device *sdev)
 	 */
 	WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current);
 
-	blk_set_preempt_only(q);
-
-	blk_mq_freeze_queue(q);
-	/*
-	 * Ensure that the effect of blk_set_preempt_only() will be visible
-	 * for percpu_ref_tryget() callers that occur after the queue
-	 * unfreeze even if the queue was already frozen before this function
-	 * was called. See also https://lwn.net/Articles/573497/.
-	 */
-	synchronize_rcu();
-	blk_mq_unfreeze_queue(q);
+	blk_set_preempt_only(q, true);
 
 	mutex_lock(&sdev->state_mutex);
 	err = scsi_device_set_state(sdev, SDEV_QUIESCE);
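
To make the intended usage concrete, here is a hypothetical caller-side
sketch (not part of the diff above, and just as untested): the quiesce
side calls blk_set_preempt_only(q, true) to kill and drain
q->q_usage_counter; allocation paths that enter through
blk_queue_preempt_enter() (blk_mq_alloc_request(), blk_old_get_request())
still succeed while the queue is only in PREEMPT_ONLY mode; and
blk_clear_preempt_only() re-opens the queue for normal I/O. The function
names my_quiesce_and_submit()/my_resume() and the REQ_OP_DRV_IN request
are made up purely for illustration.

#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/err.h>

/* Illustrative only: quiesce a queue, then submit one admin-style request. */
static int my_quiesce_and_submit(struct request_queue *q)
{
	struct request *rq;

	/* Kill q->q_usage_counter and wait until in-flight normal I/O drains. */
	blk_set_preempt_only(q, true);

	/*
	 * New normal submitters now block in blk_queue_enter(), but this
	 * allocation enters through blk_queue_preempt_enter(), which takes a
	 * reference directly while the queue is in PREEMPT_ONLY mode and not
	 * frozen.
	 */
	rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
	if (IS_ERR(rq)) {
		blk_clear_preempt_only(q);
		return PTR_ERR(rq);
	}

	/* ... set up and issue the passthrough/PM command here ... */

	blk_mq_free_request(rq);
	return 0;
}

/* Illustrative only: re-open the queue for normal I/O. */
static void my_resume(struct request_queue *q)
{
	/* Re-init q->q_usage_counter and wake waiters in blk_queue_enter(). */
	blk_clear_preempt_only(q);
}

The point is that the preempt-only check then lives entirely in the slow
allocation path, and the hot-path blk_queue_enter() goes back to a bare
percpu_ref_tryget_live().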