On Mon, Sep 25, 2017 at 01:29:24PM -0700, Bart Van Assche wrote:
> It is essential during suspend and resume that neither the filesystem
> state nor the filesystem metadata in RAM changes. This is why SCSI
> devices are quiesced while the hibernation image is being written or
> restored. The SCSI core quiesces devices through scsi_device_quiesce()
> and scsi_device_resume(). In the SDEV_QUIESCE state, execution of
> non-preempt requests is deferred. This is realized by returning
> BLKPREP_DEFER from inside scsi_prep_state_check() for quiesced SCSI
> devices. Avoid that a full queue prevents power management requests
> from being submitted by deferring allocation of non-preempt requests
> for devices in the quiesced state. This patch has been tested by
> running the following commands and by verifying that after resume the
> fio job is still running:
>
> for d in /sys/class/block/sd*[a-z]; do
>     hcil=$(readlink "$d/device")
>     hcil=${hcil#../../../}
>     echo 4 > "$d/queue/nr_requests"
>     echo 1 > "/sys/class/scsi_device/$hcil/device/queue_depth"
> done
> bdev=$(readlink /dev/disk/by-uuid/5217d83f-213e-4b42-b86e-20013325ba6c)
> bdev=${bdev#../../}
> hcil=$(readlink "/sys/block/$bdev/device")
> hcil=${hcil#../../../}
> fio --name="$bdev" --filename="/dev/$bdev" --buffered=0 --bs=512 --rw=randread \
>     --ioengine=libaio --numjobs=4 --iodepth=16 --iodepth_batch=1 --thread \
>     --loops=$((2**31)) &
> pid=$!
> sleep 1
> systemctl hibernate
> sleep 10
> kill $pid
>
> Reported-by: Oleksandr Natalenko <oleksandr@xxxxxxxxxxxxxx>
> References: "I/O hangs after resuming from suspend-to-ram"
>     (https://marc.info/?l=linux-block&m=150340235201348).
> Signed-off-by: Bart Van Assche <bart.vanassche@xxxxxxx>
> Cc: Martin K. Petersen <martin.petersen@xxxxxxxxxx>
> Cc: Ming Lei <ming.lei@xxxxxxxxxx>
> Cc: Christoph Hellwig <hch@xxxxxx>
> Cc: Hannes Reinecke <hare@xxxxxxxx>
> Cc: Johannes Thumshirn <jthumshirn@xxxxxxx>
> ---
>  block/blk-core.c       | 41 +++++++++++++++++++++++++++++++----------
>  block/blk-mq.c         |  4 ++--
>  block/blk-timeout.c    |  2 +-
>  fs/block_dev.c         |  4 ++--
>  include/linux/blkdev.h |  2 +-
>  5 files changed, 37 insertions(+), 16 deletions(-)
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 9111a8f9c7a1..01b7afee58f0 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -351,10 +351,12 @@ void blk_set_preempt_only(struct request_queue *q, bool preempt_only)
>  	unsigned long flags;
>  
>  	spin_lock_irqsave(q->queue_lock, flags);
> -	if (preempt_only)
> +	if (preempt_only) {
>  		queue_flag_set(QUEUE_FLAG_PREEMPT_ONLY, q);
> -	else
> +	} else {
>  		queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
> +		wake_up_all(&q->mq_freeze_wq);
> +	}
>  	spin_unlock_irqrestore(q->queue_lock, flags);
>  }
>  EXPORT_SYMBOL(blk_set_preempt_only);
> @@ -776,13 +778,31 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
>  }
>  EXPORT_SYMBOL(blk_alloc_queue);
>  
> -int blk_queue_enter(struct request_queue *q, bool nowait)
> +/**
> + * blk_queue_enter() - try to increase q->q_usage_counter
> + * @q: request queue pointer
> + * @nowait: if the queue is frozen, do not wait until it is unfrozen
> + * @preempt: if QUEUE_FLAG_PREEMPT_ONLY has been set, do not wait until that
> + *	flag has been cleared
> + */
> +int blk_queue_enter(struct request_queue *q, bool nowait, bool preempt)
>  {
>  	while (true) {
>  		int ret;
>  
> -		if (percpu_ref_tryget_live(&q->q_usage_counter))
> -			return 0;
> +		if (percpu_ref_tryget_live(&q->q_usage_counter)) {
> +			/*
> +			 * Since setting the PREEMPT_ONLY flag is followed
> +			 * by a switch of q_usage_counter from per-cpu to
> +			 * atomic mode and back to per-cpu and since the
> +			 * switch to atomic mode uses call_rcu_sched(), it
> +			 * is not necessary to call smp_rmb() here.
> +			 */

rcu_read_lock() is held only inside percpu_ref_tryget_live(). Without an
explicit barrier (smp_mb()) between getting the refcounter and reading the
preempt-only flag, the two operations (the write to the refcounter and the
read of the flag) can be reordered, so the freeze/unfreeze may complete
before this I/O is completed.
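Something along the following lines is what I have in mind (just an
untested sketch of where the explicit barrier would sit, not a proper
patch):

		if (percpu_ref_tryget_live(&q->q_usage_counter)) {
			/*
			 * Order the increment of q_usage_counter against
			 * the later read of the PREEMPT_ONLY flag. This
			 * would have to pair with a barrier between
			 * setting the flag and waiting for
			 * q_usage_counter to drain on the quiesce side.
			 */
			smp_mb();
			if (preempt || !blk_queue_preempt_only(q))
				return 0;
			else
				percpu_ref_put(&q->q_usage_counter);
		}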
> +			if (preempt || !blk_queue_preempt_only(q))
> +				return 0;
> +			else
> +				percpu_ref_put(&q->q_usage_counter);
> +		}
>  
>  		if (nowait)
>  			return -EBUSY;
> @@ -797,7 +817,8 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
>  		smp_rmb();
>  
>  		ret = wait_event_interruptible(q->mq_freeze_wq,
> -				!atomic_read(&q->mq_freeze_depth) ||
> +				(atomic_read(&q->mq_freeze_depth) == 0 &&
> +				 (preempt || !blk_queue_preempt_only(q))) ||
>  				blk_queue_dying(q));
>  		if (blk_queue_dying(q))
>  			return -ENODEV;
> @@ -1416,7 +1437,7 @@ static struct request *blk_old_get_request(struct request_queue *q,
>  	create_io_context(gfp_mask, q->node);
>  
>  	ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM) ||
> -			      (op & REQ_NOWAIT));
> +			      (op & REQ_NOWAIT), op & REQ_PREEMPT);
>  	if (ret)
>  		return ERR_PTR(ret);
>  	spin_lock_irq(q->queue_lock);
> @@ -2184,6 +2205,7 @@ blk_qc_t generic_make_request(struct bio *bio)
>  	 */
>  	struct bio_list bio_list_on_stack[2];
>  	blk_qc_t ret = BLK_QC_T_NONE;
> +	const bool nowait = bio->bi_opf & REQ_NOWAIT;
>  
>  	if (!generic_make_request_checks(bio))
>  		goto out;
> @@ -2223,7 +2245,7 @@ blk_qc_t generic_make_request(struct bio *bio)
>  	do {
>  		struct request_queue *q = bio->bi_disk->queue;
>  
> -		if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) {
> +		if (likely(blk_queue_enter(q, nowait, false) == 0)) {
>  			struct bio_list lower, same;
>  
>  			/* Create a fresh bio_list for all subordinate requests */
> @@ -2248,8 +2270,7 @@ blk_qc_t generic_make_request(struct bio *bio)
>  			bio_list_merge(&bio_list_on_stack[0], &same);
>  			bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
>  		} else {
> -			if (unlikely(!blk_queue_dying(q) &&
> -				     (bio->bi_opf & REQ_NOWAIT)))
> +			if (unlikely(!blk_queue_dying(q) && nowait))
>  				bio_wouldblock_error(bio);
>  			else
>  				bio_io_error(bio);
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 10c1f49f663d..cbf7bf7e3d13 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -384,7 +384,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
>  	struct request *rq;
>  	int ret;
>  
> -	ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
> +	ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT, op & REQ_PREEMPT);
>  	if (ret)
>  		return ERR_PTR(ret);
>  
> @@ -423,7 +423,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
>  	if (hctx_idx >= q->nr_hw_queues)
>  		return ERR_PTR(-EIO);
>  
> -	ret = blk_queue_enter(q, true);
> +	ret = blk_queue_enter(q, true, op & REQ_PREEMPT);
>  	if (ret)
>  		return ERR_PTR(ret);
>  
> diff --git a/block/blk-timeout.c b/block/blk-timeout.c
> index 17ec83bb0900..0dfdc975473a 100644
> --- a/block/blk-timeout.c
> +++ b/block/blk-timeout.c
> @@ -134,7 +134,7 @@ void blk_timeout_work(struct work_struct *work)
>  	struct request *rq, *tmp;
>  	int next_set = 0;
>  
> -	if (blk_queue_enter(q, true))
> +	if (blk_queue_enter(q, true, true))
>  		return;
>  	spin_lock_irqsave(q->queue_lock, flags);
>  
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 93d088ffc05c..e9ca45087a40 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -674,7 +674,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
>  	if (!ops->rw_page || bdev_get_integrity(bdev))
>  		return result;
>  
> -	result = blk_queue_enter(bdev->bd_queue, false);
> +	result = blk_queue_enter(bdev->bd_queue, false, false);
>  	if (result)
>  		return result;
>  	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false);
> @@ -710,7 +710,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
>  
>  	if (!ops->rw_page || bdev_get_integrity(bdev))
>  		return -EOPNOTSUPP;
> -	result = blk_queue_enter(bdev->bd_queue, false);
> +	result = blk_queue_enter(bdev->bd_queue, false, false);
>  	if (result)
>  		return result;
>  
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 5bd87599eed0..a025597c378a 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -964,7 +964,7 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
>  extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
>  			 struct scsi_ioctl_command __user *);
>  
> -extern int blk_queue_enter(struct request_queue *q, bool nowait);
> +extern int blk_queue_enter(struct request_queue *q, bool nowait, bool preempt);
>  extern void blk_queue_exit(struct request_queue *q);
>  extern void blk_start_queue(struct request_queue *q);
>  extern void blk_start_queue_async(struct request_queue *q);
> -- 
> 2.14.1
>
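If the barrier above is added, I guess the quiesce side would also need
the matching barrier between setting the flag and waiting for
q_usage_counter to drain, roughly like this (again only a sketch, since
the SCSI part of the series is not quoted here):

	blk_set_preempt_only(q, true);
	/* pairs with the smp_mb() sketched above for blk_queue_enter() */
	smp_mb();
	blk_freeze_queue_start(q);
	blk_mq_freeze_queue_wait(q);
	blk_mq_unfreeze_queue(q);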

-- 
Ming