On Tue, Sep 20, 2022 at 09:49:33AM +0800, Ziyang Zhang wrote: > On 2022/9/19 20:33, Ming Lei wrote: > >>>> + > >>>> +static void ublk_quiesce_queue(struct ublk_device *ub, > >>>> + struct ublk_queue *ubq) > >>>> +{ > >>>> + int i; > >>>> + > >>>> + for (i = 0; i < ubq->q_depth; i++) { > >>>> + struct ublk_io *io = &ubq->ios[i]; > >>>> + > >>>> + if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) { > >>>> + struct request *rq = blk_mq_tag_to_rq( > >>>> + ub->tag_set.tags[ubq->q_id], i); > >>>> + > >>>> + WARN_ON_ONCE(!rq); > >>>> + pr_devel("%s: %s rq: qid %d tag %d io_flags %x\n", __func__, > >>>> + ublk_queue_can_use_recovery_reissue(ubq) ? > >>>> + "requeue" : "abort", > >>>> + ubq->q_id, i, io->flags); > >>>> + if (ublk_queue_can_use_recovery_reissue(ubq)) > >>>> + blk_mq_requeue_request(rq, false); > >>> > >>> This way is too violent. > >>> > >>> There may be just one queue dying, but you requeue all requests > >>> from any queue. I'd suggest to take the approach in ublk_daemon_monitor_work(), > >>> such as, just requeuing requests in dying queue. > >> > >> If we want to start a new process after a crash for USER_RECOVERY, all old ubq_daemons > >> must exit and rqs of all queues have to be requeued/aborted. We cannot let live > >> ubq_daemons run any more because they do not belong to the new process. > > > > IMO, the old process really can exist, and recently even I got such > > requirement for switching queue from one thread to another. > > For now, only one process can open /dev/ublkcX, so a new process is necessary now. > > If you think "per ubq_daemon" recovery is reasonable, I can do that in the future > if multiple processes is supported. But I really suggest that we can keep current > design as the first step which assumes all ubq_daemons are exited and a new process > is started, and that really meets our requirement. > > BTW, START_USER_RECOVERY has to be reconsidered because we may need to pass a ubq_id > with it. 
> > > > > What we should do is to get all inflight requests done, and cancel all io > > commands, no matter if the ubq pthread is dead or live. > > > >> > >> BTW, I really wonder why there could be just one queue dying? All queues must be dying > >> shortly after any ubq_daemon is dying since they are all pthreads in the same process. > > > > You can't assume it is always so. Maybe one pthread is dead first, and > > others are dying later, maybe just one is dead. > > Yes, I know there may be only one pthread that is dead while others keep running, but now > ublk_drv only supports one process opening the same /dev/ublkcX, so other pthreads > must die (no matter whether they are aborted by a signal or exit by themselves) later. > > > > > If one queue's pthread is live, you may get trouble by simply requeuing > > the request, that is why I suggest to re-use the logic of > > ublk_daemon_monitor_work/ublk_abort_queue(). > > Actually, if any ubq_daemon is live, no rqs are requeued, please see the check in > ublk_quiesce_dev(). It always makes sure that ALL ubq_daemons are dying, then it > starts quiesce jobs. OK, it looks like I missed this point, but you should have quiesced the queue at the beginning of ublk_quiesce_dev(), then the transition period can be kept as short as possible. Otherwise, if one queue pthread isn't dying, the device can be kept in this part-working state forever. > > > > > For stopping device, request queue is frozen in del_gendisk() and all > > in-flight requests are drained, and monitor work provides such > > guarantee. > > > > For user recovery, monitor work should help you too by aborting one > > queue if it is dying until all requests are drained. > > Monitor work can schedule quiesce_work if it finds a dying ubq_daemon. > Then quiesce_work calls ublk_quiesce_dev(). I do this because ublk_quiesce_dev() > has to wait for all inflight rqs with ACTIVE set to be requeued. 
> > > > >> > >>> > >>> That said you still can re-use the logic in ublk_abort_queue()/ublk_daemon_monitor_work() > >>> for making progress, just changing aborting request with requeue in > >>> ublk_abort_queue(). > >> > >> I get your point, but it may be hard to reuse the logic in ublk_daemon_monitor_work() > >> because: > >> (1) we have to quiesce request queue in ublk_quiesce_dev(). This has to be done with > >> ub_mutex. > >> (2) ublk_quiesce_dev() cannot be run after rqs are requeued/aborted. Follows the delta patch against patch 5 for showing the idea: diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 4409a130d0b6..60c5786c4711 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -656,7 +656,8 @@ static void ublk_complete_rq(struct request *req) * Also aborting may not be started yet, keep in mind that one failed * request may be issued by block layer again. */ -static void __ublk_fail_req(struct ublk_io *io, struct request *req) +static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, + struct request *req) { WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); @@ -667,7 +668,10 @@ static void __ublk_fail_req(struct ublk_io *io, struct request *req) req->tag, io->flags); io->flags |= UBLK_IO_FLAG_ABORTED; - blk_mq_end_request(req, BLK_STS_IOERR); + if (ublk_queue_can_use_recovery_reissue(ubq)) + blk_mq_requeue_request(req, false); + else + blk_mq_end_request(req, BLK_STS_IOERR); } } @@ -1018,7 +1022,7 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) */ rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i); if (rq) - __ublk_fail_req(io, rq); + __ublk_fail_req(ubq, io, rq); } } ublk_put_device(ub); @@ -1026,12 +1030,10 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) static bool ublk_check_inflight_rq(struct request *rq, void *data) { - struct ublk_queue *ubq = rq->mq_hctx->driver_data; - struct ublk_io *io = &ubq->ios[rq->tag]; - bool *busy = data; + 
bool *idle = data; - if (io->flags & UBLK_IO_FLAG_ACTIVE) { - *busy = true; + if (blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) { + *idle = false; return false; } return true; @@ -1039,16 +1041,15 @@ static bool ublk_check_inflight_rq(struct request *rq, void *data) static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub) { - bool busy = false; + bool idle = true; WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue)); while (true) { blk_mq_tagset_busy_iter(&ub->tag_set, - ublk_check_inflight_rq, &busy); - if (busy) - msleep(UBLK_REQUEUE_DELAY_MS); - else + ublk_check_inflight_rq, &idle); + if (idle) break; + msleep(UBLK_REQUEUE_DELAY_MS); } } @@ -1069,10 +1070,7 @@ static void ublk_quiesce_queue(struct ublk_device *ub, ublk_queue_can_use_recovery_reissue(ubq) ? "requeue" : "abort", ubq->q_id, i, io->flags); - if (ublk_queue_can_use_recovery_reissue(ubq)) - blk_mq_requeue_request(rq, false); - else - __ublk_fail_req(io, rq); + __ublk_fail_req(ubq, io, rq); } else { pr_devel("%s: done old cmd: qid %d tag %d\n", __func__, ubq->q_id, i); @@ -1092,12 +1090,6 @@ static void ublk_quiesce_dev(struct ublk_device *ub) if (ub->dev_info.state != UBLK_S_DEV_LIVE) goto unlock; - for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { - struct ublk_queue *ubq = ublk_get_queue(ub, i); - - if (!ubq_daemon_is_dying(ubq)) - goto unlock; - } blk_mq_quiesce_queue(ub->ub_disk->queue); ublk_wait_tagset_rqs_idle(ub); pr_devel("%s: quiesce ub: dev_id %d\n", @@ -1129,14 +1121,13 @@ static void ublk_daemon_monitor_work(struct work_struct *work) struct ublk_queue *ubq = ublk_get_queue(ub, i); if (ubq_daemon_is_dying(ubq)) { - if (ublk_queue_can_use_recovery(ubq)) { + if (ublk_queue_can_use_recovery(ubq)) schedule_work(&ub->quiesce_work); - } else { + else schedule_work(&ub->stop_work); - /* abort queue is for making forward progress */ - ublk_abort_queue(ub, ubq); - } + /* abort queue is for making forward progress */ + ublk_abort_queue(ub, ubq); } } Thanks, Ming