On 12/1/21 1:43 PM, Keith Busch wrote:
If the the timeout handler successfully sets the state to complete, and the lld returns BLK_EH_RESET_TIMER, who gets to complete this command?
That's a longstanding problem, isn't it? Anyway, how about replacing this patch with the two untested patches below? Subject: [PATCH 1/2] scsi: core: Rename scsi_cmnd.state Prepare for removal of SCMD_STATE_COMPLETE. This patch does not change any functionality. Cc: Ming Lei <ming.lei@xxxxxxxxxx> Cc: Hannes Reinecke <hare@xxxxxxx> Cc: Keith Busch <kbusch@xxxxxxxxxx> Signed-off-by: Bart Van Assche <bvanassche@xxxxxxx> --- drivers/scsi/hosts.c | 2 +- drivers/scsi/scsi_error.c | 2 +- drivers/scsi/scsi_lib.c | 12 ++++++------ include/scsi/scsi_cmnd.h | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c index f69b77cbf538..f5869bd13bcf 100644 --- a/drivers/scsi/hosts.c +++ b/drivers/scsi/hosts.c @@ -572,7 +572,7 @@ static bool scsi_host_check_in_flight(struct request *rq, void *data, int *count = data; struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); - if (test_bit(SCMD_STATE_INFLIGHT, &cmd->state)) + if (test_bit(SCMD_STATE_INFLIGHT, &cmd->in_flight)) (*count)++; return true; diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 9cb0f9df621a..7b603b259ae2 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -353,7 +353,7 @@ enum blk_eh_timer_return scsi_times_out(struct request *req) * so return RESET_TIMER to allow error handling another shot * at this command. */ - if (test_and_set_bit(SCMD_STATE_COMPLETE, &scmd->state)) + if (test_and_set_bit(SCMD_STATE_COMPLETE, &scmd->in_flight)) return BLK_EH_RESET_TIMER; if (scsi_abort_command(scmd) != SUCCESS) { set_host_byte(scmd, DID_TIME_OUT); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 621d841d819a..6bb0e2726d51 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -280,7 +280,7 @@ static void scsi_dec_host_busy(struct Scsi_Host *shost, struct scsi_cmnd *cmd) unsigned long flags; rcu_read_lock(); - __clear_bit(SCMD_STATE_INFLIGHT, &cmd->state); + __clear_bit(SCMD_STATE_INFLIGHT, &cmd->in_flight); if (unlikely(scsi_host_in_recovery(shost))) { spin_lock_irqsave(shost->host_lock, flags); if (shost->host_failed || shost->host_eh_scheduled) @@ -1138,7 +1138,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd) jiffies_at_alloc = cmd->jiffies_at_alloc; retries = cmd->retries; - in_flight = test_bit(SCMD_STATE_INFLIGHT, &cmd->state); + in_flight = test_bit(SCMD_STATE_INFLIGHT, &cmd->in_flight); /* * Zero out the cmd, except for the embedded scsi_request. Only clear * the driver-private command data if the LLD does not supply a @@ -1158,7 +1158,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd) cmd->jiffies_at_alloc = jiffies_at_alloc; cmd->retries = retries; if (in_flight) - __set_bit(SCMD_STATE_INFLIGHT, &cmd->state); + __set_bit(SCMD_STATE_INFLIGHT, &cmd->in_flight); cmd->budget_token = budget_token; } @@ -1367,7 +1367,7 @@ static inline int scsi_host_queue_ready(struct request_queue *q, spin_unlock_irq(shost->host_lock); } - __set_bit(SCMD_STATE_INFLIGHT, &cmd->state); + __set_bit(SCMD_STATE_INFLIGHT, &cmd->in_flight); return 1; @@ -1597,7 +1597,7 @@ void scsi_done(struct scsi_cmnd *cmd) if (unlikely(blk_should_fake_timeout(scsi_cmd_to_rq(cmd)->q))) return; - if (unlikely(test_and_set_bit(SCMD_STATE_COMPLETE, &cmd->state))) + if (unlikely(test_and_set_bit(SCMD_STATE_COMPLETE, &cmd->in_flight))) return; trace_scsi_dispatch_cmd_done(cmd); blk_mq_complete_request(scsi_cmd_to_rq(cmd)); @@ -1691,7 +1691,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, goto out_dec_host_busy; req->rq_flags |= RQF_DONTPREP; } else { - clear_bit(SCMD_STATE_COMPLETE, &cmd->state); + clear_bit(SCMD_STATE_COMPLETE, &cmd->in_flight); } cmd->flags &= SCMD_PRESERVED_FLAGS; diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index 477a800a9543..6cbfbfbbb803 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -60,7 +60,7 @@ struct scsi_pointer { /* flags preserved across unprep / reprep */ #define SCMD_PRESERVED_FLAGS (SCMD_INITIALIZED) -/* for scmd->state */ +/* for scmd->in_flight */ #define SCMD_STATE_COMPLETE 0 #define SCMD_STATE_INFLIGHT 1 @@ -139,7 +139,7 @@ struct scsi_cmnd { int result; /* Status code from lower level driver */ int flags; /* Command flags */ - unsigned long state; /* Command completion state */ + unsigned long in_flight;/* Command completion state */ unsigned int extra_len; /* length of alignment and padding */ }; Subject: [PATCH 2/2] scsi: core: Fix a race between scsi_done() and scsi_times_out() If scsi_done() and scsi_times_out() are called concurrently, make sure that only one of these two functions proceeds. If scsi_done() is called while scsi_times_out() is in progress and if scsi_times_out() returns BLK_EH_RESET_TIMER, complete the command. If scsi_times_out() returns BLK_EH_RESET_TIMER, allow scsi_done() to complete the command. Cc: Ming Lei <ming.lei@xxxxxxxxxx> Cc: Hannes Reinecke <hare@xxxxxxx> Cc: Keith Busch <kbusch@xxxxxxxxxx> Fixes: 065990bd198e ("scsi: set timed out out mq requests to complete") Signed-off-by: Bart Van Assche <bvanassche@xxxxxxx> --- drivers/scsi/scsi_error.c | 35 +++++++++++++++++++++-------------- drivers/scsi/scsi_lib.c | 16 +++++++++++++--- drivers/scsi/virtio_scsi.c | 4 ++-- include/scsi/scsi_cmnd.h | 12 ++++++++++-- 4 files changed, 46 insertions(+), 21 deletions(-) diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 7b603b259ae2..42dd967167e6 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -330,6 +330,15 @@ enum blk_eh_timer_return scsi_times_out(struct request *req) struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req); enum blk_eh_timer_return rtn = BLK_EH_DONE; struct Scsi_Host *host = scmd->device->host; + int old = SCMD_STATE_SUBMITTED; + + /* + * scsi_done() may be called concurrently with scsi_times_out(). Only + * one of these two functions should proceed. Hence return early if + * scsi_done() won the race. + */ + if (!atomic_try_cmpxchg(&scmd->state, &old, SCMD_STATE_TIMED_OUT)) + return BLK_EH_DONE; trace_scsi_dispatch_cmd_timeout(scmd); scsi_log_completion(scmd, TIMEOUT_ERROR); @@ -341,26 +350,24 @@ enum blk_eh_timer_return scsi_times_out(struct request *req) rtn = host->hostt->eh_timed_out(scmd); if (rtn == BLK_EH_DONE) { - /* - * Set the command to complete first in order to prevent a real - * completion from releasing the command while error handling - * is using it. If the command was already completed, then the - * lower level driver beat the timeout handler, and it is safe - * to return without escalating error recovery. - * - * If timeout handling lost the race to a real completion, the - * block layer may ignore that due to a fake timeout injection, - * so return RESET_TIMER to allow error handling another shot - * at this command. - */ - if (test_and_set_bit(SCMD_STATE_COMPLETE, &scmd->in_flight)) - return BLK_EH_RESET_TIMER; if (scsi_abort_command(scmd) != SUCCESS) { set_host_byte(scmd, DID_TIME_OUT); scsi_eh_scmd_add(scmd); } + return rtn; } + /* The order of the atomic_try_cmpxchg() calls below is important. */ + old = SCMD_STATE_TIMED_OUT; + if (atomic_try_cmpxchg(&scmd->state, &old, SCMD_STATE_SUBMITTED)) + return rtn; + old = SCMD_STATE_COMPLETE_AFTER_TIMEOUT; + WARN_ON_ONCE(!atomic_try_cmpxchg(&scmd->state, &old, + SCMD_STATE_COMPLETED)); + WARN_ON_ONCE(scmd->submitter != SUBMITTED_BY_BLOCK_LAYER); + trace_scsi_dispatch_cmd_done(scmd); + blk_mq_complete_request(scsi_cmd_to_rq(scmd)); + return rtn; } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 6bb0e2726d51..797ad188e7a2 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1586,6 +1586,8 @@ static blk_status_t scsi_prepare_cmd(struct request *req) void scsi_done(struct scsi_cmnd *cmd) { + int old; + switch (cmd->submitter) { case SUBMITTED_BY_BLOCK_LAYER: break; @@ -1597,8 +1599,15 @@ void scsi_done(struct scsi_cmnd *cmd) if (unlikely(blk_should_fake_timeout(scsi_cmd_to_rq(cmd)->q))) return; - if (unlikely(test_and_set_bit(SCMD_STATE_COMPLETE, &cmd->in_flight))) - return; + for (;;) { + old = SCMD_STATE_SUBMITTED; + if (atomic_try_cmpxchg(&cmd->state, &old, SCMD_STATE_COMPLETED)) + break; + old = SCMD_STATE_TIMED_OUT; + if (atomic_try_cmpxchg(&cmd->state, &old, + SCMD_STATE_COMPLETE_AFTER_TIMEOUT)) + return; + } trace_scsi_dispatch_cmd_done(cmd); blk_mq_complete_request(scsi_cmd_to_rq(cmd)); } @@ -1691,7 +1700,8 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, goto out_dec_host_busy; req->rq_flags |= RQF_DONTPREP; } else { - clear_bit(SCMD_STATE_COMPLETE, &cmd->in_flight); + BUILD_BUG_ON(SCMD_STATE_SUBMITTED != 0); + atomic_set(&cmd->state, SCMD_STATE_SUBMITTED); } cmd->flags &= SCMD_PRESERVED_FLAGS; diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 19f7d7b90625..9ea3ec308ecd 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -620,8 +620,8 @@ static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd) * we're using independent interrupts (e.g. MSI). Poll the * virtqueues once. * - * In the abort case, scsi_done() will do nothing, because the - * command timed out and hence SCMD_STATE_COMPLETE has been set. + * In the abort case, scsi_done() will do nothing. See also the + * scsi_done() implementation. */ virtscsi_poll_requests(vscsi); diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index 6cbfbfbbb803..350b47792a4d 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -53,6 +53,14 @@ struct scsi_pointer { volatile int phase; }; +enum scsi_cmd_state { + SCMD_STATE_SUBMITTED = 0, /* Owned by device or not submitted. */ + SCMD_STATE_COMPLETED = 1, /* scsi_done() is in progress. */ + SCMD_STATE_TIMED_OUT = 2, /* Inside timeout handler. */ + SCMD_STATE_COMPLETE_AFTER_TIMEOUT = 3, /* Complete after timeout + * handler finished. */ +} __packed; + /* for scmd->flags */ #define SCMD_TAGGED (1 << 0) #define SCMD_INITIALIZED (1 << 1) @@ -61,8 +69,7 @@ struct scsi_pointer { #define SCMD_PRESERVED_FLAGS (SCMD_INITIALIZED) /* for scmd->in_flight */ -#define SCMD_STATE_COMPLETE 0 -#define SCMD_STATE_INFLIGHT 1 +#define SCMD_STATE_INFLIGHT 0 enum scsi_cmnd_submitter { SUBMITTED_BY_BLOCK_LAYER = 0, @@ -92,6 +99,7 @@ struct scsi_cmnd { int retries; int allowed; + atomic_t state; /* See also enum scsi_cmd_state */ unsigned char prot_op; unsigned char prot_type; unsigned char prot_flags;