On Sat, Oct 26, 2019 at 3:25 AM Ming Lei <ming.lei@xxxxxxxxxx> wrote: > > It isn't necessary to check the host depth in scsi_queue_rq() any more > since it has been respected by blk-mq before calling scsi_queue_rq() via > getting driver tag. > > Lots of LUNs may attach to the same host and per-host IOPS may reach millions, > so we should avoid expensive atomic operations on the host-wide counter in > the IO path. > > This patch implements scsi_host_busy() via blk_mq_tagset_busy_iter() > with one scsi command state for reading the count of busy IOs for scsi_mq. > > It is observed that IOPS is increased by 15% in IO test on scsi_debug (32 > LUNs, 32 submit queues, 1024 can_queue, libaio/dio) in a dual-socket > system. > > V5: > - fix documentation on .can_queue, no code change > > V4: > - fix one build warning, just a line change in scsi_dev_queue_ready() > > V3: > - use non-atomic set/clear bit operations as suggested by Bart > - kill single field struct for storing count of in-flight requests > - add patch to bypass the atomic LUN-wide counter of device_busy > for fast SSD device > > V2: > - introduce SCMD_STATE_INFLIGHT for getting accurate host busy > via blk_mq_tagset_busy_iter() > - verified that Jens's original report[1] is fixed > - verified that SCSI timeout/abort works fine > > [1] https://www.spinics.net/lists/linux-scsi/msg122867.html > [2] V1 & its revert: > > d772a65d8a6c Revert "scsi: core: avoid host-wide host_busy counter for scsi_mq" > 23aa8e69f2c6 Revert "scsi: core: fix scsi_host_queue_ready" > 265d59aacbce scsi: core: fix scsi_host_queue_ready > 328728630d9f scsi: core: avoid host-wide host_busy counter for scsi_mq > > Cc: Jens Axboe <axboe@xxxxxxxxx> > Cc: Ewan D. Milne <emilne@xxxxxxxxxx> > Cc: Omar Sandoval <osandov@xxxxxx>, > Cc: "Martin K. 
Petersen" <martin.petersen@xxxxxxxxxx>, > Cc: James Bottomley <james.bottomley@xxxxxxxxxxxxxxxxxxxxx>, > Cc: Christoph Hellwig <hch@xxxxxx>, > Cc: Kashyap Desai <kashyap.desai@xxxxxxxxxxxx> > Cc: Hannes Reinecke <hare@xxxxxxx> > Cc: Laurence Oberman <loberman@xxxxxxxxxx> > Cc: Bart Van Assche <bart.vanassche@xxxxxxx> > Reviewed-by: Bart Van Assche <bart.vanassche@xxxxxxx> > Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx> > --- > drivers/scsi/hosts.c | 19 ++++++++++++++++- > drivers/scsi/scsi.c | 2 +- > drivers/scsi/scsi_lib.c | 45 ++++++++++++++++++++-------------------- > drivers/scsi/scsi_priv.h | 2 +- > include/scsi/scsi_cmnd.h | 1 + > include/scsi/scsi_host.h | 3 +-- > 6 files changed, 44 insertions(+), 28 deletions(-) > > diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c > index 55522b7162d3..1d669e47b692 100644 > --- a/drivers/scsi/hosts.c > +++ b/drivers/scsi/hosts.c > @@ -38,6 +38,7 @@ > #include <scsi/scsi_device.h> > #include <scsi/scsi_host.h> > #include <scsi/scsi_transport.h> > +#include <scsi/scsi_cmnd.h> > > #include "scsi_priv.h" > #include "scsi_logging.h" > @@ -554,13 +555,29 @@ struct Scsi_Host *scsi_host_get(struct Scsi_Host *shost) > } > EXPORT_SYMBOL(scsi_host_get); > > +static bool scsi_host_check_in_flight(struct request *rq, void *data, > + bool reserved) > +{ > + int *count = data; > + struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); > + > + if (test_bit(SCMD_STATE_INFLIGHT, &cmd->state)) > + (*count)++; > + > + return true; > +} > + > /** > * scsi_host_busy - Return the host busy counter > * @shost: Pointer to Scsi_Host to inc. 
> **/ > int scsi_host_busy(struct Scsi_Host *shost) > { > - return atomic_read(&shost->host_busy); > + int cnt = 0; > + > + blk_mq_tagset_busy_iter(&shost->tag_set, > + scsi_host_check_in_flight, &cnt); > + return cnt; > } > EXPORT_SYMBOL(scsi_host_busy); > > diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c > index 1f5b5c8a7f72..ddc4ec6ea2a1 100644 > --- a/drivers/scsi/scsi.c > +++ b/drivers/scsi/scsi.c > @@ -186,7 +186,7 @@ void scsi_finish_command(struct scsi_cmnd *cmd) > struct scsi_driver *drv; > unsigned int good_bytes; > > - scsi_device_unbusy(sdev); > + scsi_device_unbusy(sdev, cmd); > > /* > * Clear the flags that say that the device/target/host is no longer > diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c > index dc210b9d4896..2563b061f56b 100644 > --- a/drivers/scsi/scsi_lib.c > +++ b/drivers/scsi/scsi_lib.c > @@ -189,7 +189,7 @@ static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, bool unbusy) > * active on the host/device. > */ > if (unbusy) > - scsi_device_unbusy(device); > + scsi_device_unbusy(device, cmd); > > /* > * Requeue this command. It will go before all other commands > @@ -321,20 +321,20 @@ static void scsi_init_cmd_errh(struct scsi_cmnd *cmd) > } > > /* > - * Decrement the host_busy counter and wake up the error handler if necessary. > - * Avoid as follows that the error handler is not woken up if shost->host_busy > - * == shost->host_failed: use call_rcu() in scsi_eh_scmd_add() in combination > - * with an RCU read lock in this function to ensure that this function in its > - * entirety either finishes before scsi_eh_scmd_add() increases the > + * Wake up the error handler if necessary. 
Avoid as follows that the error > + * handler is not woken up if host in-flight requests number == > + * shost->host_failed: use call_rcu() in scsi_eh_scmd_add() in combination > + * with an RCU read lock in this function to ensure that this function in > + * its entirety either finishes before scsi_eh_scmd_add() increases the > * host_failed counter or that it notices the shost state change made by > * scsi_eh_scmd_add(). > */ > -static void scsi_dec_host_busy(struct Scsi_Host *shost) > +static void scsi_dec_host_busy(struct Scsi_Host *shost, struct scsi_cmnd *cmd) > { > unsigned long flags; > > rcu_read_lock(); > - atomic_dec(&shost->host_busy); > + __clear_bit(SCMD_STATE_INFLIGHT, &cmd->state); > if (unlikely(scsi_host_in_recovery(shost))) { > spin_lock_irqsave(shost->host_lock, flags); > if (shost->host_failed || shost->host_eh_scheduled) > @@ -344,12 +344,12 @@ static void scsi_dec_host_busy(struct Scsi_Host *shost) > rcu_read_unlock(); > } > > -void scsi_device_unbusy(struct scsi_device *sdev) > +void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd) > { > struct Scsi_Host *shost = sdev->host; > struct scsi_target *starget = scsi_target(sdev); > > - scsi_dec_host_busy(shost); > + scsi_dec_host_busy(shost, cmd); > > if (starget->can_queue > 0) > atomic_dec(&starget->target_busy); > @@ -430,9 +430,6 @@ static inline bool scsi_target_is_busy(struct scsi_target *starget) > > static inline bool scsi_host_is_busy(struct Scsi_Host *shost) > { > - if (shost->can_queue > 0 && > - atomic_read(&shost->host_busy) >= shost->can_queue) > - return true; > if (atomic_read(&shost->host_blocked) > 0) > return true; > if (shost->host_self_blocked) > @@ -1139,6 +1136,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd) > unsigned int flags = cmd->flags & SCMD_PRESERVED_FLAGS; > unsigned long jiffies_at_alloc; > int retries; > + bool in_flight; > > if (!blk_rq_is_scsi(rq) && !(flags & SCMD_INITIALIZED)) { > flags |= SCMD_INITIALIZED; > @@ 
-1147,6 +1145,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd) > > jiffies_at_alloc = cmd->jiffies_at_alloc; > retries = cmd->retries; > + in_flight = test_bit(SCMD_STATE_INFLIGHT, &cmd->state); > /* zero out the cmd, except for the embedded scsi_request */ > memset((char *)cmd + sizeof(cmd->req), 0, > sizeof(*cmd) - sizeof(cmd->req) + dev->host->hostt->cmd_size); > @@ -1158,6 +1157,8 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd) > INIT_DELAYED_WORK(&cmd->abort_work, scmd_eh_abort_handler); > cmd->jiffies_at_alloc = jiffies_at_alloc; > cmd->retries = retries; > + if (in_flight) > + __set_bit(SCMD_STATE_INFLIGHT, &cmd->state); > > scsi_add_cmd_to_list(cmd); > } > @@ -1367,16 +1368,14 @@ static inline int scsi_target_queue_ready(struct Scsi_Host *shost, > */ > static inline int scsi_host_queue_ready(struct request_queue *q, > struct Scsi_Host *shost, > - struct scsi_device *sdev) > + struct scsi_device *sdev, > + struct scsi_cmnd *cmd) > { > - unsigned int busy; > - > if (scsi_host_in_recovery(shost)) > return 0; > > - busy = atomic_inc_return(&shost->host_busy) - 1; > if (atomic_read(&shost->host_blocked) > 0) { > - if (busy) > + if (scsi_host_busy(shost) > 0) > goto starved; > > /* > @@ -1390,8 +1389,6 @@ static inline int scsi_host_queue_ready(struct request_queue *q, > "unblocking host at zero depth\n")); > } > > - if (shost->can_queue > 0 && busy >= shost->can_queue) > - goto starved; > if (shost->host_self_blocked) > goto starved; > > @@ -1403,6 +1400,8 @@ static inline int scsi_host_queue_ready(struct request_queue *q, > spin_unlock_irq(shost->host_lock); > } > > + __set_bit(SCMD_STATE_INFLIGHT, &cmd->state); > + > return 1; > > starved: > @@ -1411,7 +1410,7 @@ static inline int scsi_host_queue_ready(struct request_queue *q, > list_add_tail(&sdev->starved_entry, &shost->starved_list); > spin_unlock_irq(shost->host_lock); > out_dec: > - scsi_dec_host_busy(shost); > + scsi_dec_host_busy(shost, cmd); > return 
0; > } > > @@ -1665,7 +1664,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, > ret = BLK_STS_RESOURCE; > if (!scsi_target_queue_ready(shost, sdev)) > goto out_put_budget; > - if (!scsi_host_queue_ready(q, shost, sdev)) > + if (!scsi_host_queue_ready(q, shost, sdev, cmd)) > goto out_dec_target_busy; > > if (!(req->rq_flags & RQF_DONTPREP)) { > @@ -1697,7 +1696,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, > return BLK_STS_OK; > > out_dec_host_busy: > - scsi_dec_host_busy(shost); > + scsi_dec_host_busy(shost, cmd); > out_dec_target_busy: > if (scsi_target(sdev)->can_queue > 0) > atomic_dec(&scsi_target(sdev)->target_busy); > diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h > index cc2859d76d81..3bff9f7aa684 100644 > --- a/drivers/scsi/scsi_priv.h > +++ b/drivers/scsi/scsi_priv.h > @@ -87,7 +87,7 @@ int scsi_noretry_cmd(struct scsi_cmnd *scmd); > extern void scsi_add_cmd_to_list(struct scsi_cmnd *cmd); > extern void scsi_del_cmd_from_list(struct scsi_cmnd *cmd); > extern int scsi_maybe_unblock_host(struct scsi_device *sdev); > -extern void scsi_device_unbusy(struct scsi_device *sdev); > +extern void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd); > extern void scsi_queue_insert(struct scsi_cmnd *cmd, int reason); > extern void scsi_io_completion(struct scsi_cmnd *, unsigned int); > extern void scsi_run_host_queues(struct Scsi_Host *shost); > diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h > index 91bd749a02f7..9c22e85902ec 100644 > --- a/include/scsi/scsi_cmnd.h > +++ b/include/scsi/scsi_cmnd.h > @@ -63,6 +63,7 @@ struct scsi_pointer { > > /* for scmd->state */ > #define SCMD_STATE_COMPLETE 0 > +#define SCMD_STATE_INFLIGHT 1 > > struct scsi_cmnd { > struct scsi_request req; > diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h > index 31e0d6ca1eba..d4452d0ea3c7 100644 > --- a/include/scsi/scsi_host.h > +++ b/include/scsi/scsi_host.h > @@ -345,7 +345,7 @@ struct 
scsi_host_template { > /* > * This determines if we will use a non-interrupt driven > * or an interrupt driven scheme. It is set to the maximum number > - * of simultaneous commands a given host adapter will accept. > + * of simultaneous commands a single hw queue in HBA will accept. > */ > int can_queue; > > @@ -551,7 +551,6 @@ struct Scsi_Host { > /* Area to keep a shared tag map */ > struct blk_mq_tag_set tag_set; > > - atomic_t host_busy; /* commands actually active on low-level */ > atomic_t host_blocked; > > unsigned int host_failed; /* commands that failed. > -- > 2.20.1 > Ping... Thanks, Ming Lei