On 29/10/2021 19:31, Bart Van Assche wrote:
> On 10/29/21 6:37 AM, Adrian Hunter wrote:
>> Use flags and memory barriers instead of the clk_scaling_lock in
>> ufshcd_queuecommand(). This is done to prepare for potentially faster IOPS
>> in the future.
>>
>> On an x86 test system (Samsung Galaxy Book S), for random 4k reads this cut
>> the time of ufshcd_queuecommand() from about 690 ns to 460 ns. With larger
>> I/O sizes, the increased duration of DMA mapping made the difference
>> increasingly negligible. Random 4k read IOPS was not affected, remaining at
>> about 97 kIOPS.
>
> Hi Adrian,
>
> That's an excellent performance improvement!
>
> Earlier this week I looked into this myself and came up with a different
> approach. My patches add less new code. Please take a look at the patches
> below.
>
> Thanks,
>
> Bart.
>
>
> [PATCH 1/3] scsi: ufs: Do not block SCSI requests from the EE handler
>
> It is not necessary to prevent ufshcd_queuecommand() calls while the EE
> handler is in progress. Additionally, calling ufshcd_scsi_block_requests()
> only prevents new ufshcd_queuecommand() calls and does not wait until
> existing calls have finished. Hence remove the ufshcd_scsi_block_requests()
> and ufshcd_scsi_unblock_requests() calls from the EE handler.
>
> Signed-off-by: Bart Van Assche <bvanassche@xxxxxxx>
> ---
> drivers/scsi/ufs/ufshcd.c | 5 +----
> 1 file changed, 1 insertion(+), 4 deletions(-)
>
> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> index bd0178b284f1..903750c836be 100644
> --- a/drivers/scsi/ufs/ufshcd.c
> +++ b/drivers/scsi/ufs/ufshcd.c
> @@ -5815,12 +5815,11 @@ static void ufshcd_exception_event_handler(struct work_struct *work)
> u32 status = 0;
> hba = container_of(work, struct ufs_hba, eeh_work);
>
> - ufshcd_scsi_block_requests(hba);

I suspect that this was to facilitate the quick handling of urgent
background ops, so I am not sure it should be removed.

> err = ufshcd_get_ee_status(hba, &status);
> if (err) {
> dev_err(hba->dev, "%s: failed to get exception status %d\n",
> __func__, err);
> - goto out;
> + return;
> }
>
> trace_ufshcd_exception_event(dev_name(hba->dev), status);
> @@ -5832,8 +5831,6 @@ static void ufshcd_exception_event_handler(struct work_struct *work)
> ufshcd_temp_exception_event_handler(hba, status);
>
> ufs_debugfs_exception_event(hba, status);
> -out:
> - ufshcd_scsi_unblock_requests(hba);
> }
>
> /* Complete requests that have door-bell cleared */
>
>
>
> Subject: [PATCH 2/3] scsi: ufs: Fix a race condition related to clock scaling
>
> Calling ufshcd_scsi_block_requests() after ungate_work has been queued
> does not guarantee that SCSI requests are blocked before the clock ungating
> work starts. Fix this race condition by moving the
> ufshcd_scsi_block_requests() call into ufshcd_ungate_work().
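
To illustrate the race described above (my reading of it, simplified;
this is an illustration, not code from either series):

	CPU0 (ufshcd_hold)               CPU1 (clk_gating_workq)

	queue_work(&ungate_work)
	                                 ufshcd_ungate_work() runs and
	                                 ungates the clocks
	ufshcd_scsi_block_requests(hba)  <- too late: requests were never
	                                    blocked while the work ran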
>
> Signed-off-by: Bart Van Assche <bvanassche@xxxxxxx>
> ---
> drivers/scsi/ufs/ufshcd.c | 7 ++++---
> 1 file changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> index 903750c836be..f35e7ab9054e 100644
> --- a/drivers/scsi/ufs/ufshcd.c
> +++ b/drivers/scsi/ufs/ufshcd.c
> @@ -1624,6 +1624,8 @@ static void ufshcd_ungate_work(struct work_struct *work)
>
> cancel_delayed_work_sync(&hba->clk_gating.gate_work);
>
> + ufshcd_scsi_block_requests(hba);
> +
> spin_lock_irqsave(hba->host->host_lock, flags);
> if (hba->clk_gating.state == CLKS_ON) {
> spin_unlock_irqrestore(hba->host->host_lock, flags);
> @@ -1714,9 +1716,8 @@ int ufshcd_hold(struct ufs_hba *hba, bool async)
> hba->clk_gating.state = REQ_CLKS_ON;
> trace_ufshcd_clk_gating(dev_name(hba->dev),
> hba->clk_gating.state);
> - if (queue_work(hba->clk_gating.clk_gating_workq,
> - &hba->clk_gating.ungate_work))
> - ufshcd_scsi_block_requests(hba);
> + queue_work(hba->clk_gating.clk_gating_workq,
> + &hba->clk_gating.ungate_work);
> /*
> * fall through to check if we should wait for this
> * work to be done or not.
>
>
> Subject: [PATCH 3/3] scsi: ufs: Remove the clock scaling lock
>
> Remove the clock scaling lock since it is a performance bottleneck for the
> hot path. Freeze request queues instead of calling scsi_block_requests()
> from inside ufshcd_clock_scaling_prepare(). Use synchronize_rcu() to
> wait for ongoing ufshcd_queuecommand() calls.
>
> Signed-off-by: Bart Van Assche <bvanassche@xxxxxxx>
> ---
> drivers/scsi/ufs/ufshcd.c | 146 ++++++++++----------------------------
> drivers/scsi/ufs/ufshcd.h | 1 -
> 2 files changed, 38 insertions(+), 109 deletions(-)
>
> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> index f35e7ab9054e..1b694be807bd 100644
> --- a/drivers/scsi/ufs/ufshcd.c
> +++ b/drivers/scsi/ufs/ufshcd.c
> @@ -1069,65 +1069,6 @@ static bool ufshcd_is_devfreq_scaling_required(struct ufs_hba *hba,
> return false;
> }
>
> -static int ufshcd_wait_for_doorbell_clr(struct ufs_hba *hba,
> - u64 wait_timeout_us)
> -{
> - unsigned long flags;
> - int ret = 0;
> - u32 tm_doorbell;
> - u32 tr_doorbell;
> - bool timeout = false, do_last_check = false;
> - ktime_t start;
> -
> - ufshcd_hold(hba, false);
> - spin_lock_irqsave(hba->host->host_lock, flags);
> - /*
> - * Wait for all the outstanding tasks/transfer requests.
> - * Verify by checking the doorbell registers are clear.
> - */
> - start = ktime_get();
> - do {
> - if (hba->ufshcd_state != UFSHCD_STATE_OPERATIONAL) {
> - ret = -EBUSY;
> - goto out;
> - }
> -
> - tm_doorbell = ufshcd_readl(hba, REG_UTP_TASK_REQ_DOOR_BELL);
> - tr_doorbell = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL);
> - if (!tm_doorbell && !tr_doorbell) {
> - timeout = false;
> - break;
> - } else if (do_last_check) {
> - break;
> - }
> -
> - spin_unlock_irqrestore(hba->host->host_lock, flags);
> - schedule();
> - if (ktime_to_us(ktime_sub(ktime_get(), start)) >
> - wait_timeout_us) {
> - timeout = true;
> - /*
> - * We might have scheduled out for long time so make
> - * sure to check if doorbells are cleared by this time
> - * or not.
> - */
> - do_last_check = true;
> - }
> - spin_lock_irqsave(hba->host->host_lock, flags);
> - } while (tm_doorbell || tr_doorbell);
> -
> - if (timeout) {
> - dev_err(hba->dev,
> - "%s: timedout waiting for doorbell to clear (tm=0x%x, tr=0x%x)\n",
> - __func__, tm_doorbell, tr_doorbell);
> - ret = -EBUSY;
> - }
> -out:
> - spin_unlock_irqrestore(hba->host->host_lock, flags);
> - ufshcd_release(hba);
> - return ret;
> -}
> -
> /**
> * ufshcd_scale_gear - scale up/down UFS gear
> * @hba: per adapter instance
> @@ -1175,20 +1116,22 @@ static int ufshcd_scale_gear(struct ufs_hba *hba, bool scale_up)
>
> static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba)
> {
> - #define DOORBELL_CLR_TOUT_US (1000 * 1000) /* 1 sec */
> int ret = 0;
> +
> /*
> - * make sure that there are no outstanding requests when
> - * clock scaling is in progress
> + * Make sure that there are no outstanding requests while clock scaling
> + * is in progress. Since the error handler may submit TMFs, limit the
> + * time during which to wait for the doorbell registers to clear in
> + * order not to block the UFS error handler.
> */
> - ufshcd_scsi_block_requests(hba);
> - down_write(&hba->clk_scaling_lock);
> -
> - if (!hba->clk_scaling.is_allowed ||
> - ufshcd_wait_for_doorbell_clr(hba, DOORBELL_CLR_TOUT_US)) {
> + blk_freeze_queue_start(hba->cmd_queue);
> + blk_freeze_queue_start(hba->tmf_queue);
> + if (blk_mq_freeze_queue_wait_timeout(hba->cmd_queue, HZ) <= 0 ||
> + blk_mq_freeze_queue_wait_timeout(hba->tmf_queue, HZ / 10) <= 0 ||
> + !READ_ONCE(hba->clk_scaling.is_allowed)) {
> ret = -EBUSY;
> - up_write(&hba->clk_scaling_lock);
> - ufshcd_scsi_unblock_requests(hba);
> + blk_mq_unfreeze_queue(hba->tmf_queue);
> + blk_mq_unfreeze_queue(hba->cmd_queue);
> goto out;
> }
>
> @@ -1199,13 +1142,10 @@ static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba)
> return ret;
> }
>
> -static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba, bool writelock)
> +static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba)
> {
> - if (writelock)
> - up_write(&hba->clk_scaling_lock);
> - else
> - up_read(&hba->clk_scaling_lock);
> - ufshcd_scsi_unblock_requests(hba);
> + blk_mq_unfreeze_queue(hba->tmf_queue);
> + blk_mq_unfreeze_queue(hba->cmd_queue);
> ufshcd_release(hba);
> }
>
> @@ -1220,8 +1160,7 @@ static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba, bool writelock)
> */
> static int ufshcd_devfreq_scale(struct ufs_hba *hba, bool scale_up)
> {
> - int ret = 0;
> - bool is_writelock = true;
> + int ret;
>
> ret = ufshcd_clock_scaling_prepare(hba);
> if (ret)
> @@ -1249,14 +1188,19 @@ static int ufshcd_devfreq_scale(struct ufs_hba *hba, bool scale_up)
> goto out_unprepare;
> }
> }
> -
> - /* Enable Write Booster if we have scaled up else disable it */
> - downgrade_write(&hba->clk_scaling_lock);
> - is_writelock = false;
> + ufshcd_scsi_block_requests(hba);
> + ufshcd_clock_scaling_unprepare(hba);
> + /*
> + * Enable the Write Booster if we have scaled up else disable it.
> + * ufshcd_wb_toggle() calls ufshcd_exec_dev_cmd() indirectly and hence
> + * must be called after ufshcd_clock_scaling_unprepare().
> + */
> ufshcd_wb_toggle(hba, scale_up);
> + ufshcd_scsi_unblock_requests(hba);
> + return ret;
>
> out_unprepare:
> - ufshcd_clock_scaling_unprepare(hba, is_writelock);
> + ufshcd_clock_scaling_unprepare(hba);
> return ret;
> }
>
> @@ -2696,9 +2640,6 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
>
> WARN_ONCE(tag < 0, "Invalid tag %d\n", tag);
>
> - if (!down_read_trylock(&hba->clk_scaling_lock))
> - return SCSI_MLQUEUE_HOST_BUSY;
> -
> switch (hba->ufshcd_state) {
> case UFSHCD_STATE_OPERATIONAL:
> break;
> @@ -2782,9 +2723,8 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
> }
>
> ufshcd_send_command(hba, tag);
> -out:
> - up_read(&hba->clk_scaling_lock);
>
> +out:
> if (ufs_trigger_eh()) {
> unsigned long flags;
>
> @@ -2946,8 +2886,6 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba,
> int err;
> int tag;
>
> - down_read(&hba->clk_scaling_lock);
>
> /*
> * Get free slot, sleep if slots are unavailable.
> * Even though we use wait_event() which sleeps indefinitely,
> @@ -2982,7 +2920,6 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba,
> out:
> blk_put_request(req);
> out_unlock:
> - up_read(&hba->clk_scaling_lock);
> return err;
> }
>
> @@ -5936,9 +5873,7 @@ void ufshcd_schedule_eh_work(struct ufs_hba *hba)
>
> static void ufshcd_clk_scaling_allow(struct ufs_hba *hba, bool allow)
> {
> - down_write(&hba->clk_scaling_lock);
> - hba->clk_scaling.is_allowed = allow;
> - up_write(&hba->clk_scaling_lock);
> + WRITE_ONCE(hba->clk_scaling.is_allowed, allow);
> }
>
> static void ufshcd_clk_scaling_suspend(struct ufs_hba *hba, bool suspend)
> @@ -5985,9 +5920,12 @@ static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
> ufshcd_clk_scaling_allow(hba, false);
> }
> ufshcd_scsi_block_requests(hba);
> - /* Drain ufshcd_queuecommand() */
> - down_write(&hba->clk_scaling_lock);
> - up_write(&hba->clk_scaling_lock);
> + /*
> + * Drain ufshcd_queuecommand(). Since ufshcd_queuecommand() does not
> + * sleep, calling synchronize_rcu() is sufficient to wait for ongoing
> + * ufshcd_queuecommand calls.
> + */
> + synchronize_rcu();

This depends upon block layer internals, so it must be called via a block
layer function, i.e. the block layer must guarantee this will always work.
Also, scsi_unjam_host() does not look like it uses RCU when issuing
requests.

> cancel_work_sync(&hba->eeh_work);
> }
>
> @@ -6212,11 +6150,8 @@ static void ufshcd_err_handler(struct work_struct *work)
> */
> if (needs_restore) {
> spin_unlock_irqrestore(hba->host->host_lock, flags);
> - /*
> - * Hold the scaling lock just in case dev cmds
> - * are sent via bsg and/or sysfs.
> - */
> - down_write(&hba->clk_scaling_lock);
> + /* Block TMF submission, e.g. through BSG or sysfs. */

What about dev cmds? Also, there is the outstanding question of
synchronization for the call to ufshcd_reset_and_restore() further down.
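
For illustration, covering dev cmds as well could look roughly like this
(sketch only, untested, assuming dev cmds are still submitted through
hba->cmd_queue as elsewhere in this series):

	/* Block both dev cmd and TMF submission, e.g. via BSG or sysfs. */
	blk_mq_freeze_queue(hba->cmd_queue);
	blk_mq_freeze_queue(hba->tmf_queue);
	/* ... force_pmc / ufshcd_config_pwr_mode() etc. ... */
	blk_mq_unfreeze_queue(hba->tmf_queue);
	blk_mq_unfreeze_queue(hba->cmd_queue);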
> + blk_mq_freeze_queue(hba->tmf_queue);
> hba->force_pmc = true;
> pmc_err = ufshcd_config_pwr_mode(hba, &(hba->pwr_info));
> if (pmc_err) {
> @@ -6226,7 +6161,7 @@ static void ufshcd_err_handler(struct work_struct *work)
> }
> hba->force_pmc = false;
> ufshcd_print_pwr_info(hba);
> - up_write(&hba->clk_scaling_lock);
> + blk_mq_unfreeze_queue(hba->tmf_queue);
> spin_lock_irqsave(hba->host->host_lock, flags);
> }
>
> @@ -6725,8 +6660,6 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
> int tag;
> u8 upiu_flags;
>
> - down_read(&hba->clk_scaling_lock);
>
> req = blk_get_request(q, REQ_OP_DRV_OUT, 0);
> if (IS_ERR(req)) {
> err = PTR_ERR(req);
> @@ -6804,7 +6737,6 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
>
> blk_put_request(req);
> out_unlock:
> - up_read(&hba->clk_scaling_lock);
> return err;
> }
>
> @@ -7982,7 +7914,7 @@ static int ufshcd_add_lus(struct ufs_hba *hba)
> &hba->pwr_info,
> sizeof(struct ufs_pa_layer_attr));
> hba->clk_scaling.saved_pwr_info.is_valid = true;
> - hba->clk_scaling.is_allowed = true;
> + WRITE_ONCE(hba->clk_scaling.is_allowed, true);
>
> ret = ufshcd_devfreq_init(hba);
> if (ret)
> @@ -9558,8 +9490,6 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq)
> /* Initialize mutex for exception event control */
> mutex_init(&hba->ee_ctrl_mutex);
>
> - init_rwsem(&hba->clk_scaling_lock);
> -
> ufshcd_init_clk_gating(hba);
>
> ufshcd_init_clk_scaling(hba);
> diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
> index 0820d409585a..5514528cca58 100644
> --- a/drivers/scsi/ufs/ufshcd.h
> +++ b/drivers/scsi/ufs/ufshcd.h
> @@ -902,7 +902,6 @@ struct ufs_hba {
> enum bkops_status urgent_bkops_lvl;
> bool is_urgent_bkops_lvl_checked;
>
> - struct rw_semaphore clk_scaling_lock;
> unsigned char desc_size[QUERY_DESC_IDN_MAX];
> atomic_t scsi_block_reqs_cnt;
>
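
As an aside, if the goal is to drain ufshcd_queuecommand() through an API
whose semantics the block layer actually guarantees, blk_mq_quiesce_queue()
waits for ongoing dispatches to finish. A sketch of what I have in mind
(helper name hypothetical, untested; needs <scsi/scsi_device.h> and
<linux/blk-mq.h>):

	/*
	 * Drain ufshcd_queuecommand() via the block layer instead of
	 * relying on synchronize_rcu() matching blk-mq internals.
	 */
	static void ufshcd_quiesce_scsi_devices(struct ufs_hba *hba)
	{
		struct scsi_device *sdev;

		shost_for_each_device(sdev, hba->host)
			blk_mq_quiesce_queue(sdev->request_queue);
	}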