Re: [PATCH] scsi/eh: fix hang adding ehandler wakeups after decrementing host_busy

Bart Van Assche <Bart.VanAssche@xxxxxxx> · Tue, 21 Nov 2017 16:53:18 +0000

On Tue, 2017-09-05 at 15:54 +0300, Pavel Tikhomirov wrote:
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index f6097b89d5d3..6c99221d60aa 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -320,12 +320,11 @@ void scsi_device_unbusy(struct scsi_device *sdev)
>  	if (starget->can_queue > 0)
>  		atomic_dec(&starget->target_busy);
>  
> +	spin_lock_irqsave(shost->host_lock, flags);
>  	if (unlikely(scsi_host_in_recovery(shost) &&
> -		     (shost->host_failed || shost->host_eh_scheduled))) {
> -		spin_lock_irqsave(shost->host_lock, flags);
> +		     (shost->host_failed || shost->host_eh_scheduled)))
>  		scsi_eh_wakeup(shost);
> -		spin_unlock_irqrestore(shost->host_lock, flags);
> -	}
> +	spin_unlock_irqrestore(shost->host_lock, flags);
>  
>  	atomic_dec(&sdev->device_busy);
>  }
> @@ -1503,6 +1502,13 @@ static inline int scsi_host_queue_ready(struct request_queue *q,
>  	spin_unlock_irq(shost->host_lock);
>  out_dec:
>  	atomic_dec(&shost->host_busy);
> +
> +	spin_lock_irq(shost->host_lock);
> +	if (unlikely(scsi_host_in_recovery(shost) &&
> +		     (shost->host_failed || shost->host_eh_scheduled)))
> +		scsi_eh_wakeup(shost);
> +	spin_unlock_irq(shost->host_lock);
> +
>  	return 0;
>  }
>  
> @@ -1964,6 +1970,13 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
>  
>  out_dec_host_busy:
>  	atomic_dec(&shost->host_busy);
> +
> +	spin_lock_irq(shost->host_lock);
> +	if (unlikely(scsi_host_in_recovery(shost) &&
> +		     (shost->host_failed || shost->host_eh_scheduled)))
> +		scsi_eh_wakeup(shost);
> +	spin_unlock_irq(shost->host_lock);
> +
>  out_dec_target_busy:
>  	if (scsi_target(sdev)->can_queue > 0)
>  		atomic_dec(&scsi_target(sdev)->target_busy);

An important achievement of the scsi-mq code was the removal of all
spin_lock_irq(shost->host_lock) statements from the hot path. The above
changes will have a significant negative performance impact, especially if
multiple LUNs associated with the same SCSI host are involved. Can the
reported race be fixed without slowing down the hot path significantly? I
think that adding either spin lock or smp_mb() calls to the hot path will
have a significant negative performance impact.

Thanks,

Bart.