Re: [PATCH v6 1/3] cxlflash: Base error recovery support

Brian King <brking@xxxxxxxxxxxxxxxxxx> · Mon, 17 Aug 2015 09:38:06 -0500

On 08/13/2015 09:47 PM, Matthew R. Ochs wrote:
> --- a/drivers/scsi/cxlflash/common.h
> +++ b/drivers/scsi/cxlflash/common.h
> @@ -76,6 +76,12 @@ enum cxlflash_init_state {
>  	INIT_STATE_SCSI
>  };
> 
> +enum cxlflash_state {
> +	STATE_NORMAL,	/* Normal running state, everything good */
> +	STATE_LIMBO,	/* Limbo running state, trying to reset/recover */

Might be more clear to call this STATE_RESETTING or STATE_RECOVERY

> +	STATE_FAILTERM	/* Failed/terminating state, error out users/threads */
> +};
> +
>  /*
>   * Each context has its own set of resource handles that is visible
>   * only from that context.

> @@ -105,7 +109,8 @@ struct cxlflash_cfg {
> 
>  	wait_queue_head_t tmf_waitq;
>  	bool tmf_active;
> -	u8 err_recovery_active:1;
> +	wait_queue_head_t limbo_waitq;

How about reset_waitq instead?

> +	enum cxlflash_state state;
>  };
> 
>  struct afu_cmd {

> @@ -455,9 +471,21 @@ static int cxlflash_eh_device_reset_handler(struct scsi_cmnd *scp)
>  		 get_unaligned_be32(&((u32 *)scp->cmnd)[2]),
>  		 get_unaligned_be32(&((u32 *)scp->cmnd)[3]));
> 
> -	rcr = send_tmf(afu, scp, TMF_LUN_RESET);
> -	if (unlikely(rcr))
> +	switch (cfg->state) {
> +	case STATE_NORMAL:
> +		rcr = send_tmf(afu, scp, TMF_LUN_RESET);
> +		if (unlikely(rcr))
> +			rc = FAILED;
> +		break;
> +	case STATE_LIMBO:
> +		wait_event(cfg->limbo_waitq, cfg->state != STATE_LIMBO);
> +		if (cfg->state == STATE_NORMAL)
> +			break;

In this case you've been asked to do a LUN reset but didn't actually reset the LUN. I'd suggest
restructuring this switch statement to send the LUN reset TMF in the case where you had
to wait for an AFU reset to complete.

> +		/* fall through */
> +	default:
>  		rc = FAILED;
> +		break;
> +	}
> 
>  	pr_debug("%s: returning rc=%d\n", __func__, rc);
>  	return rc;
> @@ -487,11 +515,29 @@ static int cxlflash_eh_host_reset_handler(struct scsi_cmnd *scp)
>  		 get_unaligned_be32(&((u32 *)scp->cmnd)[2]),
>  		 get_unaligned_be32(&((u32 *)scp->cmnd)[3]));
> 
> -	rcr = cxlflash_afu_reset(cfg);
> -	if (rcr == 0)
> -		rc = SUCCESS;
> -	else
> +	switch (cfg->state) {
> +	case STATE_NORMAL:
> +		cfg->state = STATE_LIMBO;
> +		scsi_block_requests(cfg->host);
> +
> +		rcr = cxlflash_afu_reset(cfg);
> +		if (rcr) {
> +			rc = FAILED;
> +			cfg->state = STATE_FAILTERM;
> +		} else
> +			cfg->state = STATE_NORMAL;
> +		wake_up_all(&cfg->limbo_waitq);
> +		scsi_unblock_requests(cfg->host);

The scsi_block_requests / scsi_unblock_requests is not necessary in this path, since SCSI EH
will already be preventing any new commands being issued via queuecommand.

> +		break;
> +	case STATE_LIMBO:
> +		wait_event(cfg->limbo_waitq, cfg->state != STATE_LIMBO);
> +		if (cfg->state == STATE_NORMAL)
> +			break;
> +		/* fall through */
> +	default:
>  		rc = FAILED;
> +		break;
> +	}
> 
>  	pr_debug("%s: returning rc=%d\n", __func__, rc);
>  	return rc;
> @@ -642,7 +688,7 @@ static void cxlflash_wait_for_pci_err_recovery(struct cxlflash_cfg *cfg)
>  	struct pci_dev *pdev = cfg->dev;
> 
>  	if (pci_channel_offline(pdev))
> -		wait_event_timeout(cfg->eeh_waitq,
> +		wait_event_timeout(cfg->limbo_waitq,
>  				   !pci_channel_offline(pdev),
>  				   CXLFLASH_PCI_ERROR_RECOVERY_TIMEOUT);
>  }
> @@ -825,6 +871,8 @@ static void cxlflash_remove(struct pci_dev *pdev)
>  						    !cfg->tmf_active);
>  	spin_unlock_irqrestore(&cfg->tmf_waitq.lock, lock_flags);
> 
> +	cfg->state = STATE_FAILTERM;

I don't see any locking around the setting or reading of this flag. What are the
implications if the processor reorders the store to change this state either here
or elsewhere. Same goes for the load associated with checking the state.

> +
>  	switch (cfg->init_state) {
>  	case INIT_STATE_SCSI:
>  		scsi_remove_host(cfg->host);
> @@ -1879,6 +1927,8 @@ static int init_afu(struct cxlflash_cfg *cfg)
>  	struct afu *afu = cfg->afu;
>  	struct device *dev = &cfg->dev->dev;
> 
> +	cxl_perst_reloads_same_image(cfg->cxl_afu, true);
> +
>  	rc = init_mc(cfg);
>  	if (rc) {
>  		dev_err(dev, "%s: call to init_mc failed, rc=%d!\n",

Reviewed-by: Brian King <brking@xxxxxxxxxxxxxxxxxx>

-- 
Brian King
Power Linux I/O
IBM Linux Technology Center

--
To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html