Re: [PATCH] libsas: fix error handling

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



A similar change went into the SAS/SATL layer I support
shortly after I posted the version which you forked off.
The git date of the commit is: Tue Feb 7 22:14:54 2006 -0800.

BTW, the problem you're "debugging" may require a protocol
trace and a sequencer update by Adaptec.

    Luben

--- On Wed, 2/20/08, James Bottomley <James.Bottomley@xxxxxxxxxxxxxxxxxxxxx> wrote:

> From: James Bottomley <James.Bottomley@xxxxxxxxxxxxxxxxxxxxx>
> Subject: [PATCH] libsas: fix error handling
> To: "linux-scsi" <linux-scsi@xxxxxxxxxxxxxxx>
> Date: Wednesday, February 20, 2008, 8:07 AM
> The libsas error handler has two fairly fatal bugs
> 
> 1. scsi_sas_task_done calls scsi_eh_finish_cmd() too early.
>  This
>    happens if the task completes after it has been aborted
> but before
>    the error handler starts up.  Because
> scsi_eh_finish_cmd()
>    decrements host_failed and adds the task to the done
> list, the
>    error handler start check (host_failed == host_busy)
> never passes
>    and the eh never starts.
> 
> 2. The multiple task completion paths
> sas_scsi_clear_queue_... all
>    simply delete the task from the error queue.  This
> causes it to
>    disappear into the ether, since a command must be placed
> on the
>    done queue to be finished off by the error handler. 
> This behaviour
>    causes the HBA to hang on pending commands.
> 
> Fix 1. by removing the SAS_TASK_STATE_ABORTED check and
> calling
> ->scsi_done() unconditionally (it is a nop if the timer
> has fired).
> This keeps the task in the error handling queue until the
> eh starts.
> 
> Fix 2. by making sure every task goes through task complete
> followed
> by scsi_eh_finish_cmd().
> 
> Tested this by firing resets across a disk running a hammer
> test (now
> it actually survives without hanging the system)
> 
> Signed-off-by: James Bottomley
> <James.Bottomley@xxxxxxxxxxxxxxxxxxxxx>
> ---
>  drivers/scsi/libsas/sas_scsi_host.c |   53
> +++++++++++++++++++----------------
>  1 files changed, 29 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/scsi/libsas/sas_scsi_host.c
> b/drivers/scsi/libsas/sas_scsi_host.c
> index f869fba..b656e29 100644
> --- a/drivers/scsi/libsas/sas_scsi_host.c
> +++ b/drivers/scsi/libsas/sas_scsi_host.c
> @@ -51,8 +51,6 @@ static void sas_scsi_task_done(struct
> sas_task *task)
>  {
>  	struct task_status_struct *ts =
> &task->task_status;
>  	struct scsi_cmnd *sc = task->uldd_task;
> -	struct sas_ha_struct *sas_ha =
> SHOST_TO_SAS_HA(sc->device->host);
> -	unsigned ts_flags = task->task_state_flags;
>  	int hs = 0, stat = 0;
>  
>  	if (unlikely(!sc)) {
> @@ -120,11 +118,7 @@ static void sas_scsi_task_done(struct
> sas_task *task)
>  	sc->result = (hs << 16) | stat;
>  	list_del_init(&task->list);
>  	sas_free_task(task);
> -	/* This is very ugly but this is how SCSI Core works. */
> -	if (ts_flags & SAS_TASK_STATE_ABORTED)
> -		scsi_eh_finish_cmd(sc, &sas_ha->eh_done_q);
> -	else
> -		sc->scsi_done(sc);
> +	sc->scsi_done(sc);
>  }
>  
>  static enum task_attribute sas_scsi_get_task_attr(struct
> scsi_cmnd *cmd)
> @@ -255,13 +249,27 @@ out:
>  	return res;
>  }
>  
> +static void sas_eh_finish_cmd(struct scsi_cmnd *cmd)
> +{
> +	struct sas_task *task = TO_SAS_TASK(cmd);
> +	struct sas_ha_struct *sas_ha =
> SHOST_TO_SAS_HA(cmd->device->host);
> +
> +	/* First off call task_done.  However, task will
> +	 * be free'd after this */
> +	task->task_done(task);
> +	/* now finish the command and move it on to the error
> +	 * handler done list, this also takes it off the
> +	 * error handler pending list */
> +	scsi_eh_finish_cmd(cmd, &sas_ha->eh_done_q);
> +}
> +
>  static void sas_scsi_clear_queue_lu(struct list_head
> *error_q, struct scsi_cmnd *my_cmd)
>  {
>  	struct scsi_cmnd *cmd, *n;
>  
>  	list_for_each_entry_safe(cmd, n, error_q, eh_entry) {
>  		if (cmd == my_cmd)
> -			list_del_init(&cmd->eh_entry);
> +			sas_eh_finish_cmd(cmd);
>  	}
>  }
>  
> @@ -274,7 +282,7 @@ static void
> sas_scsi_clear_queue_I_T(struct list_head *error_q,
>  		struct domain_device *x = cmd_to_domain_dev(cmd);
>  
>  		if (x == dev)
> -			list_del_init(&cmd->eh_entry);
> +			sas_eh_finish_cmd(cmd);
>  	}
>  }
>  
> @@ -288,7 +296,7 @@ static void
> sas_scsi_clear_queue_port(struct list_head *error_q,
>  		struct asd_sas_port *x = dev->port;
>  
>  		if (x == port)
> -			list_del_init(&cmd->eh_entry);
> +			sas_eh_finish_cmd(cmd);
>  	}
>  }
>  
> @@ -528,14 +536,14 @@ Again:
>  		case TASK_IS_DONE:
>  			SAS_DPRINTK("%s: task 0x%p is done\n",
> __FUNCTION__,
>  				    task);
> -			task->task_done(task);
> +			sas_eh_finish_cmd(cmd);
>  			if (need_reset)
>  				try_to_reset_cmd_device(shost, cmd);
>  			continue;
>  		case TASK_IS_ABORTED:
>  			SAS_DPRINTK("%s: task 0x%p is aborted\n",
>  				    __FUNCTION__, task);
> -			task->task_done(task);
> +			sas_eh_finish_cmd(cmd);
>  			if (need_reset)
>  				try_to_reset_cmd_device(shost, cmd);
>  			continue;
> @@ -547,7 +555,7 @@ Again:
>  					    "recovered\n",
>  					    SAS_ADDR(task->dev),
>  					    cmd->device->lun);
> -				task->task_done(task);
> +				sas_eh_finish_cmd(cmd);
>  				if (need_reset)
>  					try_to_reset_cmd_device(shost, cmd);
>  				sas_scsi_clear_queue_lu(work_q, cmd);
> @@ -562,7 +570,7 @@ Again:
>  			if (tmf_resp == TMF_RESP_FUNC_COMPLETE) {
>  				SAS_DPRINTK("I_T %016llx recovered\n",
>  					    SAS_ADDR(task->dev->sas_addr));
> -				task->task_done(task);
> +				sas_eh_finish_cmd(cmd);
>  				if (need_reset)
>  					try_to_reset_cmd_device(shost, cmd);
>  				sas_scsi_clear_queue_I_T(work_q, task->dev);
> @@ -577,7 +585,7 @@ Again:
>  				if (res == TMF_RESP_FUNC_COMPLETE) {
>  					SAS_DPRINTK("clear nexus port:%d "
>  						    "succeeded\n", port->id);
> -					task->task_done(task);
> +					sas_eh_finish_cmd(cmd);
>  					if (need_reset)
>  						try_to_reset_cmd_device(shost, cmd);
>  					sas_scsi_clear_queue_port(work_q,
> @@ -591,10 +599,10 @@ Again:
>  				if (res == TMF_RESP_FUNC_COMPLETE) {
>  					SAS_DPRINTK("clear nexus ha "
>  						    "succeeded\n");
> -					task->task_done(task);
> +					sas_eh_finish_cmd(cmd);
>  					if (need_reset)
>  						try_to_reset_cmd_device(shost, cmd);
> -					goto out;
> +					goto clear_q;
>  				}
>  			}
>  			/* If we are here -- this means that no amount
> @@ -606,21 +614,18 @@ Again:
>  				    SAS_ADDR(task->dev->sas_addr),
>  				    cmd->device->lun);
>  
> -			task->task_done(task);
> +			sas_eh_finish_cmd(cmd);
>  			if (need_reset)
>  				try_to_reset_cmd_device(shost, cmd);
>  			goto clear_q;
>  		}
>  	}
> -out:
>  	return list_empty(work_q);
>  clear_q:
>  	SAS_DPRINTK("--- Exit %s -- clear_q\n",
> __FUNCTION__);
> -	list_for_each_entry_safe(cmd, n, work_q, eh_entry) {
> -		struct sas_task *task = TO_SAS_TASK(cmd);
> -		list_del_init(&cmd->eh_entry);
> -		task->task_done(task);
> -	}
> +	list_for_each_entry_safe(cmd, n, work_q, eh_entry)
> +		sas_eh_finish_cmd(cmd);
> +
>  	return list_empty(work_q);
>  }
>  
> -- 
> 1.5.4.1
> 
> 
> 
> -
> To unsubscribe from this list: send the line
> "unsubscribe linux-scsi" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at 
> http://vger.kernel.org/majordomo-info.html
-
To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Index of Archives]     [SCSI Target Devel]     [Linux SCSI Target Infrastructure]     [Kernel Newbies]     [IDE]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux ATA RAID]     [Linux IIO]     [Samba]     [Device Mapper]
  Powered by Linux