[PATCH 1/7] zfcp: introduce eh_timed_out handler

Andreas Herrmann <aherrman@xxxxxxxxxx> · Sat, 3 Sep 2005 14:30:42 +0200

zfcp: introduce eh_timed_out handler

This handler is required to prevent SCSI devices from being set offline
in a multipath setup when SCSI commands time out during cable pulls
lasting longer than 30 seconds.

Signed-off-by: Andreas Herrmann <aherrman@xxxxxxxxxx>

diff -Nup linux-2.6.13/drivers/s390/scsi-orig/zfcp_scsi.c linux-2.6.13/drivers/s390/scsi/zfcp_scsi.c

--- linux-2.6.13/drivers/s390/scsi-orig/zfcp_scsi.c	2005-09-03 12:17:16.000000000 +0200
+++ linux-2.6.13/drivers/s390/scsi/zfcp_scsi.c	2005-09-03 12:17:53.000000000 +0200
@@ -44,6 +44,7 @@ static int zfcp_scsi_eh_abort_handler(st
 static int zfcp_scsi_eh_device_reset_handler(struct scsi_cmnd *);
 static int zfcp_scsi_eh_bus_reset_handler(struct scsi_cmnd *);
 static int zfcp_scsi_eh_host_reset_handler(struct scsi_cmnd *);
+static enum scsi_eh_timer_return zfcp_scsi_eh_timed_out(struct scsi_cmnd *);
 static int zfcp_task_management_function(struct zfcp_unit *, u8);
 
 static struct zfcp_unit *zfcp_unit_lookup(struct zfcp_adapter *, int, scsi_id_t,
@@ -69,6 +70,7 @@ struct zfcp_data zfcp_data = {
 	      eh_device_reset_handler: zfcp_scsi_eh_device_reset_handler,
 	      eh_bus_reset_handler:    zfcp_scsi_eh_bus_reset_handler,
 	      eh_host_reset_handler:   zfcp_scsi_eh_host_reset_handler,
+	      eh_timed_out:            zfcp_scsi_eh_timed_out,
 			               /* FIXME(openfcp): Tune */
 	      can_queue:               4096,
 	      this_id:	               0,
@@ -242,7 +244,6 @@ static void
 zfcp_scsi_command_fail(struct scsi_cmnd *scpnt, int result)
 {
 	set_host_byte(&scpnt->result, result);
-	zfcp_cmd_dbf_event_scsi("failing", scpnt);
 	/* return directly */
 	scpnt->scsi_done(scpnt);
 }
@@ -414,59 +415,18 @@ zfcp_port_lookup(struct zfcp_adapter *ad
 	return (struct zfcp_port *) NULL;
 }
 
-/*
- * function:	zfcp_scsi_eh_abort_handler
- *
- * purpose:	tries to abort the specified (timed out) SCSI command
- *
- * note: 	We do not need to care for a SCSI command which completes
- *		normally but late during this abort routine runs.
- *		We are allowed to return late commands to the SCSI stack.
- *		It tracks the state of commands and will handle late commands.
- *		(Usually, the normal completion of late commands is ignored with
- *		respect to the running abort operation. Grep for 'done_late'
- *		in the SCSI stacks sources.)
- *
- * returns:	SUCCESS	- command has been aborted and cleaned up in internal
- *			  bookkeeping,
- *			  SCSI stack won't be called for aborted command
- *		FAILED	- otherwise
- */
 int
-__zfcp_scsi_eh_abort_handler(struct scsi_cmnd *scpnt)
+zfcp_scsi_abort_async(struct scsi_cmnd *scpnt,
+ 		      struct zfcp_fsf_req **fsf_req_ptr)
 {
-	int retval = SUCCESS;
-	struct zfcp_fsf_req *new_fsf_req, *old_fsf_req;
-	struct zfcp_adapter *adapter = (struct zfcp_adapter *) scpnt->device->host->hostdata[0];
+ 	struct Scsi_Host *host = scpnt->device->host;
+ 	struct zfcp_adapter *adapter = (struct zfcp_adapter *) host->hostdata[0];
 	struct zfcp_unit *unit = (struct zfcp_unit *) scpnt->device->hostdata;
-	struct zfcp_port *port = unit->port;
-	struct Scsi_Host *scsi_host = scpnt->device->host;
 	union zfcp_req_data *req_data = NULL;
+  	struct zfcp_fsf_req *new_fsf_req;
+  	struct zfcp_fsf_req *old_fsf_req;
+  	int req_flags;
 	unsigned long flags;
-	u32 status = 0;
-
-	/* the components of a abort_dbf record (fixed size record) */
-	u64 dbf_scsi_cmnd = (unsigned long) scpnt;
-	char dbf_opcode[ZFCP_ABORT_DBF_LENGTH];
-	wwn_t dbf_wwn = port->wwpn;
-	fcp_lun_t dbf_fcp_lun = unit->fcp_lun;
-	u64 dbf_retries = scpnt->retries;
-	u64 dbf_allowed = scpnt->allowed;
-	u64 dbf_timeout = 0;
-	u64 dbf_fsf_req = 0;
-	u64 dbf_fsf_status = 0;
-	u64 dbf_fsf_qual[2] = { 0, 0 };
-	char dbf_result[ZFCP_ABORT_DBF_LENGTH] = "##undef";
-
-	memset(dbf_opcode, 0, ZFCP_ABORT_DBF_LENGTH);
-	memcpy(dbf_opcode,
-	       scpnt->cmnd,
-	       min(scpnt->cmd_len, (unsigned char) ZFCP_ABORT_DBF_LENGTH));
-
-	ZFCP_LOG_INFO("aborting scsi_cmnd=%p on adapter %s\n",
-		      scpnt, zfcp_get_busid_by_adapter(adapter));
-
-	spin_unlock_irq(scsi_host->host_lock);
 
 	/*
 	 * Race condition between normal (late) completion and abort has
@@ -494,31 +454,18 @@ __zfcp_scsi_eh_abort_handler(struct scsi
 		 * Do not initiate abort but return SUCCESS.
 		 */
 		write_unlock_irqrestore(&adapter->abort_lock, flags);
-		retval = SUCCESS;
-		strncpy(dbf_result, "##late1", ZFCP_ABORT_DBF_LENGTH);
-		goto out;
+		return SUCCESS;
 	}
 
 	/* Figure out which fsf_req needs to be aborted. */
 	old_fsf_req = req_data->send_fcp_command_task.fsf_req;
 
-	dbf_fsf_req = (unsigned long) old_fsf_req;
-	dbf_timeout =
-	    (jiffies - req_data->send_fcp_command_task.start_jiffies) / HZ;
-
 	ZFCP_LOG_DEBUG("old_fsf_req=%p\n", old_fsf_req);
 	if (!old_fsf_req) {
 		write_unlock_irqrestore(&adapter->abort_lock, flags);
-		ZFCP_LOG_NORMAL("bug: no old fsf request found\n");
-		ZFCP_LOG_NORMAL("req_data:\n");
-		ZFCP_HEX_DUMP(ZFCP_LOG_LEVEL_NORMAL,
-			      (char *) req_data, sizeof (union zfcp_req_data));
-		ZFCP_LOG_NORMAL("scsi_cmnd:\n");
-		ZFCP_HEX_DUMP(ZFCP_LOG_LEVEL_NORMAL,
-			      (char *) scpnt, sizeof (struct scsi_cmnd));
-		retval = FAILED;
-		strncpy(dbf_result, "##bug:r", ZFCP_ABORT_DBF_LENGTH);
-		goto out;
+		if (fsf_req_ptr)
+			*fsf_req_ptr = NULL;
+		return SUCCESS;
 	}
 	old_fsf_req->data.send_fcp_command_task.scsi_cmnd = NULL;
 	/* mark old request as being aborted */
@@ -543,83 +490,101 @@ __zfcp_scsi_eh_abort_handler(struct scsi
 	 * all critical accesses to scsi_req are done.
 	 */
 	write_unlock_irqrestore(&adapter->abort_lock, flags);
+
+	req_flags = (!fsf_req_ptr) ? ZFCP_REQ_AUTO_CLEANUP : 0;
+	new_fsf_req = zfcp_fsf_abort_fcp_command(
+			(unsigned long) old_fsf_req, adapter, unit, req_flags);
+
 	/* call FSF routine which does the abort */
-	new_fsf_req = zfcp_fsf_abort_fcp_command((unsigned long) old_fsf_req,
-						 adapter, unit, 0);
-	ZFCP_LOG_DEBUG("new_fsf_req=%p\n", new_fsf_req);
 	if (!new_fsf_req) {
-		retval = FAILED;
-		ZFCP_LOG_NORMAL("error: initiation of Abort FCP Cmnd "
-				"failed\n");
-		strncpy(dbf_result, "##nores", ZFCP_ABORT_DBF_LENGTH);
-		goto out;
+		ZFCP_LOG_INFO("error: initiation of Abort FCP Command failed\n");
+		if (fsf_req_ptr)
+			*fsf_req_ptr = NULL;
+		return FAILED;
 	}
 
-	/* wait for completion of abort */
-	ZFCP_LOG_DEBUG("waiting for cleanup...\n");
-#if 1
-	/*
-	 * FIXME:
-	 * copying zfcp_fsf_req_wait_and_cleanup code is not really nice
-	 */
-	__wait_event(new_fsf_req->completion_wq,
-		     new_fsf_req->status & ZFCP_STATUS_FSFREQ_COMPLETED);
-	status = new_fsf_req->status;
-	dbf_fsf_status = new_fsf_req->qtcb->header.fsf_status;
-	/*
-	 * Ralphs special debug load provides timestamps in the FSF
-	 * status qualifier. This might be specified later if being
-	 * useful for debugging aborts.
-	 */
-	dbf_fsf_qual[0] =
-	    *(u64 *) & new_fsf_req->qtcb->header.fsf_status_qual.word[0];
-	dbf_fsf_qual[1] =
-	    *(u64 *) & new_fsf_req->qtcb->header.fsf_status_qual.word[2];
-	zfcp_fsf_req_free(new_fsf_req);
-#else
-	retval = zfcp_fsf_req_wait_and_cleanup(new_fsf_req,
-					       ZFCP_UNINTERRUPTIBLE, &status);
-#endif
-	ZFCP_LOG_DEBUG("Waiting for cleanup complete, status=0x%x\n", status);
-	/* status should be valid since signals were not permitted */
-	if (status & ZFCP_STATUS_FSFREQ_ABORTSUCCEEDED) {
-		retval = SUCCESS;
-		strncpy(dbf_result, "##succ", ZFCP_ABORT_DBF_LENGTH);
-	} else if (status & ZFCP_STATUS_FSFREQ_ABORTNOTNEEDED) {
-		retval = SUCCESS;
-		strncpy(dbf_result, "##late2", ZFCP_ABORT_DBF_LENGTH);
-	} else {
-		retval = FAILED;
-		strncpy(dbf_result, "##fail", ZFCP_ABORT_DBF_LENGTH);
-	}
+	if (fsf_req_ptr)
+		*fsf_req_ptr = new_fsf_req;
+	return SUCCESS;
+}
 
- out:
-	debug_event(adapter->abort_dbf, 1, &dbf_scsi_cmnd, sizeof (u64));
-	debug_event(adapter->abort_dbf, 1, &dbf_opcode, ZFCP_ABORT_DBF_LENGTH);
-	debug_event(adapter->abort_dbf, 1, &dbf_wwn, sizeof (wwn_t));
-	debug_event(adapter->abort_dbf, 1, &dbf_fcp_lun, sizeof (fcp_lun_t));
-	debug_event(adapter->abort_dbf, 1, &dbf_retries, sizeof (u64));
-	debug_event(adapter->abort_dbf, 1, &dbf_allowed, sizeof (u64));
-	debug_event(adapter->abort_dbf, 1, &dbf_timeout, sizeof (u64));
-	debug_event(adapter->abort_dbf, 1, &dbf_fsf_req, sizeof (u64));
-	debug_event(adapter->abort_dbf, 1, &dbf_fsf_status, sizeof (u64));
-	debug_event(adapter->abort_dbf, 1, &dbf_fsf_qual[0], sizeof (u64));
-	debug_event(adapter->abort_dbf, 1, &dbf_fsf_qual[1], sizeof (u64));
-	debug_text_event(adapter->abort_dbf, 1, dbf_result);
 
-	spin_lock_irq(scsi_host->host_lock);
+/*
+ * Abort @scpnt and wait for the abort request to complete.
+ * Returns SUCCESS or FAILED.
+ */
+int
+zfcp_scsi_abort_sync(struct scsi_cmnd *scpnt)
+{
+	/* preset: abort_async's early "late completion" return leaves it unset */
+	struct zfcp_fsf_req *fsf_req = NULL;
+	int retval;
+
+	retval = zfcp_scsi_abort_async(scpnt, &fsf_req);
+	if (!fsf_req)	/* no abort request outstanding, nothing to wait for */
+		return retval;
+
+	/* wait for completion of abort */
+	__wait_event(fsf_req->completion_wq,
+		     fsf_req->status & ZFCP_STATUS_FSFREQ_COMPLETED);
+	/* status should be valid since signals were not permitted */
+	if (fsf_req->status & (ZFCP_STATUS_FSFREQ_ABORTSUCCEEDED |
+			       ZFCP_STATUS_FSFREQ_ABORTNOTNEEDED))
+		retval = SUCCESS;
+	else
+		retval = FAILED;
+
+	zfcp_fsf_req_free(fsf_req);
 	return retval;
 }
 
+/**
+ * zfcp_scsi_eh_abort_handler - abort the specified SCSI command
+ * @scpnt: pointer to scsi_cmnd to be aborted
+ * Return: SUCCESS - command has been aborted and cleaned up in internal
+ *          bookkeeping, SCSI stack won't be called for aborted command
+ *         FAILED - otherwise
+ *
+ * We do not need to care about a SCSI command that completes normally
+ * but late, while this abort routine is running.  We are allowed to return
+ * late commands to the SCSI stack.  It tracks the state of commands and
+ * will handle late commands.  (Usually, the normal completion of late
+ * commands is ignored with respect to the running abort operation.)
+ */
 int
 zfcp_scsi_eh_abort_handler(struct scsi_cmnd *scpnt)
 {
-	int rc;
-	struct Scsi_Host *scsi_host = scpnt->device->host;
-	spin_lock_irq(scsi_host->host_lock);
-	rc = __zfcp_scsi_eh_abort_handler(scpnt);
-	spin_unlock_irq(scsi_host->host_lock);
-	return rc;
+	struct Scsi_Host *host = scpnt->device->host;
+	struct zfcp_adapter *adapter = (struct zfcp_adapter *) host->hostdata[0];
+	int retval;
+
+	ZFCP_LOG_INFO("aborting scsi_cmnd %p on adapter %s\n",
+		      scpnt, zfcp_get_busid_by_adapter(adapter));
+
+	retval = zfcp_scsi_abort_sync(scpnt);
+  
+  	return retval;
+}
+  
+/**
+ * zfcp_scsi_eh_timed_out - handle timed out SCSI command
+ * @scpnt: pointer to scsi command which timed out
+ * Sets host byte DID_NO_CONNECT and starts an abort without waiting for it.
+ * Return: EH_HANDLED - tells the SCSI layer that we never call scsi_done()
+ */
+enum scsi_eh_timer_return
+zfcp_scsi_eh_timed_out(struct scsi_cmnd *scpnt)
+{
+	struct Scsi_Host *host = scpnt->device->host;
+	struct zfcp_adapter *adapter = (struct zfcp_adapter *) host->hostdata[0];
+
+	ZFCP_LOG_INFO("scsi_cmnd %p on adapter %s timed out\n",
+		      scpnt, zfcp_get_busid_by_adapter(adapter));
+
+        set_host_byte(&scpnt->result, DID_NO_CONNECT);
+	zfcp_scsi_abort_async(scpnt, NULL);
+
+	return EH_HANDLED;
 }
 
 /*
-
To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html