3.16.47-rc1 review patch. If anyone has any objections, please let me know. ------------------ From: Hannes Reinecke <hare@xxxxxxx> commit 7a38dc0bfb4cc39ed57e120e2224673f3d4d200f upstream. The current medium access timeout counter will be increased for each command, so if there are enough failed commands we'll hit the medium access timeout for even a single device failure and the following kernel message is displayed: sd H:C:T:L: [sdXY] Medium access timeout failure. Offlining disk! Fix this by making the timeout per EH run, ie the counter will only be increased once per device and EH run. Fixes: 18a4d0a ("[SCSI] Handle disk devices which can not process medium access commands") Cc: Ewan Milne <emilne@xxxxxxxxxx> Cc: Lawrence Obermann <loberman@xxxxxxxxxx> Cc: Benjamin Block <bblock@xxxxxxxxxxxxxxxxxx> Cc: Steffen Maier <maier@xxxxxxxxxxxxxxxxxx> Signed-off-by: Hannes Reinecke <hare@xxxxxxxx> Reviewed-by: Christoph Hellwig <hch@xxxxxx> Signed-off-by: Martin K. Petersen <martin.petersen@xxxxxxxxxx> [bwh: Backported to 3.16: - Open-code blk_rq_is_passthrough() - Adjust context] Signed-off-by: Ben Hutchings <ben@xxxxxxxxxxxxxxx> --- drivers/scsi/scsi_error.c | 18 ++++++++++++++++++ drivers/scsi/sd.c | 27 ++++++++++++++++++++++++++- drivers/scsi/sd.h | 1 + include/scsi/scsi_driver.h | 1 + 4 files changed, 46 insertions(+), 1 deletion(-) --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -224,6 +224,23 @@ scsi_abort_command(struct scsi_cmnd *scm } /** + * scsi_eh_reset - call into ->eh_action to reset internal counters + * @scmd: scmd to run eh on. + * + * The scsi driver might be carrying internal state about the + * devices, so we need to call into the driver to reset the + * internal state once the error handler is started. + */ +static void scsi_eh_reset(struct scsi_cmnd *scmd) +{ + if (scmd->request->cmd_type == REQ_TYPE_FS) { + struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); + if (sdrv->eh_reset) + sdrv->eh_reset(scmd); + } +} + +/** * scsi_eh_scmd_add - add scsi cmd to error handling. * @scmd: scmd to run eh on. * @eh_flag: optional SCSI_EH flag. @@ -252,6 +269,7 @@ int scsi_eh_scmd_add(struct scsi_cmnd *s if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) eh_flag &= ~SCSI_EH_CANCEL_CMD; scmd->eh_eflags |= eh_flag; + scsi_eh_reset(scmd); list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q); shost->host_failed++; scsi_eh_wakeup(shost); --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -112,6 +112,7 @@ static void sd_rescan(struct device *); static int sd_init_command(struct scsi_cmnd *SCpnt); static void sd_uninit_command(struct scsi_cmnd *SCpnt); static int sd_done(struct scsi_cmnd *); +static void sd_eh_reset(struct scsi_cmnd *); static int sd_eh_action(struct scsi_cmnd *, int); static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer); static void scsi_disk_release(struct device *cdev); @@ -509,6 +510,7 @@ static struct scsi_driver sd_template = .uninit_command = sd_uninit_command, .done = sd_done, .eh_action = sd_eh_action, + .eh_reset = sd_eh_reset, }; /* @@ -1536,6 +1538,26 @@ static const struct block_device_operati }; /** + * sd_eh_reset - reset error handling callback + * @scmd: sd-issued command that has failed + * + * This function is called by the SCSI midlayer before starting + * SCSI EH. When counting medium access failures we have to be + * careful to register it only only once per device and SCSI EH run; + * there might be several timed out commands which will cause the + * 'max_medium_access_timeouts' counter to trigger after the first + * SCSI EH run already and set the device to offline. + * So this function resets the internal counter before starting SCSI EH. + **/ +static void sd_eh_reset(struct scsi_cmnd *scmd) +{ + struct scsi_disk *sdkp = scsi_disk(scmd->request->rq_disk); + + /* New SCSI EH run, reset gate variable */ + sdkp->ignore_medium_access_errors = false; +} + +/** * sd_eh_action - error handling callback * @scmd: sd-issued command that has failed * @eh_disp: The recovery disposition suggested by the midlayer @@ -1564,7 +1586,10 @@ static int sd_eh_action(struct scsi_cmnd * process of recovering or has it suffered an internal failure * that prevents access to the storage medium. */ - sdkp->medium_access_timed_out++; + if (!sdkp->ignore_medium_access_errors) { + sdkp->medium_access_timed_out++; + sdkp->ignore_medium_access_errors = true; + } /* * If the device keeps failing read/write commands but TEST UNIT --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -90,6 +90,7 @@ struct scsi_disk { unsigned lbpvpd : 1; unsigned ws10 : 1; unsigned ws16 : 1; + unsigned ignore_medium_access_errors : 1; }; #define to_scsi_disk(obj) container_of(obj,struct scsi_disk,dev) --- a/include/scsi/scsi_driver.h +++ b/include/scsi/scsi_driver.h @@ -17,6 +17,7 @@ struct scsi_driver { void (*uninit_command)(struct scsi_cmnd *); int (*done)(struct scsi_cmnd *); int (*eh_action)(struct scsi_cmnd *, int); + void (*eh_reset)(struct scsi_cmnd *); }; #define to_scsi_driver(drv) \ container_of((drv), struct scsi_driver, gendrv)