On Wed, Jan 07, 2015 at 02:03:22PM +0100, Bart Van Assche wrote: > Since kernel v3.19-rc1 module_refcount() returns 1 instead of 0 > when called from inside module_exit(). This breaks the > module_refcount() test in scsi_device_put() and hence causes the > following kernel warning to be reported when unloading the ib_srp > kernel module: This is getting better, but I still think we need to sort out the root cause. The problemt started with commit 39b7f1e25 ("[SCSI] sd: Fix refcounting"), which added the calls to scsi_device_get in various struct device_driver/scsi_driver methods. From the BZ it seems like the rationale was to protect against races between ->rescan and ->remove, but instead of doing that using refcounting we better ensure that in the SCSI midlayer by taking scan_mutex around calls to ->rescan. The first attached patch does that, which allows us to functionally revert 39b7f1e25, which then also allows to revert 85b6c720 ("[SCSI] sd: fix cache flushing on module removal (and individual device removal)"). See the attached series to do that. Warnings: so far it only got minimal testing.
>From f92b9687a9663d4bf304303658acfe03167e0667 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@xxxxxx> Date: Thu, 8 Jan 2015 11:59:56 +0100 Subject: scsi: serialize ->rescan against ->remove Take the scan_mutex when rescanning a device to protect against a concurrent device removal. Signed-off-by: Christoph Hellwig <hch@xxxxxx> --- drivers/scsi/scsi_scan.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 0deb385..7d926cd 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -1569,16 +1569,18 @@ EXPORT_SYMBOL(scsi_add_device); void scsi_rescan_device(struct device *dev) { - if (!dev->driver) - return; + struct scsi_device *sdev = to_scsi_device(dev); + struct Scsi_Host *shost = sdev->host; - if (try_module_get(dev->driver->owner)) { + mutex_lock(&shost->scan_mutex); + if (dev->driver && try_module_get(dev->driver->owner)) { struct scsi_driver *drv = to_scsi_driver(dev->driver); if (drv->rescan) drv->rescan(dev); module_put(dev->driver->owner); } + mutex_unlock(&shost->scan_mutex); } EXPORT_SYMBOL(scsi_rescan_device); -- 1.9.1
>From c056e4f177617d095eca495f94753f16043eb38b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@xxxxxx> Date: Thu, 8 Jan 2015 12:07:05 +0100 Subject: sd: don't grab reference device references from driver methods The device model already takes care of races from ->remove and ->shutdown vs its other methods, and we now take care for the scsi-specific methods as well. This is a partial, manual revert of commit 39b7f1e25 ("[SCSI] sd: Fix refcounting"). Signed-off-by: Christoph Hellwig <hch@xxxxxx> --- drivers/scsi/sd.c | 55 +++++++++++-------------------------------------------- 1 file changed, 11 insertions(+), 44 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index ebf35cb6..be76130 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -564,10 +564,12 @@ static int sd_major(int major_idx) } } -static struct scsi_disk *__scsi_disk_get(struct gendisk *disk) +static struct scsi_disk *scsi_disk_get(struct gendisk *disk) { struct scsi_disk *sdkp = NULL; + mutex_lock(&sd_ref_mutex); + if (disk->private_data) { sdkp = scsi_disk(disk); if (scsi_device_get(sdkp->device) == 0) @@ -575,27 +577,6 @@ static struct scsi_disk *__scsi_disk_get(struct gendisk *disk) else sdkp = NULL; } - return sdkp; -} - -static struct scsi_disk *scsi_disk_get(struct gendisk *disk) -{ - struct scsi_disk *sdkp; - - mutex_lock(&sd_ref_mutex); - sdkp = __scsi_disk_get(disk); - mutex_unlock(&sd_ref_mutex); - return sdkp; -} - -static struct scsi_disk *scsi_disk_get_from_dev(struct device *dev) -{ - struct scsi_disk *sdkp; - - mutex_lock(&sd_ref_mutex); - sdkp = dev_get_drvdata(dev); - if (sdkp) - sdkp = __scsi_disk_get(sdkp->disk); mutex_unlock(&sd_ref_mutex); return sdkp; } @@ -610,8 +591,6 @@ static void scsi_disk_put(struct scsi_disk *sdkp) mutex_unlock(&sd_ref_mutex); } - - static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd, unsigned int dix, unsigned int dif) { @@ -1525,12 +1504,9 @@ static int sd_sync_cache(struct scsi_disk *sdkp) static void sd_rescan(struct device *dev) { - struct scsi_disk *sdkp = scsi_disk_get_from_dev(dev); + struct scsi_disk *sdkp = dev_get_drvdata(dev); - if (sdkp) { - revalidate_disk(sdkp->disk); - scsi_disk_put(sdkp); - } + revalidate_disk(sdkp->disk); } @@ -3147,13 +3123,13 @@ static int sd_start_stop_device(struct scsi_disk *sdkp, int start) */ static void sd_shutdown(struct device *dev) { - struct scsi_disk *sdkp = scsi_disk_get_from_dev(dev); + struct scsi_disk *sdkp = dev_get_drvdata(dev); if (!sdkp) return; /* this can happen */ if (pm_runtime_suspended(dev)) - goto exit; + return; if (sdkp->WCE && sdkp->media_present) { sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); @@ -3164,14 +3140,11 @@ static void sd_shutdown(struct device *dev) sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n"); sd_start_stop_device(sdkp, 0); } - -exit: - scsi_disk_put(sdkp); } static int sd_suspend_common(struct device *dev, bool ignore_stop_errors) { - struct scsi_disk *sdkp = scsi_disk_get_from_dev(dev); + struct scsi_disk *sdkp = dev_get_drvdata(dev); int ret = 0; if (!sdkp) @@ -3197,7 +3170,6 @@ static int sd_suspend_common(struct device *dev, bool ignore_stop_errors) } done: - scsi_disk_put(sdkp); return ret; } @@ -3213,18 +3185,13 @@ static int sd_suspend_runtime(struct device *dev) static int sd_resume(struct device *dev) { - struct scsi_disk *sdkp = scsi_disk_get_from_dev(dev); - int ret = 0; + struct scsi_disk *sdkp = dev_get_drvdata(dev); if (!sdkp->device->manage_start_stop) - goto done; + return 0; sd_printk(KERN_NOTICE, sdkp, "Starting disk\n"); - ret = sd_start_stop_device(sdkp, 1); - -done: - scsi_disk_put(sdkp); - return ret; + return sd_start_stop_device(sdkp, 1); } /** -- 1.9.1
>From 53378be96577c3fdb31dc1bdc0a73ec49171c763 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@xxxxxx> Date: Thu, 8 Jan 2015 12:10:29 +0100 Subject: Revert "[SCSI] sd: fix cache flushing on module removal (and individual device removal)" This reverts commit 85b6c720b0931101c8bcc3a5abdc2b8514b0fb4b. We now never call scsi_device_get from the shutdown path, and this workaround turned out to create more problems than it solves. Move back to properly checking the device state and carefully handle module refcounting including checking the return value. Signed-off-by: Christoph Hellwig <hch@xxxxxx> --- drivers/scsi/scsi.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 08c90a7..ff8f888 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -975,14 +975,14 @@ EXPORT_SYMBOL(scsi_report_opcode); */ int scsi_device_get(struct scsi_device *sdev) { - if (sdev->sdev_state == SDEV_DEL) + if (sdev->sdev_state == SDEV_DEL || sdev->sdev_state == SDEV_CANCEL) return -ENXIO; if (!get_device(&sdev->sdev_gendev)) return -ENXIO; - /* We can fail this if we're doing SCSI operations - * from module exit (like cache flush) */ - try_module_get(sdev->host->hostt->module); - + if (!try_module_get(sdev->host->hostt->module)) { + put_device(&sdev->sdev_gendev); + return -ENXIO; + } return 0; } EXPORT_SYMBOL(scsi_device_get); @@ -997,14 +997,7 @@ EXPORT_SYMBOL(scsi_device_get); */ void scsi_device_put(struct scsi_device *sdev) { -#ifdef CONFIG_MODULE_UNLOAD - struct module *module = sdev->host->hostt->module; - - /* The module refcount will be zero if scsi_device_get() - * was called from a module removal routine */ - if (module && module_refcount(module) != 0) - module_put(module); -#endif + module_put(sdev->host->hostt->module); put_device(&sdev->sdev_gendev); } EXPORT_SYMBOL(scsi_device_put); -- 1.9.1