On Tue, 2017-11-14 at 18:01 +0100, Jack Wang wrote: > I suspect we run into same bug you were trying to fix in this patch > set. we're running in v4.4.50 > > I was trying to reproduce it, but no lucky yet, do you still have your > reproducer? Hello Jack, I can reproduce this in about every fifth run of test one of the srp-test software, using the SRP initiator and target drivers of what will become kernel v4.15-rc1, by switching the ib_srpt driver from non-SRQ to SRQ mode while the initiator is logging in. I'm currently analyzing where in the block layer a queue run is missing. The patch below for the sd driver does not fix the root cause but seems to help. Bart. Subject: [PATCH] Increase SCSI disk probing concurrency --- drivers/scsi/scsi.c | 5 ----- drivers/scsi/scsi_pm.c | 6 ++++-- drivers/scsi/scsi_priv.h | 1 - drivers/scsi/sd.c | 26 +++++++++++++++++++++----- drivers/scsi/sd.h | 1 + include/scsi/scsi_driver.h | 1 + 6 files changed, 27 insertions(+), 13 deletions(-) diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index a7e4fba724b7..e6d69e647f6a 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -85,10 +85,6 @@ unsigned int scsi_logging_level; EXPORT_SYMBOL(scsi_logging_level); #endif -/* sd, scsi core and power management need to coordinate flushing async actions */ -ASYNC_DOMAIN(scsi_sd_probe_domain); -EXPORT_SYMBOL(scsi_sd_probe_domain); - /* * Separate domain (from scsi_sd_probe_domain) to maximize the benefit of * asynchronous system resume operations. 
It is marked 'exclusive' to avoid @@ -839,7 +835,6 @@ static void __exit exit_scsi(void) scsi_exit_devinfo(); scsi_exit_procfs(); scsi_exit_queue(); - async_unregister_domain(&scsi_sd_probe_domain); } subsys_initcall(init_scsi); diff --git a/drivers/scsi/scsi_pm.c b/drivers/scsi/scsi_pm.c index b44c1bb687a2..d8e43c2f4d40 100644 --- a/drivers/scsi/scsi_pm.c +++ b/drivers/scsi/scsi_pm.c @@ -171,9 +171,11 @@ static int scsi_bus_resume_common(struct device *dev, static int scsi_bus_prepare(struct device *dev) { if (scsi_is_sdev_device(dev)) { - /* sd probing uses async_schedule. Wait until it finishes. */ - async_synchronize_full_domain(&scsi_sd_probe_domain); + struct scsi_driver *drv = to_scsi_driver(dev->driver); + /* sd probing happens asynchronously. Wait until it finishes. */ + if (drv->sync) + drv->sync(dev); } else if (scsi_is_host_device(dev)) { /* Wait until async scanning is finished */ scsi_complete_async_scans(); diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h index dab29f538612..bf0cadf6a321 100644 --- a/drivers/scsi/scsi_priv.h +++ b/drivers/scsi/scsi_priv.h @@ -174,7 +174,6 @@ static inline void scsi_autopm_put_host(struct Scsi_Host *h) {} #endif /* CONFIG_PM */ extern struct async_domain scsi_sd_pm_domain; -extern struct async_domain scsi_sd_probe_domain; /* scsi_dh.c */ #ifdef CONFIG_SCSI_DH diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 0313486d85c8..c26dbb38b60c 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -112,6 +112,7 @@ static void sd_shutdown(struct device *); static int sd_suspend_system(struct device *); static int sd_suspend_runtime(struct device *); static int sd_resume(struct device *); +static void sd_sync_probe_domain(struct device *dev); static void sd_rescan(struct device *); static int sd_init_command(struct scsi_cmnd *SCpnt); static void sd_uninit_command(struct scsi_cmnd *SCpnt); @@ -564,6 +565,7 @@ static struct scsi_driver sd_template = { .shutdown = sd_shutdown, .pm = &sd_pm_ops, }, + .sync 
= sd_sync_probe_domain, .rescan = sd_rescan, .init_command = sd_init_command, .uninit_command = sd_uninit_command, @@ -3221,9 +3223,9 @@ static int sd_format_disk_name(char *prefix, int index, char *buf, int buflen) /* * The asynchronous part of sd_probe */ -static void sd_probe_async(void *data, async_cookie_t cookie) +static void sd_probe_async(struct work_struct *work) { - struct scsi_disk *sdkp = data; + struct scsi_disk *sdkp = container_of(work, typeof(*sdkp), probe_work); struct scsi_device *sdp; struct gendisk *gd; u32 index; @@ -3326,6 +3328,8 @@ static int sd_probe(struct device *dev) if (!sdkp) goto out; + INIT_WORK(&sdkp->probe_work, sd_probe_async); + gd = alloc_disk(SD_MINORS); if (!gd) goto out_free; @@ -3377,8 +3381,8 @@ static int sd_probe(struct device *dev) get_device(dev); dev_set_drvdata(dev, sdkp); - get_device(&sdkp->dev); /* prevent release before async_schedule */ - async_schedule_domain(sd_probe_async, sdkp, &scsi_sd_probe_domain); + get_device(&sdkp->dev); /* prevent release before sd_probe_async() */ + WARN_ON_ONCE(!queue_work(system_unbound_wq, &sdkp->probe_work)); return 0; @@ -3395,6 +3399,18 @@ static int sd_probe(struct device *dev) return error; } +static void sd_wait_for_probing(struct scsi_disk *sdkp) +{ + flush_work(&sdkp->probe_work); +} + +static void sd_sync_probe_domain(struct device *dev) +{ + struct scsi_disk *sdkp = dev_get_drvdata(dev); + + sd_wait_for_probing(sdkp); +} + /** * sd_remove - called whenever a scsi disk (previously recognized by * sd_probe) is detached from the system. 
It is called (potentially @@ -3416,7 +3432,7 @@ static int sd_remove(struct device *dev) scsi_autopm_get_device(sdkp->device); async_synchronize_full_domain(&scsi_sd_pm_domain); - async_synchronize_full_domain(&scsi_sd_probe_domain); + sd_wait_for_probing(sdkp); device_del(&sdkp->dev); del_gendisk(sdkp->disk); sd_shutdown(dev); diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 7b57dafcd45a..2cc47183c9aa 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -81,6 +81,7 @@ struct scsi_disk { unsigned int zones_optimal_nonseq; unsigned int zones_max_open; #endif + struct work_struct probe_work; atomic_t openers; sector_t capacity; /* size in logical blocks */ u32 max_xfer_blocks; diff --git a/include/scsi/scsi_driver.h b/include/scsi/scsi_driver.h index a5534ccad859..145d6239eecf 100644 --- a/include/scsi/scsi_driver.h +++ b/include/scsi/scsi_driver.h @@ -11,6 +11,7 @@ struct scsi_device; struct scsi_driver { struct device_driver gendrv; + void (*sync)(struct device *); void (*rescan)(struct device *); int (*init_command)(struct scsi_cmnd *); void (*uninit_command)(struct scsi_cmnd *); -- 2.15.0