On 07-04-22, 17:39, Pierre-Louis Bossart wrote: > In the SoundWire probe, we store a pointer from the driver ops into > the 'slave' structure. This can lead to kernel oopses when unbinding > codec drivers, e.g. with the following sequence to remove machine > driver and codec driver. > > /sbin/modprobe -r snd_soc_sof_sdw > /sbin/modprobe -r snd_soc_rt711 > > The full details can be found in the BugLink below, for reference the > two following examples show different cases of driver ops/callbacks > being invoked after the driver .remove(). > > kernel: BUG: kernel NULL pointer dereference, address: 0000000000000150 > kernel: Workqueue: events cdns_update_slave_status_work [soundwire_cadence] > kernel: RIP: 0010:mutex_lock+0x19/0x30 > kernel: Call Trace: > kernel: ? sdw_handle_slave_status+0x426/0xe00 [soundwire_bus 94ff184bf398570c3f8ff7efe9e32529f532e4ae] > kernel: ? newidle_balance+0x26a/0x400 > kernel: ? cdns_update_slave_status_work+0x1e9/0x200 [soundwire_cadence 1bcf98eebe5ba9833cd433323769ac923c9c6f82] > > kernel: BUG: unable to handle page fault for address: ffffffffc07654c8 > kernel: Workqueue: pm pm_runtime_work > kernel: RIP: 0010:sdw_bus_prep_clk_stop+0x6f/0x160 [soundwire_bus] > kernel: Call Trace: > kernel: <TASK> > kernel: sdw_cdns_clock_stop+0xb5/0x1b0 [soundwire_cadence 1bcf98eebe5ba9833cd433323769ac923c9c6f82] > kernel: intel_suspend_runtime+0x5f/0x120 [soundwire_intel aca858f7c87048d3152a4a41bb68abb9b663a1dd] > kernel: ? dpm_sysfs_remove+0x60/0x60 > > This was not detected earlier in Intel tests since the tests first > remove the parent PCI device and shut down the bus. The sequence > above is a corner case which keeps the bus operational but without a > driver bound. > > This patch removes the use the 'slave' ops and uses proper locking to > make sure there are no live callbacks based on a dangling pointer > executed during or after the driver unbinding sequence. In one > specific case, indicated with comments in the code, the device lock is > already taken at a higher level when starting the resume operations. > > The issue with the ops pointer has been there since December 2017, but > there were so many changes in the bus handling code that this patch > will not apply cleanly all the way to this initial commit. The changes > can be easily backported though. > > Thanks to Dan Williams for his suggestions on an earlier version of > this patch. > > BugLink: https://github.com/thesofproject/linux/issues/3531 > Fixes: 56d4fe31af77 ("soundwire: Add MIPI DisCo property helpers") > Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@xxxxxxxxxxxxxxx> > --- > > This is a follow-up on the initial discussion in https://lore.kernel.org/alsa-devel/d0559e97-c4a0-b817-428c-d3e305390270@xxxxxxxxxxxxxxx/ > > I could use feedback on whether using device_lock() is appropriate and Looking at other uses of device_lock() it seems apt to me > test results on non-Intel platforms. Thanks! > Pierre > > drivers/soundwire/bus.c | 78 ++++++++++++++++++++++++++++-------- > drivers/soundwire/bus_type.c | 6 +-- > drivers/soundwire/stream.c | 57 +++++++++++++++++--------- > 3 files changed, 102 insertions(+), 39 deletions(-) > > diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c > index 8b7a680f388e..545b379a119e 100644 > --- a/drivers/soundwire/bus.c > +++ b/drivers/soundwire/bus.c > @@ -7,6 +7,7 @@ > #include <linux/pm_runtime.h> > #include <linux/soundwire/sdw_registers.h> > #include <linux/soundwire/sdw.h> > +#include <linux/soundwire/sdw_type.h> > #include "bus.h" > #include "sysfs_local.h" > > @@ -846,12 +847,18 @@ static int sdw_slave_clk_stop_callback(struct sdw_slave *slave, > enum sdw_clk_stop_mode mode, > enum sdw_clk_stop_type type) > { > - int ret; > + struct device *dev = &slave->dev; > + struct sdw_driver *drv; > > - if (slave->ops && slave->ops->clk_stop) { > - ret = slave->ops->clk_stop(slave, mode, type); > - if (ret < 0) > - return ret; > + /* > + * this function can only be called from a pm_runtime > + * sequence where the device is already locked > + */ If this is guaranteed.. > + > + if (dev->driver) { do we need to check this? Did you find a case where this was not valid while device is locked, maybe do this while holding the lock (kind of moot to process the calls if driver is gone) > + drv = drv_to_sdw_driver(dev->driver); > + if (drv && drv->ops && drv->ops->clk_stop) > + return drv->ops->clk_stop(slave, mode, type); > } > > return 0; > @@ -1616,14 +1623,25 @@ static int sdw_handle_slave_alerts(struct sdw_slave *slave) > } > > /* Update the Slave driver */ > - if (slave_notify && slave->ops && > - slave->ops->interrupt_callback) { > - slave_intr.sdca_cascade = sdca_cascade; > - slave_intr.control_port = clear; > - memcpy(slave_intr.port, &port_status, > - sizeof(slave_intr.port)); > - > - slave->ops->interrupt_callback(slave, &slave_intr); > + if (slave_notify) { > + struct device *dev = &slave->dev; > + struct sdw_driver *drv; > + > + device_lock(dev); device is locked > + > + if (dev->driver) { Same here as well > + drv = drv_to_sdw_driver(dev->driver); > + if (drv && drv->ops && drv->ops->interrupt_callback) { > + slave_intr.sdca_cascade = sdca_cascade; > + slave_intr.control_port = clear; > + memcpy(slave_intr.port, &port_status, > + sizeof(slave_intr.port)); > + > + drv->ops->interrupt_callback(slave, &slave_intr); > + } > + } > + > + device_unlock(dev); > } > > /* Ack interrupt */ > @@ -1697,7 +1715,12 @@ static int sdw_handle_slave_alerts(struct sdw_slave *slave) > static int sdw_update_slave_status(struct sdw_slave *slave, > enum sdw_slave_status status) > { > + struct device *dev = &slave->dev; > + struct sdw_driver *drv; > unsigned long time; > + int ret = 0; > + > + device_lock_assert(dev); > > if (!slave->probed) { > /* > @@ -1716,10 +1739,13 @@ static int sdw_update_slave_status(struct sdw_slave *slave, > } > } > > - if (!slave->ops || !slave->ops->update_status) > - return 0; > + if (dev->driver) { > + drv = drv_to_sdw_driver(dev->driver); > + if (drv && drv->ops && drv->ops->update_status) > + ret = drv->ops->update_status(slave, status); > + } > > - return slave->ops->update_status(slave, status); > + return ret; > } > > /** > @@ -1828,7 +1854,10 @@ int sdw_handle_slave_status(struct sdw_bus *bus, > break; > } > > + device_lock(&slave->dev); > ret = sdw_update_slave_status(slave, status[i]); > + device_unlock(&slave->dev); > + > if (ret < 0) > dev_err(&slave->dev, > "Update Slave status failed:%d\n", ret); > @@ -1860,6 +1889,7 @@ EXPORT_SYMBOL(sdw_handle_slave_status); > void sdw_clear_slave_status(struct sdw_bus *bus, u32 request) > { > struct sdw_slave *slave; > + bool lock; > int i; > > /* Check all non-zero devices */ > @@ -1878,7 +1908,23 @@ void sdw_clear_slave_status(struct sdw_bus *bus, u32 request) > if (slave->status != SDW_SLAVE_UNATTACHED) { > sdw_modify_slave_status(slave, SDW_SLAVE_UNATTACHED); > slave->first_interrupt_done = false; > + > + lock = device_trylock(&slave->dev); should we proceed if we dont get a lock? also why the trylock variant. We can do the lock, this is wq context > + > + /* > + * this bus/manager-level function can only be called from > + * a resume sequence. If the peripheral device (child of the > + * manager device) is locked, this indicates a resume operation > + * initiated by the device core to deal with .remove() or .shutdown() > + * at the peripheral level. With the parent-child order enforced > + * by PM frameworks on resume, the peripheral resume has not started > + * yet, so it's safe to assume the lock will not be released while > + * the update_status callback is invoked. > + */ > sdw_update_slave_status(slave, SDW_SLAVE_UNATTACHED); > + > + if (lock) > + device_unlock(&slave->dev); > } > > /* keep track of request, used in pm_runtime resume */ > diff --git a/drivers/soundwire/bus_type.c b/drivers/soundwire/bus_type.c > index 893296f3fe39..91f39c8c119a 100644 > --- a/drivers/soundwire/bus_type.c > +++ b/drivers/soundwire/bus_type.c > @@ -98,8 +98,6 @@ static int sdw_drv_probe(struct device *dev) > if (!id) > return -ENODEV; > > - slave->ops = drv->ops; > - > /* > * attach to power domain but don't turn on (last arg) > */ > @@ -118,8 +116,8 @@ static int sdw_drv_probe(struct device *dev) > } > > /* device is probed so let's read the properties now */ > - if (slave->ops && slave->ops->read_prop) > - slave->ops->read_prop(slave); > + if (drv->ops && drv->ops->read_prop) > + drv->ops->read_prop(slave); > > /* init the sysfs as we have properties now */ > ret = sdw_slave_sysfs_init(slave); > diff --git a/drivers/soundwire/stream.c b/drivers/soundwire/stream.c > index f273459b2023..7862b4403d14 100644 > --- a/drivers/soundwire/stream.c > +++ b/drivers/soundwire/stream.c > @@ -13,6 +13,7 @@ > #include <linux/slab.h> > #include <linux/soundwire/sdw_registers.h> > #include <linux/soundwire/sdw.h> > +#include <linux/soundwire/sdw_type.h> > #include <sound/soc.h> > #include "bus.h" > > @@ -401,20 +402,26 @@ static int sdw_do_port_prep(struct sdw_slave_runtime *s_rt, > struct sdw_prepare_ch prep_ch, > enum sdw_port_prep_ops cmd) > { > - const struct sdw_slave_ops *ops = s_rt->slave->ops; > - int ret; > + struct device *dev = &s_rt->slave->dev; > + struct sdw_driver *drv; > + int ret = 0; > > - if (ops->port_prep) { > - ret = ops->port_prep(s_rt->slave, &prep_ch, cmd); > - if (ret < 0) { > - dev_err(&s_rt->slave->dev, > - "Slave Port Prep cmd %d failed: %d\n", > - cmd, ret); > - return ret; > + device_lock(dev); > + > + if (dev->driver) { > + drv = drv_to_sdw_driver(dev->driver); > + if (drv && drv->ops && drv->ops->port_prep) { > + ret = drv->ops->port_prep(s_rt->slave, &prep_ch, cmd); > + if (ret < 0) > + dev_err(&s_rt->slave->dev, > + "Slave Port Prep cmd %d failed: %d\n", > + cmd, ret); > } > } > > - return 0; > + device_unlock(dev); > + > + return ret; > } > > static int sdw_prep_deprep_slave_ports(struct sdw_bus *bus, > @@ -578,7 +585,7 @@ static int sdw_notify_config(struct sdw_master_runtime *m_rt) > struct sdw_slave_runtime *s_rt; > struct sdw_bus *bus = m_rt->bus; > struct sdw_slave *slave; > - int ret = 0; > + int ret; > > if (bus->ops->set_bus_conf) { > ret = bus->ops->set_bus_conf(bus, &bus->params); > @@ -587,19 +594,31 @@ static int sdw_notify_config(struct sdw_master_runtime *m_rt) > } > > list_for_each_entry(s_rt, &m_rt->slave_rt_list, m_rt_node) { > - slave = s_rt->slave; > + struct sdw_driver *drv; > + struct device *dev; > > - if (slave->ops->bus_config) { > - ret = slave->ops->bus_config(slave, &bus->params); > - if (ret < 0) { > - dev_err(bus->dev, "Notify Slave: %d failed\n", > - slave->dev_num); > - return ret; > + slave = s_rt->slave; > + dev = &slave->dev; > + > + device_lock(dev); > + > + if (dev->driver) { > + drv = drv_to_sdw_driver(dev->driver); > + if (drv && drv->ops && drv->ops->bus_config) { > + ret = drv->ops->bus_config(slave, &bus->params); > + if (ret < 0) { > + device_unlock(dev); > + dev_err(bus->dev, "Notify Slave: %d failed\n", > + slave->dev_num); > + return ret; > + } > } > } > + > + device_unlock(dev); > } > > - return ret; > + return 0; > } > > /** > -- > 2.30.2 -- ~Vinod