On Tue, Oct 06, 2020 at 03:07:14PM +0200, Håkon Bugge wrote: > Fixes: b7d8e64fa9db ("IB/mlx4: Add support for REJ due to timeout") > > Signed-off-by: Håkon Bugge <haakon.bugge@xxxxxxxxxx> > drivers/infiniband/hw/mlx4/cm.c | 73 +++++++++++++++--------------------- > drivers/infiniband/hw/mlx4/mlx4_ib.h | 4 +- > 2 files changed, 32 insertions(+), 45 deletions(-) > > diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c > index 0ce4b5a..6c7986b 100644 > +++ b/drivers/infiniband/hw/mlx4/cm.c > @@ -58,9 +58,7 @@ struct rej_tmout_entry { > int slave; > u32 rem_pv_cm_id; > struct delayed_work timeout; > - struct radix_tree_root *rej_tmout_root; > - /* Points to the mutex protecting this radix-tree */ > - struct mutex *lock; > + struct xarray *xa_rej_tmout; > }; > > struct cm_generic_msg { > @@ -350,9 +348,7 @@ static void rej_tmout_timeout(struct work_struct *work) > struct rej_tmout_entry *item = container_of(delay, struct rej_tmout_entry, timeout); > struct rej_tmout_entry *deleted; > > - mutex_lock(item->lock); > - deleted = radix_tree_delete_item(item->rej_tmout_root, item->rem_pv_cm_id, NULL); > - mutex_unlock(item->lock); > + deleted = xa_cmpxchg(item->xa_rej_tmout, item->rem_pv_cm_id, item, NULL, 0); > > if (deleted != item) > pr_debug("deleted(%p) != item(%p)\n", deleted, item); > @@ -363,14 +359,13 @@ static void rej_tmout_timeout(struct work_struct *work) > static int alloc_rej_tmout(struct mlx4_ib_sriov *sriov, u32 rem_pv_cm_id, int slave) > { > struct rej_tmout_entry *item; > - int sts; > + struct rej_tmout_entry *old; > + > + item = xa_load(&sriov->xa_rej_tmout, (unsigned long)rem_pv_cm_id); The locking that was here looks wrong, rej_tmout_timeout() is a work that could run at any time and kfree(item), so some kind of lock must be held across every touch to item Holding the xa_lock until the mod_delayed_work is done would be ok? > static int lookup_rej_tmout_slave(struct mlx4_ib_sriov *sriov, u32 rem_pv_cm_id) > { > struct rej_tmout_entry *item; > > - mutex_lock(&sriov->rej_tmout_lock); > - item = radix_tree_lookup(&sriov->rej_tmout_root, (unsigned long)rem_pv_cm_id); > - mutex_unlock(&sriov->rej_tmout_lock); > + item = xa_load(&sriov->xa_rej_tmout, (unsigned long)rem_pv_cm_id); > > - if (!item || IS_ERR(item)) { > + if (!item || xa_err(item)) { > pr_debug("Could not find slave. rem_pv_cm_id 0x%x error: %d\n", > - rem_pv_cm_id, (int)PTR_ERR(item)); > - return !item ? -ENOENT : PTR_ERR(item); > + rem_pv_cm_id, xa_err(item)); > + return !item ? -ENOENT : xa_err(item); > } > > return item->slave; Here too > + xa_lock(&sriov->xa_rej_tmout); > + xa_for_each(&sriov->xa_rej_tmout, id, item) { > if (slave < 0 || slave == item->slave) { > mod_delayed_work(system_wq, &item->timeout, 0); > flush_needed = true; > ++cnt; > } > } > - mutex_unlock(&sriov->rej_tmout_lock); > + xa_unlock(&sriov->xa_rej_tmout); This is OK Jason