> On 8 Oct 2020, at 20:56, Jason Gunthorpe <jgg@xxxxxxxx> wrote: > > On Tue, Oct 06, 2020 at 03:07:14PM +0200, Håkon Bugge wrote: >> Fixes: b7d8e64fa9db ("IB/mlx4: Add support for REJ due to timeout") >> >> Signed-off-by: Håkon Bugge <haakon.bugge@xxxxxxxxxx> >> drivers/infiniband/hw/mlx4/cm.c | 73 +++++++++++++++--------------------- >> drivers/infiniband/hw/mlx4/mlx4_ib.h | 4 +- >> 2 files changed, 32 insertions(+), 45 deletions(-) >> >> diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c >> index 0ce4b5a..6c7986b 100644 >> +++ b/drivers/infiniband/hw/mlx4/cm.c >> @@ -58,9 +58,7 @@ struct rej_tmout_entry { >> int slave; >> u32 rem_pv_cm_id; >> struct delayed_work timeout; >> - struct radix_tree_root *rej_tmout_root; >> - /* Points to the mutex protecting this radix-tree */ >> - struct mutex *lock; >> + struct xarray *xa_rej_tmout; >> }; >> >> struct cm_generic_msg { >> @@ -350,9 +348,7 @@ static void rej_tmout_timeout(struct work_struct *work) >> struct rej_tmout_entry *item = container_of(delay, struct rej_tmout_entry, timeout); >> struct rej_tmout_entry *deleted; >> >> - mutex_lock(item->lock); >> - deleted = radix_tree_delete_item(item->rej_tmout_root, item->rem_pv_cm_id, NULL); >> - mutex_unlock(item->lock); >> + deleted = xa_cmpxchg(item->xa_rej_tmout, item->rem_pv_cm_id, item, NULL, 0); >> >> if (deleted != item) >> pr_debug("deleted(%p) != item(%p)\n", deleted, item); >> @@ -363,14 +359,13 @@ static void rej_tmout_timeout(struct work_struct *work) >> static int alloc_rej_tmout(struct mlx4_ib_sriov *sriov, u32 rem_pv_cm_id, int slave) >> { >> struct rej_tmout_entry *item; >> - int sts; >> + struct rej_tmout_entry *old; >> + >> + item = xa_load(&sriov->xa_rej_tmout, (unsigned long)rem_pv_cm_id); > > The locking that was here looks wrong, rej_tmout_timeout() is a work > that could run at any time and kfree(item), so some kind of lock must > be held across every touch to item > > Holding the xa_lock until the mod_delayed_work is done would be ok? Good catch. I focused too much on the XArray itself. That works, but as you point out, dereferencing item with no locking is a no-no. Will send a v2. Thxs for review, Håkon > >> static int lookup_rej_tmout_slave(struct mlx4_ib_sriov *sriov, u32 rem_pv_cm_id) >> { >> struct rej_tmout_entry *item; >> >> - mutex_lock(&sriov->rej_tmout_lock); >> - item = radix_tree_lookup(&sriov->rej_tmout_root, (unsigned long)rem_pv_cm_id); >> - mutex_unlock(&sriov->rej_tmout_lock); >> + item = xa_load(&sriov->xa_rej_tmout, (unsigned long)rem_pv_cm_id); >> >> - if (!item || IS_ERR(item)) { >> + if (!item || xa_err(item)) { >> pr_debug("Could not find slave. rem_pv_cm_id 0x%x error: %d\n", >> - rem_pv_cm_id, (int)PTR_ERR(item)); >> - return !item ? -ENOENT : PTR_ERR(item); >> + rem_pv_cm_id, xa_err(item)); >> + return !item ? -ENOENT : xa_err(item); >> } >> >> return item->slave; > > Here too > >> + xa_lock(&sriov->xa_rej_tmout); >> + xa_for_each(&sriov->xa_rej_tmout, id, item) { >> if (slave < 0 || slave == item->slave) { >> mod_delayed_work(system_wq, &item->timeout, 0); >> flush_needed = true; >> ++cnt; >> } >> } >> - mutex_unlock(&sriov->rej_tmout_lock); >> + xa_unlock(&sriov->xa_rej_tmout); > > This is OK > > Jason