On Tue, Mar 21, 2023 at 02:02:59PM +0200, Leon Romanovsky wrote:
> On Tue, Mar 21, 2023 at 08:53:35AM -0300, Jason Gunthorpe wrote:
> > On Tue, Mar 21, 2023 at 09:54:58AM +0200, Leon Romanovsky wrote:
> > > On Mon, Mar 20, 2023 at 04:18:14PM -0300, Jason Gunthorpe wrote:
> > > > On Thu, Mar 16, 2023 at 03:39:27PM +0200, Leon Romanovsky wrote:
> > > > > From: Patrisious Haddad <phaddad@xxxxxxxxxx>
> > > > >
> > > > > Previously when destroying a DCT, if the firmware function for the
> > > > > destruction failed, the common resource would have been destroyed
> > > > > either way, since it was destroyed before the firmware object.
> > > > > Which leads to kernel warning "refcount_t: underflow" which indicates
> > > > > possible use-after-free.
> > > > > Which is triggered when we try to destroy the common resource for the
> > > > > second time and execute refcount_dec_and_test(&common->refcount).
> > > > >
> > > > > So, currently before destroying the common resource we check its
> > > > > refcount and continue with the destruction only if it isn't zero.
> > > >
> > > > This seems super sketchy
> > > >
> > > > If the destruction fails why not set the refcount back to 1?
> > >
> > > Because destruction will fail in destroy_rq_tracked() which is after
> > > destroy_resource_common().
> > >
> > > In first destruction attempt, we delete qp from radix tree and wait for all
> > > reference to drop. In order do not undo all this logic (setting 1 alone is
> > > not enough), it is much safer simply skip destroy_resource_common() in reentry
> > > case.
> >
> > This is the bug I pointed a long time ago, it is ordered wrong to
> > remove restrack before destruction is assured
>
> It is not restrack, but internal to mlx5_core structure.
>
>    176 static void destroy_resource_common(struct mlx5_ib_dev *dev,
>    177                                     struct mlx5_core_qp *qp)
>    178 {
>    179         struct mlx5_qp_table *table = &dev->qp_table;
>    180         unsigned long flags;
>    181
>
> ....
>
>    185         spin_lock_irqsave(&table->lock, flags);
>    186         radix_tree_delete(&table->tree,
>    187                           qp->qpn | (qp->common.res << MLX5_USER_INDEX_LEN));
>    188         spin_unlock_irqrestore(&table->lock, flags);
>    189         mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp);
>    190         wait_for_completion(&qp->common.free);
>    191 }

Same basic issue. "RSC"'s refcount stuff is really only for ODP to use,
and the silly pseudo locking should really just be an rwsem, not a
refcount. Get DCT out of that particular mess and the scheme is quite
simple and doesn't need hacky stuff.

Please make a patch to remove the radix tree from this code too...
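For reference, the "refcount_t: underflow" splat quoted above is just the
generic refcount_t sanity check firing when an already-released object is
released a second time. Purely illustrative, not driver code:

#include <linux/refcount.h>

static refcount_t demo_ref;

static void demo_double_release(void)
{
        refcount_set(&demo_ref, 1);

        /* first destroy path: drops the last reference */
        if (refcount_dec_and_test(&demo_ref)) {
                /* count hit zero: this is where the object gets freed */
        }

        /*
         * re-entered destroy path: decrementing an already-zero refcount
         * fires the "refcount_t: underflow; use-after-free." WARN
         */
        if (refcount_dec_and_test(&demo_ref)) {
                /* not reached: the count saturates and false is returned */
        }
}

Moving DCT onto its own xarray, as below, takes that shared refcount out of
the picture entirely: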
diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c
index bae0334d6e7f18..68009bff4bd544 100644
--- a/drivers/infiniband/hw/mlx5/qpc.c
+++ b/drivers/infiniband/hw/mlx5/qpc.c
@@ -88,23 +88,34 @@ static bool is_event_type_allowed(int rsc_type, int event_type)
         }
 }
 
+static int dct_event_notifier(struct mlx5_ib_dev *dev, struct mlx5_eqe *eqe)
+{
+        struct mlx5_core_dct *dct;
+        u32 qpn;
+
+        qpn = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff;
+        xa_lock(&dev->qp_table.dct_xa);
+        dct = xa_load(&dev->qp_table.dct_xa, qpn);
+        if (dct)
+                complete(&dct->drained);
+        xa_unlock(&dev->qp_table.dct_xa);
+        return NOTIFY_OK;
+}
+
 static int rsc_event_notifier(struct notifier_block *nb,
                               unsigned long type, void *data)
 {
+        struct mlx5_ib_dev *dev =
+                container_of(nb, struct mlx5_ib_dev, qp_table.nb);
         struct mlx5_core_rsc_common *common;
-        struct mlx5_qp_table *table;
-        struct mlx5_core_dct *dct;
+        struct mlx5_eqe *eqe = data;
         u8 event_type = (u8)type;
         struct mlx5_core_qp *qp;
-        struct mlx5_eqe *eqe;
         u32 rsn;
 
         switch (event_type) {
         case MLX5_EVENT_TYPE_DCT_DRAINED:
-                eqe = data;
-                rsn = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff;
-                rsn |= (MLX5_RES_DCT << MLX5_USER_INDEX_LEN);
-                break;
+                return dct_event_notifier(dev, eqe);
         case MLX5_EVENT_TYPE_PATH_MIG:
         case MLX5_EVENT_TYPE_COMM_EST:
         case MLX5_EVENT_TYPE_SQ_DRAINED:
@@ -113,7 +124,6 @@ static int rsc_event_notifier(struct notifier_block *nb,
         case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
         case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
         case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
-                eqe = data;
                 rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
                 rsn |= (eqe->data.qp_srq.type << MLX5_USER_INDEX_LEN);
                 break;
@@ -121,8 +131,7 @@ static int rsc_event_notifier(struct notifier_block *nb,
                 return NOTIFY_DONE;
         }
 
-        table = container_of(nb, struct mlx5_qp_table, nb);
-        common = mlx5_get_rsc(table, rsn);
+        common = mlx5_get_rsc(&dev->qp_table, rsn);
         if (!common)
                 return NOTIFY_OK;
 
@@ -137,11 +146,6 @@ static int rsc_event_notifier(struct notifier_block *nb,
                 qp->event(qp, event_type);
                 /* Need to put resource in event handler */
                 return NOTIFY_OK;
-        case MLX5_RES_DCT:
-                dct = (struct mlx5_core_dct *)common;
-                if (event_type == MLX5_EVENT_TYPE_DCT_DRAINED)
-                        complete(&dct->drained);
-                break;
         default:
                 break;
         }
@@ -188,7 +192,7 @@ static void destroy_resource_common(struct mlx5_ib_dev *dev,
 }
 
 static int _mlx5_core_destroy_dct(struct mlx5_ib_dev *dev,
-                                  struct mlx5_core_dct *dct, bool need_cleanup)
+                                  struct mlx5_core_dct *dct)
 {
         u32 in[MLX5_ST_SZ_DW(destroy_dct_in)] = {};
         struct mlx5_core_qp *qp = &dct->mqp;
@@ -203,13 +207,14 @@ static int _mlx5_core_destroy_dct(struct mlx5_ib_dev *dev,
         }
         wait_for_completion(&dct->drained);
 destroy:
-        if (need_cleanup)
-                destroy_resource_common(dev, &dct->mqp);
         MLX5_SET(destroy_dct_in, in, opcode, MLX5_CMD_OP_DESTROY_DCT);
         MLX5_SET(destroy_dct_in, in, dctn, qp->qpn);
         MLX5_SET(destroy_dct_in, in, uid, qp->uid);
         err = mlx5_cmd_exec_in(dev->mdev, destroy_dct, in);
-        return err;
+        if (err)
+                return err;
+        xa_cmpxchg(&dev->qp_table.dct_xa, dct->mqp.qpn, dct, NULL, GFP_KERNEL);
+        return 0;
 }
 
 int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct,
@@ -227,13 +232,13 @@ int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct,
 
         qp->qpn = MLX5_GET(create_dct_out, out, dctn);
         qp->uid = MLX5_GET(create_dct_in, in, uid);
-        err = create_resource_common(dev, qp, MLX5_RES_DCT);
+        err = xa_err(xa_store(&dev->qp_table.dct_xa, qp->qpn, dct, GFP_KERNEL));
         if (err)
                 goto err_cmd;
 
         return 0;
 err_cmd:
-        _mlx5_core_destroy_dct(dev, dct, false);
+        _mlx5_core_destroy_dct(dev, dct);
         return err;
 }
 
@@ -284,7 +289,7 @@ static int mlx5_core_drain_dct(struct mlx5_ib_dev *dev,
 
 int mlx5_core_destroy_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct)
 {
-        return _mlx5_core_destroy_dct(dev, dct, true);
+        return _mlx5_core_destroy_dct(dev, dct);
 }
 
 int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp)
@@ -488,6 +493,7 @@ int mlx5_init_qp_table(struct mlx5_ib_dev *dev)
 
         spin_lock_init(&table->lock);
         INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
+        xa_init(&table->dct_xa);
         mlx5_qp_debugfs_init(dev->mdev);
 
         table->nb.notifier_call = rsc_event_notifier;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f33389b42209e4..87e19e6d07a94a 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -381,7 +381,6 @@ enum mlx5_res_type {
         MLX5_RES_SRQ    = 3,
         MLX5_RES_XSRQ   = 4,
         MLX5_RES_XRQ    = 5,
-        MLX5_RES_DCT    = MLX5_EVENT_QUEUE_TYPE_DCT,
 };
 
 struct mlx5_core_rsc_common {
@@ -443,6 +442,7 @@ struct mlx5_core_health {
 
 struct mlx5_qp_table {
         struct notifier_block   nb;
+        struct xarray           dct_xa;
 
         /* protect radix tree
          */
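The net effect, sketched below with made-up names (hw_destroy_object() is a
stand-in for the DESTROY_DCT command, not a real function), is that the
lookup entry now outlives a failed firmware command, so a re-entered destroy
finds everything still in place instead of underflowing a refcount:

#include <linux/xarray.h>

/* stand-in for the firmware destroy command, which may fail */
int hw_destroy_object(void *obj);

static int destroy_tracked_object(struct xarray *xa, unsigned long index,
                                  void *obj)
{
        int err;

        err = hw_destroy_object(obj);
        if (err)
                return err;     /* entry stays in @xa, the caller can retry */

        /* the firmware object is gone, only now drop the lookup entry */
        xa_cmpxchg(xa, index, obj, NULL, GFP_KERNEL);
        return 0;
}

And because the DCT_DRAINED notifier signals ->drained while holding
xa_lock(), which xa_cmpxchg() also takes, the entry can't disappear (and the
memory behind it be freed) in the middle of that complete() call.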