[PATCH rdma-next 1/2] IB/mlx5: Protect against prefetch of invalid MR

From: Moni Shoua <monis@xxxxxxxxxxxx>

When deferring a prefetch request, we need to protect against the MR
or PD being destroyed while the request is still enqueued.

The first step is to validate, before enqueueing, that the lkey refers
to an MR and that the MR the lkey resolves to is owned by the PD on
which the prefetch was requested.
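
A sketch of that validation, condensed from the odp.c hunk below (it
runs under the SRCU read lock; the unwind that drops the counters
again when validation fails mid-list is omitted here):

	srcu_key = srcu_read_lock(&dev->mr_srcu);
	for (i = 0; i < num_sge; ++i) {
		mmkey = __mlx5_mr_lookup(dev->mdev,
					 mlx5_base_mkey(sg_list[i].lkey));
		/* the lkey must resolve to an MR... */
		if (!mmkey || mmkey->key != sg_list[i].lkey ||
		    mmkey->type != MLX5_MKEY_MR) {
			valid_req = false;
			break;
		}
		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
		/* ...that is live and owned by the requesting PD */
		if (mr->ibmr.pd != pd || !mr->live) {
			valid_req = false;
			break;
		}
		/* pin the MR until the deferred work has run */
		atomic_inc(&mr->num_pending_prefetch);
	}
	srcu_read_unlock(&dev->mr_srcu, srcu_key);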

The second step is to dequeue all pending requests when the MR is
destroyed.
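
In dereg_mr() this amounts to marking the MR dead first and then
waiting out any deferred work that still holds a reference (sketch of
the mr.c hunk below):

	/* Prevent new page faults and prefetch requests from succeeding */
	mr->live = 0;

	/* dequeue pending prefetch requests for the mr */
	if (atomic_read(&mr->num_pending_prefetch))
		flush_workqueue(system_unbound_wq);
	WARN_ON(atomic_read(&mr->num_pending_prefetch));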

Since a PD can't be destroyed while it owns MRs, it is guaranteed that
when a worker wakes up, the request it refers to is still valid.
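
On the worker side, a new MLX5_PF_FLAGS_PREFETCH_DEFERRED flag marks
such requests so that the page-fault path drops the reference once it
has looked the MR up again (condensed from the odp.c hunk below):

	case MLX5_MKEY_MR:
		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);

		/* deferred prefetch: drop the reference taken at enqueue */
		if (deferred)
			atomic_dec(&mr->num_pending_prefetch);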

While at it, replace the dedicated ordered workqueue with the system
unbound workqueue, to reuse an existing resource and improve
performance. This also fixes a bug where requests were queued with
schedule_work(), i.e. to the system workqueue instead of the dedicated
one.
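
With that, queueing a valid request is a single call on the shared
unbound workqueue (see the odp.c hunk below):

	if (valid_req)
		queue_work(system_unbound_wq, &work->work);

which is also the workqueue that dereg_mr() flushes above, so a
destroyed MR never leaves a request behind.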

Fixes: 813e90b1aeaa ("IB/mlx5: Add advise_mr() support")
Reported-by: Parav Pandit <parav@xxxxxxxxxxxx>
Signed-off-by: Moni Shoua <monis@xxxxxxxxxxxx>
Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx>
---
 drivers/infiniband/hw/mlx5/main.c    | 13 +---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  2 +-
 drivers/infiniband/hw/mlx5/mr.c      | 14 +++-
 drivers/infiniband/hw/mlx5/odp.c     | 98 +++++++++++++++++++++++++---
 4 files changed, 104 insertions(+), 23 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index f9cddc6f2ab6..efead59ca498 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -5770,8 +5770,6 @@ void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
 	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
 		srcu_barrier(&dev->mr_srcu);
 		cleanup_srcu_struct(&dev->mr_srcu);
-		drain_workqueue(dev->advise_mr_wq);
-		destroy_workqueue(dev->advise_mr_wq);
 	}
 	kfree(dev->port);
 }
@@ -5826,18 +5824,9 @@ int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
 	dev->memic.dev = mdev;
 
 	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
-		dev->advise_mr_wq =
-			alloc_ordered_workqueue("mlx5_ib_advise_mr_wq", 0);
-		if (!dev->advise_mr_wq) {
-			err = -ENOMEM;
-			goto err_mp;
-		}
-
 		err = init_srcu_struct(&dev->mr_srcu);
-		if (err) {
-			destroy_workqueue(dev->advise_mr_wq);
+		if (err)
 			goto err_mp;
-		}
 	}
 
 	return 0;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index f9817284c7a3..4a617d78eae1 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -589,6 +589,7 @@ struct mlx5_ib_mr {
 	atomic_t		num_leaf_free;
 	wait_queue_head_t       q_leaf_free;
 	struct mlx5_async_work  cb_work;
+	atomic_t		num_pending_prefetch;
 };
 
 static inline bool is_odp_mr(struct mlx5_ib_mr *mr)
@@ -929,7 +930,6 @@ struct mlx5_ib_dev {
 	 */
 	struct srcu_struct      mr_srcu;
 	u32			null_mkey;
-	struct workqueue_struct *advise_mr_wq;
 	struct mlx5_ib_flow_db	*flow_db;
 	/* protect resources needed as part of reset flow */
 	spinlock_t		reset_flow_resource_lock;
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 705a79cd21da..c85f00255884 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1339,8 +1339,10 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 		}
 	}
 
-	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
 		mr->live = 1;
+		atomic_set(&mr->num_pending_prefetch, 0);
+	}
 
 	return &mr->ibmr;
 error:
@@ -1576,8 +1578,16 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 	if (is_odp_mr(mr)) {
 		struct ib_umem_odp *umem_odp = to_ib_umem_odp(umem);
 
-		/* Prevent new page faults from succeeding */
+		/* Prevent new page faults and
+		 * prefetch requests from succeeding
+		 */
 		mr->live = 0;
+
+		/* dequeue pending prefetch requests for the mr */
+		if (atomic_read(&mr->num_pending_prefetch))
+			flush_workqueue(system_unbound_wq);
+		WARN_ON(atomic_read(&mr->num_pending_prefetch));
+
 		/* Wait for all running page-fault handlers to finish. */
 		synchronize_srcu(&dev->mr_srcu);
 		/* Destroy all page mappings */
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 8d32d3f91277..977d8f5d1e0f 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -535,6 +535,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 	imr->umem = umem;
 	init_waitqueue_head(&imr->q_leaf_free);
 	atomic_set(&imr->num_leaf_free, 0);
+	atomic_set(&imr->num_pending_prefetch, 0);
 
 	return imr;
 }
@@ -575,6 +576,7 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
 
 #define MLX5_PF_FLAGS_PREFETCH  BIT(0)
 #define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
+#define MLX5_PF_FLAGS_PREFETCH_DEFERRED BIT(2)
 static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
 			u64 io_virt, size_t bcnt, u32 *bytes_mapped,
 			u32 flags)
@@ -741,7 +743,10 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, u32 key,
 					 u32 *bytes_mapped, u32 flags)
 {
 	int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
-	bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
+	bool prefetch = (flags & MLX5_PF_FLAGS_PREFETCH) ||
+			(flags & MLX5_PF_FLAGS_PREFETCH_DEFERRED);
+	bool deferred = flags & MLX5_PF_FLAGS_PREFETCH_DEFERRED;
+
 	struct pf_frame *head = NULL, *frame;
 	struct mlx5_core_mkey *mmkey;
 	struct mlx5_ib_mr *mr;
@@ -772,6 +777,10 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, u32 key,
 	switch (mmkey->type) {
 	case MLX5_MKEY_MR:
 		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
+
+		if (deferred)
+			atomic_dec(&mr->num_pending_prefetch);
+
 		if (!mr->live || !mr->ibmr.pd) {
 			mlx5_ib_dbg(dev, "got dead MR\n");
 			ret = -EFAULT;
@@ -1664,20 +1673,41 @@ static int mlx5_ib_prefetch_sg_list(struct mlx5_ib_dev *dev, u32 pf_flags,
 				    struct ib_sge *sg_list, u32 num_sge)
 {
 	int i;
+	int ret = 0;
 
 	for (i = 0; i < num_sge; ++i) {
 		struct ib_sge *sg = &sg_list[i];
 		int bytes_committed = 0;
-		int ret;
 
 		ret = pagefault_single_data_segment(dev, sg->lkey, sg->addr,
 						    sg->length,
 						    &bytes_committed, NULL,
 						    pf_flags);
 		if (ret < 0)
-			return ret;
+			break;
 	}
-	return 0;
+
+	if (ret < 0 &&
+	    pf_flags & MLX5_PF_FLAGS_PREFETCH_DEFERRED) {
+		int srcu_key;
+		int j;
+
+		srcu_key = srcu_read_lock(&dev->mr_srcu);
+		for (j = i + 1; j < num_sge; ++j) {
+			struct mlx5_core_mkey *mmkey;
+			struct mlx5_ib_mr *mr;
+
+			mmkey = __mlx5_mr_lookup(dev->mdev,
+						 mlx5_base_mkey(sg_list[j].lkey));
+			mr = container_of(mmkey,
+					  struct mlx5_ib_mr,
+					  mmkey);
+			atomic_dec(&mr->num_pending_prefetch);
+		}
+		srcu_read_unlock(&dev->mr_srcu, srcu_key);
+	}
+
+	return ret < 0 ? ret : 0;
 }
 
 static void mlx5_ib_prefetch_mr_work(struct work_struct *work)
@@ -1696,9 +1726,13 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
 			       enum ib_uverbs_advise_mr_advice advice,
 			       u32 flags, struct ib_sge *sg_list, u32 num_sge)
 {
+	u32 pf_flags = (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH) ?
+		MLX5_PF_FLAGS_PREFETCH : MLX5_PF_FLAGS_PREFETCH_DEFERRED;
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
-	u32 pf_flags = MLX5_PF_FLAGS_PREFETCH;
 	struct prefetch_mr_work *work;
+	bool valid_req = true;
+	int srcu_key;
+	int i;
 
 	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
 		pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
@@ -1717,10 +1751,58 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
 	memcpy(work->sg_list, sg_list, num_sge * sizeof(struct ib_sge));
 
 	work->dev = dev;
-	work->pf_flags = pf_flags;
+	work->pf_flags = pf_flags | MLX5_PF_FLAGS_PREFETCH_DEFERRED;
 	work->num_sge = num_sge;
 
 	INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
-	schedule_work(&work->work);
-	return 0;
+
+	srcu_key = srcu_read_lock(&dev->mr_srcu);
+	for (i = 0; i < num_sge; ++i) {
+		struct mlx5_core_mkey *mmkey;
+		struct mlx5_ib_mr *mr;
+
+		mmkey = __mlx5_mr_lookup(dev->mdev,
+					 mlx5_base_mkey(sg_list[i].lkey));
+		if (!mmkey || mmkey->key != sg_list[i].lkey) {
+			valid_req = false;
+			break;
+		}
+
+		if (mmkey->type != MLX5_MKEY_MR) {
+			valid_req = false;
+			break;
+		}
+
+		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
+
+		if (mr->ibmr.pd != pd) {
+			valid_req = false;
+			break;
+		}
+
+		if (!mr->live) {
+			valid_req = false;
+			break;
+		}
+
+		atomic_inc(&mr->num_pending_prefetch);
+	}
+	if (valid_req) {
+		queue_work(system_unbound_wq, &work->work);
+	} else {
+		while (--i >= 0) {
+			struct mlx5_core_mkey *mmkey;
+			struct mlx5_ib_mr *mr;
+
+			mmkey = __mlx5_mr_lookup(dev->mdev,
+						 mlx5_base_mkey(sg_list[i].lkey));
+			mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
+			atomic_dec(&mr->num_pending_prefetch);
+		}
+		kfree(work);
+	}
+
+	srcu_read_unlock(&dev->mr_srcu, srcu_key);
+
+	return valid_req ? 0 : -EINVAL;
 }
-- 
2.19.1
