From: Moni Shoua <monis@xxxxxxxxxxxx>

The verb advise_mr() is used to give advice to the kernel about an
address range that belongs to an MR.
Implement the verb and register it on the device.
The current implementation supports the only known advice to date,
prefetch.

Signed-off-by: Moni Shoua <monis@xxxxxxxxxxxx>
Reviewed-by: Guy Levi <guyle@xxxxxxxxxxxx>
Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  17 ++++
 drivers/infiniband/hw/mlx5/mr.c      |  15 ++++
 drivers/infiniband/hw/mlx5/odp.c     | 122 +++++++++++++++++++++++++--
 3 files changed, 145 insertions(+), 9 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 95652ef8d89b..0c96d2993f41 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1088,6 +1088,12 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc);
 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
				  u64 virt_addr, int access_flags,
				  struct ib_udata *udata);
+int mlx5_ib_advise_mr(struct ib_pd *pd,
+		      enum ib_uverbs_advise_mr_advice advice,
+		      u32 flags,
+		      struct ib_sge *sg_list,
+		      u32 num_sge,
+		      struct uverbs_attr_bundle *attrs);
 struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
			       struct ib_udata *udata);
 int mlx5_ib_dealloc_mw(struct ib_mw *mw);
@@ -1185,6 +1191,10 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
 void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
			   size_t nentries, struct mlx5_ib_mr *mr, int flags);
+
+int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
+			       enum ib_uverbs_advise_mr_advice advice,
+			       u32 flags, struct ib_sge *sg_list, u32 num_sge);
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 {
@@ -1200,6 +1210,13 @@ static inline void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
					 size_t nentries, struct mlx5_ib_mr *mr,
					 int flags) {}

+static int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
+				      enum ib_uverbs_advise_mr_advice advice,
+				      u32 flags, struct ib_sge *sg_list,
+				      u32 num_sge)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */

 /* Needed for rep profile */
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 9b195d65a13e..9a6ffe4fb24f 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1280,6 +1280,21 @@ static struct ib_mr *mlx5_ib_get_memic_mr(struct ib_pd *pd, u64 memic_addr,
	return ERR_PTR(err);
 }

+int mlx5_ib_advise_mr(struct ib_pd *pd,
+		      enum ib_uverbs_advise_mr_advice advice,
+		      u32 flags,
+		      struct ib_sge *sg_list,
+		      u32 num_sge,
+		      struct uverbs_attr_bundle *attrs)
+{
+	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
+	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE)
+		return -EOPNOTSUPP;
+
+	return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
+					  sg_list, num_sge);
+}
+
 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
				struct ib_dm_mr_attr *attr,
				struct uverbs_attr_bundle *attrs)
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 416d141322a0..8c1bfa2b7980 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -549,10 +549,15 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
	wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
 }

+#define MLX5_PF_FLAGS_PREFETCH  BIT(0)
+#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
 static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
-			u64 io_virt, size_t bcnt, u32 *bytes_mapped)
+			u64 io_virt, size_t bcnt, u32 *bytes_mapped,
+			u32 flags)
 {
	struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
+	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
+	bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
	u64 access_mask = ODP_READ_ALLOWED_BIT;
	int npages = 0, page_shift, np;
	u64 start_idx, page_mask;
@@ -579,7 +584,15 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
	page_mask = ~(BIT(page_shift) - 1);
	start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;

-	if (mr->umem->writable)
+	if (prefetch && !downgrade && !mr->umem->writable) {
+		/* prefetch with write-access must
+		 * be supported by the MR
+		 */
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (mr->umem->writable && !downgrade)
		access_mask |= ODP_WRITE_ALLOWED_BIT;

	current_seq = READ_ONCE(odp->notifiers_seq);
@@ -684,12 +697,13 @@ struct pf_frame {
  * -EFAULT when there's an error mapping the requested pages. The caller will
  * abort the page fault handling.
  */
-static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
-					 u32 key, u64 io_virt, size_t bcnt,
+static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, u32 key,
+					 u64 io_virt, size_t bcnt,
					 u32 *bytes_committed,
-					 u32 *bytes_mapped)
+					 u32 *bytes_mapped, u32 flags)
 {
	int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
+	bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
	struct pf_frame *head = NULL, *frame;
	struct mlx5_core_mkey *mmkey;
	struct mlx5_ib_mw *mw;
@@ -711,6 +725,12 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
		goto srcu_unlock;
	}

+	if (prefetch && mmkey->type != MLX5_MKEY_MR) {
+		mlx5_ib_dbg(dev, "prefetch is allowed only for MR\n");
+		ret = -EINVAL;
+		goto srcu_unlock;
+	}
+
	switch (mmkey->type) {
	case MLX5_MKEY_MR:
		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
@@ -720,7 +740,12 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
			goto srcu_unlock;
		}

-		ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped);
+		if (prefetch && !mr->umem->is_odp) {
+			ret = -EINVAL;
+			goto srcu_unlock;
+		}
+
+		ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped, flags);
		if (ret < 0)
			goto srcu_unlock;

@@ -896,7 +921,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev,

		ret = pagefault_single_data_segment(dev, key, io_virt, bcnt,
						    &pfault->bytes_committed,
-						    bytes_mapped);
+						    bytes_mapped, 0);
		if (ret < 0)
			break;
		npages += ret;
@@ -1207,7 +1232,8 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
	}

	ret = pagefault_single_data_segment(dev, rkey, address, length,
-					    &pfault->bytes_committed, NULL);
+					    &pfault->bytes_committed, NULL,
+					    0);
	if (ret == -EAGAIN) {
		/* We're racing with an invalidation, don't prefetch */
		prefetch_activated = 0;
@@ -1234,7 +1260,8 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,

		ret = pagefault_single_data_segment(dev, rkey, address,
						    prefetch_len,
-						    &bytes_committed, NULL);
+						    &bytes_committed, NULL,
+						    0);
		if (ret < 0 && ret != -EAGAIN) {
			mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
				    ret, pfault->token, address, prefetch_len);
@@ -1487,6 +1514,9 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
 {
	int ret = 0;

+	if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
+		dev->ib_dev.advise_mr = mlx5_ib_advise_mr;
+
	if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
		ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
		if (ret) {
@@ -1518,3 +1548,77 @@ int mlx5_ib_odp_init(void)

	return 0;
 }
+
+struct prefetch_mr_work {
+	struct work_struct work;
+	struct mlx5_ib_dev *dev;
+	u32 pf_flags;
+	struct ib_sge *sg_list;
+	u32 num_sge;
+};
+
+static int mlx5_ib_prefetch_sg_list(struct mlx5_ib_dev *dev, u32 pf_flags,
+				    struct ib_sge *sg_list, u32 num_sge)
+{
+	int i;
+
+	for (i = 0; i < num_sge; ++i) {
+		struct ib_sge *sg = &sg_list[i];
+		int bytes_committed = 0;
+		int ret;
+
+		ret = pagefault_single_data_segment(dev, sg->lkey, sg->addr,
+						    sg->length,
+						    &bytes_committed, NULL,
+						    pf_flags);
+		if (ret < 0)
+			return ret;
+	}
+	return 0;
+}
+
+static void mlx5_ib_prefetch_mr_work(struct work_struct *work)
+{
+	struct prefetch_mr_work *w =
+		container_of(work, struct prefetch_mr_work, work);
+
+	mlx5_ib_prefetch_sg_list(w->dev, w->pf_flags, w->sg_list, w->num_sge);
+
+	kfree(w->sg_list);
+	kfree(w);
+}
+
+int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
+			       enum ib_uverbs_advise_mr_advice advice,
+			       u32 flags, struct ib_sge *sg_list, u32 num_sge)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	u32 pf_flags = MLX5_PF_FLAGS_PREFETCH;
+	struct prefetch_mr_work *work;
+
+	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
+		pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
+
+	if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
+		return mlx5_ib_prefetch_sg_list(dev, pf_flags, sg_list,
+						num_sge);
+
+	work = kzalloc(sizeof(*work), GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	work->sg_list = kcalloc(num_sge, sizeof(struct ib_sge), GFP_KERNEL);
+	if (!work->sg_list) {
+		kfree(work);
+		return -ENOMEM;
+	}
+	memcpy(work->sg_list, sg_list, num_sge * sizeof(struct ib_sge));
+
+	work->dev = dev;
+	work->pf_flags = pf_flags;
+	work->num_sge = num_sge;
+
+	INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
+	schedule_work(&work->work);
+	return 0;
+}
-- 
2.19.1
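For readers who want to exercise the new verb from userspace: the kernel
advise_mr() entry point registered above is reached through rdma-core's
ibv_advise_mr(). The snippet below is an illustrative sketch only, not part
of the patch; it assumes an rdma-core new enough to expose ibv_advise_mr(),
an MR registered on pd with IBV_ACCESS_ON_DEMAND plus write access, and
prefetch_mr_range() is a made-up helper name.

/* Illustrative only: prefetch (fault in and map for write) the first
 * `len` bytes of an ODP-backed MR before posting RDMA work to it, so the
 * first access does not stall on a page fault.
 */
#include <stdint.h>
#include <infiniband/verbs.h>

static int prefetch_mr_range(struct ibv_pd *pd, struct ibv_mr *mr,
			     void *buf, uint32_t len)
{
	struct ibv_sge sge = {
		.addr   = (uintptr_t)buf,	/* start of the range to prefetch */
		.length = len,
		.lkey   = mr->lkey,
	};

	/* FLUSH makes the call synchronous; in the kernel it maps to the
	 * path that calls mlx5_ib_prefetch_sg_list() directly instead of
	 * queueing work.
	 */
	return ibv_advise_mr(pd, IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE,
			     IBV_ADVISE_MR_FLAG_FLUSH, &sge, 1);
}

Without IBV_ADVISE_MR_FLAG_FLUSH the request takes the asynchronous path in
mlx5_ib_advise_mr_prefetch() above: the sg list is copied into a
prefetch_mr_work item and the prefetch runs later from
mlx5_ib_prefetch_mr_work() on the system workqueue.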