> -----Original Message----- > From: Yishai Hadas <yishaih@xxxxxxxxxx> > Sent: Tuesday, December 08, 2020 11:13 PM > To: Xiong, Jianxin <jianxin.xiong@xxxxxxxxx> > Cc: Doug Ledford <dledford@xxxxxxxxxx>; Jason Gunthorpe <jgg@xxxxxxxx>; Leon Romanovsky <leon@xxxxxxxxxx>; Sumit Semwal > <sumit.semwal@xxxxxxxxxx>; Christian Koenig <christian.koenig@xxxxxxx>; Vetter, Daniel <daniel.vetter@xxxxxxxxx>; linux- > rdma@xxxxxxxxxxxxxxx; dri-devel@xxxxxxxxxxxxxxxxxxxxx; Yishai Hadas <yishaih@xxxxxxxxxx> > Subject: Re: [PATCH v14 4/4] RDMA/mlx5: Support dma-buf based userspace memory region > > On 12/9/2020 12:39 AM, Jianxin Xiong wrote: > > Implement the new driver method 'reg_user_mr_dmabuf'. Utilize the > > core functions to import dma-buf based memory region and update the mappings. > > > > Add code to handle dma-buf related page fault. > > > > Signed-off-by: Jianxin Xiong <jianxin.xiong@xxxxxxxxx> > > Reviewed-by: Sean Hefty <sean.hefty@xxxxxxxxx> > > Acked-by: Michael J. Ruhl <michael.j.ruhl@xxxxxxxxx> > > Acked-by: Christian Koenig <christian.koenig@xxxxxxx> > > Acked-by: Daniel Vetter <daniel.vetter@xxxxxxxx> > > --- > > drivers/infiniband/hw/mlx5/main.c | 2 + > > drivers/infiniband/hw/mlx5/mlx5_ib.h | 18 +++++ > > drivers/infiniband/hw/mlx5/mr.c | 128 +++++++++++++++++++++++++++++++++-- > > drivers/infiniband/hw/mlx5/odp.c | 86 +++++++++++++++++++++-- > > 4 files changed, 225 insertions(+), 9 deletions(-) > > > > diff --git a/drivers/infiniband/hw/mlx5/main.c > > b/drivers/infiniband/hw/mlx5/main.c > > index 4a054eb..c025746 100644 > > --- a/drivers/infiniband/hw/mlx5/main.c > > +++ b/drivers/infiniband/hw/mlx5/main.c > > @@ -1,6 +1,7 @@ > > // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB > > /* > > * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved. > > + * Copyright (c) 2020, Intel Corporation. All rights reserved. > > */ > > > > #include <linux/debugfs.h> > > @@ -4069,6 +4070,7 @@ static int mlx5_ib_enable_driver(struct ib_device *dev) > > .query_srq = mlx5_ib_query_srq, > > .query_ucontext = mlx5_ib_query_ucontext, > > .reg_user_mr = mlx5_ib_reg_user_mr, > > + .reg_user_mr_dmabuf = mlx5_ib_reg_user_mr_dmabuf, > > .req_notify_cq = mlx5_ib_arm_cq, > > .rereg_user_mr = mlx5_ib_rereg_user_mr, > > .resize_cq = mlx5_ib_resize_cq, > > diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h > > b/drivers/infiniband/hw/mlx5/mlx5_ib.h > > index 718e59f..6f4d1b4 100644 > > --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h > > +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h > > @@ -1,6 +1,7 @@ > > /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ > > /* > > * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved. > > + * Copyright (c) 2020, Intel Corporation. All rights reserved. > > */ > > > > #ifndef MLX5_IB_H > > @@ -704,6 +705,12 @@ static inline bool is_odp_mr(struct mlx5_ib_mr *mr) > > mr->umem->is_odp; > > } > > > > +static inline bool is_dmabuf_mr(struct mlx5_ib_mr *mr) { > > + return IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && mr->umem && > > + mr->umem->is_dmabuf; > > +} > > + > > > Didn't we agree that IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) > should be checked earlier > > upon mlx5_ib_reg_user_mr_dmabuf () to fully prevent the functionality when ODP is not supported ? see note on previous versions. Yes, it's now checked at the entry point of the uverbs command for registering dmabuf mr (see the following lines extracted from patch 0003 in this set). The check is on the device's on-demand paging cap, which is reset when CONFIG_INFINIBAND_ON_DEMAND_PAGING is not enabled. + if (!(pd->device->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) + return -EOPNOTSUPP; > > > > struct mlx5_ib_mw { > > struct ib_mw ibmw; > > struct mlx5_core_mkey mmkey; > > @@ -1239,6 +1246,10 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, > > struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, > > u64 virt_addr, int access_flags, > > struct ib_udata *udata); > > +struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start, > > + u64 length, u64 virt_addr, > > + int fd, int access_flags, > > + struct ib_udata *udata); > > int mlx5_ib_advise_mr(struct ib_pd *pd, > > enum ib_uverbs_advise_mr_advice advice, > > u32 flags, > > @@ -1249,11 +1260,13 @@ int mlx5_ib_advise_mr(struct ib_pd *pd, > > int mlx5_ib_dealloc_mw(struct ib_mw *mw); > > int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, > > int page_shift, int flags); > > +int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags); > > struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, > > struct ib_udata *udata, > > int access_flags); > > void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr); > > void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr); > > +void mlx5_ib_fence_dmabuf_mr(struct mlx5_ib_mr *mr); > > int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, > > u64 length, u64 virt_addr, int access_flags, > > struct ib_pd *pd, struct ib_udata *udata); > > @@ -1341,6 +1354,7 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, > > enum ib_uverbs_advise_mr_advice advice, > > u32 flags, struct ib_sge *sg_list, u32 num_sge); > > int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr, bool enable); > > +int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr); > > #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ > > static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) > > { > > @@ -1366,6 +1380,10 @@ static inline int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr, bool enable) > > { > > return -EOPNOTSUPP; > > } > > +static inline int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr) > > +{ > > + return -EOPNOTSUPP; > > +} > > #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ > > > > extern const struct mmu_interval_notifier_ops mlx5_mn_ops; > > diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c > > index b6116f6..e3be1f5 100644 > > --- a/drivers/infiniband/hw/mlx5/mr.c > > +++ b/drivers/infiniband/hw/mlx5/mr.c > > @@ -1,5 +1,6 @@ > > /* > > * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. > > + * Copyright (c) 2020, Intel Corporation. All rights reserved. > > * > > * This software is available to you under a choice of one of two > > * licenses. You may choose to be licensed under the terms of the GNU > > @@ -36,6 +37,8 @@ > > #include <linux/debugfs.h> > > #include <linux/export.h> > > #include <linux/delay.h> > > +#include <linux/dma-buf.h> > > +#include <linux/dma-resv.h> > > #include <rdma/ib_umem.h> > > #include <rdma/ib_umem_odp.h> > > #include <rdma/ib_verbs.h> > > @@ -957,6 +960,16 @@ static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev, > > return &cache->ent[order]; > > } > > > > +static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem, > > + u64 iova) > > +{ > > + if ((iova ^ umem->address) & (PAGE_SIZE - 1)) > > + return 0; > > + > > > Also here, see my note from previous versions, can user space trigger > this by supplying some invalid input ? > > If so, this may cause a WARN_ON() in the caller > (i.e.alloc_mr_from_cache) that we should prevent from being triggered by > user space application. I missed this one. Yes, this can be triggered by user input and the WARN_ON needs to be dropped. > > > + umem->iova = iova; > > + return PAGE_SIZE; > > +} > > + > > static struct mlx5_ib_mr *alloc_mr_from_cache(struct ib_pd *pd, > > struct ib_umem *umem, u64 iova, > > int access_flags) > > @@ -966,7 +979,11 @@ static struct mlx5_ib_mr *alloc_mr_from_cache(struct ib_pd *pd, > > struct mlx5_ib_mr *mr; > > unsigned int page_size; > > > > - page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, 0, iova); > > + if (umem->is_dmabuf) > > + page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova); > > + else > > + page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, > > + 0, iova); > > if (WARN_ON(!page_size)) > > return ERR_PTR(-EINVAL); > > ent = mr_cache_ent_from_order( > > @@ -1212,8 +1229,10 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, > > > > /* > > * Send the DMA list to the HW for a normal MR using UMR. > > + * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP > > + * flag may be used. > > */ > > -static int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) > > +int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) > > { > > struct mlx5_ib_dev *dev = mr->dev; > > struct device *ddev = &dev->mdev->pdev->dev; > > @@ -1255,6 +1274,10 @@ static int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) > > cur_mtt->ptag = > > cpu_to_be64(rdma_block_iter_dma_address(&biter) | > > MLX5_IB_MTT_PRESENT); > > + > > + if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) > > + cur_mtt->ptag = 0; > > + > > cur_mtt++; > > } > > > > @@ -1291,8 +1314,11 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, > > int err; > > bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); > > > > - page_size = > > - mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, 0, iova); > > + if (umem->is_dmabuf) > > + page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova); > > + else > > + page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, > > + 0, iova); > > if (WARN_ON(!page_size)) > > return ERR_PTR(-EINVAL); > > > > @@ -1572,6 +1598,96 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, > > return ERR_PTR(err); > > } > > > > +static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach) > > +{ > > + struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv; > > + struct mlx5_ib_mr *mr = umem_dmabuf->private; > > + > > + dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); > > + > > + if (!umem_dmabuf->sgt) > > + return; > > + > > + mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP); > > + ib_umem_dmabuf_unmap_pages(umem_dmabuf); > > +} > > + > > +static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { > > + .allow_peer2peer = 1, > > + .move_notify = mlx5_ib_dmabuf_invalidate_cb, > > +}; > > + > > +struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, > > + u64 length, u64 virt_addr, > > + int fd, int access_flags, > > + struct ib_udata *udata) > > +{ > > + struct mlx5_ib_dev *dev = to_mdev(pd->device); > > + struct mlx5_ib_mr *mr = NULL; > > + struct ib_umem *umem; > > + int err; > > + > > + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) > > + return ERR_PTR(-EOPNOTSUPP); > > + > > + mlx5_ib_dbg(dev, > > + "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n", > > + offset, virt_addr, length, fd, access_flags); > > + > > + if (!mlx5_ib_can_load_pas_with_umr(dev, length)) > > + return ERR_PTR(-EINVAL); > > + > > + umem = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, access_flags, > > + &mlx5_ib_dmabuf_attach_ops); > > + if (IS_ERR(umem)) { > > + mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem)); > > + return ERR_PTR(PTR_ERR(umem)); > > + } > > + > > + mr = alloc_mr_from_cache(pd, umem, virt_addr, access_flags); > > + if (IS_ERR(mr)) > > + mr = NULL; > > + > > + if (!mr) { > > + mutex_lock(&dev->slow_path_mutex); > > + mr = reg_create(NULL, pd, umem, virt_addr, access_flags, > > + false); > > + mutex_unlock(&dev->slow_path_mutex); > > + } > > + > > + if (IS_ERR(mr)) { > > + err = PTR_ERR(mr); > > + goto error; > > + } > > + > > + mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); > > + > > + mr->umem = umem; > > + atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages); > > + set_mr_fields(dev, mr, length, access_flags); > > + > > + to_ib_umem_dmabuf(umem)->private = mr; > > + init_waitqueue_head(&mr->q_deferred_work); > > + atomic_set(&mr->num_deferred_work, 0); > > + err = xa_err(xa_store(&dev->odp_mkeys, > > + mlx5_base_mkey(mr->mmkey.key), &mr->mmkey, > > + GFP_KERNEL)); > > + if (err) { > > + dereg_mr(dev, mr); > > + return ERR_PTR(err); > > + } > > + > > + err = mlx5_ib_init_dmabuf_mr(mr); > > + if (err) { > > + dereg_mr(dev, mr); > > + return ERR_PTR(err); > > + } > > + return &mr->ibmr; > > +error: > > + ib_umem_release(umem); > > + return ERR_PTR(err); > > +} > > + > > /** > > * mlx5_mr_cache_invalidate - Fence all DMA on the MR > > * @mr: The MR to fence > > @@ -1640,7 +1756,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, > > if (!mr->umem) > > return -EINVAL; > > > > - if (is_odp_mr(mr)) > > + if (is_odp_mr(mr) || is_dmabuf_mr(mr)) > > return -EOPNOTSUPP; > > > > if (flags & IB_MR_REREG_TRANS) { > > @@ -1804,6 +1920,8 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) > > /* Stop all DMA */ > > if (is_odp_mr(mr)) > > mlx5_ib_fence_odp_mr(mr); > > + else if (is_dmabuf_mr(mr)) > > + mlx5_ib_fence_dmabuf_mr(mr); > > else > > clean_mr(dev, mr); > > > > diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c > > index 5c853ec..35d6770 100644 > > --- a/drivers/infiniband/hw/mlx5/odp.c > > +++ b/drivers/infiniband/hw/mlx5/odp.c > > @@ -33,6 +33,8 @@ > > #include <rdma/ib_umem.h> > > #include <rdma/ib_umem_odp.h> > > #include <linux/kernel.h> > > +#include <linux/dma-buf.h> > > +#include <linux/dma-resv.h> > > > > #include "mlx5_ib.h" > > #include "cmd.h" > > @@ -664,6 +666,37 @@ void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr) > > dma_fence_odp_mr(mr); > > } > > > > +/** > > + * mlx5_ib_fence_dmabuf_mr - Stop all access to the dmabuf MR > > + * @mr: to fence > > + * > > + * On return no parallel threads will be touching this MR and no DMA will be > > + * active. > > + */ > > +void mlx5_ib_fence_dmabuf_mr(struct mlx5_ib_mr *mr) > > +{ > > + struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem); > > + > > + /* Prevent new page faults and prefetch requests from succeeding */ > > + xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)); > > + > > + /* Wait for all running page-fault handlers to finish. */ > > + synchronize_srcu(&mr->dev->odp_srcu); > > + > > + wait_event(mr->q_deferred_work, !atomic_read(&mr->num_deferred_work)); > > + > > + dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL); > > + mlx5_mr_cache_invalidate(mr); > > + umem_dmabuf->private = NULL; > > + ib_umem_dmabuf_unmap_pages(umem_dmabuf); > > + dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); > > + > > + if (!mr->cache_ent) { > > + mlx5_core_destroy_mkey(mr->dev->mdev, &mr->mmkey); > > + WARN_ON(mr->descs); > > + } > > +} > > + > > #define MLX5_PF_FLAGS_DOWNGRADE BIT(1) > > #define MLX5_PF_FLAGS_SNAPSHOT BIT(2) > > #define MLX5_PF_FLAGS_ENABLE BIT(3) > > @@ -797,6 +830,41 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr *imr, > > return ret; > > } > > > > +static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, > > + u32 *bytes_mapped, u32 flags) > > +{ > > + struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem); > > + u32 xlt_flags = 0; > > + int err; > > + unsigned int page_size; > > + > > + if (flags & MLX5_PF_FLAGS_ENABLE) > > + xlt_flags |= MLX5_IB_UPD_XLT_ENABLE; > > + > > + dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL); > > + err = ib_umem_dmabuf_map_pages(umem_dmabuf); > > + if (!err) { > > + page_size = mlx5_umem_find_best_pgsz(&umem_dmabuf->umem, mkc, > > + log_page_size, 0, > > + umem_dmabuf->umem.iova); > > + if (unlikely(page_size < PAGE_SIZE)) { > > + ib_umem_dmabuf_unmap_pages(umem_dmabuf); > > + err = -EINVAL; > > + } else { > > + err = mlx5_ib_update_mr_pas(mr, xlt_flags); > > + } > > + } > > + dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); > > + > > + if (err) > > + return err; > > + > > + if (bytes_mapped) > > + *bytes_mapped += bcnt; > > + > > + return ib_umem_num_pages(mr->umem); > > +} > > + > > /* > > * Returns: > > * -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are > > @@ -815,6 +883,9 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, > > if (unlikely(io_virt < mr->mmkey.iova)) > > return -EFAULT; > > > > + if (mr->umem->is_dmabuf) > > + return pagefault_dmabuf_mr(mr, bcnt, bytes_mapped, flags); > > + > > if (!odp->is_implicit_odp) { > > u64 user_va; > > > > @@ -845,6 +916,16 @@ int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr, bool enable) > > return ret >= 0 ? 0 : ret; > > } > > > > +int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr) > > +{ > > + int ret; > > + > > + ret = pagefault_dmabuf_mr(mr, mr->umem->length, NULL, > > + MLX5_PF_FLAGS_ENABLE); > > + > > + return ret >= 0 ? 0 : ret; > > +} > > + > > struct pf_frame { > > struct pf_frame *next; > > u32 key; > > @@ -1747,7 +1828,6 @@ static void destroy_prefetch_work(struct prefetch_mr_work *work) > > { > > struct mlx5_ib_dev *dev = to_mdev(pd->device); > > struct mlx5_core_mkey *mmkey; > > - struct ib_umem_odp *odp; > > struct mlx5_ib_mr *mr; > > > > lockdep_assert_held(&dev->odp_srcu); > > @@ -1761,11 +1841,9 @@ static void destroy_prefetch_work(struct prefetch_mr_work *work) > > if (mr->ibmr.pd != pd) > > return NULL; > > > > - odp = to_ib_umem_odp(mr->umem); > > - > > /* prefetch with write-access must be supported by the MR */ > > if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE && > > - !odp->umem.writable) > > + !mr->umem->writable) > > return NULL; > > > > return mr; > _______________________________________________ dri-devel mailing list dri-devel@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/dri-devel