On 5/18/23 03:21, Daisuke Matsuda wrote:
> rxe_mr_copy() is used widely to copy data to/from a user MR. The requester uses
> it to load payloads of requesting packets; the responder uses it to process
> Send, Write, and Read operations; the completer uses it to copy data from
> response packets of Read and Atomic operations to a user MR.
>
> Allow these operations to be used with ODP by adding a subordinate function
> rxe_odp_mr_copy(). It is comprised of the following steps:
>  1. Check the driver page table (umem_odp->dma_list) to see if pages being
>     accessed are present with appropriate permission.
>  2. If necessary, trigger page fault to map the pages.
>  3. Update the MR xarray using PFNs in umem_odp->pfn_list.
>  4. Execute data copy to/from the pages.
>
> umem_mutex is used to ensure that dma_list (an array of addresses of an MR)
> is not changed while it is being checked and that mapped pages are not
> invalidated before data copy completes.
>
> Signed-off-by: Daisuke Matsuda <matsuda-daisuke@xxxxxxxxxxx>
> ---
>  drivers/infiniband/sw/rxe/rxe.c     |  10 +++
>  drivers/infiniband/sw/rxe/rxe_loc.h |   8 ++
>  drivers/infiniband/sw/rxe/rxe_mr.c  |   2 +-
>  drivers/infiniband/sw/rxe/rxe_odp.c | 109 ++++++++++++++++++++++++++++
>  4 files changed, 128 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
> index f2284d27229b..207a022156f0 100644
> --- a/drivers/infiniband/sw/rxe/rxe.c
> +++ b/drivers/infiniband/sw/rxe/rxe.c
> @@ -79,6 +79,16 @@ static void rxe_init_device_param(struct rxe_dev *rxe)
>
>  		/* IB_ODP_SUPPORT_IMPLICIT is not supported right now. */
>  		rxe->attr.odp_caps.general_caps |= IB_ODP_SUPPORT;
> +
> +		rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;
> +		rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_RECV;
> +		rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
> +
> +		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;
> +		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;
> +		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;
> +		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;
> +		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
>  	}
>  }
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
> index 93247d123642..4b95c8c46bdc 100644
> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
> @@ -206,6 +206,8 @@ static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp)
>  #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
>  int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
>  			 u64 iova, int access_flags, struct rxe_mr *mr);
> +int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
> +		    enum rxe_mr_copy_dir dir);
>  #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>  static inline int
>  rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
> @@ -213,6 +215,12 @@ rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
>  {
>  	return -EOPNOTSUPP;
>  }
> +static inline int
> +rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
> +		int length, enum rxe_mr_copy_dir dir)
> +{
> +	return -EOPNOTSUPP;
> +}
>
>  #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
> index cd368cd096c8..0e3cda59d702 100644
> --- a/drivers/infiniband/sw/rxe/rxe_mr.c
> +++ b/drivers/infiniband/sw/rxe/rxe_mr.c
> @@ -319,7 +319,7 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
>  	}
>
>  	if (mr->odp_enabled)
> -		return -EOPNOTSUPP;
> +		return rxe_odp_mr_copy(mr, iova, addr, length, dir);
>  	else
>  		return rxe_mr_copy_xarray(mr, iova, addr, length, dir);
>  }
> diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
> index e5497d09c399..cbe5d0c3fcc4 100644
> --- a/drivers/infiniband/sw/rxe/rxe_odp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_odp.c
> @@ -174,3 +174,112 @@ int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
>
>  	return err;
>  }
> +
> +static inline bool rxe_is_pagefault_neccesary(struct ib_umem_odp *umem_odp,
> +					      u64 iova, int length, u32 perm)
> +{
> +	int idx;
> +	u64 addr;
> +	bool need_fault = false;
> +
> +	addr = iova & (~(BIT(umem_odp->page_shift) - 1));
> +
> +	/* Skim through all pages that are to be accessed. */
> +	while (addr < iova + length) {
> +		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
> +
> +		if (!(umem_odp->dma_list[idx] & perm)) {
> +			need_fault = true;
> +			break;
> +		}
> +
> +		addr += BIT(umem_odp->page_shift);
> +	}
> +	return need_fault;
> +}
> +
> +/* umem mutex must be locked before entering this function. */
> +static int rxe_odp_map_range(struct rxe_mr *mr, u64 iova, int length, u32 flags)
> +{
> +	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
> +	const int max_tries = 3;
> +	int cnt = 0;
> +
> +	int err;
> +	u64 perm;
> +	bool need_fault;
> +
> +	if (unlikely(length < 1)) {
> +		mutex_unlock(&umem_odp->umem_mutex);
> +		return -EINVAL;
> +	}
> +
> +	perm = ODP_READ_ALLOWED_BIT;
> +	if (!(flags & RXE_PAGEFAULT_RDONLY))
> +		perm |= ODP_WRITE_ALLOWED_BIT;
> +
> +	/*
> +	 * A successful return from rxe_odp_do_pagefault() does not guarantee
> +	 * that all pages in the range became present. Recheck the DMA address
> +	 * array, allowing max 3 tries for pagefault.
> +	 */
> +	while ((need_fault = rxe_is_pagefault_neccesary(umem_odp,
> +							iova, length, perm))) {
> +		if (cnt >= max_tries)
> +			break;
> +
> +		mutex_unlock(&umem_odp->umem_mutex);
> +
> +		/* umem_mutex is locked on success. */
> +		err = rxe_odp_do_pagefault(mr, iova, length, flags);
> +		if (err < 0)
> +			return err;
> +
> +		cnt++;
> +	}
> +
> +	if (need_fault)
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
> +		    enum rxe_mr_copy_dir dir)
> +{
> +	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
> +	u32 flags = 0;
> +	int err;
> +
> +	if (unlikely(!mr->odp_enabled))
> +		return -EOPNOTSUPP;
> +
> +	switch (dir) {
> +	case RXE_TO_MR_OBJ:
> +		break;
> +
> +	case RXE_FROM_MR_OBJ:
> +		flags = RXE_PAGEFAULT_RDONLY;
> +		break;
> +
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	/* If pagefault is not required, umem mutex will be held until data
> +	 * copy to the MR completes. Otherwise, it is released and locked
> +	 * again in rxe_odp_map_range() to let invalidation handler do its
> +	 * work meanwhile.
> +	 */
> +	mutex_lock(&umem_odp->umem_mutex);
> +
> +	err = rxe_odp_map_range(mr, iova, length, flags);
> +	if (err)
> +		return err;
> +
> +	err = rxe_mr_copy_xarray(mr, iova, addr, length, dir);
> +
> +	mutex_unlock(&umem_odp->umem_mutex);
> +
> +	return err;
> +}

Reviewed-by: Bob Pearson <rpearsonhpe@xxxxxxxxx>
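
Not a comment on the patch itself, but for anyone who wants to exercise this from
userspace: below is a minimal sketch of how an application could check the
capability bits that the rxe.c hunk above now advertises and register an
on-demand-paging MR against rxe. The helper name check_and_reg_odp_mr() and the
assumption that the first verbs device in the list is the rxe one are mine; only
the ibv_query_device_ex()/IBV_ODP_*/IBV_ACCESS_ON_DEMAND usage is standard
libibverbs.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <infiniband/verbs.h>

/* Hypothetical helper: check the ODP caps advertised by the device and, if
 * RC Send/Write/Read are supported, register an on-demand-paging MR.  Pages
 * of such an MR are not pinned at registration time; they are faulted in by
 * paths like the rxe_odp_mr_copy() added in this patch. */
static struct ibv_mr *check_and_reg_odp_mr(struct ibv_context *ctx,
					   struct ibv_pd *pd,
					   void *buf, size_t len)
{
	struct ibv_device_attr_ex attr;
	uint32_t want = IBV_ODP_SUPPORT_SEND | IBV_ODP_SUPPORT_WRITE |
			IBV_ODP_SUPPORT_READ;

	if (ibv_query_device_ex(ctx, NULL, &attr))
		return NULL;

	if (!(attr.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
	    (attr.odp_caps.per_transport_caps.rc_odp_caps & want) != want) {
		fprintf(stderr, "device does not advertise the needed ODP caps\n");
		return NULL;
	}

	return ibv_reg_mr(pd, buf, len,
			  IBV_ACCESS_ON_DEMAND | IBV_ACCESS_LOCAL_WRITE |
			  IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
}

int main(void)
{
	struct ibv_device **devs = ibv_get_device_list(NULL);
	struct ibv_context *ctx;
	struct ibv_pd *pd;
	struct ibv_mr *mr;
	size_t len = 1 << 20;
	void *buf;

	if (!devs || !devs[0])
		return 1;

	ctx = ibv_open_device(devs[0]);	/* assumes devs[0] is the rxe device */
	if (!ctx)
		return 1;

	pd = ibv_alloc_pd(ctx);
	buf = malloc(len);
	if (!pd || !buf)
		return 1;

	mr = check_and_reg_odp_mr(ctx, pd, buf, len);
	printf("ODP MR registration %s\n", mr ? "succeeded" : "failed");

	if (mr)
		ibv_dereg_mr(mr);
	ibv_dealloc_pd(pd);
	ibv_close_device(ctx);
	ibv_free_device_list(devs);
	free(buf);
	return 0;
}

On a device that does not advertise these bits the helper simply bails out, so
the same test program can be used before and after the series is applied.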
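
And for readers trying to follow the locking in rxe_odp_map_range(): here is a
standalone toy model of the check -> fault -> re-check loop, with a pthread
mutex standing in for umem_mutex and a boolean array standing in for the
permission bits in umem_odp->dma_list. Every identifier here (toy_*, MAX_TRIES)
is invented for the sketch and error handling is simplified; only the control
flow, including dropping the lock across the fault so an invalidation handler
could run in the meantime, mirrors the patch.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define TOY_NPAGES	8
#define MAX_TRIES	3	/* mirrors max_tries in rxe_odp_map_range() */

static pthread_mutex_t toy_mutex = PTHREAD_MUTEX_INITIALIZER;	/* ~ umem_mutex */
static bool toy_present[TOY_NPAGES];	/* ~ dma_list permission bits */

/* Caller holds toy_mutex.  True if any page in [first, last] is not mapped,
 * like rxe_is_pagefault_neccesary(). */
static bool toy_fault_needed(int first, int last)
{
	for (int i = first; i <= last; i++)
		if (!toy_present[i])
			return true;
	return false;
}

/* Stands in for rxe_odp_do_pagefault(): maps the pages and, like the real
 * helper, returns with the lock held on success. */
static void toy_do_fault(int first, int last)
{
	pthread_mutex_lock(&toy_mutex);
	for (int i = first; i <= last; i++)
		toy_present[i] = true;
}

/* Mirrors rxe_odp_map_range(): entered and left with toy_mutex held, so the
 * caller can copy data before an "invalidation" could clear the bits again. */
static int toy_map_range(int first, int last)
{
	int tries = 0;

	/* A successful fault does not guarantee every page became present,
	 * so re-check the table and bound the retries. */
	while (toy_fault_needed(first, last)) {
		if (tries++ >= MAX_TRIES)
			return -1;

		/* Drop the lock so a concurrent invalidation handler could
		 * make progress while the fault is serviced. */
		pthread_mutex_unlock(&toy_mutex);
		toy_do_fault(first, last);	/* re-takes the lock */
	}

	return 0;
}

int main(void)
{
	pthread_mutex_lock(&toy_mutex);

	if (toy_map_range(2, 5) == 0)
		printf("pages 2-5 present; the copy can proceed under the lock\n");

	pthread_mutex_unlock(&toy_mutex);
	return 0;
}

(Compile with -pthread.)  Holding the lock across the copy is what gives the
guarantee stated in the changelog: the mapped pages cannot be invalidated
between the presence check and rxe_mr_copy_xarray().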