Hi Matsuda-san, Thanks for your ODP patches. They look good to me. Reviewed-by: Li Zhijian <lizhijian@xxxxxxxxxxx> However, I find myself harboring a hint of hesitation. I'm wondering whether we really need to remap a page back from the back-end memory/pmem device just to do a flush operation. I am uncertain about the circumstances under which an ODP page fault might occur. Does it possibly include the following scenarios? 1) where a page has not yet had a mapping 2) where a page, once mapped, is subsequently swapped out When a pmem page is in one of these states: - for 1), it's meaningless to do the flush - for 2), will a pmem page be swapped out to a swap partition without being flushed? Thanks Zhijian On 18/03/2025 17:49, Daisuke Matsuda wrote: > For persistent memories, add rxe_odp_flush_pmem_iova() so that ODP specific > steps are executed. Otherwise, no additional consideration is required. > > Signed-off-by: Daisuke Matsuda <matsuda-daisuke@xxxxxxxxxxx> > --- > drivers/infiniband/sw/rxe/rxe.c | 1 + > drivers/infiniband/sw/rxe/rxe_loc.h | 7 ++++ > drivers/infiniband/sw/rxe/rxe_mr.c | 36 ++++++++++------ > drivers/infiniband/sw/rxe/rxe_odp.c | 62 ++++++++++++++++++++++++++-- > drivers/infiniband/sw/rxe/rxe_resp.c | 4 -- > include/rdma/ib_verbs.h | 1 + > 6 files changed, 91 insertions(+), 20 deletions(-) > > diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c > index 4e56a371deb5..df66f8f9efa1 100644 > --- a/drivers/infiniband/sw/rxe/rxe.c > +++ b/drivers/infiniband/sw/rxe/rxe.c > @@ -109,6 +109,7 @@ static void rxe_init_device_param(struct rxe_dev *rxe) > rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ; > rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; > rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; > + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH; > } > } > > diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h > index 
feb386d98d1d..0012bebe96ef 100644 > --- a/drivers/infiniband/sw/rxe/rxe_loc.h > +++ b/drivers/infiniband/sw/rxe/rxe_loc.h > @@ -194,6 +194,8 @@ int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, > enum rxe_mr_copy_dir dir); > int rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, > u64 compare, u64 swap_add, u64 *orig_val); > +int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, > + unsigned int length); > #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ > static inline int > rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, > @@ -212,6 +214,11 @@ rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, > { > return RESPST_ERR_UNSUPPORTED_OPCODE; > } > +static inline int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, > + unsigned int length) > +{ > + return -EOPNOTSUPP; > +} > #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ > > #endif /* RXE_LOC_H */ > diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c > index 868d2f0b74e9..93e4b5acd3ac 100644 > --- a/drivers/infiniband/sw/rxe/rxe_mr.c > +++ b/drivers/infiniband/sw/rxe/rxe_mr.c > @@ -424,7 +424,7 @@ int copy_data( > return err; > } > > -int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) > +static int rxe_mr_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) > { > unsigned int page_offset; > unsigned long index; > @@ -433,16 +433,6 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) > int err; > u8 *va; > > - /* mr must be valid even if length is zero */ > - if (WARN_ON(!mr)) > - return -EINVAL; > - > - if (length == 0) > - return 0; > - > - if (mr->ibmr.type == IB_MR_TYPE_DMA) > - return -EFAULT; > - > err = mr_check_range(mr, iova, length); > if (err) > return err; > @@ -454,7 +444,7 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) > if (!page) > return -EFAULT; > bytes = min_t(unsigned int, length, > - mr_page_size(mr) - 
page_offset); > + mr_page_size(mr) - page_offset); > > va = kmap_local_page(page); > arch_wb_cache_pmem(va + page_offset, bytes); > @@ -468,6 +458,28 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) > return 0; > } > > +int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 start, unsigned int length) > +{ > + int err; > + > + /* mr must be valid even if length is zero */ > + if (WARN_ON(!mr)) > + return -EINVAL; > + > + if (length == 0) > + return 0; > + > + if (mr->ibmr.type == IB_MR_TYPE_DMA) > + return -EFAULT; > + > + if (mr->umem->is_odp) > + err = rxe_odp_flush_pmem_iova(mr, start, length); > + else > + err = rxe_mr_flush_pmem_iova(mr, start, length); > + > + return err; > +} > + > /* Guarantee atomicity of atomic operations at the machine level. */ > DEFINE_SPINLOCK(atomic_ops_lock); > > diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c > index 9f6e2bb2a269..9a9aae967486 100644 > --- a/drivers/infiniband/sw/rxe/rxe_odp.c > +++ b/drivers/infiniband/sw/rxe/rxe_odp.c > @@ -4,6 +4,7 @@ > */ > > #include <linux/hmm.h> > +#include <linux/libnvdimm.h> > > #include <rdma/ib_umem_odp.h> > > @@ -147,6 +148,16 @@ static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp, > return need_fault; > } > > +static unsigned long rxe_odp_iova_to_index(struct ib_umem_odp *umem_odp, u64 iova) > +{ > + return (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift; > +} > + > +static unsigned long rxe_odp_iova_to_page_offset(struct ib_umem_odp *umem_odp, u64 iova) > +{ > + return iova & (BIT(umem_odp->page_shift) - 1); > +} > + > static int rxe_odp_map_range_and_lock(struct rxe_mr *mr, u64 iova, int length, u32 flags) > { > struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); > @@ -190,8 +201,8 @@ static int __rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, > size_t offset; > u8 *user_va; > > - idx = (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift; > - offset = iova & 
(BIT(umem_odp->page_shift) - 1); > + idx = rxe_odp_iova_to_index(umem_odp, iova); > + offset = rxe_odp_iova_to_page_offset(umem_odp, iova); > > while (length > 0) { > u8 *src, *dest; > @@ -277,8 +288,8 @@ static int rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, > return RESPST_ERR_RKEY_VIOLATION; > } > > - idx = (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift; > - page_offset = iova & (BIT(umem_odp->page_shift) - 1); > + idx = rxe_odp_iova_to_index(umem_odp, iova); > + page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova); > page = hmm_pfn_to_page(umem_odp->pfn_list[idx]); > if (!page) > return RESPST_ERR_RKEY_VIOLATION; > @@ -324,3 +335,46 @@ int rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, > > return err; > } > + > +int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, > + unsigned int length) > +{ > + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); > + unsigned int page_offset; > + unsigned long index; > + struct page *page; > + unsigned int bytes; > + int err; > + u8 *va; > + > + err = rxe_odp_map_range_and_lock(mr, iova, length, > + RXE_PAGEFAULT_DEFAULT); > + if (err) > + return err; > + > + while (length > 0) { > + index = rxe_odp_iova_to_index(umem_odp, iova); > + page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova); > + > + page = hmm_pfn_to_page(umem_odp->pfn_list[index]); > + if (!page) { > + mutex_unlock(&umem_odp->umem_mutex); > + return -EFAULT; > + } > + > + bytes = min_t(unsigned int, length, > + mr_page_size(mr) - page_offset); > + > + va = kmap_local_page(page); > + arch_wb_cache_pmem(va + page_offset, bytes); > + kunmap_local(va); > + > + length -= bytes; > + iova += bytes; > + page_offset = 0; > + } > + > + mutex_unlock(&umem_odp->umem_mutex); > + > + return 0; > +} > diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c > index 54ba9ee1acc5..304e3de740ad 100644 > --- a/drivers/infiniband/sw/rxe/rxe_resp.c > +++ b/drivers/infiniband/sw/rxe/rxe_resp.c > 
@@ -649,10 +649,6 @@ static enum resp_states process_flush(struct rxe_qp *qp, > struct rxe_mr *mr = qp->resp.mr; > struct resp_res *res = qp->resp.res; > > - /* ODP is not supported right now. WIP. */ > - if (mr->umem->is_odp) > - return RESPST_ERR_UNSUPPORTED_OPCODE; > - > /* oA19-14, oA19-15 */ > if (res && res->replay) > return RESPST_ACKNOWLEDGE; > diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h > index 9941f4185c79..da07d3e2db1d 100644 > --- a/include/rdma/ib_verbs.h > +++ b/include/rdma/ib_verbs.h > @@ -325,6 +325,7 @@ enum ib_odp_transport_cap_bits { > IB_ODP_SUPPORT_READ = 1 << 3, > IB_ODP_SUPPORT_ATOMIC = 1 << 4, > IB_ODP_SUPPORT_SRQ_RECV = 1 << 5, > + IB_ODP_SUPPORT_FLUSH = 1 << 6, > }; > > struct ib_odp_caps {