On Wed, Sep 7, 2022 at 4:45 AM Daisuke Matsuda <matsuda-daisuke@xxxxxxxxxxx> wrote: > > Allow applications to register an ODP-enabled MR, in which case the flag > IB_ACCESS_ON_DEMAND is passed to rxe_reg_user_mr(). However, no > RDMA operations are supported right now. They will be enabled later in the > subsequent two patches. > > rxe_odp_do_pagefault() is called to initialize an ODP-enabled MR here. > It syncs process address space from the CPU page table to the driver page > table (dma_list/pfn_list in umem_odp) when called with a > RXE_PAGEFAULT_SNAPSHOT flag. Additionally, it can be used to trigger a page > fault when pages being accessed are not present or do not have proper > read/write permissions and possibly to prefetch pages in the future. > > Signed-off-by: Daisuke Matsuda <matsuda-daisuke@xxxxxxxxxxx> > --- > drivers/infiniband/sw/rxe/rxe.c | 7 +++ > drivers/infiniband/sw/rxe/rxe_loc.h | 5 ++ > drivers/infiniband/sw/rxe/rxe_mr.c | 7 ++- > drivers/infiniband/sw/rxe/rxe_odp.c | 80 +++++++++++++++++++++++++++ > drivers/infiniband/sw/rxe/rxe_resp.c | 21 +++++-- > drivers/infiniband/sw/rxe/rxe_verbs.c | 8 ++- > drivers/infiniband/sw/rxe/rxe_verbs.h | 2 + > 7 files changed, 121 insertions(+), 9 deletions(-) > > diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c > index 51daac5c4feb..0719f451253c 100644 > --- a/drivers/infiniband/sw/rxe/rxe.c > +++ b/drivers/infiniband/sw/rxe/rxe.c > @@ -73,6 +73,13 @@ static void rxe_init_device_param(struct rxe_dev *rxe) > rxe->ndev->dev_addr); > > rxe->max_ucontext = RXE_MAX_UCONTEXT; > + > + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { > + rxe->attr.kernel_cap_flags |= IBK_ON_DEMAND_PAGING; > + > + /* IB_ODP_SUPPORT_IMPLICIT is not supported right now. 
*/ > + rxe->attr.odp_caps.general_caps |= IB_ODP_SUPPORT; > + } > } > > /* initialize port attributes */ > diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h > index 0f8cb9e38cc9..03b4078b90a3 100644 > --- a/drivers/infiniband/sw/rxe/rxe_loc.h > +++ b/drivers/infiniband/sw/rxe/rxe_loc.h > @@ -64,6 +64,7 @@ int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); > > /* rxe_mr.c */ > u8 rxe_get_next_key(u32 last_key); > +void rxe_mr_init(int access, struct rxe_mr *mr); > void rxe_mr_init_dma(struct rxe_pd *pd, int access, struct rxe_mr *mr); > int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova, > int access, struct rxe_mr *mr); > @@ -188,4 +189,8 @@ static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp) > return rxe_wr_opcode_info[opcode].mask[qp->ibqp.qp_type]; > } > > +/* rxe_odp.c */ > +int rxe_create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, u64 iova, > + int access_flags, struct rxe_mr *mr); > + > #endif /* RXE_LOC_H */ > diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c > index 814116ec4778..0ae72a4516be 100644 > --- a/drivers/infiniband/sw/rxe/rxe_mr.c > +++ b/drivers/infiniband/sw/rxe/rxe_mr.c > @@ -48,7 +48,7 @@ int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) > | IB_ACCESS_REMOTE_WRITE \ > | IB_ACCESS_REMOTE_ATOMIC) > > -static void rxe_mr_init(int access, struct rxe_mr *mr) > +void rxe_mr_init(int access, struct rxe_mr *mr) > { > u32 lkey = mr->elem.index << 8 | rxe_get_next_key(-1); > u32 rkey = (access & IB_ACCESS_REMOTE) ? 
lkey : 0; > @@ -438,7 +438,10 @@ int copy_data( > if (bytes > 0) { > iova = sge->addr + offset; > > - err = rxe_mr_copy(mr, iova, addr, bytes, dir); > + if (mr->odp_enabled) > + err = -EOPNOTSUPP; > + else > + err = rxe_mr_copy(mr, iova, addr, bytes, dir); > if (err) > goto err2; > > diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c > index 0f702787a66e..1f6930ba714c 100644 > --- a/drivers/infiniband/sw/rxe/rxe_odp.c > +++ b/drivers/infiniband/sw/rxe/rxe_odp.c > @@ -5,6 +5,8 @@ > > #include <rdma/ib_umem_odp.h> > > +#include "rxe.h" > + > bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni, > const struct mmu_notifier_range *range, > unsigned long cur_seq) > @@ -32,3 +34,81 @@ bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni, > const struct mmu_interval_notifier_ops rxe_mn_ops = { > .invalidate = rxe_ib_invalidate_range, > }; > + > +#define RXE_PAGEFAULT_RDONLY BIT(1) > +#define RXE_PAGEFAULT_SNAPSHOT BIT(2) > +static int rxe_odp_do_pagefault(struct rxe_mr *mr, u64 user_va, int bcnt, u32 flags) > +{ > + int np; > + u64 access_mask; > + bool fault = !(flags & RXE_PAGEFAULT_SNAPSHOT); > + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); > + > + access_mask = ODP_READ_ALLOWED_BIT; > + if (umem_odp->umem.writable && !(flags & RXE_PAGEFAULT_RDONLY)) > + access_mask |= ODP_WRITE_ALLOWED_BIT; > + > + /* > + * umem mutex is held after return from ib_umem_odp_map_dma_and_lock(). > + * Release it when access to user MR is done or not required. > + */ > + np = ib_umem_odp_map_dma_and_lock(umem_odp, user_va, bcnt, > + access_mask, fault); > + > + return np; > +} > + > +static int rxe_init_odp_mr(struct rxe_mr *mr) > +{ > + int ret; > + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); > + > + ret = rxe_odp_do_pagefault(mr, mr->umem->address, mr->umem->length, > + RXE_PAGEFAULT_SNAPSHOT); > + mutex_unlock(&umem_odp->umem_mutex); > + > + return ret >= 0 ? 
0 : ret; > +} > + > +int rxe_create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, u64 iova, > + int access_flags, struct rxe_mr *mr) > +{ > + int err; > + struct ib_umem_odp *umem_odp; > + struct rxe_dev *dev = container_of(pd->device, struct rxe_dev, ib_dev); > + > + if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) > + return -EOPNOTSUPP; > + > + rxe_mr_init(access_flags, mr); > + > + if (!start && length == U64_MAX) { > + if (iova != 0) > + return -EINVAL; > + if (!(dev->attr.odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) > + return -EINVAL; > + > + /* Never reach here, for implicit ODP is not implemented. */ > + } > + > + umem_odp = ib_umem_odp_get(pd->device, start, length, access_flags, > + &rxe_mn_ops); > + if (IS_ERR(umem_odp)) > + return PTR_ERR(umem_odp); > + > + umem_odp->private = mr; > + > + mr->odp_enabled = true; > + mr->ibmr.pd = pd; > + mr->umem = &umem_odp->umem; > + mr->access = access_flags; > + mr->length = length; > + mr->iova = iova; > + mr->offset = ib_umem_offset(&umem_odp->umem); > + mr->state = RXE_MR_STATE_VALID; > + mr->type = IB_MR_TYPE_USER; > + > + err = rxe_init_odp_mr(mr); > + > + return err; > +} > diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c > index cadc8fa64dd0..dd8632e783f6 100644 > --- a/drivers/infiniband/sw/rxe/rxe_resp.c > +++ b/drivers/infiniband/sw/rxe/rxe_resp.c > @@ -535,8 +535,12 @@ static enum resp_states write_data_in(struct rxe_qp *qp, > int err; > int data_len = payload_size(pkt); > > - err = rxe_mr_copy(qp->resp.mr, qp->resp.va + qp->resp.offset, > - payload_addr(pkt), data_len, RXE_TO_MR_OBJ); > + if (qp->resp.mr->odp_enabled) You cannot dereference qp->resp.mr here, because for zero-byte operations check_rkey() does not set resp.mr, so it is still NULL at this point. 
The code fails for RTRS with the following stack trace, [Thu Sep 8 20:12:22 2022] BUG: kernel NULL pointer dereference, address: 0000000000000158 [Thu Sep 8 20:12:22 2022] #PF: supervisor read access in kernel mode [Thu Sep 8 20:12:22 2022] #PF: error_code(0x0000) - not-present page [Thu Sep 8 20:12:22 2022] PGD 0 P4D 0 [Thu Sep 8 20:12:22 2022] Oops: 0000 [#1] PREEMPT SMP [Thu Sep 8 20:12:22 2022] CPU: 3 PID: 38 Comm: kworker/u8:1 Not tainted 6.0.0-rc2-pserver+ #17 [Thu Sep 8 20:12:22 2022] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 [Thu Sep 8 20:12:22 2022] Workqueue: rxe_resp rxe_do_work [rdma_rxe] [Thu Sep 8 20:12:22 2022] RIP: 0010:rxe_responder+0x1910/0x1d90 [rdma_rxe] [Thu Sep 8 20:12:22 2022] Code: 06 48 63 88 fc 15 63 c0 0f b6 46 01 83 ea 04 c0 e8 04 29 ca 83 e0 03 29 c2 49 8b 87 08 05 00 00 49 03 87 00 05 00 00 4c 63 ea <80> bf 58 01 00 00 00 48 8d 14 0e 48 89 c6 4d 89 ee 44 89 e9 0f 84 [Thu Sep 8 20:12:22 2022] RSP: 0018:ffffb0358015fd80 EFLAGS: 00010246 [Thu Sep 8 20:12:22 2022] RAX: 0000000000000000 RBX: ffff9af4839b5e28 RCX: 0000000000000020 [Thu Sep 8 20:12:22 2022] RDX: 0000000000000000 RSI: ffff9af485094a6a RDI: 0000000000000000 [Thu Sep 8 20:12:22 2022] RBP: ffff9af488bd7128 R08: 0000000000000000 R09: 0000000000000000 [Thu Sep 8 20:12:22 2022] R10: ffff9af4808eaf7c R11: 0000000000000001 R12: 0000000000000008 [Thu Sep 8 20:12:22 2022] R13: 0000000000000000 R14: ffff9af488bd7380 R15: ffff9af488bd7000 [Thu Sep 8 20:12:22 2022] FS: 0000000000000000(0000) GS:ffff9af5b7d80000(0000) knlGS:0000000000000000 [Thu Sep 8 20:12:22 2022] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [Thu Sep 8 20:12:22 2022] CR2: 0000000000000158 CR3: 000000004a60a000 CR4: 00000000000006e0 [Thu Sep 8 20:12:22 2022] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [Thu Sep 8 20:12:22 2022] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [Thu Sep 8 20:12:22 2022] Call Trace: [Thu Sep 8 20:12:22 
2022] <TASK> [Thu Sep 8 20:12:22 2022] ? newidle_balance+0x2e5/0x400 [Thu Sep 8 20:12:22 2022] ? _raw_spin_unlock+0x12/0x30 [Thu Sep 8 20:12:22 2022] ? finish_task_switch+0x91/0x2a0 [Thu Sep 8 20:12:22 2022] rxe_do_work+0x86/0x110 [rdma_rxe] [Thu Sep 8 20:12:22 2022] process_one_work+0x1dc/0x3a0 [Thu Sep 8 20:12:22 2022] worker_thread+0x4a/0x3b0 [Thu Sep 8 20:12:22 2022] ? process_one_work+0x3a0/0x3a0 [Thu Sep 8 20:12:22 2022] kthread+0xe7/0x110 [Thu Sep 8 20:12:22 2022] ? kthread_complete_and_exit+0x20/0x20 [Thu Sep 8 20:12:22 2022] ret_from_fork+0x22/0x30 [Thu Sep 8 20:12:22 2022] </TASK> [Thu Sep 8 20:12:22 2022] Modules linked in: rnbd_server rtrs_server rtrs_core rdma_ucm rdma_cm iw_cm ib_cm crc32_generic rdma_rxe ip6_udp_tunnel udp_tunnel ib_uverbs ib_core loop null_blk [Thu Sep 8 20:12:22 2022] CR2: 0000000000000158 [Thu Sep 8 20:12:22 2022] ---[ end trace 0000000000000000 ]--- [Thu Sep 8 20:12:22 2022] BUG: kernel NULL pointer dereference, address: 0000000000000158 [Thu Sep 8 20:12:22 2022] RIP: 0010:rxe_responder+0x1910/0x1d90 [rdma_rxe] [Thu Sep 8 20:12:22 2022] #PF: supervisor read access in kernel mode [Thu Sep 8 20:12:22 2022] Code: 06 48 63 88 fc 15 63 c0 0f b6 46 01 83 ea 04 c0 e8 04 29 ca 83 e0 03 29 c2 49 8b 87 08 05 00 00 49 03 87 00 05 00 00 4c 63 ea <80> bf 58 01 00 00 00 48 8d 14 0e 48 89 c6 4d 89 ee 44 89 e9 0f 84 [Thu Sep 8 20:12:22 2022] #PF: error_code(0x0000) - not-present page [Thu Sep 8 20:12:22 2022] RSP: 0018:ffffb0358015fd80 EFLAGS: 00010246 [Thu Sep 8 20:12:22 2022] PGD 0 P4D 0 Technically, for operations with 0 length, the code can simply skip the *_mr_copy calls and carry on with success. So maybe you can check data_len first and copy only if needed. 
> + err = -EOPNOTSUPP; > + else > + err = rxe_mr_copy(qp->resp.mr, qp->resp.va + qp->resp.offset, > + payload_addr(pkt), data_len, RXE_TO_MR_OBJ); > + > if (err) { > rc = RESPST_ERR_RKEY_VIOLATION; > goto out; > @@ -667,7 +671,10 @@ static enum resp_states rxe_atomic_reply(struct rxe_qp *qp, > if (mr->state != RXE_MR_STATE_VALID) > return RESPST_ERR_RKEY_VIOLATION; > > - ret = rxe_atomic_ops(qp, pkt, mr); > + if (mr->odp_enabled) > + ret = RESPST_ERR_UNSUPPORTED_OPCODE; > + else > + ret = rxe_atomic_ops(qp, pkt, mr); > } else > ret = RESPST_ACKNOWLEDGE; > > @@ -831,8 +838,12 @@ static enum resp_states read_reply(struct rxe_qp *qp, > if (!skb) > return RESPST_ERR_RNR; > > - err = rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt), > - payload, RXE_FROM_MR_OBJ); > + if (mr->odp_enabled) > + err = -EOPNOTSUPP; > + else > + err = rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt), > + payload, RXE_FROM_MR_OBJ); > + > if (err) > pr_err("Failed copying memory\n"); > if (mr) > diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c > index 7510f25c5ea3..b00e9b847382 100644 > --- a/drivers/infiniband/sw/rxe/rxe_verbs.c > +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c > @@ -926,10 +926,14 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, > goto err2; > } > > - > rxe_get(pd); > > - err = rxe_mr_init_user(pd, start, length, iova, access, mr); > + if (access & IB_ACCESS_ON_DEMAND) > + err = rxe_create_user_odp_mr(&pd->ibpd, start, length, iova, > + access, mr); > + else > + err = rxe_mr_init_user(pd, start, length, iova, access, mr); > + > if (err) > goto err3; > > diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h > index b09b4cb9897a..98d2bb737ebc 100644 > --- a/drivers/infiniband/sw/rxe/rxe_verbs.h > +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h > @@ -324,6 +324,8 @@ struct rxe_mr { > atomic_t num_mw; > > struct rxe_map **map; > + > + bool odp_enabled; > }; > > enum rxe_mw_state { > -- > 
2.31.1 >