Finished decoupling indices and keys for MW and MR objects. User space can now refer to an object by its index, and the kernel can look up an object by its lkey/rkey. Tweaked the user/kernel ABI for rxe WQEs to use indices instead of rkeys to identify MWs and MRs. Type 1 MWs can now be bound with the ibv_bind_mw API. Signed-off-by: Bob Pearson <rpearson@xxxxxxx> --- drivers/infiniband/sw/rxe/rxe_loc.h | 3 + drivers/infiniband/sw/rxe/rxe_mr.c | 55 +++++++----- drivers/infiniband/sw/rxe/rxe_mw.c | 116 ++++++++++++++++++++++---- drivers/infiniband/sw/rxe/rxe_pool.c | 30 ++++--- drivers/infiniband/sw/rxe/rxe_pool.h | 2 +- drivers/infiniband/sw/rxe/rxe_verbs.c | 28 ++++++- include/uapi/rdma/rdma_user_rxe.h | 34 +++++++- 7 files changed, 212 insertions(+), 56 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 02df9bf76d1a..87d323b1ba07 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -98,6 +98,8 @@ struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *dev, u32 size, int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); /* rxe_mr.c */ +void rxe_set_mr_lkey(struct rxe_mr *mr); + enum copy_direction { to_mr_obj, from_mr_obj, @@ -137,6 +139,7 @@ void rxe_mr_cleanup(struct rxe_pool_entry *arg); int advance_dma_data(struct rxe_dma_info *dma, unsigned int length); /* rxe_mw.c */ +void rxe_set_mw_rkey(struct rxe_mw *mw); struct ib_mw *rxe_alloc_mw(struct ib_pd *ibpd, enum ib_mw_type type, struct ib_udata *udata); int rxe_dealloc_mw(struct ib_mw *ibmw); diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 0606f04e1d18..ba4e33227633 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -34,6 +34,23 @@ #include "rxe.h" #include "rxe_loc.h" +/* choose a unique non zero random number for lkey */ +void rxe_set_mr_lkey(struct rxe_mr *mr) +{ + int ret; + u32 lkey; + +next_lkey: + get_random_bytes(&lkey, 
sizeof(lkey)); + lkey &= 0x7fffffff; + if (unlikely(lkey == 0)) + goto next_lkey; + ret = rxe_add_key(mr, &lkey); + if (unlikely(ret == -EAGAIN)) + goto next_lkey; +} + +#if 0 /* * lfsr (linear feedback shift register) with period 255 */ @@ -50,6 +67,7 @@ static u8 rxe_get_key(void) return key; } +#endif int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) { @@ -76,16 +94,16 @@ int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) static void rxe_mr_init(int access, struct rxe_mr *mr) { - u32 lkey = mr->pelem.index << 8 | rxe_get_key(); - u32 rkey = (access & IB_ACCESS_REMOTE) ? lkey : 0; + rxe_set_mr_lkey(mr); - if (mr->pelem.pool->type == RXE_TYPE_MR) { - mr->ibmr.lkey = lkey; - mr->ibmr.rkey = rkey; - } + if (access & IB_ACCESS_REMOTE) + mr->ibmr.rkey = mr->ibmr.lkey; + else + mr->ibmr.rkey = 0; - mr->lkey = lkey; - mr->rkey = rkey; + // TODO we shouldn't carry two copies + mr->lkey = mr->ibmr.lkey; + mr->rkey = mr->ibmr.rkey; mr->state = RXE_MEM_STATE_INVALID; mr->type = RXE_MEM_TYPE_NONE; mr->map_shift = ilog2(RXE_BUF_PER_MAP); @@ -155,9 +173,9 @@ void rxe_mr_init_dma(struct rxe_pd *pd, mr->type = RXE_MEM_TYPE_DMA; } -int rxe_mr_init_user(struct rxe_pd *pd, u64 start, - u64 length, u64 iova, int access, struct ib_udata *udata, - struct rxe_mr *mr) +int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, + u64 iova, int access, struct ib_udata *udata, + struct rxe_mr *mr) { struct rxe_map **map; struct rxe_phys_buf *buf = NULL; @@ -233,15 +251,15 @@ int rxe_mr_init_user(struct rxe_pd *pd, u64 start, return err; } -int rxe_mr_init_fast(struct rxe_pd *pd, - int max_pages, struct rxe_mr *mr) +int rxe_mr_init_fast(struct rxe_pd *pd, int max_pages, + struct rxe_mr *mr) { int err; rxe_mr_init(0, mr); /* In fastreg, we also set the rkey */ - mr->ibmr.rkey = mr->ibmr.lkey; + mr->rkey = mr->ibmr.rkey = mr->ibmr.lkey; err = rxe_mr_alloc(mr, max_pages); if (err) @@ -564,18 +582,17 @@ int advance_dma_data(struct rxe_dma_info *dma, unsigned int 
length) * (4) verify that mr state is valid */ struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key, - enum lookup_type type) + enum lookup_type type) { struct rxe_mr *mr; struct rxe_dev *rxe = to_rdev(pd->ibpd.device); - int index = key >> 8; - mr = rxe_pool_get_index(&rxe->mr_pool, index); + mr = rxe_pool_get_key(&rxe->mr_pool, &key); if (!mr) return NULL; - if (unlikely((type == lookup_local && mr->lkey != key) || - (type == lookup_remote && mr->rkey != key) || + if (unlikely((type == lookup_local && mr->ibmr.lkey != key) || + (type == lookup_remote && mr->ibmr.rkey != key) || mr->pd != pd || (access && !(access & mr->access)) || mr->state != RXE_MEM_STATE_VALID)) { diff --git a/drivers/infiniband/sw/rxe/rxe_mw.c b/drivers/infiniband/sw/rxe/rxe_mw.c index 230263c6d3e5..b45a04efa4a0 100644 --- a/drivers/infiniband/sw/rxe/rxe_mw.c +++ b/drivers/infiniband/sw/rxe/rxe_mw.c @@ -35,49 +35,95 @@ #include "rxe.h" #include "rxe_loc.h" +/* choose a unique non zero random number for rkey */ +void rxe_set_mw_rkey(struct rxe_mw *mw) +{ + int ret; + u32 rkey; + +next_rkey: + get_random_bytes(&rkey, sizeof(rkey)); + if (unlikely(rkey == 0)) + goto next_rkey; + rkey |= 0x80000000; + ret = rxe_add_key(mw, &rkey); + if (unlikely(ret == -EAGAIN)) + goto next_rkey; +} + /* place holder alloc and dealloc routines - * need to add cross references between qp and mr with mw + * TODO add cross references between qp and mr with mw * and cleanup when one side is deleted. 
Enough to make * verbs function correctly for now */ struct ib_mw *rxe_alloc_mw(struct ib_pd *ibpd, enum ib_mw_type type, struct ib_udata *udata) { + int ret; + struct rxe_mw *mw; struct rxe_pd *pd = to_rpd(ibpd); struct rxe_dev *rxe = to_rdev(ibpd->device); - struct rxe_mw *mw; - u32 rkey; - u8 key; + struct rxe_alloc_mw_resp __user *uresp; + + if (udata) { + if (udata->outlen < sizeof(*uresp)) { + ret = -EINVAL; + goto err1; + } + } if (unlikely((type != IB_MW_TYPE_1) && - (type != IB_MW_TYPE_2))) - return ERR_PTR(-EINVAL); + (type != IB_MW_TYPE_2))) { + ret = -EINVAL; + goto err1; + } rxe_add_ref(pd); mw = rxe_alloc(&rxe->mw_pool); if (!mw) { rxe_drop_ref(pd); - return ERR_PTR(-ENOMEM); + ret = -ENOMEM; + goto err1; } - /* pick a random key part as a starting point */ rxe_add_index(mw); - get_random_bytes(&key, sizeof(key)); - rkey = mw->pelem.index << 8 | key; + rxe_set_mw_rkey(mw); + + pr_info("rxe_alloc_mw: index = 0x%08x, rkey = 0x%08x\n", + mw->pelem.index, mw->ibmw.rkey); spin_lock_init(&mw->lock); + + if (type == IB_MW_TYPE_2) { + mw->state = RXE_MW_STATE_FREE; + } else { + mw->state = RXE_MW_STATE_VALID; + } + mw->qp = NULL; mw->mr = NULL; mw->addr = 0; mw->length = 0; mw->ibmw.pd = ibpd; mw->ibmw.type = type; - mw->ibmw.rkey = rkey; - mw->state = (type == IB_MW_TYPE_2) ? 
- RXE_MW_STATE_FREE : - RXE_MW_STATE_VALID; + + if (udata) { + uresp = udata->outbuf; + if (copy_to_user(&uresp->index, &mw->pelem.index, + sizeof(u32))) { + ret = -EFAULT; + goto err2; + } + } return &mw->ibmw; +err2: + rxe_drop_key(mw); + rxe_drop_index(mw); + rxe_drop_ref(mw); + rxe_drop_ref(pd); +err1: + return ERR_PTR(ret); } int rxe_dealloc_mw(struct ib_mw *ibmw) @@ -90,8 +136,9 @@ int rxe_dealloc_mw(struct ib_mw *ibmw) mw->state = RXE_MW_STATE_INVALID; spin_unlock_irqrestore(&mw->lock, flags); - rxe_drop_ref(pd); + rxe_drop_key(mw); rxe_drop_index(mw); + rxe_drop_ref(pd); rxe_drop_ref(mw); return 0; @@ -99,6 +146,41 @@ int rxe_dealloc_mw(struct ib_mw *ibmw) int rxe_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe) { - pr_err("rxe_bind_mw: not implemented\n"); - return -ENOSYS; + struct rxe_mw *mw; + struct rxe_mr *mr; + + pr_info("rxe_bind_mw: called\n"); + + if (qp->is_user) { + } else { + mw = to_rmw(wqe->wr.wr.kmw.ibmw); + mr = to_rmr(wqe->wr.wr.kmw.ibmr); + } + +#if 0 + wqe->wr.wr.bind_mw + __aligned_u64 addr; + __aligned_u64 length; + __u32 mr_rkey; + __u32 mw_rkey; + __u32 rkey; + __u32 access; + + mw + struct rxe_pool_entry pelem; // alloc + struct ib_mw ibmw; // alloc + struct ib_device *device; // alloc + struct ib_pd *pd; // alloc + struct ib_uobject *uobject; // alloc + u32 rkey; + enum ib_mw_type type; // alloc + struct rxe_qp *qp; // bind + struct rxe_mem *mr; // bind + spinlock_t lock; // alloc + enum rxe_mw_state state; // all + u32 access; // bind + u64 addr; // bind + u64 length; // bind +#endif + return 0; } diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index e157bf945175..35e9646e104c 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -34,10 +34,6 @@ #include "rxe.h" #include "rxe_loc.h" -/* info about object pools - * note that mr and mw share a single index space - * so that one can map an lkey to the correct type of object - */ struct 
rxe_type_info rxe_type_info[RXE_NUM_TYPES] = { [RXE_TYPE_UC] = { .name = "rxe-uc", @@ -79,16 +75,22 @@ struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = { .name = "rxe-mr", .size = sizeof(struct rxe_mr), .cleanup = rxe_mr_cleanup, - .flags = RXE_POOL_INDEX, + .flags = RXE_POOL_INDEX + | RXE_POOL_KEY, .max_index = RXE_MAX_MR_INDEX, .min_index = RXE_MIN_MR_INDEX, + .key_offset = offsetof(struct rxe_mr, ibmr.lkey), + .key_size = sizeof(u32), }, [RXE_TYPE_MW] = { .name = "rxe-mw", .size = sizeof(struct rxe_mw), - .flags = RXE_POOL_INDEX, + .flags = RXE_POOL_INDEX + | RXE_POOL_KEY, .max_index = RXE_MAX_MW_INDEX, .min_index = RXE_MIN_MW_INDEX, + .key_offset = offsetof(struct rxe_mw, ibmw.rkey), + .key_size = sizeof(u32), }, [RXE_TYPE_MC_GRP] = { .name = "rxe-mc_grp", @@ -308,8 +310,9 @@ static void insert_index(struct rxe_pool *pool, struct rxe_pool_entry *new) return; } -static void insert_key(struct rxe_pool *pool, struct rxe_pool_entry *new) +static int insert_key(struct rxe_pool *pool, struct rxe_pool_entry *new) { + int ret; struct rb_node **link = &pool->key.tree.rb_node; struct rb_node *parent = NULL; struct rxe_pool_entry *elem; @@ -323,7 +326,7 @@ static void insert_key(struct rxe_pool *pool, struct rxe_pool_entry *new) (u8 *)new + pool->key.key_offset, pool->key.key_size); if (cmp == 0) { - pr_warn("key already exists!\n"); + ret = -EAGAIN; goto out; } @@ -335,20 +338,25 @@ static void insert_key(struct rxe_pool *pool, struct rxe_pool_entry *new) rb_link_node(&new->key_node, parent, link); rb_insert_color(&new->key_node, &pool->key.tree); + + ret = 0; out: - return; + return ret; } -void rxe_add_key(void *arg, void *key) +int rxe_add_key(void *arg, void *key) { + int ret; struct rxe_pool_entry *elem = arg; struct rxe_pool *pool = elem->pool; unsigned long flags; write_lock_irqsave(&pool->pool_lock, flags); memcpy((u8 *)elem + pool->key.key_offset, key, pool->key.key_size); - insert_key(pool, elem); + ret = insert_key(pool, elem); 
write_unlock_irqrestore(&pool->pool_lock, flags); + + return ret; } void rxe_drop_key(void *arg) diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h index bd684df6d847..0ba811456f79 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.h +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -156,7 +156,7 @@ void rxe_drop_index(void *elem); /* assign a key to a keyed object and insert object into * pool's rb tree */ -void rxe_add_key(void *elem, void *key); +int rxe_add_key(void *elem, void *key); /* remove elem from rb tree */ void rxe_drop_key(void *elem); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index cac0f3f0c7c1..29191cacfc56 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -911,9 +911,20 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, int access, struct ib_udata *udata) { int err; + struct rxe_mr *mr; struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); - struct rxe_mr *mr; + struct rxe_reg_mr_resp __user *uresp = NULL; + + if (udata) { + if (udata->outlen < sizeof(*uresp)) { + err = -EINVAL; + goto err2; + } + uresp = udata->outbuf; + } + + rxe_add_ref(pd); mr = rxe_alloc(&rxe->mr_pool); if (!mr) { @@ -923,19 +934,28 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, rxe_add_index(mr); - rxe_add_ref(pd); - err = rxe_mr_init_user(pd, start, length, iova, access, udata, mr); if (err) goto err3; + pr_info("rxe_reg_user_mr: index = 0x%08x, rkey = 0x%08x\n", + mr->pelem.index, mr->ibmr.rkey); + + if (uresp) { + if (copy_to_user(&uresp->index, &mr->pelem.index, + sizeof(uresp->index))) { + err = -EFAULT; + goto err3; + } + } + return &mr->ibmr; err3: - rxe_drop_ref(pd); rxe_drop_index(mr); rxe_drop_ref(mr); + rxe_drop_ref(pd); err2: return ERR_PTR(err); } diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h index f88867d85c3f..c1e84cd69c37 100644 --- 
a/include/uapi/rdma/rdma_user_rxe.h +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -96,12 +96,28 @@ struct rxe_send_wr { struct { __aligned_u64 addr; __aligned_u64 length; - __u32 mr_rkey; - __u32 mw_rkey; + __u32 mr_index; + __u32 pad1; + __u32 mw_index; + __u32 pad2; __u32 rkey; __u32 access; - } bind_mw; - /* reg is only used by the kernel and is not part of the uapi */ + } umw; + /* below are only used by the kernel */ + struct { + __aligned_u64 addr; + __aligned_u64 length; + union { + struct ib_mr *ibmr; + __aligned_u64 reserved1; + }; + union { + struct ib_mw *ibmw; + __aligned_u64 reserved2; + }; + __u32 rkey; + __u32 access; + } kmw; struct { union { struct ib_mr *mr; @@ -183,4 +199,14 @@ struct rxe_modify_srq_cmd { __aligned_u64 mmap_info_addr; }; +struct rxe_reg_mr_resp { + __u32 index; + __u32 reserved; +}; + +struct rxe_alloc_mw_resp { + __u32 index; + __u32 reserved; +}; + #endif /* RDMA_USER_RXE_H */ -- 2.25.1