2022-04-18 14:14 GMT+09:00, Hyunchul Lee <hyc.lee@xxxxxxxxx>:
> Make ksmbd handle multiple buffer descriptors
> when reading and writing files using SMB direct:
> Post the work requests of rdma_rw_ctx for
> RDMA read/write in smb_direct_rdma_xmit(), and
> the work request for the READ/WRITE response
> with a remote invalidation in smb_direct_writev().
>
> Signed-off-by: Hyunchul Lee <hyc.lee@xxxxxxxxx>
> ---
> changes from v2:
>  - Split a v2 patch to 4 patches.
>
>  fs/ksmbd/smb2pdu.c        |   5 +-
>  fs/ksmbd/transport_rdma.c | 166 +++++++++++++++++++++++++-------------
>  2 files changed, 109 insertions(+), 62 deletions(-)
>
> diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
> index fc9b8def50df..621fa3e55fab 100644
> --- a/fs/ksmbd/smb2pdu.c
> +++ b/fs/ksmbd/smb2pdu.c
> @@ -6133,11 +6133,8 @@ static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work,
>  				    le32_to_cpu(desc[i].length));
>  		}
>  	}
> -	if (ch_count != 1) {
> -		ksmbd_debug(RDMA, "RDMA multiple buffer descriptors %d are not supported yet\n",
> -			    ch_count);
> +	if (!ch_count)
>  		return -EINVAL;
> -	}
>
>  	work->need_invalidate_rkey =
>  		(Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE);
> diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c
> index 1343ff8e00fd..410b79edc9f2 100644
> --- a/fs/ksmbd/transport_rdma.c
> +++ b/fs/ksmbd/transport_rdma.c
> @@ -208,7 +208,9 @@ struct smb_direct_recvmsg {
>  struct smb_direct_rdma_rw_msg {
>  	struct smb_direct_transport	*t;
>  	struct ib_cqe		cqe;
> +	int			status;
>  	struct completion	*completion;
> +	struct list_head	list;
>  	struct rdma_rw_ctx	rw_ctx;
>  	struct sg_table		sgt;
>  	struct scatterlist	sg_list[];
> @@ -1313,6 +1315,18 @@ static int smb_direct_writev(struct ksmbd_transport *t,
>  	return ret;
>  }
>
> +static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t,
> +					struct smb_direct_rdma_rw_msg *msg,
> +					enum dma_data_direction dir)
> +{
> +	if (msg->sgt.orig_nents) {

Is there any case where orig_nents is 0?

> +		rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port,
> +				    msg->sgt.sgl, msg->sgt.nents, dir);
> +		sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
> +	}
> +	kfree(msg);
> +}
> +
>  static void read_write_done(struct ib_cq *cq, struct ib_wc *wc,
>  			    enum dma_data_direction dir)
>  {
> @@ -1321,19 +1335,14 @@ static void read_write_done(struct ib_cq *cq, struct ib_wc *wc,
>  	struct smb_direct_transport *t = msg->t;
>
>  	if (wc->status != IB_WC_SUCCESS) {
> +		msg->status = -EIO;
>  		pr_err("read/write error. opcode = %d, status = %s(%d)\n",
>  		       wc->opcode, ib_wc_status_msg(wc->status), wc->status);
> -		smb_direct_disconnect_rdma_connection(t);
> +		if (wc->status != IB_WC_WR_FLUSH_ERR)

Why is this condition needed?
> +			smb_direct_disconnect_rdma_connection(t);
>  	}
>
> -	if (atomic_inc_return(&t->rw_credits) > 0)
> -		wake_up(&t->wait_rw_credits);
> -
> -	rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port,
> -			    msg->sg_list, msg->sgt.nents, dir);
> -	sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
>  	complete(msg->completion);
> -	kfree(msg);
>  }
>
>  static void read_done(struct ib_cq *cq, struct ib_wc *wc)
> @@ -1352,75 +1361,116 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
>  				unsigned int desc_len,
>  				bool is_read)
>  {
> -	struct smb_direct_rdma_rw_msg *msg;
> -	int ret;
> +	struct smb_direct_rdma_rw_msg *msg, *next_msg;
> +	int i, ret;
>  	DECLARE_COMPLETION_ONSTACK(completion);
> -	struct ib_send_wr *first_wr = NULL;
> -	u32 remote_key = le32_to_cpu(desc[0].token);
> -	u64 remote_offset = le64_to_cpu(desc[0].offset);
> +	struct ib_send_wr *first_wr;
> +	LIST_HEAD(msg_list);
> +	char *desc_buf;
>  	int credits_needed;
> +	unsigned int desc_buf_len;
> +	size_t total_length = 0;
> +
> +	if (t->status != SMB_DIRECT_CS_CONNECTED)
> +		return -ENOTCONN;
> +
> +	/* calculate needed credits */
> +	credits_needed = 0;
> +	desc_buf = buf;
> +	for (i = 0; i < desc_len / sizeof(*desc); i++) {
> +		desc_buf_len = le32_to_cpu(desc[i].length);
> +
> +		credits_needed += calc_rw_credits(t, desc_buf, desc_buf_len);
> +		desc_buf += desc_buf_len;
> +		total_length += desc_buf_len;
> +		if (desc_buf_len == 0 || total_length > buf_len ||
> +		    total_length > t->max_rdma_rw_size)
> +			return -EINVAL;
> +	}
> +
> +	ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n",
> +		    is_read ? "read" : "write", buf_len, credits_needed);
>
> -	credits_needed = calc_rw_credits(t, buf, buf_len);
>  	ret = wait_for_rw_credits(t, credits_needed);
>  	if (ret < 0)
>  		return ret;
>
> -	/* TODO: mempool */
> -	msg = kmalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) +
> -		      sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL);
> -	if (!msg) {
> -		atomic_add(credits_needed, &t->rw_credits);
> -		return -ENOMEM;
> -	}
> +	/* build rdma_rw_ctx for each descriptor */
> +	desc_buf = buf;
> +	for (i = 0; i < desc_len / sizeof(*desc); i++) {
> +		msg = kzalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) +
> +			      sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL);
> +		if (!msg) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
>
> -	msg->sgt.sgl = &msg->sg_list[0];
> -	ret = sg_alloc_table_chained(&msg->sgt,
> -				     get_buf_page_count(buf, buf_len),
> -				     msg->sg_list, SG_CHUNK_SIZE);
> -	if (ret) {
> -		atomic_add(credits_needed, &t->rw_credits);
> -		kfree(msg);
> -		return -ENOMEM;
> -	}
> +		desc_buf_len = le32_to_cpu(desc[i].length);
>
> -	ret = get_sg_list(buf, buf_len, msg->sgt.sgl, msg->sgt.orig_nents);
> -	if (ret <= 0) {
> -		pr_err("failed to get pages\n");
> -		goto err;
> -	}
> +		msg->t = t;
> +		msg->cqe.done = is_read ? read_done : write_done;
> +		msg->completion = &completion;
>
> -	ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port,
> -			       msg->sg_list, get_buf_page_count(buf, buf_len),
> -			       0, remote_offset, remote_key,
> -			       is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
> -	if (ret < 0) {
> -		pr_err("failed to init rdma_rw_ctx: %d\n", ret);
> -		goto err;
> +		msg->sgt.sgl = &msg->sg_list[0];
> +		ret = sg_alloc_table_chained(&msg->sgt,
> +					     get_buf_page_count(desc_buf, desc_buf_len),
> +					     msg->sg_list, SG_CHUNK_SIZE);
> +		if (ret) {
> +			kfree(msg);
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +
> +		ret = get_sg_list(desc_buf, desc_buf_len,
> +				  msg->sgt.sgl, msg->sgt.orig_nents);
> +		if (ret <= 0) {

Is there any problem if this function returns 0?

> +			sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
> +			kfree(msg);
> +			goto out;
> +		}
> +
> +		ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port,
> +				       msg->sgt.sgl,
> +				       get_buf_page_count(desc_buf, desc_buf_len),
> +				       0,
> +				       le64_to_cpu(desc[i].offset),
> +				       le32_to_cpu(desc[i].token),
> +				       is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
> +		if (ret < 0) {
> +			pr_err("failed to init rdma_rw_ctx: %d\n", ret);
> +			sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
> +			kfree(msg);
> +			goto out;
> +		}
> +
> +		list_add_tail(&msg->list, &msg_list);
> +		desc_buf += desc_buf_len;
>  	}
>
> -	msg->t = t;
> -	msg->cqe.done = is_read ? read_done : write_done;
> -	msg->completion = &completion;
> -	first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port,
> -				   &msg->cqe, NULL);
> +	/* concatenate work requests of rdma_rw_ctxs */
> +	first_wr = NULL;
> +	list_for_each_entry_reverse(msg, &msg_list, list) {
> +		first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port,
> +					   &msg->cqe, first_wr);
> +	}
>
>  	ret = ib_post_send(t->qp, first_wr, NULL);
>  	if (ret) {
> -		pr_err("failed to post send wr: %d\n", ret);
> -		goto err;
> +		pr_err("failed to post send wr for RDMA R/W: %d\n", ret);
> +		goto out;
>  	}
>
> +	msg = list_last_entry(&msg_list, struct smb_direct_rdma_rw_msg, list);
>  	wait_for_completion(&completion);
> -	return 0;
> -
> -err:
> +	ret = msg->status;
> +out:
> +	list_for_each_entry_safe(msg, next_msg, &msg_list, list) {
> +		list_del(&msg->list);
> +		smb_direct_free_rdma_rw_msg(t, msg,
> +					    is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
> +	}
>  	atomic_add(credits_needed, &t->rw_credits);
> -	if (first_wr)
> -		rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port,
> -				    msg->sg_list, msg->sgt.nents,
> -				    is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
> -	sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
> -	kfree(msg);
> +	wake_up(&t->wait_rw_credits);
>  	return ret;
>  }
>
> --
> 2.25.1
>
>
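
As an aside, for anyone skimming the thread: the core of the change is chaining the work requests of several rdma_rw_ctxs and posting them once. The snippet below is only an illustrative sketch of that pattern, not code from the patch; struct desc_ctx, ctx_list, and post_chained_rw() are made-up names, while rdma_rw_ctx_wrs() and ib_post_send() are the in-kernel RDMA R/W and verbs APIs the patch uses.

#include <linux/list.h>
#include <rdma/ib_verbs.h>
#include <rdma/rw.h>

struct desc_ctx {
	struct rdma_rw_ctx rw_ctx;	/* one R/W context per buffer descriptor */
	struct ib_cqe cqe;		/* per-context completion handler */
	struct list_head list;
};

/* Chain the WRs of every context and post them with a single doorbell. */
static int post_chained_rw(struct ib_qp *qp, u32 port, struct list_head *ctx_list)
{
	struct desc_ctx *ctx;
	struct ib_send_wr *first_wr = NULL;

	/*
	 * Walk the list in reverse so each context's WRs are linked in front
	 * of those already chained; first_wr ends up pointing at the WRs of
	 * the first descriptor.
	 */
	list_for_each_entry_reverse(ctx, ctx_list, list)
		first_wr = rdma_rw_ctx_wrs(&ctx->rw_ctx, qp, port,
					   &ctx->cqe, first_wr);

	return ib_post_send(qp, first_wr, NULL);
}

The chained WRs complete individually, which is why each context carries its own ib_cqe; in the patch the completion handlers record a per-msg status and smb_direct_rdma_xmit() reads the last descriptor's status after wait_for_completion().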