On Tue, Sep 24, 2019 at 1:49 AM Bart Van Assche <bvanassche@xxxxxxx> wrote: > > On 6/20/19 8:03 AM, Jack Wang wrote: > > +module_param_named(max_chunk_size, max_chunk_size, int, 0444); > > +MODULE_PARM_DESC(max_chunk_size, > > + "Max size for each IO request, when change the unit is in byte" > > + " (default: " __stringify(DEFAULT_MAX_CHUNK_SIZE_KB) "KB)"); > > Where can I find the definition of DEFAULT_MAX_CHUNK_SIZE_KB? Oh, it's a typo; it should be DEFAULT_MAX_CHUNK_SIZE. > > > +static char cq_affinity_list[256] = ""; > > No empty initializers for file-scope variables please. Is it guaranteed by the compiler that file-scope variables will be zero-initialized? > > > + pr_info("cq_affinity_list changed to %*pbl\n", > > + cpumask_pr_args(&cq_affinity_mask)); > > Should this pr_info() call perhaps be changed into pr_debug()? Because this setting could lead to a performance drop, pr_info() seems more appropriate. > > > +static bool __ibtrs_srv_change_state(struct ibtrs_srv_sess *sess, > > + enum ibtrs_srv_state new_state) > > +{ > > + enum ibtrs_srv_state old_state; > > + bool changed = false; > > + > > + old_state = sess->state; > > + switch (new_state) { > > Please add a lockdep_assert_held() statement that checks whether calls > of this function are serialized properly. I will look into it. 
> > > +/** > > + * rdma_write_sg() - response on successful READ request > > + */ > > +static int rdma_write_sg(struct ibtrs_srv_op *id) > > +{ > > + struct ibtrs_srv_sess *sess = to_srv_sess(id->con->c.sess); > > + dma_addr_t dma_addr = sess->dma_addr[id->msg_id]; > > + struct ibtrs_srv *srv = sess->srv; > > + struct ib_send_wr inv_wr, imm_wr; > > + struct ib_rdma_wr *wr = NULL; > > + const struct ib_send_wr *bad_wr; > > + enum ib_send_flags flags; > > + size_t sg_cnt; > > + int err, i, offset; > > + bool need_inval; > > + u32 rkey = 0; > > + > > + sg_cnt = le16_to_cpu(id->rd_msg->sg_cnt); > > + need_inval = le16_to_cpu(id->rd_msg->flags) & IBTRS_MSG_NEED_INVAL_F; > > + if (unlikely(!sg_cnt)) > > + return -EINVAL; > > + > > + offset = 0; > > + for (i = 0; i < sg_cnt; i++) { > > + struct ib_sge *list; > > + > > + wr = &id->tx_wr[i]; > > + list = &id->tx_sg[i]; > > + list->addr = dma_addr + offset; > > + list->length = le32_to_cpu(id->rd_msg->desc[i].len); > > + > > + /* WR will fail with length error > > + * if this is 0 > > + */ > > + if (unlikely(list->length == 0)) { > > + ibtrs_err(sess, "Invalid RDMA-Write sg list length 0\n"); > > + return -EINVAL; > > + } > > + > > + list->lkey = sess->s.dev->ib_pd->local_dma_lkey; > > + offset += list->length; > > + > > + wr->wr.wr_cqe = &io_comp_cqe; > > + wr->wr.sg_list = list; > > + wr->wr.num_sge = 1; > > + wr->remote_addr = le64_to_cpu(id->rd_msg->desc[i].addr); > > + wr->rkey = le32_to_cpu(id->rd_msg->desc[i].key); > > + if (rkey == 0) > > + rkey = wr->rkey; > > + else > > + /* Only one key is actually used */ > > + WARN_ON_ONCE(rkey != wr->rkey); > > + > > + if (i < (sg_cnt - 1)) > > + wr->wr.next = &id->tx_wr[i + 1].wr; > > + else if (need_inval) > > + wr->wr.next = &inv_wr; > > + else > > + wr->wr.next = &imm_wr; > > + > > + wr->wr.opcode = IB_WR_RDMA_WRITE; > > + wr->wr.ex.imm_data = 0; > > + wr->wr.send_flags = 0; > > + } > > + /* > > + * From time to time we have to post signalled sends, > > + * or send queue 
will fill up and only QP reset can help. > > + */ > > + flags = atomic_inc_return(&id->con->wr_cnt) % srv->queue_depth ? > > + 0 : IB_SEND_SIGNALED; > > + > > + if (need_inval) { > > + inv_wr.next = &imm_wr; > > + inv_wr.wr_cqe = &io_comp_cqe; > > + inv_wr.sg_list = NULL; > > + inv_wr.num_sge = 0; > > + inv_wr.opcode = IB_WR_SEND_WITH_INV; > > + inv_wr.send_flags = 0; > > + inv_wr.ex.invalidate_rkey = rkey; > > + } > > + imm_wr.next = NULL; > > + imm_wr.wr_cqe = &io_comp_cqe; > > + imm_wr.sg_list = NULL; > > + imm_wr.num_sge = 0; > > + imm_wr.opcode = IB_WR_RDMA_WRITE_WITH_IMM; > > + imm_wr.send_flags = flags; > > + imm_wr.ex.imm_data = cpu_to_be32(ibtrs_to_io_rsp_imm(id->msg_id, > > + 0, need_inval)); > > + > > + ib_dma_sync_single_for_device(sess->s.dev->ib_dev, dma_addr, > > + offset, DMA_BIDIRECTIONAL); > > + > > + err = ib_post_send(id->con->c.qp, &id->tx_wr[0].wr, &bad_wr); > > + if (unlikely(err)) > > + ibtrs_err(sess, > > + "Posting RDMA-Write-Request to QP failed, err: %d\n", > > + err); > > + > > + return err; > > +} > > All other RDMA server implementations use rdma_rw_ctx_init() and > rdma_rw_ctx_wrs(). Please use these functions in IBTRS too. The rdma_rw_ctx_* API doesn't support RDMA_WRITE_WITH_IMM, and IBTRS mainly uses RDMA_WRITE_WITH_IMM. > > > +static void ibtrs_srv_hb_err_handler(struct ibtrs_con *c, int err) > > +{ > > + (void)err; > > + close_sess(to_srv_sess(c->sess)); > > +} > > Is the (void)err statement really necessary? No, it will be removed. 
> > > +static int ibtrs_srv_rdma_init(struct ibtrs_srv_ctx *ctx, unsigned int port) > > +{ > > + struct sockaddr_in6 sin = { > > + .sin6_family = AF_INET6, > > + .sin6_addr = IN6ADDR_ANY_INIT, > > + .sin6_port = htons(port), > > + }; > > + struct sockaddr_ib sib = { > > + .sib_family = AF_IB, > > + .sib_addr.sib_subnet_prefix = 0ULL, > > + .sib_addr.sib_interface_id = 0ULL, > > + .sib_sid = cpu_to_be64(RDMA_IB_IP_PS_IB | port), > > + .sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL), > > + .sib_pkey = cpu_to_be16(0xffff), > > + }; > > + struct rdma_cm_id *cm_ip, *cm_ib; > > + int ret; > > + > > + /* > > + * We accept both IPoIB and IB connections, so we need to keep > > + * two cm id's, one for each socket type and port space. > > + * If the cm initialization of one of the id's fails, we abort > > + * everything. > > + */ > > + cm_ip = ibtrs_srv_cm_init(ctx, (struct sockaddr *)&sin, RDMA_PS_TCP); > > + if (unlikely(IS_ERR(cm_ip))) > > + return PTR_ERR(cm_ip); > > + > > + cm_ib = ibtrs_srv_cm_init(ctx, (struct sockaddr *)&sib, RDMA_PS_IB); > > + if (unlikely(IS_ERR(cm_ib))) { > > + ret = PTR_ERR(cm_ib); > > + goto free_cm_ip; > > + } > > + > > + ctx->cm_id_ip = cm_ip; > > + ctx->cm_id_ib = cm_ib; > > + > > + return 0; > > + > > +free_cm_ip: > > + rdma_destroy_id(cm_ip); > > + > > + return ret; > > +} > > Will the above work if CONFIG_IPV6=n? I tested with CONFIG_IPV6=n, and it compiles. > > > +static int __init ibtrs_server_init(void) > > +{ > > + int err; > > + > > + if (!strlen(cq_affinity_list)) > > + init_cq_affinity(); > > Is the above if-test useful? Can that if-test be left out? You're right, I will remove it. > > Thanks, > > Bart. Thanks!